scalable-pypeline 1.2.3__py2.py3-none-any.whl → 2.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. pypeline/__init__.py +1 -1
  2. pypeline/barrier.py +34 -0
  3. pypeline/composition.py +349 -0
  4. pypeline/constants.py +51 -84
  5. pypeline/dramatiq.py +470 -0
  6. pypeline/extensions.py +9 -8
  7. pypeline/flask/__init__.py +3 -5
  8. pypeline/flask/api/pipelines.py +109 -148
  9. pypeline/flask/api/schedules.py +14 -39
  10. pypeline/flask/decorators.py +18 -53
  11. pypeline/flask/flask_pypeline.py +156 -0
  12. pypeline/middleware.py +61 -0
  13. pypeline/pipeline_config_schema.py +105 -92
  14. pypeline/pypeline_yaml.py +458 -0
  15. pypeline/schedule_config_schema.py +35 -120
  16. pypeline/utils/config_utils.py +52 -310
  17. pypeline/utils/module_utils.py +35 -71
  18. pypeline/utils/pipeline_utils.py +161 -0
  19. scalable_pypeline-2.0.2.dist-info/METADATA +217 -0
  20. scalable_pypeline-2.0.2.dist-info/RECORD +27 -0
  21. scalable_pypeline-2.0.2.dist-info/entry_points.txt +3 -0
  22. tests/fixtures/__init__.py +0 -1
  23. pypeline/celery.py +0 -206
  24. pypeline/celery_beat.py +0 -254
  25. pypeline/flask/api/utils.py +0 -35
  26. pypeline/flask/flask_sermos.py +0 -156
  27. pypeline/generators.py +0 -196
  28. pypeline/logging_config.py +0 -171
  29. pypeline/pipeline/__init__.py +0 -0
  30. pypeline/pipeline/chained_task.py +0 -70
  31. pypeline/pipeline/generator.py +0 -254
  32. pypeline/sermos_yaml.py +0 -442
  33. pypeline/utils/graph_utils.py +0 -144
  34. pypeline/utils/task_utils.py +0 -552
  35. scalable_pypeline-1.2.3.dist-info/METADATA +0 -163
  36. scalable_pypeline-1.2.3.dist-info/RECORD +0 -33
  37. scalable_pypeline-1.2.3.dist-info/entry_points.txt +0 -2
  38. tests/fixtures/s3_fixtures.py +0 -52
  39. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/LICENSE +0 -0
  40. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/WHEEL +0 -0
  41. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/top_level.txt +0 -0
pypeline/flask/flask_pypeline.py ADDED
@@ -0,0 +1,156 @@
+""" Pypeline implementation as a Flask extension
+"""
+import os
+
+if os.getenv("USE_GEVENT", "false").lower() == "true":
+    import gevent.monkey
+
+    gevent.monkey.patch_all()
+
+import logging
+from flask import Flask
+from werkzeug.middleware.proxy_fix import ProxyFix
+from flask_smorest import Api, Blueprint
+from pypeline.extensions import pypeline_config
+from pypeline.constants import DEFAULT_OPENAPI_CONFIG
+from pypeline import __version__
+
+logger = logging.getLogger(__name__)
+
+
+class FlaskPypeline:
+    """Pypeline Flask extension."""
+
+    def __init__(self, app: Flask = None):
+        """Class init"""
+        self.app = app
+        self.pypeline_config = pypeline_config if pypeline_config is not None else {}
+
+        if app is not None:
+            self.init_app(app)
+
+    def init_app(self, app: Flask, init_api: bool = False):
+        """Pypeline bootstrapping process.
+
+        Application config variables to set include:
+
+            PYPELINE_CLIENT_VERSION (default: v?.?.?)
+
+        Optional, if `init_api` is True:
+
+            API_DOCUMENTATION_TITLE
+            API_DOCUMENTATION_DESCRIPTION
+            OPENAPI_VERSION
+            OPENAPI_URL_PREFIX
+            OPENAPI_SWAGGER_APP_NAME
+            OPENAPI_SWAGGER_UI_PATH
+            OPENAPI_SWAGGER_BASE_TEMPLATE
+            OPENAPI_SWAGGER_URL
+            OPENAPI_SWAGGER_UI_URL
+            SWAGGER_UI_DOC_EXPANSION
+            EXPLAIN_TEMPLATE_LOADING
+
+        Args:
+            app (Flask): Flask Application to initialize.
+            init_api (bool): If `True`, Pypeline will initialize its
+                core APIs (including Pipelines, Scheduled Tasks, etc.) and
+                provide a pre-configured OpenAPI Spec/Swagger UI interface
+                available at the route defined in your application's config
+                under `OPENAPI_URL_PREFIX` (default `/api`). Refer to the
+                [flask-smorest](https://flask-smorest.readthedocs.io/en/latest/openapi.html)
+                documentation for additional configuration options.
+        """
+        # Ensure there's a PYPELINE_CLIENT_VERSION on app config
+        app.config.setdefault(
+            "PYPELINE_CLIENT_VERSION",
+            app.config.get("PYPELINE_CLIENT_VERSION", "v?.?.?"),
+        )
+
+        app.wsgi_app = ProxyFix(app.wsgi_app)
+        app.url_map.strict_slashes = False
+
+        # Create and register the pypeline blueprint
+        bp = Blueprint(
+            "pypeline",
+            __name__,
+            template_folder="../templates",
+            static_folder="../static",
+            url_prefix="/pypeline",
+        )
+        app.register_blueprint(bp)
+
+        # Bootstrap api if app requests
+        if init_api is True:
+            self._bootstrap_api(app)
+
+    def _bootstrap_api(self, app: Flask):
+        """If initializing the API, we will create the core Pypeline API paths
+        and initialize the default Swagger documentation.
+        """
+        # Set sensible defaults for Swagger docs. Values provided on `app` will
+        # take precedence.
+        for swagger_config in DEFAULT_OPENAPI_CONFIG:
+            app.config.setdefault(
+                swagger_config[0], app.config.get(swagger_config[0], swagger_config[1])
+            )
+
+        # Attempt to override with values from the client's pypeline.yaml if
+        # they are available. This will add new tags and new docs if
+        # defined and add to the core Pypeline API docs.
+        api_config = self.pypeline_config.get("apiConfig", {})
+        api_docs = api_config.get("apiDocumentation", {})
+
+        custom_tags = api_config.get("prefixDescriptions", [])
+
+        app.config["PYPELINE_CLIENT_VERSION"] = (
+            api_docs.get("version", None)
+            if api_docs.get("version", None) is not None
+            else app.config["PYPELINE_CLIENT_VERSION"]
+        )
+
+        app.config["API_DOCUMENTATION_TITLE"] = (
+            api_docs.get("title", None)
+            if api_docs.get("title", None) is not None
+            else app.config["API_DOCUMENTATION_TITLE"]
+        )
+
+        app.config["API_DOCUMENTATION_DESCRIPTION"] = (
+            api_docs.get("description", None)
+            if api_docs.get("description", None) is not None
+            else app.config["API_DOCUMENTATION_DESCRIPTION"]
+        )
+
+        tags = [
+            {"name": "Pipelines", "description": "Operations related to Pipelines"},
+            {"name": "Schedules", "description": "Operations related to Schedules"},
+        ] + custom_tags
+
+        # Set up the initializing spec kwargs for API
+        spec_kwargs = {
+            "title": app.config["API_DOCUMENTATION_TITLE"],
+            "version": f"Pypeline: {__version__} - "
+            f"Client: {app.config['PYPELINE_CLIENT_VERSION']}",
+            "description": app.config["API_DOCUMENTATION_DESCRIPTION"],
+            "tags": tags,
+        }
+        try:
+            api = Api()
+            api.init_app(app, spec_kwargs=spec_kwargs)
+
+            self._register_api_namespaces(api)
+
+        except Exception as e:
+            api = None
+            logging.exception(f"Unable to initialize API ... {e}")
+
+        app.extensions.setdefault("pypeline_core_api", api)
+
+    @staticmethod
+    def _register_api_namespaces(api: Api):
+        """Register Default API namespaces"""
+        from pypeline.flask.api.pipelines import bp as pipelinesbp
+
+        api.register_blueprint(pipelinesbp)
+        from pypeline.flask.api.schedules import bp as schedulesbp
+
+        api.register_blueprint(schedulesbp)
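
For orientation, here is a minimal sketch of wiring this extension into an application. The config values and app layout below are illustrative assumptions, not part of the package; only the module path and `init_app` signature come from the diff above.

    # Illustrative usage sketch; config values here are assumptions.
    from flask import Flask
    from pypeline.flask.flask_pypeline import FlaskPypeline

    app = Flask(__name__)
    # Optional overrides; otherwise DEFAULT_OPENAPI_CONFIG supplies defaults.
    app.config["API_DOCUMENTATION_TITLE"] = "My Service API"
    app.config["OPENAPI_URL_PREFIX"] = "/api"

    # init_api=True registers the Pipelines/Schedules blueprints and the
    # Swagger UI at OPENAPI_URL_PREFIX.
    FlaskPypeline().init_app(app, init_api=True)

Note that if the client's pypeline.yaml provides an `apiConfig` block, `_bootstrap_api` will use its `apiDocumentation` (version, title, description) and `prefixDescriptions` entries to override these values.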
pypeline/middleware.py ADDED
@@ -0,0 +1,61 @@
+import copy
+import os
+
+from dramatiq.middleware import Middleware
+
+from pypeline.barrier import LockingParallelBarrier
+from pypeline.constants import PARALLEL_PIPELINE_CALLBACK_BARRIER_TTL
+
+
+class ParallelPipeline(Middleware):
+    def __init__(self, redis_url):
+        self.redis_url = redis_url
+
+    def after_process_message(self, broker, message, *, result=None, exception=None):
+        from dramatiq.message import Message
+
+        if exception is None:
+            group_completion_uuid = message.options.get("group_completion_uuid")
+            if group_completion_uuid:
+                locking_parallel_barrier = LockingParallelBarrier(
+                    self.redis_url,
+                    task_key=group_completion_uuid,
+                    lock_key=f"{group_completion_uuid}-lock",
+                )
+                try:
+                    locking_parallel_barrier.acquire_lock(
+                        timeout=PARALLEL_PIPELINE_CALLBACK_BARRIER_TTL
+                    )
+                    remaining_tasks = locking_parallel_barrier.decrement_task_count()
+                finally:
+                    locking_parallel_barrier.release_lock()
+
+                if remaining_tasks <= 0:
+                    execution_graph = message.options.get("execution_graph")
+
+                    for i in range(len(execution_graph)):
+                        message_group = execution_graph[i]
+
+                        # Check if the current group matches the group_completion_uuid
+                        if (
+                            message_group[0]["options"]["group_completion_uuid"]
+                            == group_completion_uuid
+                        ):
+                            # Check if there is a next group
+                            if i + 1 < len(execution_graph):
+                                next_group = execution_graph[i + 1]
+
+                                completion_uuid = next_group[0]["options"][
+                                    "group_completion_uuid"
+                                ]
+                                locking_parallel_barrier = LockingParallelBarrier(
+                                    self.redis_url,
+                                    task_key=completion_uuid,
+                                    lock_key=f"{completion_uuid}-lock",
+                                )
+                                locking_parallel_barrier.set_task_count(len(next_group))
+                                for next_message in next_group:
+                                    next_message["options"][
+                                        "execution_graph"
+                                    ] = copy.deepcopy(execution_graph)
+                                    broker.enqueue(Message(**next_message))
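
This middleware assumes each Dramatiq message carries, in `message.options`, a `group_completion_uuid` and an `execution_graph`: an ordered list of message groups, where every serialized message in a group shares that group's completion UUID. When a group's Redis-backed task counter reaches zero, the next group is enqueued. Below is a hedged sketch of the shape the code appears to expect, inferred only from the reads in `after_process_message`; the values are invented, and the real graph is presumably built elsewhere in the package (likely pypeline/composition.py).

    # Inferred shape only; all values are made up for illustration.
    execution_graph = [
        [  # group 0: enqueued first; its tasks run in parallel
            {
                "queue_name": "default",
                "actor_name": "extract_a",
                "args": [],
                "kwargs": {},
                "options": {"group_completion_uuid": "uuid-group-0"},
            },
        ],
        [  # group 1: enqueued only once group 0's counter hits zero
            {
                "queue_name": "default",
                "actor_name": "transform",
                "args": [],
                "kwargs": {},
                "options": {"group_completion_uuid": "uuid-group-1"},
            },
        ],
    ]

Each entry must be expandable via `Message(**next_message)`, so it should contain only keys accepted by `dramatiq.Message`'s constructor; the middleware injects a fresh deep copy of the whole graph into each follow-on message before enqueueing it.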
pypeline/pipeline_config_schema.py CHANGED
@@ -1,7 +1,4 @@
 """ Schemas for Pipelines
-
-    TODO: Add validation that all specified nodes in DAG have corresponding
-    node in taskDefinitions
 """
 import yaml
 from marshmallow import Schema, fields, EXCLUDE, validates_schema
@@ -9,144 +6,155 @@ from marshmallow.exceptions import ValidationError
 
 
 class ExcludeUnknownSchema(Schema):
-    """ Remove unknown keys from loaded dictionary
-    """
+    """Remove unknown keys from loaded dictionary"""
+
     class Meta:
-        """ Exclude unknown properties.
-        """
+        """Exclude unknown properties."""
+
         unknown = EXCLUDE
 
 
 class MetadataSchema(Schema):
-    """ Schema for a pipeline's metadata object.
-    """
-    queue = fields.String(required=True,
-                          description="Default queue for all pipeline tasks.",
-                          example="default-queue-name")
-    processorQueue = fields.String(
-        required=False,
-        description="Default processor queue for all pipeline tasks.",
-        example="default-processor-queue-name",
-        default="celery"
+    """Schema for a pipeline's metadata object."""
+
+    queue = fields.String(
+        required=True,
+        description="Default queue for all pipeline tasks.",
+        example="default-queue-name",
    )
    maxRetry = fields.Integer(
        required=False,
        description="A number. Maximum number of retries before giving up. "
-                    "A value of None means task will retry forever. "
-                    "By default, this option is set to 3.",
+        "A value of None means task will retry forever. "
+        "By default, this option is set to 3.",
        default=3,
-        example=3)
+        example=3,
+    )
 
-    maxTtl = fields.Integer(required=False,
-                            description="The soft time limit, in seconds, "
-                                        "for this task. When not set the "
-                                        "workers default is used. The hard "
-                                        "time limit will be derived from this"
-                                        "field, by adding 10 seconds.",
-                            default=60,
-                            example=60)
+    maxTtl = fields.Integer(
+        required=False,
+        description="The soft time limit, in seconds, "
+        "for this task. When not set the "
+        "workers default is used. The hard "
+        "time limit will be derived from this"
+        "field, by adding 10 seconds.",
+        default=60,
+        example=60,
+    )
 
    retryBackoff = fields.Integer(
        required=False,
        description="A number. If this option is set , it is used as a delay"
-                    " factor. For example, if this option is set to 3, the"
-                    " first retry will delay 3 seconds, the second will delay"
-                    " 6 seconds, the third will delay 12 seconds, the fourth"
-                    " will delay 24 seconds, and so on. By default, this"
-                    " option is set to False, and autoretries will not"
-                    " be delayed.",
+        " factor. For example, if this option is set to 3, the"
+        " first retry will delay 3 seconds, the second will delay"
+        " 6 seconds, the third will delay 12 seconds, the fourth"
+        " will delay 24 seconds, and so on. By default, this"
+        " option is set to False, and autoretries will not"
+        " be delayed.",
        default=3,
-        example=3)
+        example=3,
+    )
 
    retryJitter = fields.Boolean(
        required=False,
        description="A boolean. Jitter is used to introduce randomness into "
-                    "exponential backoff delays, to prevent all tasks in the "
-                    "queue from being executed simultaneously. If this option "
-                    "is set to True, the delay value calculated by "
-                    "retry_backoff is treated as a maximum, and the actual "
-                    "delay value will be a random number between zero and that "
-                    "maximum. By default, this option is set to True.",
+        "exponential backoff delays, to prevent all tasks in the "
+        "queue from being executed simultaneously. If this option "
+        "is set to True, the delay value calculated by "
+        "retry_backoff is treated as a maximum, and the actual "
+        "delay value will be a random number between zero and that "
+        "maximum. By default, this option is set to True.",
        default=False,
-        example=True)
+        example=True,
+    )
 
    retryBackoffMax = fields.Integer(
        required=False,
        description="A boolean. Jitter is used to introduce randomness into "
-                    "exponential backoff delays, to prevent all tasks in the "
-                    "queue from being executed simultaneously. If this option "
-                    "is set to True, the delay value calculated by "
-                    "retry_backoff is treated as a maximum, and the actual "
-                    "delay value will be a random number between zero and "
-                    "that maximum. By default, this option is set to True.",
+        "exponential backoff delays, to prevent all tasks in the "
+        "queue from being executed simultaneously. If this option "
+        "is set to True, the delay value calculated by "
+        "retry_backoff is treated as a maximum, and the actual "
+        "delay value will be a random number between zero and "
+        "that maximum. By default, this option is set to True.",
        default=600,
-        example=600)
+        example=600,
+    )
 
 
 class TaskDefinitionsSchema(ExcludeUnknownSchema):
-    """ Schema for a single task's configuration
-    """
-    handler = fields.String(required=True,
-                            description="Path to the worker task definition",
-                            example="client.workers.my_task")
+    """Schema for a single task's configuration"""
+
+    handler = fields.String(
+        required=True,
+        description="Path to the worker task definition",
+        example="client.workers.my_task",
+    )
 
-    maxTtl = fields.Integer(required=False,
-                            description="Max TTL for a task in seconds.",
-                            default=60,
-                            example=60)
+    maxTtl = fields.Integer(
+        required=False,
+        description="Max TTL for a task in seconds.",
+        default=60,
+        example=60,
+    )
 
-    queue = fields.String(required=False,
-                          description="Non-default queue for this task.",
-                          example="custom-queue-name")
+    queue = fields.String(
+        required=False,
+        description="Non-default queue for this task.",
+        example="custom-queue-name",
+    )
 
 
 class PipelineConfigSchemaV1(Schema):
-    """ Overall pipeline configuration schema
-    """
+    """Overall pipeline configuration schema"""
+
    metadata = fields.Nested(
        MetadataSchema,
        required=True,
-        description="Metadata and configuration information for this pipeline."
+        description="Metadata and configuration information for this pipeline.",
    )
    dagAdjacency = fields.Dict(
        keys=fields.String(
            required=True,
-            description=
-            "Task's node name. *MUST* match key in taskDefinitions dict.",
-            example="node_a"),
+            description="Task's node name. *MUST* match key in taskDefinitions dict.",
+            example="node_a",
+        ),
        values=fields.List(
            fields.String(
                required=True,
-                description=
-                "Task's node name. *Must* match key in taskDefinitions dict.")
+                description="Task's node name. *Must* match key in taskDefinitions dict.",
+            )
        ),
        required=True,
-        description="The DAG Adjacency definition.")
+        description="The DAG Adjacency definition.",
+    )
    taskDefinitions = fields.Dict(
        keys=fields.String(
            required=True,
-            description=
-            "Task's node name. *Must* match related key in dagAdjacency.",
-            example="node_a"),
+            description="Task's node name. *Must* match related key in dagAdjacency.",
+            example="node_a",
+        ),
        values=fields.Nested(
            TaskDefinitionsSchema,
            required=True,
            description="Definition of each task in the pipeline.",
-            example={
-                'handler': 'abc.task',
-                'maxRetry': 1
-            }),
+            example={"handler": "abc.task", "maxRetry": 1},
+        ),
        required=True,
-        description="Configuration for each node defined in DAG.")
+        description="Configuration for each node defined in DAG.",
+    )
 
 
 class BasePipelineSchema(ExcludeUnknownSchema):
    __schema_version__ = None
 
    name = fields.String(required=True, description="Pipeline name")
-    description = fields.String(required=False, missing=None,
-                                description="Description of the pipeline.",
-                                example="A valuable pipeline.")
+    description = fields.String(
+        required=False,
+        missing=None,
+        description="Description of the pipeline.",
+        example="A valuable pipeline.",
+    )
    schemaVersion = fields.Integer(required=True)
    config = fields.Dict(required=True)
 
@@ -171,9 +179,9 @@ class BasePipelineSchema(ExcludeUnknownSchema):
 
    @validates_schema
    def validate_pipeline(self, data, **kwargs):
-        schema_version = data['schemaVersion']
+        schema_version = data["schemaVersion"]
        PipelineSchema = BasePipelineSchema.get_by_version(schema_version)
-        schema = PipelineSchema(exclude=['name', 'description'])
+        schema = PipelineSchema(exclude=["name", "description"])
        schema.load(data)
 
 
@@ -186,7 +194,7 @@ class PipelineSchemaV1(BasePipelineSchema):
    config = fields.Nested(
        PipelineConfigSchemaV1,
        required=True,
-        description="Metadata and configuration information for this pipeline."
+        description="Metadata and configuration information for this pipeline.",
    )
 
    def validate_pipeline(self, data, **kwargs):
@@ -197,15 +205,20 @@ class PipelineSchemaV1(BasePipelineSchema):
 
 
 class PipelineConfigValidator(object):
-    """ Validate a pipeline configuration.
+    """Validate a pipeline configuration.
 
-    This is stored as a string in the database under `PipelineConfig.config`
-    in order to keep it easy for custom features to be added over time.
-    This model represents the required / valid features so we can
-    programmatically validate when saving, updating, viewing.
+    This is stored as a string in the database under `PipelineConfig.config`
+    in order to keep it easy for custom features to be added over time.
+    This model represents the required / valid features so we can
+    programmatically validate when saving, updating, viewing.
    """
-    def __init__(self, config_dict: dict = None, config_yaml: str = None,
-                 schema_version: int = None):
+
+    def __init__(
+        self,
+        config_dict: dict = None,
+        config_yaml: str = None,
+        schema_version: int = None,
+    ):
        super().__init__()
 
        # We validate this as a dictionary. Turn into dictionary if provided
@@ -226,7 +239,7 @@ class PipelineConfigValidator(object):
        try:
            # https://github.com/marshmallow-code/marshmallow/issues/377
            # See issue above when migrating to marshmallow 3
-            pcs = PipelineSchema._declared_fields['config'].schema
+            pcs = PipelineSchema._declared_fields["config"].schema
            self.validated_config = pcs.load(self.config)
            self.is_valid = True
        except ValidationError as e:
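
For orientation, here is a hedged sketch of a pipeline definition the V1 schemas above should accept. The node names and handler paths are invented for illustration; only the field names and the `BasePipelineSchema` dispatch behavior come from the code shown in this diff.

    # Illustrative config only; node names and handler paths are made up.
    from pypeline.pipeline_config_schema import BasePipelineSchema

    pipeline = {
        "name": "example-pipeline",
        "description": "A two-step DAG: node_a feeds node_b.",
        "schemaVersion": 1,
        "config": {
            "metadata": {"queue": "default-queue-name", "maxRetry": 3},
            "dagAdjacency": {"node_a": ["node_b"], "node_b": []},
            "taskDefinitions": {
                "node_a": {"handler": "client.workers.my_task"},
                "node_b": {"handler": "client.workers.my_other_task"},
            },
        },
    }

    # validate_pipeline dispatches on schemaVersion via get_by_version;
    # raises marshmallow.ValidationError if the structure is invalid.
    BasePipelineSchema().load(pipeline)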