scalable-pypeline 2.0.10__py2.py3-none-any.whl → 2.1.0__py2.py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
pypeline/__init__.py CHANGED
@@ -1 +1 @@
-__version__ = "2.0.10"
+__version__ = "2.1.0"
pypeline/barrier.py CHANGED
@@ -29,6 +29,9 @@ class LockingParallelBarrier:
         """Decrement the task counter in Redis."""
         return self.redis.decr(self.task_key)
 
+    def task_exists(self):
+        return self.redis.exists(self.task_key)
+
     def get_task_count(self):
         """Get the current value of the task counter."""
         return int(self.redis.get(self.task_key) or 0)
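A minimal usage sketch of the new task_exists helper; the constructor arguments below are assumptions for illustration, since this diff only shows the three methods above:

# Hypothetical usage of LockingParallelBarrier.task_exists, new in 2.1.0.
from pypeline.barrier import LockingParallelBarrier

barrier = LockingParallelBarrier("redis://localhost:6379/0", task_key="pipeline:123:tasks")
if barrier.task_exists():  # Redis EXISTS on the task key; truthy while the counter is set
    remaining = barrier.get_task_count()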
pypeline/dramatiq.py CHANGED
@@ -1,6 +1,3 @@
-import importlib
-import os.path
-import sys
 import typing
 import pika
 import logging
@@ -10,11 +7,9 @@ from urllib.parse import urlparse
 
 from pypeline.extensions import pypeline_config
 from warnings import warn
-from functools import wraps
 from apscheduler.schedulers.blocking import BlockingScheduler
 from apscheduler.triggers.cron import CronTrigger
-from typing import Awaitable, Callable, Optional, Union, TYPE_CHECKING, TypeVar
-from dramatiq import Broker, Middleware, actor as register_actor, set_broker, get_broker
+from dramatiq import Broker, Middleware, set_broker, get_broker
 from dramatiq.brokers.rabbitmq import RabbitmqBroker
 from dramatiq.cli import (
     CPUS,
@@ -33,29 +28,24 @@ from pypeline.constants import (
     REDIS_URL,
     RABBIT_URL,
     DEFAULT_BROKER_CALLABLE,
-    MS_IN_SECONDS,
-    DEFAULT_TASK_TTL,
-    DEFAULT_RESULT_TTL,
-    DEFAULT_TASK_MAX_RETRY,
-    DEFAULT_TASK_MIN_BACKOFF,
-    DEFAULT_TASK_MAX_BACKOFF,
     DEFAULT_BROKER_CONNECTION_HEARTBEAT,
     DEFAULT_BROKER_BLOCKED_CONNECTION_TIMEOUT,
     DEFAULT_BROKER_CONNECTION_ATTEMPTS,
 )
-from pypeline.middleware import ParallelPipeline
+from pypeline.pipelines.middleware.parallel_pipeline_middleware import ParallelPipeline
+from pypeline.pipelines.middleware.pypeline_middleware import PypelineMiddleware
 from pypeline.utils.config_utils import (
     retrieve_latest_schedule_config,
     get_service_config_for_worker,
 )
+from pypeline.utils.dramatiq_utils import (
+    guess_code_directory,
+    list_managed_actors,
+    register_lazy_actor,
+    LazyActor,
+)
+from pypeline.utils.module_utils import get_callable
 
-if TYPE_CHECKING:
-    from typing_extensions import ParamSpec
-
-    P = ParamSpec("P")
-else:
-    P = TypeVar("P")
-R = TypeVar("R")
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -79,71 +69,12 @@ def configure_default_broker(broker: Broker = None):
     )
     rabbit_broker.add_middleware(Results(backend=redis_backend))
     rabbit_broker.add_middleware(ParallelPipeline(redis_url=REDIS_URL))
+    rabbit_broker.add_middleware(PypelineMiddleware(redis_url=REDIS_URL))
     rabbit_broker.add_middleware(CurrentMessage())
     register_actors_for_workers(rabbit_broker)
     set_broker(rabbit_broker)
 
 
-def guess_code_directory(broker):
-    actor = next(iter(broker.actors.values()))
-    modname, *_ = actor.fn.__module__.partition(".")
-    mod = sys.modules[modname]
-    return os.path.dirname(mod.__file__)
-
-
-def get_module(resource_dot_path: str):
-    """Retrieve the module based on a 'resource dot path'.
-    e.g. package.subdir.feature_file.MyCallable
-    """
-    module_path = ".".join(resource_dot_path.split(".")[:-1])
-    module = importlib.import_module(module_path)
-    return module
-
-
-def get_callable_name(resource_dot_path: str) -> str:
-    """Retrieve the callable based on config string.
-    e.g. package.subdir.feature_file.MyCallable
-    """
-    callable_name = resource_dot_path.split(".")[-1]
-    return callable_name
-
-
-def get_callable(resource_dot_path: str) -> Callable:
-    """Retrieve the actual handler class based on config string.
-    e.g. package.subdir.feature_file.MyCallable
-    """
-    module = get_module(resource_dot_path)
-    callable_name = get_callable_name(resource_dot_path)
-    return getattr(module, callable_name)
-
-
-def register_lazy_actor(
-    broker: Broker,
-    fn: Optional[Callable[P, Union[Awaitable[R], R]]] = None,
-    pipeline_meta: typing.Dict = {},
-    **kwargs,
-) -> typing.Type["LazyActor"]:
-    kwargs["queue_name"] = pipeline_meta.get("queue", "default")
-    kwargs["max_retries"] = pipeline_meta.get("maxRetry", DEFAULT_TASK_MAX_RETRY)
-    # Convert from seconds to milliseconds
-    kwargs["min_backoff"] = (
-        pipeline_meta.get("retryBackoff", DEFAULT_TASK_MIN_BACKOFF) * MS_IN_SECONDS
-    )
-    kwargs["max_backoff"] = (
-        pipeline_meta.get("retryBackoffMax", DEFAULT_TASK_MAX_BACKOFF) * MS_IN_SECONDS
-    )
-    kwargs["time_limit"] = pipeline_meta.get("maxTtl", DEFAULT_TASK_TTL) * MS_IN_SECONDS
-    # Always store results for registered pipeline actors
-    kwargs["store_results"] = pipeline_meta.get("store_results", False)
-    if kwargs["store_results"]:
-        kwargs["result_ttl"] = (
-            pipeline_meta.get("result_ttl", DEFAULT_RESULT_TTL) * MS_IN_SECONDS
-        )
-    lazy_actor: LazyActor = LazyActor(fn, kwargs)
-    lazy_actor.register(broker)
-    return lazy_actor
-
-
 def register_actors_for_workers(broker: Broker):
     service = get_service_config_for_worker(pypeline_config)
@@ -154,9 +85,18 @@ def register_actors_for_workers(broker: Broker):
         pipeline_meta = None
         for pipeline_key, pipeline in pypeline_config["pipelines"].items():
             pipeline_config = pipeline["config"]
-            pipeline_tasks = [
-                t["handler"] for t in pipeline_config["taskDefinitions"].values()
-            ]
+            if pipeline["schemaVersion"] == 1:
+                pipeline_tasks = [
+                    t["handler"] for t in pipeline_config["taskDefinitions"].values()
+                ]
+            elif pipeline["schemaVersion"] == 2:
+                pipeline_tasks = [
+                    handler
+                    for key in pipeline_config["taskDefinitions"]
+                    for handler in pipeline_config["taskDefinitions"][key].get(
+                        "handlers", []
+                    )
+                ]
             if task["handler"] in pipeline_tasks:
                 pipeline_meta = pipeline_config["metadata"]
                 break
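For illustration, the same handler lookup against both schema shapes (node names and handler paths are invented):

# schemaVersion 1: each task definition has a single "handler".
v1_task_definitions = {"node_a": {"handler": "client.workers.my_task"}}
# -> pipeline_tasks == ["client.workers.my_task"]

# schemaVersion 2: each task definition lists candidate "handlers".
v2_task_definitions = {
    "node_a": {"handlers": ["client.workers.my_task", "client.workers.my_task_alt"]}
}
# -> pipeline_tasks == ["client.workers.my_task", "client.workers.my_task_alt"]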
@@ -292,77 +232,6 @@ class Dramatiq:
         return decorator
 
 
-def format_actor(actor):
-    return "%s@%s" % (actor.actor_name, actor.queue_name)
-
-
-def ensure_return_value(default_value=None):
-    def decorator(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            # Call the original function
-            result = func(*args, **kwargs)
-            # Check if the function has returned a value
-            if result is None:
-                # Return the default value if the function returned None
-                return default_value
-            return result
-
-        return wrapper
-
-    return decorator
-
-
-class LazyActor(object):
-    # Intermediate object that register actor on broker an call.
-
-    def __init__(self, fn, kw):
-        self.fn = fn
-        self.kw = kw
-        self.actor = None
-
-    def __call__(self, *a, **kw):
-        return self.fn(*a, **kw)
-
-    def __repr__(self):
-        return "<%s %s.%s>" % (
-            self.__class__.__name__,
-            self.fn.__module__,
-            self.fn.__name__,
-        )
-
-    def __getattr__(self, name):
-        if not self.actor:
-            raise AttributeError(name)
-        return getattr(self.actor, name)
-
-    def register(self, broker):
-        self.actor = register_actor(
-            actor_name=f"{self.fn.__module__}.{self.fn.__name__}",
-            broker=broker,
-            **self.kw,
-        )(ensure_return_value(default_value=True)(self.fn))
-
-    # Next is regular actor API.
-    def send(self, *a, **kw):
-        return self.actor.send(*a, **kw)
-
-    def message(self, *a, **kw):
-        return self.actor.message(*a, **kw)
-
-    def send_with_options(self, *a, **kw):
-        return self.actor.send_with_options(*a, **kw)
-
-
-def list_managed_actors(broker, queues):
-    queues = set(queues)
-    all_actors = broker.actors.values()
-    if not queues:
-        return all_actors
-    else:
-        return [a for a in all_actors if a.queue_name in queues]
-
-
 @click.command("cron-scheduler")
 def cron_scheduler():  # pragma: no cover
     # Configure our broker that we will schedule registered tasks for
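Code that imported these helpers from pypeline.dramatiq in 2.0.x should switch to the utility modules; the new paths are taken from the import hunk earlier in this file:

# 2.0.x
from pypeline.dramatiq import register_lazy_actor, LazyActor, list_managed_actors, get_callable
# 2.1.0
from pypeline.utils.dramatiq_utils import (
    guess_code_directory,
    list_managed_actors,
    register_lazy_actor,
    LazyActor,
)
from pypeline.utils.module_utils import get_callable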
@@ -464,6 +333,9 @@ def pypeline_worker(
     # Wraps dramatiq worker CLI in a Flask command. This is private API of
     # dramatiq.
 
+    def format_actor(actor):
+        return "%s@%s" % (actor.actor_name, actor.queue_name)
+
     parser = dramatiq_argument_parser()
 
     # Set worker broker globally.
@@ -3,6 +3,7 @@
 
 import importlib.metadata
 import logging
+from http import HTTPStatus
 
 from flask import jsonify
 from flask.views import MethodView
@@ -11,12 +12,17 @@ from marshmallow import Schema, fields
 from marshmallow.exceptions import ValidationError
 from webargs.flaskparser import abort
 from packaging import version
-from pypeline.composition import PipelineResult
+from pypeline.pipelines.composition.parallel_pipeline_composition import PipelineResult
 from pypeline.constants import API_DOC_RESPONSES, API_DOC_PARAMS, API_PATH_V1
 from pypeline.flask.decorators import require_accesskey
 from pypeline.pipeline_config_schema import BasePipelineSchema, PipelineSchemaV1
+from pypeline.pipeline_settings_schema import (
+    MissingSettingsException,
+    PipelineScenarioSchema,
+)
+from pypeline.pipelines.factory import dag_generator
 from pypeline.utils.config_utils import retrieve_latest_pipeline_config
-from pypeline.utils.pipeline_utils import dag_generator
+from pypeline.utils.schema_utils import get_clean_validation_messages
 
 logger = logging.getLogger(__name__)
 bp = Blueprint("pipelines", __name__, url_prefix=API_PATH_V1 + "/pipelines")
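The same reorganization moves two imports used by this module:

# 2.0.x
from pypeline.composition import PipelineResult
from pypeline.utils.pipeline_utils import dag_generator
# 2.1.0
from pypeline.pipelines.composition.parallel_pipeline_composition import PipelineResult
from pypeline.pipelines.factory import dag_generator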
@@ -50,6 +56,31 @@ class InvokePipelineSchema(Schema):
         example={"document_id": "123", "send_alert": True},
         required=False,
     )
+    settings = fields.Raw(
+        description="Payload contains settings for a given pipeline",
+        example={
+            "param1": "Dataset",
+            "param2": 1,
+            "param3": 2,
+        },
+        required=False,
+    )
+
+    task_replacements = fields.Raw(
+        description="A dictionary of task definitions as the key and the value of the index for which handler"
+        " should be executed. If none provided it will default to the first handler in the list at index position 0.",
+        example={
+            "a": 1,
+            "b": 3,
+        },
+        required=False,
+    )
+
+    scenarios = fields.List(
+        fields.Nested(PipelineScenarioSchema),
+        metadata={"description": "List of scenarios to run for a given pipeline"},
+        required=False,
+    )
 
 
 class InvokePipelineResponseSchema(Schema):
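Taken together, a hypothetical request body for invoking a schemaVersion 2 pipeline; the field names come from the schema above, all values are invented:

payload = {
    "chain_payload": {"document_id": "123", "send_alert": True},
    "settings": {"param1": "Dataset", "param2": 1, "param3": 2},
    # Run the handler at index 1 for task "a" and index 3 for task "b";
    # tasks not listed default to the handler at index 0.
    "task_replacements": {"a": 1, "b": 3},
    "scenarios": [],  # items are validated by PipelineScenarioSchema (defined elsewhere)
}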
@@ -166,12 +197,37 @@ class PipelineInvoke(MethodView):
 
         retval = {"pipeline_id": pipeline_id, "status": "starting"}
         try:
-            payload = payload["chain_payload"] if "chain_payload" in payload else {}
-            pipeline = dag_generator(pipeline_id=pipeline_id, event=payload)
+            chain_payload = payload.get("chain_payload", {})
+            settings = payload.get("settings", None)
+            task_replacements = payload.get("task_replacements", {})
+            scenarios = payload.get("scenarios", [])
+            if pipeline_config["schemaVersion"] == 1:
+                pipeline = dag_generator(
+                    pipeline_id=pipeline_id,
+                    event=chain_payload,
+                )
+            elif pipeline_config["schemaVersion"] == 2 and task_replacements:
+                pipeline = dag_generator(
+                    pipeline_id=pipeline_id,
+                    task_replacements=task_replacements,
+                    scenarios=scenarios,
+                    settings=settings,
+                )
+            retval["scenarios"] = pipeline.scenarios
             pipeline.run()
             pipeline_result = PipelineResult(pipeline.execution_id)
             pipeline_result.create_result_entry(pipeline.to_json())
             retval["execution_id"] = pipeline.execution_id
+        except MissingSettingsException:
+            abort(
+                HTTPStatus.BAD_REQUEST,
+                message="Missing required settings in the request.",
+            )
+        except ValidationError as ve:
+            abort(
+                HTTPStatus.BAD_REQUEST,
+                message=get_clean_validation_messages(ve),
+            )
         except Exception as e:
             msg = "Failed to invoke pipeline ... {}".format(pipeline_id)
             logger.error(msg)
@@ -1,16 +1,14 @@
 """ API Endpoints for Scheduled Tasks
 """
-import os
+
 import logging
 from flask import jsonify, request
 from flask.views import MethodView
-from marshmallow import Schema, fields
 from flask_smorest import Blueprint
 from flask import abort
 from marshmallow.exceptions import ValidationError
 from pypeline.constants import API_DOC_RESPONSES, API_DOC_PARAMS, API_PATH_V1
 from pypeline.utils.config_utils import retrieve_latest_schedule_config
-from pypeline.schedule_config_schema import BaseScheduleSchema
 from pypeline.flask.decorators import require_accesskey
 
 logger = logging.getLogger(__name__)
@@ -1,9 +1,12 @@
 """ Schemas for Pipelines
 """
+
 import yaml
 from marshmallow import Schema, fields, EXCLUDE, validates_schema
 from marshmallow.exceptions import ValidationError
 
+from pypeline.pipeline_settings_schema import PipelineSettingsSchema
+
 
 class ExcludeUnknownSchema(Schema):
     """Remove unknown keys from loaded dictionary"""
@@ -81,8 +84,17 @@ class MetadataSchema(Schema):
         example=600,
     )
 
+    groupName = fields.String(
+        required=False,
+        metadata={
+            "description": "If two pipelines logically belong to a group the user can identify that two. "
+            "Imagine pipeline_a and pipeline_b both process data for images. "
+            'Logically we could give them a mutual group name of "Image Processing Pipelines"'
+        },
+    )
+
 
-class TaskDefinitionsSchema(ExcludeUnknownSchema):
+class TaskDefinitionsSchemaV1(ExcludeUnknownSchema):
     """Schema for a single task's configuration"""
 
     handler = fields.String(
@@ -105,7 +117,31 @@ class TaskDefinitionsSchema(ExcludeUnknownSchema):
     )
 
 
-class PipelineConfigSchemaV1(Schema):
+class TaskDefinitionsSchemaV2(ExcludeUnknownSchema):
+    """Schema for a single task's configuration"""
+
+    handlers = fields.List(
+        fields.String(
+            required=True,
+            description="Path to the worker task definition",
+            example="client.workers.my_task",
+        )
+    )
+    maxTtl = fields.Integer(
+        required=False,
+        description="Max TTL for a task in seconds.",
+        default=60,
+        example=60,
+    )
+
+    queue = fields.String(
+        required=False,
+        description="Non-default queue for this task.",
+        example="custom-queue-name",
+    )
+
+
+class PipelineConfigSchemaBase(Schema):
     """Overall pipeline configuration schema"""
 
     metadata = fields.Nested(
@@ -128,6 +164,31 @@ class PipelineConfigSchemaV1(Schema):
         required=True,
         description="The DAG Adjacency definition.",
     )
+
+
+class PipelineConfigSchemaV1(PipelineConfigSchemaBase):
+    """Overall pipeline configuration schema"""
+
+    taskDefinitions = fields.Dict(
+        keys=fields.String(
+            required=True,
+            description="Task's node name. *Must* match related key in dagAdjacency.",
+            example="node_a",
+        ),
+        values=fields.Nested(
+            TaskDefinitionsSchemaV1,
+            required=True,
+            description="Definition of each task in the pipeline.",
+            example={"handler": "abc.task", "maxRetry": 1},
+        ),
+        required=True,
+        description="Configuration for each node defined in DAG.",
+    )
+
+
+class PipelineConfigSchemaV2(PipelineConfigSchemaBase):
+    """Overall pipeline configuration schema"""
+
     taskDefinitions = fields.Dict(
         keys=fields.String(
             required=True,
@@ -135,7 +196,7 @@ class PipelineConfigSchemaV1(Schema):
             example="node_a",
         ),
         values=fields.Nested(
-            TaskDefinitionsSchema,
+            TaskDefinitionsSchemaV2,
             required=True,
             description="Definition of each task in the pipeline.",
             example={"handler": "abc.task", "maxRetry": 1},
@@ -144,6 +205,14 @@ class PipelineConfigSchemaV1(Schema):
         description="Configuration for each node defined in DAG.",
     )
 
+    settings = fields.Nested(
+        PipelineSettingsSchema,
+        required=False,
+        metadata={
+            "description": "Settings schema to validate the actual settings being passed through to the pipelines."
+        },
+    )
+
 
 class BasePipelineSchema(ExcludeUnknownSchema):
     __schema_version__ = None
@@ -185,6 +254,25 @@ class BasePipelineSchema(ExcludeUnknownSchema):
         schema.load(data)
 
 
+class PipelineSchemaV2(BasePipelineSchema):
+    __schema_version__ = 2
+
+    class Meta:
+        unknown = EXCLUDE
+
+    config = fields.Nested(
+        PipelineConfigSchemaV2,
+        required=True,
+        description="Metadata and configuration information for this pipeline.",
+    )
+
+    def validate_pipeline(self, data, **kwargs):
+        # We need to add this function to avoid infinite recursion since
+        # the BasePipelineSchema class above uses the same method for
+        # validation
+        pass
+
+
 class PipelineSchemaV1(BasePipelineSchema):
     __schema_version__ = 1
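Putting the new schema pieces together, a sketch of a pipeline definition that PipelineSchemaV2 should accept; the structure is inferred from the schemas in this diff, and every name and value is invented:

pipeline_v2 = {
    "schemaVersion": 2,
    "config": {
        "metadata": {"groupName": "Image Processing Pipelines"},
        "dagAdjacency": {"node_a": ["node_b"], "node_b": []},
        "taskDefinitions": {
            "node_a": {"handlers": ["client.workers.my_task"], "maxTtl": 60},
            "node_b": {
                "handlers": ["client.workers.other_task"],
                "queue": "custom-queue-name",
            },
        },
        # Optional, validated by PipelineSettingsSchema (defined elsewhere):
        # "settings": {...},
    },
}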