scalable-pypeline 2.1.31__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. pypeline/__init__.py +1 -0
  2. pypeline/barrier.py +63 -0
  3. pypeline/constants.py +94 -0
  4. pypeline/dramatiq.py +455 -0
  5. pypeline/executable_job_config_schema.py +35 -0
  6. pypeline/extensions.py +17 -0
  7. pypeline/flask/__init__.py +16 -0
  8. pypeline/flask/api/__init__.py +0 -0
  9. pypeline/flask/api/pipelines.py +275 -0
  10. pypeline/flask/api/schedules.py +40 -0
  11. pypeline/flask/decorators.py +41 -0
  12. pypeline/flask/flask_pypeline.py +156 -0
  13. pypeline/job_runner.py +205 -0
  14. pypeline/pipeline_config_schema.py +352 -0
  15. pypeline/pipeline_settings_schema.py +561 -0
  16. pypeline/pipelines/__init__.py +0 -0
  17. pypeline/pipelines/composition/__init__.py +0 -0
  18. pypeline/pipelines/composition/parallel_pipeline_composition.py +375 -0
  19. pypeline/pipelines/composition/pypeline_composition.py +215 -0
  20. pypeline/pipelines/factory.py +86 -0
  21. pypeline/pipelines/middleware/__init__.py +0 -0
  22. pypeline/pipelines/middleware/get_active_worker_id_middleware.py +22 -0
  23. pypeline/pipelines/middleware/graceful_shutdown_middleware.py +50 -0
  24. pypeline/pipelines/middleware/parallel_pipeline_middleware.py +60 -0
  25. pypeline/pipelines/middleware/pypeline_middleware.py +202 -0
  26. pypeline/pypeline_yaml.py +468 -0
  27. pypeline/schedule_config_schema.py +125 -0
  28. pypeline/utils/__init__.py +0 -0
  29. pypeline/utils/config_utils.py +81 -0
  30. pypeline/utils/dramatiq_utils.py +134 -0
  31. pypeline/utils/executable_job_util.py +35 -0
  32. pypeline/utils/graceful_shutdown_util.py +39 -0
  33. pypeline/utils/module_utils.py +108 -0
  34. pypeline/utils/pipeline_utils.py +144 -0
  35. pypeline/utils/schema_utils.py +24 -0
  36. scalable_pypeline-2.1.31.dist-info/LICENSE +177 -0
  37. scalable_pypeline-2.1.31.dist-info/METADATA +212 -0
  38. scalable_pypeline-2.1.31.dist-info/RECORD +42 -0
  39. scalable_pypeline-2.1.31.dist-info/WHEEL +6 -0
  40. scalable_pypeline-2.1.31.dist-info/entry_points.txt +6 -0
  41. scalable_pypeline-2.1.31.dist-info/top_level.txt +2 -0
  42. tests/fixtures/__init__.py +0 -0
pypeline/job_runner.py ADDED
@@ -0,0 +1,205 @@
+ import os
+ import logging
+ import argparse
+ import threading
+ import multiprocessing as mp
+
+ # Prefer 'spawn' for user code using multiprocessing
+ if mp.get_start_method(allow_none=True) != "spawn":
+     mp.set_start_method("spawn", force=True)
+
+ # Avoid staging more than one message; must be set before Dramatiq import path runs
+ os.environ.setdefault("dramatiq_queue_prefetch", "1")
+
+ from dramatiq import Worker, get_broker, set_broker
+ from dramatiq.middleware import Middleware
+
+
+ try:
+     # If your project exposes a helper to configure the default broker, use it.
+     from pypeline.dramatiq import configure_default_broker  # adjust import if needed
+
+     broker = configure_default_broker() or get_broker()
+     set_broker(broker)
+ except Exception:
+     # Fall back to whatever Dramatiq has as the active broker.
+     import pypeline.dramatiq  # noqa: F401 (ensure module side-effects run)
+
+     broker = get_broker()
+
+
+ class OneAndDone(Middleware):
+     """
+     Signals when the first message starts ('got_work') and completes ('done').
+     If stop_on_failure=True, we'll also mark done after the first failure.
+     """
+
+     def __init__(
+         self,
+         got_work: threading.Event,
+         done: threading.Event,
+         *,
+         stop_on_failure: bool = False
+     ):
+         self.got_work = got_work
+         self.done = done
+         self.stop_on_failure = stop_on_failure
+
+     def before_process_message(self, broker, message):
+         # First time we see a message begin processing in this process
+         if not self.got_work.is_set():
+             self.got_work.set()
+
+     def after_process_message(self, broker, message, *, result=None, exception=None):
+         # On success (or also on failure if configured), finish this worker
+         if exception is None or self.stop_on_failure:
+             if not self.done.is_set():
+                 self.done.set()
+
+
+ def _graceful_stop(worker: Worker, log: logging.Logger):
+     try:
+         log.info("Stopping dramatiq worker...")
+         worker.stop()  # stop consumers; no new messages will start
+         worker.join()
+         log.info("Worker stopped.")
+     except Exception as e:
+         log.exception("Error stopping worker: %s", e)
+
+
+ def _close_broker(log: logging.Logger):
+     try:
+         b = get_broker()
+         if b is not None and hasattr(b, "close"):
+             b.close()
+             log.info("Broker closed.")
+     except Exception as e:
+         log.exception("Error closing broker: %s", e)
+
+
+ def job_runner(queues, idle_timeout_ms: int = 0, *, stop_on_failure: bool = False):
+     """
+     Start a single-thread Dramatiq worker. Behavior:
+       - Wait up to `idle_timeout_ms` for *a job to start* (time-to-first-job).
+       - Once a job begins, wait indefinitely for it to complete.
+       - After the first successful job completes (or first job, if stop_on_failure=True), stop and exit.
+
+     Args:
+         queues (list[str]): queues to listen to
+         idle_timeout_ms (int): <=0 => wait forever for first job; >0 => exit if no job starts in time
+         stop_on_failure (bool): if True, exit after first job even if it fails
+     """
+     logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
+     log = logging.getLogger("oneshot")
+
+     # Normalize timeout (treat non-positive as "infinite")
+     timeout_ms = (
+         int(idle_timeout_ms) if idle_timeout_ms and int(idle_timeout_ms) > 0 else 0
+     )
+     log.info(
+         "Launching worker with queues=%s, idle_timeout_ms=%s", queues, timeout_ms or "∞"
+     )
+
+     got_work = threading.Event()
+     done = threading.Event()
+     broker.add_middleware(OneAndDone(got_work, done, stop_on_failure=stop_on_failure))
+
+     worker = Worker(
+         broker,
+         worker_threads=1,  # strictly one at a time
+         queues=queues,
+         worker_timeout=1000,  # ms; how often the worker checks for stop
+     )
+
+     worker.start()
+
+     def controller():
+         log.debug("Controller thread started.")
+         try:
+             # Phase 1: Wait for *first job to start*
+             if timeout_ms > 0:
+                 started = got_work.wait(timeout_ms / 1000.0)
+                 if not started:
+                     log.info(
+                         "Idle timeout reached (%d ms); no jobs started. Stopping worker.",
+                         timeout_ms,
+                     )
+                     return
+             else:
+                 got_work.wait()
+
+             log.info("First job started; waiting for it to finish...")
+             # Phase 2: Wait for the first job to complete (no timeout)
+             done.wait()
+             log.info("First job finished; shutting down.")
+         finally:
+             _graceful_stop(worker, log)
+             _close_broker(log)
+             # Hard-exit to ensure K8s Job is marked Succeeded promptly, no lingering threads.
+             os._exit(0)
+
+     t = threading.Thread(target=controller, name="oneshot-controller", daemon=False)
+     t.start()
+     t.join()  # Block until controller completes (which shuts everything down)
+
+
+ def _parse_args(argv=None):
+     ap = argparse.ArgumentParser(description="Run a one-shot Dramatiq worker.")
+     ap.add_argument(
+         "-q",
+         "--queue",
+         action="append",
+         default=None,
+         help="Queue to listen to (repeatable). You can also pass a comma-separated list.",
+     )
+     ap.add_argument(
+         "--idle-timeout-ms",
+         type=int,
+         default=int(os.getenv("IDLE_TIMEOUT_MS", "0")),
+         help="Exit if no job starts within this time (<=0 = wait forever).",
+     )
+     ap.add_argument(
+         "--stop-on-failure",
+         action="store_true",
+         help="Exit after the first job even if it fails.",
+     )
+     return ap.parse_args(argv)
+
+
+ def main(argv=None):
+     args = _parse_args(argv)
+
+     # Build queue list from flags or env, support comma-separated entries.
+     raw_entries = (
+         args.queue if args.queue else [os.getenv("JOB_QUEUE", "pipeline-queue")]
+     )
+     queues = []
+     for entry in raw_entries:
+         queues.extend([q.strip() for q in str(entry).split(",") if q and q.strip()])
+
+     if not queues:
+         raise SystemExit("No queues provided. Use -q ... or set JOB_QUEUE.")
+
+     logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
+     log = logging.getLogger("oneshot")
+
+     pid = os.getpid()
+     ppid = os.getppid()
+     log.info(
+         "Starting one-shot worker PID=%s, Parent PID=%s, queues=%s, idle_timeout_ms=%s, stop_on_failure=%s",
+         pid,
+         ppid,
+         queues,
+         args.idle_timeout_ms if args.idle_timeout_ms > 0 else "∞",
+         args.stop_on_failure,
+     )
+
+     job_runner(
+         queues,
+         idle_timeout_ms=args.idle_timeout_ms,
+         stop_on_failure=args.stop_on_failure,
+     )
+
+
+ if __name__ == "__main__":
+     main()
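
A minimal usage sketch for the runner above (the module is runnable via `python -m pypeline.job_runner`; the programmatic call assumes a broker has been configured by `pypeline.dramatiq` at import time, as the try/except at the top of the module attempts):

    # Run a one-shot worker from Python. Note: job_runner() hard-exits the
    # process (os._exit) once the first job finishes or the idle timeout fires.
    from pypeline.job_runner import job_runner

    job_runner(
        ["pipeline-queue"],      # queues to listen on
        idle_timeout_ms=30_000,  # give the first job 30s to arrive
        stop_on_failure=True,    # exit after the first job even if it fails
    )

    # Equivalent CLI invocation:
    #   python -m pypeline.job_runner -q pipeline-queue --idle-timeout-ms 30000 --stop-on-failure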
pypeline/pipeline_config_schema.py ADDED
@@ -0,0 +1,352 @@
+ """ Schemas for Pipelines
+ """
+
+ import yaml
+ from marshmallow import Schema, fields, EXCLUDE, validates_schema
+ from marshmallow.exceptions import ValidationError
+ from marshmallow.validate import OneOf
+
+ from pypeline.pipeline_settings_schema import PipelineSettingsSchema
+
+
+ class ExcludeUnknownSchema(Schema):
+     """Remove unknown keys from loaded dictionary"""
+
+     class Meta:
+         """Exclude unknown properties."""
+
+         unknown = EXCLUDE
+
+
+ class MetadataSchema(Schema):
+     """Schema for a pipeline's metadata object."""
+
+     queue = fields.String(
+         required=True,
+         description="Default queue for all pipeline tasks.",
+         example="default-queue-name",
+     )
+     maxRetry = fields.Integer(
+         required=False,
+         description="A number. Maximum number of retries before giving up. "
+         "A value of None means the task will retry forever. "
+         "By default, this option is set to 3.",
+         default=3,
+         example=3,
+     )
+
38
+ maxTtl = fields.Integer(
39
+ required=False,
40
+ description="The soft time limit, in seconds, "
41
+ "for this task. When not set the "
42
+ "workers default is used. The hard "
43
+ "time limit will be derived from this"
44
+ "field, by adding 10 seconds.",
45
+ default=60,
46
+ example=60,
47
+ )
48
+
49
+ retryBackoff = fields.Integer(
50
+ required=False,
51
+ description="A number. If this option is set , it is used as a delay"
52
+ " factor. For example, if this option is set to 3, the"
53
+ " first retry will delay 3 seconds, the second will delay"
54
+ " 6 seconds, the third will delay 12 seconds, the fourth"
55
+ " will delay 24 seconds, and so on. By default, this"
56
+ " option is set to False, and autoretries will not"
57
+ " be delayed.",
58
+ default=3,
59
+ example=3,
60
+ )
61
+
62
+ retryJitter = fields.Boolean(
63
+ required=False,
64
+ description="A boolean. Jitter is used to introduce randomness into "
65
+ "exponential backoff delays, to prevent all tasks in the "
66
+ "queue from being executed simultaneously. If this option "
67
+ "is set to True, the delay value calculated by "
68
+ "retry_backoff is treated as a maximum, and the actual "
69
+ "delay value will be a random number between zero and that "
70
+ "maximum. By default, this option is set to True.",
71
+ default=False,
72
+ example=True,
73
+ )
74
+
75
+ retryBackoffMax = fields.Integer(
76
+ required=False,
77
+ description="A boolean. Jitter is used to introduce randomness into "
78
+ "exponential backoff delays, to prevent all tasks in the "
79
+ "queue from being executed simultaneously. If this option "
80
+ "is set to True, the delay value calculated by "
81
+ "retry_backoff is treated as a maximum, and the actual "
82
+ "delay value will be a random number between zero and "
83
+ "that maximum. By default, this option is set to True.",
84
+ default=600,
85
+ example=600,
86
+ )
87
+
88
+ groupName = fields.String(
89
+ required=False,
90
+ metadata={
91
+ "description": "If two pipelines logically belong to a group the user can identify that two. "
92
+ "Imagine pipeline_a and pipeline_b both process data for images. "
93
+ 'Logically we could give them a mutual group name of "Image Processing Pipelines"'
94
+ },
95
+ )
+
+
+ class TaskDefinitionsSchemaV1(ExcludeUnknownSchema):
+     """Schema for a single task's configuration"""
+
+     handler = fields.String(
+         required=True,
+         description="Path to the worker task definition",
+         example="client.workers.my_task",
+     )
+
+     maxTtl = fields.Integer(
+         required=False,
+         description="Max TTL for a task in seconds.",
+         default=60,
+         example=60,
+     )
+
+     queue = fields.String(
+         required=False,
+         description="Non-default queue for this task.",
+         example="custom-queue-name",
+     )
+
+     serverType = fields.String(
+         required=False,
+         description="Recommended presets are listed in enum; custom strings are allowed.",
+         example="m",
+         metadata={"enum": ["xs", "s", "m", "l", "xl", "xxl", "xxxl", "cpu-xl"]},  # docs only
+     )
+
+
+ class TaskDefinitionsSchemaV2(ExcludeUnknownSchema):
+     """Schema for a single task's configuration"""
+
+     handlers = fields.List(
+         fields.String(
+             required=True,
+             description="Path to the worker task definition",
+             example="client.workers.my_task",
+         )
+     )
+     maxTtl = fields.Integer(
+         required=False,
+         description="Max TTL for a task in seconds.",
+         default=60,
+         example=60,
+     )
+
+     queue = fields.String(
+         required=False,
+         description="Non-default queue for this task.",
+         example="custom-queue-name",
+     )
+
+     serverType = fields.String(
+         required=False,
+         description="Recommended presets are listed in enum; custom strings are allowed.",
+         example="m",
+         metadata={"enum": ["xs", "s", "m", "l", "xl", "xxl", "xxxl", "cpu-xl"]},  # docs only
+     )
+
+
+ class PipelineConfigSchemaBase(Schema):
+     """Overall pipeline configuration schema"""
+
+     metadata = fields.Nested(
+         MetadataSchema,
+         required=True,
+         description="Metadata and configuration information for this pipeline.",
+     )
+     dagAdjacency = fields.Dict(
+         keys=fields.String(
+             required=True,
+             description="Task's node name. *MUST* match key in taskDefinitions dict.",
+             example="node_a",
+         ),
+         values=fields.List(
+             fields.String(
+                 required=True,
+                 description="Task's node name. *Must* match key in taskDefinitions dict.",
+             )
+         ),
+         required=True,
+         description="The DAG adjacency definition.",
+     )
+
+
+ class PipelineConfigSchemaV1(PipelineConfigSchemaBase):
+     """Overall pipeline configuration schema"""
+
+     taskDefinitions = fields.Dict(
+         keys=fields.String(
+             required=True,
+             description="Task's node name. *Must* match related key in dagAdjacency.",
+             example="node_a",
+         ),
+         values=fields.Nested(
+             TaskDefinitionsSchemaV1,
+             required=True,
+             description="Definition of each task in the pipeline.",
+             example={"handler": "abc.task", "maxRetry": 1},
+         ),
+         required=True,
+         description="Configuration for each node defined in DAG.",
+     )
+
+
+ class PipelineConfigSchemaV2(PipelineConfigSchemaBase):
+     """Overall pipeline configuration schema"""
+
+     taskDefinitions = fields.Dict(
+         keys=fields.String(
+             required=True,
+             description="Task's node name. *Must* match related key in dagAdjacency.",
+             example="node_a",
+         ),
+         values=fields.Nested(
+             TaskDefinitionsSchemaV2,
+             required=True,
+             description="Definition of each task in the pipeline.",
+             example={"handler": "abc.task", "maxRetry": 1},
+         ),
+         required=True,
+         description="Configuration for each node defined in DAG.",
+     )
+
+     settings = fields.Nested(
+         PipelineSettingsSchema,
+         required=False,
+         metadata={
+             "description": "Settings schema to validate the actual settings being passed through to the pipelines."
+         },
+     )
+
+
+ class BasePipelineSchema(ExcludeUnknownSchema):
+     __schema_version__ = None
+
+     name = fields.String(required=True, description="Pipeline name")
+     description = fields.String(
+         required=False,
+         missing=None,
+         description="Description of the pipeline.",
+         example="A valuable pipeline.",
+     )
+     schemaVersion = fields.Integer(required=True)
+     config = fields.Dict(required=True)
+
+     @classmethod
+     def get_by_version(cls, version):
+         for subclass in cls.__subclasses__():
+             if subclass.__schema_version__ == version:
+                 return subclass
+
+         return None
+
+     @classmethod
+     def get_latest(cls):
+         max_version = 0
+         max_class = None
+         for subclass in cls.__subclasses__():
+             if subclass.__schema_version__ > max_version:
+                 max_version = subclass.__schema_version__
+                 max_class = subclass
+
+         return max_class
+
+     @validates_schema
+     def validate_pipeline(self, data, **kwargs):
+         schema_version = data["schemaVersion"]
+         PipelineSchema = BasePipelineSchema.get_by_version(schema_version)
+         if PipelineSchema is None:
+             raise ValidationError(
+                 "Unsupported schemaVersion: {}".format(schema_version)
+             )
+         schema = PipelineSchema(exclude=["name", "description"])
+         schema.load(data)
+
+
+ class PipelineSchemaV2(BasePipelineSchema):
+     __schema_version__ = 2
+
+     class Meta:
+         unknown = EXCLUDE
+
+     config = fields.Nested(
+         PipelineConfigSchemaV2,
+         required=True,
+         description="Metadata and configuration information for this pipeline.",
+     )
+
+     def validate_pipeline(self, data, **kwargs):
+         # Override the base class hook to avoid infinite recursion, since
+         # BasePipelineSchema.validate_pipeline loads this schema itself.
+         pass
+
+
+ class PipelineSchemaV1(BasePipelineSchema):
+     __schema_version__ = 1
+
+     class Meta:
+         unknown = EXCLUDE
+
+     config = fields.Nested(
+         PipelineConfigSchemaV1,
+         required=True,
+         description="Metadata and configuration information for this pipeline.",
+     )
+
+     def validate_pipeline(self, data, **kwargs):
+         # Override the base class hook to avoid infinite recursion, since
+         # BasePipelineSchema.validate_pipeline loads this schema itself.
+         pass
+
+
+ class PipelineConfigValidator(object):
+     """Validate a pipeline configuration.
+
+     The configuration is stored as a string in the database under
+     `PipelineConfig.config` in order to keep it easy for custom features
+     to be added over time. This model represents the required / valid
+     features so we can programmatically validate when saving, updating,
+     and viewing.
+     """
+
+     def __init__(
+         self,
+         config_dict: dict = None,
+         config_yaml: str = None,
+         schema_version: int = None,
+     ):
+         super().__init__()
+
+         # We validate this as a dictionary. Turn into a dictionary if
+         # provided as YAML.
+         if config_dict is not None:
+             self.config = config_dict
+         elif config_yaml is not None:
+             self.config = yaml.safe_load(config_yaml)
+
+         if schema_version is None:
+             PipelineSchema = BasePipelineSchema.get_latest()
+         else:
+             PipelineSchema = BasePipelineSchema.get_by_version(schema_version)
+
+         self.is_valid = False
+         self.validated_config = {}
+         self.validation_errors = {}
+         try:
+             # https://github.com/marshmallow-code/marshmallow/issues/377
+             # See issue above when migrating to marshmallow 3
+             pcs = PipelineSchema._declared_fields["config"].schema
+             self.validated_config = pcs.load(self.config)
+             self.is_valid = True
+         except ValidationError as e:
+             self.validation_errors = e.messages
+             raise e
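
A minimal sketch of validating a config against the v1 schema above (the node names and handler paths are placeholders). Note that `PipelineConfigValidator` pulls the nested "config" schema off the versioned pipeline schema, so only the `config` portion of a pipeline (metadata, dagAdjacency, taskDefinitions) is passed in, and invalid configs raise `marshmallow.ValidationError` with details kept on `validation_errors`:

    from pypeline.pipeline_config_schema import PipelineConfigValidator

    config = {
        "metadata": {"queue": "pipeline-queue", "maxRetry": 3},
        "dagAdjacency": {"node_a": ["node_b"], "node_b": []},
        "taskDefinitions": {
            "node_a": {"handler": "client.workers.task_a"},
            "node_b": {"handler": "client.workers.task_b"},
        },
    }

    validator = PipelineConfigValidator(config_dict=config, schema_version=1)
    assert validator.is_valid
    print(validator.validated_config)  # the deserialized, schema-checked config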