scalable-pypeline 2.1.31__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. pypeline/__init__.py +1 -0
  2. pypeline/barrier.py +63 -0
  3. pypeline/constants.py +94 -0
  4. pypeline/dramatiq.py +455 -0
  5. pypeline/executable_job_config_schema.py +35 -0
  6. pypeline/extensions.py +17 -0
  7. pypeline/flask/__init__.py +16 -0
  8. pypeline/flask/api/__init__.py +0 -0
  9. pypeline/flask/api/pipelines.py +275 -0
  10. pypeline/flask/api/schedules.py +40 -0
  11. pypeline/flask/decorators.py +41 -0
  12. pypeline/flask/flask_pypeline.py +156 -0
  13. pypeline/job_runner.py +205 -0
  14. pypeline/pipeline_config_schema.py +352 -0
  15. pypeline/pipeline_settings_schema.py +561 -0
  16. pypeline/pipelines/__init__.py +0 -0
  17. pypeline/pipelines/composition/__init__.py +0 -0
  18. pypeline/pipelines/composition/parallel_pipeline_composition.py +375 -0
  19. pypeline/pipelines/composition/pypeline_composition.py +215 -0
  20. pypeline/pipelines/factory.py +86 -0
  21. pypeline/pipelines/middleware/__init__.py +0 -0
  22. pypeline/pipelines/middleware/get_active_worker_id_middleware.py +22 -0
  23. pypeline/pipelines/middleware/graceful_shutdown_middleware.py +50 -0
  24. pypeline/pipelines/middleware/parallel_pipeline_middleware.py +60 -0
  25. pypeline/pipelines/middleware/pypeline_middleware.py +202 -0
  26. pypeline/pypeline_yaml.py +468 -0
  27. pypeline/schedule_config_schema.py +125 -0
  28. pypeline/utils/__init__.py +0 -0
  29. pypeline/utils/config_utils.py +81 -0
  30. pypeline/utils/dramatiq_utils.py +134 -0
  31. pypeline/utils/executable_job_util.py +35 -0
  32. pypeline/utils/graceful_shutdown_util.py +39 -0
  33. pypeline/utils/module_utils.py +108 -0
  34. pypeline/utils/pipeline_utils.py +144 -0
  35. pypeline/utils/schema_utils.py +24 -0
  36. scalable_pypeline-2.1.31.dist-info/LICENSE +177 -0
  37. scalable_pypeline-2.1.31.dist-info/METADATA +212 -0
  38. scalable_pypeline-2.1.31.dist-info/RECORD +42 -0
  39. scalable_pypeline-2.1.31.dist-info/WHEEL +6 -0
  40. scalable_pypeline-2.1.31.dist-info/entry_points.txt +6 -0
  41. scalable_pypeline-2.1.31.dist-info/top_level.txt +2 -0
  42. tests/fixtures/__init__.py +0 -0
pypeline/job_runner.py ADDED
@@ -0,0 +1,205 @@
+ import os
+ import logging
+ import argparse
+ import threading
+ import multiprocessing as mp
+
+ # Prefer 'spawn' for user code using multiprocessing
+ if mp.get_start_method(allow_none=True) != "spawn":
+     mp.set_start_method("spawn", force=True)
+
+ # Avoid staging more than one message; must be set before Dramatiq import path runs
+ os.environ.setdefault("dramatiq_queue_prefetch", "1")
+
+ from dramatiq import Worker, get_broker, set_broker
+ from dramatiq.middleware import Middleware
+
+
+ try:
+     # If your project exposes a helper to configure the default broker, use it.
+     from pypeline.dramatiq import configure_default_broker  # adjust import if needed
+
+     broker = configure_default_broker() or get_broker()
+     set_broker(broker)
+ except Exception:
+     # Fall back to whatever Dramatiq has as the active broker.
+     import pypeline.dramatiq  # noqa: F401 (ensure module side-effects run)
+
+     broker = get_broker()
+
+
+ class OneAndDone(Middleware):
+     """
+     Signals when the first message starts ('got_work') and completes ('done').
+     If stop_on_failure=True, we'll also mark done after the first failure.
+     """
+
+     def __init__(
+         self,
+         got_work: threading.Event,
+         done: threading.Event,
+         *,
+         stop_on_failure: bool = False
+     ):
+         self.got_work = got_work
+         self.done = done
+         self.stop_on_failure = stop_on_failure
+
+     def before_process_message(self, broker, message):
+         # First time we see a message begin processing in this process
+         if not self.got_work.is_set():
+             self.got_work.set()
+
+     def after_process_message(self, broker, message, *, result=None, exception=None):
+         # On success (or also on failure if configured), finish this worker
+         if exception is None or self.stop_on_failure:
+             if not self.done.is_set():
+                 self.done.set()
+
+
+ def _graceful_stop(worker: Worker, log: logging.Logger):
+     try:
+         log.info("Stopping dramatiq worker...")
+         worker.stop()  # stop consumers; no new messages will start
+         worker.join()
+         log.info("Worker stopped.")
+     except Exception as e:
+         log.exception("Error stopping worker: %s", e)
+
+
+ def _close_broker(log: logging.Logger):
+     try:
+         b = get_broker()
+         if b is not None and hasattr(b, "close"):
+             b.close()
+             log.info("Broker closed.")
+     except Exception as e:
+         log.exception("Error closing broker: %s", e)
+
+
+ def job_runner(queues, idle_timeout_ms: int = 0, *, stop_on_failure: bool = False):
+     """
+     Start a single-thread Dramatiq worker. Behavior:
+       - Wait up to `idle_timeout_ms` for *a job to start* (time-to-first-job).
+       - Once a job begins, wait indefinitely for it to complete.
+       - After the first successful job completes (or first job, if stop_on_failure=True), stop and exit.
+
+     Args:
+         queues (list[str]): queues to listen to
+         idle_timeout_ms (int): <=0 => wait forever for first job; >0 => exit if no job starts in time
+         stop_on_failure (bool): if True, exit after first job even if it fails
+     """
+     logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
+     log = logging.getLogger("oneshot")
+
+     # Normalize timeout (treat non-positive as "infinite")
+     timeout_ms = (
+         int(idle_timeout_ms) if idle_timeout_ms and int(idle_timeout_ms) > 0 else 0
+     )
+     log.info(
+         "Launching worker with queues=%s, idle_timeout_ms=%s", queues, timeout_ms or "∞"
+     )
+
+     got_work = threading.Event()
+     done = threading.Event()
+     broker.add_middleware(OneAndDone(got_work, done, stop_on_failure=stop_on_failure))
+
+     worker = Worker(
+         broker,
+         worker_threads=1,  # strictly one at a time
+         queues=queues,
+         worker_timeout=1000,  # ms; how often the worker checks for stop
+     )
+
+     worker.start()
+
+     def controller():
+         log.debug("Controller thread started.")
+         try:
+             # Phase 1: Wait for *first job to start*
+             if timeout_ms > 0:
+                 started = got_work.wait(timeout_ms / 1000.0)
+                 if not started:
+                     log.info(
+                         "Idle timeout reached (%d ms); no jobs started. Stopping worker.",
+                         timeout_ms,
+                     )
+                     return
+             else:
+                 got_work.wait()
+
+             log.info("First job started; waiting for it to finish...")
+             # Phase 2: Wait for the first job to complete (no timeout)
+             done.wait()
+             log.info("First job finished; shutting down.")
+         finally:
+             _graceful_stop(worker, log)
+             _close_broker(log)
+             # Hard-exit to ensure K8s Job is marked Succeeded promptly, no lingering threads.
+             os._exit(0)
+
+     t = threading.Thread(target=controller, name="oneshot-controller", daemon=False)
+     t.start()
+     t.join()  # Block until controller completes (which shuts everything down)
+
+
+ def _parse_args(argv=None):
+     ap = argparse.ArgumentParser(description="Run a one-shot Dramatiq worker.")
+     ap.add_argument(
+         "-q",
+         "--queue",
+         action="append",
+         default=None,
+         help="Queue to listen to (repeatable). You can also pass a comma-separated list.",
+     )
+     ap.add_argument(
+         "--idle-timeout-ms",
+         type=int,
+         default=int(os.getenv("IDLE_TIMEOUT_MS", "0")),
+         help="Exit if no job starts within this time (<=0 = wait forever).",
+     )
+     ap.add_argument(
+         "--stop-on-failure",
+         action="store_true",
+         help="Exit after the first job even if it fails.",
+     )
+     return ap.parse_args(argv)
+
+
+ def main(argv=None):
+     args = _parse_args(argv)
+
+     # Build queue list from flags or env, support comma-separated entries.
+     raw_entries = (
+         args.queue if args.queue else [os.getenv("JOB_QUEUE", "pipeline-queue")]
+     )
+     queues = []
+     for entry in raw_entries:
+         queues.extend([q.strip() for q in str(entry).split(",") if q and q.strip()])
+
+     if not queues:
+         raise SystemExit("No queues provided. Use -q ... or set JOB_QUEUE.")
+
+     logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
+     log = logging.getLogger("oneshot")
+
+     pid = os.getpid()
+     ppid = os.getppid()
+     log.info(
+         "Starting one-shot worker PID=%s, Parent PID=%s, queues=%s, idle_timeout_ms=%s, stop_on_failure=%s",
+         pid,
+         ppid,
+         queues,
+         args.idle_timeout_ms if args.idle_timeout_ms > 0 else "∞",
+         args.stop_on_failure,
+     )
+
+     job_runner(
+         queues,
+         idle_timeout_ms=args.idle_timeout_ms,
+         stop_on_failure=args.stop_on_failure,
+     )
+
+
+ if __name__ == "__main__":
+     main()
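
A minimal usage sketch for the runner above (the module is runnable via `python -m pypeline.job_runner`; the programmatic call assumes a broker has been configured by `pypeline.dramatiq` at import time, as the try/except at the top of the module attempts):

    # Run a one-shot worker from Python. Note: job_runner() hard-exits the
    # process (os._exit) once the first job finishes or the idle timeout fires.
    from pypeline.job_runner import job_runner

    job_runner(
        ["pipeline-queue"],      # queues to listen on
        idle_timeout_ms=30_000,  # give the first job 30s to arrive
        stop_on_failure=True,    # exit after the first job even if it fails
    )

    # Equivalent CLI invocation:
    #   python -m pypeline.job_runner -q pipeline-queue --idle-timeout-ms 30000 --stop-on-failure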
pypeline/pipeline_config_schema.py ADDED
@@ -0,0 +1,352 @@
+ """ Schemas for Pipelines
+ """
+
+ import yaml
+ from marshmallow import Schema, fields, EXCLUDE, validates_schema
+ from marshmallow.exceptions import ValidationError
+ from marshmallow.validate import OneOf
+
+ from pypeline.pipeline_settings_schema import PipelineSettingsSchema
+
+
+ class ExcludeUnknownSchema(Schema):
+     """Remove unknown keys from loaded dictionary"""
+
+     class Meta:
+         """Exclude unknown properties."""
+
+         unknown = EXCLUDE
+
+
+ class MetadataSchema(Schema):
+     """Schema for a pipeline's metadata object."""
+
+     queue = fields.String(
+         required=True,
+         description="Default queue for all pipeline tasks.",
+         example="default-queue-name",
+     )
+     maxRetry = fields.Integer(
+         required=False,
+         description="A number. Maximum number of retries before giving up. "
+         "A value of None means the task will retry forever. "
+         "By default, this option is set to 3.",
+         default=3,
+         example=3,
+     )
+
38
+ maxTtl = fields.Integer(
39
+ required=False,
40
+ description="The soft time limit, in seconds, "
41
+ "for this task. When not set the "
42
+ "workers default is used. The hard "
43
+ "time limit will be derived from this"
44
+ "field, by adding 10 seconds.",
45
+ default=60,
46
+ example=60,
47
+ )
48
+
49
+ retryBackoff = fields.Integer(
50
+ required=False,
51
+ description="A number. If this option is set , it is used as a delay"
52
+ " factor. For example, if this option is set to 3, the"
53
+ " first retry will delay 3 seconds, the second will delay"
54
+ " 6 seconds, the third will delay 12 seconds, the fourth"
55
+ " will delay 24 seconds, and so on. By default, this"
56
+ " option is set to False, and autoretries will not"
57
+ " be delayed.",
58
+ default=3,
59
+ example=3,
60
+ )
61
+
62
+ retryJitter = fields.Boolean(
63
+ required=False,
64
+ description="A boolean. Jitter is used to introduce randomness into "
65
+ "exponential backoff delays, to prevent all tasks in the "
66
+ "queue from being executed simultaneously. If this option "
67
+ "is set to True, the delay value calculated by "
68
+ "retry_backoff is treated as a maximum, and the actual "
69
+ "delay value will be a random number between zero and that "
70
+ "maximum. By default, this option is set to True.",
71
+ default=False,
72
+ example=True,
73
+ )
74
+
75
+ retryBackoffMax = fields.Integer(
76
+ required=False,
77
+ description="A boolean. Jitter is used to introduce randomness into "
78
+ "exponential backoff delays, to prevent all tasks in the "
79
+ "queue from being executed simultaneously. If this option "
80
+ "is set to True, the delay value calculated by "
81
+ "retry_backoff is treated as a maximum, and the actual "
82
+ "delay value will be a random number between zero and "
83
+ "that maximum. By default, this option is set to True.",
84
+ default=600,
85
+ example=600,
86
+ )
87
+
88
+ groupName = fields.String(
89
+ required=False,
90
+ metadata={
91
+ "description": "If two pipelines logically belong to a group the user can identify that two. "
92
+ "Imagine pipeline_a and pipeline_b both process data for images. "
93
+ 'Logically we could give them a mutual group name of "Image Processing Pipelines"'
94
+ },
95
+ )
+
+
+ class TaskDefinitionsSchemaV1(ExcludeUnknownSchema):
+     """Schema for a single task's configuration"""
+
+     handler = fields.String(
+         required=True,
+         description="Path to the worker task definition",
+         example="client.workers.my_task",
+     )
+
+     maxTtl = fields.Integer(
+         required=False,
+         description="Max TTL for a task in seconds.",
+         default=60,
+         example=60,
+     )
+
+     queue = fields.String(
+         required=False,
+         description="Non-default queue for this task.",
+         example="custom-queue-name",
+     )
+
+     serverType = fields.String(
+         required=False,
+         description="Recommended presets are listed in enum; custom strings are allowed.",
+         example="m",
+         metadata={"enum": ["xs", "s", "m", "l", "xl", "xxl", "xxxl", "cpu-xl"]},  # docs only
+     )
+
+
+ class TaskDefinitionsSchemaV2(ExcludeUnknownSchema):
+     """Schema for a single task's configuration"""
+
+     handlers = fields.List(
+         fields.String(
+             required=True,
+             description="Path to the worker task definition",
+             example="client.workers.my_task",
+         )
+     )
+     maxTtl = fields.Integer(
+         required=False,
+         description="Max TTL for a task in seconds.",
+         default=60,
+         example=60,
+     )
+
+     queue = fields.String(
+         required=False,
+         description="Non-default queue for this task.",
+         example="custom-queue-name",
+     )
+
+     serverType = fields.String(
+         required=False,
+         description="Recommended presets are listed in enum; custom strings are allowed.",
+         example="m",
+         metadata={"enum": ["xs", "s", "m", "l", "xl", "xxl", "xxxl", "cpu-xl"]},  # docs only
+     )
+
+
+ class PipelineConfigSchemaBase(Schema):
+     """Overall pipeline configuration schema"""
+
+     metadata = fields.Nested(
+         MetadataSchema,
+         required=True,
+         description="Metadata and configuration information for this pipeline.",
+     )
+     dagAdjacency = fields.Dict(
+         keys=fields.String(
+             required=True,
+             description="Task's node name. *MUST* match key in taskDefinitions dict.",
+             example="node_a",
+         ),
+         values=fields.List(
+             fields.String(
+                 required=True,
+                 description="Task's node name. *Must* match key in taskDefinitions dict.",
+             )
+         ),
+         required=True,
+         description="The DAG adjacency definition.",
+     )
+
+
+ class PipelineConfigSchemaV1(PipelineConfigSchemaBase):
+     """Overall pipeline configuration schema"""
+
+     taskDefinitions = fields.Dict(
+         keys=fields.String(
+             required=True,
+             description="Task's node name. *Must* match related key in dagAdjacency.",
+             example="node_a",
+         ),
+         values=fields.Nested(
+             TaskDefinitionsSchemaV1,
+             required=True,
+             description="Definition of each task in the pipeline.",
+             example={"handler": "abc.task", "maxRetry": 1},
+         ),
+         required=True,
+         description="Configuration for each node defined in DAG.",
+     )
+
+
+ class PipelineConfigSchemaV2(PipelineConfigSchemaBase):
+     """Overall pipeline configuration schema"""
+
+     taskDefinitions = fields.Dict(
+         keys=fields.String(
+             required=True,
+             description="Task's node name. *Must* match related key in dagAdjacency.",
+             example="node_a",
+         ),
+         values=fields.Nested(
+             TaskDefinitionsSchemaV2,
+             required=True,
+             description="Definition of each task in the pipeline.",
+             example={"handler": "abc.task", "maxRetry": 1},
+         ),
+         required=True,
+         description="Configuration for each node defined in DAG.",
+     )
+
+     settings = fields.Nested(
+         PipelineSettingsSchema,
+         required=False,
+         metadata={
+             "description": "Settings schema to validate the actual settings being passed through to the pipelines."
+         },
+     )
+
+
+ class BasePipelineSchema(ExcludeUnknownSchema):
+     __schema_version__ = None
+
+     name = fields.String(required=True, description="Pipeline name")
+     description = fields.String(
+         required=False,
+         missing=None,
+         description="Description of the pipeline.",
+         example="A valuable pipeline.",
+     )
+     schemaVersion = fields.Integer(required=True)
+     config = fields.Dict(required=True)
+
+     @classmethod
+     def get_by_version(cls, version):
+         for subclass in cls.__subclasses__():
+             if subclass.__schema_version__ == version:
+                 return subclass
+
+         return None
+
+     @classmethod
+     def get_latest(cls):
+         max_version = 0
+         max_class = None
+         for subclass in cls.__subclasses__():
+             if subclass.__schema_version__ > max_version:
+                 max_version = subclass.__schema_version__
+                 max_class = subclass
+
+         return max_class
+
+     @validates_schema
+     def validate_pipeline(self, data, **kwargs):
+         schema_version = data["schemaVersion"]
+         PipelineSchema = BasePipelineSchema.get_by_version(schema_version)
+         if PipelineSchema is None:
+             raise ValidationError(
+                 "Unsupported schemaVersion: {}".format(schema_version)
+             )
+         schema = PipelineSchema(exclude=["name", "description"])
+         schema.load(data)
+
+
+ class PipelineSchemaV2(BasePipelineSchema):
+     __schema_version__ = 2
+
+     class Meta:
+         unknown = EXCLUDE
+
+     config = fields.Nested(
+         PipelineConfigSchemaV2,
+         required=True,
+         description="Metadata and configuration information for this pipeline.",
+     )
+
+     def validate_pipeline(self, data, **kwargs):
+         # Override the base class hook to avoid infinite recursion, since
+         # BasePipelineSchema.validate_pipeline loads this schema itself.
+         pass
+
+
+ class PipelineSchemaV1(BasePipelineSchema):
+     __schema_version__ = 1
+
+     class Meta:
+         unknown = EXCLUDE
+
+     config = fields.Nested(
+         PipelineConfigSchemaV1,
+         required=True,
+         description="Metadata and configuration information for this pipeline.",
+     )
+
+     def validate_pipeline(self, data, **kwargs):
+         # Override the base class hook to avoid infinite recursion, since
+         # BasePipelineSchema.validate_pipeline loads this schema itself.
+         pass
+
+
+ class PipelineConfigValidator(object):
+     """Validate a pipeline configuration.
+
+     The configuration is stored as a string in the database under
+     `PipelineConfig.config` in order to keep it easy for custom features
+     to be added over time. This model represents the required / valid
+     features so we can programmatically validate when saving, updating,
+     and viewing.
+     """
+
+     def __init__(
+         self,
+         config_dict: dict = None,
+         config_yaml: str = None,
+         schema_version: int = None,
+     ):
+         super().__init__()
+
+         # We validate this as a dictionary. Turn into a dictionary if
+         # provided as YAML.
+         if config_dict is not None:
+             self.config = config_dict
+         elif config_yaml is not None:
+             self.config = yaml.safe_load(config_yaml)
+
+         if schema_version is None:
+             PipelineSchema = BasePipelineSchema.get_latest()
+         else:
+             PipelineSchema = BasePipelineSchema.get_by_version(schema_version)
+
+         self.is_valid = False
+         self.validated_config = {}
+         self.validation_errors = {}
+         try:
+             # https://github.com/marshmallow-code/marshmallow/issues/377
+             # See issue above when migrating to marshmallow 3
+             pcs = PipelineSchema._declared_fields["config"].schema
+             self.validated_config = pcs.load(self.config)
+             self.is_valid = True
+         except ValidationError as e:
+             self.validation_errors = e.messages
+             raise e
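
A minimal sketch of validating a config against the v1 schema above (the node names and handler paths are placeholders). Note that `PipelineConfigValidator` pulls the nested "config" schema off the versioned pipeline schema, so only the `config` portion of a pipeline (metadata, dagAdjacency, taskDefinitions) is passed in, and invalid configs raise `marshmallow.ValidationError` with details kept on `validation_errors`:

    from pypeline.pipeline_config_schema import PipelineConfigValidator

    config = {
        "metadata": {"queue": "pipeline-queue", "maxRetry": 3},
        "dagAdjacency": {"node_a": ["node_b"], "node_b": []},
        "taskDefinitions": {
            "node_a": {"handler": "client.workers.task_a"},
            "node_b": {"handler": "client.workers.task_b"},
        },
    }

    validator = PipelineConfigValidator(config_dict=config, schema_version=1)
    assert validator.is_valid
    print(validator.validated_config)  # the deserialized, schema-checked config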