scalable-pypeline 2.0.10__py2.py3-none-any.whl → 2.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pypeline/__init__.py +1 -1
- pypeline/barrier.py +3 -0
- pypeline/dramatiq.py +26 -154
- pypeline/flask/api/pipelines.py +60 -4
- pypeline/flask/api/schedules.py +1 -3
- pypeline/pipeline_config_schema.py +91 -3
- pypeline/pipeline_settings_schema.py +334 -0
- pypeline/pipelines/__init__.py +0 -0
- pypeline/pipelines/composition/__init__.py +0 -0
- pypeline/pipelines/composition/pypeline_composition.py +188 -0
- pypeline/pipelines/factory.py +107 -0
- pypeline/pipelines/middleware/__init__.py +0 -0
- pypeline/pipelines/middleware/pypeline_middleware.py +188 -0
- pypeline/utils/dramatiq_utils.py +126 -0
- pypeline/utils/module_utils.py +27 -2
- pypeline/utils/pipeline_utils.py +22 -37
- pypeline/utils/schema_utils.py +24 -0
- {scalable_pypeline-2.0.10.dist-info → scalable_pypeline-2.1.0.dist-info}/METADATA +1 -1
- scalable_pypeline-2.1.0.dist-info/RECORD +36 -0
- scalable_pypeline-2.0.10.dist-info/RECORD +0 -27
- /pypeline/{composition.py → pipelines/composition/parallel_pipeline_composition.py} +0 -0
- /pypeline/{middleware.py → pipelines/middleware/parallel_pipeline_middleware.py} +0 -0
- {scalable_pypeline-2.0.10.dist-info → scalable_pypeline-2.1.0.dist-info}/LICENSE +0 -0
- {scalable_pypeline-2.0.10.dist-info → scalable_pypeline-2.1.0.dist-info}/WHEEL +0 -0
- {scalable_pypeline-2.0.10.dist-info → scalable_pypeline-2.1.0.dist-info}/entry_points.txt +0 -0
- {scalable_pypeline-2.0.10.dist-info → scalable_pypeline-2.1.0.dist-info}/top_level.txt +0 -0
pypeline/pipeline_settings_schema.py
@@ -0,0 +1,334 @@
from marshmallow import Schema, fields, validate, ValidationError, validates_schema


class MissingSettingsException(Exception):
    pass


def create_pipeline_settings_schema(pipeline_settings_schema_data):
    """
    Dynamically create a schema to validate user data based on settings.

    Args:
        pipeline_settings_schema_data (dict): The settings schema data containing
            field configurations.

    Returns:
        Schema: A dynamically created schema class for validating user data.
    """

    # Dictionary to store dynamically generated fields
    schema_fields = {}

    for key, config in pipeline_settings_schema_data["properties"].items():
        data_type = config.get("dataType")
        input_type = config.get("inputType")
        field_args = {}

        # Map dataType to Marshmallow field type
        field_type = {
            "string": fields.String,
            "int": fields.Integer,
            "float": fields.Float,
            "boolean": fields.Boolean,
            "datetime": fields.DateTime,
        }.get(data_type)

        if not field_type:
            raise ValidationError(f"Unsupported dataType `{data_type}` for `{key}`.")

        # Handle range validation for numeric fields
        if data_type in ["int", "float"]:
            if "minimum" in config or "maximum" in config:
                field_args["validate"] = validate.Range(
                    min=config.get("minimum"), max=config.get("maximum")
                )

        # Handle dropdown or radio input options
        if input_type in ["dropdown", "radio"] and "options" in config:
            allowed_values = [option["value"] for option in config["options"]]
            field_args["validate"] = validate.OneOf(allowed_values)

        # Mark the field as required if specified
        if key in pipeline_settings_schema_data.get("required", []):
            field_args["required"] = True

        # Create the field and add to the schema fields dictionary
        schema_fields[key] = field_type(**field_args)

    # Dynamically create a schema class with the generated fields
    DynamicPipelineSettingsSchema = type(
        "DynamicPipelineSettingsSchema", (Schema,), schema_fields
    )

    return DynamicPipelineSettingsSchema()


class OptionSchema(Schema):
    label = fields.String(
        required=True,
        metadata={"description": "The display label for the option"},
    )
    value = fields.Raw(
        required=True,
        metadata={"description": "The value corresponding to the option"},
    )


def validate_min_max(data):
    """Custom validator to ensure min/max match the dataType."""
    data_type = data.get("dataType")
    minimum = data.get("minimum")
    maximum = data.get("maximum")

    if data_type in ["int", "float"]:
        if minimum is not None and not isinstance(
            minimum, (int if data_type == "int" else float)
        ):
            raise ValidationError(f"`minimum` must be of type {data_type}.")
        if maximum is not None and not isinstance(
            maximum, (int if data_type == "int" else float)
        ):
            raise ValidationError(f"`maximum` must be of type {data_type}.")
        if minimum is not None and maximum is not None and minimum > maximum:
            raise ValidationError("`minimum` must be less than or equal to `maximum`.")
    elif data_type not in ["int", "float"] and (
        minimum is not None or maximum is not None
    ):
        raise ValidationError(
            "`minimum` and `maximum` are only valid for numeric types (`int`, `float`)."
        )


class SettingSchema(Schema):
    dataType = fields.String(
        required=True,
        validate=validate.OneOf(["string", "int", "float", "boolean", "datetime"]),
        metadata={"description": "The underlying data type of the setting"},
    )
    inputType = fields.String(
        required=True,
        validate=validate.OneOf(
            ["text", "dropdown", "radio", "checkbox", "searchable"]
        ),
        metadata={"description": "The type of input UI element"},
    )
    label = fields.String(
        required=True,
        metadata={"description": "The display label for the field"},
    )
    placeholder = fields.String(
        metadata={"description": "Placeholder text for text input fields"}
    )
    minimum = fields.Raw(
        metadata={"description": "Minimum value for numeric data types"}
    )
    maximum = fields.Raw(
        metadata={"description": "Maximum value for numeric data types"}
    )
    options = fields.List(
        fields.Nested(OptionSchema),
        metadata={"description": "Options for dropdown or radio input types"},
    )
    searchEndpoint = fields.String(
        metadata={"description": "Endpoint for searchable fields"}
    )

    class Meta:
        ordered = True

    @validates_schema
    def validate_min_max(self, data, **kwargs):
        validate_min_max(data)

    @validates_schema
    def validate_options(self, data, **kwargs):
        """Ensure options are provided for dropdown or radio input types and validate value types."""
        input_type = data.get("inputType")
        options = data.get("options")
        data_type = data.get("dataType")

        if input_type in ["dropdown", "radio"]:
            if not options:
                raise ValidationError(
                    "`options` are required for dropdown and radio input types.",
                    field_name="options",
                )

            for option in options:
                value = option.get("value")
                if data_type == "int" and not isinstance(value, int):
                    raise ValidationError(
                        f"Option value `{value}` must be of type `int`."
                    )
                elif data_type == "float" and not isinstance(value, float):
                    raise ValidationError(
                        f"Option value `{value}` must be of type `float`."
                    )
                elif data_type == "boolean" and not isinstance(value, bool):
                    raise ValidationError(
                        f"Option value `{value}` must be of type `boolean`."
                    )
                elif data_type == "string" and not isinstance(value, str):
                    raise ValidationError(
                        f"Option value `{value}` must be of type `string`."
                    )
                elif data_type == "datetime" and not isinstance(
                    value, str
                ):  # Assuming ISO 8601 strings
                    raise ValidationError(
                        f"Option value `{value}` must be an ISO 8601 string for `datetime`."
                    )

    @validates_schema
    def validate_search_endpoint(self, data, **kwargs):
        """Ensure searchEndpoint is provided only for 'searchable' input types."""
        input_type = data.get("inputType")
        search_endpoint = data.get("searchEndpoint")

        if input_type == "searchable" and not search_endpoint:
            raise ValidationError(
                "`searchEndpoint` is required for `searchable` input types.",
                field_name="searchEndpoint",
            )
        elif input_type != "searchable" and search_endpoint:
            raise ValidationError(
                "`searchEndpoint` is not allowed for non-searchable input types.",
                field_name="searchEndpoint",
            )


class PipelineSettingsSchema(Schema):
    properties = fields.Dict(
        keys=fields.String(),
        values=fields.Nested(SettingSchema),
        required=True,
        metadata={"description": "A dictionary of settings with their configurations"},
    )
    required = fields.List(
        fields.String(), required=True, description="List of required settings"
    )
    scenarioSettings = fields.List(
        fields.String(),
        required=False,
        description="List of settings that can be overriding for different pipeline scenarios.",
    )

    @validates_schema
    def validate_scenario_settings(self, data, **kwargs):
        """Ensure scenarioSettings only contains keys defined in properties."""
        properties = data.get("properties", {})
        scenario_settings = data.get("scenarioSettings", [])

        invalid_settings = [
            setting for setting in scenario_settings if setting not in properties
        ]
        if invalid_settings:
            raise ValidationError(
                {
                    "scenario_settings": (
                        f"The following settings in scenarioSettings are not defined "
                        f"in properties: {', '.join(invalid_settings)}"
                    )
                }
            )


class PipelineScenarioSchema(Schema):
    settings = fields.Dict(
        required=True,
        metadata={
            "description": "Settings to be used for a given scenario. Should match the pypeline.yaml settings schema"
        },
    )
    taskReplacements = fields.Dict(
        keys=fields.String(),
        values=fields.Integer(),
        required=False,
        metadata={
            "description": "Tasks that should be replaced in a given scenario. "
            "The key corresponds to the task definition in the pypeline.yaml and the value corresponds "
            "to the index of the task handlers where 0 is the default and first task. Eg: {'a': 1}. In this case "
            "if we have a task definition 'a' with 3 handlers fn_1, fn_2, fn_3 respectively then the handler to run "
            "for 'a' is fn_2."
        },
    )

    taskReruns = fields.List(
        fields.String(),
        required=False,
        metadata={
            "description": "List of task definitions that need to be run again for a given scenario. Here "
            "the scenario's pipeline settings will be injected in the task being run again which could be used to "
            "produce alternative calculations and or results."
        },
    )


class PipelineScenariosSchema(Schema):
    required = fields.List(
        fields.Nested(PipelineScenarioSchema),
        metadata={"description": "List of scenarios to run for a given pipeline"},
    )


# Example usage
if __name__ == "__main__":
    pipeline_settings = {"param1": "test", "param2": 1}

    yaml_data = {
        "properties": {
            "param1": {
                "dataType": "string",
                "inputType": "text",
                "label": "Parameter 1",
                "placeholder": "Enter a string",
            },
            "param2": {
                "dataType": "int",
                "inputType": "text",
                "label": "Parameter 2",
                "minimum": 1,
                "maximum": -1,
            },
            "param3": {
                "dataType": "boolean",
                "inputType": "checkbox",
                "label": "Enable Feature",
            },
            "param4": {
                "dataType": "float",
                "inputType": "dropdown",
                "label": "Choose an Option",
                "minimum": 0.5,
                "maximum": 2.5,
                "options": [
                    {"label": "Option 1", "value": 0.5},
                    {"label": "Option 2", "value": 1.5},
                ],
            },
            "param5": {
                "dataType": "int",
                "inputType": "radio",
                "label": "Select a Mode",
                "options": [
                    {"label": "Mode A", "value": 1},
                    {"label": "Mode B", "value": 2},
                ],
            },
            "param6": {
                "dataType": "string",
                "inputType": "searchable",
                "label": "Select Pipeline",
                "searchEndpoint": "/api/pipelines",
            },
        },
        "required": ["param1", "param2", "param4"],
    }

    schema = PipelineSettingsSchema()
    errors = schema.validate(yaml_data)
    if errors:
        print("Validation errors:", errors)
    else:
        print("Validation successful!")
pypeline/pipelines/composition/pypeline_composition.py
@@ -0,0 +1,188 @@
import json
import typing
from copy import copy
from uuid import uuid4

import networkx as nx
from dramatiq import get_broker

from pypeline.barrier import LockingParallelBarrier
from pypeline.constants import REDIS_URL, PARALLEL_PIPELINE_CALLBACK_BARRIER_TTL
from pypeline.utils.dramatiq_utils import register_lazy_actor
from pypeline.utils.module_utils import get_callable
from pypeline.utils.pipeline_utils import get_execution_graph


class Pypeline:
    def __init__(
        self,
        pipeline: dict,
        pipeline_settings: dict = None,
        task_replacements: dict = {},
        scenarios: dict = {},
        broker=None,
        execution_id=None,
    ):
        # Construct initial properties
        self.pipeline = pipeline
        self.broker = broker or get_broker()
        self.execution_id = execution_id or str(uuid4())
        self._starting_messages = []
        self.scenarios = scenarios
        self.pipeline_settings = pipeline_settings
        self.task_replacements = task_replacements

        # Get pipeline dag graph and find first task
        pipeline_config = pipeline["config"]
        self.graph = get_execution_graph(pipeline_config)
        self.number_of_tasks = len(self.graph.nodes)
        task_definitions = pipeline_config["taskDefinitions"]
        first_task = list(pipeline_config["dagAdjacency"].keys())[0]

        # Process the scenarios one by one
        for scenario in self.scenarios:
            tasks_in_reruns = scenario["taskReruns"]

            # Find any tasks that have replacements for this scenario
            tasks_in_replacements = list(scenario["taskReplacements"].keys())

            distinct_scenario_tasks = list(set(tasks_in_reruns + tasks_in_replacements))
            tasks_to_be_rerun_in_scenario = distinct_scenario_tasks

            tasks_to_be_rerun_in_scenario = list(
                set(
                    task
                    for task in distinct_scenario_tasks
                    for task in nx.descendants(self.graph, task)
                )
                | set(tasks_to_be_rerun_in_scenario)
            )

            self.number_of_tasks = self.number_of_tasks + len(
                tasks_to_be_rerun_in_scenario
            )
            scenario["tasksToRunInScenario"] = tasks_to_be_rerun_in_scenario
            scenario["execution_id"] = scenario.get("execution_id", None) or str(
                uuid4()
            )

            # Check if any of the scenarios need to be kicked off now
            if first_task in tasks_to_be_rerun_in_scenario:
                handler = task_definitions[first_task]["handlers"][
                    scenario["taskReplacements"].get(first_task, 0)
                ]
                lazy_actor = register_lazy_actor(
                    self.broker,
                    get_callable(handler),
                    pipeline_config["metadata"],
                )
                message = lazy_actor.message()
                message.options["pipeline"] = pipeline
                message.options["task_replacements"] = self.task_replacements
                message.options["execution_id"] = scenario["execution_id"]
                message.options["task_name"] = first_task
                message.options["root_execution_id"] = self.execution_id
                if self.pipeline_settings:
                    message.kwargs["settings"] = copy(self.pipeline_settings)
                    message.kwargs["settings"]["execution_id"] = scenario[
                        "execution_id"
                    ]
                self._starting_messages.append(message)

        for m in self._starting_messages:
            m.options["scenarios"] = self.scenarios

        handler = task_definitions[first_task]["handlers"][
            self.task_replacements.get(first_task, 0)
        ]
        lazy_actor = register_lazy_actor(
            self.broker,
            get_callable(handler),
            pipeline_config["metadata"],
        )
        message = lazy_actor.message()
        message.options["pipeline"] = pipeline
        message.options["task_replacements"] = self.task_replacements
        message.options["execution_id"] = self.execution_id
        message.options["task_name"] = first_task
        message.options["scenarios"] = self.scenarios
        message.options["root_execution_id"] = self.execution_id

        if self.pipeline_settings:
            message.kwargs["settings"] = copy(self.pipeline_settings)
            message.kwargs["settings"]["execution_id"] = self.execution_id

        self._starting_messages.append(message)

    def run(self, *, delay=None):
        for message in self._starting_messages:
            task_key = (
                f"{message.options['execution_id']}-{message.options['task_name']}"
            )
            locking_parallel_barrier = LockingParallelBarrier(
                REDIS_URL, task_key=task_key, lock_key=f"{self.execution_id}-lock"
            )
            locking_parallel_barrier.set_task_count(1)
            self.broker.enqueue(message, delay=delay)

        return self

    def __len__(self):
        return self.number_of_tasks

    def completed(self):
        redis_task_keys = [
            f"{self.execution_id}-{node}" for node in list(self.graph.nodes)
        ]
        redis_lock_key = f"{self.execution_id}-lock"
        for scenario in self.scenarios:
            scenario_task_keys = [
                f"{scenario['execution_id']}-{task}"
                for task in scenario["tasksToRunInScenario"]
            ]
            redis_task_keys = redis_task_keys + scenario_task_keys

        for task_key in redis_task_keys:
            locking_parallel_barrier = LockingParallelBarrier(
                REDIS_URL, task_key=task_key, lock_key=redis_lock_key
            )
            try:
                locking_parallel_barrier.acquire_lock(
                    timeout=PARALLEL_PIPELINE_CALLBACK_BARRIER_TTL
                )
                task_complete = True
                if locking_parallel_barrier.task_exists():
                    remaining_tasks = locking_parallel_barrier.get_task_count()
                    if remaining_tasks >= 1:
                        task_complete = False
                else:
                    task_complete = False
            finally:
                locking_parallel_barrier.release_lock()
            if not task_complete:
                return task_complete

        return True

    def to_json(self) -> str:
        return json.dumps(
            {
                "pipeline": self.pipeline,
                "pipeline_settings": self.pipeline_settings,
                "task_replacements": self.task_replacements,
                "scenarios": self.scenarios,
                "execution_id": self.execution_id,
            }
        )

    @classmethod
    def from_json(cls, json_data: str) -> typing.Type["Pypeline"]:
        data = json.loads(json_data)

        return cls(
            data["pipeline"],
            pipeline_settings=data["pipeline_settings"],
            task_replacements=data["task_replacements"],
            scenarios=data["scenarios"],
            execution_id=data["execution_id"],
        )
pypeline/pipelines/factory.py
@@ -0,0 +1,107 @@
import typing
from dramatiq import get_broker, Message
from pypeline.pipelines.composition.parallel_pipeline_composition import (
    parallel_pipeline,
)
from pypeline.dramatiq import LazyActor
from pypeline.utils.dramatiq_utils import register_lazy_actor
from pypeline.pipeline_settings_schema import (
    MissingSettingsException,
    create_pipeline_settings_schema,
    PipelineScenarioSchema,
)
from pypeline.pipelines.composition.pypeline_composition import Pypeline
from pypeline.utils.config_utils import retrieve_latest_pipeline_config
from pypeline.utils.module_utils import get_callable
from pypeline.utils.pipeline_utils import (
    get_execution_graph,
    topological_sort_with_parallelism,
)


def dag_generator(
    pipeline_id: str,
    task_replacements: dict = {},
    scenarios: typing.List[typing.Dict] = [],
    *args,
    **kwargs
) -> typing.Union[parallel_pipeline, Pypeline]:
    """Generates a pipeline dag from a pre-defined pipeline yaml

    :param pipeline_id: Id of the pipeline to generate
    :param task_replacements: A dictionary of task names and handler index to run. E.g. {"a": 1} would run the handler
        in the second index position.
    :param scenarios:
    :param args:
    :param kwargs:
    :return: Returns a parallel_pipeline object which can be run
    """
    pipeline = retrieve_latest_pipeline_config(pipeline_id=pipeline_id)

    pipeline_config = pipeline["config"]
    broker = get_broker()
    broker.actors.clear()

    if pipeline["schemaVersion"] == 2:
        # If the pipeline_config expects settings ensure we have them
        if (
            "settings" in pipeline_config
            and len(pipeline_config["settings"]["required"]) > 0
            and "settings" not in kwargs
        ):
            raise MissingSettingsException()

        # If we're here we expect to have settings. Pop them out of kwargs to validate
        inputted_settings = kwargs.pop("settings", {})
        if "settings" in pipeline_config:
            supplied_pipeline_settings_schema = create_pipeline_settings_schema(
                pipeline_config["settings"]
            )

            # Validate scenarios settings to make sure they look okay
            validated_scenarios = PipelineScenarioSchema(many=True).load(scenarios)

            for scenario in validated_scenarios:
                supplied_pipeline_settings_schema.load(scenario["settings"])

            validated_settings = supplied_pipeline_settings_schema.load(
                inputted_settings
            )
            p = Pypeline(
                pipeline,
                pipeline_settings=validated_settings,
                task_replacements=task_replacements,
                scenarios=scenarios,
                broker=broker,
            )
        else:
            p = Pypeline(pipeline, task_replacements=task_replacements, broker=broker)
        return p
    graph = get_execution_graph(pipeline_config)
    optimal_execution_graph = topological_sort_with_parallelism(graph.copy())
    registered_actors: typing.Dict[str, LazyActor] = {}

    messages: typing.List[typing.List[Message]] = []

    task_definitions = pipeline_config["taskDefinitions"]
    for task_group in optimal_execution_graph:
        message_group = []
        for task in task_group:
            module_path = task_definitions[task]["handler"]
            tmp_handler = get_callable(module_path)
            lazy_actor = register_lazy_actor(
                broker, tmp_handler, pipeline_config["metadata"]
            )
            registered_actors[task] = lazy_actor
            if args and not kwargs:
                message_group.append(registered_actors[task].message(*args))
            elif kwargs and not args:
                message_group.append(registered_actors[task].message(**kwargs))
            elif args and kwargs:
                message_group.append(registered_actors[task].message(*args, **kwargs))
            else:
                message_group.append(registered_actors[task].message())
        messages.append(message_group)
    p = parallel_pipeline(messages)

    return p