dkist-processing-core 4.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- changelog/.gitempty +0 -0
- dkist_processing_core/__init__.py +13 -0
- dkist_processing_core/build_utils.py +139 -0
- dkist_processing_core/config.py +82 -0
- dkist_processing_core/failure_callback.py +96 -0
- dkist_processing_core/node.py +169 -0
- dkist_processing_core/resource_queue.py +9 -0
- dkist_processing_core/task.py +250 -0
- dkist_processing_core/tests/__init__.py +1 -0
- dkist_processing_core/tests/conftest.py +172 -0
- dkist_processing_core/tests/invalid_workflow_cyclic/__init__.py +1 -0
- dkist_processing_core/tests/invalid_workflow_cyclic/workflow.py +21 -0
- dkist_processing_core/tests/invalid_workflow_for_docker_multi_category/__init__.py +0 -0
- dkist_processing_core/tests/invalid_workflow_for_docker_multi_category/workflow.py +21 -0
- dkist_processing_core/tests/task_example.py +45 -0
- dkist_processing_core/tests/test_build_utils.py +128 -0
- dkist_processing_core/tests/test_export.py +71 -0
- dkist_processing_core/tests/test_failure_callback.py +90 -0
- dkist_processing_core/tests/test_node.py +156 -0
- dkist_processing_core/tests/test_task.py +82 -0
- dkist_processing_core/tests/test_workflow.py +212 -0
- dkist_processing_core/tests/valid_workflow_package/__init__.py +1 -0
- dkist_processing_core/tests/valid_workflow_package/workflow.py +21 -0
- dkist_processing_core/tests/zero_node_workflow_package/__init__.py +1 -0
- dkist_processing_core/tests/zero_node_workflow_package/workflow.py +9 -0
- dkist_processing_core/workflow.py +294 -0
- dkist_processing_core-4.3.0.dist-info/METADATA +249 -0
- dkist_processing_core-4.3.0.dist-info/RECORD +41 -0
- dkist_processing_core-4.3.0.dist-info/WHEEL +5 -0
- dkist_processing_core-4.3.0.dist-info/top_level.txt +4 -0
- docs/Makefile +134 -0
- docs/auto-proc-concept-model.png +0 -0
- docs/auto_proc_brick.png +0 -0
- docs/automated-processing-deployed.png +0 -0
- docs/changelog.rst +6 -0
- docs/conf.py +50 -0
- docs/index.rst +9 -0
- docs/landing_page.rst +34 -0
- docs/make.bat +170 -0
- docs/requirements.txt +1 -0
- licenses/LICENSE.rst +11 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base class that is used to wrap the various DAG task methods.
|
|
3
|
+
|
|
4
|
+
It provides support for user-defined setup and cleanup, task monitoring using Elastic APM,
|
|
5
|
+
standardized logging and exception handling.
|
|
6
|
+
"""
|
|
7
|
+
import logging
|
|
8
|
+
from abc import ABC
|
|
9
|
+
from abc import abstractmethod
|
|
10
|
+
from contextlib import contextmanager
|
|
11
|
+
|
|
12
|
+
import elasticapm
|
|
13
|
+
|
|
14
|
+
from dkist_processing_core.config import core_configurations
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
__all__ = ["TaskBase"]
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ApmTransaction:
    """
    Elastic APM transaction manager for a DAG Task.

    Without configuration, it disables itself: when APM is not enabled every
    method degrades to a no-op so task code never has to branch on it.
    """

    @property
    def apm_service_name(self) -> str:
        """Format the service name for Elastic APM."""
        # Underscores and dots are normalized to hyphens for the service name.
        name = f"{self._workflow_name}-{self._workflow_version}"
        name = name.replace("_", "-")
        name = name.replace(".", "-")
        return name

    @property
    def apm_config(self) -> dict:
        """Override the Elastic APM configuration with the workflow specific service name."""
        core_config = core_configurations.apm_config
        core_config["SERVICE_NAME"] = self.apm_service_name
        return core_config

    def __init__(self, transaction_name: str, workflow_name: str, workflow_version: str) -> None:
        """Begin an APM transaction for the task if APM is enabled; otherwise log and do nothing."""
        self._workflow_name = workflow_name
        self._workflow_version = workflow_version
        self.transaction_name = transaction_name

        if core_configurations.elastic_apm_enabled:
            self.client = elasticapm.Client(self.apm_config)
            self.instrument()
            self.client.begin_transaction(transaction_type="Task")
            logger.info(f"APM Configured: {self=} {self.apm_config=}")
        else:
            # Plain string: the message has no placeholders, so no f-string needed.
            logger.warning("APM Not Configured")

    @contextmanager
    def capture_span(self, name: str, *args, **kwargs):
        """Capture an APM span for the enclosed code, or act as a no-op context manager when APM is disabled."""
        if core_configurations.elastic_apm_enabled:
            with elasticapm.capture_span(name, *args, **kwargs):
                yield
        else:
            yield

    def close(self, exc_type=None):
        """End the transaction (capturing the active exception, if any) and close the APM client."""
        if core_configurations.elastic_apm_enabled:
            result = "Success"
            if exc_type is not None:
                result = "Error"  # pragma: no cover
                self.client.capture_exception(handled=False)  # pragma: no cover
            self.client.end_transaction(name=self.transaction_name, result=result)
            self.client.close()

    @staticmethod
    def instrument():
        """Vendored implementation of elasticapm.instrumentation.control.instrument changed to omit certain frameworks."""
        # Redis instrumentations are excluded; the rest of the logic mirrors
        # elasticapm.instrumentation.control.instrument.
        omit_frameworks = {
            "elasticapm.instrumentation.packages.redis.RedisInstrumentation",
            "elasticapm.instrumentation.packages.redis.RedisPipelineInstrumentation",
            "elasticapm.instrumentation.packages.redis.RedisConnectionInstrumentation",
            "elasticapm.instrumentation.packages.asyncio.aioredis.RedisConnectionPoolInstrumentation",
            "elasticapm.instrumentation.packages.asyncio.aioredis.RedisPipelineInstrumentation",
            "elasticapm.instrumentation.packages.asyncio.aioredis.RedisConnectionInstrumentation",
        }

        from elasticapm.instrumentation.control import _lock
        from elasticapm.instrumentation.register import _cls_register
        from elasticapm.instrumentation.register import _instrumentation_singletons
        from elasticapm.instrumentation.register import import_string

        # from elasticapm.instrumentation.control.instrument
        with _lock:
            # update to vendored code
            filtered_cls_register = _cls_register.difference(omit_frameworks)
            # from elasticapm.instrumentation.register.get_instrumentation_objects
            for cls_str in filtered_cls_register:
                if cls_str not in _instrumentation_singletons:
                    cls = import_string(cls_str)
                    _instrumentation_singletons[cls_str] = cls()
                obj = _instrumentation_singletons[cls_str]
                # from elasticapm.instrumentation.control.instrument
                obj.instrument()

    def __repr__(self):
        """Return the representation of the transaction manager."""
        return f"{self.__class__.__name__}(transaction_name={self.transaction_name}, workflow_name={self._workflow_name}, workflow_version={self._workflow_version})"
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class TaskBase(ABC):
    """
    A Task is the interface between processing code and its execution. Processing code can follow this interface through subclassing and remain agnostic to the execution environment.

    Each DAG task must implement its own subclass of this abstract wrapper class.

    Intended instantiation is as a context manager

    >>> class RealTask(TaskBase):
    >>>     def run(self):
    >>>         pass
    >>>
    >>> with RealTask(1, "a", "b") as task:
    >>>     task()

    Task names in airflow are the same as the class name
    Additional methods can be added but will only be called if they are referenced via run,
    pre_run, post_run, or __exit__

    overriding methods other than run, pre_run, post_run, and in special cases __exit__ is
    discouraged as they are used internally to support the abstraction.
    e.g. __init__ is called by the core api without user involvement so adding parameters will not
    result in them being passed in as there is no client interface to __init__.

    To use the apm infrastructure in subclass code one would do the following:

    >>> def foo(self):
    >>>     with self.apm_step("do detailed work"):
    >>>         pass  # do work

    Parameters
    ----------
    recipe_run_id : int
        id of the recipe run used to identify the workflow run this task is part of
    workflow_name : str
        name of the workflow to which this instance of the task belongs
    workflow_version : str
        version of the workflow to which this instance of the task belongs

    """

    # Retry policy consumed by the execution environment (e.g. an Airflow operator).
    retries = 0
    retry_delay_seconds = 60

    def __init__(
        self,
        recipe_run_id: int,
        workflow_name: str,
        workflow_version: str,
    ):
        """
        Instantiate a Task.

        The details of instantiation may vary based upon the export target but this signature is what is expected by the instantiation transformation (Node) code.
        """
        self.recipe_run_id = int(recipe_run_id)
        self.workflow_name = workflow_name
        self.workflow_version = workflow_version
        self.task_name = self.__class__.__name__
        logger.info(f"Task {self.task_name} initialized")
        self.apm = ApmTransaction(
            transaction_name=self.task_name,
            workflow_name=self.workflow_name,
            workflow_version=self.workflow_version,
        )
        self.apm_step = self.apm.capture_span  # abbreviated syntax for capture span context mgr

    def pre_run(self) -> None:
        """Intended to be overridden and will execute prior to run() with Elastic APM span capturing."""

    @abstractmethod
    def run(self) -> None:
        """Abstract method that must be overridden to execute the desired DAG task."""

    def post_run(self) -> None:
        """Intended to be overridden and will execute after run() with Elastic APM span capturing."""

    def rollback(self) -> None:
        """Rollback any changes to persistent stores performed by the task."""

    def __call__(self) -> None:
        """
        DAG task wrapper. Execution is instrumented with Application Performance Monitoring if configured.

        The standard execution sequence is:

        1 pre_run

        2 run

        3 post_run

        Returns
        -------
        None

        """
        logger.info(f"Task {self.task_name} started")
        with self.apm_step("Pre Run", span_type="code.core", labels={"type": "core"}):
            self.pre_run()
        with self.apm_step("Run", span_type="code.core", labels={"type": "core"}):
            self.run()
        with self.apm_step("Post Run", span_type="code.core", labels={"type": "core"}):
            self.post_run()
        logger.info(f"Task {self.task_name} complete")

    def __enter__(self):
        """
        Override to execute setup tasks before task execution.

        Only override this method with tasks that need to happen
        regardless of tasks having an exception, ensure that no additional exception
        will be raised, and always call super().__enter__
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Override to execute teardown tasks after task execution regardless of task execution success.

        Only override this method with tasks that need to happen
        regardless of tasks having an exception, ensure that no additional exception
        will be raised, and always call super().__exit__
        """
        # Close the APM transaction, recording the exception (if any) as the result.
        self.apm.close(exc_type=exc_type)

    def __repr__(self):
        """Return the representation of the task."""
        return (
            f"{self.__class__.__name__}("
            f"recipe_run_id={self.recipe_run_id}, "
            f"workflow_name={self.workflow_name}, "
            f"workflow_version={self.workflow_version}, "
            f")"
        )

    def __str__(self):
        """Return a string representation of the task."""
        return repr(self)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""init."""
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Global test fixtures."""
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from shutil import rmtree
|
|
5
|
+
from typing import Any
|
|
6
|
+
from unittest.mock import MagicMock
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
from talus import DurableProducer
|
|
10
|
+
|
|
11
|
+
from dkist_processing_core import ResourceQueue
|
|
12
|
+
from dkist_processing_core import TaskBase
|
|
13
|
+
from dkist_processing_core import Workflow
|
|
14
|
+
from dkist_processing_core.node import Node
|
|
15
|
+
from dkist_processing_core.node import task_type_hint
|
|
16
|
+
from dkist_processing_core.tests.task_example import Task
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@pytest.fixture(scope="module")
|
|
20
|
+
def export_path() -> str:
|
|
21
|
+
"""Export path object that will be removed on teardown."""
|
|
22
|
+
path = Path("export/")
|
|
23
|
+
yield str(path)
|
|
24
|
+
rmtree(path, ignore_errors=True)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@pytest.fixture(scope="session")
|
|
28
|
+
def task_subclass():
|
|
29
|
+
"""Sub class of the abstract task base class implementing methods that are expected to be subclassed with inspect-able metadata."""
|
|
30
|
+
return Task
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.fixture(scope="session")
|
|
34
|
+
def error_task_subclass():
|
|
35
|
+
"""Subclass of the abstract task base class implementing methods that are expected to be subclassed with inspect-able metadata."""
|
|
36
|
+
|
|
37
|
+
class Task(TaskBase):
|
|
38
|
+
def __init__(self, *args, **kwargs):
|
|
39
|
+
self.run_was_called = False
|
|
40
|
+
self.post_run_was_called = False
|
|
41
|
+
super().__init__(*args, **kwargs)
|
|
42
|
+
|
|
43
|
+
def run(self):
|
|
44
|
+
self.run_was_called = True
|
|
45
|
+
|
|
46
|
+
def post_run(self) -> None:
|
|
47
|
+
self.post_run_was_called = True
|
|
48
|
+
raise RuntimeError("error recording provenance")
|
|
49
|
+
|
|
50
|
+
return Task
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@pytest.fixture()
def task_instance(task_subclass):
    """Yield a context-managed instance of the example task subclass."""
    task = task_subclass(
        recipe_run_id=1, workflow_name="workflow_name", workflow_version="version"
    )
    with task:
        yield task
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@pytest.fixture()
def workflow():
    """Create an instance of the Workflow abstraction without tasks, returned with its inputs."""
    parameters = {
        "input_data": "input",
        "output_data": "output",
        "category": "instrument",
        "detail": "workflow_information",
    }
    version = "V6-12342"
    tags = ["tag1", "tag2"]
    workflow_instance = Workflow(
        workflow_version=version,
        workflow_package=__package__,
        tags=tags,
        **parameters,
    )
    return (
        workflow_instance,
        parameters["input_data"],
        parameters["output_data"],
        parameters["category"],
        parameters["detail"],
        version,
        tags,
    )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@pytest.fixture()
def workflow_tasks(task_subclass) -> list[task_type_hint]:
    """List of distinctly named Task classes that can be composed into a workflow."""

    class TaskA(task_subclass):
        """Uniquely named task for workflow composition."""

    class TaskB(task_subclass):
        """Uniquely named task for workflow composition."""

    class TaskC(task_subclass):
        """Uniquely named task for workflow composition."""

    class TaskD(task_subclass):
        """Uniquely named task for workflow composition."""

    return [TaskA, TaskB, TaskC, TaskD]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@pytest.fixture(params=["default", "non_default"])
|
|
111
|
+
def queue_name(request):
|
|
112
|
+
"""Name of the queue on the Node"""
|
|
113
|
+
if request.param == "default":
|
|
114
|
+
return ResourceQueue.DEFAULT
|
|
115
|
+
return ResourceQueue.HIGH_MEMORY
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@pytest.fixture(params=["default", "non_default"])
|
|
119
|
+
def pip_extras(request):
|
|
120
|
+
"""Extra pip requirements for Node initialization"""
|
|
121
|
+
if request.param == "default":
|
|
122
|
+
return None
|
|
123
|
+
return ["asdf"]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@pytest.fixture(params=["0_upstream", "1_upstream", "2_upstream"])
|
|
127
|
+
def node(
|
|
128
|
+
workflow_tasks, request, queue_name, pip_extras
|
|
129
|
+
) -> tuple[Node, task_type_hint, Any, str, str]:
|
|
130
|
+
"""Node instance and its component parts."""
|
|
131
|
+
version = "V6-123"
|
|
132
|
+
name = f"{request.param}_{version}"
|
|
133
|
+
TaskA, TaskB, TaskC, _ = workflow_tasks
|
|
134
|
+
upstreams = {
|
|
135
|
+
"0_upstream": (None, []),
|
|
136
|
+
"1_upstream": (TaskB, [TaskB]),
|
|
137
|
+
"2_upstream": ([TaskB, TaskC], [TaskB, TaskC]),
|
|
138
|
+
}
|
|
139
|
+
upstream = upstreams[request.param]
|
|
140
|
+
package = __package__
|
|
141
|
+
return (
|
|
142
|
+
Node(
|
|
143
|
+
workflow_name=name,
|
|
144
|
+
workflow_version=version,
|
|
145
|
+
workflow_package=package,
|
|
146
|
+
task=TaskA,
|
|
147
|
+
upstreams=upstream[0],
|
|
148
|
+
resource_queue=queue_name,
|
|
149
|
+
pip_extras=pip_extras,
|
|
150
|
+
),
|
|
151
|
+
TaskA,
|
|
152
|
+
upstream[1],
|
|
153
|
+
name,
|
|
154
|
+
version,
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@pytest.fixture()
def fake_producer():
    """Mock message producer conforming to the DurableProducer interface."""
    return MagicMock(spec=DurableProducer)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@pytest.fixture()
def fake_producer_factory(fake_producer):
    """Context-manager factory that always yields the fake producer."""

    @contextmanager
    def fake_factory():
        yield fake_producer

    return fake_factory
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""init."""
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Example invalid workflow."""
|
|
2
|
+
from dkist_processing_core import Workflow
|
|
3
|
+
from dkist_processing_core.tests.task_example import Task
|
|
4
|
+
from dkist_processing_core.tests.task_example import Task2
|
|
5
|
+
from dkist_processing_core.tests.task_example import Task3
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# |<--------------------|
# |-->Task -> Task2 --> |


example = Workflow(
    input_data="test-data",
    output_data="invalid",
    category="core",
    workflow_package=__package__,
)
example.add_node(task=Task, upstreams=None)
example.add_node(task=Task2, upstreams=Task)
example.add_node(task=Task3, upstreams=Task2)
# Task2 is added a second time, downstream of Task3, producing the
# Task2 -> Task3 -> Task2 cycle this invalid-workflow fixture exists to provide.
example.add_node(task=Task2, upstreams=Task3)
|
|
File without changes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Example invalid workflow for """
|
|
2
|
+
from dkist_processing_core import Workflow
|
|
3
|
+
from dkist_processing_core.tests.task_example import Task
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
category_a = Workflow(
|
|
7
|
+
input_data="test-data",
|
|
8
|
+
output_data="invalid",
|
|
9
|
+
category="A",
|
|
10
|
+
workflow_package=__package__,
|
|
11
|
+
)
|
|
12
|
+
category_a.add_node(task=Task, upstreams=None)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
category_b = Workflow(
|
|
16
|
+
input_data="test-data",
|
|
17
|
+
output_data="invalid",
|
|
18
|
+
category="B",
|
|
19
|
+
workflow_package=__package__,
|
|
20
|
+
)
|
|
21
|
+
category_b.add_node(task=Task, upstreams=None)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Example task subclass used in the tests."""
|
|
2
|
+
from dkist_processing_core import TaskBase
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Task(TaskBase):
    """Example task for testing."""

    # Example Airflow-style log URL; presumably consumed by tests that format
    # log links (e.g. failure callbacks) — confirm against those tests.
    log_url = "http://localhost:8080/log?execution_date=2021-01-07T18%3A19%3A38.214767%2B00%3A00&task_id=task_a&dag_id=test_dag"

    def __init__(self, *args, **kwargs):
        """Task base construction."""
        # Flags recording which lifecycle hooks have been invoked; set before
        # super().__init__ so they exist regardless of base-class behavior.
        self.run_was_called = False
        self.pre_run_was_called = False
        self.post_run_was_called = False
        super().__init__(*args, **kwargs)

    def run(self):
        """Override base class run method."""
        self.run_was_called = True

    def pre_run(self) -> None:
        """Override base class pre-run method."""
        self.pre_run_was_called = True

    def post_run(self) -> None:
        """Override base class post-run method."""
        self.post_run_was_called = True
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Distinctly named subclasses of the example Task, for workflows that need
# multiple unique task types.
class Task2(Task):
    """Test task class."""

    pass


class Task3(Task):
    """Test task class."""

    pass


class Task4(Task):
    """Test task class."""

    pass
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Tests for the build utils."""
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from shutil import rmtree
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
from airflow.exceptions import AirflowException
|
|
9
|
+
from airflow.exceptions import DuplicateTaskIdFound
|
|
10
|
+
|
|
11
|
+
from dkist_processing_core.build_utils import export_dags
|
|
12
|
+
from dkist_processing_core.build_utils import export_notebook_dockerfile
|
|
13
|
+
from dkist_processing_core.build_utils import export_notebooks
|
|
14
|
+
from dkist_processing_core.build_utils import validate_workflows
|
|
15
|
+
from dkist_processing_core.tests import invalid_workflow_cyclic
|
|
16
|
+
from dkist_processing_core.tests import invalid_workflow_for_docker_multi_category
|
|
17
|
+
from dkist_processing_core.tests import valid_workflow_package
|
|
18
|
+
from dkist_processing_core.tests import zero_node_workflow_package
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_validate_workflow_valid():
    """
    Given: A workflow package with a valid workflow.
    When: validating the workflow.
    Then: No errors raised.
    """
    # validate_workflows raises on failure; reaching the end means validation passed.
    validate_workflows(valid_workflow_package)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@pytest.mark.parametrize(
    "workflow_package",
    [
        invalid_workflow_cyclic,
        zero_node_workflow_package,
    ],
)
def test_validate_workflow_invalid(workflow_package):
    """
    Given: A workflow package with an invalid workflow.
    When: validating the workflow.
    Then: Errors raised.
    """
    with pytest.raises((ValueError, DuplicateTaskIdFound)):
        validate_workflows(workflow_package)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_validate_workflow_zero_nodes():
    """
    Given: A workflow package with an invalid workflow of zero nodes.
    When: validating the workflow.
    Then: Errors raised.
    """
    with pytest.raises((ValueError, AirflowException)):
        validate_workflows(zero_node_workflow_package)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_export_dag(export_path):
    """
    Given: A path to export to and a package containing a valid workflow.
    When: Workflows in the package are exported.
    Then: Expected export file exists.
    """
    export_dags(valid_workflow_package, export_path)
    exported_dag = Path(export_path) / "test-data_to_valid_core_dev.py"
    assert exported_dag.exists()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_export_notebook(export_path):
    """
    Given: A path to export to and a package containing a valid workflow.
    When: Workflows in the package are exported as ipynb.
    Then: Expected export files exists.
    """
    notebook_paths = export_notebooks(valid_workflow_package, export_path)
    assert len(notebook_paths) >= 1
    assert all(p.exists() for p in notebook_paths)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@pytest.fixture()
def repository_root_path() -> Path:
    """Return the repository-root directory derived from the current working directory."""
    cwd_parts = Path.cwd().parts  # expecting to be below the repo root
    if "dkist-processing-core" in cwd_parts:
        # Truncate at the first occurrence of the repo directory (inclusive).
        root_index = cwd_parts.index("dkist-processing-core")
        return Path(*cwd_parts[: root_index + 1])
    return Path(*cwd_parts)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@pytest.fixture()
def notebook_export_path(repository_root_path) -> Path:
    """Yield the notebook export directory path, removing the directory on teardown."""
    # Yield the same Path object that teardown removes, rather than constructing
    # an identical Path twice.
    export_path = Path("notebooks/")
    yield export_path
    rmtree(export_path, ignore_errors=True)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@pytest.mark.long()
def test_export_notebook_dockerfile(repository_root_path, notebook_export_path):
    """
    Given: A path to export to and a package containing a valid workflow.
    When: Workflows in the package are exported as a valid Dockerfile.
    Then: Expected export file exists and the image builds.
    """
    # Export is performed from the repository root so relative paths resolve.
    os.chdir(str(repository_root_path))
    dockerfile_path = export_notebook_dockerfile(valid_workflow_package, notebook_export_path)
    assert dockerfile_path.exists()
    image_name = "test_export_notebook_dockerfile:latest"
    # check=True makes a failed docker build fail the test.
    subprocess.run(["docker", "build", "-t", image_name, dockerfile_path.parent], check=True)
    dockerfile_path.unlink()
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@pytest.mark.long()
def test_export_notebook_dockerfile_invalid_workflow_package(
    repository_root_path, notebook_export_path
):
    """
    Given: A path to export to and a package whose workflows span multiple categories.
    When: Exporting the package as a notebook Dockerfile.
    Then: ValueError is raised.
    """
    # Export is performed from the repository root so relative paths resolve.
    os.chdir(str(repository_root_path))
    with pytest.raises(ValueError):
        export_notebook_dockerfile(invalid_workflow_for_docker_multi_category, notebook_export_path)
|