so-campaign-manager 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. so_campaign_manager-0.0.4.dist-info/METADATA +179 -0
  2. so_campaign_manager-0.0.4.dist-info/RECORD +44 -0
  3. so_campaign_manager-0.0.4.dist-info/WHEEL +5 -0
  4. so_campaign_manager-0.0.4.dist-info/entry_points.txt +2 -0
  5. so_campaign_manager-0.0.4.dist-info/licenses/LICENSE +24 -0
  6. so_campaign_manager-0.0.4.dist-info/top_level.txt +1 -0
  7. socm/__about__.py +34 -0
  8. socm/__init__.py +0 -0
  9. socm/__main__.py +35 -0
  10. socm/bookkeeper/__init__.py +1 -0
  11. socm/bookkeeper/bookkeeper.py +488 -0
  12. socm/configs/slurmise.toml +2 -0
  13. socm/core/__init__.py +1 -0
  14. socm/core/models.py +235 -0
  15. socm/enactor/__init__.py +3 -0
  16. socm/enactor/base.py +123 -0
  17. socm/enactor/dryrun_enactor.py +216 -0
  18. socm/enactor/rp_enactor.py +273 -0
  19. socm/execs/__init__.py +3 -0
  20. socm/execs/mapmaking.py +73 -0
  21. socm/planner/__init__.py +2 -0
  22. socm/planner/base.py +87 -0
  23. socm/planner/heft_planner.py +442 -0
  24. socm/resources/__init__.py +5 -0
  25. socm/resources/perlmutter.py +22 -0
  26. socm/resources/tiger.py +24 -0
  27. socm/resources/universe.py +18 -0
  28. socm/utils/__init__.py +0 -0
  29. socm/utils/misc.py +90 -0
  30. socm/utils/states.py +17 -0
  31. socm/workflows/__init__.py +41 -0
  32. socm/workflows/ml_mapmaking.py +111 -0
  33. socm/workflows/ml_null_tests/__init__.py +10 -0
  34. socm/workflows/ml_null_tests/base.py +117 -0
  35. socm/workflows/ml_null_tests/day_night_null_test.py +132 -0
  36. socm/workflows/ml_null_tests/direction_null_test.py +133 -0
  37. socm/workflows/ml_null_tests/elevation_null_test.py +118 -0
  38. socm/workflows/ml_null_tests/moon_close_null_test.py +165 -0
  39. socm/workflows/ml_null_tests/moonrise_set_null_test.py +151 -0
  40. socm/workflows/ml_null_tests/pwv_null_test.py +118 -0
  41. socm/workflows/ml_null_tests/sun_close_null_test.py +173 -0
  42. socm/workflows/ml_null_tests/time_null_test.py +76 -0
  43. socm/workflows/ml_null_tests/wafer_null_test.py +175 -0
  44. socm/workflows/sat_simulation.py +76 -0
socm/enactor/rp_enactor.py ADDED
@@ -0,0 +1,273 @@
+ # Imports from general packages
+ import os
+ import threading as mt
+ from copy import deepcopy
+ from datetime import datetime
+ from typing import Dict, List
+
+ # Imports from dependent packages
+ import numpy as np # noqa: F401
+ import radical.pilot as rp
+ import radical.utils as ru
+
+ from socm.core import Resource, Workflow
+ from socm.enactor.base import Enactor
+ from socm.utils.states import States
+
+
+ class RPEnactor(Enactor):
+     """
+     The RP enactor is responsible for executing workflows on HPC resources
+     via RADICAL-Pilot. The enactor takes as input a list of tuples
+     <workflow, resource> and executes the workflows on their selected
+     resources.
+     """
+
+     def __init__(self, sid: str):
+         super(RPEnactor, self).__init__(sid=sid)
+         # List with all the workflows that are executing and require to be
+         # monitored. This list is atomic and requires a lock
+         self._to_monitor = list()
+
+         os.environ["RADICAL_CONFIG_USER_DIR"] = os.path.join(
+             os.path.dirname(__file__) + "/../configs/"
+         )
+         self._prof.prof("enactor_setup", uid=self._uid)
+         # Lock to provide atomicity in the monitoring data structure
+         self._monitoring_lock = ru.RLock("cm.monitor_lock")
+         self._cb_lock = ru.RLock("enactor.cb_lock")
+         self._callbacks = dict()
+
+         # Creating a thread to execute the monitoring method.
+         self._monitoring_thread = None  # Private attribute that will hold the thread
+         self._terminate_monitor = mt.Event()  # Thread event to terminate.
+
+         self._run = False
+         self._resource = None
+         self._prof.prof("enactor_started", uid=self._uid)
+         self._rp_session = rp.Session(uid=sid)
+         self._rp_pmgr = rp.PilotManager(session=self._rp_session)
+         self._rp_tmgr = rp.TaskManager(session=self._rp_session)
+         self._logger.info("Enactor is ready")
+
+     def setup(self, resource: Resource, walltime: int, cores: int, execution_schema: str | None = None) -> None:
+         """
+         Sets up the enactor to execute workflows.
+         """
+         self._resource = resource
+
+         pd_init = {
+             "resource": f"so.{resource.name}",
+             "runtime": walltime,  # pilot runtime (min)
+             "exit_on_error": True,
+             "access_schema": "batch" if execution_schema == "batch" else "local",
+             "cores": cores,
+             "project": "simonsobs",
+         }
+
+         pdesc = rp.PilotDescription(pd_init)
+         self._logger.debug(f"Asking for {pdesc} pilot")
+         pilot = self._rp_pmgr.submit_pilots(pdesc)
+         self._rp_tmgr.add_pilots(pilot)
+
+         pilot.wait(state=rp.PMGR_ACTIVE)
+         self._logger.info("Pilot is ready")
+
+     def enact(self, workflows: List[Workflow]) -> None:
+         """
+         The enact method receives a set of workflows and resources. It is
+         responsible for starting the execution of each workflow and setting
+         an endpoint to the WMF that executes the workflow.
+
+         *workflows:* The workflows that will execute on a resource
+         *resources:* The resource that will be used.
+         """
+
+         self._prof.prof("enacting_start", uid=self._uid)
+         exec_workflows = []
+         for workflow in workflows:
+             # If the enactor has already received a workflow, issue a warning
+             # and proceed.
+             if workflow.id in self._execution_status:
+                 self._logger.info(
+                     "Workflow %s is in state %s",
+                     workflow,
+                     self._get_workflow_state(workflow.id).name,
+                 )
+                 continue
+
+             try:
+                 # Create a calculator task. This is equivalent because with
+                 # the emulated resources, a workflow is a number of operations
+                 # that need to be executed.
+
+                 exec_workflow = (
+                     rp.TaskDescription()
+                 )  # Use workflow description and resources to create the TaskDescription
+                 exec_workflow.uid = f"workflow.{workflow.id}"
+
+                 exec_workflow.executable = workflow.executable
+                 exec_workflow.arguments = []
+                 if workflow.subcommand:
+                     exec_workflow.arguments += [workflow.subcommand]
+                 exec_workflow.arguments += workflow.get_arguments()
+                 self._logger.debug(
+                     "Workflow %s arguments: %s", workflow.id, exec_workflow.arguments
+                 )
+
+                 exec_workflow.ranks = workflow.resources["ranks"]
+                 exec_workflow.cores_per_rank = workflow.resources["threads"]
+                 exec_workflow.threading_type = rp.OpenMP
+                 # exec_workflow.mem_per_rank = np.ceil(
+                 #     workflow.resources["memory"] / workflow.resources["ranks"]
+                 # )  # this translates to memory per rank
+                 exec_workflow.post_exec = "echo ${SLURM_JOB_ID}.${SLURM_STEP_ID}"
+                 if workflow.environment:
+                     exec_workflow.environment = workflow.environment
+                 self._logger.info("Enacting workflow %s", workflow.id)
+                 exec_workflows.append(exec_workflow)
+                 # Lock the monitoring list and update it, as well as update
+                 # the state of the workflow.
+                 with self._monitoring_lock:
+                     self._to_monitor.append(workflow.id)
+                     self._execution_status[workflow.id] = {
+                         "state": States.EXECUTING,
+                         "endpoint": exec_workflow,
+                         "exec_thread": None,
+                         "start_time": datetime.now(),
+                         "end_time": None,
+                     }
+
+                 for cb in self._callbacks:
+                     self._callbacks[cb](
+                         workflow_ids=[workflow.id],
+                         new_state=States.EXECUTING,
+                         step_ids=[None],
+                     )
+             # Execute the task.
+             except Exception as ex:
+                 self._logger.error(f"Workflow {workflow} could not be executed")
+                 self._logger.error(f"Exception raised {ex}", exc_info=True)
+
+         self._rp_tmgr.submit_tasks(exec_workflows)
+
+         self._prof.prof("enacting_stop", uid=self._uid)
+         # If there is no monitoring thread, start one.
+         if self._monitoring_thread is None:
+             self._logger.info("Starting monitor thread")
+             self._monitoring_thread = mt.Thread(
+                 target=self._monitor, name="monitor-thread"
+             )
+             self._monitoring_thread.start()
+
+     def _monitor(self):
+         """
+         **Purpose**: Thread in the master process that keeps the campaign
+         execution data structure up to date.
+         """
+
+         self._prof.prof("workflow_monitor_start", uid=self._uid)
+         while not self._terminate_monitor.is_set():
+             if self._to_monitor:
+                 workflows_executing = self._rp_tmgr.list_tasks()
+                 # with self._monitoring_lock:
+                 # It does not iterate correctly.
+                 monitoring_list = deepcopy(self._to_monitor)
+                 # self._logger.info("Monitoring workflows %s" % monitoring_list)
+                 to_remove_wfs = list()
+                 to_remove_sids = list()
+
+                 for workflow_id in monitoring_list:
+                     if f"workflow.{workflow_id}" in workflows_executing:
+                         rp_workflow = self._rp_tmgr.get_tasks(
+                             uids=f"workflow.{workflow_id}"
+                         )
+                         if rp_workflow.state in rp.FINAL:
+                             with self._monitoring_lock:
+                                 self._logger.debug(f"workflow.{workflow_id} Done")
+                                 self._execution_status[workflow_id]["state"] = States.DONE
+                                 self._execution_status[workflow_id][
+                                     "end_time"
+                                 ] = datetime.now()
+                                 self._logger.debug(
+                                     "Workflow %s finished: %s, step_id: %s",
+                                     workflow_id,
+                                     self._execution_status[workflow_id]["end_time"],
+                                     rp_workflow.stdout.split()[-1],
+                                 )
+                                 to_remove_wfs.append(workflow_id)
+                                 to_remove_sids.append(rp_workflow.stdout.split()[-1])
+                                 self._prof.prof("workflow_success", uid=self._uid)
+                 if to_remove_wfs:
+                     for cb in self._callbacks:
+                         self._callbacks[cb](
+                             workflow_ids=to_remove_wfs,
+                             new_state=States.DONE,
+                             step_ids=to_remove_sids,
+                         )
+                     with self._monitoring_lock:
+                         for wid in to_remove_wfs:
+                             self._to_monitor.remove(wid)
+         self._prof.prof("workflow_monitor_end", uid=self._uid)
+
+     def get_status(self, workflows: str | List[str] | None = None) -> Dict[str, States]:
+         """
+         Get the state of a workflow or workflows.
+
+         *Parameter*
+         *workflows:* A workflow ID or a list of workflow IDs
+
+         *Returns*
+         *status*: A dictionary with the state of each workflow.
+         """
+
+         status = dict()
+         if workflows is None:
+             for workflow in self._execution_status:
+                 status[workflow] = self._execution_status[workflow]["state"]
+         elif isinstance(workflows, list):
+             for workflow in workflows:
+                 status[workflow] = self._execution_status[workflow]["state"]
+         else:
+             status[workflows] = self._execution_status[workflows]["state"]
+
+         return status
+
+     def update_status(self, workflow, new_state):
+         """
+         Update the state of a workflow that is executing.
+         """
+
+         if workflow not in self._execution_status:
+             self._logger.warning(
+                 "Has not enacted on workflow %s yet. Current state: %s",
+                 workflow,
+                 self._get_workflow_state(workflow),
+             )
+         else:
+             self._execution_status[workflow]["state"] = new_state
+
+     def terminate(self):
+         """
+         Public method to terminate the Enactor.
+         """
+         self._logger.info("Start terminating procedure")
+         self._prof.prof("str_terminating", uid=self._uid)
+         if self._monitoring_thread:
+             self._prof.prof("monitor_terminate", uid=self._uid)
+             self._terminate_monitor.set()
+             self._monitoring_thread.join()
+             self._prof.prof("monitor_terminated", uid=self._uid)
+             self._logger.debug("Monitor thread terminated")
+         # self._rp_tmgr.close()
+         self._rp_pmgr.close(terminate=True)
+         self._rp_session.close(terminate=True)
+         self._logger.debug("Enactor thread terminated")
+
+     def register_state_cb(self, cb):
+         """
+         Registers a new state update callback function with the Enactor.
+         """
+
+         with self._cb_lock:
+             cb_name = cb.__name__
+             self._callbacks[cb_name] = cb
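
Note: the sketch below illustrates how the RPEnactor API above fits together (construct, register a callback, set up a pilot, enact workflows, poll, terminate). It is not part of the package; the `Resource` and `Workflow` objects come from `socm/core/models.py`, which is not shown in this excerpt, so their construction is left as placeholders.

```python
# Illustrative usage sketch for RPEnactor; not part of the released package.
import time

from socm.enactor.rp_enactor import RPEnactor
from socm.utils.states import States


def state_cb(workflow_ids, new_state, step_ids):
    # Signature matches the keyword arguments the enactor passes to callbacks.
    print(f"workflows {workflow_ids} -> {new_state} (steps {step_ids})")


enactor = RPEnactor(sid="rp.session.example.0000")  # hypothetical session id
enactor.register_state_cb(state_cb)

resource = ...   # a socm.core.Resource; its definition is in models.py (not shown here)
workflows = []   # a list of socm.core.Workflow objects built by the campaign manager

enactor.setup(resource=resource, walltime=120, cores=112, execution_schema="batch")
enactor.enact(workflows)

# Poll until every enacted workflow reaches DONE, then shut the pilot down.
while any(state != States.DONE for state in enactor.get_status().values()):
    time.sleep(30)
enactor.terminate()
```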
socm/execs/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from . import mapmaking
+
+ SUBCOMMANDS = {"lat-mapmaking": mapmaking}
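
The `SUBCOMMANDS` mapping ties the CLI together: each value is a module exposing `get_parser(parser)` and `_main(args)`, as `socm/execs/mapmaking.py` does below. The actual dispatch lives in `socm/__main__.py` (+35 lines, not shown in this excerpt); a plausible sketch of that pattern, assuming argparse sub-parsers, looks like this:

```python
# Plausible dispatch sketch only -- the real socm/__main__.py is not shown in this diff.
from argparse import ArgumentParser

from socm.execs import SUBCOMMANDS


def main() -> None:
    parser = ArgumentParser(prog="socm")
    subparsers = parser.add_subparsers(dest="subcommand", required=True)
    for name, module in SUBCOMMANDS.items():
        # Each subcommand module fills in its own arguments, e.g. "lat-mapmaking".
        module.get_parser(subparsers.add_parser(name))
    args = parser.parse_args()
    SUBCOMMANDS[args.subcommand]._main(args)


if __name__ == "__main__":
    main()
```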
socm/execs/mapmaking.py ADDED
@@ -0,0 +1,73 @@
+ from argparse import ArgumentParser, Namespace
+
+ import humanfriendly
+ import toml
+
+ from socm.core.models import Campaign
+ from socm.utils.misc import get_workflow_entries, parse_comma_separated_fields
+ from socm.workflows import registered_workflows, subcampaign_map
+
+
+ def get_parser(parser: ArgumentParser) -> ArgumentParser:
+     """Create and return a sub-argument parser for the LAT mapmaking campaign."""
+     parser.add_argument(
+         "--toml",
+         "-t",
+         type=str,
+         required=True,
+         help="Path to the configuration file for the workflow.",
+     )
+     parser.add_argument(
+         "--dry-run",
+         action="store_true",
+         help="Enable dry run for faster development. This flag does not actually run the campaign.",
+     )
+     return parser
+
+ def _main(args: Namespace) -> None:
+     # Import here to avoid loading radical.pilot at CLI startup (not available on macOS)
+     from socm.bookkeeper import Bookkeeper
+
+     config = toml.load(args.toml)
+     config = parse_comma_separated_fields(config=config, fields_to_parse=["maxiter", "downsample"])
+     workflows_configs = get_workflow_entries(config, subcampaign_map=subcampaign_map)
+
+     workflows = []
+     for workflow_type, workflow_config in workflows_configs.items():
+         if workflow_type in registered_workflows:
+             workflow_config["resources"]["memory"] = (
+                 humanfriendly.parse_size(workflow_config["resources"]["memory"])
+                 // 1000000
+             )  # in MB
+             workflow_config["resources"]["runtime"] = (
+                 humanfriendly.parse_timespan(workflow_config["resources"]["runtime"])
+                 / 60
+             )  # in minutes
+             workflow_factory = registered_workflows[workflow_type]
+             tmp_workflows = workflow_factory.get_workflows(workflow_config)
+             for workflow in tmp_workflows:
+                 workflow.id = len(workflows) + 1  # Assign a unique ID to each workflow
+                 workflows.append(workflow)
+
+     policy = config["campaign"].get("policy", "time")
+     target_resource = config["campaign"].get("resource", "tiger3")
+     # pprint(workflows)
+     campaign = Campaign(
+         id=1,
+         workflows=workflows,
+         campaign_policy=policy,
+         deadline=config["campaign"]["deadline"],
+         execution_schema=config["campaign"]["execution_schema"],
+         requested_resources=config["campaign"]["requested_resources"],
+         target_resource=target_resource,
+     )
+     # The main class that executes the campaign on a resource.
+     b = Bookkeeper(
+         campaign=campaign,
+         policy=policy,
+         target_resource=target_resource,
+         deadline=humanfriendly.parse_timespan(config["campaign"]["deadline"]) / 60,
+         dryrun=args.dry_run
+     )
+
+     b.run()
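
For reference, the two `humanfriendly` calls in `_main` normalise the resource entries of the TOML file: `parse_size` returns bytes (SI units by default) and `parse_timespan` returns seconds, so the integer division by 1000000 yields megabytes and the division by 60 yields minutes. A minimal check:

```python
# Unit-conversion check mirroring _main (values are illustrative).
import humanfriendly

memory_mb = humanfriendly.parse_size("16GB") // 1000000   # 16000 MB
runtime_min = humanfriendly.parse_timespan("2h") / 60      # 120.0 minutes
print(memory_mb, runtime_min)
```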
socm/planner/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .base import PlanEntry, Planner # noqa: F401
+ from .heft_planner import HeftPlanner # noqa: F401
socm/planner/base.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ from typing import Dict, List, NamedTuple, Tuple
+
+ import networkx as nx
+ import radical.utils as ru
+
+ from ..core import Campaign, Resource, Workflow
+
+
+ class PlanEntry(NamedTuple):
+     """Represents a scheduled workflow in the execution plan."""
+     workflow: Workflow
+     cores: range
+     memory: float
+     start_time: float
+     end_time: float
+
+ class Planner(object):
+     """
+     The planner receives a campaign, a set of resources, and an execution time
+     estimation for each workflow per resource, and calculates a plan. The plan is
+     a list of tuples. Each tuple defines at least:
+     Workflow: A workflow member of the campaign
+     Resource: The resource on which the workflow will be executed.
+
+     Each planning class should always implement a plan method. This method
+     should calculate and return the execution plan. Each class can overload the
+     basic tuple with additional information based on what the planner is supposed
+     to do.
+     """
+
+     def __init__(
+         self,
+         campaign: Campaign | None = None,
+         resources: Resource | None = None,
+         resource_requirements: Dict[int, Dict[str, float]] | None = None,
+         policy: str | None = None,
+         sid: str | None = None,
+         objective: int | None = None
+     ):
+         self._campaign = campaign
+         self._resources = resources
+         self._resource_requirements = resource_requirements
+         self._policy = policy
+         self._objective = objective
+         self._plan = list()
+         self._uid = ru.generate_id("planner.%(counter)04d", mode=ru.ID_CUSTOM, ns=sid)
+         sid = sid if sid is not None else ru.generate_id("planner.%(counter)04d", mode=ru.ID_CUSTOM)
+         path = os.getcwd() + "/" + sid
+         self._logger = ru.Logger(name=self._uid, level="DEBUG", path=path)
+
+     def plan(
+         self,
+         campaign: List[Workflow] | None = None,
+         resources: range | None = None,
+         resource_requirements: Dict[int, Dict[str, float]] | None = None,
+         start_time: int = 0,
+         **kargs,
+     ) -> Tuple[List[Tuple[Workflow, range, float, float]], nx.DiGraph]:
+         """
+         The planning method.
+         """
+
+         raise NotImplementedError("Plan method is not implemented")
+
+     def replan(
+         self,
+         campaign: List[Workflow] | None = None,
+         resources: range | None = None,
+         resource_requirements: Dict[int, Dict[str, float]] | None = None,
+         start_time: int = 0,
+     ) -> Tuple[List[Tuple[Workflow, range, float, float]], nx.DiGraph]:
+         """
+         The replanning method.
+         """
+         if campaign and resources and resource_requirements:
+             self._logger.debug("Replanning")
+             self._plan = self.plan(
+                 campaign=campaign,
+                 resources=resources,
+                 resource_requirements=resource_requirements,
+                 start_time=start_time,
+             )
+         else:
+             self._logger.debug("Nothing to plan for")
+
+         return self._plan
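
To make the `Planner` contract concrete, here is a minimal illustrative subclass (not part of the package): it places every workflow back-to-back on the full core range and returns the `(workflow, cores, start, end)` tuples plus a dependency graph that `plan()` is expected to produce. The `HeftPlanner` in `heft_planner.py` implements the real scheduling policy; the assumption that `resource_requirements` is keyed by workflow id and carries a `"runtime"` entry is not confirmed by this diff.

```python
# Minimal illustrative Planner subclass; assumes resource_requirements is keyed
# by workflow id and carries a "runtime" entry -- neither is confirmed by this diff.
import networkx as nx

from socm.planner import Planner


class SerialPlanner(Planner):
    def plan(self, campaign=None, resources=None, resource_requirements=None,
             start_time=0, **kargs):
        plan, graph, current = [], nx.DiGraph(), float(start_time)
        for workflow in campaign or []:
            runtime = resource_requirements[workflow.id]["runtime"]
            # Give every workflow the whole core range, one after the other.
            plan.append((workflow, resources, current, current + runtime))
            graph.add_node(workflow.id)
            current += runtime
        self._plan = plan
        return self._plan, graph
```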