so-campaign-manager 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. so_campaign_manager-0.0.4.dist-info/METADATA +179 -0
  2. so_campaign_manager-0.0.4.dist-info/RECORD +44 -0
  3. so_campaign_manager-0.0.4.dist-info/WHEEL +5 -0
  4. so_campaign_manager-0.0.4.dist-info/entry_points.txt +2 -0
  5. so_campaign_manager-0.0.4.dist-info/licenses/LICENSE +24 -0
  6. so_campaign_manager-0.0.4.dist-info/top_level.txt +1 -0
  7. socm/__about__.py +34 -0
  8. socm/__init__.py +0 -0
  9. socm/__main__.py +35 -0
  10. socm/bookkeeper/__init__.py +1 -0
  11. socm/bookkeeper/bookkeeper.py +488 -0
  12. socm/configs/slurmise.toml +2 -0
  13. socm/core/__init__.py +1 -0
  14. socm/core/models.py +235 -0
  15. socm/enactor/__init__.py +3 -0
  16. socm/enactor/base.py +123 -0
  17. socm/enactor/dryrun_enactor.py +216 -0
  18. socm/enactor/rp_enactor.py +273 -0
  19. socm/execs/__init__.py +3 -0
  20. socm/execs/mapmaking.py +73 -0
  21. socm/planner/__init__.py +2 -0
  22. socm/planner/base.py +87 -0
  23. socm/planner/heft_planner.py +442 -0
  24. socm/resources/__init__.py +5 -0
  25. socm/resources/perlmutter.py +22 -0
  26. socm/resources/tiger.py +24 -0
  27. socm/resources/universe.py +18 -0
  28. socm/utils/__init__.py +0 -0
  29. socm/utils/misc.py +90 -0
  30. socm/utils/states.py +17 -0
  31. socm/workflows/__init__.py +41 -0
  32. socm/workflows/ml_mapmaking.py +111 -0
  33. socm/workflows/ml_null_tests/__init__.py +10 -0
  34. socm/workflows/ml_null_tests/base.py +117 -0
  35. socm/workflows/ml_null_tests/day_night_null_test.py +132 -0
  36. socm/workflows/ml_null_tests/direction_null_test.py +133 -0
  37. socm/workflows/ml_null_tests/elevation_null_test.py +118 -0
  38. socm/workflows/ml_null_tests/moon_close_null_test.py +165 -0
  39. socm/workflows/ml_null_tests/moonrise_set_null_test.py +151 -0
  40. socm/workflows/ml_null_tests/pwv_null_test.py +118 -0
  41. socm/workflows/ml_null_tests/sun_close_null_test.py +173 -0
  42. socm/workflows/ml_null_tests/time_null_test.py +76 -0
  43. socm/workflows/ml_null_tests/wafer_null_test.py +175 -0
  44. socm/workflows/sat_simulation.py +76 -0
socm/enactor/rp_enactor.py ADDED
@@ -0,0 +1,273 @@
+ # Imports from general packages
+ import os
+ import threading as mt
+ from copy import deepcopy
+ from datetime import datetime
+ from typing import Dict, List
+
+ # Imports from dependent packages
+ import numpy as np # noqa: F401
+ import radical.pilot as rp
+ import radical.utils as ru
+
+ from socm.core import Resource, Workflow
+ from socm.enactor.base import Enactor
+ from socm.utils.states import States
+
+
+ class RPEnactor(Enactor):
+     """
+     The RP enactor is responsible for executing workflows on HPC resources
+     via RADICAL-Pilot. The enactor takes as input a list of tuples
+     <workflow, resource> and executes the workflows on their selected
+     resources.
+     """
+
+     def __init__(self, sid: str):
+         super(RPEnactor, self).__init__(sid=sid)
+         # List with all the workflows that are executing and require to be
+         # monitored. This list is atomic and requires a lock
+         self._to_monitor = list()
+
+         os.environ["RADICAL_CONFIG_USER_DIR"] = os.path.join(
+             os.path.dirname(__file__) + "/../configs/"
+         )
+         self._prof.prof("enactor_setup", uid=self._uid)
+         # Lock to provide atomicity in the monitoring data structure
+         self._monitoring_lock = ru.RLock("cm.monitor_lock")
+         self._cb_lock = ru.RLock("enactor.cb_lock")
+         self._callbacks = dict()
+
+         # Creating a thread to execute the monitoring method.
+         self._monitoring_thread = None  # Private attribute that will hold the thread
+         self._terminate_monitor = mt.Event()  # Thread event to terminate.
+
+         self._run = False
+         self._resource = None
+         self._prof.prof("enactor_started", uid=self._uid)
+         self._rp_session = rp.Session(uid=sid)
+         self._rp_pmgr = rp.PilotManager(session=self._rp_session)
+         self._rp_tmgr = rp.TaskManager(session=self._rp_session)
+         self._logger.info("Enactor is ready")
+
+     def setup(self, resource: Resource, walltime: int, cores: int, execution_schema: str | None = None) -> None:
+         """
+         Sets up the enactor to execute workflows.
+         """
+         self._resource = resource
+
+         pd_init = {
+             "resource": f"so.{resource.name}",
+             "runtime": walltime,  # pilot runtime (min)
+             "exit_on_error": True,
+             "access_schema": "batch" if execution_schema == "batch" else "local",
+             "cores": cores,
+             "project": "simonsobs",
+         }
+
+         pdesc = rp.PilotDescription(pd_init)
+         self._logger.debug(f"Asking for {pdesc} pilot")
+         pilot = self._rp_pmgr.submit_pilots(pdesc)
+         self._rp_tmgr.add_pilots(pilot)
+
+         pilot.wait(state=rp.PMGR_ACTIVE)
+         self._logger.info("Pilot is ready")
+
+     def enact(self, workflows: List[Workflow]) -> None:
+         """
+         The enact method receives a set of workflows and resources. It is
+         responsible for starting the execution of each workflow and setting
+         an endpoint to the WMF that executes the workflow.
+
+         *workflows:* The workflows that will execute on a resource
+         *resources:* The resource that will be used.
+         """
+
+         self._prof.prof("enacting_start", uid=self._uid)
+         exec_workflows = []
+         for workflow in workflows:
+             # If the enactor has already received a workflow, issue a warning
+             # and proceed.
+             if workflow.id in self._execution_status:
+                 self._logger.info(
+                     "Workflow %s is in state %s",
+                     workflow,
+                     self._get_workflow_state(workflow.id).name,
+                 )
+                 continue
+
+             try:
+                 # Create a calculator task. This is equivalent because with
+                 # the emulated resources, a workflow is a number of operations
+                 # that need to be executed.
+
+                 exec_workflow = (
+                     rp.TaskDescription()
+                 )  # Use workflow description and resources to create the TaskDescription
+                 exec_workflow.uid = f"workflow.{workflow.id}"
+
+                 exec_workflow.executable = workflow.executable
+                 exec_workflow.arguments = []
+                 if workflow.subcommand:
+                     exec_workflow.arguments += [workflow.subcommand]
+                 exec_workflow.arguments += workflow.get_arguments()
+                 self._logger.debug(
+                     "Workflow %s arguments: %s", workflow.id, exec_workflow.arguments
+                 )
+
+                 exec_workflow.ranks = workflow.resources["ranks"]
+                 exec_workflow.cores_per_rank = workflow.resources["threads"]
+                 exec_workflow.threading_type = rp.OpenMP
+                 # exec_workflow.mem_per_rank = np.ceil(
+                 #     workflow.resources["memory"] / workflow.resources["ranks"]
+                 # )  # this translates to memory per rank
+                 exec_workflow.post_exec = "echo ${SLURM_JOB_ID}.${SLURM_STEP_ID}"
+                 if workflow.environment:
+                     exec_workflow.environment = workflow.environment
+                 self._logger.info("Enacting workflow %s", workflow.id)
+                 exec_workflows.append(exec_workflow)
+                 # Lock the monitoring list and update it, as well as update
+                 # the state of the workflow.
+                 with self._monitoring_lock:
+                     self._to_monitor.append(workflow.id)
+                     self._execution_status[workflow.id] = {
+                         "state": States.EXECUTING,
+                         "endpoint": exec_workflow,
+                         "exec_thread": None,
+                         "start_time": datetime.now(),
+                         "end_time": None,
+                     }
+
+                 for cb in self._callbacks:
+                     self._callbacks[cb](
+                         workflow_ids=[workflow.id],
+                         new_state=States.EXECUTING,
+                         step_ids=[None],
+                     )
+             # Execute the task.
+             except Exception as ex:
+                 self._logger.error(f"Workflow {workflow} could not be executed")
+                 self._logger.error(f"Exception raised {ex}", exc_info=True)
+
+         self._rp_tmgr.submit_tasks(exec_workflows)
+
+         self._prof.prof("enacting_stop", uid=self._uid)
+         # If there is no monitoring thread, start one.
+         if self._monitoring_thread is None:
+             self._logger.info("Starting monitor thread")
+             self._monitoring_thread = mt.Thread(
+                 target=self._monitor, name="monitor-thread"
+             )
+             self._monitoring_thread.start()
+
+     def _monitor(self):
+         """
+         **Purpose**: Thread in the master process that keeps the campaign
+         execution data structure up to date.
+         """
+
+         self._prof.prof("workflow_monitor_start", uid=self._uid)
+         while not self._terminate_monitor.is_set():
+             if self._to_monitor:
+                 workflows_executing = self._rp_tmgr.list_tasks()
+                 # with self._monitoring_lock:
+                 # It does not iterate correctly.
+                 monitoring_list = deepcopy(self._to_monitor)
+                 # self._logger.info("Monitoring workflows %s" % monitoring_list)
+                 to_remove_wfs = list()
+                 to_remove_sids = list()
+
+                 for workflow_id in monitoring_list:
+                     if f"workflow.{workflow_id}" in workflows_executing:
+                         rp_workflow = self._rp_tmgr.get_tasks(
+                             uids=f"workflow.{workflow_id}"
+                         )
+                         if rp_workflow.state in rp.FINAL:
+                             with self._monitoring_lock:
+                                 self._logger.debug(f"workflow.{workflow_id} Done")
+                                 self._execution_status[workflow_id]["state"] = States.DONE
+                                 self._execution_status[workflow_id][
+                                     "end_time"
+                                 ] = datetime.now()
+                                 self._logger.debug(
+                                     "Workflow %s finished: %s, step_id: %s",
+                                     workflow_id,
+                                     self._execution_status[workflow_id]["end_time"],
+                                     rp_workflow.stdout.split()[-1],
+                                 )
+                                 to_remove_wfs.append(workflow_id)
+                                 to_remove_sids.append(rp_workflow.stdout.split()[-1])
+                                 self._prof.prof("workflow_success", uid=self._uid)
+                 if to_remove_wfs:
+                     for cb in self._callbacks:
+                         self._callbacks[cb](
+                             workflow_ids=to_remove_wfs,
+                             new_state=States.DONE,
+                             step_ids=to_remove_sids,
+                         )
+                     with self._monitoring_lock:
+                         for wid in to_remove_wfs:
+                             self._to_monitor.remove(wid)
+         self._prof.prof("workflow_monitor_end", uid=self._uid)
+
+     def get_status(self, workflows: str | List[str] | None = None) -> Dict[str, States]:
+         """
+         Get the state of a workflow or workflows.
+
+         *Parameter*
+         *workflows:* A workflow ID or a list of workflow IDs
+
+         *Returns*
+         *status*: A dictionary with the state of each workflow.
+         """
+
+         status = dict()
+         if workflows is None:
+             for workflow in self._execution_status:
+                 status[workflow] = self._execution_status[workflow]["state"]
+         elif isinstance(workflows, list):
+             for workflow in workflows:
+                 status[workflow] = self._execution_status[workflow]["state"]
+         else:
+             status[workflows] = self._execution_status[workflows]["state"]
+
+         return status
+
+     def update_status(self, workflow, new_state):
+         """
+         Update the state of a workflow that is executing.
+         """
+
+         if workflow not in self._execution_status:
+             self._logger.warning(
+                 "Has not enacted on workflow %s yet. Current state: %s",
+                 workflow,
+                 self._get_workflow_state(workflow),
+             )
+         else:
+             self._execution_status[workflow]["state"] = new_state
+
+     def terminate(self):
+         """
+         Public method to terminate the Enactor.
+         """
+         self._logger.info("Start terminating procedure")
+         self._prof.prof("str_terminating", uid=self._uid)
+         if self._monitoring_thread:
+             self._prof.prof("monitor_terminate", uid=self._uid)
+             self._terminate_monitor.set()
+             self._monitoring_thread.join()
+             self._prof.prof("monitor_terminated", uid=self._uid)
+             self._logger.debug("Monitor thread terminated")
+         # self._rp_tmgr.close()
+         self._rp_pmgr.close(terminate=True)
+         self._rp_session.close(terminate=True)
+         self._logger.debug("Enactor thread terminated")
+
+     def register_state_cb(self, cb):
+         """
+         Registers a new state update callback function with the Enactor.
+         """
+
+         with self._cb_lock:
+             cb_name = cb.__name__
+             self._callbacks[cb_name] = cb
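
Note: the sketch below illustrates how the RPEnactor API above fits together (construct, register a callback, set up a pilot, enact workflows, poll, terminate). It is not part of the package; the `Resource` and `Workflow` objects come from `socm/core/models.py`, which is not shown in this excerpt, so their construction is left as placeholders.

```python
# Illustrative usage sketch for RPEnactor; not part of the released package.
import time

from socm.enactor.rp_enactor import RPEnactor
from socm.utils.states import States


def state_cb(workflow_ids, new_state, step_ids):
    # Signature matches the keyword arguments the enactor passes to callbacks.
    print(f"workflows {workflow_ids} -> {new_state} (steps {step_ids})")


enactor = RPEnactor(sid="rp.session.example.0000")  # hypothetical session id
enactor.register_state_cb(state_cb)

resource = ...   # a socm.core.Resource; its definition is in models.py (not shown here)
workflows = []   # a list of socm.core.Workflow objects built by the campaign manager

enactor.setup(resource=resource, walltime=120, cores=112, execution_schema="batch")
enactor.enact(workflows)

# Poll until every enacted workflow reaches DONE, then shut the pilot down.
while any(state != States.DONE for state in enactor.get_status().values()):
    time.sleep(30)
enactor.terminate()
```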
socm/execs/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from . import mapmaking
+
+ SUBCOMMANDS = {"lat-mapmaking": mapmaking}
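
The `SUBCOMMANDS` mapping ties the CLI together: each value is a module exposing `get_parser(parser)` and `_main(args)`, as `socm/execs/mapmaking.py` does below. The actual dispatch lives in `socm/__main__.py` (+35 lines, not shown in this excerpt); a plausible sketch of that pattern, assuming argparse sub-parsers, looks like this:

```python
# Plausible dispatch sketch only -- the real socm/__main__.py is not shown in this diff.
from argparse import ArgumentParser

from socm.execs import SUBCOMMANDS


def main() -> None:
    parser = ArgumentParser(prog="socm")
    subparsers = parser.add_subparsers(dest="subcommand", required=True)
    for name, module in SUBCOMMANDS.items():
        # Each subcommand module fills in its own arguments, e.g. "lat-mapmaking".
        module.get_parser(subparsers.add_parser(name))
    args = parser.parse_args()
    SUBCOMMANDS[args.subcommand]._main(args)


if __name__ == "__main__":
    main()
```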
socm/execs/mapmaking.py ADDED
@@ -0,0 +1,73 @@
+ from argparse import ArgumentParser, Namespace
+
+ import humanfriendly
+ import toml
+
+ from socm.core.models import Campaign
+ from socm.utils.misc import get_workflow_entries, parse_comma_separated_fields
+ from socm.workflows import registered_workflows, subcampaign_map
+
+
+ def get_parser(parser: ArgumentParser) -> ArgumentParser:
+     """Create and return a sub-argument parser for the LAT mapmaking campaign."""
+     parser.add_argument(
+         "--toml",
+         "-t",
+         type=str,
+         required=True,
+         help="Path to the configuration file for the workflow.",
+     )
+     parser.add_argument(
+         "--dry-run",
+         action="store_true",
+         help="Enable dry run for faster development. This flag does not actually run the campaign.",
+     )
+     return parser
+
+ def _main(args: Namespace) -> None:
+     # Import here to avoid loading radical.pilot at CLI startup (not available on macOS)
+     from socm.bookkeeper import Bookkeeper
+
+     config = toml.load(args.toml)
+     config = parse_comma_separated_fields(config=config, fields_to_parse=["maxiter", "downsample"])
+     workflows_configs = get_workflow_entries(config, subcampaign_map=subcampaign_map)
+
+     workflows = []
+     for workflow_type, workflow_config in workflows_configs.items():
+         if workflow_type in registered_workflows:
+             workflow_config["resources"]["memory"] = (
+                 humanfriendly.parse_size(workflow_config["resources"]["memory"])
+                 // 1000000
+             )  # in MB
+             workflow_config["resources"]["runtime"] = (
+                 humanfriendly.parse_timespan(workflow_config["resources"]["runtime"])
+                 / 60
+             )  # in minutes
+             workflow_factory = registered_workflows[workflow_type]
+             tmp_workflows = workflow_factory.get_workflows(workflow_config)
+             for workflow in tmp_workflows:
+                 workflow.id = len(workflows) + 1  # Assign a unique ID to each workflow
+                 workflows.append(workflow)
+
+     policy = config["campaign"].get("policy", "time")
+     target_resource = config["campaign"].get("resource", "tiger3")
+     # pprint(workflows)
+     campaign = Campaign(
+         id=1,
+         workflows=workflows,
+         campaign_policy=policy,
+         deadline=config["campaign"]["deadline"],
+         execution_schema=config["campaign"]["execution_schema"],
+         requested_resources=config["campaign"]["requested_resources"],
+         target_resource=target_resource,
+     )
+     # The main class that executes the campaign on a resource.
+     b = Bookkeeper(
+         campaign=campaign,
+         policy=policy,
+         target_resource=target_resource,
+         deadline=humanfriendly.parse_timespan(config["campaign"]["deadline"]) / 60,
+         dryrun=args.dry_run
+     )
+
+     b.run()
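
For reference, the two `humanfriendly` calls in `_main` normalise the resource entries of the TOML file: `parse_size` returns bytes (SI units by default) and `parse_timespan` returns seconds, so the integer division by 1000000 yields megabytes and the division by 60 yields minutes. A minimal check:

```python
# Unit-conversion check mirroring _main (values are illustrative).
import humanfriendly

memory_mb = humanfriendly.parse_size("16GB") // 1000000   # 16000 MB
runtime_min = humanfriendly.parse_timespan("2h") / 60      # 120.0 minutes
print(memory_mb, runtime_min)
```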
socm/planner/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .base import PlanEntry, Planner # noqa: F401
+ from .heft_planner import HeftPlanner # noqa: F401
socm/planner/base.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ from typing import Dict, List, NamedTuple, Tuple
+
+ import networkx as nx
+ import radical.utils as ru
+
+ from ..core import Campaign, Resource, Workflow
+
+
+ class PlanEntry(NamedTuple):
+     """Represents a scheduled workflow in the execution plan."""
+     workflow: Workflow
+     cores: range
+     memory: float
+     start_time: float
+     end_time: float
+
+ class Planner(object):
+     """
+     The planner receives a campaign, a set of resources, and an execution time
+     estimation for each workflow per resource, and calculates a plan. The plan is
+     a list of tuples. Each tuple defines at least:
+     Workflow: A workflow member of the campaign
+     Resource: The resource on which the workflow will be executed.
+
+     Each planning class should always implement a plan method. This method
+     should calculate and return the execution plan. Each class can overload the
+     basic tuple with additional information based on what the planner is supposed
+     to do.
+     """
+
+     def __init__(
+         self,
+         campaign: Campaign | None = None,
+         resources: Resource | None = None,
+         resource_requirements: Dict[int, Dict[str, float]] | None = None,
+         policy: str | None = None,
+         sid: str | None = None,
+         objective: int | None = None
+     ):
+         self._campaign = campaign
+         self._resources = resources
+         self._resource_requirements = resource_requirements
+         self._policy = policy
+         self._objective = objective
+         self._plan = list()
+         self._uid = ru.generate_id("planner.%(counter)04d", mode=ru.ID_CUSTOM, ns=sid)
+         sid = sid if sid is not None else ru.generate_id("planner.%(counter)04d", mode=ru.ID_CUSTOM)
+         path = os.getcwd() + "/" + sid
+         self._logger = ru.Logger(name=self._uid, level="DEBUG", path=path)
+
+     def plan(
+         self,
+         campaign: List[Workflow] | None = None,
+         resources: range | None = None,
+         resource_requirements: Dict[int, Dict[str, float]] | None = None,
+         start_time: int = 0,
+         **kargs,
+     ) -> Tuple[List[Tuple[Workflow, range, float, float]], nx.DiGraph]:
+         """
+         The planning method.
+         """
+
+         raise NotImplementedError("Plan method is not implemented")
+
+     def replan(
+         self,
+         campaign: List[Workflow] | None = None,
+         resources: range | None = None,
+         resource_requirements: Dict[int, Dict[str, float]] | None = None,
+         start_time: int = 0,
+     ) -> Tuple[List[Tuple[Workflow, range, float, float]], nx.DiGraph]:
+         """
+         The replanning method.
+         """
+         if campaign and resources and resource_requirements:
+             self._logger.debug("Replanning")
+             self._plan = self.plan(
+                 campaign=campaign,
+                 resources=resources,
+                 resource_requirements=resource_requirements,
+                 start_time=start_time,
+             )
+         else:
+             self._logger.debug("Nothing to plan for")
+
+         return self._plan
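
To make the `Planner` contract concrete, here is a minimal illustrative subclass (not part of the package): it places every workflow back-to-back on the full core range and returns the `(workflow, cores, start, end)` tuples plus a dependency graph that `plan()` is expected to produce. The `HeftPlanner` in `heft_planner.py` implements the real scheduling policy; the assumption that `resource_requirements` is keyed by workflow id and carries a `"runtime"` entry is not confirmed by this diff.

```python
# Minimal illustrative Planner subclass; assumes resource_requirements is keyed
# by workflow id and carries a "runtime" entry -- neither is confirmed by this diff.
import networkx as nx

from socm.planner import Planner


class SerialPlanner(Planner):
    def plan(self, campaign=None, resources=None, resource_requirements=None,
             start_time=0, **kargs):
        plan, graph, current = [], nx.DiGraph(), float(start_time)
        for workflow in campaign or []:
            runtime = resource_requirements[workflow.id]["runtime"]
            # Give every workflow the whole core range, one after the other.
            plan.append((workflow, resources, current, current + runtime))
            graph.add_node(workflow.id)
            current += runtime
        self._plan = plan
        return self._plan, graph
```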