so-campaign-manager 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. so_campaign_manager-0.0.4.dist-info/METADATA +179 -0
  2. so_campaign_manager-0.0.4.dist-info/RECORD +44 -0
  3. so_campaign_manager-0.0.4.dist-info/WHEEL +5 -0
  4. so_campaign_manager-0.0.4.dist-info/entry_points.txt +2 -0
  5. so_campaign_manager-0.0.4.dist-info/licenses/LICENSE +24 -0
  6. so_campaign_manager-0.0.4.dist-info/top_level.txt +1 -0
  7. socm/__about__.py +34 -0
  8. socm/__init__.py +0 -0
  9. socm/__main__.py +35 -0
  10. socm/bookkeeper/__init__.py +1 -0
  11. socm/bookkeeper/bookkeeper.py +488 -0
  12. socm/configs/slurmise.toml +2 -0
  13. socm/core/__init__.py +1 -0
  14. socm/core/models.py +235 -0
  15. socm/enactor/__init__.py +3 -0
  16. socm/enactor/base.py +123 -0
  17. socm/enactor/dryrun_enactor.py +216 -0
  18. socm/enactor/rp_enactor.py +273 -0
  19. socm/execs/__init__.py +3 -0
  20. socm/execs/mapmaking.py +73 -0
  21. socm/planner/__init__.py +2 -0
  22. socm/planner/base.py +87 -0
  23. socm/planner/heft_planner.py +442 -0
  24. socm/resources/__init__.py +5 -0
  25. socm/resources/perlmutter.py +22 -0
  26. socm/resources/tiger.py +24 -0
  27. socm/resources/universe.py +18 -0
  28. socm/utils/__init__.py +0 -0
  29. socm/utils/misc.py +90 -0
  30. socm/utils/states.py +17 -0
  31. socm/workflows/__init__.py +41 -0
  32. socm/workflows/ml_mapmaking.py +111 -0
  33. socm/workflows/ml_null_tests/__init__.py +10 -0
  34. socm/workflows/ml_null_tests/base.py +117 -0
  35. socm/workflows/ml_null_tests/day_night_null_test.py +132 -0
  36. socm/workflows/ml_null_tests/direction_null_test.py +133 -0
  37. socm/workflows/ml_null_tests/elevation_null_test.py +118 -0
  38. socm/workflows/ml_null_tests/moon_close_null_test.py +165 -0
  39. socm/workflows/ml_null_tests/moonrise_set_null_test.py +151 -0
  40. socm/workflows/ml_null_tests/pwv_null_test.py +118 -0
  41. socm/workflows/ml_null_tests/sun_close_null_test.py +173 -0
  42. socm/workflows/ml_null_tests/time_null_test.py +76 -0
  43. socm/workflows/ml_null_tests/wafer_null_test.py +175 -0
  44. socm/workflows/sat_simulation.py +76 -0
@@ -0,0 +1,488 @@
1
+ import os
2
+ import threading as mt
3
+ from importlib.resources import files
4
+ from math import ceil, floor
5
+ from pathlib import Path
6
+ from time import sleep
7
+ from typing import Dict
8
+
9
+ import radical.utils as ru
10
+ from slurmise.api import Slurmise
11
+ from slurmise.job_data import JobData
12
+ from slurmise.job_parse.file_parsers import FileMD5
13
+ from slurmise.slurm import parse_slurm_job_metadata
14
+
15
+ from ..core import Campaign, Workflow
16
+ from ..enactor import DryrunEnactor, RPEnactor
17
+ from ..planner import HeftPlanner
18
+ from ..resources import registered_resources
19
+ from ..utils.states import CFINAL, States
20
+
21
+
22
class Bookkeeper(object):
    """
    Coordinate the execution of a campaign: derive per-workflow resource
    requirements, call the planner to build an execution plan, hand ready
    workflows to an enactor, and monitor them until the campaign reaches a
    final state.

    *Parameters:*

    *campaign:* The campaign that needs to be executed.
    *policy:* Planning policy name forwarded to the HEFT planner.
    *target_resource:* Key into ``registered_resources`` selecting the machine.
    *deadline:* The campaign's objective (maximum walltime).
    *dryrun:* When True, use the dry-run enactor instead of submitting jobs.
    """
33
+
34
    def __init__(
        self,
        campaign: Campaign,
        policy: str,
        target_resource: str,
        deadline: int,
        dryrun: bool = False,
    ):
        """
        Initialize bookkeeping state, logging/profiling, the planner, and
        the enactor for *campaign*.
        """
        # The campaign together with its lifecycle state (advanced by work()).
        self._campaign = {"campaign": campaign, "state": States.NEW}
        self._session_id = ru.generate_id("socm.session", mode=ru.ID_PRIVATE)
        self._uid = ru.generate_id(
            "bookkeeper.%(counter)04d", mode=ru.ID_CUSTOM, ns=self._session_id
        )

        self._resource = registered_resources[target_resource]()
        self._checkpoints = None  # sorted timestamps where workflows may start/end
        # Plan entries are tuples whose last two fields are start/end times;
        # exact layout is defined by the planner — TODO confirm against HeftPlanner.
        self._plan = None
        self._plan_graph = None  # dependency graph over planned workflow ids
        self._unavail_resources = []  # cores currently held by running workflows
        self._workflows_state = dict()  # workflow id -> States
        self._workflows_execids = dict()  # workflow id -> "<slurm_id>.<step_id>"
        self._objective = deadline
        self._exec_state_lock = ru.RLock("workflows_state_lock")
        self._monitor_lock = ru.RLock("monitor_list_lock")
        self._slurmise = Slurmise(toml_path=files("socm.configs") / "slurmise.toml")

        # Per-session working directory for logs and profiles.
        path = os.getcwd() + "/" + self._session_id

        self._logger = ru.Logger(name=self._uid, path=path, level="DEBUG")
        self._prof = ru.Profiler(name=self._uid, path=path)
        self._logger.debug(f"Deadline {deadline}")
        self._planner = HeftPlanner(
            sid=self._session_id,
            policy=policy,
            resources=self._resource,
            objective=deadline
        )

        self._workflows_to_monitor = list()
        self._est_end_times = dict()  # core id -> estimated release time
        self._enactor = RPEnactor(sid=self._session_id) if not dryrun else DryrunEnactor(sid=self._session_id)
        self._dryrun = dryrun
        # NOTE(review): both callbacks are registered through register_state_cb;
        # confirm the enactor dispatches workflow-id updates via the same channel.
        self._enactor.register_state_cb(self.state_update_cb)
        self._enactor.register_state_cb(self.workflowid_update_cb)

        # Creating a thread to execute the monitoring and work methods.
        # One flag for both threads may be enough to monitor and check.
        self._terminate_event = mt.Event()  # Thread event to terminate.
        self._work_thread = None  # Private attribute that will hold the thread
        self._monitoring_thread = None  # Private attribute that will hold the thread
87
+
88
    def _get_campaign_requirements(self) -> Dict[str, Dict[str, float | int]]:
        """
        Return per-workflow resource requirements keyed by workflow id.

        Each entry holds ``req_cpus``, ``req_memory`` and ``req_walltime``
        (walltime padded by 10%). Slurmise-based prediction is currently
        stubbed out, so requirements fall back to the values declared on
        each workflow's ``resources`` mapping.
        """
        workflow_requirements = dict()
        # NOTE(review): scanning core counts against the whole machine is
        # disabled; total_cores is pinned to 1 so the loop below runs once.
        total_cores = 1  # self._resource.nodes * self._resource.cores_per_node
        # total_memory = self._resource.nodes * self._resource.memory_per_node
        for workflow in self._campaign["campaign"].workflows:
            # tmp_runtime = np.inf
            cores = 1
            while cores <= total_cores:
                self._logger.debug(
                    f"Workflow command: {workflow.get_command()} and subcommand: {workflow.subcommand}"
                )
                # Placeholder for the Slurmise prediction call below: no job
                # prediction, and a non-empty warns list forces the fallback
                # branch after the loop.
                slurm_job, warns = (
                    None,
                    [1, 2],
                )
                # slurm_job, warns = self._slurmise.predict(cmd=workflow.get_command(), job_name=workflow.subcommand)
                # self._logger.debug(
                #     f"Slurm job prediction for {workflow.id}: {slurm_job}, "
                #     f"runtime: {slurm_job.runtime}, memory: {slurm_job.memory}"
                # )
                cores *= 2
                # if tmp_runtime / slurm_job.runtime > 1.5 and slurm_job.memory < total_memory:
                #     tmp_runtime = slurm_job.runtime
                #     cores *= 2
                # else:
                #     break

            if cores > total_cores or len(warns) > 0:
                # Fallback: trust the requirements declared on the workflow.
                workflow_requirements[workflow.id] = {
                    "req_cpus": workflow.resources["ranks"]
                    * workflow.resources["threads"],
                    "req_memory": workflow.resources["memory"],
                    "req_walltime": workflow.resources["runtime"]
                    * 1.1,  # Adding 10% to the runtime
                }
            else:
                # NOTE(review): unreachable while prediction is stubbed out
                # (warns is always non-empty); if reached it would raise
                # AttributeError because slurm_job is None. Revisit when
                # re-enabling self._slurmise.predict above.
                workflow.resources["ranks"] = cores // 2
                workflow.resources["threads"] = 1
                workflow.resources["memory"] = slurm_job.memory
                workflow_requirements[workflow.id] = {
                    "req_cpus": cores // 2,
                    "req_memory": slurm_job.memory,
                    "req_walltime": slurm_job.runtime
                    * 1.1,  # Adding 10% to the runtime
                }
        return workflow_requirements
134
+
135
+ def _update_checkpoints(self):
136
+ """
137
+ Create a list of timestamps when workflows may start executing or end.
138
+ """
139
+
140
+ self._checkpoints = [0]
141
+
142
+ for work in self._plan:
143
+ if work[-2] not in self._checkpoints:
144
+ self._checkpoints.append(work[-2])
145
+ if work[-1] not in self._checkpoints:
146
+ self._checkpoints.append(work[-1])
147
+
148
+ self._checkpoints.sort()
149
+
150
+ def _verify_objective(self):
151
+ """
152
+ This private method verifies that the plan has not deviated from the
153
+ maximum walltime. It checks the estimated makespan of the campaign and
154
+ compares it with the maximum walltime.
155
+ """
156
+
157
+ self._update_checkpoints()
158
+
159
+ if self._checkpoints[-1] > self._objective:
160
+ return False
161
+ else:
162
+ return True
163
+
164
    def _record(self, workflow: Workflow) -> None:
        """
        Record the workflow execution data to the performance prediction
        system (Slurmise) so future runs can be predicted.

        Skipped entirely for dry runs, where no Slurm job exists.
        """
        if self._dryrun:
            return

        self._logger.debug(
            f"Recording workflow {workflow.id} with execid {self._workflows_execids[workflow.id]}"
        )
        # Execution ids are stored as "<slurm_id>.<step_id>".
        slurm_id, step_id = self._workflows_execids[workflow.id].split(".")
        workflow_metadata = parse_slurm_job_metadata(
            slurm_id=slurm_id,
            step_name=step_id,
        )

        # Numeric features for the model: the resources used plus every
        # numeric attribute of the workflow (except its id).
        numerical_fields = {
            "ranks": workflow.resources["ranks"],
            "threads": workflow.resources["threads"],
        }
        for field in workflow.get_numeric_fields(avoid_attributes=["id"]):
            numerical_fields[field] = getattr(workflow, field)

        # Categorical features: file-valued attributes ("file://...") are
        # fingerprinted with MD5 so identical inputs compare equal.
        categorical_fields = {}
        for field in workflow.get_categorical_fields(
            avoid_attributes=["executable", "name", "context", "output_dir", "query"]
        ):
            val = getattr(workflow, field)
            field_val = (
                FileMD5().parse_file(Path(val.split("file://")[-1]).absolute())
                if val.startswith("file://")
                else val
            )
            categorical_fields[field] = field_val

        workflow_jobdata = JobData(
            job_name=workflow.name,
            slurm_id=f"{slurm_id}.{step_id}",
            categorical=categorical_fields,
            numerical=numerical_fields,
            memory=workflow_metadata["max_rss"],
            # NOTE(review): elapsed_seconds / 60 suggests runtime is recorded
            # in minutes — confirm Slurmise expects minutes here.
            runtime=workflow_metadata["elapsed_seconds"] / 60,
            cmd=workflow.get_command(),
        )
        self._logger.debug(
            "Workflow %s finished with metadata: %s and jobdata: %s",
            workflow.id,
            workflow_metadata,
            workflow_jobdata,
        )
        self._slurmise.raw_record(job_data=workflow_jobdata)
215
+
216
+ def state_update_cb(self, workflow_ids, new_state, **kargs):
217
+ """
218
+ This is a state update callback. This callback is passed to the enactor.
219
+ """
220
+ self._logger.debug("Workflow %s to state %s", workflow_ids, new_state.name)
221
+ with self._exec_state_lock:
222
+ for workflow_id in workflow_ids:
223
+ self._workflows_state[workflow_id] = new_state
224
+
225
+ def workflowid_update_cb(self, workflow_ids, step_ids, **kargs):
226
+ """
227
+ This is a state update callback. This callback is passed to the enactor.
228
+ """
229
+ self._logger.debug("Workflow %s with slurmid %s", workflow_ids, step_ids)
230
+ with self._exec_state_lock:
231
+ for workflow_id, step_id in zip(workflow_ids, step_ids):
232
+ self._workflows_execids[workflow_id] = step_id
233
+
234
    def work(self):
        """
        Execute the campaign: plan it (once), set up the enactor, then keep
        submitting workflows whose dependencies are satisfied until
        termination is requested or the objective can no longer be met.
        """

        # There is no need to check since I know there is no plan.
        self._logger.debug("Campaign state to PLANNING")
        self._prof.prof("planning_start", uid=self._uid)
        if self._plan is None:
            self._logger.debug("Calculating campaign plan")
            with self._exec_state_lock:
                self._campaign["state"] = States.PLANNING

            workflow_requirements = self._get_campaign_requirements()

            self._plan, self._plan_graph, selected_qos, cores_request = self._planner.plan(
                campaign=self._campaign["campaign"].workflows,
                execution_schema=self._campaign["campaign"].execution_schema,
                resource_requirements=workflow_requirements,
                requested_resources=self._campaign["campaign"].requested_resources
            )

        self._prof.prof("planning_ended", uid=self._uid)
        # NOTE(review): selected_qos/cores_request are bound only inside the
        # planning branch above; if a plan were ever pre-set this would
        # raise NameError here and in enactor.setup below.
        self._logger.debug(f"Calculated campaign plan with {selected_qos} QOS and requesting {cores_request} cores")

        # Update checkpoints and objective.
        self._update_checkpoints()
        if not self._verify_objective():
            self._logger.error("Objective cannot be satisfied. Ending execution")
            with self._exec_state_lock:
                self._campaign["state"] = States.FAILED
            sleep(1)
            return

        # Tighten the objective to at most 125% of the planned makespan.
        self._objective = int(
            ceil(min(self._checkpoints[-1] * 1.25, self._objective))
        )
        self._logger.debug(
            f"Campaign makespan {self._checkpoints[-1]}, and objective {self._objective}"
        )
        self._logger.debug(f"Resource max walltime {self._objective}")

        self._enactor.setup(
            resource=self._resource,
            walltime=self._objective,
            cores=cores_request,
            execution_schema=self._campaign["campaign"].execution_schema,
        )

        with self._exec_state_lock:
            self._campaign["state"] = States.EXECUTING
        self._logger.debug("Campaign state to EXECUTING")

        self._prof.prof("work_start", uid=self._uid)
        while not self._terminate_event.is_set():
            if not self._verify_objective():
                self._logger.error("Objective cannot be satisfied. Ending execution")
                with self._exec_state_lock:
                    self._campaign["state"] = States.FAILED
                # self.terminate()
            else:
                self._prof.prof("work_submit", uid=self._uid)
                workflows = list()  # Workflows to enact
                cores = list()  # The selected cores
                memory = list()  # The memory per workflow

                for wf_id in self._plan_graph.nodes():
                    predecessors_states = set()
                    for predecessor in self._plan_graph.predecessors(wf_id):
                        predecessors_states.add(self._workflows_state[predecessor])
                    # Do not enact to workflows that should have been executed
                    # already. A workflow is ready when it has no predecessors
                    # or all predecessors are DONE, and it is itself still NEW.
                    if (
                        predecessors_states == set()
                        or predecessors_states == set([States.DONE])
                    ) and self._workflows_state[wf_id] == States.NEW:
                        # NOTE(review): wf_id is assumed to be a 1-based index
                        # into self._plan — confirm against the planner.
                        node_slice = (
                            self._plan[wf_id - 1][2] / self._resource.memory_per_node
                        )
                        threads_per_core = floor(
                            self._resource.cores_per_node
                            * node_slice
                            / len(self._plan[wf_id - 1][1])
                        )
                        # print(node_slice, threads_per_core, self._plan[wf_id - 1])
                        workflows.append(self._plan[wf_id - 1][0])
                        cores.append((self._plan[wf_id - 1][1], threads_per_core))
                        memory.append(self._plan[wf_id - 1][2])

                        self._logger.debug(
                            f"To submit workflows {[x for x in workflows]}"
                            + f" to resources {cores}"
                        )

                        for rc_id in self._plan[wf_id - 1][1]:
                            self._est_end_times[rc_id] = self._plan[wf_id - 1][3]
                if workflows:
                    self._logger.debug(
                        f"Submitting workflows {[x.id for x in workflows]}"
                        + f" to resources {cores}"
                    )

                # There is no need to call the enactor when no new things
                # should happen.
                # self._logger.debug('Adding items: %s, %s', workflows, resources)
                if workflows and cores and memory:
                    self._prof.prof("enactor_submit", uid=self._uid)
                    self._enactor.enact(workflows=workflows)
                    self._prof.prof("enactor_submitted", uid=self._uid)

                    with self._monitor_lock:
                        self._workflows_to_monitor += workflows
                        self._unavail_resources += cores
                        self._logger.info(
                            f"Total number of workflows to monitor {len(workflows)}"
                        )
                        self._logger.debug(
                            "Things monitored: %s, %s, %s",
                            self._workflows_to_monitor,
                            self._unavail_resources,
                            self._est_end_times,
                        )

                self._prof.prof("work_submitted", uid=self._uid)
            sleep(1)
359
+
360
    def monitor(self):
        """
        This method monitors the state of the workflows. If the state is one
        of the final states, it records the workflow, removes it from the
        monitoring list, and releases its resources. Otherwise it keeps
        polling until termination is requested.
        """
        self._logger.info("Monitor thread started")
        # NOTE(review): when the monitor list is empty this outer loop spins
        # without sleeping — consider a short sleep here.
        while not self._terminate_event.is_set():
            while self._workflows_to_monitor:
                self._prof.prof("workflow_monitor", uid=self._uid)
                # Snapshot under the lock; the work thread appends to
                # _workflows_to_monitor and _unavail_resources in lockstep,
                # so index i pairs a workflow with the cores it occupies.
                with self._monitor_lock:
                    workflows_snapshot = list(self._workflows_to_monitor)
                finished = list()
                for i in range(len(workflows_snapshot)):
                    if self._workflows_state[workflows_snapshot[i].id] in CFINAL:
                        resource = self._unavail_resources[i]
                        finished.append((workflows_snapshot[i], resource))

                        self._record(workflows_snapshot[i])
                        self._logger.info(
                            "Workflow %s finished",
                            workflows_snapshot[i].id,
                        )

                if finished:
                    # Drop finished workflows and free their cores atomically.
                    with self._monitor_lock:
                        for workflow, resource in finished:
                            self._workflows_to_monitor.remove(workflow)
                            self._unavail_resources.remove(resource)
                    self._prof.prof("workflow_finished", uid=self._uid)
                else:
                    sleep(1)  # Sleep for a while if nothing happened.

        self._logger.debug("Monitor thread Stoped")
394
+
395
+ def get_makespan(self):
396
+ """
397
+ Returns the makespan of the campaign based on the current state of
398
+ execution
399
+ """
400
+
401
+ self._update_checkpoints()
402
+
403
+ return self._checkpoints[-1]
404
+
405
    def terminate(self):
        """
        Shut everything down: stop the enactor, signal both worker threads
        to terminate, and join them. Each stage is profiled.
        """
        self._logger.info("Start terminating procedure")
        self._prof.prof("str_bookkeper_terminating", uid=self._uid)

        # Terminate enactor as well.
        self._enactor.terminate()

        # Terminate your threads.
        self._logger.debug("Enactor terminated, terminating threads")

        self._terminate_event.set()  # Thread event to terminate.

        self._prof.prof("monitor_bookkeper_terminate", uid=self._uid)
        if self._monitoring_thread:
            self._monitoring_thread.join()
        self._prof.prof("monitor_bookkeper_terminated", uid=self._uid)
        self._logger.debug("Monitor thread terminated")

        self._prof.prof("work_bookkeper_terminate", uid=self._uid)
        if self._work_thread:
            self._work_thread.join()
        self._prof.prof("work_bookkeper_terminated", uid=self._uid)
        self._logger.debug("Working thread terminated")
428
+
429
+ def run(self):
430
+ """
431
+ This method starts two threads for executing the campaign. The first
432
+ thread starts the work method. The second thread the monitoring thread.
433
+ """
434
+ try:
435
+ # Populate the execution status dictionary with workflows
436
+ with self._exec_state_lock:
437
+ for workflow in self._campaign["campaign"].workflows:
438
+ self._workflows_state[workflow.id] = States.NEW
439
+ self._prof.prof("bookkeper_start", uid=self._uid)
440
+ self._logger.info("Starting work thread")
441
+ self._work_thread = mt.Thread(target=self.work, name=f"bookkeeper-{self._uid}-work")
442
+ self._work_thread.start()
443
+ self._logger.info("Starting monitor thread")
444
+ self._monitoring_thread = mt.Thread(target=self.monitor, name=f"bookkeeper-{self._uid}-monitor")
445
+ self._monitoring_thread.start()
446
+ self._prof.prof("bookkeper_started", uid=self._uid)
447
+
448
+ # This waits regardless if workflows are failing or not. This loop can
449
+ # do meaningful work such as checking the state of the campaign. It can
450
+ # be a while true, until something happens.
451
+ # self._logger.debug(
452
+ # "Time now: %s, checkpoints: %s", self._time, self._checkpoints
453
+ # )
454
+ while self._checkpoints is None:
455
+ continue
456
+
457
+ self._prof.prof("bookkeper_wait", uid=self._uid)
458
+ while self._campaign["state"] not in CFINAL:
459
+ # Check if all workflows are in a final state.
460
+ cont = False
461
+
462
+ for workflow in self._campaign["campaign"].workflows:
463
+ if self._workflows_state[workflow.id] is States.FAILED:
464
+ self._campaign["state"] = States.FAILED
465
+ break
466
+ elif self._workflows_state[workflow.id] not in CFINAL:
467
+ cont = True
468
+
469
+ if not cont and not self._workflows_to_monitor:
470
+ self._campaign["state"] = States.DONE
471
+
472
+ if self._campaign["state"] not in CFINAL:
473
+ self._campaign["state"] = States.DONE
474
+ self._prof.prof("bookkeper_stopping", uid=self._uid)
475
+ except Exception as ex:
476
+ self._logger.error(f"Exception occured: {ex}")
477
+ finally:
478
+ self.terminate()
479
+
480
+ def get_campaign_state(self):
481
+ return self._campaign["state"]
482
+
483
+ def get_workflows_state(self):
484
+ states = dict()
485
+ for workflow in self._campaign["campaign"].workflows:
486
+ states[workflow.id] = self._workflows_state[workflow.id]
487
+
488
+ return states
@@ -0,0 +1,2 @@
1
+ [slurmise]
2
+ base_dir = '/scratch/gpfs/SIMONSOBS/users/ip8725/act_test/slurmise_dir'
socm/core/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .models import Campaign, QosPolicy, Resource, Workflow # noqa: F401