so-campaign-manager 0.0.4 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- so_campaign_manager-0.0.4.dist-info/METADATA +179 -0
- so_campaign_manager-0.0.4.dist-info/RECORD +44 -0
- so_campaign_manager-0.0.4.dist-info/WHEEL +5 -0
- so_campaign_manager-0.0.4.dist-info/entry_points.txt +2 -0
- so_campaign_manager-0.0.4.dist-info/licenses/LICENSE +24 -0
- so_campaign_manager-0.0.4.dist-info/top_level.txt +1 -0
- socm/__about__.py +34 -0
- socm/__init__.py +0 -0
- socm/__main__.py +35 -0
- socm/bookkeeper/__init__.py +1 -0
- socm/bookkeeper/bookkeeper.py +488 -0
- socm/configs/slurmise.toml +2 -0
- socm/core/__init__.py +1 -0
- socm/core/models.py +235 -0
- socm/enactor/__init__.py +3 -0
- socm/enactor/base.py +123 -0
- socm/enactor/dryrun_enactor.py +216 -0
- socm/enactor/rp_enactor.py +273 -0
- socm/execs/__init__.py +3 -0
- socm/execs/mapmaking.py +73 -0
- socm/planner/__init__.py +2 -0
- socm/planner/base.py +87 -0
- socm/planner/heft_planner.py +442 -0
- socm/resources/__init__.py +5 -0
- socm/resources/perlmutter.py +22 -0
- socm/resources/tiger.py +24 -0
- socm/resources/universe.py +18 -0
- socm/utils/__init__.py +0 -0
- socm/utils/misc.py +90 -0
- socm/utils/states.py +17 -0
- socm/workflows/__init__.py +41 -0
- socm/workflows/ml_mapmaking.py +111 -0
- socm/workflows/ml_null_tests/__init__.py +10 -0
- socm/workflows/ml_null_tests/base.py +117 -0
- socm/workflows/ml_null_tests/day_night_null_test.py +132 -0
- socm/workflows/ml_null_tests/direction_null_test.py +133 -0
- socm/workflows/ml_null_tests/elevation_null_test.py +118 -0
- socm/workflows/ml_null_tests/moon_close_null_test.py +165 -0
- socm/workflows/ml_null_tests/moonrise_set_null_test.py +151 -0
- socm/workflows/ml_null_tests/pwv_null_test.py +118 -0
- socm/workflows/ml_null_tests/sun_close_null_test.py +173 -0
- socm/workflows/ml_null_tests/time_null_test.py +76 -0
- socm/workflows/ml_null_tests/wafer_null_test.py +175 -0
- socm/workflows/sat_simulation.py +76 -0
socm/planner/heft_planner.py
ADDED

@@ -0,0 +1,442 @@
from typing import Dict, List, Tuple

import networkx as nx
import numpy as np

from ..core import Campaign, QosPolicy, Resource, Workflow
from .base import PlanEntry, Planner


class HeftPlanner(Planner):
    """Campaign planner using the Heterogeneous Earliest Finish Time (HEFT) algorithm.

    HEFT is a list scheduling algorithm that assigns tasks to processors to minimize
    the overall completion time, considering both computation and communication costs.

    Reference:
        Topcuoglu, H., Hariri, S., & Wu, M. Y. (2002). Performance-effective and
        low-complexity task scheduling for heterogeneous computing.
        IEEE Transactions on Parallel and Distributed Systems, 13(3), 260-274.

    Attributes:
        _estimated_walltime: List of estimated execution times (walltime) for each workflow
        _estimated_cpus: List of estimated CPU requirements for each workflow
        _estimated_memory: List of estimated memory requirements for each workflow
    """

    def __init__(
        self,
        campaign: Campaign | None = None,
        resources: Resource | None = None,
        resource_requirements: Dict[int, Dict[str, float]] | None = None,
        policy: str | None = None,
        sid: str | None = None,
        objective: int | None = None
    ):
        super().__init__(
            campaign=campaign,
            resources=resources,
            resource_requirements=resource_requirements,
            sid=sid,
            policy=policy,
            objective=objective
        )
        # Initialize estimation tables (populated during planning)
        self._estimated_walltime: List[float] = []
        self._estimated_cpus: List[int] = []
        self._estimated_memory: List[float] = []

    def _get_free_memory(self, start_time: float, num_nodes: float) -> float:
        """Calculate available memory at a given time.

        Args:
            start_time: Time point to check memory availability
            num_nodes: The total number of nodes used

        Returns:
            Available memory in MB
        """
        total_memory = num_nodes * self._resources.memory_per_node
        used_memory = sum(
            entry.memory
            for entry in self._plan
            if entry.start_time <= start_time < entry.end_time
        )
        return total_memory - used_memory

    def _get_max_ncores(self, resource_requirements: Dict[int, Dict[str, float]]) -> int:
        """Get the maximum number of cores required by any single workflow."""
        return max(values["req_cpus"] for values in resource_requirements.values())

    def _find_suitable_qos_policies(self, requested_cores: int) -> QosPolicy:
        """Find QoS policies that can accommodate the campaign deadline."""
        suitable_qos = self._resources.fits_in_qos(self._objective, cores=requested_cores)
        if not suitable_qos:
            available_qos = ', '.join(f"{q.name}({q.max_walltime}min)" for q in self._resources.qos)
            raise ValueError(
                f"No QoS policy can accommodate deadline of {self._objective} minutes. "
                f"Available policies: {available_qos}"
            )

        return suitable_qos

    def _binary_search_resources(
        self,
        campaign: List[Workflow],
        resource_requirements: Dict[int, Dict[str, float]],
        lower_bound: int,
        upper_bound: int
    ) -> Tuple[int, List[PlanEntry], nx.DiGraph] | None:
        """Binary search for minimum resources that satisfy the deadline.

        Returns:
            Tuple of (ncores, plan, graph) if successful, None otherwise.
        """
        best_ncores = None
        best_plan = None
        best_graph = None

        while lower_bound <= upper_bound:
            mid = (lower_bound + upper_bound) // 2

            test_plan, test_graph = self._calculate_plan(
                campaign=campaign,
                resources=range(mid),
                resource_requirements=resource_requirements
            )

            max_finish_time = max(entry.end_time for entry in test_plan)

            if max_finish_time <= self._objective:
                # This works! Try with fewer resources
                best_ncores = mid
                best_plan = test_plan
                best_graph = test_graph
                upper_bound = mid - 1
            else:
                # Need more resources
                lower_bound = mid + 1

        if best_ncores is not None:
            return best_ncores, best_plan, best_graph
        return None

    def _plan_with_qos_optimization(
        self,
        campaign: List[Workflow],
        resource_requirements: Dict[int, Dict[str, float]],
        requested_resources: int,
    ) -> Tuple[List[PlanEntry], nx.DiGraph, str, int]:
        """Find optimal QoS and resource allocation for the campaign.

        Returns:
            Tuple of (plan, graph, qos_name, ncores).
        """
        max_workflow_resources = self._get_max_ncores(resource_requirements)
        upper_bound = requested_resources
        lower_bound = max_workflow_resources

        result = self._binary_search_resources(
            campaign, resource_requirements, lower_bound, upper_bound
        )

        if result is not None:
            ncores, plan, plan_graph = result
            qos_candidate = self._find_suitable_qos_policies(requested_cores=ncores)
            self._logger.info(f"Plan to execute {plan} with {ncores} cores")
            return plan, plan_graph, qos_candidate.name, ncores
        else:
            raise ValueError(
                f"Cannot meet {self._objective} min deadline with {requested_resources} cores. "
                f"Please increase deadline or increase requested cores."
            )

    def _get_plan_graph(
        self, plan: List[PlanEntry], resources: range
    ) -> nx.DiGraph:
        """Build dependency graph from the execution plan.

        Args:
            plan: Execution plan with scheduled workflows
            resources: Available resource cores

        Returns:
            Directed acyclic graph representing workflow dependencies
        """
        self._logger.debug("Create resource dependency DAG")
        graph = nx.DiGraph()

        # Track which workflow is using each core
        core_dependencies = {i: None for i in range(len(resources))}

        for entry in plan:
            # Find all previous tasks that occupied these cores
            previous_tasks = {
                core_dependencies[core]
                for core in entry.cores
                if core_dependencies[core] is not None
            }

            # Update core ownership
            for core in entry.cores:
                core_dependencies[core] = entry.workflow.id

            # Add node and edges to graph
            if not previous_tasks:
                graph.add_node(entry.workflow.id)
            else:
                for predecessor_id in previous_tasks:
                    graph.add_edge(predecessor_id, entry.workflow.id)

        self._logger.info(f"Calculated graph {graph}")
        return graph

    def plan(
        self,
        campaign: List[Workflow] | None = None,
        resource_requirements: Dict[int, Dict[str, float]] | None = None,
        execution_schema: str | None = None,
        requested_resources: int | None = None
    ) -> Tuple[List[PlanEntry], nx.DiGraph, QosPolicy | None, int]:
        """Plan campaign execution with resource allocation.

        In batch mode, uses the requested resources directly.
        In remote mode, performs QoS selection and binary search to find the minimum
        resources that satisfy the campaign deadline.

        Parameters
        ----------
        campaign : List[Workflow] | None
            The campaign workflows to plan
        resource_requirements : Dict[int, Dict[str, float]] | None
            Resource requirements for each workflow
        execution_schema : str | None
            'batch' for fixed resources, 'remote' for optimized allocation
        requested_resources : int | None
            Number of cores

        Returns
        -------
        Tuple[plan, graph, qos, ncores]
            - plan: List of PlanEntry tuples
            - graph: DAG representation of the campaign
            - qos: QoS policy name (None for batch mode)
            - ncores: Number of cores allocated
        """
        if execution_schema == "batch":
            return self._plan_batch_mode(campaign, resource_requirements, requested_resources)
        else:
            return self._plan_with_qos_optimization(campaign, resource_requirements, requested_resources)

    def _plan_batch_mode(
        self,
        campaign: List[Workflow],
        resource_requirements: Dict[int, Dict[str, float]],
        requested_resources: int
    ) -> Tuple[List[PlanEntry], nx.DiGraph, None, int]:
        """Plan execution for batch mode with fixed resources."""
        plan, plan_graph = self._calculate_plan(
            campaign=campaign,
            resources=range(requested_resources),
            resource_requirements=resource_requirements
        )
        self._logger.info(f"Plan to execute {plan} with {requested_resources} cores")
        return plan, plan_graph, None, requested_resources

    def _initialize_resource_estimates(
        self, resource_requirements: Dict[int, Dict[str, float]]
    ) -> None:
        """Extract and store resource requirement estimates from workflows."""
        self._estimated_walltime = []
        self._estimated_cpus = []
        self._estimated_memory = []

        for resource_req in resource_requirements.values():
            self._estimated_walltime.append(resource_req["req_walltime"])
            self._estimated_cpus.append(resource_req["req_cpus"])
            self._estimated_memory.append(resource_req["req_memory"])

    def _get_sorted_workflow_indices(self) -> List[int]:
        """Get workflow indices sorted by execution time (longest first).

        Returns:
            List of workflow indices in descending order of execution time
        """
        return [
            idx for idx, _ in sorted(
                enumerate(self._estimated_walltime),
                key=lambda x: x[1],
                reverse=True
            )
        ]

    def _initialize_resource_free_times(
        self, resources: range, start_time: float | int | list | np.ndarray
    ) -> np.ndarray:
        """Initialize array tracking when each resource becomes available.

        Args:
            resources: Range of available resource cores
            start_time: Initial availability time(s)

        Returns:
            Array of availability times for each core
        """
        if isinstance(start_time, (np.ndarray, list)):
            return np.array(start_time)
        elif isinstance(start_time, (float, int)):
            return np.array([start_time] * len(resources))
        else:
            return np.array([0.0] * len(resources))

    def _find_best_resource_slot(
        self,
        workflow_idx: int,
        resources: range,
        resource_free: np.ndarray
    ) -> Tuple[int, float]:
        """Find the best resource slot for a workflow.

        Args:
            workflow_idx: Index of the workflow to schedule
            resources: Available resource cores
            resource_free: Array tracking when each core becomes available

        Returns:
            Tuple of (best_core_index, earliest_start_time)
        """
        walltime = self._estimated_walltime[workflow_idx]
        memory_required = self._estimated_memory[workflow_idx]
        cpus_required = self._estimated_cpus[workflow_idx]

        min_end_time = float("inf")
        best_core_idx = 0
        core_idx = 0

        while (core_idx + cpus_required) <= len(resources):
            core_slice = slice(core_idx, core_idx + cpus_required)
            start_time_candidate = resource_free[core_slice].max()
            end_time_candidate = start_time_candidate + walltime

            free_memory = self._get_free_memory(start_time_candidate, len(resources) / self._resources.cores_per_node)

            if free_memory >= memory_required:
                self._logger.debug(
                    f"Workflow {workflow_idx}: candidate finish time {end_time_candidate}"
                )
                if end_time_candidate < min_end_time:
                    min_end_time = end_time_candidate
                    best_core_idx = core_idx
                    self._logger.debug(
                        f"Workflow {workflow_idx}: minimum finish time {min_end_time}"
                    )
            else:
                self._logger.debug(
                    f"Insufficient memory: {free_memory} MB available, {memory_required} MB required"
                )

            core_idx += cpus_required

        return best_core_idx, min_end_time - walltime

    def _calculate_plan(
        self,
        campaign: List[Workflow] | None = None,
        resources: range | None = None,
        resource_requirements: Dict[int, Dict[str, float]] | None = None,
        start_time: float = 0.0,
    ) -> Tuple[List[PlanEntry], nx.DiGraph]:
        """Implement the core HEFT scheduling algorithm.

        Args:
            campaign: List of workflows to schedule
            resources: Available resource cores
            resource_requirements: Resource needs for each workflow
            start_time: Initial time or per-core availability times

        Returns:
            Tuple of (execution_plan, dependency_graph)
        """
        # Use provided parameters or fall back to instance attributes
        workflows = campaign if campaign else self._campaign
        cores = (
            resources
            if resources
            else range(self._resources.nodes * self._resources.cores_per_node)
        )
        requirements = resource_requirements if resource_requirements else self._resource_requirements

        # Initialize estimation tables
        self._initialize_resource_estimates(requirements)

        # Reset plan for fresh scheduling
        self._plan: List[PlanEntry] = []

        # Sort workflows by execution time (longest first)
        sorted_indices = self._get_sorted_workflow_indices()

        # Track when each core becomes available
        resource_free = self._initialize_resource_free_times(cores, start_time)

        # Schedule each workflow
        for workflow_idx in sorted_indices:
            best_core_idx, start_time_actual = self._find_best_resource_slot(
                workflow_idx, cores, resource_free
            )

            cpus_required = self._estimated_cpus[workflow_idx]
            walltime = self._estimated_walltime[workflow_idx]
            memory_required = self._estimated_memory[workflow_idx]
            core_slice = slice(best_core_idx, best_core_idx + cpus_required)

            # Create plan entry
            plan_entry = PlanEntry(
                workflow=workflows[workflow_idx],
                cores=cores[core_slice],
                memory=memory_required,
                start_time=start_time_actual,
                end_time=start_time_actual + walltime
            )
            self._plan.append(plan_entry)

            # Update resource availability
            resource_free[core_slice] = start_time_actual + walltime

        # Build dependency graph
        plan_graph = self._get_plan_graph(self._plan, cores)

        # Sort plan by workflow ID for consistent ordering
        self._plan = sorted(self._plan, key=lambda entry: entry.workflow.id)
        self._logger.debug("Potential plan %s", self._plan)

        return self._plan, plan_graph

    def replan(
        self,
        campaign: List[Workflow] | None = None,
        resources: range | None = None,
        resource_requirements: Dict[int, Dict[str, float]] | None = None,
        start_time: float = 0.0,
    ) -> Tuple[List[PlanEntry], nx.DiGraph]:
        """Recalculate the execution plan with updated parameters.

        Args:
            campaign: Updated list of workflows
            resources: Updated resource allocation
            resource_requirements: Updated resource requirements
            start_time: New start time or per-core availability

        Returns:
            Tuple of (execution_plan, dependency_graph)
        """
        if campaign and resources and resource_requirements:
            self._logger.debug("Replanning with updated parameters")
            return self._calculate_plan(
                campaign=campaign,
                resources=resources,
                resource_requirements=resource_requirements,
                start_time=start_time,
            )
        else:
            self._logger.debug("Nothing to replan - missing required parameters")
            return self._plan, self._get_plan_graph(self._plan, resources or range(0))

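The scheduling loop above works longest-walltime-first, scans contiguous core slots in steps of each workflow's CPU count, and keeps the slot with the earliest finish time. The following is a minimal, self-contained sketch of that idea on made-up numbers; it is not part of the package and omits the memory check and the dependency graph.

import numpy as np

# Toy requirements, analogous to the req_walltime/req_cpus entries the planner reads.
requirements = {
    0: {"req_walltime": 30.0, "req_cpus": 4},
    1: {"req_walltime": 50.0, "req_cpus": 2},
    2: {"req_walltime": 10.0, "req_cpus": 4},
}
ncores = 8
resource_free = np.zeros(ncores)  # time at which each core becomes free

# Longest workflow first, as in _get_sorted_workflow_indices.
order = sorted(requirements, key=lambda i: requirements[i]["req_walltime"], reverse=True)

plan = []
for wid in order:
    cpus = requirements[wid]["req_cpus"]
    walltime = requirements[wid]["req_walltime"]
    best_end, best_idx = float("inf"), 0
    # Scan contiguous slots in steps of the CPU count, as in _find_best_resource_slot.
    for idx in range(0, ncores - cpus + 1, cpus):
        end = resource_free[idx:idx + cpus].max() + walltime
        if end < best_end:
            best_end, best_idx = end, idx
    start = best_end - walltime
    plan.append((wid, best_idx, best_idx + cpus, start, best_end))
    resource_free[best_idx:best_idx + cpus] = best_end  # cores stay busy until the job ends

for wid, c0, c1, start, end in plan:
    print(f"workflow {wid}: cores {c0}-{c1 - 1}, start {start}, end {end}")
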
socm/resources/__init__.py
ADDED

@@ -0,0 +1,5 @@
from .perlmutter import PerlmutterResource  # noqa: F401
from .tiger import TigerResource  # noqa: F401
from .universe import UniverseResource  # noqa: F401

registered_resources = {"perlmutter": PerlmutterResource, "tiger3": TigerResource, "universe": UniverseResource}

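A brief note on the registry above: the keys are the resource names used in campaign configurations and the values are the Resource subclasses defined below. A minimal lookup sketch, assuming the package is installed (constructor arguments are not shown in this diff, so instantiation is omitted):

from socm.resources import registered_resources

resource_cls = registered_resources["tiger3"]
print(resource_cls.__name__)  # TigerResource
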
socm/resources/perlmutter.py
ADDED

@@ -0,0 +1,22 @@
from ..core import QosPolicy, Resource


class PerlmutterResource(Resource):
    """
    PerlmutterResource is a specialized Resource class for the Perlmutter HPC system.
    It includes additional attributes specific to the Perlmutter system.
    """

    name: str = "perlmutter"
    nodes: int = 3072
    cores_per_node: int = 128
    memory_per_node: int = 1000000  # in MB
    default_qos: str = "regular"

    def __init__(self, **data):
        super().__init__(**data)
        self.qos = [QosPolicy(name="regular", max_walltime=2880, max_jobs=5000, max_cores=393216),
                    QosPolicy(name="interactive", max_walltime=240, max_jobs=2, max_cores=512),
                    QosPolicy(name="shared_interactive", max_walltime=240, max_jobs=2, max_cores=64),
                    QosPolicy(name="debug", max_walltime=30, max_jobs=5, max_cores=1024)
                    ]

socm/resources/tiger.py
ADDED

@@ -0,0 +1,24 @@
from ..core import QosPolicy, Resource


class TigerResource(Resource):
    """
    TigerResource is a specialized Resource class for the Tiger HPC system.
    It includes additional attributes specific to the Tiger system.
    """

    name: str = "tiger3"
    nodes: int = 492
    cores_per_node: int = 112
    memory_per_node: int = 1000000  # in MB
    default_qos: str = "test"

    def __init__(self, **data):
        super().__init__(**data)
        self.qos = [QosPolicy(name="test", max_walltime=60, max_jobs=1, max_cores=8000),
                    QosPolicy(name="vshort", max_walltime=300, max_jobs=2000, max_cores=55104),
                    QosPolicy(name="short", max_walltime=1440, max_jobs=50, max_cores=8000),
                    QosPolicy(name="medium", max_walltime=4320, max_jobs=80, max_cores=4000),
                    QosPolicy(name="long", max_walltime=8640, max_jobs=16, max_cores=1000),
                    QosPolicy(name="vlong", max_walltime=21600, max_jobs=8, max_cores=900)
                    ]

socm/resources/universe.py
ADDED

@@ -0,0 +1,18 @@
from ..core import QosPolicy, Resource


class UniverseResource(Resource):
    """
    UniverseResource is a specialized Resource class for the Universe HPC system.
    It includes additional attributes specific to the Universe system.
    """

    name: str = "universe"
    nodes: int = 28
    cores_per_node: int = 224
    memory_per_node: int = 1000000  # in MB
    default_qos: str = "main"

    def __init__(self, **data):
        super().__init__(**data)
        self.qos = [QosPolicy(name="main", max_walltime=43200, max_jobs=5000, max_cores=6272)]

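The planner's _find_suitable_qos_policies relies on Resource.fits_in_qos, which is defined in socm/core/models.py and not included in this diff. The sketch below is only an assumption about the kind of filtering involved (deadline within max_walltime, cores within max_cores), not the actual implementation:

from dataclasses import dataclass

@dataclass
class Qos:
    name: str
    max_walltime: int  # minutes
    max_cores: int

policies = [
    Qos("debug", 30, 1024),
    Qos("interactive", 240, 512),
    Qos("regular", 2880, 393216),
]

def first_fitting(policies, deadline_min, cores):
    # Return the tightest policy that can hold the deadline and core count.
    for q in sorted(policies, key=lambda q: q.max_walltime):
        if deadline_min <= q.max_walltime and cores <= q.max_cores:
            return q
    return None

print(first_fitting(policies, deadline_min=200, cores=256).name)  # interactive
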
socm/utils/__init__.py
ADDED

(empty file)

socm/utils/misc.py
ADDED

@@ -0,0 +1,90 @@
import ast
from typing import Dict, List


def parse_comma_separated_fields(config: dict, fields_to_parse: List[str]) -> dict:
    """Convert comma-separated string values to lists."""
    for key, value in config.items():
        if isinstance(value, dict):
            parse_comma_separated_fields(value, fields_to_parse)
        elif key in fields_to_parse and isinstance(value, str) and ',' in value:
            config[key] = [ast.literal_eval(item.strip()) for item in value.split(',')]
    return config


def get_workflow_entries(campaign_dict: dict, subcampaign_map: Dict[str, list] | None = None) -> Dict[str, dict]:
    """
    Extract workflow entries from a campaign dictionary using a predefined mapping.

    Args:
        campaign_dict: A dictionary containing campaign configuration
        subcampaign_map: A dictionary mapping subcampaign names to lists of their workflow names
            E.g., {"ml-null-test": ["mission-tests", "wafer-tests"]}

    Returns:
        Dictionary containing workflow entries
    """
    campaign_data = campaign_dict.get("campaign", {})

    # Default empty map if none provided
    if subcampaign_map is None:
        subcampaign_map = {}

    # Collect all workflows (direct and from subcampaigns)
    workflows = {}

    for workflow_key, workflow_value in campaign_data.items():
        # Skip non-dictionary values
        if not isinstance(workflow_value, dict):
            continue

        # Check if this is a known subcampaign
        if workflow_key in subcampaign_map:
            # Process known workflows for this subcampaign
            subcampaign_name = workflow_key
            subcampaign_workflows = subcampaign_map[workflow_key]

            # Create a copy of the subcampaign config without its workflows
            subcampaign_common_config = {
                k: v for k, v in workflow_value.items() if k not in subcampaign_workflows
            }

            for workflow_name in subcampaign_workflows:
                if workflow_name in workflow_value:
                    # Start with the workflow's own config
                    workflow_config = workflow_value[workflow_name].copy()

                    # Update with common subcampaign config
                    workflow_config.update(subcampaign_common_config)

                    if isinstance(workflow_config, dict):
                        # Create combined key: subcampaign.workflow_name
                        workflows[f"{subcampaign_name}.{workflow_name}"] = (
                            workflow_config
                        )
        else:
            # Treat as regular workflow
            workflows[workflow_key] = workflow_value

    return workflows


def get_query_from_file(file_path: str) -> str:
    """
    Extract a SQL query from a file.

    Args:
        file_path: The path to the file containing the SQL query.

    Returns:
        The SQL query as a string.
    """
    query = "obs_id IN ("
    with open(file_path, "r") as file:
        obslist = file.readlines()
    for obs_id in obslist:
        obs_id = obs_id.strip()
        query += f"'{obs_id}',"
    query = query.rstrip(",")
    query += ")"

    return query

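For reference, this is what get_query_from_file produces for a file that lists one observation ID per line (the file name and IDs below are made up for illustration):

from socm.utils.misc import get_query_from_file

# Write a throwaway observation list, one ID per line.
with open("obs_ids.txt", "w") as f:
    f.write("obs_1700000000\nobs_1700000600\n")

print(get_query_from_file("obs_ids.txt"))
# obs_id IN ('obs_1700000000','obs_1700000600')
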
socm/utils/states.py
ADDED

@@ -0,0 +1,17 @@
# ------------------------------------------------------------------------------
# States
from enum import Enum, auto


class States(Enum):
    """Workflow and campaign execution states."""

    NEW = auto()        # New campaign is submitted
    PLANNING = auto()   # Planning the execution of the campaign
    EXECUTING = auto()  # At least one workflow is executing
    DONE = auto()       # Campaign has finished successfully
    FAILED = auto()     # Campaign execution has failed
    CANCELED = auto()   # Campaign got canceled by the user.

# Final states for a campaign
CFINAL = [States.DONE, States.FAILED, States.CANCELED]

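A minimal usage sketch of these constants (not taken from the package's own code):

from socm.utils.states import CFINAL, States

state = States.DONE
if state in CFINAL:
    print(f"Campaign reached a final state: {state.name}")
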
socm/workflows/__init__.py
ADDED

@@ -0,0 +1,41 @@
from socm.workflows.ml_mapmaking import MLMapmakingWorkflow  # noqa: F401
from socm.workflows.ml_null_tests import (  # noqa: F401
    DayNightNullTestWorkflow,
    DirectionNullTestWorkflow,
    ElevationNullTestWorkflow,
    MoonCloseFarNullTestWorkflow,
    MoonRiseSetNullTestWorkflow,
    PWVNullTestWorkflow,
    SunCloseFarNullTestWorkflow,
    TimeNullTestWorkflow,
    WaferNullTestWorkflow,
)
from socm.workflows.sat_simulation import SATSimWorkflow  # noqa: F401

registered_workflows = {
    "ml-mapmaking": MLMapmakingWorkflow,
    "sat-sims": SATSimWorkflow,
    "ml-null-tests.mission-tests": TimeNullTestWorkflow,
    "ml-null-tests.wafer-tests": WaferNullTestWorkflow,
    "ml-null-tests.direction-tests": DirectionNullTestWorkflow,
    "ml-null-tests.pwv-tests": PWVNullTestWorkflow,
    "ml-null-tests.day-night-tests": DayNightNullTestWorkflow,
    "ml-null-tests.moonrise-set-tests": MoonRiseSetNullTestWorkflow,
    "ml-null-tests.elevation-tests": ElevationNullTestWorkflow,
    "ml-null-tests.sun-close-tests": SunCloseFarNullTestWorkflow,
    "ml-null-tests.moon-close-tests": MoonCloseFarNullTestWorkflow,
}

subcampaign_map = {
    "ml-null-tests": [
        "mission-tests",
        "wafer-tests",
        "direction-tests",
        "pwv-tests",
        "day-night-tests",
        "moonrise-set-tests",
        "elevation-tests",
        "sun-close-tests",
        "moon-close-tests",
    ]
}

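The registry keys follow the subcampaign.workflow naming convention produced by get_workflow_entries in socm/utils/misc.py. A small sketch that resolves every ml-null-tests workflow class through the two mappings above (assuming the package and its workflow modules import cleanly):

from socm.workflows import registered_workflows, subcampaign_map

for wf_name in subcampaign_map["ml-null-tests"]:
    key = f"ml-null-tests.{wf_name}"
    print(key, "->", registered_workflows[key].__name__)
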