so-campaign-manager 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. so_campaign_manager-0.0.4.dist-info/METADATA +179 -0
  2. so_campaign_manager-0.0.4.dist-info/RECORD +44 -0
  3. so_campaign_manager-0.0.4.dist-info/WHEEL +5 -0
  4. so_campaign_manager-0.0.4.dist-info/entry_points.txt +2 -0
  5. so_campaign_manager-0.0.4.dist-info/licenses/LICENSE +24 -0
  6. so_campaign_manager-0.0.4.dist-info/top_level.txt +1 -0
  7. socm/__about__.py +34 -0
  8. socm/__init__.py +0 -0
  9. socm/__main__.py +35 -0
  10. socm/bookkeeper/__init__.py +1 -0
  11. socm/bookkeeper/bookkeeper.py +488 -0
  12. socm/configs/slurmise.toml +2 -0
  13. socm/core/__init__.py +1 -0
  14. socm/core/models.py +235 -0
  15. socm/enactor/__init__.py +3 -0
  16. socm/enactor/base.py +123 -0
  17. socm/enactor/dryrun_enactor.py +216 -0
  18. socm/enactor/rp_enactor.py +273 -0
  19. socm/execs/__init__.py +3 -0
  20. socm/execs/mapmaking.py +73 -0
  21. socm/planner/__init__.py +2 -0
  22. socm/planner/base.py +87 -0
  23. socm/planner/heft_planner.py +442 -0
  24. socm/resources/__init__.py +5 -0
  25. socm/resources/perlmutter.py +22 -0
  26. socm/resources/tiger.py +24 -0
  27. socm/resources/universe.py +18 -0
  28. socm/utils/__init__.py +0 -0
  29. socm/utils/misc.py +90 -0
  30. socm/utils/states.py +17 -0
  31. socm/workflows/__init__.py +41 -0
  32. socm/workflows/ml_mapmaking.py +111 -0
  33. socm/workflows/ml_null_tests/__init__.py +10 -0
  34. socm/workflows/ml_null_tests/base.py +117 -0
  35. socm/workflows/ml_null_tests/day_night_null_test.py +132 -0
  36. socm/workflows/ml_null_tests/direction_null_test.py +133 -0
  37. socm/workflows/ml_null_tests/elevation_null_test.py +118 -0
  38. socm/workflows/ml_null_tests/moon_close_null_test.py +165 -0
  39. socm/workflows/ml_null_tests/moonrise_set_null_test.py +151 -0
  40. socm/workflows/ml_null_tests/pwv_null_test.py +118 -0
  41. socm/workflows/ml_null_tests/sun_close_null_test.py +173 -0
  42. socm/workflows/ml_null_tests/time_null_test.py +76 -0
  43. socm/workflows/ml_null_tests/wafer_null_test.py +175 -0
  44. socm/workflows/sat_simulation.py +76 -0
socm/core/models.py ADDED
@@ -0,0 +1,235 @@
1
+ from collections.abc import Iterable
2
+ from numbers import Number
3
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union, get_args, get_origin
4
+
5
+ from pydantic import BaseModel, Field, PrivateAttr
6
+
7
+ if TYPE_CHECKING:
8
+ from radical.pilot import TaskDescription
9
+
10
+
11
class QosPolicy(BaseModel):
    """A single scheduler quality-of-service (QoS) policy and its limits.

    A ``None`` limit means the policy places no restriction on that axis.
    """

    name: str
    # Maximum walltime per job (None = unlimited).
    max_walltime: Optional[int] = None  # in minutes
    # Maximum number of concurrently registered jobs (None = unlimited).
    max_jobs: Optional[int] = None
    # Maximum total cores across registered jobs (None = unlimited).
    max_cores: Optional[int] = None
16
+
17
+
18
class Resource(BaseModel):
    """An HPC resource (cluster) and the QoS policies it offers.

    Tracks the jobs registered against each QoS policy so that per-policy
    limits (walltime, running jobs, total cores) can be enforced.
    """

    name: str
    nodes: int
    cores_per_node: int
    memory_per_node: int
    qos: List[QosPolicy] = Field(default_factory=list)
    # Maps QoS name -> list of (job_id, walltime, cores) for registered jobs.
    _existing_jobs: Dict[str, List[Tuple[str, int, int]]] = PrivateAttr(default_factory=dict)

    def fits_in_qos(self, walltime: int, cores: int) -> QosPolicy | None:
        """
        Check if the given walltime and cores fit within the specified QoS policy.

        Policies are checked in the order they appear in ``self.qos``; the
        first one that accommodates the request wins.

        Args:
            walltime (int): The requested walltime in minutes.
            cores (int): The requested number of cores.

        Returns:
            QosPolicy | None: The matching QoS policy object or None if no match is found.
        """

        # What happens when the job does not fit in the best possible QoS?
        for policy in self.qos:
            existing_jobs = self._existing_jobs.get(policy.name, [])

            # Check walltime constraint (None means unlimited)
            if policy.max_walltime is not None and policy.max_walltime < walltime:
                continue

            # Check cores constraint, accounting for cores already consumed
            # by jobs registered under this policy (None means unlimited)
            if policy.max_cores is not None:
                used_cores = sum(job[2] for job in existing_jobs)
                if policy.max_cores - used_cores < cores:
                    continue

            # Check max jobs constraint (None means unlimited)
            if policy.max_jobs is not None and len(existing_jobs) >= policy.max_jobs:
                continue

            return policy
        return None

    def register_job(self, job_id: str, walltime: int, cores: int) -> bool:
        """
        Register a job with the resource if it fits within the QoS policies.

        Args:
            job_id (str): The unique identifier for the job.
            walltime (int): The requested walltime in minutes.
            cores (int): The requested number of cores.

        Returns:
            bool: True if the job was registered successfully, False otherwise.
        """
        qos_policy = self.fits_in_qos(walltime, cores)
        if qos_policy is None:
            return False
        # setdefault replaces the original get/append/re-assign sequence.
        self._existing_jobs.setdefault(qos_policy.name, []).append((job_id, walltime, cores))
        return True
79
+
80
class Workflow(BaseModel):
    """
    Base description of a workflow the campaign manager can execute.

    Subclasses implement :meth:`get_command`, :meth:`get_arguments` and
    :meth:`get_tasks`. ``extra="allow"`` lets subclasses carry arbitrary
    additional fields; the field-introspection helpers below inspect those
    through the instance ``__dict__`` as well as the declared annotations.
    """

    name: str
    executable: str
    context: str
    subcommand: str = ""
    id: Optional[int] = None
    environment: Optional[Dict[str, str]] = None
    resources: Optional[Dict[str, int | float]] = None

    model_config = {
        "extra": "allow",
    }

    def get_command(self, **kargs) -> str:
        """Return the full command line for this workflow (subclass hook)."""
        raise NotImplementedError("This method should be implemented in subclasses")

    def get_arguments(self, **kargs) -> str:
        """Return the argument string for this workflow (subclass hook)."""
        raise NotImplementedError("This method should be implemented in subclasses")

    def _fields_of_type(self, target: type, avoid_attributes: List[str]) -> List[str]:
        """
        Shared introspection behind get_numeric_fields / get_categorical_fields.

        A field is reported when (a) its Pydantic annotation is *target*, a
        Union/Optional containing it, or an iterable parameterized over it,
        or (b) its current value is an instance of *target*, or a
        non-str/bytes/dict iterable whose elements all are.

        Args:
            target: The type to look for (e.g. numbers.Number or str).
            avoid_attributes: Field names to skip.

        Returns:
            List[str]: Names of matching fields.
        """
        matched: List[str] = []

        # Pass 1: declared annotations via Pydantic v2 model_fields.
        for field_name, field_info in self.__class__.model_fields.items():
            # Skip excluded fields and fields whose current value is None.
            if field_name in avoid_attributes or getattr(self, field_name) is None:
                continue
            field_type = field_info.annotation

            # Direct match, e.g. `x: int` for Number or `x: str` for str.
            if isinstance(field_type, type) and issubclass(field_type, target):
                matched.append(field_name)
                continue

            # Parameterized generics (Optional[...], List[...], etc).
            origin = get_origin(field_type)
            if origin is None:
                continue
            args = get_args(field_type)
            if origin is Union:
                # Optional[target] or any Union arm of the target type.
                for arg in args:
                    if isinstance(arg, type) and issubclass(arg, target):
                        matched.append(field_name)
                        break
            elif issubclass(origin, Iterable):
                # Parameterized iterable such as List[int]: check element type.
                if args and isinstance(args[0], type) and issubclass(args[0], target):
                    matched.append(field_name)

        # Pass 2: actual instance values, covering extra="allow" fields not
        # described by the model annotations.
        for field_name, value in self.__dict__.items():
            if field_name in matched or field_name in avoid_attributes:
                continue
            if isinstance(value, target):
                matched.append(field_name)
            elif isinstance(value, Iterable) and not isinstance(value, (str, bytes, dict)):
                # Accept the field only if every element matches the target.
                try:
                    if all(isinstance(item, target) for item in value):
                        matched.append(field_name)
                except (TypeError, ValueError):
                    pass

        return matched

    def get_numeric_fields(self, avoid_attributes: List[str] | None = None) -> List[str]:
        """
        Returns a list of field names that are either numeric types
        or iterable collections of numeric types.

        Uses Pydantic v2 model_fields for type introspection.

        Returns:
            List[str]: Field names with numeric values
        """
        return self._fields_of_type(Number, avoid_attributes or [])

    def get_categorical_fields(self, avoid_attributes: List[str] | None = None) -> List[str]:
        """
        Returns a list of field names that are either string types
        or iterable collections of string types.

        Uses Pydantic v2 model_fields for type introspection.

        Returns:
            List[str]: Field names with categorical (string) values
        """
        return self._fields_of_type(str, avoid_attributes or [])

    def get_tasks(self) -> List["TaskDescription"]:
        """
        Returns a list of TaskDescription objects for the workflow.
        This is a placeholder method and should be implemented in subclasses.
        """
        raise NotImplementedError("This method should be implemented in subclasses")
226
+
227
+
228
class Campaign(BaseModel):
    """A campaign: a set of workflows to execute against a target resource."""

    id: int
    workflows: List[Workflow]
    # Deadline as a string; format is decided by the consumer — TODO confirm.
    deadline: str
    # Name of the resource the campaign targets (default: tiger3).
    target_resource: str = "tiger3"
    # Planning policy identifier used by the planner.
    campaign_policy: str = "time"
    # Execution mode passed to the enactor.
    execution_schema: str = "batch"
    # Number of explicitly requested resources; 0 means "let the planner decide"
    # — presumably, verify against the planner.
    requested_resources: int = 0
@@ -0,0 +1,3 @@
1
+ from .base import Enactor # noqa: F401
2
+ from .dryrun_enactor import DryrunEnactor # noqa: F401
3
+ from .rp_enactor import RPEnactor # noqa: F401
socm/enactor/base.py ADDED
@@ -0,0 +1,123 @@
1
+ import os
2
+ from typing import Dict, List
3
+
4
+ import radical.utils as ru
5
+
6
+ from socm.core import Resource
7
+ from socm.utils.states import States
8
+
9
+
10
class Enactor(object):
    """
    The Enactor is responsible to execute workflows on resources. The Enactor
    takes as input a list of tuples <workflow,resource> and executes the
    workflows on their selected resources.

    The Enactor offers a set of methods to execute and monitor workflows.

    *Parameters:*

    *workflows*: A list with the workflow IDs that are executing.

    *execution_status*: a hash table that holds the state, and
    execution status.

    *logger*: a logging object.
    """

    def __init__(self, sid=None):
        # NOTE(review): attribute name keeps the original's typo ("worflows")
        # in case subclasses elsewhere reference it.
        self._worflows = list()  # A list of workflow IDs

        # Hash table of workflows, keyed by workflow ID:
        # 'workflowID': {'state': state of the workflow based on the WMF,
        #                'endpoint': process ID or object of the WMF for the
        #                            specific workflow,
        #                'start_time': epoch when the workflow was submitted
        #                              to the WMF,
        #                'end_time': epoch when the workflow finished}
        self._execution_status = dict()

        self._uid = ru.generate_id("enactor.%(counter)04d", mode=ru.ID_CUSTOM, ns=sid)

        # Session directory (under the CWD) for logs and profiles.
        path = os.path.join(os.getcwd(), sid)
        name = self._uid

        self._logger = ru.Logger(name=self._uid, path=path, level="DEBUG")
        self._prof = ru.Profiler(name=name, path=path)

    def setup(self, resource: Resource, walltime: int, cores: int, execution_schema: str | None = None) -> None:
        """
        Sets up the enactor to execute workflows.
        """
        raise NotImplementedError("setup is not implemented")

    def enact(self, workflows, resources):
        """
        Method enact receives a set workflows and resources. It is responsible to
        start the execution of the workflow and set a endpoint to the WMF that
        executes the workflow

        *workflows:* A workflows that will execute on a resource
        *resources:* The resource that will be used.
        """
        raise NotImplementedError("enact is not implemented")

    def _monitor(self):
        """
        This method monitors the execution of workflows
        """
        raise NotImplementedError("_monitor is not implemented")

    def get_status(self, workflows: str | List[str] | None = None) -> Dict[str, States]:
        """
        Get the state of a workflow or workflows.

        *Parameter*
        *workflows:* A workflow ID or a list of workflow IDs

        *Returns*
        *status*: A dictionary with the state of each workflow.
        """

        status = dict()
        if workflows is None:
            # No filter: report every tracked workflow.
            for workflow in self._execution_status:
                status[workflow] = self._execution_status[workflow]["state"]
        elif isinstance(workflows, list):
            for workflow in workflows:
                status[workflow] = self._execution_status[workflow]["state"]
        else:
            # Single workflow ID.
            status[workflows] = self._execution_status[workflows]["state"]

        return status

    def update_status_cb(self, workflow, new_state):
        """
        Update the state of a workflow that is executing.

        Unknown workflows only trigger a warning; they are not added.
        """

        if workflow not in self._execution_status:
            # BUGFIX: the original also passed self._get_workflow_state(workflow)
            # as an extra logging argument, which raises KeyError for exactly
            # the workflows this branch handles (never enacted on), and the
            # format string has only one placeholder anyway.
            self._logger.warning(
                "Has not enacted on workflow %s yet.",
                workflow,
            )
        else:
            self._execution_status[workflow]["state"] = new_state

    def _get_workflow_state(self, workflow):
        """
        Get a workflow's current state. Raises KeyError for unknown IDs.
        """

        return self._execution_status[workflow]["state"]

    def terminate(self):
        """
        Public method to terminate the Enactor
        """
        raise NotImplementedError("terminate is not implemented")
@@ -0,0 +1,216 @@
1
+ # Imports from general packages
2
+ import os
3
+ import threading as mt
4
+ from copy import deepcopy
5
+ from datetime import datetime
6
+ from time import sleep
7
+ from typing import Dict, List
8
+
9
+ # Imports from dependent packages
10
+ import radical.utils as ru
11
+
12
+ from socm.core import Resource, Workflow
13
+ from socm.enactor.base import Enactor
14
+ from socm.utils.states import States
15
+
16
+
17
class DryrunEnactor(Enactor):
    """
    The DryrunEnactor simulates the execution of workflows on resources
    without actually running them. Workflows handed to :meth:`enact` are
    marked EXECUTING and, on the next monitor pass, immediately transitioned
    to DONE — useful for testing and validation purposes.
    """

    def __init__(self, sid: str):
        super(DryrunEnactor, self).__init__(sid=sid)
        # Workflow IDs that are executing and require monitoring. Shared with
        # the monitor thread, so mutations happen under the monitoring lock.
        self._to_monitor = list()

        # Point RADICAL at the package-local configs directory.
        # BUGFIX: the original passed one pre-concatenated string
        # (dirname + "/../configs/") to os.path.join, making the join a
        # no-op; build the path from components instead.
        os.environ["RADICAL_CONFIG_USER_DIR"] = os.path.join(
            os.path.dirname(__file__), "..", "configs"
        )
        self._prof.prof("enactor_setup", uid=self._uid)
        # Locks providing atomicity for the monitoring list and callbacks.
        self._monitoring_lock = ru.RLock("cm.monitor_lock")
        self._cb_lock = ru.RLock("enactor.cb_lock")
        self._callbacks = dict()

        self._monitoring_thread = None  # Lazily started monitor thread.
        self._terminate_monitor = mt.Event()  # Signals the monitor to stop.

        self._run = False
        self._resource = None
        self._prof.prof("enactor_started", uid=self._uid)
        self._logger.info("Enactor is ready")

    def setup(self, resource: Resource, walltime: int, cores: int, execution_schema: str | None = None) -> None:
        """
        Sets up the enactor to execute workflows.

        Only the target resource is recorded; walltime, cores and the
        execution schema are irrelevant for a dry run.
        """
        self._resource = resource

    def enact(self, workflows: List[Workflow]) -> None:
        """
        Method enact receives a set workflows and resources. It is responsible to
        start the execution of the workflow and set a endpoint to the WMF that
        executes the workflow

        *workflows:* A workflows that will execute on a resource
        *resources:* The resource that will be used.
        """

        self._prof.prof("enacting_start", uid=self._uid)
        for workflow in workflows:
            # If the enactor has already received a workflow issue a warning
            # and proceed.
            if workflow.id in self._execution_status:
                self._logger.info(
                    "Workflow %s is in state %s",
                    workflow,
                    self._get_workflow_state(workflow.id).name,
                )
                continue

            try:
                # In a dry run there is nothing to launch: mark the workflow
                # as executing and let the monitor thread "finish" it.
                with self._monitoring_lock:
                    self._to_monitor.append(workflow.id)
                    self._execution_status[workflow.id] = {
                        "state": States.EXECUTING,
                        "exec_thread": None,
                        "start_time": datetime.now(),
                        "end_time": None,
                    }

                # Notify registered callbacks of the state transition.
                for cb in self._callbacks:
                    self._callbacks[cb](
                        workflow_ids=[workflow.id],
                        new_state=States.EXECUTING,
                        step_ids=[None],
                    )
            except Exception as ex:
                self._logger.error(f"Workflow {workflow} could not be executed")
                self._logger.error(f"Exception raised {ex}", exc_info=True)

        self._prof.prof("enacting_stop", uid=self._uid)
        # If there is no monitoring thread yet, start one.
        if self._monitoring_thread is None and self._to_monitor:
            self._logger.info("Starting monitor thread")
            self._monitoring_thread = mt.Thread(
                target=self._monitor, name="monitor-thread"
            )
            self._monitoring_thread.start()
            sleep(1)

    def _monitor(self):
        """
        Monitor thread body: keeps the campaign execution data structure up
        to date. In a dry run every monitored workflow is considered finished
        as soon as it is observed.
        """

        while not self._terminate_monitor.is_set():
            if not self._to_monitor:
                # BUGFIX: the original spun in a hot busy-wait loop when
                # there was nothing to monitor; yield the CPU instead.
                sleep(0.1)
                continue

            workflows_executing = [f"workflow.{workflow_id}" for workflow_id in self._to_monitor]
            self._prof.prof("workflow_monitor_start", uid=self._uid)
            # Iterate over a snapshot so the shared list can be mutated safely.
            monitoring_list = deepcopy(self._to_monitor)
            to_remove_wfs = list()
            to_remove_sids = list()
            self._logger.debug(f"Executing workflows: {workflows_executing}, monitoring list: {monitoring_list}")
            for workflow_id in monitoring_list:
                # In a dry run every snapshot entry is in the executing list,
                # so every workflow completes immediately.
                if f"workflow.{workflow_id}" in workflows_executing:
                    with self._monitoring_lock:
                        self._logger.debug(f"workflow.{workflow_id} Done")
                        self._execution_status[workflow_id]["state"] = States.DONE
                        self._execution_status[workflow_id]["end_time"] = datetime.now()
                        self._logger.debug(
                            "Workflow %s finished: %s, step_id: %s",
                            workflow_id,
                            self._execution_status[workflow_id]["end_time"],
                            0,
                        )
                        to_remove_wfs.append(workflow_id)
                        to_remove_sids.append(0)
                        self._prof.prof("workflow_success", uid=self._uid)
            if to_remove_wfs:
                # Broadcast the DONE transition, then stop monitoring them.
                for cb in self._callbacks:
                    self._callbacks[cb](
                        workflow_ids=to_remove_wfs,
                        new_state=States.DONE,
                        step_ids=to_remove_sids,
                    )
                with self._monitoring_lock:
                    for wid in to_remove_wfs:
                        self._to_monitor.remove(wid)
            self._prof.prof("workflow_monitor_end", uid=self._uid)

    def get_status(self, workflows: str | List[str] | None = None) -> Dict[str, States]:
        """
        Get the state of a workflow or workflows.

        *Parameter*
        *workflows:* A workflow ID or a list of workflow IDs

        *Returns*
        *status*: A dictionary with the state of each workflow.
        """

        status = dict()
        if workflows is None:
            for workflow in self._execution_status:
                status[workflow] = self._execution_status[workflow]["state"]
        elif isinstance(workflows, list):
            for workflow in workflows:
                status[workflow] = self._execution_status[workflow]["state"]
        else:
            status[workflows] = self._execution_status[workflows]["state"]

        return status

    def update_status(self, workflow, new_state):
        """
        Update the state of a workflow that is executing
        """

        if workflow not in self._execution_status:
            self._logger.warning(
                "Has not enacted on workflow %s yet.",
                workflow,
            )
        else:
            self._execution_status[workflow]["state"] = new_state

    def terminate(self):
        """
        Public method to terminate the Enactor
        """
        self._logger.info("Start terminating procedure")
        self._prof.prof("str_terminating", uid=self._uid)
        if self._monitoring_thread:
            self._prof.prof("monitor_terminate", uid=self._uid)
            self._terminate_monitor.set()
            self._monitoring_thread.join()
            self._prof.prof("monitor_terminated", uid=self._uid)
            self._logger.debug("Monitor thread terminated")
        self._logger.debug("Enactor thread terminated")

    def register_state_cb(self, cb):
        """
        Registers a new state update callback function with the Enactor.
        """

        with self._cb_lock:
            cb_name = cb.__name__
            self._callbacks[cb_name] = cb