pyedb 0.60.0__py3-none-any.whl → 0.61.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyedb/__init__.py +1 -1
- pyedb/configuration/cfg_components.py +35 -7
- pyedb/dotnet/database/cell/hierarchy/component.py +8 -6
- pyedb/dotnet/database/cell/hierarchy/model.py +1 -28
- pyedb/dotnet/database/cell/hierarchy/s_parameter_model.py +10 -14
- pyedb/dotnet/database/cell/hierarchy/spice_model.py +13 -7
- pyedb/dotnet/database/components.py +5 -1
- pyedb/dotnet/database/edb_data/padstacks_data.py +5 -3
- pyedb/dotnet/database/modeler.py +2 -1
- pyedb/dotnet/database/padstack.py +187 -1
- pyedb/dotnet/edb.py +70 -1
- pyedb/generic/general_methods.py +21 -0
- pyedb/grpc/database/definition/materials.py +1 -1
- pyedb/grpc/database/definition/padstack_def.py +16 -9
- pyedb/grpc/database/padstacks.py +201 -6
- pyedb/grpc/database/primitive/padstack_instance.py +90 -0
- pyedb/grpc/edb.py +70 -1
- pyedb/grpc/rpc_session.py +16 -3
- pyedb/workflows/__init__.py +21 -0
- pyedb/workflows/job_manager/__init__.py +21 -0
- pyedb/workflows/job_manager/backend/__init__.py +21 -0
- pyedb/workflows/job_manager/backend/job_manager_handler.py +910 -0
- pyedb/workflows/job_manager/backend/job_submission.py +1169 -0
- pyedb/workflows/job_manager/backend/service.py +1663 -0
- pyedb/workflows/job_manager/backend/start_service.py +86 -0
- pyedb/workflows/job_manager/backend/submit_job_on_scheduler.py +168 -0
- pyedb/workflows/job_manager/backend/submit_local_job.py +166 -0
- pyedb/workflows/utilities/__init__.py +21 -0
- pyedb/workflows/utilities/cutout.py +1 -1
- pyedb/workflows/utilities/hfss_log_parser.py +446 -0
- {pyedb-0.60.0.dist-info → pyedb-0.61.0.dist-info}/METADATA +7 -4
- {pyedb-0.60.0.dist-info → pyedb-0.61.0.dist-info}/RECORD +34 -24
- {pyedb-0.60.0.dist-info → pyedb-0.61.0.dist-info}/WHEEL +0 -0
- {pyedb-0.60.0.dist-info → pyedb-0.61.0.dist-info}/licenses/LICENSE +0 -0
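
The headline addition is the new `pyedb.workflows.job_manager` package, whose backend service is shown in full in the hunk below (`pyedb/workflows/job_manager/backend/service.py`). As a quick orientation, here is a minimal embedding sketch adapted from that module's own docstring; the `config` argument and the `run_pool`/`my_config` names are placeholders for a validated `HFSSSimulationConfig` built elsewhere, and the import path assumes the `backend` package layout from the file list above:

    import asyncio

    from pyedb.workflows.job_manager.backend.service import JobManager, ResourceLimits


    async def run_pool(config):
        # Cap the local pool at 4 concurrent solves; other limits keep their defaults.
        manager = JobManager(ResourceLimits(max_concurrent_jobs=4))
        # Queue the job with elevated priority, then wait for the pool to drain.
        await manager.submit_job(config, priority=10)
        await manager.wait_until_all_done()


    # asyncio.run(run_pool(my_config))  # my_config: a validated HFSSSimulationConfig
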
@@ -0,0 +1,1663 @@
|
|
|
1
|
+
# Copyright (C) 2023 - 2025 ANSYS, Inc. and/or its affiliates.
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
#
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
# in the Software without restriction, including without limitation the rights
|
|
8
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
# furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
# copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
# SOFTWARE.
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
"""
|
|
25
|
+
Async job manager with pool scheduling and REST/WebSocket API.
|
|
26
|
+
|
|
27
|
+
The module implements a **fully asynchronous** multi-tenant job manager that:
|
|
28
|
+
|
|
29
|
+
* enforces **host resource limits** (CPU, memory, disk, concurrency),
|
|
30
|
+
* maintains **priority queues** (negative → low, zero → normal, positive → high),
|
|
31
|
+
* exposes **REST** and **Socket.IO** endpoints for integration,
|
|
32
|
+
* supports **external schedulers** (SLURM, LSF, PBS, Windows-HPC) **and**
|
|
33
|
+
local subprocess execution,
|
|
34
|
+
* guarantees **exactly-once** execution and **graceful draining** on shutdown.
|
|
35
|
+
|
|
36
|
+
It is designed to be **embedded** inside a PyEDB process or **deployed** as a
|
|
37
|
+
stand-alone micro-service (Docker, systemd, Kubernetes).
|
|
38
|
+
|
|
39
|
+
Examples
|
|
40
|
+
--------
|
|
41
|
+
Stand-alone REST server:
|
|
42
|
+
|
|
43
|
+
.. code-block:: bash
|
|
44
|
+
|
|
45
|
+
python -m pyedb.workflows.job_manager.service
|
|
46
|
+
|
|
47
|
+
Embedded inside PyEDB:
|
|
48
|
+
|
|
49
|
+
.. code-block:: python
|
|
50
|
+
|
|
51
|
+
from pyedb.workflows.job_manager.service import JobManager, ResourceLimits
|
|
52
|
+
|
|
53
|
+
manager = JobManager(ResourceLimits(max_concurrent_jobs=4))
|
|
54
|
+
await manager.submit_job(config, priority=10)
|
|
55
|
+
|
|
56
|
+
The REST API is **self-documenting** at runtime:
|
|
57
|
+
|
|
58
|
+
.. code-block:: bash
|
|
59
|
+
|
|
60
|
+
curl http://localhost:8080/resources
|
|
61
|
+
curl http://localhost:8080/queue
|
|
62
|
+
curl -X POST http://localhost:8080/jobs/submit -d @cfg.json
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
import asyncio
|
|
66
|
+
from collections import deque
|
|
67
|
+
from dataclasses import dataclass
|
|
68
|
+
from datetime import datetime
|
|
69
|
+
from enum import Enum
|
|
70
|
+
import getpass
|
|
71
|
+
import logging
|
|
72
|
+
import os.path
|
|
73
|
+
import platform
|
|
74
|
+
from typing import Any, Deque, Dict, List, Optional, Set
|
|
75
|
+
|
|
76
|
+
import aiohttp
|
|
77
|
+
from aiohttp import web
|
|
78
|
+
import psutil
|
|
79
|
+
import socketio
|
|
80
|
+
|
|
81
|
+
from pyedb.workflows.job_manager.backend.job_submission import (
|
|
82
|
+
HFSSSimulationConfig,
|
|
83
|
+
SchedulerType,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Configure logging
|
|
87
|
+
logging.basicConfig(level=logging.INFO)
|
|
88
|
+
logger = logging.getLogger("JobManager")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class JobStatus(Enum):
|
|
92
|
+
"""
|
|
93
|
+
Terminal and non-terminal job states used internally and exposed via REST.
|
|
94
|
+
|
|
95
|
+
Members
|
|
96
|
+
-------
|
|
97
|
+
PENDING
|
|
98
|
+
Initial state before queuing.
|
|
99
|
+
QUEUED
|
|
100
|
+
Awaiting resources in local queue.
|
|
101
|
+
SCHEDULED
|
|
102
|
+
Submitted to external scheduler.
|
|
103
|
+
RUNNING
|
|
104
|
+
Currently executing.
|
|
105
|
+
COMPLETED
|
|
106
|
+
Normal termination.
|
|
107
|
+
FAILED
|
|
108
|
+
Non-zero exit code or exception.
|
|
109
|
+
CANCELLED
|
|
110
|
+
User-initiated abort.
|
|
111
|
+
"""
|
|
112
|
+
|
|
113
|
+
PENDING = "pending"
|
|
114
|
+
RUNNING = "running"
|
|
115
|
+
COMPLETED = "completed"
|
|
116
|
+
FAILED = "failed"
|
|
117
|
+
CANCELLED = "cancelled"
|
|
118
|
+
SCHEDULED = "scheduled"
|
|
119
|
+
QUEUED = "queued" # Specifically for local queue
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@dataclass
|
|
123
|
+
class ResourceLimits:
|
|
124
|
+
"""
|
|
125
|
+
Host-level resource constraints enforced by the manager.
|
|
126
|
+
|
|
127
|
+
All attributes are **checked** before starting a new job; if any limit is
|
|
128
|
+
exceeded the job remains in the queue.
|
|
129
|
+
|
|
130
|
+
Parameters
|
|
131
|
+
----------
|
|
132
|
+
max_concurrent_jobs : int, optional
|
|
133
|
+
Simultaneous local jobs. Default is ``1``.
|
|
134
|
+
max_cpu_percent : float, optional
|
|
135
|
+
CPU utilisation threshold (0-100). Default is ``80.0``.
|
|
136
|
+
min_memory_gb : float, optional
|
|
137
|
+
Free RAM required to start (GB). Default is ``2.0``.
|
|
138
|
+
min_disk_gb : float, optional
|
|
139
|
+
Free disk space required to start (GB). Default is ``10.0``.
|
|
140
|
+
"""
|
|
141
|
+
|
|
142
|
+
max_concurrent_jobs: int = 1
|
|
143
|
+
max_cpu_percent: float = 80.0 # Don't start new jobs if CPU > 80%
|
|
144
|
+
min_memory_gb: float = 2.0 # Minimum free memory required to start a job
|
|
145
|
+
min_disk_gb: float = 10.0 # Minimum free disk space required
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@dataclass
|
|
149
|
+
class JobInfo:
|
|
150
|
+
"""
|
|
151
|
+
**Mutable** state container for a single simulation.
|
|
152
|
+
|
|
153
|
+
Attributes
|
|
154
|
+
----------
|
|
155
|
+
config : HFSSSimulationConfig
|
|
156
|
+
Immutable configuration.
|
|
157
|
+
status : JobStatus
|
|
158
|
+
Current life-cycle state.
|
|
159
|
+
start_time : datetime or None
|
|
160
|
+
When the job entered ``RUNNING``.
|
|
161
|
+
end_time : datetime or None
|
|
162
|
+
When the job reached a terminal state.
|
|
163
|
+
return_code : int or None
|
|
164
|
+
Exit code of the solver or scheduler.
|
|
165
|
+
output : str
|
|
166
|
+
Stdout captured (local runs only).
|
|
167
|
+
error : str
|
|
168
|
+
Stderr captured (local runs only).
|
|
169
|
+
process : asyncio.subprocess.Process or None
|
|
170
|
+
Handle for local cancellation.
|
|
171
|
+
scheduler_job_id : str or None
|
|
172
|
+
External identifier (SLURM, LSF, …).
|
|
173
|
+
local_resources : dict or None
|
|
174
|
+
Snapshot of host telemetry at start time.
|
|
175
|
+
priority : int
|
|
176
|
+
Higher numbers are de-queued first.
|
|
177
|
+
"""
|
|
178
|
+
|
|
179
|
+
config: HFSSSimulationConfig
|
|
180
|
+
status: JobStatus
|
|
181
|
+
start_time: Optional[datetime] = None
|
|
182
|
+
end_time: Optional[datetime] = None
|
|
183
|
+
return_code: Optional[int] = None
|
|
184
|
+
output: str = ""
|
|
185
|
+
error: str = ""
|
|
186
|
+
process: Optional[Any] = None
|
|
187
|
+
scheduler_job_id: Optional[str] = None
|
|
188
|
+
local_resources: Optional[Dict[str, Any]] = None
|
|
189
|
+
priority: int = 0 # Higher number = higher priority
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class ResourceMonitor:
|
|
193
|
+
"""
|
|
194
|
+
**Async** background task that samples host telemetry every *N* seconds.
|
|
195
|
+
|
|
196
|
+
The monitor keeps a **thread-safe** in-memory cache used by
|
|
197
|
+
:meth:`JobPoolManager.can_start_job` to throttle submissions.
|
|
198
|
+
|
|
199
|
+
Parameters
|
|
200
|
+
----------
|
|
201
|
+
update_interval : int, optional
|
|
202
|
+
Sampling period in seconds. Default is ``5``.
|
|
203
|
+
|
|
204
|
+
Attributes
|
|
205
|
+
----------
|
|
206
|
+
current_usage : dict
|
|
207
|
+
Cached resource usage information with keys:
|
|
208
|
+
- cpu_percent: Current CPU usage percentage
|
|
209
|
+
- memory_percent: Current memory usage percentage
|
|
210
|
+
- memory_used_gb: Memory used in GB
|
|
211
|
+
- memory_total_gb: Total memory in GB
|
|
212
|
+
- memory_free_gb: Free memory in GB
|
|
213
|
+
- disk_usage_percent: Disk usage percentage
|
|
214
|
+
- disk_free_gb: Free disk space in GB
|
|
215
|
+
- timestamp: Last update timestamp
|
|
216
|
+
"""
|
|
217
|
+
|
|
218
|
+
def __init__(self, update_interval: int = 5):
|
|
219
|
+
self.update_interval = update_interval
|
|
220
|
+
self.current_usage = {
|
|
221
|
+
"cpu_percent": 0,
|
|
222
|
+
"memory_percent": 0,
|
|
223
|
+
"memory_used_gb": 0,
|
|
224
|
+
"memory_total_gb": 0,
|
|
225
|
+
"memory_free_gb": 0,
|
|
226
|
+
"disk_usage_percent": 0,
|
|
227
|
+
"disk_free_gb": 0,
|
|
228
|
+
"timestamp": datetime.now().isoformat(),
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
async def monitor_resources(self):
|
|
232
|
+
"""
|
|
233
|
+
**Infinite** coroutine that updates :attr:`current_usage`.
|
|
234
|
+
|
|
235
|
+
Runs until the event-loop is shut down. Samples CPU, memory, and disk
|
|
236
|
+
usage at regular intervals.
|
|
237
|
+
"""
|
|
238
|
+
await self._sample_once()
|
|
239
|
+
while True:
|
|
240
|
+
await self._sample_once()
|
|
241
|
+
await asyncio.sleep(self.update_interval)
|
|
242
|
+
|
|
243
|
+
async def _sample_once(self):
|
|
244
|
+
try:
|
|
245
|
+
# CPU usage
|
|
246
|
+
cpu_percent = psutil.cpu_percent(interval=1)
|
|
247
|
+
|
|
248
|
+
# Memory usage
|
|
249
|
+
memory = psutil.virtual_memory()
|
|
250
|
+
memory_total_gb = memory.total / (1024**3)
|
|
251
|
+
memory_used_gb = memory.used / (1024**3)
|
|
252
|
+
memory_free_gb = memory.available / (1024**3)
|
|
253
|
+
|
|
254
|
+
# Disk usage (checking the root directory)
|
|
255
|
+
disk = psutil.disk_usage(os.path.abspath(os.sep))
|
|
256
|
+
disk_usage_percent = disk.percent
|
|
257
|
+
disk_free_gb = disk.free / (1024**3)
|
|
258
|
+
|
|
259
|
+
self.current_usage.update(
|
|
260
|
+
{
|
|
261
|
+
"cpu_percent": cpu_percent,
|
|
262
|
+
"memory_percent": memory.percent,
|
|
263
|
+
"memory_used_gb": round(memory_used_gb, 2),
|
|
264
|
+
"memory_total_gb": round(memory_total_gb, 2),
|
|
265
|
+
"memory_free_gb": round(memory_free_gb, 2),
|
|
266
|
+
"disk_usage_percent": disk_usage_percent,
|
|
267
|
+
"disk_free_gb": round(disk_free_gb, 2),
|
|
268
|
+
"timestamp": datetime.now().isoformat(),
|
|
269
|
+
}
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
except Exception as e:
|
|
273
|
+
logger.error(f"Resource monitoring error: {e}")
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
class JobPoolManager:
|
|
277
|
+
"""
|
|
278
|
+
**Priority-aware** FIFO queues plus running-set tracker.
|
|
279
|
+
|
|
280
|
+
The implementation is **lock-free** (uses ``deque`` and ``dict``) and
|
|
281
|
+
**async-safe** (no awaits, therefore can be invoked from any thread).
|
|
282
|
+
|
|
283
|
+
Parameters
|
|
284
|
+
----------
|
|
285
|
+
resource_limits : ResourceLimits
|
|
286
|
+
Constraints used by :meth:`can_start_job`.
|
|
287
|
+
|
|
288
|
+
Attributes
|
|
289
|
+
----------
|
|
290
|
+
job_queue : Deque[str]
|
|
291
|
+
FIFO queue for normal priority jobs
|
|
292
|
+
priority_queue : Dict[int, List[str]]
|
|
293
|
+
Priority-based queues (key=priority, value=job_ids)
|
|
294
|
+
running_jobs : Set[str]
|
|
295
|
+
Set of currently running job IDs
|
|
296
|
+
job_priorities : Dict[str, int]
|
|
297
|
+
Mapping of job_id to priority
|
|
298
|
+
"""
|
|
299
|
+
|
|
300
|
+
def __init__(self, resource_limits: ResourceLimits):
|
|
301
|
+
self.resource_limits = resource_limits
|
|
302
|
+
self.job_queue: Deque[str] = deque() # FIFO queue for job IDs
|
|
303
|
+
self.priority_queue: Dict[int, List[str]] = {} # Priority-based queue
|
|
304
|
+
self.running_jobs: Set[str] = set()
|
|
305
|
+
self.job_priorities: Dict[str, int] = {}
|
|
306
|
+
|
|
307
|
+
def add_job(self, job_id: str, priority: int = 0):
|
|
308
|
+
"""
|
|
309
|
+
Insert job into the **appropriate** queue (priority or FIFO).
|
|
310
|
+
|
|
311
|
+
Parameters
|
|
312
|
+
----------
|
|
313
|
+
job_id : str
|
|
314
|
+
Unique identifier.
|
|
315
|
+
priority : int, optional
|
|
316
|
+
Negative (low), zero (normal), positive (high). Default is ``0``.
|
|
317
|
+
"""
|
|
318
|
+
self.job_priorities[job_id] = priority
|
|
319
|
+
|
|
320
|
+
if priority > 0:
|
|
321
|
+
if priority not in self.priority_queue:
|
|
322
|
+
self.priority_queue[priority] = []
|
|
323
|
+
self.priority_queue[priority].append(job_id)
|
|
324
|
+
# Sort priority queues to maintain order
|
|
325
|
+
for pq in self.priority_queue.values():
|
|
326
|
+
pq.sort(key=lambda x: self.job_priorities.get(x, 0), reverse=True)
|
|
327
|
+
else:
|
|
328
|
+
self.job_queue.append(job_id)
|
|
329
|
+
|
|
330
|
+
def get_next_job(self) -> Optional[str]:
|
|
331
|
+
"""
|
|
332
|
+
Return the **next** job to be started (highest priority first).
|
|
333
|
+
|
|
334
|
+
Returns
|
|
335
|
+
-------
|
|
336
|
+
str or None
|
|
337
|
+
Job identifier or ``None`` if all queues are empty.
|
|
338
|
+
|
|
339
|
+
Notes
|
|
340
|
+
-----
|
|
341
|
+
Priority queues are checked first (highest to lowest), then the
|
|
342
|
+
normal FIFO queue.
|
|
343
|
+
"""
|
|
344
|
+
# First check priority queues (highest priority first)
|
|
345
|
+
for priority in sorted(self.priority_queue.keys(), reverse=True):
|
|
346
|
+
if self.priority_queue[priority]:
|
|
347
|
+
return self.priority_queue[priority].pop(0)
|
|
348
|
+
|
|
349
|
+
# Then check regular queue
|
|
350
|
+
if self.job_queue:
|
|
351
|
+
return self.job_queue.popleft()
|
|
352
|
+
|
|
353
|
+
return None
|
|
354
|
+
|
|
355
|
+
def remove_job(self, job_id: str):
|
|
356
|
+
"""
|
|
357
|
+
**Idempotently** remove a job from **all** queues.
|
|
358
|
+
|
|
359
|
+
Parameters
|
|
360
|
+
----------
|
|
361
|
+
job_id : str
|
|
362
|
+
Identifier to purge.
|
|
363
|
+
"""
|
|
364
|
+
if job_id in self.job_queue:
|
|
365
|
+
self.job_queue.remove(job_id)
|
|
366
|
+
|
|
367
|
+
for priority, jobs in self.priority_queue.items():
|
|
368
|
+
if job_id in jobs:
|
|
369
|
+
jobs.remove(job_id)
|
|
370
|
+
|
|
371
|
+
if job_id in self.job_priorities:
|
|
372
|
+
del self.job_priorities[job_id]
|
|
373
|
+
|
|
374
|
+
def can_start_job(self, resource_monitor: ResourceMonitor) -> bool:
|
|
375
|
+
"""
|
|
376
|
+
**Boolean** predicate that decides whether a new job may be started.
|
|
377
|
+
|
|
378
|
+
Checks resource limits without violating constraints.
|
|
379
|
+
|
|
380
|
+
Parameters
|
|
381
|
+
----------
|
|
382
|
+
resource_monitor : ResourceMonitor
|
|
383
|
+
Source of current host telemetry.
|
|
384
|
+
|
|
385
|
+
Returns
|
|
386
|
+
-------
|
|
387
|
+
bool
|
|
388
|
+
``True`` → job may be started, ``False`` → remain queued.
|
|
389
|
+
"""
|
|
390
|
+
resources = resource_monitor.current_usage
|
|
391
|
+
|
|
392
|
+
# Check if we've reached max concurrent jobs
|
|
393
|
+
if len(self.running_jobs) >= self.resource_limits.max_concurrent_jobs:
|
|
394
|
+
return False
|
|
395
|
+
|
|
396
|
+
# Check CPU usage
|
|
397
|
+
if resources["cpu_percent"] > self.resource_limits.max_cpu_percent:
|
|
398
|
+
logger.info(f"CPU usage too high ({resources['cpu_percent']}%), delaying job start")
|
|
399
|
+
return False
|
|
400
|
+
|
|
401
|
+
# Check memory availability
|
|
402
|
+
if resources["memory_free_gb"] < self.resource_limits.min_memory_gb:
|
|
403
|
+
logger.info(f"Insufficient memory ({resources['memory_free_gb']}GB free), delaying job start")
|
|
404
|
+
return False
|
|
405
|
+
|
|
406
|
+
# Check disk space
|
|
407
|
+
if resources["disk_free_gb"] < self.resource_limits.min_disk_gb:
|
|
408
|
+
logger.info(f"Insufficient disk space ({resources['disk_free_gb']}GB free), delaying job start")
|
|
409
|
+
return False
|
|
410
|
+
return True
|
|
411
|
+
|
|
412
|
+
def get_queue_stats(self) -> Dict[str, Any]:
|
|
413
|
+
"""
|
|
414
|
+
Real-time snapshot for REST ``/queue`` endpoint.
|
|
415
|
+
|
|
416
|
+
Returns
|
|
417
|
+
-------
|
|
418
|
+
dict
|
|
419
|
+
Queue statistics with keys:
|
|
420
|
+
- total_queued: Total jobs in all queues
|
|
421
|
+
- regular_queue_size: Jobs in normal FIFO queue
|
|
422
|
+
- priority_queues: Dict of priority -> count
|
|
423
|
+
- running_jobs: Number of currently running jobs
|
|
424
|
+
- max_concurrent: Maximum concurrent jobs allowed
|
|
425
|
+
"""
|
|
426
|
+
total_queued = len(self.job_queue)
|
|
427
|
+
for jobs in self.priority_queue.values():
|
|
428
|
+
total_queued += len(jobs)
|
|
429
|
+
|
|
430
|
+
return {
|
|
431
|
+
"total_queued": total_queued,
|
|
432
|
+
"regular_queue_size": len(self.job_queue),
|
|
433
|
+
"priority_queues": {prio: len(jobs) for prio, jobs in self.priority_queue.items()},
|
|
434
|
+
"running_jobs": len(self.running_jobs),
|
|
435
|
+
"max_concurrent": self.resource_limits.max_concurrent_jobs,
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
class JobManager:
|
|
440
|
+
"""
|
|
441
|
+
**Async** job manager combining resource monitoring and job scheduling.
|
|
442
|
+
|
|
443
|
+
This class provides the core functionality for:
|
|
444
|
+
|
|
445
|
+
* Resource monitoring via :class:`ResourceMonitor`
|
|
446
|
+
* Job scheduling via :class:`JobPoolManager`
|
|
447
|
+
* REST/Socket.IO API via aiohttp web server
|
|
448
|
+
* Background task for continuous job processing
|
|
449
|
+
|
|
450
|
+
Parameters
|
|
451
|
+
----------
|
|
452
|
+
resource_limits : ResourceLimits, optional
|
|
453
|
+
Host constraints. Creates default instance if None.
|
|
454
|
+
scheduler_type : SchedulerType, optional
|
|
455
|
+
Type of job scheduler to use. Default is ``SchedulerType.NONE``.
|
|
456
|
+
|
|
457
|
+
Attributes
|
|
458
|
+
----------
|
|
459
|
+
jobs : Dict[str, JobInfo]
|
|
460
|
+
Dictionary of all managed jobs
|
|
461
|
+
resource_limits : ResourceLimits
|
|
462
|
+
Current resource constraints
|
|
463
|
+
job_pool : JobPoolManager
|
|
464
|
+
Priority-aware job queue manager
|
|
465
|
+
resource_monitor : ResourceMonitor
|
|
466
|
+
Host resource usage monitor
|
|
467
|
+
ansys_path : str or None
|
|
468
|
+
Path to ANSYS EDT executable
|
|
469
|
+
sio : socketio.AsyncServer
|
|
470
|
+
Socket.IO server for real-time updates
|
|
471
|
+
app : web.Application
|
|
472
|
+
aiohttp web application
|
|
473
|
+
"""
|
|
474
|
+
|
|
475
|
+
def __init__(self, resource_limits: ResourceLimits = None, scheduler_type: SchedulerType = SchedulerType.NONE):
|
|
476
|
+
logging.basicConfig(level=logging.DEBUG, format="%(levelname)s:%(name)s:%(message)s")
|
|
477
|
+
self.jobs: Dict[str, JobInfo] = {}
|
|
478
|
+
if resource_limits is None:
|
|
479
|
+
resource_limits = ResourceLimits()
|
|
480
|
+
self.resource_limits = resource_limits
|
|
481
|
+
self.job_pool = JobPoolManager(self.resource_limits)
|
|
482
|
+
self.resource_monitor = ResourceMonitor()
|
|
483
|
+
self.ansys_path = None # Will be set by JobManagerHandler
|
|
484
|
+
|
|
485
|
+
# Initialize scheduler manager
|
|
486
|
+
self.scheduler_type = scheduler_type
|
|
487
|
+
if scheduler_type in {SchedulerType.SLURM, SchedulerType.LSF}:
|
|
488
|
+
self._sch_mgr = SchedulerManager(scheduler_type)
|
|
489
|
+
else:
|
|
490
|
+
self._sch_mgr = None
|
|
491
|
+
|
|
492
|
+
# Correct SocketIO initialization
|
|
493
|
+
self.sio = socketio.AsyncServer(async_mode="aiohttp", cors_allowed_origins="*")
|
|
494
|
+
self.app = web.Application()
|
|
495
|
+
self.sio.attach(self.app)
|
|
496
|
+
|
|
497
|
+
# Setup routes
|
|
498
|
+
self.setup_routes()
|
|
499
|
+
|
|
500
|
+
# Background task for continuous job processing
|
|
501
|
+
self._processing_task: Optional[asyncio.Task] = None
|
|
502
|
+
self._shutdown = False
|
|
503
|
+
# Start resource monitoring immediately
|
|
504
|
+
self._monitor_task = None
|
|
505
|
+
self._ensure_monitor_running()
|
|
506
|
+
# Background task for scheduler monitoring
|
|
507
|
+
self._scheduler_monitor_task: Optional[asyncio.Task] = None
|
|
508
|
+
self._ensure_scheduler_monitor_running()
|
|
509
|
+
|
|
510
|
+
def _ensure_monitor_running(self):
|
|
511
|
+
"""Ensure resource monitoring task is running."""
|
|
512
|
+
try:
|
|
513
|
+
loop = asyncio.get_running_loop()
|
|
514
|
+
if self._monitor_task is None or self._monitor_task.done():
|
|
515
|
+
self._monitor_task = loop.create_task(self.resource_monitor.monitor_resources())
|
|
516
|
+
except RuntimeError:
|
|
517
|
+
# No event loop running yet, will be started when JobManagerHandler starts
|
|
518
|
+
pass
|
|
519
|
+
|
|
520
|
+
def _ensure_scheduler_monitor_running(self):
|
|
521
|
+
"""Ensure scheduler monitoring task is running for Slurm/LSF jobs."""
|
|
522
|
+
if self._sch_mgr is None:
|
|
523
|
+
# No scheduler configured, skip monitoring
|
|
524
|
+
return
|
|
525
|
+
try:
|
|
526
|
+
loop = asyncio.get_running_loop()
|
|
527
|
+
if self._scheduler_monitor_task is None or self._scheduler_monitor_task.done():
|
|
528
|
+
self._scheduler_monitor_task = loop.create_task(self._monitor_scheduler_jobs())
|
|
529
|
+
logger.info(f"Started scheduler monitoring for {self.scheduler_type.value}")
|
|
530
|
+
except RuntimeError:
|
|
531
|
+
# No event loop running yet, will be started when JobManagerHandler starts
|
|
532
|
+
pass
|
|
533
|
+
|
|
534
|
+
async def _monitor_scheduler_jobs(self):
|
|
535
|
+
"""
|
|
536
|
+
Continuously monitor jobs submitted to external schedulers (Slurm/LSF).
|
|
537
|
+
|
|
538
|
+
This background task polls the scheduler queue every 30 seconds and updates
|
|
539
|
+
job statuses based on the actual scheduler state.
|
|
540
|
+
"""
|
|
541
|
+
logger.info(f"✅ Scheduler monitoring loop started for {self.scheduler_type.value}")
|
|
542
|
+
|
|
543
|
+
while not self._shutdown:
|
|
544
|
+
try:
|
|
545
|
+
# Find all jobs that are currently scheduled (submitted to external scheduler)
|
|
546
|
+
scheduled_jobs = {
|
|
547
|
+
job_id: job_info
|
|
548
|
+
for job_id, job_info in self.jobs.items()
|
|
549
|
+
if job_info.status == JobStatus.SCHEDULED and job_info.scheduler_job_id
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
if not scheduled_jobs:
|
|
553
|
+
# No jobs to monitor, sleep longer
|
|
554
|
+
await asyncio.sleep(10)
|
|
555
|
+
continue
|
|
556
|
+
|
|
557
|
+
# Get current scheduler job list
|
|
558
|
+
scheduler_jobs = await self._sch_mgr.get_jobs()
|
|
559
|
+
scheduler_job_ids = {job["job_id"] for job in scheduler_jobs}
|
|
560
|
+
scheduler_job_states = {job["job_id"]: job["state"] for job in scheduler_jobs}
|
|
561
|
+
|
|
562
|
+
logger.info(
|
|
563
|
+
f"Monitoring {len(scheduled_jobs)} scheduled jobs. Scheduler has {len(scheduler_jobs)} jobs."
|
|
564
|
+
)
|
|
565
|
+
|
|
566
|
+
for job_id, job_info in scheduled_jobs.items():
|
|
567
|
+
scheduler_job_id = job_info.scheduler_job_id
|
|
568
|
+
|
|
569
|
+
if scheduler_job_id in scheduler_job_ids:
|
|
570
|
+
# Job still exists in scheduler queue
|
|
571
|
+
state = scheduler_job_states.get(scheduler_job_id, "UNKNOWN")
|
|
572
|
+
|
|
573
|
+
# Map scheduler states to our JobStatus
|
|
574
|
+
if state in ["RUNNING", "R"]:
|
|
575
|
+
if job_info.status != JobStatus.RUNNING:
|
|
576
|
+
job_info.status = JobStatus.RUNNING
|
|
577
|
+
if not job_info.start_time:
|
|
578
|
+
job_info.start_time = datetime.now()
|
|
579
|
+
await self.sio.emit(
|
|
580
|
+
"job_started",
|
|
581
|
+
{
|
|
582
|
+
"job_id": job_id,
|
|
583
|
+
"scheduler_job_id": scheduler_job_id,
|
|
584
|
+
"start_time": job_info.start_time.isoformat(),
|
|
585
|
+
},
|
|
586
|
+
)
|
|
587
|
+
logger.info(f"Job {job_id} (scheduler ID: {scheduler_job_id}) is now RUNNING")
|
|
588
|
+
|
|
589
|
+
elif state in ["PENDING", "PD", "PEND"]:
|
|
590
|
+
# Job is still pending/queued in scheduler
|
|
591
|
+
logger.debug(f"Job {job_id} (scheduler ID: {scheduler_job_id}) is PENDING in scheduler")
|
|
592
|
+
|
|
593
|
+
elif state in ["COMPLETING", "CG"]:
|
|
594
|
+
# Job is completing, keep current status
|
|
595
|
+
logger.debug(f"Job {job_id} (scheduler ID: {scheduler_job_id}) is COMPLETING")
|
|
596
|
+
|
|
597
|
+
else:
|
|
598
|
+
# Job no longer in scheduler queue - it has completed or failed
|
|
599
|
+
# Check if we can find output files to determine success/failure
|
|
600
|
+
job_info.end_time = datetime.now()
|
|
601
|
+
self.job_pool.running_jobs.discard(job_id)
|
|
602
|
+
|
|
603
|
+
# Try to determine if job completed successfully by checking output directory
|
|
604
|
+
output_dir = job_info.config.working_directory
|
|
605
|
+
log_file = os.path.join(output_dir, f"{job_info.config.jobid}.log")
|
|
606
|
+
|
|
607
|
+
# Default to completed - scheduler jobs that finish typically completed
|
|
608
|
+
# unless we can detect otherwise
|
|
609
|
+
if os.path.exists(log_file):
|
|
610
|
+
try:
|
|
611
|
+
# Check if log indicates success or failure
|
|
612
|
+
with open(log_file, "r") as f:
|
|
613
|
+
log_content = f.read()
|
|
614
|
+
if "error" in log_content.lower() or "failed" in log_content.lower():
|
|
615
|
+
job_info.status = JobStatus.FAILED
|
|
616
|
+
job_info.error = "Job failed based on log file content"
|
|
617
|
+
else:
|
|
618
|
+
job_info.status = JobStatus.COMPLETED
|
|
619
|
+
job_info.return_code = 0
|
|
620
|
+
except Exception as e:
|
|
621
|
+
logger.warning(f"Could not read log file for job {job_id}: {e}")
|
|
622
|
+
job_info.status = JobStatus.COMPLETED
|
|
623
|
+
job_info.return_code = 0
|
|
624
|
+
else:
|
|
625
|
+
# No log file found, assume completed
|
|
626
|
+
job_info.status = JobStatus.COMPLETED
|
|
627
|
+
job_info.return_code = 0
|
|
628
|
+
|
|
629
|
+
await self.sio.emit(
|
|
630
|
+
"job_completed",
|
|
631
|
+
{
|
|
632
|
+
"job_id": job_id,
|
|
633
|
+
"scheduler_job_id": scheduler_job_id,
|
|
634
|
+
"status": job_info.status.value,
|
|
635
|
+
"end_time": job_info.end_time.isoformat(),
|
|
636
|
+
"return_code": job_info.return_code,
|
|
637
|
+
},
|
|
638
|
+
)
|
|
639
|
+
|
|
640
|
+
logger.info(
|
|
641
|
+
f"Job {job_id} (scheduler ID: {scheduler_job_id}) completed with status "
|
|
642
|
+
f"{job_info.status.value}"
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
except Exception as e:
|
|
646
|
+
logger.error(f"Error in scheduler monitoring loop: {e}")
|
|
647
|
+
|
|
648
|
+
# Poll every 5 seconds for responsive status updates
|
|
649
|
+
await asyncio.sleep(5)
|
|
650
|
+
|
|
651
|
+
logger.info("Scheduler monitoring loop stopped")
|
|
652
|
+
|
|
653
|
+
def setup_routes(self):
|
|
654
|
+
"""
|
|
655
|
+
Internal method that wires aiohttp routes to class methods.
|
|
656
|
+
|
|
657
|
+
Called once from __init__. Sets up all REST API endpoints.
|
|
658
|
+
"""
|
|
659
|
+
self.app.router.add_get("/", self.handle_index)
|
|
660
|
+
self.app.router.add_get("/jobs", self.handle_get_jobs)
|
|
661
|
+
self.app.router.add_get("/resources", self.handle_get_resources)
|
|
662
|
+
self.app.router.add_get("/queue", self.handle_get_queue)
|
|
663
|
+
self.app.router.add_post("/jobs/submit", self.handle_submit_job)
|
|
664
|
+
self.app.router.add_post("/jobs/{job_id}/cancel", self.handle_cancel_job)
|
|
665
|
+
self.app.router.add_post("/jobs/{job_id}/priority", self.handle_set_priority)
|
|
666
|
+
self.app.router.add_put("/pool/limits", self.handle_edit_concurrent_limits)
|
|
667
|
+
self.app.router.add_post("/system/start_monitoring", self.handle_start_monitoring)
|
|
668
|
+
self.app.router.add_get("/scheduler/partitions", self.handle_get_partitions)
|
|
669
|
+
self.app.router.add_get("/system/status", self.handle_get_system_status)
|
|
670
|
+
if os.path.exists("static"):
|
|
671
|
+
self.app.router.add_static("/static", "static")
|
|
672
|
+
else:
|
|
673
|
+
os.makedirs("static", exist_ok=True)
|
|
674
|
+
self.app.router.add_static("/static", "static")
|
|
675
|
+
|
|
676
|
+
async def handle_get_system_status(self, request):
|
|
677
|
+
"""
|
|
678
|
+
Get system and scheduler status.
|
|
679
|
+
|
|
680
|
+
Parameters
|
|
681
|
+
----------
|
|
682
|
+
request : aiohttp.web.Request
|
|
683
|
+
HTTP request object
|
|
684
|
+
|
|
685
|
+
Returns
|
|
686
|
+
-------
|
|
687
|
+
aiohttp.web.Response
|
|
688
|
+
JSON response with system status information
|
|
689
|
+
"""
|
|
690
|
+
# Ensure resource monitoring is active
|
|
691
|
+
self._ensure_monitor_running()
|
|
692
|
+
|
|
693
|
+
running_jobs = sum(1 for job in self.jobs.values() if job.status == JobStatus.RUNNING)
|
|
694
|
+
queued_jobs = sum(1 for job in self.jobs.values() if job.status == JobStatus.QUEUED)
|
|
695
|
+
status = {
|
|
696
|
+
"scheduler_detection": {
|
|
697
|
+
"active_scheduler": self.scheduler_type.name,
|
|
698
|
+
"detected_by": "JobManager",
|
|
699
|
+
"backend_available": True,
|
|
700
|
+
},
|
|
701
|
+
"resource_monitoring": {
|
|
702
|
+
"active": self._monitor_task is not None and not self._monitor_task.done(),
|
|
703
|
+
"last_update": self.resource_monitor.current_usage.get("timestamp", "Never"),
|
|
704
|
+
**self.resource_monitor.current_usage,
|
|
705
|
+
},
|
|
706
|
+
"mode": self.scheduler_type.value,
|
|
707
|
+
"local_pool": {
|
|
708
|
+
"running_jobs": running_jobs,
|
|
709
|
+
"queued_jobs": queued_jobs,
|
|
710
|
+
"max_concurrent": self.resource_limits.max_concurrent_jobs,
|
|
711
|
+
},
|
|
712
|
+
}
|
|
713
|
+
return web.json_response(status)
|
|
714
|
+
|
|
715
|
+
async def handle_get_partitions(self, request):
|
|
716
|
+
"""
|
|
717
|
+
Get scheduler partitions/queues.
|
|
718
|
+
|
|
719
|
+
Parameters
|
|
720
|
+
----------
|
|
721
|
+
request : aiohttp.web.Request
|
|
722
|
+
HTTP request object
|
|
723
|
+
|
|
724
|
+
Returns
|
|
725
|
+
-------
|
|
726
|
+
aiohttp.web.Response
|
|
727
|
+
JSON response with partition information or error
|
|
728
|
+
"""
|
|
729
|
+
if not self._sch_mgr:
|
|
730
|
+
return web.json_response({"error": "Scheduler not supported"}, status=400)
|
|
731
|
+
try:
|
|
732
|
+
partitions = await self._sch_mgr.get_partitions()
|
|
733
|
+
return web.json_response(partitions)
|
|
734
|
+
except Exception as e:
|
|
735
|
+
return web.json_response({"error": str(e)}, status=500)
|
|
736
|
+
|
|
737
|
+
async def handle_start_monitoring(self, request):
|
|
738
|
+
"""
|
|
739
|
+
Manually start resource monitoring.
|
|
740
|
+
|
|
741
|
+
Parameters
|
|
742
|
+
----------
|
|
743
|
+
request : aiohttp.web.Request
|
|
744
|
+
HTTP request object
|
|
745
|
+
|
|
746
|
+
Returns
|
|
747
|
+
-------
|
|
748
|
+
aiohttp.web.Response
|
|
749
|
+
JSON response indicating success or failure
|
|
750
|
+
"""
|
|
751
|
+
try:
|
|
752
|
+
if self._monitor_task is None or self._monitor_task.done():
|
|
753
|
+
self._monitor_task = asyncio.create_task(self.resource_monitor.monitor_resources())
|
|
754
|
+
return web.json_response({"success": True, "message": "Resource monitoring started"})
|
|
755
|
+
else:
|
|
756
|
+
return web.json_response({"success": True, "message": "Resource monitoring already active"})
|
|
757
|
+
except Exception as e:
|
|
758
|
+
return web.json_response({"success": False, "error": str(e)}, status=500)
|
|
759
|
+
|
|
760
|
+
async def handle_index(self, request):
|
|
761
|
+
"""
|
|
762
|
+
Serve the main web interface.
|
|
763
|
+
|
|
764
|
+
Parameters
|
|
765
|
+
----------
|
|
766
|
+
request : aiohttp.web.Request
|
|
767
|
+
HTTP request object
|
|
768
|
+
|
|
769
|
+
Returns
|
|
770
|
+
-------
|
|
771
|
+
aiohttp.web.FileResponse
|
|
772
|
+
Static HTML file
|
|
773
|
+
"""
|
|
774
|
+
return web.FileResponse("static/index.html")
|
|
775
|
+
|
|
776
|
+
async def handle_submit_job(self, request):
|
|
777
|
+
"""
|
|
778
|
+
Submit a new job for execution.
|
|
779
|
+
|
|
780
|
+
Parameters
|
|
781
|
+
----------
|
|
782
|
+
request : aiohttp.web.Request
|
|
783
|
+
HTTP POST request with JSON payload
|
|
784
|
+
|
|
785
|
+
Returns
|
|
786
|
+
-------
|
|
787
|
+
aiohttp.web.Response
|
|
788
|
+
JSON response with job ID or error
|
|
789
|
+
|
|
790
|
+
Notes
|
|
791
|
+
-----
|
|
792
|
+
Expected JSON payload:
|
|
793
|
+
|
|
794
|
+
.. code-block:: json
|
|
795
|
+
|
|
796
|
+
{
|
|
797
|
+
"config": {
|
|
798
|
+
"jobid": "job_123",
|
|
799
|
+
"project_path": "/path/to/project.aedt",
|
|
800
|
+
... other HFSS config fields
|
|
801
|
+
},
|
|
802
|
+
"priority": 0
|
|
803
|
+
}
|
|
804
|
+
"""
|
|
805
|
+
try:
|
|
806
|
+
logger.info("🔍 ROUTE HIT: /jobs/submit")
|
|
807
|
+
data = await request.json()
|
|
808
|
+
config_dict = data.get("config", {})
|
|
809
|
+
|
|
810
|
+
# Create HFSS config from dictionary
|
|
811
|
+
config = HFSSSimulationConfig.from_dict(config_dict)
|
|
812
|
+
if "user" not in data["config"] or data["config"]["user"] is None:
|
|
813
|
+
data["config"]["user"] = getpass.getuser()
|
|
814
|
+
|
|
815
|
+
# Overwrite scheduler type and user with authoritative values
|
|
816
|
+
if config.scheduler_type != self.scheduler_type:
|
|
817
|
+
print("Overriding scheduler type from client:", config.scheduler_type, "→", self.scheduler_type)
|
|
818
|
+
config.scheduler_type = self.scheduler_type
|
|
819
|
+
|
|
820
|
+
# Submit the job
|
|
821
|
+
job_id = await self.submit_job(config)
|
|
822
|
+
|
|
823
|
+
return web.json_response(
|
|
824
|
+
{"success": True, "job_id": job_id, "message": f"Job {job_id} submitted successfully"}
|
|
825
|
+
)
|
|
826
|
+
|
|
827
|
+
except Exception as e:
|
|
828
|
+
return web.json_response({"success": False, "error": str(e)}, status=400)
|
|
829
|
+
|
|
830
|
+
async def handle_cancel_job(self, request):
|
|
831
|
+
"""
|
|
832
|
+
Cancel a running or queued job.
|
|
833
|
+
|
|
834
|
+
Parameters
|
|
835
|
+
----------
|
|
836
|
+
request : aiohttp.web.Request
|
|
837
|
+
HTTP request with job_id in URL path
|
|
838
|
+
|
|
839
|
+
Returns
|
|
840
|
+
-------
|
|
841
|
+
aiohttp.web.Response
|
|
842
|
+
JSON response indicating success or failure
|
|
843
|
+
"""
|
|
844
|
+
job_id = request.match_info.get("job_id")
|
|
845
|
+
|
|
846
|
+
if job_id not in self.jobs:
|
|
847
|
+
return web.json_response({"success": False, "error": f"Job {job_id} not found"}, status=404)
|
|
848
|
+
|
|
849
|
+
success = await self.cancel_job(job_id)
|
|
850
|
+
|
|
851
|
+
return web.json_response(
|
|
852
|
+
{"success": success, "message": f"Job {job_id} cancellation {'initiated' if success else 'failed'}"}
|
|
853
|
+
)
|
|
854
|
+
|
|
855
|
+
async def handle_get_jobs(self, request):
|
|
856
|
+
"""
|
|
857
|
+
Get list of all jobs.
|
|
858
|
+
|
|
859
|
+
Parameters
|
|
860
|
+
----------
|
|
861
|
+
request : aiohttp.web.Request
|
|
862
|
+
HTTP request object
|
|
863
|
+
|
|
864
|
+
Returns
|
|
865
|
+
-------
|
|
866
|
+
aiohttp.web.Response
|
|
867
|
+
JSON array of job objects with status information
|
|
868
|
+
"""
|
|
869
|
+
jobs_data = []
|
|
870
|
+
for job_id, job_info in self.jobs.items():
|
|
871
|
+
jobs_data.append(
|
|
872
|
+
{
|
|
873
|
+
"id": job_id,
|
|
874
|
+
"config": job_info.config.to_dict(),
|
|
875
|
+
"status": job_info.status.value,
|
|
876
|
+
"start_time": job_info.start_time.isoformat() if job_info.start_time else None,
|
|
877
|
+
"end_time": job_info.end_time.isoformat() if job_info.end_time else None,
|
|
878
|
+
"return_code": job_info.return_code,
|
|
879
|
+
"scheduler_job_id": job_info.scheduler_job_id,
|
|
880
|
+
}
|
|
881
|
+
)
|
|
882
|
+
|
|
883
|
+
return web.json_response(jobs_data)
|
|
884
|
+
|
|
885
|
+
async def handle_get_resources(self, request):
|
|
886
|
+
"""
|
|
887
|
+
Get current resource usage.
|
|
888
|
+
|
|
889
|
+
Parameters
|
|
890
|
+
----------
|
|
891
|
+
request : aiohttp.web.Request
|
|
892
|
+
HTTP request object
|
|
893
|
+
|
|
894
|
+
Returns
|
|
895
|
+
-------
|
|
896
|
+
aiohttp.web.Response
|
|
897
|
+
JSON with current host resource usage
|
|
898
|
+
"""
|
|
899
|
+
return web.json_response(self.resource_monitor.current_usage)
|
|
900
|
+
|
|
901
|
+
async def handle_get_queue(self, request):
|
|
902
|
+
"""
|
|
903
|
+
Get queue statistics.
|
|
904
|
+
|
|
905
|
+
Parameters
|
|
906
|
+
----------
|
|
907
|
+
request : aiohttp.web.Request
|
|
908
|
+
HTTP request object
|
|
909
|
+
|
|
910
|
+
Returns
|
|
911
|
+
-------
|
|
912
|
+
aiohttp.web.Response
|
|
913
|
+
JSON with queue statistics for dashboard display
|
|
914
|
+
"""
|
|
915
|
+
stats = self.job_pool.get_queue_stats()
|
|
916
|
+
logger.info(f"/queue endpoint returning max_concurrent = {stats['max_concurrent']}")
|
|
917
|
+
return web.json_response(stats)
|
|
918
|
+
|
|
919
|
+
async def handle_set_priority(self, request):
|
|
920
|
+
"""
|
|
921
|
+
Change job priority and re-queue.
|
|
922
|
+
|
|
923
|
+
Parameters
|
|
924
|
+
----------
|
|
925
|
+
request : aiohttp.web.Request
|
|
926
|
+
HTTP POST request with JSON payload
|
|
927
|
+
|
|
928
|
+
Returns
|
|
929
|
+
-------
|
|
930
|
+
aiohttp.web.Response
|
|
931
|
+
JSON response indicating success or failure
|
|
932
|
+
"""
|
|
933
|
+
job_id = request.match_info.get("job_id")
|
|
934
|
+
|
|
935
|
+
if job_id not in self.jobs:
|
|
936
|
+
return web.json_response({"success": False, "error": "Job not found"}, status=404)
|
|
937
|
+
|
|
938
|
+
try:
|
|
939
|
+
data = await request.json()
|
|
940
|
+
priority = data.get("priority", 0)
|
|
941
|
+
|
|
942
|
+
# Update priority in the pool
|
|
943
|
+
self.job_pool.remove_job(job_id)
|
|
944
|
+
self.job_pool.add_job(job_id, priority)
|
|
945
|
+
|
|
946
|
+
return web.json_response({"success": True, "message": f"Priority set to {priority}"})
|
|
947
|
+
|
|
948
|
+
except Exception as e:
|
|
949
|
+
return web.json_response({"success": False, "error": str(e)}, status=400)
|
|
950
|
+
|
|
951
|
+
async def handle_edit_concurrent_limits(self, request):
|
|
952
|
+
"""
|
|
953
|
+
Edit concurrent job limits.
|
|
954
|
+
|
|
955
|
+
Parameters
|
|
956
|
+
----------
|
|
957
|
+
request : aiohttp.web.Request
|
|
958
|
+
HTTP PUT request with JSON payload
|
|
959
|
+
|
|
960
|
+
Returns
|
|
961
|
+
-------
|
|
962
|
+
aiohttp.web.Response
|
|
963
|
+
JSON response indicating success or failure
|
|
964
|
+
"""
|
|
965
|
+
try:
|
|
966
|
+
data = await request.json()
|
|
967
|
+
|
|
968
|
+
if not data:
|
|
969
|
+
return web.json_response({"error": "No data provided"}, status=400)
|
|
970
|
+
|
|
971
|
+
# Update the concurrent job limits
|
|
972
|
+
updated_limits = await self.edit_concurrent_limits(data)
|
|
973
|
+
|
|
974
|
+
if updated_limits:
|
|
975
|
+
return web.json_response(
|
|
976
|
+
{"success": True, "message": "Concurrent job limits updated successfully", "limits": updated_limits}
|
|
977
|
+
)
|
|
978
|
+
else:
|
|
979
|
+
return web.json_response({"error": "Failed to update limits"}, status=400)
|
|
980
|
+
|
|
981
|
+
except Exception as e:
|
|
982
|
+
return web.json_response({"error": str(e)}, status=500)
|
|
983
|
+
|
|
984
|
+
async def wait_until_all_done(self) -> None:
|
|
985
|
+
"""
|
|
986
|
+
**Coroutine** that blocks until **every** job reaches a terminal state.
|
|
987
|
+
|
|
988
|
+
Safe to call from REST handlers or CLI scripts. Polls job status
|
|
989
|
+
until all jobs are completed, failed, or cancelled.
|
|
990
|
+
"""
|
|
991
|
+
while True:
|
|
992
|
+
# All jobs that are NOT in a terminal state
|
|
993
|
+
active = [
|
|
994
|
+
j
|
|
995
|
+
for j in self.jobs.values()
|
|
996
|
+
if j.status not in {JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED}
|
|
997
|
+
]
|
|
998
|
+
if not active:
|
|
999
|
+
return
|
|
1000
|
+
await asyncio.sleep(1) # be nice to the event-loop
|
|
1001
|
+
|
|
1002
|
+
async def submit_job(self, config: HFSSSimulationConfig, priority: int = 0) -> str:
|
|
1003
|
+
"""
|
|
1004
|
+
**Async** entry point for job submission.
|
|
1005
|
+
|
|
1006
|
+
Parameters
|
|
1007
|
+
----------
|
|
1008
|
+
config : HFSSSimulationConfig
|
|
1009
|
+
Validated simulation configuration.
|
|
1010
|
+
priority : int, optional
|
|
1011
|
+
Job priority. Default is ``0``.
|
|
1012
|
+
|
|
1013
|
+
Returns
|
|
1014
|
+
-------
|
|
1015
|
+
str
|
|
1016
|
+
Unique job identifier (same as ``config.jobid``).
|
|
1017
|
+
|
|
1018
|
+
Notes
|
|
1019
|
+
-----
|
|
1020
|
+
This method:
|
|
1021
|
+
1. Creates a JobInfo object with QUEUED status
|
|
1022
|
+
2. Adds the job to the appropriate queue
|
|
1023
|
+
3. Notifies web clients via Socket.IO
|
|
1024
|
+
4. Starts the processing loop if not already running
|
|
1025
|
+
"""
|
|
1026
|
+
job_id = config.jobid
|
|
1027
|
+
|
|
1028
|
+
# Create job info
|
|
1029
|
+
job_info = JobInfo(config=config, status=JobStatus.QUEUED, priority=priority)
|
|
1030
|
+
self.jobs[job_id] = job_info
|
|
1031
|
+
|
|
1032
|
+
# Add to job pool
|
|
1033
|
+
self.job_pool.add_job(job_id, priority)
|
|
1034
|
+
|
|
1035
|
+
# Notify web clients
|
|
1036
|
+
await self.sio.emit(
|
|
1037
|
+
"job_queued",
|
|
1038
|
+
{"job_id": job_id, "priority": priority, "queue_position": self.job_pool.get_queue_stats()["total_queued"]},
|
|
1039
|
+
)
|
|
1040
|
+
|
|
1041
|
+
logger.info(f"Job {job_id} queued with priority {priority}")
|
|
1042
|
+
|
|
1043
|
+
# Trigger processing if not already running
|
|
1044
|
+
if not self._processing_task or self._processing_task.done():
|
|
1045
|
+
self._processing_task = asyncio.create_task(self._process_jobs_continuously())
|
|
1046
|
+
|
|
1047
|
+
return job_id
|
|
1048
|
+
|
|
1049
|
+
async def _process_jobs_continuously(self):
|
|
1050
|
+
"""
|
|
1051
|
+
Continuously process jobs until shutdown is requested.
|
|
1052
|
+
|
|
1053
|
+
This is the main job processing loop that:
|
|
1054
|
+
- Checks if new jobs can be started based on resource limits
|
|
1055
|
+
- Dequeues the highest priority job
|
|
1056
|
+
- Starts job execution in a separate task
|
|
1057
|
+
- Sleeps when no jobs can be started or queue is empty
|
|
1058
|
+
"""
|
|
1059
|
+
logger.info("✅ Job processing loop started.")
|
|
1060
|
+
while not self._shutdown:
|
|
1061
|
+
can_start = self.job_pool.can_start_job(self.resource_monitor)
|
|
1062
|
+
if can_start:
|
|
1063
|
+
next_job_id = self.job_pool.get_next_job()
|
|
1064
|
+
if next_job_id:
|
|
1065
|
+
logger.info(f"Dequeued job {next_job_id}. Starting...")
|
|
1066
|
+
self.job_pool.running_jobs.add(next_job_id)
|
|
1067
|
+
asyncio.create_task(self._process_single_job(next_job_id))
|
|
1068
|
+
else:
|
|
1069
|
+
logger.info("Queue is empty, sleeping.")
|
|
1070
|
+
await asyncio.sleep(1)
|
|
1071
|
+
else:
|
|
1072
|
+
logger.warning("Cannot start new job, waiting...")
|
|
1073
|
+
await asyncio.sleep(5)
|
|
1074
|
+
|
|
1075
|
+
await asyncio.sleep(0.2)
|
|
1076
|
+
|
|
1077
|
+
async def _process_single_job(self, job_id: str):
|
|
1078
|
+
"""
|
|
1079
|
+
Process a single job from the pool.
|
|
1080
|
+
|
|
1081
|
+
Parameters
|
|
1082
|
+
----------
|
|
1083
|
+
job_id : str
|
|
1084
|
+
Job identifier to process
|
|
1085
|
+
|
|
1086
|
+
Notes
|
|
1087
|
+
-----
|
|
1088
|
+
This method handles:
|
|
1089
|
+
- Local execution via subprocess
|
|
1090
|
+
- Scheduler submission (SLURM/LSF)
|
|
1091
|
+
- Status updates and notifications
|
|
1092
|
+
- Error handling and cleanup
|
|
1093
|
+
"""
|
|
1094
|
+
job_info = self.jobs.get(job_id)
|
|
1095
|
+
if not job_info or job_info.status != JobStatus.QUEUED:
|
|
1096
|
+
self.job_pool.running_jobs.discard(job_id)
|
|
1097
|
+
return
|
|
1098
|
+
|
|
1099
|
+
# Update job status
|
|
1100
|
+
job_info.status = JobStatus.RUNNING
|
|
1101
|
+
job_info.start_time = datetime.now()
|
|
1102
|
+
job_info.local_resources = self.resource_monitor.current_usage.copy()
|
|
1103
|
+
|
|
1104
|
+
# Notify web clients
|
|
1105
|
+
await self.sio.emit(
|
|
1106
|
+
"job_started",
|
|
1107
|
+
{"job_id": job_id, "start_time": job_info.start_time.isoformat(), "resources": job_info.local_resources},
|
|
1108
|
+
)
|
|
1109
|
+
|
|
1110
|
+
logger.info(f"Job {job_id} started")
|
|
1111
|
+
|
|
1112
|
+
try:
|
|
1113
|
+
# Run the simulation
|
|
1114
|
+
if job_info.config.scheduler_type != SchedulerType.NONE:
|
|
1115
|
+
# Make sure the executable path is present
|
|
1116
|
+
if not job_info.config.ansys_edt_path or not os.path.exists(job_info.config.ansys_edt_path):
|
|
1117
|
+
if self.ansys_path and os.path.exists(self.ansys_path):
|
|
1118
|
+
job_info.config = HFSSSimulationConfig(
|
|
1119
|
+
**{**job_info.config.model_dump(), "ansys_edt_path": self.ansys_path}
|
|
1120
|
+
)
|
|
1121
|
+
logger.info(f"Using JobManager's detected ANSYS path: {self.ansys_path}")
|
|
1122
|
+
else:
|
|
1123
|
+
raise FileNotFoundError(
|
|
1124
|
+
f"ANSYS executable not found. Config path: {job_info.config.ansys_edt_path}, "
|
|
1125
|
+
f"Manager path: {self.ansys_path}"
|
|
1126
|
+
)
|
|
1127
|
+
|
|
1128
|
+
# Now generate the script – the path is guaranteed to be non-empty
|
|
1129
|
+
result = job_info.config.submit_to_scheduler()
|
|
1130
|
+
job_info.scheduler_job_id = job_info.config._extract_job_id(result.stdout)
|
|
1131
|
+
job_info.status = JobStatus.SCHEDULED
|
|
1132
|
+
logger.info(
|
|
1133
|
+
f"Job {job_id} submitted to scheduler with ID: {job_info.scheduler_job_id}, status: SCHEDULED"
|
|
1134
|
+
)
|
|
1135
|
+
await self.sio.emit("job_scheduled", {"job_id": job_id, "scheduler_job_id": job_info.scheduler_job_id})
|
|
1136
|
+
|
|
1137
|
+
else:
|
|
1138
|
+
# ---------------- local mode – same guarantee -----------------
|
|
1139
|
+
if not job_info.config.ansys_edt_path or not os.path.exists(job_info.config.ansys_edt_path):
|
|
1140
|
+
if self.ansys_path and os.path.exists(self.ansys_path):
|
|
1141
|
+
job_info.config = HFSSSimulationConfig(
|
|
1142
|
+
**{**job_info.config.model_dump(), "ansys_edt_path": self.ansys_path}
|
|
1143
|
+
)
|
|
1144
|
+
logger.info(f"Using JobManager's detected ANSYS path: {self.ansys_path}")
|
|
1145
|
+
else:
|
|
1146
|
+
raise FileNotFoundError(
|
|
1147
|
+
f"ANSYS executable not found. Config path: {job_info.config.ansys_edt_path}, "
|
|
1148
|
+
f"Manager path: {self.ansys_path}"
|
|
1149
|
+
)
|
|
1150
|
+
|
|
1151
|
+
# Generate command as list for secure execution
|
|
1152
|
+
command_list = job_info.config.generate_command_list()
|
|
1153
|
+
|
|
1154
|
+
# Log the command being executed for debugging
|
|
1155
|
+
logger.info(f"Executing command for job {job_id}: {' '.join(command_list)}")
|
|
1156
|
+
logger.info(f"ANSYS executable path: {job_info.config.ansys_edt_path}")
|
|
1157
|
+
logger.info(f"Project path: {job_info.config.project_path}")
|
|
1158
|
+
|
|
1159
|
+
# Check if project file exists
|
|
1160
|
+
if not os.path.exists(job_info.config.project_path):
|
|
1161
|
+
raise FileNotFoundError(f"Project file not found: {job_info.config.project_path}")
|
|
1162
|
+
|
|
1163
|
+
# Run locally - using asyncio subprocess for better control with secure command list
|
|
1164
|
+
process = await asyncio.create_subprocess_exec(
|
|
1165
|
+
*command_list,
|
|
1166
|
+
stdout=asyncio.subprocess.PIPE,
|
|
1167
|
+
stderr=asyncio.subprocess.PIPE,
|
|
1168
|
+
)
|
|
1169
|
+
|
|
1170
|
+
job_info.process = process
|
|
1171
|
+
|
|
1172
|
+
# Wait for completion with timeout (24 hours max)
|
|
1173
|
+
try:
|
|
1174
|
+
stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=86400)
|
|
1175
|
+
|
|
1176
|
+
job_info.return_code = process.returncode
|
|
1177
|
+
job_info.output = stdout.decode() if stdout else ""
|
|
1178
|
+
job_info.error = stderr.decode() if stderr else ""
|
|
1179
|
+
|
|
1180
|
+
if process.returncode == 0:
|
|
1181
|
+
job_info.status = JobStatus.COMPLETED
|
|
1182
|
+
logger.info(f"Job {job_id} completed successfully")
|
|
1183
|
+
else:
|
|
1184
|
+
job_info.status = JobStatus.FAILED
|
|
1185
|
+
logger.error(f"Job {job_id} failed with return code {process.returncode}")
|
|
1186
|
+
|
|
1187
|
+
except asyncio.TimeoutError:
|
|
1188
|
+
job_info.status = JobStatus.FAILED
|
|
1189
|
+
job_info.error = "Job timed out after 24 hours"
|
|
1190
|
+
process.terminate()
|
|
1191
|
+
logger.error(f"Job {job_id} timed out")
|
|
1192
|
+
|
|
1193
|
+
except Exception as e:
|
|
1194
|
+
job_info.status = JobStatus.FAILED
|
|
1195
|
+
job_info.error = str(e)
|
|
1196
|
+
logger.error(f"Job {job_id} failed with error: {e}")
|
|
1197
|
+
|
|
1198
|
+
finally:
|
|
1199
|
+
job_info.end_time = datetime.now()
|
|
1200
|
+
self.job_pool.running_jobs.discard(job_id)
|
|
1201
|
+
|
|
1202
|
+
# Notify web clients
|
|
1203
|
+
await self.sio.emit(
|
|
1204
|
+
"job_completed",
|
|
1205
|
+
{
|
|
1206
|
+
"job_id": job_id,
|
|
1207
|
+
"status": job_info.status.value,
|
|
1208
|
+
"end_time": job_info.end_time.isoformat(),
|
|
1209
|
+
"return_code": job_info.return_code,
|
|
1210
|
+
},
|
|
1211
|
+
)
|
|
1212
|
+
|
|
1213
|
+
async def cancel_job(self, job_id: str) -> bool:
|
|
1214
|
+
"""
|
|
1215
|
+
**Cancel** a queued or running job.
|
|
1216
|
+
|
|
1217
|
+
Parameters
|
|
1218
|
+
----------
|
|
1219
|
+
job_id : str
|
|
1220
|
+
Identifier returned by :meth:`submit_job`.
|
|
1221
|
+
|
|
1222
|
+
Returns
|
|
1223
|
+
-------
|
|
1224
|
+
bool
|
|
1225
|
+
``True`` → cancellation succeeded, ``False`` → job not found or
|
|
1226
|
+
already terminal.
|
|
1227
|
+
|
|
1228
|
+
Notes
|
|
1229
|
+
-----
|
|
1230
|
+
For queued jobs: immediately removes from queue and marks as cancelled.
|
|
1231
|
+
For running jobs: attempts to terminate the process and cleanup.
|
|
1232
|
+
"""
|
|
1233
|
+
job_info = self.jobs.get(job_id)
|
|
1234
|
+
if not job_info:
|
|
1235
|
+
return False
|
|
1236
|
+
|
|
1237
|
+
if job_info.status == JobStatus.QUEUED:
|
|
1238
|
+
# Remove from queue
|
|
1239
|
+
self.job_pool.remove_job(job_id)
|
|
1240
|
+
job_info.status = JobStatus.CANCELLED
|
|
1241
|
+
job_info.end_time = datetime.now()
|
|
1242
|
+
return True
|
|
1243
|
+
|
|
1244
|
+
elif job_info.status == JobStatus.SCHEDULED:
|
|
1245
|
+
# Cancel job in external scheduler
|
|
1246
|
+
if job_info.scheduler_job_id:
|
|
1247
|
+
try:
|
|
1248
|
+
success = await self._sch_mgr.cancel_job(job_info.scheduler_job_id)
|
|
1249
|
+
if success:
|
|
1250
|
+
job_info.status = JobStatus.CANCELLED
|
|
1251
|
+
job_info.end_time = datetime.now()
|
|
1252
|
+
self.job_pool.running_jobs.discard(job_id)
|
|
1253
|
+
logger.info(f"Cancelled scheduler job {job_id} (scheduler ID: {job_info.scheduler_job_id})")
|
|
1254
|
+
return True
|
|
1255
|
+
else:
|
|
1256
|
+
logger.warning(f"Failed to cancel scheduler job {job_info.scheduler_job_id}")
|
|
1257
|
+
return False
|
|
1258
|
+
except Exception as e:
|
|
1259
|
+
logger.error(f"Error cancelling scheduler job {job_id}: {e}")
|
|
1260
|
+
return False
|
|
1261
|
+
return False
|
|
1262
|
+
|
|
1263
|
+
elif job_info.status == JobStatus.RUNNING and job_info.process:
|
|
1264
|
+
try:
|
|
1265
|
+
job_info.process.terminate()
|
|
1266
|
+
await asyncio.sleep(2)
|
|
1267
|
+
if job_info.process.returncode is None:
|
|
1268
|
+
job_info.process.kill()
|
|
1269
|
+
|
|
1270
|
+
job_info.status = JobStatus.CANCELLED
|
|
1271
|
+
job_info.end_time = datetime.now()
|
|
1272
|
+
return True
|
|
1273
|
+
|
|
1274
|
+
except Exception as e:
|
|
1275
|
+
logger.error(f"Failed to cancel job {job_id}: {e}")
|
|
1276
|
+
return False
|
|
1277
|
+
|
|
1278
|
+
return False
|
|
1279
|
+
|
|
1280
|
+
async def edit_concurrent_limits(self, update_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
1281
|
+
"""
|
|
1282
|
+
Edit concurrent job limits in the pool.
|
|
1283
|
+
|
|
1284
|
+
Parameters
|
|
1285
|
+
----------
|
|
1286
|
+
update_data : dict
|
|
1287
|
+
Fields to update in resource limits. Valid fields:
|
|
1288
|
+
- max_concurrent_jobs: Positive integer
|
|
1289
|
+
- max_cpu_percent: Float between 0 and 100
|
|
1290
|
+
- min_memory_gb: Non-negative float
|
|
1291
|
+
- min_disk_gb: Non-negative float
|
|
1292
|
+
|
|
1293
|
+
Returns
|
|
1294
|
+
-------
|
|
1295
|
+
dict or None
|
|
1296
|
+
Updated limits data or None if update failed
|
|
1297
|
+
|
|
1298
|
+
Raises
|
|
1299
|
+
------
|
|
1300
|
+
ValueError
|
|
1301
|
+
If any field validation fails
|
|
1302
|
+
"""
|
|
1303
|
+
try:
|
|
1304
|
+
# Define allowed fields for editing
|
|
1305
|
+
allowed_fields = ["max_concurrent_jobs", "max_cpu_percent", "min_memory_gb", "min_disk_gb"]
|
|
1306
|
+
|
|
1307
|
+
# Update allowed fields
|
|
1308
|
+
updated = False
|
|
1309
|
+
old_limits = {}
|
|
1310
|
+
|
|
1311
|
+
for field in allowed_fields:
|
|
1312
|
+
if field in update_data:
|
|
1313
|
+
old_value = getattr(self.resource_limits, field)
|
|
1314
|
+
new_value = update_data[field]
|
|
1315
|
+
|
|
1316
|
+
# Validate the new value
|
|
1317
|
+
if field == "max_concurrent_jobs" and (not isinstance(new_value, int) or new_value < 1):
|
|
1318
|
+
raise ValueError("max_concurrent_jobs must be a positive integer")
|
|
1319
|
+
elif field == "max_cpu_percent" and (
|
|
1320
|
+
not isinstance(new_value, (int, float)) or new_value <= 0 or new_value > 100
|
|
1321
|
+
):
|
|
1322
|
+
raise ValueError("max_cpu_percent must be between 0 and 100")
|
|
1323
|
+
elif field in ["min_memory_gb", "min_disk_gb"] and (
|
|
1324
|
+
not isinstance(new_value, (int, float)) or new_value < 0
|
|
1325
|
+
):
|
|
1326
|
+
raise ValueError(f"{field} must be a non-negative number")
|
|
1327
|
+
|
|
1328
|
+
old_limits[field] = old_value
|
|
1329
|
+
setattr(self.resource_limits, field, new_value)
|
|
1330
|
+
self.job_pool.resource_limits = self.resource_limits
|
|
1331
|
+
updated = True
|
|
1332
|
+
|
|
1333
|
+
if updated:
|
|
1334
|
+
# Log the changes
|
|
1335
|
+
for field in old_limits:
|
|
1336
|
+
logger.info(
|
|
1337
|
+
f"Resource limit {field} changed from {old_limits[field]} to "
|
|
1338
|
+
f"{getattr(self.resource_limits, field)}"
|
|
1339
|
+
)
|
|
1340
|
+
|
|
1341
|
+
# Notify web clients about the update
|
|
1342
|
+
await self.sio.emit(
|
|
1343
|
+
"limits_updated",
|
|
1344
|
+
{
|
|
1345
|
+
"old_limits": old_limits,
|
|
1346
|
+
"new_limits": {
|
|
1347
|
+
"max_concurrent_jobs": self.resource_limits.max_concurrent_jobs,
|
|
1348
|
+
"max_cpu_percent": self.resource_limits.max_cpu_percent,
|
|
1349
|
+
"min_memory_gb": self.resource_limits.min_memory_gb,
|
|
1350
|
+
"min_disk_gb": self.resource_limits.min_disk_gb,
|
|
1351
|
+
},
|
|
1352
|
+
},
|
|
1353
|
+
)
|
|
1354
|
+
|
|
1355
|
+
# Return updated limits data
|
|
1356
|
+
return {
|
|
1357
|
+
"max_concurrent_jobs": self.resource_limits.max_concurrent_jobs,
|
|
1358
|
+
"max_cpu_percent": self.resource_limits.max_cpu_percent,
|
|
1359
|
+
"min_memory_gb": self.resource_limits.min_memory_gb,
|
|
1360
|
+
"min_disk_gb": self.resource_limits.min_disk_gb,
|
|
1361
|
+
}
|
|
1362
|
+
|
|
1363
|
+
return None
|
|
1364
|
+
|
|
1365
|
+
except Exception as e:
|
|
1366
|
+
logger.error(f"Failed to update concurrent limits: {e}")
|
|
1367
|
+
return None
|
|
1368
|
+
|
|
1369
|
+
|
|
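# Illustrative usage sketch (not part of the package diff): adjusting resource
# limits on a running manager through edit_concurrent_limits() above. The
# variable name ``service`` and the helper name are placeholders for whichever
# object exposes the method; only the documented fields are passed.
async def _example_edit_limits(service) -> None:
    new_limits = await service.edit_concurrent_limits(
        {"max_concurrent_jobs": 4, "max_cpu_percent": 80.0}
    )
    if new_limits is None:
        logger.warning("Limit update was rejected or failed")
    else:
        logger.info(f"Active limits: {new_limits}")
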
async def submit_job_to_manager(
    config: HFSSSimulationConfig, priority: int = 0, manager_url: str = "http://localhost:8080"
) -> str:
    """
    **Helper** coroutine that submits a job to a **remote** Job Manager.

    Falls back to **local** execution if the HTTP call fails (offline mode).

    Parameters
    ----------
    config : HFSSSimulationConfig
        Validated configuration.
    priority : int, optional
        Job priority. Default is ``0``.
    manager_url : str, optional
        Base URL of the manager. Default is ``"http://localhost:8080"``.

    Returns
    -------
    str
        Job identifier (local or remote).

    Raises
    ------
    Exception
        If **both** remote and local execution fail.

    Notes
    -----
    This function is useful for clients that want to submit jobs to a
    remote manager but maintain offline capability.
    """
    try:
        async with aiohttp.ClientSession() as session:
            url = f"{manager_url}/jobs/submit"
            async with session.post(url, json={"config": config.to_dict(), "priority": priority}) as response:
                result = await response.json()

                if result["success"]:
                    return result["job_id"]
                else:
                    raise Exception(f"Job submission failed: {result['error']}")

    except Exception as e:
        logger.error(f"Failed to submit job to manager: {e}")
        # Fall back to local execution
        return await config.run_simulation()

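# Illustrative usage sketch for the helper above: submit an already-validated
# configuration and let the helper fall back to local execution if the manager
# is unreachable. ``_example_submit`` is a placeholder name; the config is
# assumed to be built and validated elsewhere.
async def _example_submit(config: HFSSSimulationConfig) -> None:
    job_id = await submit_job_to_manager(config, priority=5)
    logger.info(f"Job accepted with id {job_id}")
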
# --------------------------------------------------------------------------- #
# SchedulerManager – live SLURM / LSF introspection
# --------------------------------------------------------------------------- #
class SchedulerManager:
    """
    Thin async wrapper around cluster scheduler commands.

    Provides live introspection of SLURM and LSF clusters including:

    * List of partitions / queues with resource information
    * Per-partition: total & free cores, total & free memory
    * Global job table (running, pending, etc.)

    All methods are **coroutines** so they can be awaited from the REST layer
    without blocking the event-loop.

    Parameters
    ----------
    scheduler_type : SchedulerType
        Type of scheduler (SLURM or LSF only)

    Raises
    ------
    ValueError
        If scheduler_type is not SLURM or LSF
    """

    def __init__(self, scheduler_type: SchedulerType):
        if scheduler_type not in {SchedulerType.SLURM, SchedulerType.LSF}:
            raise ValueError("Only SLURM and LSF are supported")
        self.scheduler_type = scheduler_type

    async def get_partitions(self) -> List[Dict[str, Any]]:
        """
        Get list of scheduler partitions/queues with resource information.

        Returns
        -------
        List[Dict[str, Any]]
            List of partition dictionaries with keys:
            - name: Partition/queue name
            - cores_total: Total available cores
            - cores_used: Currently used cores
            - memory_total_gb: Total memory in GB
            - memory_used_gb: Currently used memory in GB

        Raises
        ------
        RuntimeError
            If scheduler command execution fails
        """
        if self.scheduler_type == SchedulerType.SLURM:
            return await self._slurm_partitions()
        else:  # LSF
            return await self._lsf_partitions()

    async def get_jobs(self) -> List[Dict[str, Any]]:
        """
        Get global job table (all users).

        Returns
        -------
        List[Dict[str, Any]]
            List of job dictionaries with keys:
            - job_id: Scheduler job ID
            - partition: Partition/queue name
            - user: Job owner username
            - state: Job state (RUNNING, PENDING, etc.)
            - nodes: Number of nodes allocated
            - cpus: Number of CPUs allocated
            - memory_gb: Memory allocated in GB

        Raises
        ------
        RuntimeError
            If scheduler command execution fails
        """
        if self.scheduler_type == SchedulerType.SLURM:
            return await self._slurm_jobs()
        else:
            return await self._lsf_jobs()

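    # Illustrative note (values are hypothetical, not real scheduler output): a
    # single record returned by get_jobs() on SLURM would look like
    #   {"job_id": "12345", "partition": "compute", "user": "alice",
    #    "state": "RUNNING", "nodes": 2, "cpus": 64, "memory_gb": 128.0}
    # while get_partitions() returns one dictionary per partition carrying the
    # total/used counters documented above.
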
    async def _slurm_partitions(self) -> List[Dict[str, Any]]:
        """Parse SLURM partition information from sinfo command."""
        cmd = ["sinfo", "-h", "-o", "%R %F %C %m"]  # PARTITION NODES(A/I/O/T) CPUS(A/I/O/T) MEMORY
        stdout = await self._run(cmd)
        out = []
        for line in stdout.splitlines():
            if not line.strip():
                continue
            part, node_str, cpu_str, mem_mb = line.split()
            na, ni, no, nt = map(int, node_str.split("/"))
            ca, ci, co, ct = map(int, cpu_str.split("/"))
            mem_total = float(mem_mb.rstrip("+MGTP")) / 1024  # GB
            out.append(
                {
                    "name": part,
                    "nodes_total": nt,
                    "nodes_used": na + no,
                    "cores_total": ct,
                    "cores_used": ca + co,
                    "memory_total_gb": mem_total,
                    "memory_used_gb": mem_total * (na + no) / max(nt, 1),
                }
            )
        return out

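    # Illustrative worked example (sample line assumed, not real output): given
    #   "compute 2/6/0/8 64/192/0/256 257000+"
    # the parsing above yields nodes_total=8, nodes_used=2, cores_total=256,
    # cores_used=64, memory_total_gb ≈ 257000/1024 ≈ 250.98 and therefore
    # memory_used_gb ≈ 250.98 * 2 / 8 ≈ 62.74.
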
    async def _slurm_jobs(self) -> List[Dict[str, Any]]:
        """Parse SLURM job information from squeue command."""
        cmd = ["squeue", "-h", "-o", "%i %u %P %T %D %C %m"]
        stdout = await self._run(cmd)
        jobs = []
        for line in stdout.splitlines():
            if not line.strip():
                continue
            job_id, user, partition, state, nodes, cpus, mem_str = line.split()
            # unify memory to GiB
            mem_str = mem_str.strip()
            if mem_str.endswith(("M", "G", "T")):
                unit = mem_str[-1]
                val = float(mem_str[:-1])
                if unit == "M":
                    memory_gb = val / 1024
                elif unit == "G":
                    memory_gb = val
                else:  # T
                    memory_gb = val * 1024
            else:  # plain number → assume MiB
                memory_gb = float(mem_str) / 1024
            jobs.append(
                {
                    "job_id": job_id,
                    "partition": partition,
                    "user": user,
                    "state": state,
                    "nodes": int(nodes),
                    "cpus": int(cpus),
                    "memory_gb": memory_gb,
                }
            )
        return jobs

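    # Illustrative worked example of the memory normalization above (inputs are
    # assumed): "4000M" -> 4000/1024 ≈ 3.91 GiB, "16G" -> 16.0 GiB,
    # "2T" -> 2048.0 GiB, and a bare "8000" is treated as MiB -> ≈ 7.81 GiB.
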
    async def _lsf_partitions(self) -> List[Dict[str, Any]]:
        """Parse LSF queue information from bqueues and bhosts commands."""
        # 1. queues → max slots
        qraw = await self._run(["bqueues", "-o", "queue_name:20 max:10 num_proc:10", "-noheader"])
        qinfo = {}
        for ln in qraw.splitlines():
            if not ln.strip():
                continue
            name, max_s, num_p = ln.split()
            qinfo[name] = {
                "nodes_total": int(num_p),
                "nodes_used": 0,
                "cores_total": int(num_p),
                "cores_used": 0,
                "mem_total_gb": 0.0,
                "mem_used_gb": 0.0,
            }

        # 2. hosts → real cores + real memory
        hraw = await self._run(["bhosts", "-o", "host_name:20 ncpus:10 max_mem:15", "-noheader"])
        for ln in hraw.splitlines():
            if not ln.strip():
                continue
            host, ncpus, max_mem_kb = ln.split()
            max_mem_gb = int(max_mem_kb) / 1024**2
            for q in qinfo.values():
                q["mem_total_gb"] += max_mem_gb
        # LSF does not give per-host used mem; keep 0 for now
        return [{"name": q, **qinfo[q]} for q in qinfo]

    async def _lsf_jobs(self) -> List[Dict[str, Any]]:
        """Parse LSF job information from bjobs command."""
        cmd = ["bjobs", "-u", "all", "-o", "jobid:10 user:15 queue:15 stat:10 slots:10 mem:10", "-noheader"]
        stdout = await self._run(cmd)
        jobs = []
        for line in stdout.splitlines():
            if not line.strip():
                continue
            job_id, user, queue, state, slots, mem = line.split()
            jobs.append(
                {
                    "job_id": job_id,
                    "partition": queue,
                    "user": user,
                    "state": state,
                    "nodes": 1,  # LSF does not expose node count directly
                    "cpus": int(slots),
                    "memory_gb": int(mem) / 1024 if mem.isdigit() else float(mem),
                }
            )
        return jobs

    async def _run(self, cmd: List[str]) -> str:
        """
        Run scheduler command and return output.

        Parameters
        ----------
        cmd : List[str]
            Command and arguments to execute

        Returns
        -------
        str
            Command stdout as string

        Raises
        ------
        RuntimeError
            If command returns non-zero exit code
        """
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(f"{' '.join(cmd)} failed: {stderr.decode()}")
        return stdout.decode().strip()

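# Illustrative usage sketch: a quick cluster overview built on the class above.
# ``_example_cluster_overview`` is a placeholder name and assumes the SLURM
# client commands are available on the host.
async def _example_cluster_overview() -> None:
    scheduler = SchedulerManager(SchedulerType.SLURM)
    for part in await scheduler.get_partitions():
        print(f"{part['name']}: {part['cores_used']}/{part['cores_total']} cores in use")
    jobs = await scheduler.get_jobs()
    print(f"{len(jobs)} jobs currently known to the scheduler")
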
# Usage example
async def main():
    """
    Example usage of the JobManager class.

    This demonstrates how to create a job manager with custom resource
    limits and submit jobs with different priorities.
    """
    # Create job manager with custom resource limits
    resource_limits = ResourceLimits(
        max_concurrent_jobs=3,  # Allow 3 simultaneous jobs
        max_cpu_percent=75.0,  # Don't start jobs if CPU > 75%
        min_memory_gb=4.0,  # Require 4GB free memory
        min_disk_gb=20.0,  # Require 20GB free disk space
    )

    manager = JobManager(resource_limits)


if __name__ == "__main__":
    asyncio.run(main())