pyedb 0.60.0__py3-none-any.whl → 0.61.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyedb might be problematic. Click here for more details.

Files changed (34) hide show
  1. pyedb/__init__.py +1 -1
  2. pyedb/configuration/cfg_components.py +35 -7
  3. pyedb/dotnet/database/cell/hierarchy/component.py +8 -6
  4. pyedb/dotnet/database/cell/hierarchy/model.py +1 -28
  5. pyedb/dotnet/database/cell/hierarchy/s_parameter_model.py +10 -14
  6. pyedb/dotnet/database/cell/hierarchy/spice_model.py +13 -7
  7. pyedb/dotnet/database/components.py +5 -1
  8. pyedb/dotnet/database/edb_data/padstacks_data.py +5 -3
  9. pyedb/dotnet/database/modeler.py +2 -1
  10. pyedb/dotnet/database/padstack.py +187 -1
  11. pyedb/dotnet/edb.py +70 -1
  12. pyedb/generic/general_methods.py +21 -0
  13. pyedb/grpc/database/definition/materials.py +1 -1
  14. pyedb/grpc/database/definition/padstack_def.py +16 -9
  15. pyedb/grpc/database/padstacks.py +201 -6
  16. pyedb/grpc/database/primitive/padstack_instance.py +90 -0
  17. pyedb/grpc/edb.py +70 -1
  18. pyedb/grpc/rpc_session.py +16 -3
  19. pyedb/workflows/__init__.py +21 -0
  20. pyedb/workflows/job_manager/__init__.py +21 -0
  21. pyedb/workflows/job_manager/backend/__init__.py +21 -0
  22. pyedb/workflows/job_manager/backend/job_manager_handler.py +910 -0
  23. pyedb/workflows/job_manager/backend/job_submission.py +1169 -0
  24. pyedb/workflows/job_manager/backend/service.py +1663 -0
  25. pyedb/workflows/job_manager/backend/start_service.py +86 -0
  26. pyedb/workflows/job_manager/backend/submit_job_on_scheduler.py +168 -0
  27. pyedb/workflows/job_manager/backend/submit_local_job.py +166 -0
  28. pyedb/workflows/utilities/__init__.py +21 -0
  29. pyedb/workflows/utilities/cutout.py +1 -1
  30. pyedb/workflows/utilities/hfss_log_parser.py +446 -0
  31. {pyedb-0.60.0.dist-info → pyedb-0.61.0.dist-info}/METADATA +7 -4
  32. {pyedb-0.60.0.dist-info → pyedb-0.61.0.dist-info}/RECORD +34 -24
  33. {pyedb-0.60.0.dist-info → pyedb-0.61.0.dist-info}/WHEEL +0 -0
  34. {pyedb-0.60.0.dist-info → pyedb-0.61.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1663 @@
1
+ # Copyright (C) 2023 - 2025 ANSYS, Inc. and/or its affiliates.
2
+ # SPDX-License-Identifier: MIT
3
+ #
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+
24
+ """
25
+ Async job manager with pool scheduling and REST/WebSocket API.
26
+
27
+ The module implements a **fully asynchronous** multi-tenant job manager that:
28
+
29
+ * enforces **host resource limits** (CPU, memory, disk, concurrency),
30
+ * maintains **priority queues** (negative → low, zero → normal, positive → high),
31
+ * exposes **REST** and **Socket.IO** endpoints for integration,
32
+ * supports **external schedulers** (SLURM, LSF, PBS, Windows-HPC) **and**
33
+ local subprocess execution,
34
+ * guarantees **exactly-once** execution and **graceful draining** on shutdown.
35
+
36
+ It is designed to be **embedded** inside a PyEDB process or **deployed** as a
37
+ stand-alone micro-service (Docker, systemd, Kubernetes).
38
+
39
+ Examples
40
+ --------
41
+ Stand-alone REST server:
42
+
43
+ .. code-block:: bash
44
+
45
+ python -m pyedb.workflows.job_manager.service
46
+
47
+ Embedded inside PyEDB:
48
+
49
+ .. code-block:: python
50
+
51
+ from pyedb.workflows.job_manager.service import JobManager, ResourceLimits
52
+
53
+ manager = JobManager(ResourceLimits(max_concurrent_jobs=4))
54
+ await manager.submit_job(config, priority=10)
55
+
56
+ The REST API is **self-documenting** at runtime:
57
+
58
+ .. code-block:: bash
59
+
60
+ curl http://localhost:8080/resources
61
+ curl http://localhost:8080/queue
62
+ curl -X POST http://localhost:8080/jobs/submit -d @cfg.json
63
+ """
64
+
65
+ import asyncio
66
+ from collections import deque
67
+ from dataclasses import dataclass
68
+ from datetime import datetime
69
+ from enum import Enum
70
+ import getpass
71
+ import logging
72
+ import os.path
73
+ import platform
74
+ from typing import Any, Deque, Dict, List, Optional, Set
75
+
76
+ import aiohttp
77
+ from aiohttp import web
78
+ import psutil
79
+ import socketio
80
+
81
+ from pyedb.workflows.job_manager.backend.job_submission import (
82
+ HFSSSimulationConfig,
83
+ SchedulerType,
84
+ )
85
+
86
# Configure logging
# NOTE(review): the root logger is configured to INFO here at import time, but
# JobManager.__init__ calls basicConfig again with DEBUG — basicConfig is a
# no-op once a handler is installed, so confirm which level is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("JobManager")
89
+
90
+
91
class JobStatus(Enum):
    """Life-cycle states of a managed job, exposed verbatim over REST.

    Non-terminal states: ``PENDING``, ``QUEUED``, ``SCHEDULED``, ``RUNNING``.
    Terminal states: ``COMPLETED``, ``FAILED``, ``CANCELLED``.

    * ``PENDING``   -- initial state before queuing.
    * ``QUEUED``    -- awaiting resources in the local queue.
    * ``SCHEDULED`` -- submitted to an external scheduler.
    * ``RUNNING``   -- currently executing.
    * ``COMPLETED`` -- normal termination.
    * ``FAILED``    -- non-zero exit code or exception.
    * ``CANCELLED`` -- user-initiated abort.
    """

    # Declaration order is preserved by Enum iteration; keep it stable.
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    SCHEDULED = "scheduled"
    QUEUED = "queued"  # used specifically by the local queue
120
+
121
+
122
@dataclass
class ResourceLimits:
    """Host-level constraints the manager enforces before launching a job.

    Every field is compared against live telemetry before a job is started;
    while any single limit would be violated, the job stays queued.

    Parameters
    ----------
    max_concurrent_jobs : int, optional
        Number of simultaneous local jobs allowed. Default is ``1``.
    max_cpu_percent : float, optional
        Do not start new work above this CPU utilisation (0-100).
        Default is ``80.0``.
    min_memory_gb : float, optional
        Free RAM (GB) that must be available to start. Default is ``2.0``.
    min_disk_gb : float, optional
        Free disk space (GB) that must be available to start.
        Default is ``10.0``.
    """

    # Field order matters: it defines the dataclass positional signature.
    max_concurrent_jobs: int = 1
    max_cpu_percent: float = 80.0
    min_memory_gb: float = 2.0
    min_disk_gb: float = 10.0
146
+
147
+
148
@dataclass
class JobInfo:
    """
    **Mutable** state container for a single simulation.

    Attributes
    ----------
    config : HFSSSimulationConfig
        Immutable configuration.
    status : JobStatus
        Current life-cycle state.
    start_time : datetime or None
        When the job entered ``RUNNING``.
    end_time : datetime or None
        When the job reached a terminal state.
    return_code : int or None
        Exit code of the solver or scheduler.
    output : str
        Stdout captured (local runs only).
    error : str
        Stderr captured (local runs only).
    process : asyncio.subprocess.Process or None
        Handle for local cancellation.
    scheduler_job_id : str or None
        External identifier (SLURM, LSF, …).
    local_resources : dict or None
        Snapshot of host telemetry at start time.
    priority : int
        Higher numbers are de-queued first.
    """

    config: HFSSSimulationConfig
    status: JobStatus
    start_time: Optional[datetime] = None  # set when the job starts RUNNING
    end_time: Optional[datetime] = None  # set on any terminal state
    return_code: Optional[int] = None  # None while still running
    output: str = ""  # stdout (local subprocess runs only)
    error: str = ""  # stderr (local subprocess runs only)
    process: Optional[Any] = None  # local subprocess handle used for cancellation
    scheduler_job_id: Optional[str] = None  # SLURM/LSF/... id once scheduled
    local_resources: Optional[Dict[str, Any]] = None  # telemetry snapshot at start
    priority: int = 0  # Higher number = higher priority
190
+
191
+
192
class ResourceMonitor:
    """
    **Async** background task that samples host telemetry every *N* seconds.

    The sampled values are cached in :attr:`current_usage` and read by
    :meth:`JobPoolManager.can_start_job` to throttle submissions.

    Parameters
    ----------
    update_interval : int, optional
        Sampling period in seconds. Default is ``5``.

    Attributes
    ----------
    current_usage : dict
        Cached resource usage information with keys:
        - cpu_percent: Current CPU usage percentage
        - memory_percent: Current memory usage percentage
        - memory_used_gb: Memory used in GB
        - memory_total_gb: Total memory in GB
        - memory_free_gb: Free memory in GB
        - disk_usage_percent: Disk usage percentage
        - disk_free_gb: Free disk space in GB
        - timestamp: Last update timestamp (ISO format)
    """

    def __init__(self, update_interval: int = 5):
        self.update_interval = update_interval
        # Pre-populated with zeros so readers never see missing keys before
        # the first sample completes.
        self.current_usage = {
            "cpu_percent": 0,
            "memory_percent": 0,
            "memory_used_gb": 0,
            "memory_total_gb": 0,
            "memory_free_gb": 0,
            "disk_usage_percent": 0,
            "disk_free_gb": 0,
            "timestamp": datetime.now().isoformat(),
        }

    async def monitor_resources(self):
        """
        **Infinite** coroutine that refreshes :attr:`current_usage`.

        Samples immediately on startup, then once per ``update_interval``
        until the owning task is cancelled. (The previous version sampled
        twice back-to-back on the first iteration.)
        """
        while True:
            await self._sample_once()
            await asyncio.sleep(self.update_interval)

    async def _sample_once(self):
        """Take one telemetry sample without blocking the event loop."""
        try:
            # psutil.cpu_percent(interval=1) sleeps a full second; running it
            # directly would stall the event loop (and the aiohttp server).
            # Delegate the blocking call to the default executor instead.
            loop = asyncio.get_running_loop()
            cpu_percent = await loop.run_in_executor(None, psutil.cpu_percent, 1)

            # Memory usage
            memory = psutil.virtual_memory()
            memory_total_gb = memory.total / (1024**3)
            memory_used_gb = memory.used / (1024**3)
            memory_free_gb = memory.available / (1024**3)

            # Disk usage (checking the root directory)
            disk = psutil.disk_usage(os.path.abspath(os.sep))
            disk_usage_percent = disk.percent
            disk_free_gb = disk.free / (1024**3)

            self.current_usage.update(
                {
                    "cpu_percent": cpu_percent,
                    "memory_percent": memory.percent,
                    "memory_used_gb": round(memory_used_gb, 2),
                    "memory_total_gb": round(memory_total_gb, 2),
                    "memory_free_gb": round(memory_free_gb, 2),
                    "disk_usage_percent": disk_usage_percent,
                    "disk_free_gb": round(disk_free_gb, 2),
                    "timestamp": datetime.now().isoformat(),
                }
            )

        except Exception as e:
            logger.error(f"Resource monitoring error: {e}")
274
+
275
+
276
class JobPoolManager:
    """
    **Priority-aware** FIFO queues plus running-set tracker.

    Uses only ``deque``/``dict``/``set`` operations and contains no awaits,
    so it is **async-safe** and can be invoked from any thread.

    Parameters
    ----------
    resource_limits : ResourceLimits
        Constraints used by :meth:`can_start_job`.

    Attributes
    ----------
    job_queue : Deque[str]
        FIFO queue for normal priority jobs.
    priority_queue : Dict[int, List[str]]
        Per-priority FIFO buckets (key=priority, value=job_ids), used for
        priorities > 0 only; empty buckets are pruned.
    running_jobs : Set[str]
        Set of currently running job IDs.
    job_priorities : Dict[str, int]
        Mapping of job_id to priority.
    """

    def __init__(self, resource_limits: "ResourceLimits"):
        self.resource_limits = resource_limits
        self.job_queue: Deque[str] = deque()  # FIFO queue for priority <= 0
        self.priority_queue: Dict[int, List[str]] = {}  # buckets for priority > 0
        self.running_jobs: Set[str] = set()
        self.job_priorities: Dict[str, int] = {}

    def add_job(self, job_id: str, priority: int = 0):
        """
        Insert job into the **appropriate** queue (priority or FIFO).

        Parameters
        ----------
        job_id : str
            Unique identifier.
        priority : int, optional
            Negative (low), zero (normal), positive (high). Default is ``0``.
            NOTE(review): jobs with negative priority are currently queued
            together with priority 0 in the plain FIFO queue, not after it —
            confirm this matches the intended three-tier semantics.
        """
        self.job_priorities[job_id] = priority

        if priority > 0:
            # Every entry in a bucket shares the same priority, so a plain
            # append preserves FIFO order within the bucket. (The previous
            # version re-sorted every bucket on each insert; with equal keys
            # and Python's stable sort that was a guaranteed no-op.)
            self.priority_queue.setdefault(priority, []).append(job_id)
        else:
            self.job_queue.append(job_id)

    def get_next_job(self) -> Optional[str]:
        """
        Return the **next** job to be started (highest priority first).

        Returns
        -------
        str or None
            Job identifier or ``None`` if all queues are empty.

        Notes
        -----
        Priority buckets are drained first (highest to lowest), then the
        normal FIFO queue. Exhausted buckets are removed so later scans
        stay cheap.
        """
        for priority in sorted(self.priority_queue.keys(), reverse=True):
            bucket = self.priority_queue[priority]
            if bucket:
                job_id = bucket.pop(0)
                if not bucket:
                    del self.priority_queue[priority]
                return job_id

        if self.job_queue:
            return self.job_queue.popleft()

        return None

    def remove_job(self, job_id: str):
        """
        **Idempotently** remove a job from **all** queues.

        Parameters
        ----------
        job_id : str
            Identifier to purge.
        """
        if job_id in self.job_queue:
            self.job_queue.remove(job_id)

        # Remove from priority buckets; prune any bucket left empty so they
        # do not accumulate in the scan path of get_next_job().
        for priority in list(self.priority_queue):
            bucket = self.priority_queue[priority]
            if job_id in bucket:
                bucket.remove(job_id)
            if not bucket:
                del self.priority_queue[priority]

        self.job_priorities.pop(job_id, None)

    def can_start_job(self, resource_monitor: "ResourceMonitor") -> bool:
        """
        **Boolean** predicate that decides whether a new job may be started.

        Parameters
        ----------
        resource_monitor : ResourceMonitor
            Source of current host telemetry.

        Returns
        -------
        bool
            ``True`` → job may be started, ``False`` → remain queued.
        """
        resources = resource_monitor.current_usage

        # Concurrency cap.
        if len(self.running_jobs) >= self.resource_limits.max_concurrent_jobs:
            return False

        # CPU headroom.
        if resources["cpu_percent"] > self.resource_limits.max_cpu_percent:
            logger.info(f"CPU usage too high ({resources['cpu_percent']}%), delaying job start")
            return False

        # Free-memory floor.
        if resources["memory_free_gb"] < self.resource_limits.min_memory_gb:
            logger.info(f"Insufficient memory ({resources['memory_free_gb']}GB free), delaying job start")
            return False

        # Free-disk floor.
        if resources["disk_free_gb"] < self.resource_limits.min_disk_gb:
            logger.info(f"Insufficient disk space ({resources['disk_free_gb']}GB free), delaying job start")
            return False
        return True

    def get_queue_stats(self) -> Dict[str, Any]:
        """
        Real-time snapshot for REST ``/queue`` endpoint.

        Returns
        -------
        dict
            Queue statistics with keys:
            - total_queued: Total jobs in all queues
            - regular_queue_size: Jobs in normal FIFO queue
            - priority_queues: Dict of priority -> count (non-empty buckets)
            - running_jobs: Number of currently running jobs
            - max_concurrent: Maximum concurrent jobs allowed
        """
        total_queued = len(self.job_queue) + sum(len(jobs) for jobs in self.priority_queue.values())

        return {
            "total_queued": total_queued,
            "regular_queue_size": len(self.job_queue),
            "priority_queues": {prio: len(jobs) for prio, jobs in self.priority_queue.items()},
            "running_jobs": len(self.running_jobs),
            "max_concurrent": self.resource_limits.max_concurrent_jobs,
        }
437
+
438
+
439
+ class JobManager:
440
+ """
441
+ **Async** job manager combining resource monitoring and job scheduling.
442
+
443
+ This class provides the core functionality for:
444
+
445
+ * Resource monitoring via :class:`ResourceMonitor`
446
+ * Job scheduling via :class:`JobPoolManager`
447
+ * REST/Socket.IO API via aiohttp web server
448
+ * Background task for continuous job processing
449
+
450
+ Parameters
451
+ ----------
452
+ resource_limits : ResourceLimits, optional
453
+ Host constraints. Creates default instance if None.
454
+ scheduler_type : SchedulerType, optional
455
+ Type of job scheduler to use. Default is ``SchedulerType.NONE``.
456
+
457
+ Attributes
458
+ ----------
459
+ jobs : Dict[str, JobInfo]
460
+ Dictionary of all managed jobs
461
+ resource_limits : ResourceLimits
462
+ Current resource constraints
463
+ job_pool : JobPoolManager
464
+ Priority-aware job queue manager
465
+ resource_monitor : ResourceMonitor
466
+ Host resource usage monitor
467
+ ansys_path : str or None
468
+ Path to ANSYS EDT executable
469
+ sio : socketio.AsyncServer
470
+ Socket.IO server for real-time updates
471
+ app : web.Application
472
+ aiohttp web application
473
+ """
474
+
475
    def __init__(self, resource_limits: ResourceLimits = None, scheduler_type: SchedulerType = SchedulerType.NONE):
        """Build the manager, wire the web app, and start background tasks.

        Parameters
        ----------
        resource_limits : ResourceLimits, optional
            Host constraints; a default instance is created when ``None``.
        scheduler_type : SchedulerType, optional
            External scheduler to target. Default is ``SchedulerType.NONE``.
        """
        # NOTE(review): the module already called basicConfig(INFO) at import
        # time; basicConfig is a no-op once a handler exists, so this DEBUG
        # reconfiguration may not take effect — confirm intent.
        logging.basicConfig(level=logging.DEBUG, format="%(levelname)s:%(name)s:%(message)s")
        self.jobs: Dict[str, JobInfo] = {}
        if resource_limits is None:
            resource_limits = ResourceLimits()
        self.resource_limits = resource_limits
        self.job_pool = JobPoolManager(self.resource_limits)
        self.resource_monitor = ResourceMonitor()
        self.ansys_path = None  # Will be set by JobManagerHandler

        # Initialize scheduler manager
        self.scheduler_type = scheduler_type
        if scheduler_type in {SchedulerType.SLURM, SchedulerType.LSF}:
            # NOTE(review): SchedulerManager is not among this module's
            # visible imports — for SLURM/LSF this line raises NameError
            # unless the name is provided elsewhere. Verify the import.
            self._sch_mgr = SchedulerManager(scheduler_type)
        else:
            self._sch_mgr = None

        # Correct SocketIO initialization (async_mode must match aiohttp)
        self.sio = socketio.AsyncServer(async_mode="aiohttp", cors_allowed_origins="*")
        self.app = web.Application()
        self.sio.attach(self.app)

        # Setup routes
        self.setup_routes()

        # Background task for continuous job processing
        self._processing_task: Optional[asyncio.Task] = None
        self._shutdown = False
        # Start resource monitoring immediately (no-op without a running loop)
        self._monitor_task = None
        self._ensure_monitor_running()
        # Background task for scheduler monitoring
        self._scheduler_monitor_task: Optional[asyncio.Task] = None
        self._ensure_scheduler_monitor_running()
509
+
510
+ def _ensure_monitor_running(self):
511
+ """Ensure resource monitoring task is running."""
512
+ try:
513
+ loop = asyncio.get_running_loop()
514
+ if self._monitor_task is None or self._monitor_task.done():
515
+ self._monitor_task = loop.create_task(self.resource_monitor.monitor_resources())
516
+ except RuntimeError:
517
+ # No event loop running yet, will be started when JobManagerHandler starts
518
+ pass
519
+
520
+ def _ensure_scheduler_monitor_running(self):
521
+ """Ensure scheduler monitoring task is running for Slurm/LSF jobs."""
522
+ if self._sch_mgr is None:
523
+ # No scheduler configured, skip monitoring
524
+ return
525
+ try:
526
+ loop = asyncio.get_running_loop()
527
+ if self._scheduler_monitor_task is None or self._scheduler_monitor_task.done():
528
+ self._scheduler_monitor_task = loop.create_task(self._monitor_scheduler_jobs())
529
+ logger.info(f"Started scheduler monitoring for {self.scheduler_type.value}")
530
+ except RuntimeError:
531
+ # No event loop running yet, will be started when JobManagerHandler starts
532
+ pass
533
+
534
    async def _monitor_scheduler_jobs(self):
        """
        Continuously monitor jobs submitted to external schedulers (Slurm/LSF).

        This background task polls the scheduler queue every 5 seconds (10
        seconds while nothing is scheduled) and updates job statuses based on
        the actual scheduler state.
        """
        logger.info(f"✅ Scheduler monitoring loop started for {self.scheduler_type.value}")

        while not self._shutdown:
            try:
                # Find all jobs that are currently scheduled (submitted to external scheduler)
                scheduled_jobs = {
                    job_id: job_info
                    for job_id, job_info in self.jobs.items()
                    if job_info.status == JobStatus.SCHEDULED and job_info.scheduler_job_id
                }

                if not scheduled_jobs:
                    # No jobs to monitor, sleep longer
                    await asyncio.sleep(10)
                    continue

                # Get current scheduler job list
                scheduler_jobs = await self._sch_mgr.get_jobs()
                scheduler_job_ids = {job["job_id"] for job in scheduler_jobs}
                scheduler_job_states = {job["job_id"]: job["state"] for job in scheduler_jobs}

                logger.info(
                    f"Monitoring {len(scheduled_jobs)} scheduled jobs. Scheduler has {len(scheduler_jobs)} jobs."
                )

                for job_id, job_info in scheduled_jobs.items():
                    scheduler_job_id = job_info.scheduler_job_id

                    if scheduler_job_id in scheduler_job_ids:
                        # Job still exists in scheduler queue
                        state = scheduler_job_states.get(scheduler_job_id, "UNKNOWN")

                        # Map scheduler states to our JobStatus
                        # ("R"/"PD" are Slurm codes, "PEND" is LSF — mixed on purpose)
                        if state in ["RUNNING", "R"]:
                            if job_info.status != JobStatus.RUNNING:
                                job_info.status = JobStatus.RUNNING
                                if not job_info.start_time:
                                    job_info.start_time = datetime.now()
                                await self.sio.emit(
                                    "job_started",
                                    {
                                        "job_id": job_id,
                                        "scheduler_job_id": scheduler_job_id,
                                        "start_time": job_info.start_time.isoformat(),
                                    },
                                )
                                logger.info(f"Job {job_id} (scheduler ID: {scheduler_job_id}) is now RUNNING")

                        elif state in ["PENDING", "PD", "PEND"]:
                            # Job is still pending/queued in scheduler
                            logger.debug(f"Job {job_id} (scheduler ID: {scheduler_job_id}) is PENDING in scheduler")

                        elif state in ["COMPLETING", "CG"]:
                            # Job is completing, keep current status
                            logger.debug(f"Job {job_id} (scheduler ID: {scheduler_job_id}) is COMPLETING")

                    else:
                        # Job no longer in scheduler queue - it has completed or failed
                        # Check if we can find output files to determine success/failure
                        job_info.end_time = datetime.now()
                        self.job_pool.running_jobs.discard(job_id)

                        # Try to determine if job completed successfully by checking output directory
                        output_dir = job_info.config.working_directory
                        log_file = os.path.join(output_dir, f"{job_info.config.jobid}.log")

                        # Default to completed - scheduler jobs that finish typically completed
                        # unless we can detect otherwise
                        if os.path.exists(log_file):
                            try:
                                # Check if log indicates success or failure.
                                # NOTE(review): a plain substring scan for
                                # "error"/"failed" can misclassify logs that
                                # merely mention those words — confirm heuristic.
                                with open(log_file, "r") as f:
                                    log_content = f.read()
                                    if "error" in log_content.lower() or "failed" in log_content.lower():
                                        job_info.status = JobStatus.FAILED
                                        job_info.error = "Job failed based on log file content"
                                    else:
                                        job_info.status = JobStatus.COMPLETED
                                        job_info.return_code = 0
                            except Exception as e:
                                logger.warning(f"Could not read log file for job {job_id}: {e}")
                                job_info.status = JobStatus.COMPLETED
                                job_info.return_code = 0
                        else:
                            # No log file found, assume completed
                            job_info.status = JobStatus.COMPLETED
                            job_info.return_code = 0

                        await self.sio.emit(
                            "job_completed",
                            {
                                "job_id": job_id,
                                "scheduler_job_id": scheduler_job_id,
                                "status": job_info.status.value,
                                "end_time": job_info.end_time.isoformat(),
                                "return_code": job_info.return_code,
                            },
                        )

                        logger.info(
                            f"Job {job_id} (scheduler ID: {scheduler_job_id}) completed with status "
                            f"{job_info.status.value}"
                        )

            except Exception as e:
                # Keep the loop alive on transient scheduler/query failures.
                logger.error(f"Error in scheduler monitoring loop: {e}")

            # Poll every 5 seconds for responsive status updates
            await asyncio.sleep(5)

        logger.info("Scheduler monitoring loop stopped")
652
+
653
+ def setup_routes(self):
654
+ """
655
+ Internal method that wires aiohttp routes to class methods.
656
+
657
+ Called once from __init__. Sets up all REST API endpoints.
658
+ """
659
+ self.app.router.add_get("/", self.handle_index)
660
+ self.app.router.add_get("/jobs", self.handle_get_jobs)
661
+ self.app.router.add_get("/resources", self.handle_get_resources)
662
+ self.app.router.add_get("/queue", self.handle_get_queue)
663
+ self.app.router.add_post("/jobs/submit", self.handle_submit_job)
664
+ self.app.router.add_post("/jobs/{job_id}/cancel", self.handle_cancel_job)
665
+ self.app.router.add_post("/jobs/{job_id}/priority", self.handle_set_priority)
666
+ self.app.router.add_put("/pool/limits", self.handle_edit_concurrent_limits)
667
+ self.app.router.add_post("/system/start_monitoring", self.handle_start_monitoring)
668
+ self.app.router.add_get("/scheduler/partitions", self.handle_get_partitions)
669
+ self.app.router.add_get("/system/status", self.handle_get_system_status)
670
+ if os.path.exists("static"):
671
+ self.app.router.add_static("/static", "static")
672
+ else:
673
+ os.makedirs("static", exist_ok=True)
674
+ self.app.router.add_static("/static", "static")
675
+
676
+ async def handle_get_system_status(self, request):
677
+ """
678
+ Get system and scheduler status.
679
+
680
+ Parameters
681
+ ----------
682
+ request : aiohttp.web.Request
683
+ HTTP request object
684
+
685
+ Returns
686
+ -------
687
+ aiohttp.web.Response
688
+ JSON response with system status information
689
+ """
690
+ # Ensure resource monitoring is active
691
+ self._ensure_monitor_running()
692
+
693
+ running_jobs = sum(1 for job in self.jobs.values() if job.status == JobStatus.RUNNING)
694
+ queued_jobs = sum(1 for job in self.jobs.values() if job.status == JobStatus.QUEUED)
695
+ status = {
696
+ "scheduler_detection": {
697
+ "active_scheduler": self.scheduler_type.name,
698
+ "detected_by": "JobManager",
699
+ "backend_available": True,
700
+ },
701
+ "resource_monitoring": {
702
+ "active": self._monitor_task is not None and not self._monitor_task.done(),
703
+ "last_update": self.resource_monitor.current_usage.get("timestamp", "Never"),
704
+ **self.resource_monitor.current_usage,
705
+ },
706
+ "mode": self.scheduler_type.value,
707
+ "local_pool": {
708
+ "running_jobs": running_jobs,
709
+ "queued_jobs": queued_jobs,
710
+ "max_concurrent": self.resource_limits.max_concurrent_jobs,
711
+ },
712
+ }
713
+ return web.json_response(status)
714
+
715
+ async def handle_get_partitions(self, request):
716
+ """
717
+ Get scheduler partitions/queues.
718
+
719
+ Parameters
720
+ ----------
721
+ request : aiohttp.web.Request
722
+ HTTP request object
723
+
724
+ Returns
725
+ -------
726
+ aiohttp.web.Response
727
+ JSON response with partition information or error
728
+ """
729
+ if not self._sch_mgr:
730
+ return web.json_response({"error": "Scheduler not supported"}, status=400)
731
+ try:
732
+ partitions = await self._sch_mgr.get_partitions()
733
+ return web.json_response(partitions)
734
+ except Exception as e:
735
+ return web.json_response({"error": str(e)}, status=500)
736
+
737
+ async def handle_start_monitoring(self, request):
738
+ """
739
+ Manually start resource monitoring.
740
+
741
+ Parameters
742
+ ----------
743
+ request : aiohttp.web.Request
744
+ HTTP request object
745
+
746
+ Returns
747
+ -------
748
+ aiohttp.web.Response
749
+ JSON response indicating success or failure
750
+ """
751
+ try:
752
+ if self._monitor_task is None or self._monitor_task.done():
753
+ self._monitor_task = asyncio.create_task(self.resource_monitor.monitor_resources())
754
+ return web.json_response({"success": True, "message": "Resource monitoring started"})
755
+ else:
756
+ return web.json_response({"success": True, "message": "Resource monitoring already active"})
757
+ except Exception as e:
758
+ return web.json_response({"success": False, "error": str(e)}, status=500)
759
+
760
+ async def handle_index(self, request):
761
+ """
762
+ Serve the main web interface.
763
+
764
+ Parameters
765
+ ----------
766
+ request : aiohttp.web.Request
767
+ HTTP request object
768
+
769
+ Returns
770
+ -------
771
+ aiohttp.web.FileResponse
772
+ Static HTML file
773
+ """
774
+ return web.FileResponse("static/index.html")
775
+
776
+ async def handle_submit_job(self, request):
777
+ """
778
+ Submit a new job for execution.
779
+
780
+ Parameters
781
+ ----------
782
+ request : aiohttp.web.Request
783
+ HTTP POST request with JSON payload
784
+
785
+ Returns
786
+ -------
787
+ aiohttp.web.Response
788
+ JSON response with job ID or error
789
+
790
+ Notes
791
+ -----
792
+ Expected JSON payload:
793
+
794
+ .. code-block:: json
795
+
796
+ {
797
+ "config": {
798
+ "jobid": "job_123",
799
+ "project_path": "/path/to/project.aedt",
800
+ ... other HFSS config fields
801
+ },
802
+ "priority": 0
803
+ }
804
+ """
805
+ try:
806
+ logger.info("🔍 ROUTE HIT: /jobs/submit")
807
+ data = await request.json()
808
+ config_dict = data.get("config", {})
809
+
810
+ # Create HFSS config from dictionary
811
+ config = HFSSSimulationConfig.from_dict(config_dict)
812
+ if "user" not in data["config"] or data["config"]["user"] is None:
813
+ data["config"]["user"] = getpass.getuser()
814
+
815
+ # Overwrite scheduler type and user with authoritative values
816
+ if config.scheduler_type != self.scheduler_type:
817
+ print("Overriding scheduler type from client:", config.scheduler_type, "→", self.scheduler_type)
818
+ config.scheduler_type = self.scheduler_type
819
+
820
+ # Submit the job
821
+ job_id = await self.submit_job(config)
822
+
823
+ return web.json_response(
824
+ {"success": True, "job_id": job_id, "message": f"Job {job_id} submitted successfully"}
825
+ )
826
+
827
+ except Exception as e:
828
+ return web.json_response({"success": False, "error": str(e)}, status=400)
829
+
830
+ async def handle_cancel_job(self, request):
831
+ """
832
+ Cancel a running or queued job.
833
+
834
+ Parameters
835
+ ----------
836
+ request : aiohttp.web.Request
837
+ HTTP request with job_id in URL path
838
+
839
+ Returns
840
+ -------
841
+ aiohttp.web.Response
842
+ JSON response indicating success or failure
843
+ """
844
+ job_id = request.match_info.get("job_id")
845
+
846
+ if job_id not in self.jobs:
847
+ return web.json_response({"success": False, "error": f"Job {job_id} not found"}, status=404)
848
+
849
+ success = await self.cancel_job(job_id)
850
+
851
+ return web.json_response(
852
+ {"success": success, "message": f"Job {job_id} cancellation {'initiated' if success else 'failed'}"}
853
+ )
854
+
855
+ async def handle_get_jobs(self, request):
856
+ """
857
+ Get list of all jobs.
858
+
859
+ Parameters
860
+ ----------
861
+ request : aiohttp.web.Request
862
+ HTTP request object
863
+
864
+ Returns
865
+ -------
866
+ aiohttp.web.Response
867
+ JSON array of job objects with status information
868
+ """
869
+ jobs_data = []
870
+ for job_id, job_info in self.jobs.items():
871
+ jobs_data.append(
872
+ {
873
+ "id": job_id,
874
+ "config": job_info.config.to_dict(),
875
+ "status": job_info.status.value,
876
+ "start_time": job_info.start_time.isoformat() if job_info.start_time else None,
877
+ "end_time": job_info.end_time.isoformat() if job_info.end_time else None,
878
+ "return_code": job_info.return_code,
879
+ "scheduler_job_id": job_info.scheduler_job_id,
880
+ }
881
+ )
882
+
883
+ return web.json_response(jobs_data)
884
+
885
+ async def handle_get_resources(self, request):
886
+ """
887
+ Get current resource usage.
888
+
889
+ Parameters
890
+ ----------
891
+ request : aiohttp.web.Request
892
+ HTTP request object
893
+
894
+ Returns
895
+ -------
896
+ aiohttp.web.Response
897
+ JSON with current host resource usage
898
+ """
899
+ return web.json_response(self.resource_monitor.current_usage)
900
+
901
+ async def handle_get_queue(self, request):
902
+ """
903
+ Get queue statistics.
904
+
905
+ Parameters
906
+ ----------
907
+ request : aiohttp.web.Request
908
+ HTTP request object
909
+
910
+ Returns
911
+ -------
912
+ aiohttp.web.Response
913
+ JSON with queue statistics for dashboard display
914
+ """
915
+ stats = self.job_pool.get_queue_stats()
916
+ logger.info(f"/queue endpoint returning max_concurrent = {stats['max_concurrent']}")
917
+ return web.json_response(stats)
918
+
919
+ async def handle_set_priority(self, request):
920
+ """
921
+ Change job priority and re-queue.
922
+
923
+ Parameters
924
+ ----------
925
+ request : aiohttp.web.Request
926
+ HTTP POST request with JSON payload
927
+
928
+ Returns
929
+ -------
930
+ aiohttp.web.Response
931
+ JSON response indicating success or failure
932
+ """
933
+ job_id = request.match_info.get("job_id")
934
+
935
+ if job_id not in self.jobs:
936
+ return web.json_response({"success": False, "error": "Job not found"}, status=404)
937
+
938
+ try:
939
+ data = await request.json()
940
+ priority = data.get("priority", 0)
941
+
942
+ # Update priority in the pool
943
+ self.job_pool.remove_job(job_id)
944
+ self.job_pool.add_job(job_id, priority)
945
+
946
+ return web.json_response({"success": True, "message": f"Priority set to {priority}"})
947
+
948
+ except Exception as e:
949
+ return web.json_response({"success": False, "error": str(e)}, status=400)
950
+
951
+ async def handle_edit_concurrent_limits(self, request):
952
+ """
953
+ Edit concurrent job limits.
954
+
955
+ Parameters
956
+ ----------
957
+ request : aiohttp.web.Request
958
+ HTTP PUT request with JSON payload
959
+
960
+ Returns
961
+ -------
962
+ aiohttp.web.Response
963
+ JSON response indicating success or failure
964
+ """
965
+ try:
966
+ data = await request.json()
967
+
968
+ if not data:
969
+ return web.json_response({"error": "No data provided"}, status=400)
970
+
971
+ # Update the concurrent job limits
972
+ updated_limits = await self.edit_concurrent_limits(data)
973
+
974
+ if updated_limits:
975
+ return web.json_response(
976
+ {"success": True, "message": "Concurrent job limits updated successfully", "limits": updated_limits}
977
+ )
978
+ else:
979
+ return web.json_response({"error": "Failed to update limits"}, status=400)
980
+
981
+ except Exception as e:
982
+ return web.json_response({"error": str(e)}, status=500)
983
+
984
+ async def wait_until_all_done(self) -> None:
985
+ """
986
+ **Coroutine** that blocks until **every** job reaches a terminal state.
987
+
988
+ Safe to call from REST handlers or CLI scripts. Polls job status
989
+ until all jobs are completed, failed, or cancelled.
990
+ """
991
+ while True:
992
+ # All jobs that are NOT in a terminal state
993
+ active = [
994
+ j
995
+ for j in self.jobs.values()
996
+ if j.status not in {JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED}
997
+ ]
998
+ if not active:
999
+ return
1000
+ await asyncio.sleep(1) # be nice to the event-loop
1001
+
1002
+ async def submit_job(self, config: HFSSSimulationConfig, priority: int = 0) -> str:
1003
+ """
1004
+ **Async** entry point for job submission.
1005
+
1006
+ Parameters
1007
+ ----------
1008
+ config : HFSSSimulationConfig
1009
+ Validated simulation configuration.
1010
+ priority : int, optional
1011
+ Job priority. Default is ``0``.
1012
+
1013
+ Returns
1014
+ -------
1015
+ str
1016
+ Unique job identifier (same as ``config.jobid``).
1017
+
1018
+ Notes
1019
+ -----
1020
+ This method:
1021
+ 1. Creates a JobInfo object with QUEUED status
1022
+ 2. Adds the job to the appropriate queue
1023
+ 3. Notifies web clients via Socket.IO
1024
+ 4. Starts the processing loop if not already running
1025
+ """
1026
+ job_id = config.jobid
1027
+
1028
+ # Create job info
1029
+ job_info = JobInfo(config=config, status=JobStatus.QUEUED, priority=priority)
1030
+ self.jobs[job_id] = job_info
1031
+
1032
+ # Add to job pool
1033
+ self.job_pool.add_job(job_id, priority)
1034
+
1035
+ # Notify web clients
1036
+ await self.sio.emit(
1037
+ "job_queued",
1038
+ {"job_id": job_id, "priority": priority, "queue_position": self.job_pool.get_queue_stats()["total_queued"]},
1039
+ )
1040
+
1041
+ logger.info(f"Job {job_id} queued with priority {priority}")
1042
+
1043
+ # Trigger processing if not already running
1044
+ if not self._processing_task or self._processing_task.done():
1045
+ self._processing_task = asyncio.create_task(self._process_jobs_continuously())
1046
+
1047
+ return job_id
1048
+
1049
+ async def _process_jobs_continuously(self):
1050
+ """
1051
+ Continuously process jobs until shutdown is requested.
1052
+
1053
+ This is the main job processing loop that:
1054
+ - Checks if new jobs can be started based on resource limits
1055
+ - Dequeues the highest priority job
1056
+ - Starts job execution in a separate task
1057
+ - Sleeps when no jobs can be started or queue is empty
1058
+ """
1059
+ logger.info("✅ Job processing loop started.")
1060
+ while not self._shutdown:
1061
+ can_start = self.job_pool.can_start_job(self.resource_monitor)
1062
+ if can_start:
1063
+ next_job_id = self.job_pool.get_next_job()
1064
+ if next_job_id:
1065
+ logger.info(f"Dequeued job {next_job_id}. Starting...")
1066
+ self.job_pool.running_jobs.add(next_job_id)
1067
+ asyncio.create_task(self._process_single_job(next_job_id))
1068
+ else:
1069
+ logger.info("Queue is empty, sleeping.")
1070
+ await asyncio.sleep(1)
1071
+ else:
1072
+ logger.warning("Cannot start new job, waiting...")
1073
+ await asyncio.sleep(5)
1074
+
1075
+ await asyncio.sleep(0.2)
1076
+
1077
    async def _process_single_job(self, job_id: str):
        """
        Process a single job from the pool.

        Parameters
        ----------
        job_id : str
            Job identifier to process

        Notes
        -----
        This method handles:
        - Local execution via subprocess
        - Scheduler submission (SLURM/LSF)
        - Status updates and notifications
        - Error handling and cleanup
        """
        # Guard: the job may have been cancelled/removed between dequeue and
        # this task starting; release the running-set reservation if so.
        job_info = self.jobs.get(job_id)
        if not job_info or job_info.status != JobStatus.QUEUED:
            self.job_pool.running_jobs.discard(job_id)
            return

        # Update job status
        job_info.status = JobStatus.RUNNING
        job_info.start_time = datetime.now()
        # Snapshot of host usage at launch time, for later inspection.
        job_info.local_resources = self.resource_monitor.current_usage.copy()

        # Notify web clients
        await self.sio.emit(
            "job_started",
            {"job_id": job_id, "start_time": job_info.start_time.isoformat(), "resources": job_info.local_resources},
        )

        logger.info(f"Job {job_id} started")

        try:
            # Run the simulation
            if job_info.config.scheduler_type != SchedulerType.NONE:
                # Make sure the executable path is present
                if not job_info.config.ansys_edt_path or not os.path.exists(job_info.config.ansys_edt_path):
                    if self.ansys_path and os.path.exists(self.ansys_path):
                        # Config is immutable-style: rebuild it with the
                        # manager's detected ANSYS path substituted in.
                        job_info.config = HFSSSimulationConfig(
                            **{**job_info.config.model_dump(), "ansys_edt_path": self.ansys_path}
                        )
                        logger.info(f"Using JobManager's detected ANSYS path: {self.ansys_path}")
                    else:
                        raise FileNotFoundError(
                            f"ANSYS executable not found. Config path: {job_info.config.ansys_edt_path}, "
                            f"Manager path: {self.ansys_path}"
                        )

                # Now generate the script – the path is guaranteed to be non-empty
                result = job_info.config.submit_to_scheduler()
                # NOTE(review): relies on a private helper of the config object
                # to parse the scheduler's stdout for the job id.
                job_info.scheduler_job_id = job_info.config._extract_job_id(result.stdout)
                job_info.status = JobStatus.SCHEDULED
                logger.info(
                    f"Job {job_id} submitted to scheduler with ID: {job_info.scheduler_job_id}, status: SCHEDULED"
                )
                await self.sio.emit("job_scheduled", {"job_id": job_id, "scheduler_job_id": job_info.scheduler_job_id})

            else:
                # ---------------- local mode – same guarantee -----------------
                if not job_info.config.ansys_edt_path or not os.path.exists(job_info.config.ansys_edt_path):
                    if self.ansys_path and os.path.exists(self.ansys_path):
                        job_info.config = HFSSSimulationConfig(
                            **{**job_info.config.model_dump(), "ansys_edt_path": self.ansys_path}
                        )
                        logger.info(f"Using JobManager's detected ANSYS path: {self.ansys_path}")
                    else:
                        raise FileNotFoundError(
                            f"ANSYS executable not found. Config path: {job_info.config.ansys_edt_path}, "
                            f"Manager path: {self.ansys_path}"
                        )

                # Generate command as list for secure execution
                command_list = job_info.config.generate_command_list()

                # Log the command being executed for debugging
                logger.info(f"Executing command for job {job_id}: {' '.join(command_list)}")
                logger.info(f"ANSYS executable path: {job_info.config.ansys_edt_path}")
                logger.info(f"Project path: {job_info.config.project_path}")

                # Check if project file exists
                if not os.path.exists(job_info.config.project_path):
                    raise FileNotFoundError(f"Project file not found: {job_info.config.project_path}")

                # Run locally - using asyncio subprocess for better control with secure command list
                process = await asyncio.create_subprocess_exec(
                    *command_list,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                )

                # Kept on the record so cancel_job() can terminate it later.
                job_info.process = process

                # Wait for completion with timeout (24 hours max)
                try:
                    stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=86400)

                    job_info.return_code = process.returncode
                    job_info.output = stdout.decode() if stdout else ""
                    job_info.error = stderr.decode() if stderr else ""

                    if process.returncode == 0:
                        job_info.status = JobStatus.COMPLETED
                        logger.info(f"Job {job_id} completed successfully")
                    else:
                        job_info.status = JobStatus.FAILED
                        logger.error(f"Job {job_id} failed with return code {process.returncode}")

                except asyncio.TimeoutError:
                    job_info.status = JobStatus.FAILED
                    job_info.error = "Job timed out after 24 hours"
                    process.terminate()
                    logger.error(f"Job {job_id} timed out")

        except Exception as e:
            # Any launch/config failure marks the job failed rather than
            # crashing the processing loop.
            job_info.status = JobStatus.FAILED
            job_info.error = str(e)
            logger.error(f"Job {job_id} failed with error: {e}")

        finally:
            # Always stamp the end time and free the concurrency slot, even
            # for SCHEDULED jobs handed off to an external scheduler.
            job_info.end_time = datetime.now()
            self.job_pool.running_jobs.discard(job_id)

            # Notify web clients
            await self.sio.emit(
                "job_completed",
                {
                    "job_id": job_id,
                    "status": job_info.status.value,
                    "end_time": job_info.end_time.isoformat(),
                    "return_code": job_info.return_code,
                },
            )
1212
+
1213
    async def cancel_job(self, job_id: str) -> bool:
        """
        **Cancel** a queued or running job.

        Parameters
        ----------
        job_id : str
            Identifier returned by :meth:`submit_job`.

        Returns
        -------
        bool
            ``True`` → cancellation succeeded, ``False`` → job not found or
            already terminal.

        Notes
        -----
        For queued jobs: immediately removes from queue and marks as cancelled.
        For scheduled jobs: asks the external scheduler manager to cancel.
        For running jobs: attempts to terminate the process and cleanup.
        """
        job_info = self.jobs.get(job_id)
        if not job_info:
            return False

        if job_info.status == JobStatus.QUEUED:
            # Remove from queue
            self.job_pool.remove_job(job_id)
            job_info.status = JobStatus.CANCELLED
            job_info.end_time = datetime.now()
            return True

        elif job_info.status == JobStatus.SCHEDULED:
            # Cancel job in external scheduler
            if job_info.scheduler_job_id:
                try:
                    # Delegate to the SLURM/LSF manager using the scheduler's
                    # own job id (not the internal one).
                    success = await self._sch_mgr.cancel_job(job_info.scheduler_job_id)
                    if success:
                        job_info.status = JobStatus.CANCELLED
                        job_info.end_time = datetime.now()
                        self.job_pool.running_jobs.discard(job_id)
                        logger.info(f"Cancelled scheduler job {job_id} (scheduler ID: {job_info.scheduler_job_id})")
                        return True
                    else:
                        logger.warning(f"Failed to cancel scheduler job {job_info.scheduler_job_id}")
                        return False
                except Exception as e:
                    logger.error(f"Error cancelling scheduler job {job_id}: {e}")
                    return False
            # SCHEDULED but no scheduler id recorded: nothing we can cancel.
            return False

        elif job_info.status == JobStatus.RUNNING and job_info.process:
            try:
                # Graceful stop first; escalate to SIGKILL if the process is
                # still alive after a 2-second grace period.
                job_info.process.terminate()
                await asyncio.sleep(2)
                if job_info.process.returncode is None:
                    job_info.process.kill()

                job_info.status = JobStatus.CANCELLED
                job_info.end_time = datetime.now()
                return True

            except Exception as e:
                logger.error(f"Failed to cancel job {job_id}: {e}")
                return False

        # Terminal states (COMPLETED/FAILED/CANCELLED) fall through here.
        return False
1279
+
1280
+ async def edit_concurrent_limits(self, update_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
1281
+ """
1282
+ Edit concurrent job limits in the pool.
1283
+
1284
+ Parameters
1285
+ ----------
1286
+ update_data : dict
1287
+ Fields to update in resource limits. Valid fields:
1288
+ - max_concurrent_jobs: Positive integer
1289
+ - max_cpu_percent: Float between 0 and 100
1290
+ - min_memory_gb: Non-negative float
1291
+ - min_disk_gb: Non-negative float
1292
+
1293
+ Returns
1294
+ -------
1295
+ dict or None
1296
+ Updated limits data or None if update failed
1297
+
1298
+ Raises
1299
+ ------
1300
+ ValueError
1301
+ If any field validation fails
1302
+ """
1303
+ try:
1304
+ # Define allowed fields for editing
1305
+ allowed_fields = ["max_concurrent_jobs", "max_cpu_percent", "min_memory_gb", "min_disk_gb"]
1306
+
1307
+ # Update allowed fields
1308
+ updated = False
1309
+ old_limits = {}
1310
+
1311
+ for field in allowed_fields:
1312
+ if field in update_data:
1313
+ old_value = getattr(self.resource_limits, field)
1314
+ new_value = update_data[field]
1315
+
1316
+ # Validate the new value
1317
+ if field == "max_concurrent_jobs" and (not isinstance(new_value, int) or new_value < 1):
1318
+ raise ValueError("max_concurrent_jobs must be a positive integer")
1319
+ elif field == "max_cpu_percent" and (
1320
+ not isinstance(new_value, (int, float)) or new_value <= 0 or new_value > 100
1321
+ ):
1322
+ raise ValueError("max_cpu_percent must be between 0 and 100")
1323
+ elif field in ["min_memory_gb", "min_disk_gb"] and (
1324
+ not isinstance(new_value, (int, float)) or new_value < 0
1325
+ ):
1326
+ raise ValueError(f"{field} must be a non-negative number")
1327
+
1328
+ old_limits[field] = old_value
1329
+ setattr(self.resource_limits, field, new_value)
1330
+ self.job_pool.resource_limits = self.resource_limits
1331
+ updated = True
1332
+
1333
+ if updated:
1334
+ # Log the changes
1335
+ for field in old_limits:
1336
+ logger.info(
1337
+ f"Resource limit {field} changed from {old_limits[field]} to "
1338
+ f"{getattr(self.resource_limits, field)}"
1339
+ )
1340
+
1341
+ # Notify web clients about the update
1342
+ await self.sio.emit(
1343
+ "limits_updated",
1344
+ {
1345
+ "old_limits": old_limits,
1346
+ "new_limits": {
1347
+ "max_concurrent_jobs": self.resource_limits.max_concurrent_jobs,
1348
+ "max_cpu_percent": self.resource_limits.max_cpu_percent,
1349
+ "min_memory_gb": self.resource_limits.min_memory_gb,
1350
+ "min_disk_gb": self.resource_limits.min_disk_gb,
1351
+ },
1352
+ },
1353
+ )
1354
+
1355
+ # Return updated limits data
1356
+ return {
1357
+ "max_concurrent_jobs": self.resource_limits.max_concurrent_jobs,
1358
+ "max_cpu_percent": self.resource_limits.max_cpu_percent,
1359
+ "min_memory_gb": self.resource_limits.min_memory_gb,
1360
+ "min_disk_gb": self.resource_limits.min_disk_gb,
1361
+ }
1362
+
1363
+ return None
1364
+
1365
+ except Exception as e:
1366
+ logger.error(f"Failed to update concurrent limits: {e}")
1367
+ return None
1368
+
1369
+
1370
async def submit_job_to_manager(
    config: HFSSSimulationConfig, priority: int = 0, manager_url: str = "http://localhost:8080"
) -> str:
    """
    **Helper** coroutine that submits a job to a **remote** Job Manager.

    Falls back to **local** execution if the HTTP call fails (offline mode).

    Parameters
    ----------
    config : HFSSSimulationConfig
        Validated configuration.
    priority : int, optional
        Job priority. Default is ``0``.
    manager_url : str, optional
        Base URL of the manager. Default is ``"http://localhost:8080"``.

    Returns
    -------
    str
        Job identifier (local or remote).

    Raises
    ------
    Exception
        If **both** remote and local execution fail.

    Notes
    -----
    This function is useful for clients that want to submit jobs to a
    remote manager but maintain offline capability.
    """
    try:
        payload = {"config": config.to_dict(), "priority": priority}
        async with aiohttp.ClientSession() as session:
            async with session.post(f"{manager_url}/jobs/submit", json=payload) as response:
                reply = await response.json()

        if not reply["success"]:
            raise Exception(f"Job submission failed: {reply['error']}")
        return reply["job_id"]

    except Exception as e:
        logger.error(f"Failed to submit job to manager: {e}")
        # Fall back to local execution
        return await config.run_simulation()
1417
+
1418
+
1419
+ # --------------------------------------------------------------------------- #
1420
+ # SchedulerManager – live SLURM / LSF introspection
1421
+ # --------------------------------------------------------------------------- #
1422
class SchedulerManager:
    """
    Thin async wrapper around cluster scheduler commands.

    Provides live introspection of SLURM and LSF clusters including:

    * List of partitions / queues with resource information
    * Per-partition: total & free cores, total & free memory
    * Global job table (running, pending, etc.)

    All methods are **coroutines** so they can be awaited from the REST layer
    without blocking the event-loop.

    Parameters
    ----------
    scheduler_type : SchedulerType
        Type of scheduler (SLURM or LSF only)

    Raises
    ------
    ValueError
        If scheduler_type is not SLURM or LSF
    """

    def __init__(self, scheduler_type: SchedulerType):
        # Local execution (SchedulerType.NONE) has no external commands to
        # introspect, so only the two cluster schedulers are accepted.
        if scheduler_type not in {SchedulerType.SLURM, SchedulerType.LSF}:
            raise ValueError("Only SLURM and LSF are supported")
        self.scheduler_type = scheduler_type

    async def get_partitions(self) -> List[Dict[str, Any]]:
        """
        Get list of scheduler partitions/queues with resource information.

        Returns
        -------
        List[Dict[str, Any]]
            List of partition dictionaries with keys:
            - name: Partition/queue name
            - cores_total: Total available cores
            - cores_used: Currently used cores
            - memory_total_gb: Total memory in GB
            - memory_used_gb: Currently used memory in GB

        Raises
        ------
        RuntimeError
            If scheduler command execution fails
        """
        # Dispatch on the scheduler chosen at construction time.
        if self.scheduler_type == SchedulerType.SLURM:
            return await self._slurm_partitions()
        else:  # LSF
            return await self._lsf_partitions()

    async def get_jobs(self) -> List[Dict[str, Any]]:
        """
        Get global job table (all users).

        Returns
        -------
        List[Dict[str, Any]]
            List of job dictionaries with keys:
            - job_id: Scheduler job ID
            - partition: Partition/queue name
            - user: Job owner username
            - state: Job state (RUNNING, PENDING, etc.)
            - nodes: Number of nodes allocated
            - cpus: Number of CPUs allocated
            - memory_gb: Memory allocated in GB

        Raises
        ------
        RuntimeError
            If scheduler command execution fails
        """
        if self.scheduler_type == SchedulerType.SLURM:
            return await self._slurm_jobs()
        else:
            return await self._lsf_jobs()

    async def _slurm_partitions(self) -> List[Dict[str, Any]]:
        """Parse SLURM partition information from sinfo command."""
        cmd = ["sinfo", "-h", "-o", "%R %F %C %m"]  # PARTITION NODES(A/I/O/T) CPUS(A/I/O/T) MEMORY
        stdout = await self._run(cmd)
        out = []
        for line in stdout.splitlines():
            if not line.strip():
                continue
            part, node_str, cpu_str, mem_mb = line.split()
            # A/I/O/T = Allocated / Idle / Other / Total counts.
            na, ni, no, nt = map(int, node_str.split("/"))
            ca, ci, co, ct = map(int, cpu_str.split("/"))
            # NOTE(review): assumes %m reports a plain MB figure (possibly
            # with a trailing "+" or unit letter that is stripped) — a value
            # printed with a G/T suffix would be mis-scaled here. TODO confirm.
            mem_total = float(mem_mb.rstrip("+MGTP")) / 1024  # GB
            out.append(
                {
                    "name": part,
                    "nodes_total": nt,
                    "nodes_used": na + no,
                    "cores_total": ct,
                    "cores_used": ca + co,
                    # Rough estimate: memory use scaled by busy-node fraction.
                    "memory_used_gb": mem_total * (na + no) / max(nt, 1),
                    "memory_total_gb": mem_total,
                }
            )
        return out

    async def _slurm_jobs(self) -> List[Dict[str, Any]]:
        """Parse SLURM job information from squeue command."""
        cmd = ["squeue", "-h", "-o", "%i %u %P %T %D %C %m"]
        stdout = await self._run(cmd)
        jobs = []
        for line in stdout.splitlines():
            if not line.strip():
                continue
            job_id, user, partition, state, nodes, cpus, mem_str = line.split()
            # unify memory to GiB
            mem_str = mem_str.strip()
            if mem_str.endswith(("M", "G", "T")):
                unit = mem_str[-1]
                val = float(mem_str[:-1])
                if unit == "M":
                    memory_gb = val / 1024
                elif unit == "G":
                    memory_gb = val
                else:  # T
                    memory_gb = val * 1024
            else:  # plain number → assume MiB
                memory_gb = float(mem_str) / 1024
            jobs.append(
                {
                    "job_id": job_id,
                    "partition": partition,
                    "user": user,
                    "state": state,
                    "nodes": int(nodes),
                    "cpus": int(cpus),
                    "memory_gb": memory_gb,
                }
            )
        return jobs

    async def _lsf_partitions(self) -> List[Dict[str, Any]]:
        """Parse LSF queue information from bqueues and bhosts commands."""
        # 1. queues → max slots
        qraw = await self._run(["bqueues", "-o", "queue_name:20 max:10 num_proc:10", "-noheader"])
        qinfo = {}
        for ln in qraw.splitlines():
            if not ln.strip():
                continue
            # NOTE(review): max_s (queue slot limit) is parsed but unused.
            name, max_s, num_p = ln.split()
            qinfo[name] = {
                "nodes_total": int(num_p),
                "nodes_used": 0,
                "cores_total": int(num_p),
                "cores_used": 0,
                "mem_total_gb": 0.0,
                "mem_used_gb": 0.0,
            }

        # 2. hosts → real cores + real memory
        hraw = await self._run(["bhosts", "-o", "host_name:20 ncpus:10 max_mem:15", "-noheader"])
        for ln in hraw.splitlines():
            if not ln.strip():
                continue
            # NOTE(review): host and ncpus are parsed but unused; only the
            # host memory contributes to the queue totals.
            host, ncpus, max_mem_kb = ln.split()
            # Assumes bhosts reports max_mem in KB — TODO confirm; LSF may
            # print a unit suffix depending on LSB_UNIT_FOR_LIMITS.
            max_mem_gb = int(max_mem_kb) / 1024**2
            # Every host's memory is added to *every* queue — queues are not
            # mapped to their member hosts here.
            for q in qinfo.values():
                q["mem_total_gb"] += max_mem_gb
            # LSF does not give per-host used mem; keep 0 for now
        return [{"name": q, **qinfo[q]} for q in qinfo]

    async def _lsf_jobs(self) -> List[Dict[str, Any]]:
        """Parse LSF job information from bjobs command."""
        cmd = ["bjobs", "-u", "all", "-o", "jobid:10 user:15 queue:15 stat:10 slots:10 mem:10", "-noheader"]
        stdout = await self._run(cmd)
        jobs = []
        for line in stdout.splitlines():
            if not line.strip():
                continue
            job_id, user, queue, state, slots, mem = line.split()
            jobs.append(
                {
                    "job_id": job_id,
                    "partition": queue,
                    "user": user,
                    "state": state,
                    "nodes": 1,  # LSF does not expose node count directly
                    "cpus": int(slots),
                    # Digit-only values are assumed to be MB; anything else is
                    # parsed as a float GB figure — TODO confirm bjobs format.
                    "memory_gb": int(mem) / 1024 if mem.isdigit() else float(mem),
                }
            )
        return jobs

    async def _run(self, cmd: List[str]) -> str:
        """
        Run scheduler command and return output.

        Parameters
        ----------
        cmd : List[str]
            Command and arguments to execute

        Returns
        -------
        str
            Command stdout as string

        Raises
        ------
        RuntimeError
            If command returns non-zero exit code
        """
        # exec (not shell) with an argument list: no shell-injection surface.
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(f"{' '.join(cmd)} failed: {stderr.decode()}")
        return stdout.decode().strip()
1641
+
1642
+
1643
+ # Usage example
1644
async def main():
    """
    Example usage of the JobManager class.

    Demonstrates building a manager with custom resource limits; jobs
    would then be submitted against it with chosen priorities.
    """
    # Custom resource limits governing when new jobs may start.
    limits = ResourceLimits(
        max_concurrent_jobs=3,  # Allow 3 simultaneous jobs
        max_cpu_percent=75.0,  # Don't start jobs if CPU > 75%
        min_memory_gb=4.0,  # Require 4GB free memory
        min_disk_gb=20.0,  # Require 20GB free disk space
    )

    manager = JobManager(limits)  # noqa: F841 — example construction only
1660
+
1661
+
1662
if __name__ == "__main__":
    # Script entry point: run the example setup on a fresh event loop.
    asyncio.run(main())