hpc-runner 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpc_runner/__init__.py +57 -0
- hpc_runner/_version.py +34 -0
- hpc_runner/cli/__init__.py +1 -0
- hpc_runner/cli/cancel.py +38 -0
- hpc_runner/cli/config.py +109 -0
- hpc_runner/cli/main.py +76 -0
- hpc_runner/cli/monitor.py +30 -0
- hpc_runner/cli/run.py +292 -0
- hpc_runner/cli/status.py +66 -0
- hpc_runner/core/__init__.py +31 -0
- hpc_runner/core/config.py +177 -0
- hpc_runner/core/descriptors.py +110 -0
- hpc_runner/core/exceptions.py +38 -0
- hpc_runner/core/job.py +328 -0
- hpc_runner/core/job_array.py +58 -0
- hpc_runner/core/job_info.py +104 -0
- hpc_runner/core/resources.py +49 -0
- hpc_runner/core/result.py +161 -0
- hpc_runner/core/types.py +13 -0
- hpc_runner/py.typed +0 -0
- hpc_runner/schedulers/__init__.py +60 -0
- hpc_runner/schedulers/base.py +194 -0
- hpc_runner/schedulers/detection.py +52 -0
- hpc_runner/schedulers/local/__init__.py +5 -0
- hpc_runner/schedulers/local/scheduler.py +354 -0
- hpc_runner/schedulers/local/templates/job.sh.j2 +28 -0
- hpc_runner/schedulers/sge/__init__.py +5 -0
- hpc_runner/schedulers/sge/args.py +232 -0
- hpc_runner/schedulers/sge/parser.py +287 -0
- hpc_runner/schedulers/sge/scheduler.py +881 -0
- hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
- hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
- hpc_runner/templates/__init__.py +5 -0
- hpc_runner/templates/engine.py +55 -0
- hpc_runner/tui/__init__.py +5 -0
- hpc_runner/tui/app.py +436 -0
- hpc_runner/tui/components/__init__.py +17 -0
- hpc_runner/tui/components/detail_panel.py +187 -0
- hpc_runner/tui/components/filter_bar.py +174 -0
- hpc_runner/tui/components/filter_popup.py +345 -0
- hpc_runner/tui/components/job_table.py +260 -0
- hpc_runner/tui/providers/__init__.py +5 -0
- hpc_runner/tui/providers/jobs.py +197 -0
- hpc_runner/tui/screens/__init__.py +7 -0
- hpc_runner/tui/screens/confirm.py +67 -0
- hpc_runner/tui/screens/job_details.py +210 -0
- hpc_runner/tui/screens/log_viewer.py +170 -0
- hpc_runner/tui/snapshot.py +153 -0
- hpc_runner/tui/styles/monitor.tcss +567 -0
- hpc_runner/workflow/__init__.py +6 -0
- hpc_runner/workflow/dependency.py +20 -0
- hpc_runner/workflow/pipeline.py +180 -0
- hpc_runner-0.2.0.dist-info/METADATA +285 -0
- hpc_runner-0.2.0.dist-info/RECORD +56 -0
- hpc_runner-0.2.0.dist-info/WHEEL +4 -0
- hpc_runner-0.2.0.dist-info/entry_points.txt +2 -0
hpc_runner/core/types.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Type aliases for hpc-tools."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TypeAlias
|
|
5
|
+
|
|
6
|
+
# Path types
|
|
7
|
+
PathLike: TypeAlias = str | Path
|
|
8
|
+
|
|
9
|
+
# Command types
|
|
10
|
+
Command: TypeAlias = str | list[str]
|
|
11
|
+
|
|
12
|
+
# Resource value types
|
|
13
|
+
ResourceValue: TypeAlias = int | str
|
hpc_runner/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Scheduler registry and auto-detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
from hpc_runner.schedulers.detection import detect_scheduler
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from hpc_runner.schedulers.base import BaseScheduler
|
|
12
|
+
|
|
13
|
+
_SCHEDULERS: dict[str, str] = {
|
|
14
|
+
"sge": "hpc_runner.schedulers.sge:SGEScheduler",
|
|
15
|
+
"slurm": "hpc_runner.schedulers.slurm:SlurmScheduler",
|
|
16
|
+
"pbs": "hpc_runner.schedulers.pbs:PBSScheduler",
|
|
17
|
+
"local": "hpc_runner.schedulers.local:LocalScheduler",
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_scheduler(name: str | None = None) -> "BaseScheduler":
|
|
22
|
+
"""Get scheduler instance.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
name: Scheduler name or None to auto-detect
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
Scheduler instance
|
|
29
|
+
"""
|
|
30
|
+
if name is None:
|
|
31
|
+
name = detect_scheduler()
|
|
32
|
+
|
|
33
|
+
if name not in _SCHEDULERS:
|
|
34
|
+
available = list(_SCHEDULERS.keys())
|
|
35
|
+
raise ValueError(f"Unknown scheduler: {name}. Available: {available}")
|
|
36
|
+
|
|
37
|
+
# Lazy import
|
|
38
|
+
module_path, class_name = _SCHEDULERS[name].rsplit(":", 1)
|
|
39
|
+
module = importlib.import_module(module_path)
|
|
40
|
+
scheduler_class = getattr(module, class_name)
|
|
41
|
+
|
|
42
|
+
return scheduler_class()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def register_scheduler(name: str, import_path: str) -> None:
    """Register a custom scheduler.

    Args:
        name: Scheduler name
        import_path: Import path like "mypackage.scheduler:MyScheduler"

    Raises:
        ValueError: If import_path is not in "module:Class" form. Failing
            fast here beats a confusing rsplit/unpack error later inside
            get_scheduler().
    """
    if ":" not in import_path:
        raise ValueError(
            f"import_path must look like 'module:Class', got {import_path!r}"
        )
    _SCHEDULERS[name] = import_path
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def list_schedulers() -> list[str]:
    """Return the names of all registered schedulers."""
    return [*_SCHEDULERS]


__all__ = ["get_scheduler", "register_scheduler", "list_schedulers", "detect_scheduler"]
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Base scheduler with rendering protocol."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
from hpc_runner.core.descriptors import SchedulerArg
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from hpc_runner.core.job import Job
|
|
14
|
+
from hpc_runner.core.job_array import JobArray
|
|
15
|
+
from hpc_runner.core.job_info import JobInfo
|
|
16
|
+
from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class BaseScheduler(ABC):
    """Abstract base class for HPC scheduler backends.

    A concrete scheduler must:
    1. set the ``name`` class attribute,
    2. fill ``ARG_RENDERERS`` with SchedulerArg instances keyed by Job
       attribute name, and
    3. implement the abstract submission/management methods below.

    Rendering protocol:
    - ``render_directives(job)`` -> list of script directives
    - ``render_args(job)`` -> list of command-line arguments

    Both walk ``job.iter_attributes()`` and delegate each value to the
    matching ``ARG_RENDERERS`` entry for scheduler-specific syntax.
    """

    name: str = ""

    # Concrete schedulers fill this in __init__ with config-driven values.
    ARG_RENDERERS: dict[str, SchedulerArg] = {}

    # =========================================================================
    # Rendering Protocol
    # =========================================================================

    def render_directives(self, job: "Job") -> list[str]:
        """Render job attributes as script directives.

        Walks the job's renderable attributes and converts each one that
        has a registered renderer into directive form, skipping renderers
        that return None.

        Args:
            job: The job to render

        Returns:
            List of directive strings (e.g., ['#$ -N jobname', '#$ -pe smp 4'])
        """
        paired = (
            (self.ARG_RENDERERS.get(attr_name), value)
            for attr_name, value in job.iter_attributes()
        )
        rendered = (
            renderer.to_directive(value)
            for renderer, value in paired
            if renderer is not None
        )
        return [directive for directive in rendered if directive is not None]

    def render_args(self, job: "Job") -> list[str]:
        """Render job attributes as command-line arguments.

        Walks the job's renderable attributes and converts each one that
        has a registered renderer into argument form.

        Args:
            job: The job to render

        Returns:
            List of argument strings (e.g., ['-N', 'jobname', '-pe', 'smp', '4'])
        """
        rendered: list[str] = []
        for attr_name, value in job.iter_attributes():
            renderer = self.ARG_RENDERERS.get(attr_name)
            if renderer is not None:
                rendered.extend(renderer.to_args(value))
        return rendered

    # =========================================================================
    # Abstract Methods - Subclasses must implement
    # =========================================================================

    @abstractmethod
    def submit(
        self, job: "Job", interactive: bool = False, keep_script: bool = False
    ) -> "JobResult":
        """Submit a job to the scheduler.

        Args:
            job: Job to submit.
            interactive: If True, run interactively.
            keep_script: If True, don't delete job script after submission.
        """

    @abstractmethod
    def submit_array(self, array: "JobArray") -> "ArrayJobResult":
        """Submit an array job."""

    @abstractmethod
    def cancel(self, job_id: str) -> bool:
        """Cancel a job."""

    @abstractmethod
    def get_status(self, job_id: str) -> "JobStatus":
        """Get job status."""

    @abstractmethod
    def get_exit_code(self, job_id: str) -> int | None:
        """Get job exit code."""

    @abstractmethod
    def generate_script(self, job: "Job", array_range: str | None = None) -> str:
        """Generate submission script."""

    @abstractmethod
    def build_submit_command(self, job: "Job") -> list[str]:
        """Build submission command line."""

    @abstractmethod
    def build_interactive_command(self, job: "Job") -> list[str]:
        """Build interactive execution command."""

    # =========================================================================
    # Optional Methods - Override if scheduler supports these
    # =========================================================================

    def get_output_path(self, job_id: str, stream: str) -> Path | None:
        """Get path to output file.

        Args:
            job_id: Job ID
            stream: "stdout" or "stderr"

        Returns:
            Path to output file, or None if not determinable.
        """
        return None

    def get_scheduler_args(self, job: "Job") -> list[str]:
        """Get scheduler-specific raw args from job (e.g. job.sge_args)."""
        return getattr(job, f"{self.name}_args", [])

    def list_active_jobs(
        self,
        user: str | None = None,
        status: set["JobStatus"] | None = None,
        queue: str | None = None,
    ) -> list["JobInfo"]:
        """List active jobs. Default implementation returns nothing; override."""
        return []

    def list_completed_jobs(
        self,
        user: str | None = None,
        since: datetime | None = None,
        until: datetime | None = None,
        exit_code: int | None = None,
        queue: str | None = None,
        limit: int = 100,
    ) -> list["JobInfo"]:
        """List completed jobs from accounting. Default returns nothing; override."""
        return []

    def has_accounting(self) -> bool:
        """Check if job accounting/history is available."""
        return False

    def get_job_details(self, job_id: str) -> tuple["JobInfo", dict[str, object]]:
        """Get detailed information for a single job.

        Args:
            job_id: The job identifier.

        Returns:
            Tuple of (JobInfo, extra_details dict).

        Raises:
            JobNotFoundError: If job doesn't exist.
            NotImplementedError: If not implemented by scheduler.
        """
        raise NotImplementedError(f"{self.name} does not implement get_job_details()")
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Auto-detection of available scheduler."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _check_sge_via_qstat() -> bool:
|
|
9
|
+
"""Check if qstat is SGE by examining its help output."""
|
|
10
|
+
try:
|
|
11
|
+
result = subprocess.run(
|
|
12
|
+
["qstat", "-help"],
|
|
13
|
+
capture_output=True,
|
|
14
|
+
text=True,
|
|
15
|
+
timeout=5,
|
|
16
|
+
)
|
|
17
|
+
# SGE's qstat -help starts with "SGE" or "GE" version info
|
|
18
|
+
output = result.stdout + result.stderr
|
|
19
|
+
return "SGE" in output or "Grid Engine" in output
|
|
20
|
+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
|
21
|
+
return False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def detect_scheduler() -> str:
    """Auto-detect available scheduler.

    Order of precedence:
    1. HPC_SCHEDULER environment variable
    2. SGE (check for SGE_ROOT or qstat -help output)
    3. Slurm (check for sbatch)
    4. PBS (check for qsub with PBS_CONF_FILE)
    5. Local fallback
    """
    # Explicit environment override wins over any probing.
    override = os.environ.get("HPC_SCHEDULER")
    if override:
        return override.lower()

    qsub_present = shutil.which("qsub") is not None

    # SGE: qsub present plus either SGE_ROOT or an SGE-flavoured qstat.
    if qsub_present and (os.environ.get("SGE_ROOT") or _check_sge_via_qstat()):
        return "sge"

    # Slurm: both the submission and queue-listing tools must exist.
    if shutil.which("sbatch") and shutil.which("squeue"):
        return "slurm"

    # PBS/Torque: qsub together with a PBS configuration file.
    if qsub_present and os.environ.get("PBS_CONF_FILE"):
        return "pbs"

    # Nothing detected: fall back to running jobs locally.
    return "local"
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"""Local scheduler - executes jobs as subprocesses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import subprocess
|
|
7
|
+
import tempfile
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
from hpc_runner.core.exceptions import AccountingNotAvailable, JobNotFoundError
|
|
13
|
+
from hpc_runner.core.job_info import JobInfo
|
|
14
|
+
from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
|
|
15
|
+
from hpc_runner.schedulers.base import BaseScheduler
|
|
16
|
+
from hpc_runner.templates import render_template
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from hpc_runner.core.job import Job
|
|
20
|
+
from hpc_runner.core.job_array import JobArray
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LocalScheduler(BaseScheduler):
    """Execute jobs locally as subprocesses (for development/testing).

    All bookkeeping (processes, exit codes, output paths) lives in
    class-level dicts so that every instance shares one session-wide view
    of jobs. Nothing is persisted across processes.
    """

    name = "local"

    # Session-wide state shared by all instances.
    _job_counter: int = 0
    _processes: dict[str, subprocess.Popen] = {}  # type: ignore[type-arg]
    _exit_codes: dict[str, int] = {}
    _output_paths: dict[str, dict[str, Path]] = {}

    # -------------------------------------------------------------------------
    # Submission
    # -------------------------------------------------------------------------

    def _write_script(self, job: "Job", job_id: str) -> Path:
        """Render the job script to a temp file and make it executable."""
        script_path = Path(tempfile.gettempdir()) / f".hpc_local_{job_id}.sh"
        script_path.write_text(self.generate_script(job))
        script_path.chmod(0o755)
        return script_path

    def submit(
        self, job: "Job", interactive: bool = False, keep_script: bool = False
    ) -> JobResult:
        """Run job as local subprocess.

        Args:
            job: Job to run.
            interactive: If True, block until the job completes.
            keep_script: If True, don't delete the generated job script.
        """
        LocalScheduler._job_counter += 1
        job_id = f"local_{LocalScheduler._job_counter}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

        # Set up environment with modules (modules not actually loaded locally)
        env = os.environ.copy() if job.inherit_env else {}

        # Generate and write script
        script_path = self._write_script(job, job_id)

        workdir = Path(job.workdir) if job.workdir else Path.cwd()

        # Determine output paths
        stdout_file = job.stdout or f"{job.name}.{job_id}.out"
        stdout_path = workdir / stdout_file
        if job.merge_output:
            stderr_path = stdout_path  # Merge stderr into stdout
        else:
            stderr_file = job.stderr or f"{job.name}.{job_id}.err"
            stderr_path = workdir / stderr_file

        # Store output paths for get_output_path() / get_job_details()
        LocalScheduler._output_paths[job_id] = {
            "stdout": stdout_path,
            "stderr": stderr_path,
        }

        if interactive:
            # Blocking execution
            with open(stdout_path, "w") as stdout_f:
                if job.merge_output:
                    result = subprocess.run(
                        [str(script_path)],
                        cwd=workdir,
                        env=env,
                        stdout=stdout_f,
                        stderr=subprocess.STDOUT,
                    )
                else:
                    with open(stderr_path, "w") as stderr_f:
                        result = subprocess.run(
                            [str(script_path)],
                            cwd=workdir,
                            env=env,
                            stdout=stdout_f,
                            stderr=stderr_f,
                        )
            LocalScheduler._exit_codes[job_id] = result.returncode
            # BUGFIX: keep_script was previously ignored and the script
            # always deleted.
            if not keep_script:
                script_path.unlink(missing_ok=True)
        else:
            # Background execution: handles stay open until _cleanup_process.
            stdout_f = open(stdout_path, "w")
            if job.merge_output:
                proc = subprocess.Popen(
                    [str(script_path)],
                    cwd=workdir,
                    env=env,
                    stdout=stdout_f,
                    stderr=subprocess.STDOUT,
                )
            else:
                stderr_f = open(stderr_path, "w")
                proc = subprocess.Popen(
                    [str(script_path)],
                    cwd=workdir,
                    env=env,
                    stdout=stdout_f,
                    stderr=stderr_f,
                )
            LocalScheduler._processes[job_id] = proc
            # Stash resources on the Popen object so _cleanup_process can
            # release them later; the script path is only recorded for
            # deletion when keep_script is False.
            if not keep_script:
                proc._script_path = script_path  # type: ignore[attr-defined]
            proc._stdout_file = stdout_f  # type: ignore[attr-defined]
            if not job.merge_output:
                proc._stderr_file = stderr_f  # type: ignore[attr-defined]

        return JobResult(job_id=job_id, scheduler=self, job=job)

    def submit_array(self, array: "JobArray") -> ArrayJobResult:
        """Simulate an array job by submitting one subprocess per index.

        BUGFIX: the original implementation exported the task-index
        variables via os.environ and never removed them, permanently
        polluting the parent process environment (and with
        inherit_env=False the SGE/Slurm aliases never reached the child
        at all). The variables are now set only in each child's
        environment inside _submit_array_task.
        """
        LocalScheduler._job_counter += 1
        base_job_id = f"local_array_{LocalScheduler._job_counter}"

        # Launch tasks sequentially; each runs as its own background process.
        for idx in array.indices:
            task_job_id = f"{base_job_id}.{idx}"
            self._submit_array_task(array.job, task_job_id, idx)

        return ArrayJobResult(base_job_id=base_job_id, scheduler=self, array=array)

    def _submit_array_task(self, job: "Job", job_id: str, index: int) -> None:
        """Submit a single array task."""
        env = os.environ.copy() if job.inherit_env else {}
        # Expose the task index under the generic name plus SGE/Slurm
        # compatible aliases -- child environment only, never os.environ.
        env["HPC_ARRAY_TASK_ID"] = str(index)
        env["SGE_TASK_ID"] = str(index)  # SGE compat
        env["SLURM_ARRAY_TASK_ID"] = str(index)  # Slurm compat

        script_path = self._write_script(job, job_id)

        workdir = Path(job.workdir) if job.workdir else Path.cwd()
        stdout_path = workdir / f"{job.name}.{job_id}.out"

        # Array tasks always merge stderr into stdout.
        LocalScheduler._output_paths[job_id] = {"stdout": stdout_path, "stderr": stdout_path}

        stdout_f = open(stdout_path, "w")
        proc = subprocess.Popen(
            [str(script_path)],
            cwd=workdir,
            env=env,
            stdout=stdout_f,
            stderr=subprocess.STDOUT,
        )
        LocalScheduler._processes[job_id] = proc
        proc._script_path = script_path  # type: ignore[attr-defined]
        proc._stdout_file = stdout_f  # type: ignore[attr-defined]

    # -------------------------------------------------------------------------
    # Lifecycle
    # -------------------------------------------------------------------------

    def cancel(self, job_id: str) -> bool:
        """Cancel a local job.

        Returns:
            True if a tracked process was terminated, False otherwise.
        """
        proc = LocalScheduler._processes.get(job_id)
        if proc is None:
            return False
        proc.terminate()
        try:
            # BUGFIX: an unbounded wait() could hang forever if the process
            # ignores SIGTERM; escalate to SIGKILL after a short grace period.
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
        self._cleanup_process(job_id)
        return True

    def get_status(self, job_id: str) -> JobStatus:
        """Get job status."""
        if job_id in LocalScheduler._exit_codes:
            # Already completed and recorded
            return JobStatus.COMPLETED if LocalScheduler._exit_codes[job_id] == 0 else JobStatus.FAILED

        if job_id not in LocalScheduler._processes:
            return JobStatus.UNKNOWN

        proc = LocalScheduler._processes[job_id]
        poll = proc.poll()

        if poll is None:
            return JobStatus.RUNNING

        # Process just finished: record exit code and release resources.
        LocalScheduler._exit_codes[job_id] = poll
        self._cleanup_process(job_id)

        return JobStatus.COMPLETED if poll == 0 else JobStatus.FAILED

    def _cleanup_process(self, job_id: str) -> None:
        """Close output handles, delete the script, drop the process entry."""
        proc = LocalScheduler._processes.get(job_id)
        if proc is None:
            return
        # Close file handles opened for background execution
        if hasattr(proc, "_stdout_file"):
            proc._stdout_file.close()  # type: ignore[attr-defined]
        if hasattr(proc, "_stderr_file"):
            proc._stderr_file.close()  # type: ignore[attr-defined]
        # Remove script (only recorded when keep_script was False)
        if hasattr(proc, "_script_path"):
            proc._script_path.unlink(missing_ok=True)  # type: ignore[attr-defined]
        del LocalScheduler._processes[job_id]

    def get_exit_code(self, job_id: str) -> int | None:
        """Get exit code, or None if still running or unknown."""
        # First check if we have a cached exit code
        if job_id in LocalScheduler._exit_codes:
            return LocalScheduler._exit_codes[job_id]

        # Check if process is done
        proc = LocalScheduler._processes.get(job_id)
        if proc is not None:
            poll = proc.poll()
            if poll is not None:
                LocalScheduler._exit_codes[job_id] = poll
                return poll

        return None

    def get_output_path(self, job_id: str, stream: str) -> Path | None:
        """Get output file path for "stdout" or "stderr"."""
        paths = LocalScheduler._output_paths.get(job_id)
        return paths.get(stream) if paths is not None else None

    def generate_script(self, job: "Job", array_range: str | None = None) -> str:
        """Generate local execution script from the bundled template."""
        return render_template(
            "local/templates/job.sh.j2",
            job=job,
            scheduler=self,
        )

    def build_submit_command(self, job: "Job") -> list[str]:
        """Build command - for local, just bash."""
        return ["bash", "-c", job.command if isinstance(job.command, str) else " ".join(job.command)]

    def build_interactive_command(self, job: "Job") -> list[str]:
        """Build interactive command - for local, just bash."""
        return ["bash", "-c", job.command if isinstance(job.command, str) else " ".join(job.command)]

    # -------------------------------------------------------------------------
    # TUI Monitor API (stubs for local scheduler)
    # -------------------------------------------------------------------------

    def list_active_jobs(
        self,
        user: str | None = None,
        status: set[JobStatus] | None = None,
        queue: str | None = None,
    ) -> list[JobInfo]:
        """List active local jobs.

        The local scheduler tracks running processes in memory.
        """
        jobs: list[JobInfo] = []
        current_user = os.environ.get("USER", "unknown")

        for job_id, proc in LocalScheduler._processes.items():
            if proc.poll() is not None:
                continue  # Skip completed jobs
            job_status = JobStatus.RUNNING

            # Apply filters (queue doesn't apply to the local scheduler)
            if user is not None and user != current_user:
                continue
            if status is not None and job_status not in status:
                continue

            jobs.append(
                JobInfo(
                    job_id=job_id,
                    name=job_id,  # Local scheduler doesn't track job names
                    user=current_user,
                    status=job_status,
                    queue="local",
                )
            )

        return jobs

    def list_completed_jobs(
        self,
        user: str | None = None,
        since: datetime | None = None,
        until: datetime | None = None,
        exit_code: int | None = None,
        queue: str | None = None,
        limit: int = 100,
    ) -> list[JobInfo]:
        """List completed local jobs.

        The local scheduler does not persist job history, so this
        raises AccountingNotAvailable.
        """
        raise AccountingNotAvailable(
            "Local scheduler does not persist job history. "
            "Completed job information is only available during the current session."
        )

    def has_accounting(self) -> bool:
        """Check if job accounting is available.

        Local scheduler does not have persistent accounting.
        """
        return False

    def get_job_details(self, job_id: str) -> tuple[JobInfo, dict[str, object]]:
        """Get details for a local job.

        Raises:
            JobNotFoundError: If the job is neither running nor recorded.
        """
        current_user = os.environ.get("USER", "unknown")
        paths = LocalScheduler._output_paths.get(job_id, {})

        # Check running (or just-finished) processes
        if job_id in LocalScheduler._processes:
            poll = LocalScheduler._processes[job_id].poll()
            if poll is None:
                status = JobStatus.RUNNING
            else:
                status = JobStatus.COMPLETED if poll == 0 else JobStatus.FAILED
            job_info = JobInfo(
                job_id=job_id,
                name=job_id,
                user=current_user,
                status=status,
                queue="local",
                exit_code=poll,
                stdout_path=paths.get("stdout"),
                stderr_path=paths.get("stderr"),
            )
            return job_info, {}

        # Check completed jobs with cached exit codes
        if job_id in LocalScheduler._exit_codes:
            exit_code = LocalScheduler._exit_codes[job_id]
            job_info = JobInfo(
                job_id=job_id,
                name=job_id,
                user=current_user,
                status=JobStatus.COMPLETED if exit_code == 0 else JobStatus.FAILED,
                queue="local",
                exit_code=exit_code,
                stdout_path=paths.get("stdout"),
                stderr_path=paths.get("stderr"),
            )
            return job_info, {}

        raise JobNotFoundError(f"Job {job_id} not found")
|