hpc-runner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
+ """Local scheduler - executes jobs as subprocesses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import subprocess
7
+ import tempfile
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING
11
+
12
+ from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
13
+ from hpc_runner.schedulers.base import BaseScheduler
14
+ from hpc_runner.templates import render_template
15
+
16
+ if TYPE_CHECKING:
17
+ from hpc_runner.core.job import Job
18
+ from hpc_runner.core.job_array import JobArray
19
+
20
+
21
class LocalScheduler(BaseScheduler):
    """Execute jobs locally as subprocesses (for development/testing).

    State is kept in class-level dicts so that all instances share one
    view of submitted jobs, mimicking a scheduler daemon.
    """

    name = "local"

    # Monotonic counter used to build unique local job ids.
    _job_counter: int = 0
    # job_id -> live Popen handle for background jobs.
    _processes: dict[str, subprocess.Popen] = {}  # type: ignore[type-arg]
    # job_id -> exit code, populated once a job has finished.
    _exit_codes: dict[str, int] = {}
    # job_id -> {"stdout": Path, "stderr": Path} output locations.
    _output_paths: dict[str, dict[str, Path]] = {}

    def submit(self, job: "Job", interactive: bool = False) -> JobResult:
        """Run *job* as a local subprocess.

        When ``interactive`` is True the call blocks until the job
        finishes; otherwise the process runs in the background and its
        handle is tracked for later status / exit-code queries.
        """
        LocalScheduler._job_counter += 1
        job_id = f"local_{LocalScheduler._job_counter}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

        # Modules are not actually loaded locally; we only decide which
        # environment the child process starts from.
        env = os.environ.copy() if job.inherit_env else {}

        script_path = self._write_script(job, job_id)
        workdir = Path(job.workdir) if job.workdir else Path.cwd()

        # Resolve output paths; with merge_output, stderr goes to stdout.
        stdout_path = workdir / (job.stdout or f"{job.name}.{job_id}.out")
        if job.merge_output:
            stderr_path = stdout_path
        else:
            stderr_path = workdir / (job.stderr or f"{job.name}.{job_id}.err")

        LocalScheduler._output_paths[job_id] = {
            "stdout": stdout_path,
            "stderr": stderr_path,
        }

        if interactive:
            # Blocking execution: run to completion, record exit code,
            # and remove the temporary script immediately.
            with open(stdout_path, "w") as stdout_f:
                if job.merge_output:
                    result = subprocess.run(
                        [str(script_path)],
                        cwd=workdir,
                        env=env,
                        stdout=stdout_f,
                        stderr=subprocess.STDOUT,
                    )
                else:
                    with open(stderr_path, "w") as stderr_f:
                        result = subprocess.run(
                            [str(script_path)],
                            cwd=workdir,
                            env=env,
                            stdout=stdout_f,
                            stderr=stderr_f,
                        )
            LocalScheduler._exit_codes[job_id] = result.returncode
            script_path.unlink(missing_ok=True)
        else:
            # Background execution; cleanup happens in _cleanup_process.
            self._launch_background(
                job_id,
                script_path,
                workdir,
                env,
                stdout_path,
                None if job.merge_output else stderr_path,
            )

        return JobResult(job_id=job_id, scheduler=self, job=job)

    def submit_array(self, array: "JobArray") -> ArrayJobResult:
        """Simulate an array job by launching one subprocess per index.

        Array-index variables are passed in each task's own environment
        rather than by mutating ``os.environ`` (the previous approach
        leaked the task-id variables into the parent process and never
        restored them).
        """
        LocalScheduler._job_counter += 1
        base_job_id = f"local_array_{LocalScheduler._job_counter}"

        # Run jobs sequentially (could be parallelized later).
        for idx in array.indices:
            task_job_id = f"{base_job_id}.{idx}"
            self._submit_array_task(array.job, task_job_id, idx)

        return ArrayJobResult(base_job_id=base_job_id, scheduler=self, array=array)

    def _submit_array_task(self, job: "Job", job_id: str, index: int) -> None:
        """Launch one array task with scheduler-compatible task-id vars set."""
        env = os.environ.copy() if job.inherit_env else {}
        # Native name plus SGE/Slurm compatibility names, scoped to this
        # task's environment only.
        for var in ("HPC_ARRAY_TASK_ID", "SGE_TASK_ID", "SLURM_ARRAY_TASK_ID"):
            env[var] = str(index)

        script_path = self._write_script(job, job_id)
        workdir = Path(job.workdir) if job.workdir else Path.cwd()
        stdout_path = workdir / f"{job.name}.{job_id}.out"

        # Array tasks always merge stderr into stdout.
        LocalScheduler._output_paths[job_id] = {"stdout": stdout_path, "stderr": stdout_path}

        self._launch_background(job_id, script_path, workdir, env, stdout_path, None)

    def _write_script(self, job: "Job", job_id: str) -> Path:
        """Render the job script to a temp file and make it executable."""
        script_path = Path(tempfile.gettempdir()) / f".hpc_local_{job_id}.sh"
        script_path.write_text(self.generate_script(job))
        script_path.chmod(0o755)
        return script_path

    def _launch_background(
        self,
        job_id: str,
        script_path: Path,
        workdir: Path,
        env: dict[str, str],
        stdout_path: Path,
        stderr_path: Path | None,
    ) -> None:
        """Start a background process.

        When *stderr_path* is None, stderr is merged into stdout.
        File handles and the script path are stashed on the Popen object
        so _cleanup_process can release them later.
        """
        stdout_f = open(stdout_path, "w")
        stderr_f = open(stderr_path, "w") if stderr_path is not None else None
        proc = subprocess.Popen(
            [str(script_path)],
            cwd=workdir,
            env=env,
            stdout=stdout_f,
            stderr=stderr_f if stderr_f is not None else subprocess.STDOUT,
        )
        LocalScheduler._processes[job_id] = proc
        proc._script_path = script_path  # type: ignore[attr-defined]
        proc._stdout_file = stdout_f  # type: ignore[attr-defined]
        if stderr_f is not None:
            proc._stderr_file = stderr_f  # type: ignore[attr-defined]

    def cancel(self, job_id: str) -> bool:
        """Terminate a running local job; return True if it was tracked."""
        if job_id in LocalScheduler._processes:
            proc = LocalScheduler._processes[job_id]
            proc.terminate()
            proc.wait()
            self._cleanup_process(job_id)
            return True
        return False

    def get_status(self, job_id: str) -> JobStatus:
        """Return the job's current status, caching its exit code on completion."""
        if job_id in LocalScheduler._exit_codes:
            # Already completed earlier.
            return JobStatus.COMPLETED if LocalScheduler._exit_codes[job_id] == 0 else JobStatus.FAILED

        if job_id not in LocalScheduler._processes:
            return JobStatus.UNKNOWN

        proc = LocalScheduler._processes[job_id]
        poll = proc.poll()

        if poll is None:
            return JobStatus.RUNNING

        # Process just completed: record the code and release resources.
        LocalScheduler._exit_codes[job_id] = poll
        self._cleanup_process(job_id)

        return JobStatus.COMPLETED if poll == 0 else JobStatus.FAILED

    def _cleanup_process(self, job_id: str) -> None:
        """Close stashed file handles, remove the temp script, drop the handle."""
        if job_id in LocalScheduler._processes:
            proc = LocalScheduler._processes[job_id]
            if hasattr(proc, "_stdout_file"):
                proc._stdout_file.close()  # type: ignore[attr-defined]
            if hasattr(proc, "_stderr_file"):
                proc._stderr_file.close()  # type: ignore[attr-defined]
            if hasattr(proc, "_script_path"):
                proc._script_path.unlink(missing_ok=True)  # type: ignore[attr-defined]
            del LocalScheduler._processes[job_id]

    def get_exit_code(self, job_id: str) -> int | None:
        """Return the exit code, or None while the job is still running."""
        # Cached exit code takes precedence (covers interactive jobs too).
        if job_id in LocalScheduler._exit_codes:
            return LocalScheduler._exit_codes[job_id]

        # Otherwise poll the live process, caching the result if done.
        if job_id in LocalScheduler._processes:
            poll = LocalScheduler._processes[job_id].poll()
            if poll is not None:
                LocalScheduler._exit_codes[job_id] = poll
                return poll

        return None

    def get_output_path(self, job_id: str, stream: str) -> Path | None:
        """Return the path of the job's "stdout"/"stderr" stream, if known."""
        if job_id in LocalScheduler._output_paths:
            return LocalScheduler._output_paths[job_id].get(stream)
        return None

    def generate_script(self, job: "Job") -> str:
        """Render the local execution script from the bundled template."""
        return render_template(
            "local/templates/job.sh.j2",
            job=job,
            scheduler=self,
        )

    def build_submit_command(self, job: "Job") -> list[str]:
        """Build the submit command - for local execution, just bash."""
        return ["bash", "-c", job.command if isinstance(job.command, str) else " ".join(job.command)]
@@ -0,0 +1,28 @@
1
#!/bin/bash
# Generated by hpc-tools (local scheduler)

# Exit on error
set -e

{% if job.modules_path %}
# Additional module paths (simulated for local)
{% for path in job.modules_path %}
# module use {{ path }}
{% endfor %}
{% endif %}

{% if job.modules %}
# Modules (simulated for local - not actually loaded)
{% for mod in job.modules %}
# module load {{ mod }}
{% endfor %}
{% endif %}

{% if job.workdir %}
# Change to working directory (quoted: the path may contain spaces)
cd "{{ job.workdir }}"
{% endif %}

# Execute command
{{ job.command }}
exit $?
@@ -0,0 +1,5 @@
1
+ """SGE (Sun Grid Engine) scheduler implementation."""
2
+
3
+ from hpc_runner.schedulers.sge.scheduler import SGEScheduler
4
+
5
+ __all__ = ["SGEScheduler"]
@@ -0,0 +1,165 @@
1
+ """SGE-specific argument descriptors."""
2
+
3
+ from hpc_runner.core.descriptors import SchedulerArg
4
+
5
+
6
class SGEArg(SchedulerArg):
    """Base SGE argument.

    SGE directives are written as ``#$ -flag value`` inside scripts, and
    as ``-flag value`` pairs on the command line.
    """

    def to_args(self, value) -> list[str]:
        """Return command-line tokens for *value*, or ``[]`` when unset."""
        if value is None:
            return []
        converted = self.converter(value)
        return ["-{}".format(self.flag), str(converted)]

    def to_directive(self, value) -> str | None:
        """Return a ``#$`` script directive for *value*, or None when unset."""
        if value is None:
            return None
        return "#$ -{} {}".format(self.flag, self.converter(value))
21
+
22
+
23
class SGECpuArg(SGEArg):
    """CPU/slots argument using parallel environment.

    Note: The PE name is configurable via config.
    """

    def __init__(self, pe_name: str = "smp"):
        super().__init__("pe", converter=lambda v: f"{pe_name} {v}", doc="Parallel environment")
        self.pe_name = pe_name

    def to_args(self, value, pe_name: str | None = None) -> list[str]:
        """Return qsub argv tokens for the slot request.

        qsub's ``-pe`` option takes the PE name and the slot count as
        two separate arguments, so they are emitted as distinct tokens
        (the previous single ``"name count"`` token would not be parsed
        correctly by qsub).
        """
        if value is None:
            return []
        pe = pe_name or self.pe_name
        return ["-pe", pe, str(value)]

    def to_directive(self, value, pe_name: str | None = None) -> str | None:
        """Return the ``#$ -pe <pe> <slots>`` directive, or None when unset."""
        if value is None:
            return None
        pe = pe_name or self.pe_name
        return f"#$ -pe {pe} {value}"
44
+
45
+
46
class SGEMemArg(SGEArg):
    """Memory argument.

    Uses -l resource=value format. Resource name is configurable.
    """

    def __init__(self, resource_name: str = "mem_free"):
        super().__init__("l", doc="Memory requirement")
        self.resource_name = resource_name

    def to_args(self, value, resource_name: str | None = None) -> list[str]:
        """Emit ``-l <resource>=<value>`` tokens; ``[]`` when unset."""
        if value is None:
            return []
        resource = resource_name if resource_name else self.resource_name
        return ["-l", "{}={}".format(resource, value)]

    def to_directive(self, value, resource_name: str | None = None) -> str | None:
        """Emit the ``#$ -l`` directive; None when unset."""
        if value is None:
            return None
        resource = resource_name if resource_name else self.resource_name
        return "#$ -l {}={}".format(resource, value)
67
+
68
+
69
class SGETimeArg(SGEArg):
    """Time limit argument.

    Uses -l h_rt=HH:MM:SS format. Resource name is configurable.
    """

    def __init__(self, resource_name: str = "h_rt"):
        super().__init__("l", doc="Hard runtime limit")
        self.resource_name = resource_name

    def to_args(self, value, resource_name: str | None = None) -> list[str]:
        """Emit ``-l <resource>=<limit>`` tokens; ``[]`` when unset."""
        if value is None:
            return []
        resource = resource_name if resource_name else self.resource_name
        return ["-l", "{}={}".format(resource, value)]

    def to_directive(self, value, resource_name: str | None = None) -> str | None:
        """Emit the ``#$ -l`` runtime directive; None when unset."""
        if value is None:
            return None
        resource = resource_name if resource_name else self.resource_name
        return "#$ -l {}={}".format(resource, value)
90
+
91
+
92
class SGEQueueArg(SGEArg):
    """Queue argument; maps to the SGE ``-q`` flag via the base class."""

    def __init__(self):
        super().__init__("q", doc="Queue name")
97
+
98
+
99
class SGEJobNameArg(SGEArg):
    """Job name argument; maps to the SGE ``-N`` flag via the base class."""

    def __init__(self):
        super().__init__("N", doc="Job name")
104
+
105
+
106
class SGEOutputArg(SGEArg):
    """Stdout path argument; maps to the SGE ``-o`` flag via the base class."""

    def __init__(self):
        super().__init__("o", doc="Stdout file path")
111
+
112
+
113
class SGEErrorArg(SGEArg):
    """Stderr path argument; maps to the SGE ``-e`` flag via the base class."""

    def __init__(self):
        super().__init__("e", doc="Stderr file path")
118
+
119
+
120
class SGEArrayArg(SGEArg):
    """Array job argument; maps to the SGE ``-t`` flag via the base class."""

    def __init__(self):
        super().__init__("t", doc="Array job range (e.g., 1-100, 1-100:10)")
125
+
126
+
127
class SGEJoinOutputArg(SGEArg):
    """Join stdout and stderr (SGE ``-j y``).

    Unlike the base class, the value is treated as a boolean switch
    rather than substituted into the flag.
    """

    def __init__(self):
        super().__init__("j", doc="Join stdout and stderr")

    def to_args(self, value) -> list[str]:
        """Return ``-j y`` tokens when joining is enabled, else no tokens."""
        return ["-j", "y"] if value else []

    def to_directive(self, value) -> str | None:
        """Return the join directive when enabled, else None."""
        return "#$ -j y" if value else None
142
+
143
+
144
class SGECwdArg(SGEArg):
    """Use current working directory (SGE ``-cwd``).

    A boolean switch: the flag is emitted only when the value is truthy.
    """

    def __init__(self):
        super().__init__("cwd", doc="Use current working directory")

    def to_args(self, value) -> list[str]:
        """Return the ``-cwd`` token when enabled, else no tokens."""
        return ["-cwd"] if value else []

    def to_directive(self, value) -> str | None:
        """Return the ``#$ -cwd`` directive when enabled, else None."""
        return "#$ -cwd" if value else None
159
+
160
+
161
class SGEShellArg(SGEArg):
    """Shell argument; maps to the SGE ``-S`` flag via the base class."""

    def __init__(self):
        super().__init__("S", doc="Shell path")
@@ -0,0 +1,194 @@
1
+ """SGE output parsing utilities."""
2
+
3
+ import re
4
+ import xml.etree.ElementTree as ET
5
+ from typing import Any
6
+
7
+ from hpc_runner.core.result import JobStatus
8
+
9
+
10
def parse_qstat_xml(xml_output: str) -> dict[str, Any]:
    """Parse qstat -xml output.

    Returns dict with job_id -> job_info mappings.

    Both running jobs (under ``queue_info``) and pending jobs (under
    ``job_info``) appear as ``job_list`` elements.  A single
    ``.//job_list`` search matches every descendant, so the previous
    second pass over ``.//job_info/job_list`` only re-parsed a subset of
    the same elements and has been removed.
    """
    jobs: dict[str, Any] = {}

    try:
        root = ET.fromstring(xml_output)
    except ET.ParseError:
        # Best-effort parsing: malformed XML yields an empty mapping.
        return jobs

    for job_list in root.findall(".//job_list"):
        job_info = _parse_job_element(job_list)
        if job_info:
            jobs[job_info["job_id"]] = job_info

    return jobs
36
+
37
+
38
def _parse_job_element(elem: ET.Element) -> dict[str, Any] | None:
    """Extract job fields from a single ``job_list`` element.

    Returns None when the element has no job number.  Optional string
    fields (name, state, queue) are included only when present; the
    slot count is converted to int.
    """
    number = elem.find("JB_job_number")
    if number is None or number.text is None:
        return None

    info: dict[str, Any] = {"job_id": number.text}

    # Optional text fields: child tag -> output key.
    for tag, key in (("JB_name", "name"), ("state", "state"), ("queue_name", "queue")):
        child = elem.find(tag)
        if child is not None and child.text:
            info[key] = child.text

    slots = elem.find("slots")
    if slots is not None and slots.text:
        info["slots"] = int(slots.text)

    return info
69
+
70
+
71
def parse_qstat_plain(output: str) -> dict[str, Any]:
    """Parse plain qstat output.

    Format:
        job-ID prior name user state submit/start at queue slots ja-task-ID
        --------------------------------------------------------------------------------
        12345 0.55500 myjob user r 01/01/2024 10:00:00 all.q@node1 1
    """
    jobs: dict[str, Any] = {}

    # Everything before the dashed separator line is header.
    in_data = False
    for raw in output.strip().split("\n"):
        if raw.startswith("-"):
            in_data = True
            continue
        if not in_data:
            continue

        fields = raw.split()
        if len(fields) < 5:
            continue

        entry: dict[str, Any] = {
            "job_id": fields[0],
            "priority": fields[1],
            "name": fields[2],
            "user": fields[3],
            "state": fields[4],
        }

        # Queue column follows the two date/time columns, when present.
        if len(fields) >= 8:
            entry["queue"] = fields[7]

        # Slot count, when present and numeric.
        if len(fields) >= 9:
            try:
                entry["slots"] = int(fields[8])
            except ValueError:
                pass

        jobs[fields[0]] = entry

    return jobs
115
+
116
+
117
def parse_qacct_output(output: str) -> dict[str, Any]:
    """Parse qacct output for job accounting info.

    qacct prints one ``key   value`` pair per line, with record
    separators made of ``=`` characters; separator lines are skipped
    and the pairs are collected into a flat dict.
    """
    info: dict[str, Any] = {}

    for raw in output.strip().split("\n"):
        # Skip the ===== record-separator rules.
        if raw.startswith("="):
            continue

        pair = raw.split(None, 1)
        if len(pair) == 2:
            info[pair[0]] = pair[1].strip()

    return info
143
+
144
+
145
def state_to_status(state: str) -> JobStatus:
    """Convert SGE state code to JobStatus.

    SGE states:
    - qw: pending (queued, waiting)
    - hqw: hold (on hold)
    - r: running
    - t: transferring
    - Rr, Rt: restarted
    - s, ts: suspended
    - S, tS: queue suspended
    - T, tT: threshold
    - Eqw: error (waiting)
    - dr: deleting (running)
    - dt: deleting (transferring)

    Codes are compared case-insensitively; anything unrecognized maps
    to ``JobStatus.UNKNOWN``.
    """
    state = state.lower()

    if state in ("r", "t", "rr", "rt"):
        return JobStatus.RUNNING
    elif state in ("qw", "hqw"):
        return JobStatus.PENDING
    elif state in ("eqw",):
        return JobStatus.FAILED
    elif state in ("dr", "dt"):
        return JobStatus.CANCELLED
    elif state in ("s", "ts", "ss"):  # suspended; original tuple listed "ts" twice
        return JobStatus.PENDING  # Suspended, treat as pending

    return JobStatus.UNKNOWN
175
+
176
+
177
+ def parse_qsub_output(output: str) -> str | None:
178
+ """Parse qsub output to extract job ID.
179
+
180
+ Expected format:
181
+ Your job 12345 ("jobname") has been submitted
182
+ Your job-array 12345.1-10:1 ("jobname") has been submitted
183
+ """
184
+ # Standard job
185
+ match = re.search(r"Your job (\d+)", output)
186
+ if match:
187
+ return match.group(1)
188
+
189
+ # Array job
190
+ match = re.search(r"Your job-array (\d+)", output)
191
+ if match:
192
+ return match.group(1)
193
+
194
+ return None