hpc-runner: hpc_runner-0.1.1-py3-none-any.whl → hpc_runner-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. hpc_runner/_version.py +2 -2
  2. hpc_runner/cli/cancel.py +1 -1
  3. hpc_runner/cli/config.py +2 -2
  4. hpc_runner/cli/main.py +17 -13
  5. hpc_runner/cli/monitor.py +30 -0
  6. hpc_runner/cli/run.py +223 -67
  7. hpc_runner/cli/status.py +6 -5
  8. hpc_runner/core/__init__.py +30 -0
  9. hpc_runner/core/descriptors.py +87 -33
  10. hpc_runner/core/exceptions.py +9 -0
  11. hpc_runner/core/job.py +272 -93
  12. hpc_runner/core/job_info.py +104 -0
  13. hpc_runner/core/result.py +4 -0
  14. hpc_runner/schedulers/base.py +148 -30
  15. hpc_runner/schedulers/detection.py +22 -4
  16. hpc_runner/schedulers/local/scheduler.py +119 -2
  17. hpc_runner/schedulers/sge/args.py +161 -94
  18. hpc_runner/schedulers/sge/parser.py +106 -13
  19. hpc_runner/schedulers/sge/scheduler.py +727 -171
  20. hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
  21. hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
  22. hpc_runner/tui/__init__.py +5 -0
  23. hpc_runner/tui/app.py +436 -0
  24. hpc_runner/tui/components/__init__.py +17 -0
  25. hpc_runner/tui/components/detail_panel.py +187 -0
  26. hpc_runner/tui/components/filter_bar.py +174 -0
  27. hpc_runner/tui/components/filter_popup.py +345 -0
  28. hpc_runner/tui/components/job_table.py +260 -0
  29. hpc_runner/tui/providers/__init__.py +5 -0
  30. hpc_runner/tui/providers/jobs.py +197 -0
  31. hpc_runner/tui/screens/__init__.py +7 -0
  32. hpc_runner/tui/screens/confirm.py +67 -0
  33. hpc_runner/tui/screens/job_details.py +210 -0
  34. hpc_runner/tui/screens/log_viewer.py +170 -0
  35. hpc_runner/tui/snapshot.py +153 -0
  36. hpc_runner/tui/styles/monitor.tcss +567 -0
  37. hpc_runner-0.2.1.dist-info/METADATA +285 -0
  38. hpc_runner-0.2.1.dist-info/RECORD +56 -0
  39. hpc_runner/schedulers/sge/templates/job.sh.j2 +0 -39
  40. hpc_runner-0.1.1.dist-info/METADATA +0 -46
  41. hpc_runner-0.1.1.dist-info/RECORD +0 -38
  42. {hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/WHEEL +0 -0
  43. {hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/entry_points.txt +0 -0
@@ -5,26 +5,55 @@ from __future__ import annotations
5
5
  import os
6
6
  import subprocess
7
7
  import tempfile
8
+ import uuid
9
+ from datetime import datetime
8
10
  from pathlib import Path
9
11
  from typing import TYPE_CHECKING
10
12
 
11
13
  from hpc_runner.core.config import get_config
14
+
15
+
16
+ def get_script_dir() -> Path:
17
+ """Get directory for temporary job scripts.
18
+
19
+ Uses HPC_SCRIPT_DIR environment variable if set, otherwise
20
+ defaults to ~/.cache/hpc-runner/scripts/.
21
+
22
+ Returns:
23
+ Path to script directory (created if needed).
24
+ """
25
+ if env_dir := os.environ.get("HPC_SCRIPT_DIR"):
26
+ script_dir = Path(env_dir)
27
+ else:
28
+ script_dir = Path.home() / ".cache" / "hpc-runner" / "scripts"
29
+
30
+ script_dir.mkdir(parents=True, exist_ok=True)
31
+ return script_dir
32
+
33
+
34
+ from hpc_runner.core.job_info import JobInfo
12
35
  from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
13
36
  from hpc_runner.schedulers.base import BaseScheduler
14
37
  from hpc_runner.schedulers.sge.args import (
38
+ SGEArrayArg,
15
39
  SGECpuArg,
16
40
  SGECwdArg,
17
41
  SGEErrorArg,
42
+ SGEHoldArg,
43
+ SGEInheritEnvArg,
18
44
  SGEJobNameArg,
19
- SGEJoinOutputArg,
20
45
  SGEMemArg,
46
+ SGEMergeOutputArg,
21
47
  SGEOutputArg,
48
+ SGEPriorityArg,
22
49
  SGEQueueArg,
50
+ SGEShellArg,
23
51
  SGETimeArg,
24
52
  )
25
53
  from hpc_runner.schedulers.sge.parser import (
26
54
  parse_qacct_output,
27
55
  parse_qstat_plain,
56
+ parse_qstat_xml,
28
57
  parse_qsub_output,
29
58
  state_to_status,
30
59
  )
@@ -40,66 +69,316 @@ class SGEScheduler(BaseScheduler):
40
69
 
41
70
  name = "sge"
42
71
 
43
- # Descriptor-based argument definitions
44
- cpu_arg = SGECpuArg()
45
- mem_arg = SGEMemArg()
46
- time_arg = SGETimeArg()
47
- queue_arg = SGEQueueArg()
48
- job_name_arg = SGEJobNameArg()
49
- stdout_arg = SGEOutputArg()
50
- stderr_arg = SGEErrorArg()
51
- join_output_arg = SGEJoinOutputArg()
52
- cwd_arg = SGECwdArg()
53
-
54
72
  def __init__(self) -> None:
55
- # Load scheduler-specific config
73
+ """Initialize SGE scheduler with config-driven settings."""
56
74
  config = get_config()
57
75
  sge_config = config.get_scheduler_config("sge")
58
76
 
77
+ # Extract config values (also stored as attributes for testing/introspection)
59
78
  self.pe_name = sge_config.get("parallel_environment", "smp")
60
79
  self.mem_resource = sge_config.get("memory_resource", "mem_free")
61
80
  self.time_resource = sge_config.get("time_resource", "h_rt")
62
- self.merge_output_default = sge_config.get("merge_output", True)
63
81
 
64
- def submit(self, job: "Job", interactive: bool = False) -> JobResult:
65
- """Submit a job to SGE."""
82
+ # Module handling config
83
+ self.purge_modules = sge_config.get("purge_modules", False)
84
+ self.silent_modules = sge_config.get("silent_modules", False)
85
+ self.module_init_script = sge_config.get("module_init_script", "")
86
+
87
+ # Environment handling config
88
+ self.expand_makeflags = sge_config.get("expand_makeflags", True)
89
+ self.unset_vars = sge_config.get("unset_vars", [])
90
+
91
+ # Build the argument renderer registry
92
+ # Maps Job attribute names -> SGE argument renderer instances
93
+ # Note: 'nodes' and 'tasks' are NOT mapped - they're Slurm/MPI concepts.
94
+ # If a job has these set, they'll be silently ignored by SGE.
95
+ self.ARG_RENDERERS = {
96
+ # Basic attributes
97
+ "shell": SGEShellArg(),
98
+ "use_cwd": SGECwdArg(),
99
+ "inherit_env": SGEInheritEnvArg(),
100
+ "name": SGEJobNameArg(),
101
+ "queue": SGEQueueArg(),
102
+ "priority": SGEPriorityArg(),
103
+ "stdout": SGEOutputArg(),
104
+ "stderr": SGEErrorArg(),
105
+ # Resource attributes (config-driven)
106
+ "cpu": SGECpuArg(pe_name=self.pe_name),
107
+ "mem": SGEMemArg(resource_name=self.mem_resource),
108
+ "time": SGETimeArg(resource_name=self.time_resource),
109
+ }
110
+
111
+ # Keep references for special-case rendering
112
+ self._array_arg = SGEArrayArg()
113
+ self._hold_arg = SGEHoldArg()
114
+ self._merge_output_arg = SGEMergeOutputArg()
115
+
116
+ # =========================================================================
117
+ # Script Generation
118
+ # =========================================================================
119
+
120
+ def generate_script(
121
+ self,
122
+ job: "Job",
123
+ array_range: str | None = None,
124
+ keep_script: bool = False,
125
+ script_path: str | None = None,
126
+ ) -> str:
127
+ """Generate qsub script using template.
128
+
129
+ Args:
130
+ job: Job to generate script for.
131
+ array_range: Array job range string (e.g., "1-100").
132
+ keep_script: If True, script won't self-delete after execution.
133
+ script_path: Path where script will be written (for self-deletion).
134
+ """
135
+ directives = self._build_directives(job, array_range)
136
+ return render_template(
137
+ "sge/templates/batch.sh.j2",
138
+ job=job,
139
+ scheduler=self,
140
+ directives=directives,
141
+ script_path=script_path,
142
+ keep_script=keep_script,
143
+ )
144
+
145
+ def _build_directives(self, job: "Job", array_range: str | None = None) -> list[str]:
146
+ """Build complete list of #$ directives for the job.
147
+
148
+ Uses the rendering protocol from BaseScheduler, then adds
149
+ special cases that aren't simple attribute mappings.
150
+ """
151
+ directives: list[str] = []
152
+
153
+ # 1. Render standard attributes via protocol
154
+ directives.extend(self.render_directives(job))
155
+
156
+ # 2. Handle output merging (derived from stderr being None)
157
+ if job.merge_output:
158
+ if d := self._merge_output_arg.to_directive(True):
159
+ directives.append(d)
160
+
161
+ # 3. Array job range
162
+ if array_range:
163
+ if d := self._array_arg.to_directive(array_range):
164
+ directives.append(d)
165
+
166
+ # 4. Dependencies
167
+ dep_str = self._build_dependency_string(job)
168
+ if dep_str:
169
+ if d := self._hold_arg.to_directive(dep_str):
170
+ directives.append(d)
171
+
172
+ # 5. Custom resources (ResourceSet)
173
+ for resource in job.resources:
174
+ directives.append(f"#$ -l {resource.name}={resource.value}")
175
+
176
+ # 6. Raw passthrough arguments
177
+ for arg in job.raw_args + job.sge_args:
178
+ if arg.startswith("-"):
179
+ directives.append(f"#$ {arg}")
180
+ else:
181
+ directives.append(f"#$ -{arg}")
182
+
183
+ return directives
184
+
185
+ def _build_dependency_string(self, job: "Job") -> str | None:
186
+ """Build SGE dependency string from job dependencies."""
187
+ # String-based dependency from CLI
188
+ if job.dependency:
189
+ if ":" in job.dependency:
190
+ return job.dependency.split(":", 1)[1]
191
+ return job.dependency
192
+
193
+ # Programmatic dependencies from Job.after()
194
+ if job.dependencies:
195
+ return ",".join(dep.job_id for dep in job.dependencies)
196
+
197
+ return None
198
+
199
+ # =========================================================================
200
+ # Command Building
201
+ # =========================================================================
202
+
203
+ def build_submit_command(self, job: "Job") -> list[str]:
204
+ """Build qsub command line."""
205
+ cmd = ["qsub"]
206
+ cmd.extend(self.render_args(job))
207
+ cmd.extend(job.raw_args)
208
+ cmd.extend(job.sge_args)
209
+ return cmd
210
+
211
+ def build_interactive_command(self, job: "Job") -> list[str]:
212
+ """Build qrsh command for interactive jobs.
213
+
214
+ Note: qrsh supports a subset of qsub options. Notably:
215
+ - Does NOT support: -S (shell), -o/-e (output), -j (join), -N (name)
216
+ - Does support: -V, -pe, -l, -q, -cwd
217
+ """
218
+ import shlex
219
+
220
+ cmd = ["qrsh"]
221
+
222
+ # Only include qrsh-compatible options
223
+ QRSH_COMPATIBLE = {"inherit_env", "use_cwd", "cpu", "mem", "time", "queue"}
224
+
225
+ for attr_name, value in job.iter_attributes():
226
+ if attr_name not in QRSH_COMPATIBLE:
227
+ continue
228
+ renderer = self.ARG_RENDERERS.get(attr_name)
229
+ if renderer:
230
+ cmd.extend(renderer.to_args(value))
231
+
232
+ cmd.extend(job.raw_args)
233
+ cmd.extend(job.sge_args)
234
+
235
+ # Add the command - split it back into parts for proper argument handling
236
+ # This preserves quoting: "bash -c 'echo hello'" -> ['bash', '-c', 'echo hello']
237
+ cmd.extend(shlex.split(job.command))
238
+
239
+ return cmd
240
+
241
+ # =========================================================================
242
+ # Job Submission
243
+ # =========================================================================
244
+
245
+ def submit(
246
+ self, job: "Job", interactive: bool = False, keep_script: bool = False
247
+ ) -> JobResult:
248
+ """Submit a job to SGE.
249
+
250
+ Args:
251
+ job: Job to submit.
252
+ interactive: If True, run interactively via qrsh.
253
+ keep_script: If True, don't delete the job script after submission.
254
+ Useful for debugging.
255
+ """
66
256
  if interactive:
67
- return self._submit_interactive(job)
68
- return self._submit_batch(job)
257
+ return self._submit_interactive(job, keep_script=keep_script)
258
+ return self._submit_batch(job, keep_script=keep_script)
69
259
 
70
- def _submit_batch(self, job: "Job") -> JobResult:
260
+ def _submit_batch(self, job: "Job", keep_script: bool = False) -> JobResult:
71
261
  """Submit via qsub."""
72
- script = self.generate_script(job)
262
+ # Determine script path first (needed for self-deletion in template)
263
+ script_dir = get_script_dir()
264
+ script_name = f"hpc_batch_{uuid.uuid4().hex[:8]}.sh"
265
+ script_path = script_dir / script_name
266
+
267
+ # Generate script with cleanup instruction
268
+ script = self.generate_script(
269
+ job, keep_script=keep_script, script_path=str(script_path)
270
+ )
73
271
 
74
- with tempfile.NamedTemporaryFile(
75
- mode="w", suffix=".sh", delete=False, prefix="hpc_"
76
- ) as f:
77
- f.write(script)
78
- script_path = f.name
272
+ script_path.write_text(script)
273
+ script_path.chmod(0o755)
274
+
275
+ if keep_script:
276
+ import sys
277
+ print(f"Script saved: {script_path}", file=sys.stderr)
79
278
 
80
279
  try:
81
- cmd = ["qsub", script_path]
82
- result = subprocess.run(cmd, capture_output=True, text=True, check=True)
280
+ result = subprocess.run(
281
+ ["qsub", str(script_path)],
282
+ capture_output=True,
283
+ text=True,
284
+ errors="replace",
285
+ check=True,
286
+ )
83
287
  job_id = parse_qsub_output(result.stdout)
84
288
 
85
289
  if job_id is None:
86
- raise RuntimeError(f"Failed to parse job ID from qsub output: {result.stdout}")
290
+ raise RuntimeError(f"Failed to parse job ID: {result.stdout}")
87
291
 
88
292
  return JobResult(job_id=job_id, scheduler=self, job=job)
89
293
  finally:
90
- Path(script_path).unlink(missing_ok=True)
294
+ # Clean up locally after qsub (script is copied to spool)
295
+ # The script inside the job will also self-delete unless keep_script
296
+ if not keep_script:
297
+ script_path.unlink(missing_ok=True)
298
+
299
+ def _submit_interactive(self, job: "Job", keep_script: bool = False) -> JobResult:
300
+ """Submit via qrsh for interactive execution.
91
301
 
92
- def _submit_interactive(self, job: "Job") -> JobResult:
93
- """Submit via qrsh for interactive execution."""
94
- cmd = self.build_interactive_command(job)
302
+ Creates a wrapper script with full environment setup (modules, venv, etc.)
303
+ and executes it via qrsh. The script self-deletes after execution unless
304
+ keep_script is True.
305
+
306
+ Note: Script is written to ~/.cache/hpc-runner/scripts/ (shared filesystem)
307
+ rather than /tmp (which is node-local).
308
+ """
309
+ # Generate unique script path in shared script directory
310
+ script_dir = get_script_dir()
311
+ script_name = f"hpc_interactive_{uuid.uuid4().hex[:8]}.sh"
312
+ script_path = script_dir / script_name
313
+
314
+ # Generate wrapper script with the actual path (for self-deletion)
315
+ script = self._generate_interactive_script(
316
+ job, str(script_path), keep_script=keep_script
317
+ )
318
+
319
+ # Write script to shared filesystem
320
+ script_path.write_text(script)
321
+ script_path.chmod(0o755)
322
+
323
+ if keep_script:
324
+ # Print script path for debugging
325
+ import sys
326
+ print(f"Script saved: {script_path}", file=sys.stderr)
327
+
328
+ # Build qrsh command with script path
329
+ cmd = self._build_qrsh_command(job, str(script_path))
330
+
331
+ # Run and capture exit code
95
332
  result = subprocess.run(cmd, check=False)
96
- # For interactive jobs, we don't have a real job ID
97
- return JobResult(job_id="interactive", scheduler=self, job=job)
333
+
334
+ # Clean up if script still exists and we're not keeping it
335
+ if not keep_script:
336
+ script_path.unlink(missing_ok=True)
337
+
338
+ return JobResult(
339
+ job_id="interactive",
340
+ scheduler=self,
341
+ job=job,
342
+ _exit_code=result.returncode,
343
+ )
344
+
345
+ def _generate_interactive_script(
346
+ self, job: "Job", script_path: str, keep_script: bool = False
347
+ ) -> str:
348
+ """Generate wrapper script for interactive jobs."""
349
+ return render_template(
350
+ "sge/templates/interactive.sh.j2",
351
+ job=job,
352
+ scheduler=self,
353
+ script_path=script_path,
354
+ keep_script=keep_script,
355
+ )
356
+
357
+ def _build_qrsh_command(self, job: "Job", script_path: str) -> list[str]:
358
+ """Build qrsh command to run wrapper script."""
359
+ cmd = ["qrsh"]
360
+
361
+ # Only include qrsh-compatible options
362
+ QRSH_COMPATIBLE = {"inherit_env", "use_cwd", "cpu", "mem", "time", "queue"}
363
+
364
+ for attr_name, value in job.iter_attributes():
365
+ if attr_name not in QRSH_COMPATIBLE:
366
+ continue
367
+ renderer = self.ARG_RENDERERS.get(attr_name)
368
+ if renderer:
369
+ cmd.extend(renderer.to_args(value))
370
+
371
+ cmd.extend(job.raw_args)
372
+ cmd.extend(job.sge_args)
373
+
374
+ # Execute the wrapper script
375
+ cmd.append(script_path)
376
+
377
+ return cmd
98
378
 
99
379
  def submit_array(self, array: "JobArray") -> ArrayJobResult:
100
380
  """Submit array job."""
101
- job = array.job
102
- script = self.generate_script(job, array_range=array.range_str)
381
+ script = self.generate_script(array.job, array_range=array.range_str)
103
382
 
104
383
  with tempfile.NamedTemporaryFile(
105
384
  mode="w", suffix=".sh", delete=False, prefix="hpc_"
@@ -108,17 +387,25 @@ class SGEScheduler(BaseScheduler):
108
387
  script_path = f.name
109
388
 
110
389
  try:
111
- cmd = ["qsub", script_path]
112
- result = subprocess.run(cmd, capture_output=True, text=True, check=True)
390
+ result = subprocess.run(
391
+ ["qsub", script_path],
392
+ capture_output=True,
393
+ text=True,
394
+ check=True,
395
+ )
113
396
  job_id = parse_qsub_output(result.stdout)
114
397
 
115
398
  if job_id is None:
116
- raise RuntimeError(f"Failed to parse job ID from qsub output: {result.stdout}")
399
+ raise RuntimeError(f"Failed to parse job ID: {result.stdout}")
117
400
 
118
401
  return ArrayJobResult(base_job_id=job_id, scheduler=self, array=array)
119
402
  finally:
120
403
  Path(script_path).unlink(missing_ok=True)
121
404
 
405
+ # =========================================================================
406
+ # Job Management
407
+ # =========================================================================
408
+
122
409
  def cancel(self, job_id: str) -> bool:
123
410
  """Cancel a job via qdel."""
124
411
  try:
@@ -128,7 +415,7 @@ class SGEScheduler(BaseScheduler):
128
415
  return False
129
416
 
130
417
  def get_status(self, job_id: str) -> JobStatus:
131
- """Get job status via qstat."""
418
+ """Get job status via qstat/qacct."""
132
419
  # Try qstat first (running/pending jobs)
133
420
  try:
134
421
  result = subprocess.run(
@@ -137,30 +424,17 @@ class SGEScheduler(BaseScheduler):
137
424
  text=True,
138
425
  )
139
426
  if result.returncode == 0:
140
- # Job exists, check state from regular qstat
141
- result2 = subprocess.run(
142
- ["qstat"],
143
- capture_output=True,
144
- text=True,
145
- )
427
+ result2 = subprocess.run(["qstat"], capture_output=True, text=True)
146
428
  if result2.returncode == 0:
147
429
  jobs = parse_qstat_plain(result2.stdout)
148
- # Handle array job task IDs (e.g., 12345.1)
149
430
  base_id = job_id.split(".")[0]
150
431
  if base_id in jobs:
151
- state = jobs[base_id].get("state", "")
152
- return state_to_status(state)
153
- # Check if full ID matches
154
- if job_id in jobs:
155
- state = jobs[job_id].get("state", "")
156
- return state_to_status(state)
157
-
158
- # Job exists but not in qstat output - likely running
432
+ return state_to_status(jobs[base_id].get("state", ""))
159
433
  return JobStatus.RUNNING
160
434
  except subprocess.CalledProcessError:
161
435
  pass
162
436
 
163
- # Job not in qstat, check qacct for completed jobs
437
+ # Check qacct for completed jobs
164
438
  try:
165
439
  result = subprocess.run(
166
440
  ["qacct", "-j", job_id],
@@ -169,11 +443,9 @@ class SGEScheduler(BaseScheduler):
169
443
  )
170
444
  if result.returncode == 0:
171
445
  info = parse_qacct_output(result.stdout)
172
- exit_status = info.get("exit_status", "")
173
- if exit_status == "0":
446
+ if info.get("exit_status") == "0":
174
447
  return JobStatus.COMPLETED
175
- else:
176
- return JobStatus.FAILED
448
+ return JobStatus.FAILED
177
449
  except subprocess.CalledProcessError:
178
450
  pass
179
451
 
@@ -196,130 +468,414 @@ class SGEScheduler(BaseScheduler):
196
468
  pass
197
469
  return None
198
470
 
199
- def get_output_path(self, job_id: str, stream: str) -> Path | None:
200
- """Determine output path.
201
-
202
- SGE uses patterns that need to be resolved.
471
+ # =========================================================================
472
+ # TUI Monitor API
473
+ # =========================================================================
474
+
475
+ def list_active_jobs(
476
+ self,
477
+ user: str | None = None,
478
+ status: set[JobStatus] | None = None,
479
+ queue: str | None = None,
480
+ ) -> list[JobInfo]:
481
+ """List active SGE jobs using qstat -xml.
482
+
483
+ Args:
484
+ user: Filter by username. None = all users.
485
+ status: Filter by status set. None = all active statuses.
486
+ queue: Filter by queue name. None = all queues.
487
+
488
+ Returns:
489
+ List of JobInfo for matching active jobs.
203
490
  """
204
- # This is tricky with SGE as paths can use $JOB_ID, etc.
205
- # For now, return None and let user check
206
- return None
207
-
208
- def generate_script(self, job: "Job", array_range: str | None = None) -> str:
209
- """Generate qsub script using template."""
210
- directives = self._build_directives(job, array_range)
211
- return render_template(
212
- "sge/templates/job.sh.j2",
213
- job=job,
214
- scheduler=self,
215
- directives=directives,
216
- )
217
-
218
- def _build_directives(self, job: "Job", array_range: str | None = None) -> list[str]:
219
- """Build #$ directives."""
220
- directives: list[str] = []
221
-
222
- # Shell
223
- directives.append("#$ -S /bin/bash")
224
-
225
- # Use current working directory
226
- if job.workdir is None:
227
- directives.append("#$ -cwd")
491
+ # Build qstat command
492
+ cmd = ["qstat", "-xml"]
493
+ if user:
494
+ cmd.extend(["-u", user])
495
+ else:
496
+ # Show all users' jobs
497
+ cmd.extend(["-u", "*"])
228
498
 
229
- # Job name
230
- if job.name:
231
- directives.append(f"#$ -N {job.name}")
499
+ try:
500
+ result = subprocess.run(
501
+ cmd,
502
+ capture_output=True,
503
+ text=True,
504
+ errors="replace",
505
+ check=True,
506
+ )
507
+ except subprocess.CalledProcessError:
508
+ # qstat failed - likely no jobs or scheduler not available
509
+ return []
510
+ except FileNotFoundError:
511
+ # qstat not found
512
+ return []
513
+
514
+ # Parse XML output
515
+ parsed_jobs = parse_qstat_xml(result.stdout)
516
+
517
+ # Convert to JobInfo and apply filters
518
+ jobs: list[JobInfo] = []
519
+ for job_id, job_data in parsed_jobs.items():
520
+ # Convert state to JobStatus
521
+ state_str = job_data.get("state", "")
522
+ job_status = state_to_status(state_str)
523
+
524
+ # Apply status filter
525
+ if status is not None and job_status not in status:
526
+ continue
527
+
528
+ # Apply queue filter
529
+ job_queue = job_data.get("queue")
530
+ if queue is not None and job_queue != queue:
531
+ continue
532
+
533
+ # Build JobInfo
534
+ job_info = JobInfo(
535
+ job_id=job_id,
536
+ name=job_data.get("name", job_id),
537
+ user=job_data.get("user", "unknown"),
538
+ status=job_status,
539
+ queue=job_queue,
540
+ cpu=job_data.get("slots"),
541
+ node=job_data.get("node"),
542
+ )
232
543
 
233
- # CPU/slots via parallel environment
234
- if job.cpu:
235
- directives.append(f"#$ -pe {self.pe_name} {job.cpu}")
544
+ # Add timing info if available
545
+ if "submit_time" in job_data:
546
+ job_info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
547
+ if "start_time" in job_data:
548
+ job_info.start_time = datetime.fromtimestamp(job_data["start_time"])
549
+ # Calculate runtime for running jobs
550
+ if job_info.status == JobStatus.RUNNING:
551
+ job_info.runtime = datetime.now() - job_info.start_time
552
+
553
+ # Array task ID
554
+ if "array_task_id" in job_data:
555
+ try:
556
+ job_info.array_task_id = int(job_data["array_task_id"])
557
+ except ValueError:
558
+ pass # Could be a range like "1-10"
559
+
560
+ jobs.append(job_info)
561
+
562
+ return jobs
563
+
564
+ def list_completed_jobs(
565
+ self,
566
+ user: str | None = None,
567
+ since: datetime | None = None,
568
+ until: datetime | None = None,
569
+ exit_code: int | None = None,
570
+ queue: str | None = None,
571
+ limit: int = 100,
572
+ ) -> list[JobInfo]:
573
+ """List completed SGE jobs from qacct.
574
+
575
+ TODO: Implement using qacct.
576
+ """
577
+ raise NotImplementedError("SGE list_completed_jobs() not yet implemented")
236
578
 
237
- # Memory
238
- if job.mem:
239
- directives.append(f"#$ -l {self.mem_resource}={job.mem}")
579
+ def has_accounting(self) -> bool:
580
+ """Check if SGE accounting is available."""
581
+ return True
240
582
 
241
- # Time
242
- if job.time:
243
- directives.append(f"#$ -l {self.time_resource}={job.time}")
583
+ def get_job_details(self, job_id: str) -> tuple[JobInfo, dict[str, object]]:
584
+ """Get detailed information for an SGE job using qstat -j -xml.
244
585
 
245
- # Queue
246
- if job.queue:
247
- directives.append(f"#$ -q {job.queue}")
586
+ Parses the full job details including output paths, resources, etc.
248
587
 
249
- # Output handling - merge by default
250
- if job.merge_output:
251
- directives.append("#$ -j y")
252
- if job.stdout:
253
- directives.append(f"#$ -o {job.stdout}")
588
+ Returns:
589
+ Tuple of (JobInfo, extra_details dict).
590
+ The extra_details dict contains resources, pe_name, pe_range,
591
+ cwd, script_file, dependencies, project, department.
592
+ """
593
+ cmd = ["qstat", "-j", job_id, "-xml"]
594
+ try:
595
+ result = subprocess.run(
596
+ cmd,
597
+ capture_output=True,
598
+ text=True,
599
+ errors="replace",
600
+ check=True,
601
+ )
602
+ output = result.stdout
603
+ except subprocess.CalledProcessError as exc:
604
+ output = exc.stdout or exc.stderr or ""
605
+ if not output:
606
+ raise ValueError(f"Job {job_id} not found")
607
+ except FileNotFoundError:
608
+ raise RuntimeError("qstat not found")
609
+
610
+ # Parse XML output
611
+ job_data = self._parse_qstat_j_xml(output)
612
+ if not job_data and output:
613
+ raise ValueError(f"Job {job_id} not found")
614
+
615
+ # Separate extra details from JobInfo fields
616
+ extra_details: dict[str, object] = {}
617
+ for key in (
618
+ "resources",
619
+ "pe_name",
620
+ "pe_range",
621
+ "cwd",
622
+ "script_file",
623
+ "dependencies",
624
+ "project",
625
+ "department",
626
+ "job_args",
627
+ "command",
628
+ ):
629
+ if key in job_data:
630
+ extra_details[key] = job_data[key]
631
+
632
+ # Get basic info from qstat -xml first
633
+ basic_jobs = self.list_active_jobs()
634
+ basic_info = next((j for j in basic_jobs if j.job_id == job_id), None)
635
+
636
+ if basic_info:
637
+ # Merge detailed info with basic info
638
+ if job_data.get("stdout_path"):
639
+ basic_info.stdout_path = job_data["stdout_path"]
640
+ if job_data.get("stderr_path"):
641
+ basic_info.stderr_path = job_data["stderr_path"]
642
+ if job_data.get("node"):
643
+ basic_info.node = job_data["node"]
644
+
645
+ # Always use timing from detailed qstat -j output (more reliable)
646
+ if job_data.get("submit_time"):
647
+ basic_info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
648
+ if job_data.get("start_time"):
649
+ basic_info.start_time = datetime.fromtimestamp(job_data["start_time"])
650
+ # Calculate runtime if running
651
+ if basic_info.status == JobStatus.RUNNING:
652
+ basic_info.runtime = datetime.now() - basic_info.start_time
653
+
654
+ return basic_info, extra_details
254
655
  else:
255
- if job.stdout:
256
- directives.append(f"#$ -o {job.stdout}")
257
- if job.stderr:
258
- directives.append(f"#$ -e {job.stderr}")
259
-
260
- # Array job
261
- if array_range:
262
- directives.append(f"#$ -t {array_range}")
263
-
264
- # Resources (GRES-style)
265
- for resource in job.resources:
266
- directives.append(f"#$ -l {resource.name}={resource.value}")
267
-
268
- # Dependencies
269
- if job.dependencies:
270
- dep_ids = ",".join(dep.job_id for dep in job.dependencies)
271
- # SGE uses -hold_jid for dependencies
272
- directives.append(f"#$ -hold_jid {dep_ids}")
273
-
274
- # Raw args
275
- for arg in job.raw_args + job.sge_args:
276
- if arg.startswith("-"):
277
- directives.append(f"#$ {arg}")
656
+ # Build from scratch using qstat -j data
657
+ job_info = JobInfo(
658
+ job_id=job_id,
659
+ name=job_data.get("name", job_id),
660
+ user=job_data.get("user", "unknown"),
661
+ status=job_data.get("status", JobStatus.UNKNOWN),
662
+ queue=job_data.get("queue"),
663
+ stdout_path=job_data.get("stdout_path"),
664
+ stderr_path=job_data.get("stderr_path"),
665
+ node=job_data.get("node"),
666
+ )
667
+ # Add timing info
668
+ if job_data.get("submit_time"):
669
+ job_info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
670
+ if job_data.get("start_time"):
671
+ job_info.start_time = datetime.fromtimestamp(job_data["start_time"])
672
+ if job_info.status == JobStatus.RUNNING:
673
+ job_info.runtime = datetime.now() - job_info.start_time
674
+ return job_info, extra_details
675
+
676
+ def _parse_qstat_j_xml(self, xml_output: str) -> dict[str, object]:
677
+ """Parse qstat -j -xml output to extract job details.
678
+
679
+ Returns a dict with:
680
+ - Basic: name, user, stdout_path, stderr_path
681
+ - Resources: dict of resource_name -> value
682
+ - PE: pe_name, pe_range
683
+ - Paths: cwd, script_file
684
+ - Dependencies: list of job IDs
685
+ - Other: project, department
686
+ """
687
+ import xml.etree.ElementTree as ET
688
+
689
+ data: dict[str, object] = {}
690
+
691
+ root = self._parse_xml_root(xml_output)
692
+ if root is None:
693
+ return data
694
+ self._strip_xml_namespaces(root)
695
+
696
+ # Find job info element
697
+ job_info = root.find(".//JB_job_number/..")
698
+ if job_info is None:
699
+ # Try alternative structure
700
+ job_info = root.find(".//djob_info/element")
701
+ if job_info is None:
702
+ return data
703
+
704
+ # Extract basic fields
705
+ name_elem = job_info.find(".//JB_job_name")
706
+ if name_elem is not None and name_elem.text:
707
+ data["name"] = name_elem.text
708
+
709
+ owner_elem = job_info.find(".//JB_owner")
710
+ if owner_elem is not None and owner_elem.text:
711
+ data["user"] = owner_elem.text
712
+
713
+ # Project and department
714
+ project_elem = job_info.find(".//JB_project")
715
+ if project_elem is not None and project_elem.text:
716
+ data["project"] = project_elem.text
717
+
718
+ dept_elem = job_info.find(".//JB_department")
719
+ if dept_elem is not None and dept_elem.text:
720
+ data["department"] = dept_elem.text
721
+
722
+ # Get cwd for resolving relative paths
723
+ cwd: Path | None = None
724
+ cwd_elem = job_info.find(".//JB_cwd")
725
+ if cwd_elem is not None and cwd_elem.text:
726
+ cwd = Path(cwd_elem.text)
727
+ data["cwd"] = str(cwd)
728
+
729
+ # Script file
730
+ script_elem = job_info.find(".//JB_script_file")
731
+ if script_elem is not None and script_elem.text:
732
+ data["script_file"] = script_elem.text
733
+
734
+ # Job arguments/command
735
+ job_args: list[str] = []
736
+ for arg_elem in job_info.findall(".//JB_job_args//ST_name"):
737
+ if arg_elem.text:
738
+ job_args.append(arg_elem.text)
739
+ if job_args:
740
+ data["job_args"] = job_args
741
+
742
+ # Submission time
743
+ submit_text = job_info.findtext(".//JB_submission_time")
744
+ if submit_text:
745
+ try:
746
+ data["submit_time"] = int(submit_text)
747
+ except ValueError:
748
+ pass
749
+
750
+ # Start time (for running jobs) - in JB_ja_tasks/ulong_sublist/JAT_start_time
751
+ task_start_text = job_info.findtext(
752
+ ".//JB_ja_tasks/ulong_sublist/JAT_start_time"
753
+ )
754
+ if task_start_text:
755
+ try:
756
+ data["start_time"] = int(task_start_text)
757
+ except ValueError:
758
+ pass
759
+
760
+ # Also check direct JAT_start_time (alternative structure)
761
+ if "start_time" not in data:
762
+ start_text = job_info.findtext(".//JAT_start_time")
763
+ if start_text:
764
+ try:
765
+ data["start_time"] = int(start_text)
766
+ except ValueError:
767
+ pass
768
+
769
+ # For interactive jobs (qrsh), get command from QRSH_COMMAND env var
770
+ for env_elem in job_info.findall(".//JB_env_list/job_sublist"):
771
+ var_elem = env_elem.find("VA_variable")
772
+ val_elem = env_elem.find("VA_value")
773
+ if var_elem is not None and var_elem.text == "QRSH_COMMAND":
774
+ if val_elem is not None and val_elem.text:
775
+ data["command"] = self._normalize_qrsh_command(val_elem.text)
776
+ break
777
+
778
+ # stdout path - look for PN_path in stdout_path_list
779
+ stdout_path_elem = job_info.find(".//JB_stdout_path_list//PN_path")
780
+ if stdout_path_elem is not None and stdout_path_elem.text:
781
+ stdout_path = Path(stdout_path_elem.text)
782
+ # Resolve relative paths against cwd
783
+ if not stdout_path.is_absolute() and cwd:
784
+ stdout_path = cwd / stdout_path
785
+ data["stdout_path"] = stdout_path
786
+
787
+ # stderr path
788
+ stderr_path_elem = job_info.find(".//JB_stderr_path_list//PN_path")
789
+ if stderr_path_elem is not None and stderr_path_elem.text:
790
+ stderr_path = Path(stderr_path_elem.text)
791
+ if not stderr_path.is_absolute() and cwd:
792
+ stderr_path = cwd / stderr_path
793
+ data["stderr_path"] = stderr_path
794
+
795
+ # Check merge flag
796
+ merge_elem = job_info.find(".//JB_merge_stderr")
797
+ if merge_elem is not None and merge_elem.text:
798
+ if merge_elem.text.lower() in ("true", "1", "y"):
799
+ data["merge"] = True
800
+
801
+ # If merge is enabled and we have stdout but no stderr, use stdout for both
802
+ if data.get("merge") and data.get("stdout_path") and not data.get("stderr_path"):
803
+ data["stderr_path"] = data["stdout_path"]
804
+
805
+ # Parse hard resource list
806
+ resources: dict[str, str] = {}
807
+ for qstat_elem in job_info.findall(".//JB_hard_resource_list/qstat_l_requests"):
808
+ res_name_elem = qstat_elem.find("CE_name")
809
+ res_val_elem = qstat_elem.find("CE_stringval")
810
+ if res_name_elem is not None and res_name_elem.text:
811
+ res_name = res_name_elem.text
812
+ res_val = res_val_elem.text if res_val_elem is not None else ""
813
+ resources[res_name] = res_val or ""
814
+
815
+ # Also check soft resources
816
+ for qstat_elem in job_info.findall(".//JB_soft_resource_list/qstat_l_requests"):
817
+ res_name_elem = qstat_elem.find("CE_name")
818
+ res_val_elem = qstat_elem.find("CE_stringval")
819
+ if res_name_elem is not None and res_name_elem.text:
820
+ res_name = res_name_elem.text
821
+ res_val = res_val_elem.text if res_val_elem is not None else ""
822
+ resources[f"{res_name} (soft)"] = res_val or ""
823
+
824
+ if resources:
825
+ data["resources"] = resources
826
+
827
+ # Parallel environment
828
+ pe_elem = job_info.find(".//JB_pe")
829
+ if pe_elem is not None and pe_elem.text:
830
+ data["pe_name"] = pe_elem.text
831
+
832
+ # PE range (min-max slots)
833
+ pe_range_min = job_info.find(".//JB_pe_range//RN_min")
834
+ pe_range_max = job_info.find(".//JB_pe_range//RN_max")
835
+ if pe_range_min is not None and pe_range_max is not None:
836
+ min_val = pe_range_min.text or "1"
837
+ max_val = pe_range_max.text or "1"
838
+ if min_val == max_val:
839
+ data["pe_range"] = min_val
278
840
  else:
279
- directives.append(f"#$ -{arg}")
841
+ data["pe_range"] = f"{min_val}-{max_val}"
280
842
 
281
- return directives
282
-
283
- def build_submit_command(self, job: "Job") -> list[str]:
284
- """Build qsub command line."""
285
- cmd = ["qsub"]
286
-
287
- if job.name:
288
- cmd.extend(["-N", job.name])
289
- if job.cpu:
290
- cmd.extend(["-pe", self.pe_name, str(job.cpu)])
291
- if job.mem:
292
- cmd.extend(["-l", f"{self.mem_resource}={job.mem}"])
293
- if job.time:
294
- cmd.extend(["-l", f"{self.time_resource}={job.time}"])
295
- if job.queue:
296
- cmd.extend(["-q", job.queue])
297
-
298
- cmd.extend(job.raw_args)
299
- cmd.extend(job.sge_args)
843
+ # Dependencies (predecessor jobs)
844
+ dependencies: list[str] = []
845
+ for dep_elem in job_info.findall(".//JB_jid_predecessor_list//JRE_job_number"):
846
+ if dep_elem.text:
847
+ dependencies.append(dep_elem.text)
848
+ if dependencies:
849
+ data["dependencies"] = dependencies
300
850
 
301
- return cmd
851
+ return data
302
852
 
303
- def build_interactive_command(self, job: "Job") -> list[str]:
304
- """Build qrsh command for interactive jobs."""
305
- cmd = ["qrsh"]
853
+ def _strip_xml_namespaces(self, root: "ET.Element") -> None:
854
+ """Strip namespaces so ElementTree can match tag names directly."""
855
+ import xml.etree.ElementTree as ET
306
856
 
307
- if job.cpu:
308
- cmd.extend(["-pe", self.pe_name, str(job.cpu)])
309
- if job.mem:
310
- cmd.extend(["-l", f"{self.mem_resource}={job.mem}"])
311
- if job.time:
312
- cmd.extend(["-l", f"{self.time_resource}={job.time}"])
313
- if job.queue:
314
- cmd.extend(["-q", job.queue])
857
+ for elem in root.iter():
858
+ if isinstance(elem.tag, str) and "}" in elem.tag:
859
+ elem.tag = elem.tag.split("}", 1)[1]
315
860
 
316
- cmd.extend(job.raw_args)
317
- cmd.extend(job.sge_args)
861
+ def _normalize_qrsh_command(self, value: str) -> str:
862
+ """Normalize QRSH_COMMAND by replacing non-ASCII separators with spaces."""
863
+ cleaned = "".join(ch if 32 <= ord(ch) < 127 else " " for ch in value)
864
+ return " ".join(cleaned.split())
318
865
 
319
- # Add the command
320
- if isinstance(job.command, str):
321
- cmd.append(job.command)
322
- else:
323
- cmd.extend(job.command)
866
+ def _parse_xml_root(self, xml_output: str) -> "ET.Element | None":
867
+ """Parse XML output, tolerating leading/trailing non-XML noise."""
868
+ import xml.etree.ElementTree as ET
324
869
 
325
- return cmd
870
+ try:
871
+ return ET.fromstring(xml_output)
872
+ except ET.ParseError:
873
+ pass
874
+ start = xml_output.find("<")
875
+ end = xml_output.rfind(">")
876
+ if start == -1 or end == -1 or end <= start:
877
+ return None
878
+ try:
879
+ return ET.fromstring(xml_output[start : end + 1])
880
+ except ET.ParseError:
881
+ return None