hpc-runner 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. hpc_runner/__init__.py +57 -0
  2. hpc_runner/_version.py +34 -0
  3. hpc_runner/cli/__init__.py +1 -0
  4. hpc_runner/cli/cancel.py +38 -0
  5. hpc_runner/cli/config.py +109 -0
  6. hpc_runner/cli/main.py +76 -0
  7. hpc_runner/cli/monitor.py +30 -0
  8. hpc_runner/cli/run.py +292 -0
  9. hpc_runner/cli/status.py +66 -0
  10. hpc_runner/core/__init__.py +31 -0
  11. hpc_runner/core/config.py +177 -0
  12. hpc_runner/core/descriptors.py +110 -0
  13. hpc_runner/core/exceptions.py +38 -0
  14. hpc_runner/core/job.py +328 -0
  15. hpc_runner/core/job_array.py +58 -0
  16. hpc_runner/core/job_info.py +104 -0
  17. hpc_runner/core/resources.py +49 -0
  18. hpc_runner/core/result.py +161 -0
  19. hpc_runner/core/types.py +13 -0
  20. hpc_runner/py.typed +0 -0
  21. hpc_runner/schedulers/__init__.py +60 -0
  22. hpc_runner/schedulers/base.py +194 -0
  23. hpc_runner/schedulers/detection.py +52 -0
  24. hpc_runner/schedulers/local/__init__.py +5 -0
  25. hpc_runner/schedulers/local/scheduler.py +354 -0
  26. hpc_runner/schedulers/local/templates/job.sh.j2 +28 -0
  27. hpc_runner/schedulers/sge/__init__.py +5 -0
  28. hpc_runner/schedulers/sge/args.py +232 -0
  29. hpc_runner/schedulers/sge/parser.py +287 -0
  30. hpc_runner/schedulers/sge/scheduler.py +881 -0
  31. hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
  32. hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
  33. hpc_runner/templates/__init__.py +5 -0
  34. hpc_runner/templates/engine.py +55 -0
  35. hpc_runner/tui/__init__.py +5 -0
  36. hpc_runner/tui/app.py +436 -0
  37. hpc_runner/tui/components/__init__.py +17 -0
  38. hpc_runner/tui/components/detail_panel.py +187 -0
  39. hpc_runner/tui/components/filter_bar.py +174 -0
  40. hpc_runner/tui/components/filter_popup.py +345 -0
  41. hpc_runner/tui/components/job_table.py +260 -0
  42. hpc_runner/tui/providers/__init__.py +5 -0
  43. hpc_runner/tui/providers/jobs.py +197 -0
  44. hpc_runner/tui/screens/__init__.py +7 -0
  45. hpc_runner/tui/screens/confirm.py +67 -0
  46. hpc_runner/tui/screens/job_details.py +210 -0
  47. hpc_runner/tui/screens/log_viewer.py +170 -0
  48. hpc_runner/tui/snapshot.py +153 -0
  49. hpc_runner/tui/styles/monitor.tcss +567 -0
  50. hpc_runner/workflow/__init__.py +6 -0
  51. hpc_runner/workflow/dependency.py +20 -0
  52. hpc_runner/workflow/pipeline.py +180 -0
  53. hpc_runner-0.2.0.dist-info/METADATA +285 -0
  54. hpc_runner-0.2.0.dist-info/RECORD +56 -0
  55. hpc_runner-0.2.0.dist-info/WHEEL +4 -0
  56. hpc_runner-0.2.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,881 @@
1
+ """SGE scheduler implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import subprocess
7
+ import tempfile
8
+ import uuid
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING
12
+
13
+ from hpc_runner.core.config import get_config
14
+
15
+
16
def get_script_dir() -> Path:
    """Return the directory for temporary job scripts, creating it if needed.

    The ``HPC_SCRIPT_DIR`` environment variable overrides the default of
    ``~/.cache/hpc-runner/scripts/``.

    Returns:
        Path to the (existing) script directory.
    """
    override = os.environ.get("HPC_SCRIPT_DIR")
    target = (
        Path(override)
        if override
        else Path.home() / ".cache" / "hpc-runner" / "scripts"
    )
    target.mkdir(parents=True, exist_ok=True)
    return target
32
+
33
+
34
+ from hpc_runner.core.job_info import JobInfo
35
+ from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
36
+ from hpc_runner.schedulers.base import BaseScheduler
37
+ from hpc_runner.schedulers.sge.args import (
38
+ SGEArrayArg,
39
+ SGECpuArg,
40
+ SGECwdArg,
41
+ SGEErrorArg,
42
+ SGEHoldArg,
43
+ SGEInheritEnvArg,
44
+ SGEJobNameArg,
45
+ SGEMemArg,
46
+ SGEMergeOutputArg,
47
+ SGEOutputArg,
48
+ SGEPriorityArg,
49
+ SGEQueueArg,
50
+ SGEShellArg,
51
+ SGETimeArg,
52
+ )
53
+ from hpc_runner.schedulers.sge.parser import (
54
+ parse_qacct_output,
55
+ parse_qstat_plain,
56
+ parse_qstat_xml,
57
+ parse_qsub_output,
58
+ state_to_status,
59
+ )
60
+ from hpc_runner.templates import render_template
61
+
62
+ if TYPE_CHECKING:
63
+ from hpc_runner.core.job import Job
64
+ from hpc_runner.core.job_array import JobArray
65
+
66
+
67
class SGEScheduler(BaseScheduler):
    """Sun Grid Engine scheduler implementation."""

    name = "sge"

    def __init__(self) -> None:
        """Initialize SGE scheduler with config-driven settings."""
        cfg = get_config().get_scheduler_config("sge")

        # Resource naming - kept as public attributes for testing and
        # introspection.
        self.pe_name = cfg.get("parallel_environment", "smp")
        self.mem_resource = cfg.get("memory_resource", "mem_free")
        self.time_resource = cfg.get("time_resource", "h_rt")

        # Environment-module handling.
        self.purge_modules = cfg.get("purge_modules", False)
        self.silent_modules = cfg.get("silent_modules", False)
        self.module_init_script = cfg.get("module_init_script", "")

        # Shell-environment handling.
        self.expand_makeflags = cfg.get("expand_makeflags", True)
        self.unset_vars = cfg.get("unset_vars", [])

        # Registry mapping Job attribute names to SGE argument renderers.
        # 'nodes' and 'tasks' are deliberately absent: they are Slurm/MPI
        # concepts, so a job that sets them is silently ignored by SGE.
        self.ARG_RENDERERS = {
            # Basic attributes.
            "shell": SGEShellArg(),
            "use_cwd": SGECwdArg(),
            "inherit_env": SGEInheritEnvArg(),
            "name": SGEJobNameArg(),
            "queue": SGEQueueArg(),
            "priority": SGEPriorityArg(),
            "stdout": SGEOutputArg(),
            "stderr": SGEErrorArg(),
            # Resource attributes use the config-driven names above.
            "cpu": SGECpuArg(pe_name=self.pe_name),
            "mem": SGEMemArg(resource_name=self.mem_resource),
            "time": SGETimeArg(resource_name=self.time_resource),
        }

        # Renderers for cases that are not plain attribute mappings.
        self._array_arg = SGEArrayArg()
        self._hold_arg = SGEHoldArg()
        self._merge_output_arg = SGEMergeOutputArg()
115
+
116
+ # =========================================================================
117
+ # Script Generation
118
+ # =========================================================================
119
+
120
+ def generate_script(
121
+ self,
122
+ job: "Job",
123
+ array_range: str | None = None,
124
+ keep_script: bool = False,
125
+ script_path: str | None = None,
126
+ ) -> str:
127
+ """Generate qsub script using template.
128
+
129
+ Args:
130
+ job: Job to generate script for.
131
+ array_range: Array job range string (e.g., "1-100").
132
+ keep_script: If True, script won't self-delete after execution.
133
+ script_path: Path where script will be written (for self-deletion).
134
+ """
135
+ directives = self._build_directives(job, array_range)
136
+ return render_template(
137
+ "sge/templates/batch.sh.j2",
138
+ job=job,
139
+ scheduler=self,
140
+ directives=directives,
141
+ script_path=script_path,
142
+ keep_script=keep_script,
143
+ )
144
+
145
+ def _build_directives(self, job: "Job", array_range: str | None = None) -> list[str]:
146
+ """Build complete list of #$ directives for the job.
147
+
148
+ Uses the rendering protocol from BaseScheduler, then adds
149
+ special cases that aren't simple attribute mappings.
150
+ """
151
+ directives: list[str] = []
152
+
153
+ # 1. Render standard attributes via protocol
154
+ directives.extend(self.render_directives(job))
155
+
156
+ # 2. Handle output merging (derived from stderr being None)
157
+ if job.merge_output:
158
+ if d := self._merge_output_arg.to_directive(True):
159
+ directives.append(d)
160
+
161
+ # 3. Array job range
162
+ if array_range:
163
+ if d := self._array_arg.to_directive(array_range):
164
+ directives.append(d)
165
+
166
+ # 4. Dependencies
167
+ dep_str = self._build_dependency_string(job)
168
+ if dep_str:
169
+ if d := self._hold_arg.to_directive(dep_str):
170
+ directives.append(d)
171
+
172
+ # 5. Custom resources (ResourceSet)
173
+ for resource in job.resources:
174
+ directives.append(f"#$ -l {resource.name}={resource.value}")
175
+
176
+ # 6. Raw passthrough arguments
177
+ for arg in job.raw_args + job.sge_args:
178
+ if arg.startswith("-"):
179
+ directives.append(f"#$ {arg}")
180
+ else:
181
+ directives.append(f"#$ -{arg}")
182
+
183
+ return directives
184
+
185
+ def _build_dependency_string(self, job: "Job") -> str | None:
186
+ """Build SGE dependency string from job dependencies."""
187
+ # String-based dependency from CLI
188
+ if job.dependency:
189
+ if ":" in job.dependency:
190
+ return job.dependency.split(":", 1)[1]
191
+ return job.dependency
192
+
193
+ # Programmatic dependencies from Job.after()
194
+ if job.dependencies:
195
+ return ",".join(dep.job_id for dep in job.dependencies)
196
+
197
+ return None
198
+
199
+ # =========================================================================
200
+ # Command Building
201
+ # =========================================================================
202
+
203
+ def build_submit_command(self, job: "Job") -> list[str]:
204
+ """Build qsub command line."""
205
+ cmd = ["qsub"]
206
+ cmd.extend(self.render_args(job))
207
+ cmd.extend(job.raw_args)
208
+ cmd.extend(job.sge_args)
209
+ return cmd
210
+
211
+ def build_interactive_command(self, job: "Job") -> list[str]:
212
+ """Build qrsh command for interactive jobs.
213
+
214
+ Note: qrsh supports a subset of qsub options. Notably:
215
+ - Does NOT support: -S (shell), -o/-e (output), -j (join), -N (name)
216
+ - Does support: -V, -pe, -l, -q, -cwd
217
+ """
218
+ import shlex
219
+
220
+ cmd = ["qrsh"]
221
+
222
+ # Only include qrsh-compatible options
223
+ QRSH_COMPATIBLE = {"inherit_env", "use_cwd", "cpu", "mem", "time", "queue"}
224
+
225
+ for attr_name, value in job.iter_attributes():
226
+ if attr_name not in QRSH_COMPATIBLE:
227
+ continue
228
+ renderer = self.ARG_RENDERERS.get(attr_name)
229
+ if renderer:
230
+ cmd.extend(renderer.to_args(value))
231
+
232
+ cmd.extend(job.raw_args)
233
+ cmd.extend(job.sge_args)
234
+
235
+ # Add the command - split it back into parts for proper argument handling
236
+ # This preserves quoting: "bash -c 'echo hello'" -> ['bash', '-c', 'echo hello']
237
+ cmd.extend(shlex.split(job.command))
238
+
239
+ return cmd
240
+
241
+ # =========================================================================
242
+ # Job Submission
243
+ # =========================================================================
244
+
245
+ def submit(
246
+ self, job: "Job", interactive: bool = False, keep_script: bool = False
247
+ ) -> JobResult:
248
+ """Submit a job to SGE.
249
+
250
+ Args:
251
+ job: Job to submit.
252
+ interactive: If True, run interactively via qrsh.
253
+ keep_script: If True, don't delete the job script after submission.
254
+ Useful for debugging.
255
+ """
256
+ if interactive:
257
+ return self._submit_interactive(job, keep_script=keep_script)
258
+ return self._submit_batch(job, keep_script=keep_script)
259
+
260
+ def _submit_batch(self, job: "Job", keep_script: bool = False) -> JobResult:
261
+ """Submit via qsub."""
262
+ # Determine script path first (needed for self-deletion in template)
263
+ script_dir = get_script_dir()
264
+ script_name = f"hpc_batch_{uuid.uuid4().hex[:8]}.sh"
265
+ script_path = script_dir / script_name
266
+
267
+ # Generate script with cleanup instruction
268
+ script = self.generate_script(
269
+ job, keep_script=keep_script, script_path=str(script_path)
270
+ )
271
+
272
+ script_path.write_text(script)
273
+ script_path.chmod(0o755)
274
+
275
+ if keep_script:
276
+ import sys
277
+ print(f"Script saved: {script_path}", file=sys.stderr)
278
+
279
+ try:
280
+ result = subprocess.run(
281
+ ["qsub", str(script_path)],
282
+ capture_output=True,
283
+ text=True,
284
+ errors="replace",
285
+ check=True,
286
+ )
287
+ job_id = parse_qsub_output(result.stdout)
288
+
289
+ if job_id is None:
290
+ raise RuntimeError(f"Failed to parse job ID: {result.stdout}")
291
+
292
+ return JobResult(job_id=job_id, scheduler=self, job=job)
293
+ finally:
294
+ # Clean up locally after qsub (script is copied to spool)
295
+ # The script inside the job will also self-delete unless keep_script
296
+ if not keep_script:
297
+ script_path.unlink(missing_ok=True)
298
+
299
+ def _submit_interactive(self, job: "Job", keep_script: bool = False) -> JobResult:
300
+ """Submit via qrsh for interactive execution.
301
+
302
+ Creates a wrapper script with full environment setup (modules, venv, etc.)
303
+ and executes it via qrsh. The script self-deletes after execution unless
304
+ keep_script is True.
305
+
306
+ Note: Script is written to ~/.cache/hpc-runner/scripts/ (shared filesystem)
307
+ rather than /tmp (which is node-local).
308
+ """
309
+ # Generate unique script path in shared script directory
310
+ script_dir = get_script_dir()
311
+ script_name = f"hpc_interactive_{uuid.uuid4().hex[:8]}.sh"
312
+ script_path = script_dir / script_name
313
+
314
+ # Generate wrapper script with the actual path (for self-deletion)
315
+ script = self._generate_interactive_script(
316
+ job, str(script_path), keep_script=keep_script
317
+ )
318
+
319
+ # Write script to shared filesystem
320
+ script_path.write_text(script)
321
+ script_path.chmod(0o755)
322
+
323
+ if keep_script:
324
+ # Print script path for debugging
325
+ import sys
326
+ print(f"Script saved: {script_path}", file=sys.stderr)
327
+
328
+ # Build qrsh command with script path
329
+ cmd = self._build_qrsh_command(job, str(script_path))
330
+
331
+ # Run and capture exit code
332
+ result = subprocess.run(cmd, check=False)
333
+
334
+ # Clean up if script still exists and we're not keeping it
335
+ if not keep_script:
336
+ script_path.unlink(missing_ok=True)
337
+
338
+ return JobResult(
339
+ job_id="interactive",
340
+ scheduler=self,
341
+ job=job,
342
+ _exit_code=result.returncode,
343
+ )
344
+
345
+ def _generate_interactive_script(
346
+ self, job: "Job", script_path: str, keep_script: bool = False
347
+ ) -> str:
348
+ """Generate wrapper script for interactive jobs."""
349
+ return render_template(
350
+ "sge/templates/interactive.sh.j2",
351
+ job=job,
352
+ scheduler=self,
353
+ script_path=script_path,
354
+ keep_script=keep_script,
355
+ )
356
+
357
+ def _build_qrsh_command(self, job: "Job", script_path: str) -> list[str]:
358
+ """Build qrsh command to run wrapper script."""
359
+ cmd = ["qrsh"]
360
+
361
+ # Only include qrsh-compatible options
362
+ QRSH_COMPATIBLE = {"inherit_env", "use_cwd", "cpu", "mem", "time", "queue"}
363
+
364
+ for attr_name, value in job.iter_attributes():
365
+ if attr_name not in QRSH_COMPATIBLE:
366
+ continue
367
+ renderer = self.ARG_RENDERERS.get(attr_name)
368
+ if renderer:
369
+ cmd.extend(renderer.to_args(value))
370
+
371
+ cmd.extend(job.raw_args)
372
+ cmd.extend(job.sge_args)
373
+
374
+ # Execute the wrapper script
375
+ cmd.append(script_path)
376
+
377
+ return cmd
378
+
379
+ def submit_array(self, array: "JobArray") -> ArrayJobResult:
380
+ """Submit array job."""
381
+ script = self.generate_script(array.job, array_range=array.range_str)
382
+
383
+ with tempfile.NamedTemporaryFile(
384
+ mode="w", suffix=".sh", delete=False, prefix="hpc_"
385
+ ) as f:
386
+ f.write(script)
387
+ script_path = f.name
388
+
389
+ try:
390
+ result = subprocess.run(
391
+ ["qsub", script_path],
392
+ capture_output=True,
393
+ text=True,
394
+ check=True,
395
+ )
396
+ job_id = parse_qsub_output(result.stdout)
397
+
398
+ if job_id is None:
399
+ raise RuntimeError(f"Failed to parse job ID: {result.stdout}")
400
+
401
+ return ArrayJobResult(base_job_id=job_id, scheduler=self, array=array)
402
+ finally:
403
+ Path(script_path).unlink(missing_ok=True)
404
+
405
+ # =========================================================================
406
+ # Job Management
407
+ # =========================================================================
408
+
409
+ def cancel(self, job_id: str) -> bool:
410
+ """Cancel a job via qdel."""
411
+ try:
412
+ subprocess.run(["qdel", job_id], check=True, capture_output=True)
413
+ return True
414
+ except subprocess.CalledProcessError:
415
+ return False
416
+
417
+ def get_status(self, job_id: str) -> JobStatus:
418
+ """Get job status via qstat/qacct."""
419
+ # Try qstat first (running/pending jobs)
420
+ try:
421
+ result = subprocess.run(
422
+ ["qstat", "-j", job_id],
423
+ capture_output=True,
424
+ text=True,
425
+ )
426
+ if result.returncode == 0:
427
+ result2 = subprocess.run(["qstat"], capture_output=True, text=True)
428
+ if result2.returncode == 0:
429
+ jobs = parse_qstat_plain(result2.stdout)
430
+ base_id = job_id.split(".")[0]
431
+ if base_id in jobs:
432
+ return state_to_status(jobs[base_id].get("state", ""))
433
+ return JobStatus.RUNNING
434
+ except subprocess.CalledProcessError:
435
+ pass
436
+
437
+ # Check qacct for completed jobs
438
+ try:
439
+ result = subprocess.run(
440
+ ["qacct", "-j", job_id],
441
+ capture_output=True,
442
+ text=True,
443
+ )
444
+ if result.returncode == 0:
445
+ info = parse_qacct_output(result.stdout)
446
+ if info.get("exit_status") == "0":
447
+ return JobStatus.COMPLETED
448
+ return JobStatus.FAILED
449
+ except subprocess.CalledProcessError:
450
+ pass
451
+
452
+ return JobStatus.UNKNOWN
453
+
454
+ def get_exit_code(self, job_id: str) -> int | None:
455
+ """Get exit code from qacct."""
456
+ try:
457
+ result = subprocess.run(
458
+ ["qacct", "-j", job_id],
459
+ capture_output=True,
460
+ text=True,
461
+ )
462
+ if result.returncode == 0:
463
+ info = parse_qacct_output(result.stdout)
464
+ exit_status = info.get("exit_status")
465
+ if exit_status is not None:
466
+ return int(exit_status)
467
+ except (subprocess.CalledProcessError, ValueError):
468
+ pass
469
+ return None
470
+
471
+ # =========================================================================
472
+ # TUI Monitor API
473
+ # =========================================================================
474
+
475
    def list_active_jobs(
        self,
        user: str | None = None,
        status: set[JobStatus] | None = None,
        queue: str | None = None,
    ) -> list[JobInfo]:
        """List active SGE jobs using qstat -xml.

        Args:
            user: Filter by username. None = all users.
            status: Filter by status set. None = all active statuses.
            queue: Filter by queue name. None = all queues.

        Returns:
            List of JobInfo for matching active jobs. Empty list when
            qstat fails or is not installed.
        """
        # Build qstat command
        cmd = ["qstat", "-xml"]
        if user:
            cmd.extend(["-u", user])
        else:
            # Show all users' jobs ("-u *"); plain qstat would only show
            # the invoking user's jobs.
            cmd.extend(["-u", "*"])

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                errors="replace",
                check=True,
            )
        except subprocess.CalledProcessError:
            # qstat failed - likely no jobs or scheduler not available
            return []
        except FileNotFoundError:
            # qstat not found
            return []

        # Parse XML output
        parsed_jobs = parse_qstat_xml(result.stdout)

        # Convert to JobInfo and apply filters
        jobs: list[JobInfo] = []
        for job_id, job_data in parsed_jobs.items():
            # Convert state to JobStatus
            state_str = job_data.get("state", "")
            job_status = state_to_status(state_str)

            # Apply status filter
            if status is not None and job_status not in status:
                continue

            # Apply queue filter
            job_queue = job_data.get("queue")
            if queue is not None and job_queue != queue:
                continue

            # Build JobInfo
            job_info = JobInfo(
                job_id=job_id,
                name=job_data.get("name", job_id),
                user=job_data.get("user", "unknown"),
                status=job_status,
                queue=job_queue,
                cpu=job_data.get("slots"),
                node=job_data.get("node"),
            )

            # Add timing info if available.
            # NOTE(review): timestamps appear to be epoch seconds from
            # the XML parser - confirm against parse_qstat_xml.
            if "submit_time" in job_data:
                job_info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
            if "start_time" in job_data:
                job_info.start_time = datetime.fromtimestamp(job_data["start_time"])
                # Calculate runtime for running jobs
                if job_info.status == JobStatus.RUNNING:
                    job_info.runtime = datetime.now() - job_info.start_time

            # Array task ID
            if "array_task_id" in job_data:
                try:
                    job_info.array_task_id = int(job_data["array_task_id"])
                except ValueError:
                    pass  # Could be a range like "1-10"

            jobs.append(job_info)

        return jobs
563
+
564
    def list_completed_jobs(
        self,
        user: str | None = None,
        since: datetime | None = None,
        until: datetime | None = None,
        exit_code: int | None = None,
        queue: str | None = None,
        limit: int = 100,
    ) -> list[JobInfo]:
        """List completed SGE jobs from qacct.

        Not yet implemented; all arguments are accepted to satisfy the
        BaseScheduler interface but currently ignored.

        Raises:
            NotImplementedError: Always.

        TODO: Implement using qacct.
        """
        raise NotImplementedError("SGE list_completed_jobs() not yet implemented")
578
+
579
    def has_accounting(self) -> bool:
        """Check if SGE accounting is available."""
        # Hard-coded: qacct is assumed to ship with every SGE install.
        return True
582
+
583
    def get_job_details(self, job_id: str) -> tuple[JobInfo, dict[str, object]]:
        """Get detailed information for an SGE job using qstat -j -xml.

        Parses the full job details including output paths, resources, etc.

        Args:
            job_id: Scheduler job ID.

        Returns:
            Tuple of (JobInfo, extra_details dict).
            The extra_details dict contains resources, pe_name, pe_range,
            cwd, script_file, dependencies, project, department.

        Raises:
            ValueError: If the job cannot be found.
            RuntimeError: If qstat is not installed.
        """
        cmd = ["qstat", "-j", job_id, "-xml"]
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                errors="replace",
                check=True,
            )
            output = result.stdout
        except subprocess.CalledProcessError as exc:
            # qstat -j exits non-zero for unknown jobs but may still emit
            # usable XML; fall back to whatever output it produced.
            output = exc.stdout or exc.stderr or ""
            if not output:
                raise ValueError(f"Job {job_id} not found")
        except FileNotFoundError:
            raise RuntimeError("qstat not found")

        # Parse XML output
        job_data = self._parse_qstat_j_xml(output)
        if not job_data and output:
            raise ValueError(f"Job {job_id} not found")

        # Separate extra details from JobInfo fields
        extra_details: dict[str, object] = {}
        for key in (
            "resources",
            "pe_name",
            "pe_range",
            "cwd",
            "script_file",
            "dependencies",
            "project",
            "department",
            "job_args",
            "command",
        ):
            if key in job_data:
                extra_details[key] = job_data[key]

        # Get basic info from qstat -xml first
        basic_jobs = self.list_active_jobs()
        basic_info = next((j for j in basic_jobs if j.job_id == job_id), None)

        if basic_info:
            # Merge detailed info with basic info
            if job_data.get("stdout_path"):
                basic_info.stdout_path = job_data["stdout_path"]
            if job_data.get("stderr_path"):
                basic_info.stderr_path = job_data["stderr_path"]
            if job_data.get("node"):
                basic_info.node = job_data["node"]

            # Always use timing from detailed qstat -j output (more reliable)
            if job_data.get("submit_time"):
                basic_info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
            if job_data.get("start_time"):
                basic_info.start_time = datetime.fromtimestamp(job_data["start_time"])
                # Calculate runtime if running
                if basic_info.status == JobStatus.RUNNING:
                    basic_info.runtime = datetime.now() - basic_info.start_time

            return basic_info, extra_details
        else:
            # Build from scratch using qstat -j data (job not visible in
            # the plain active-jobs listing).
            job_info = JobInfo(
                job_id=job_id,
                name=job_data.get("name", job_id),
                user=job_data.get("user", "unknown"),
                status=job_data.get("status", JobStatus.UNKNOWN),
                queue=job_data.get("queue"),
                stdout_path=job_data.get("stdout_path"),
                stderr_path=job_data.get("stderr_path"),
                node=job_data.get("node"),
            )
            # Add timing info
            if job_data.get("submit_time"):
                job_info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
            if job_data.get("start_time"):
                job_info.start_time = datetime.fromtimestamp(job_data["start_time"])
                if job_info.status == JobStatus.RUNNING:
                    job_info.runtime = datetime.now() - job_info.start_time
            return job_info, extra_details
675
+
676
    def _parse_qstat_j_xml(self, xml_output: str) -> dict[str, object]:
        """Parse qstat -j -xml output to extract job details.

        Returns a dict with:
            - Basic: name, user, stdout_path, stderr_path
            - Resources: dict of resource_name -> value
            - PE: pe_name, pe_range
            - Paths: cwd, script_file
            - Dependencies: list of job IDs
            - Other: project, department

        Returns an empty dict when the XML cannot be parsed or contains
        no job element.
        """
        import xml.etree.ElementTree as ET

        data: dict[str, object] = {}

        root = self._parse_xml_root(xml_output)
        if root is None:
            return data
        self._strip_xml_namespaces(root)

        # Find job info element: the parent of any JB_job_number node.
        job_info = root.find(".//JB_job_number/..")
        if job_info is None:
            # Try alternative structure
            job_info = root.find(".//djob_info/element")
        if job_info is None:
            return data

        # Extract basic fields
        name_elem = job_info.find(".//JB_job_name")
        if name_elem is not None and name_elem.text:
            data["name"] = name_elem.text

        owner_elem = job_info.find(".//JB_owner")
        if owner_elem is not None and owner_elem.text:
            data["user"] = owner_elem.text

        # Project and department
        project_elem = job_info.find(".//JB_project")
        if project_elem is not None and project_elem.text:
            data["project"] = project_elem.text

        dept_elem = job_info.find(".//JB_department")
        if dept_elem is not None and dept_elem.text:
            data["department"] = dept_elem.text

        # Get cwd first: relative stdout/stderr paths below are resolved
        # against it.
        cwd: Path | None = None
        cwd_elem = job_info.find(".//JB_cwd")
        if cwd_elem is not None and cwd_elem.text:
            cwd = Path(cwd_elem.text)
            data["cwd"] = str(cwd)

        # Script file
        script_elem = job_info.find(".//JB_script_file")
        if script_elem is not None and script_elem.text:
            data["script_file"] = script_elem.text

        # Job arguments/command
        job_args: list[str] = []
        for arg_elem in job_info.findall(".//JB_job_args//ST_name"):
            if arg_elem.text:
                job_args.append(arg_elem.text)
        if job_args:
            data["job_args"] = job_args

        # Submission time (epoch seconds as text)
        submit_text = job_info.findtext(".//JB_submission_time")
        if submit_text:
            try:
                data["submit_time"] = int(submit_text)
            except ValueError:
                pass

        # Start time (for running jobs) - in JB_ja_tasks/ulong_sublist/JAT_start_time
        task_start_text = job_info.findtext(
            ".//JB_ja_tasks/ulong_sublist/JAT_start_time"
        )
        if task_start_text:
            try:
                data["start_time"] = int(task_start_text)
            except ValueError:
                pass

        # Also check direct JAT_start_time (alternative structure)
        if "start_time" not in data:
            start_text = job_info.findtext(".//JAT_start_time")
            if start_text:
                try:
                    data["start_time"] = int(start_text)
                except ValueError:
                    pass

        # For interactive jobs (qrsh), get command from QRSH_COMMAND env var
        for env_elem in job_info.findall(".//JB_env_list/job_sublist"):
            var_elem = env_elem.find("VA_variable")
            val_elem = env_elem.find("VA_value")
            if var_elem is not None and var_elem.text == "QRSH_COMMAND":
                if val_elem is not None and val_elem.text:
                    data["command"] = self._normalize_qrsh_command(val_elem.text)
                break

        # stdout path - look for PN_path in stdout_path_list
        stdout_path_elem = job_info.find(".//JB_stdout_path_list//PN_path")
        if stdout_path_elem is not None and stdout_path_elem.text:
            stdout_path = Path(stdout_path_elem.text)
            # Resolve relative paths against cwd
            if not stdout_path.is_absolute() and cwd:
                stdout_path = cwd / stdout_path
            data["stdout_path"] = stdout_path

        # stderr path
        stderr_path_elem = job_info.find(".//JB_stderr_path_list//PN_path")
        if stderr_path_elem is not None and stderr_path_elem.text:
            stderr_path = Path(stderr_path_elem.text)
            if not stderr_path.is_absolute() and cwd:
                stderr_path = cwd / stderr_path
            data["stderr_path"] = stderr_path

        # Check merge flag
        merge_elem = job_info.find(".//JB_merge_stderr")
        if merge_elem is not None and merge_elem.text:
            if merge_elem.text.lower() in ("true", "1", "y"):
                data["merge"] = True

        # If merge is enabled and we have stdout but no stderr, use stdout for both
        if data.get("merge") and data.get("stdout_path") and not data.get("stderr_path"):
            data["stderr_path"] = data["stdout_path"]

        # Parse hard resource list
        resources: dict[str, str] = {}
        for qstat_elem in job_info.findall(".//JB_hard_resource_list/qstat_l_requests"):
            res_name_elem = qstat_elem.find("CE_name")
            res_val_elem = qstat_elem.find("CE_stringval")
            if res_name_elem is not None and res_name_elem.text:
                res_name = res_name_elem.text
                res_val = res_val_elem.text if res_val_elem is not None else ""
                resources[res_name] = res_val or ""

        # Also check soft resources; keyed with a " (soft)" suffix to
        # distinguish them from hard requests.
        for qstat_elem in job_info.findall(".//JB_soft_resource_list/qstat_l_requests"):
            res_name_elem = qstat_elem.find("CE_name")
            res_val_elem = qstat_elem.find("CE_stringval")
            if res_name_elem is not None and res_name_elem.text:
                res_name = res_name_elem.text
                res_val = res_val_elem.text if res_val_elem is not None else ""
                resources[f"{res_name} (soft)"] = res_val or ""

        if resources:
            data["resources"] = resources

        # Parallel environment
        pe_elem = job_info.find(".//JB_pe")
        if pe_elem is not None and pe_elem.text:
            data["pe_name"] = pe_elem.text

        # PE range (min-max slots); collapsed to a single number when equal
        pe_range_min = job_info.find(".//JB_pe_range//RN_min")
        pe_range_max = job_info.find(".//JB_pe_range//RN_max")
        if pe_range_min is not None and pe_range_max is not None:
            min_val = pe_range_min.text or "1"
            max_val = pe_range_max.text or "1"
            if min_val == max_val:
                data["pe_range"] = min_val
            else:
                data["pe_range"] = f"{min_val}-{max_val}"

        # Dependencies (predecessor jobs)
        dependencies: list[str] = []
        for dep_elem in job_info.findall(".//JB_jid_predecessor_list//JRE_job_number"):
            if dep_elem.text:
                dependencies.append(dep_elem.text)
        if dependencies:
            data["dependencies"] = dependencies

        return data
852
+
853
+ def _strip_xml_namespaces(self, root: "ET.Element") -> None:
854
+ """Strip namespaces so ElementTree can match tag names directly."""
855
+ import xml.etree.ElementTree as ET
856
+
857
+ for elem in root.iter():
858
+ if isinstance(elem.tag, str) and "}" in elem.tag:
859
+ elem.tag = elem.tag.split("}", 1)[1]
860
+
861
+ def _normalize_qrsh_command(self, value: str) -> str:
862
+ """Normalize QRSH_COMMAND by replacing non-ASCII separators with spaces."""
863
+ cleaned = "".join(ch if 32 <= ord(ch) < 127 else " " for ch in value)
864
+ return " ".join(cleaned.split())
865
+
866
+ def _parse_xml_root(self, xml_output: str) -> "ET.Element | None":
867
+ """Parse XML output, tolerating leading/trailing non-XML noise."""
868
+ import xml.etree.ElementTree as ET
869
+
870
+ try:
871
+ return ET.fromstring(xml_output)
872
+ except ET.ParseError:
873
+ pass
874
+ start = xml_output.find("<")
875
+ end = xml_output.rfind(">")
876
+ if start == -1 or end == -1 or end <= start:
877
+ return None
878
+ try:
879
+ return ET.fromstring(xml_output[start : end + 1])
880
+ except ET.ParseError:
881
+ return None