hpc-runner 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. hpc_runner/_version.py +2 -2
  2. hpc_runner/cli/cancel.py +1 -1
  3. hpc_runner/cli/config.py +2 -2
  4. hpc_runner/cli/main.py +17 -13
  5. hpc_runner/cli/monitor.py +30 -0
  6. hpc_runner/cli/run.py +223 -67
  7. hpc_runner/cli/status.py +6 -5
  8. hpc_runner/core/__init__.py +30 -0
  9. hpc_runner/core/descriptors.py +87 -33
  10. hpc_runner/core/exceptions.py +9 -0
  11. hpc_runner/core/job.py +272 -93
  12. hpc_runner/core/job_info.py +104 -0
  13. hpc_runner/core/result.py +4 -0
  14. hpc_runner/schedulers/base.py +148 -30
  15. hpc_runner/schedulers/detection.py +22 -4
  16. hpc_runner/schedulers/local/scheduler.py +119 -2
  17. hpc_runner/schedulers/sge/args.py +161 -94
  18. hpc_runner/schedulers/sge/parser.py +106 -13
  19. hpc_runner/schedulers/sge/scheduler.py +727 -171
  20. hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
  21. hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
  22. hpc_runner/tui/__init__.py +5 -0
  23. hpc_runner/tui/app.py +436 -0
  24. hpc_runner/tui/components/__init__.py +17 -0
  25. hpc_runner/tui/components/detail_panel.py +187 -0
  26. hpc_runner/tui/components/filter_bar.py +174 -0
  27. hpc_runner/tui/components/filter_popup.py +345 -0
  28. hpc_runner/tui/components/job_table.py +260 -0
  29. hpc_runner/tui/providers/__init__.py +5 -0
  30. hpc_runner/tui/providers/jobs.py +197 -0
  31. hpc_runner/tui/screens/__init__.py +7 -0
  32. hpc_runner/tui/screens/confirm.py +67 -0
  33. hpc_runner/tui/screens/job_details.py +210 -0
  34. hpc_runner/tui/screens/log_viewer.py +170 -0
  35. hpc_runner/tui/snapshot.py +153 -0
  36. hpc_runner/tui/styles/monitor.tcss +567 -0
  37. hpc_runner-0.2.1.dist-info/METADATA +285 -0
  38. hpc_runner-0.2.1.dist-info/RECORD +56 -0
  39. hpc_runner/schedulers/sge/templates/job.sh.j2 +0 -39
  40. hpc_runner-0.1.1.dist-info/METADATA +0 -46
  41. hpc_runner-0.1.1.dist-info/RECORD +0 -38
  42. {hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/WHEEL +0 -0
  43. {hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/entry_points.txt +0 -0
hpc_runner/core/result.py CHANGED
@@ -36,6 +36,7 @@ class JobResult:
36
36
  job: "Job"
37
37
 
38
38
  _cached_status: JobStatus | None = field(default=None, repr=False)
39
+ _exit_code: int | None = field(default=None, repr=False) # For interactive jobs
39
40
 
40
41
  @property
41
42
  def status(self) -> JobStatus:
@@ -55,6 +56,9 @@ class JobResult:
55
56
  @property
56
57
  def returncode(self) -> int | None:
57
58
  """Get exit code (None if not complete)."""
59
+ # For interactive jobs, use cached exit code
60
+ if self._exit_code is not None:
61
+ return self._exit_code
58
62
  if not self.is_complete:
59
63
  return None
60
64
  return self.scheduler.get_exit_code(self.job_id)
hpc_runner/schedulers/base.py CHANGED
@@ -1,41 +1,108 @@
1
- """Abstract base class for scheduler implementations."""
1
+ """Base scheduler with rendering protocol."""
2
+
3
+ from __future__ import annotations
2
4
 
3
5
  from abc import ABC, abstractmethod
6
+ from datetime import datetime
4
7
  from pathlib import Path
5
8
  from typing import TYPE_CHECKING
6
9
 
10
+ from hpc_runner.core.descriptors import SchedulerArg
11
+
7
12
  if TYPE_CHECKING:
8
13
  from hpc_runner.core.job import Job
9
14
  from hpc_runner.core.job_array import JobArray
15
+ from hpc_runner.core.job_info import JobInfo
10
16
  from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
11
17
 
12
18
 
13
19
  class BaseScheduler(ABC):
14
- """Abstract base class for scheduler implementations.
15
-
16
- Each scheduler must implement:
17
- - submit(): Submit a job
18
- - submit_array(): Submit an array job
19
- - cancel(): Cancel a job
20
- - get_status(): Query job status
21
- - get_exit_code(): Get job exit code
22
- - get_output_path(): Get output file path
23
- - generate_script(): Generate job script
24
- - build_submit_command(): Build submission command
20
+ """Abstract base class for HPC schedulers.
21
+
22
+ Subclasses must:
23
+ 1. Define `name` class attribute
24
+ 2. Populate `ARG_RENDERERS` dict mapping Job attribute names to SchedulerArg instances
25
+ 3. Implement abstract methods for job submission and management
26
+
27
+ The rendering protocol:
28
+ - `render_directives(job)` - Returns list of script directives
29
+ - `render_args(job)` - Returns list of command-line arguments
30
+
31
+ Both methods iterate over job.iter_attributes() and use ARG_RENDERERS
32
+ to convert values to scheduler-specific syntax.
25
33
  """
26
34
 
27
- name: str # e.g., "sge", "slurm", "local"
35
+ name: str = ""
28
36
 
29
- @abstractmethod
30
- def submit(self, job: "Job", interactive: bool = False) -> "JobResult":
31
- """Submit a job to the scheduler.
37
+ # Subclasses populate this in __init__ with config-driven values
38
+ ARG_RENDERERS: dict[str, SchedulerArg] = {}
39
+
40
+ # =========================================================================
41
+ # Rendering Protocol
42
+ # =========================================================================
43
+
44
+ def render_directives(self, job: "Job") -> list[str]:
45
+ """Render job attributes as script directives.
46
+
47
+ Iterates over job's renderable attributes and uses ARG_RENDERERS
48
+ to convert each to the appropriate directive format.
49
+
50
+ Args:
51
+ job: The job to render
52
+
53
+ Returns:
54
+ List of directive strings (e.g., ['#$ -N jobname', '#$ -pe smp 4'])
55
+ """
56
+ directives: list[str] = []
57
+
58
+ for attr_name, value in job.iter_attributes():
59
+ renderer = self.ARG_RENDERERS.get(attr_name)
60
+ if renderer is None:
61
+ continue
62
+
63
+ directive = renderer.to_directive(value)
64
+ if directive is not None:
65
+ directives.append(directive)
66
+
67
+ return directives
68
+
69
+ def render_args(self, job: "Job") -> list[str]:
70
+ """Render job attributes as command-line arguments.
71
+
72
+ Iterates over job's renderable attributes and uses ARG_RENDERERS
73
+ to convert each to command-line argument format.
32
74
 
33
75
  Args:
34
- job: Job specification
35
- interactive: Run interactively (blocking)
76
+ job: The job to render
36
77
 
37
78
  Returns:
38
- JobResult with job ID and methods
79
+ List of argument strings (e.g., ['-N', 'jobname', '-pe', 'smp', '4'])
80
+ """
81
+ args: list[str] = []
82
+
83
+ for attr_name, value in job.iter_attributes():
84
+ renderer = self.ARG_RENDERERS.get(attr_name)
85
+ if renderer is None:
86
+ continue
87
+
88
+ args.extend(renderer.to_args(value))
89
+
90
+ return args
91
+
92
+ # =========================================================================
93
+ # Abstract Methods - Subclasses must implement
94
+ # =========================================================================
95
+
96
+ @abstractmethod
97
+ def submit(
98
+ self, job: "Job", interactive: bool = False, keep_script: bool = False
99
+ ) -> "JobResult":
100
+ """Submit a job to the scheduler.
101
+
102
+ Args:
103
+ job: Job to submit.
104
+ interactive: If True, run interactively.
105
+ keep_script: If True, don't delete job script after submission.
39
106
  """
40
107
 
41
108
  @abstractmethod
@@ -44,33 +111,84 @@ class BaseScheduler(ABC):
44
111
 
45
112
  @abstractmethod
46
113
  def cancel(self, job_id: str) -> bool:
47
- """Cancel a job by ID."""
114
+ """Cancel a job."""
48
115
 
49
116
  @abstractmethod
50
117
  def get_status(self, job_id: str) -> "JobStatus":
51
- """Get current status of a job."""
118
+ """Get job status."""
52
119
 
53
120
  @abstractmethod
54
121
  def get_exit_code(self, job_id: str) -> int | None:
55
- """Get exit code of completed job."""
122
+ """Get job exit code."""
56
123
 
57
124
  @abstractmethod
125
+ def generate_script(self, job: "Job", array_range: str | None = None) -> str:
126
+ """Generate submission script."""
127
+
128
+ @abstractmethod
129
+ def build_submit_command(self, job: "Job") -> list[str]:
130
+ """Build submission command line."""
131
+
132
+ @abstractmethod
133
+ def build_interactive_command(self, job: "Job") -> list[str]:
134
+ """Build interactive execution command."""
135
+
136
+ # =========================================================================
137
+ # Optional Methods - Override if scheduler supports these
138
+ # =========================================================================
139
+
58
140
  def get_output_path(self, job_id: str, stream: str) -> Path | None:
59
141
  """Get path to output file.
60
142
 
61
143
  Args:
62
144
  job_id: Job ID
63
145
  stream: "stdout" or "stderr"
64
- """
65
146
 
66
- @abstractmethod
67
- def generate_script(self, job: "Job") -> str:
68
- """Generate job script content."""
69
-
70
- @abstractmethod
71
- def build_submit_command(self, job: "Job") -> list[str]:
72
- """Build the submission command (e.g., qsub args)."""
147
+ Returns:
148
+ Path to output file, or None if not determinable.
149
+ """
150
+ return None
73
151
 
74
152
  def get_scheduler_args(self, job: "Job") -> list[str]:
75
153
  """Get scheduler-specific raw args from job."""
76
154
  return getattr(job, f"{self.name}_args", [])
155
+
156
+ def list_active_jobs(
157
+ self,
158
+ user: str | None = None,
159
+ status: set["JobStatus"] | None = None,
160
+ queue: str | None = None,
161
+ ) -> list["JobInfo"]:
162
+ """List active jobs. Override in subclass."""
163
+ return []
164
+
165
+ def list_completed_jobs(
166
+ self,
167
+ user: str | None = None,
168
+ since: datetime | None = None,
169
+ until: datetime | None = None,
170
+ exit_code: int | None = None,
171
+ queue: str | None = None,
172
+ limit: int = 100,
173
+ ) -> list["JobInfo"]:
174
+ """List completed jobs from accounting. Override in subclass."""
175
+ return []
176
+
177
+ def has_accounting(self) -> bool:
178
+ """Check if job accounting/history is available."""
179
+ return False
180
+
181
+ def get_job_details(self, job_id: str) -> tuple["JobInfo", dict[str, object]]:
182
+ """Get detailed information for a single job.
183
+
184
+ Args:
185
+ job_id: The job identifier.
186
+
187
+ Returns:
188
+ Tuple of (JobInfo, extra_details dict).
189
+
190
+ Raises:
191
+ JobNotFoundError: If job doesn't exist.
192
+ NotImplementedError: If not implemented by scheduler.
193
+ """
194
+ raise NotImplementedError(f"{self.name} does not implement get_job_details()")
hpc_runner/schedulers/detection.py CHANGED
@@ -2,6 +2,23 @@
2
2
 
3
3
  import os
4
4
  import shutil
5
+ import subprocess
6
+
7
+
8
+ def _check_sge_via_qstat() -> bool:
9
+ """Check if qstat is SGE by examining its help output."""
10
+ try:
11
+ result = subprocess.run(
12
+ ["qstat", "-help"],
13
+ capture_output=True,
14
+ text=True,
15
+ timeout=5,
16
+ )
17
+ # SGE's qstat -help starts with "SGE" or "GE" version info
18
+ output = result.stdout + result.stderr
19
+ return "SGE" in output or "Grid Engine" in output
20
+ except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
21
+ return False
5
22
 
6
23
 
7
24
  def detect_scheduler() -> str:
@@ -9,7 +26,7 @@ def detect_scheduler() -> str:
9
26
 
10
27
  Order of precedence:
11
28
  1. HPC_SCHEDULER environment variable
12
- 2. SGE (check for qsub with SGE_ROOT)
29
+ 2. SGE (check for SGE_ROOT or qstat -help output)
13
30
  3. Slurm (check for sbatch)
14
31
  4. PBS (check for qsub with PBS_CONF_FILE)
15
32
  5. Local fallback
@@ -18,9 +35,10 @@ def detect_scheduler() -> str:
18
35
  if scheduler := os.environ.get("HPC_SCHEDULER"):
19
36
  return scheduler.lower()
20
37
 
21
- # Check for SGE (also uses qsub but has SGE_ROOT)
22
- if shutil.which("qsub") and os.environ.get("SGE_ROOT"):
23
- return "sge"
38
+ # Check for SGE (via SGE_ROOT or qstat help output)
39
+ if shutil.which("qsub"):
40
+ if os.environ.get("SGE_ROOT") or _check_sge_via_qstat():
41
+ return "sge"
24
42
 
25
43
  # Check for Slurm
26
44
  if shutil.which("sbatch") and shutil.which("squeue"):
hpc_runner/schedulers/local/scheduler.py CHANGED
@@ -9,6 +9,8 @@ from datetime import datetime
9
9
  from pathlib import Path
10
10
  from typing import TYPE_CHECKING
11
11
 
12
+ from hpc_runner.core.exceptions import AccountingNotAvailable, JobNotFoundError
13
+ from hpc_runner.core.job_info import JobInfo
12
14
  from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
13
15
  from hpc_runner.schedulers.base import BaseScheduler
14
16
  from hpc_runner.templates import render_template
@@ -28,7 +30,9 @@ class LocalScheduler(BaseScheduler):
28
30
  _exit_codes: dict[str, int] = {}
29
31
  _output_paths: dict[str, dict[str, Path]] = {}
30
32
 
31
- def submit(self, job: "Job", interactive: bool = False) -> JobResult:
33
+ def submit(
34
+ self, job: "Job", interactive: bool = False, keep_script: bool = False
35
+ ) -> JobResult:
32
36
  """Run job as local subprocess."""
33
37
  LocalScheduler._job_counter += 1
34
38
  job_id = f"local_{LocalScheduler._job_counter}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
@@ -224,7 +228,7 @@ class LocalScheduler(BaseScheduler):
224
228
  return LocalScheduler._output_paths[job_id].get(stream)
225
229
  return None
226
230
 
227
- def generate_script(self, job: "Job") -> str:
231
+ def generate_script(self, job: "Job", array_range: str | None = None) -> str:
228
232
  """Generate local execution script."""
229
233
  return render_template(
230
234
  "local/templates/job.sh.j2",
@@ -235,3 +239,116 @@ class LocalScheduler(BaseScheduler):
235
239
  def build_submit_command(self, job: "Job") -> list[str]:
236
240
  """Build command - for local, just bash."""
237
241
  return ["bash", "-c", job.command if isinstance(job.command, str) else " ".join(job.command)]
242
+
243
+ def build_interactive_command(self, job: "Job") -> list[str]:
244
+ """Build interactive command - for local, just bash."""
245
+ return ["bash", "-c", job.command if isinstance(job.command, str) else " ".join(job.command)]
246
+
247
+ # -------------------------------------------------------------------------
248
+ # TUI Monitor API (stubs for local scheduler)
249
+ # -------------------------------------------------------------------------
250
+
251
+ def list_active_jobs(
252
+ self,
253
+ user: str | None = None,
254
+ status: set[JobStatus] | None = None,
255
+ queue: str | None = None,
256
+ ) -> list[JobInfo]:
257
+ """List active local jobs.
258
+
259
+ The local scheduler tracks running processes in memory.
260
+ """
261
+ jobs: list[JobInfo] = []
262
+ current_user = os.environ.get("USER", "unknown")
263
+
264
+ for job_id, proc in LocalScheduler._processes.items():
265
+ poll = proc.poll()
266
+ if poll is None: # Still running
267
+ job_status = JobStatus.RUNNING
268
+ else:
269
+ continue # Skip completed jobs
270
+
271
+ # Apply filters
272
+ if user is not None and user != current_user:
273
+ continue
274
+ if status is not None and job_status not in status:
275
+ continue
276
+ # queue filter doesn't apply to local scheduler
277
+
278
+ jobs.append(
279
+ JobInfo(
280
+ job_id=job_id,
281
+ name=job_id, # Local scheduler doesn't track job names
282
+ user=current_user,
283
+ status=job_status,
284
+ queue="local",
285
+ )
286
+ )
287
+
288
+ return jobs
289
+
290
+ def list_completed_jobs(
291
+ self,
292
+ user: str | None = None,
293
+ since: datetime | None = None,
294
+ until: datetime | None = None,
295
+ exit_code: int | None = None,
296
+ queue: str | None = None,
297
+ limit: int = 100,
298
+ ) -> list[JobInfo]:
299
+ """List completed local jobs.
300
+
301
+ The local scheduler does not persist job history, so this
302
+ raises AccountingNotAvailable.
303
+ """
304
+ raise AccountingNotAvailable(
305
+ "Local scheduler does not persist job history. "
306
+ "Completed job information is only available during the current session."
307
+ )
308
+
309
+ def has_accounting(self) -> bool:
310
+ """Check if job accounting is available.
311
+
312
+ Local scheduler does not have persistent accounting.
313
+ """
314
+ return False
315
+
316
+ def get_job_details(self, job_id: str) -> tuple[JobInfo, dict[str, object]]:
317
+ """Get details for a local job."""
318
+ current_user = os.environ.get("USER", "unknown")
319
+
320
+ # Check running processes
321
+ if job_id in LocalScheduler._processes:
322
+ proc = LocalScheduler._processes[job_id]
323
+ poll = proc.poll()
324
+ status = JobStatus.RUNNING if poll is None else (
325
+ JobStatus.COMPLETED if poll == 0 else JobStatus.FAILED
326
+ )
327
+ job_info = JobInfo(
328
+ job_id=job_id,
329
+ name=job_id,
330
+ user=current_user,
331
+ status=status,
332
+ queue="local",
333
+ exit_code=poll if poll is not None else None,
334
+ stdout_path=LocalScheduler._output_paths.get(job_id, {}).get("stdout"),
335
+ stderr_path=LocalScheduler._output_paths.get(job_id, {}).get("stderr"),
336
+ )
337
+ return job_info, {}
338
+
339
+ # Check completed jobs with cached exit codes
340
+ if job_id in LocalScheduler._exit_codes:
341
+ exit_code = LocalScheduler._exit_codes[job_id]
342
+ job_info = JobInfo(
343
+ job_id=job_id,
344
+ name=job_id,
345
+ user=current_user,
346
+ status=JobStatus.COMPLETED if exit_code == 0 else JobStatus.FAILED,
347
+ queue="local",
348
+ exit_code=exit_code,
349
+ stdout_path=LocalScheduler._output_paths.get(job_id, {}).get("stdout"),
350
+ stderr_path=LocalScheduler._output_paths.get(job_id, {}).get("stderr"),
351
+ )
352
+ return job_info, {}
353
+
354
+ raise JobNotFoundError(f"Job {job_id} not found")