hpc-runner 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpc_runner/__init__.py +57 -0
- hpc_runner/_version.py +34 -0
- hpc_runner/cli/__init__.py +1 -0
- hpc_runner/cli/cancel.py +38 -0
- hpc_runner/cli/config.py +109 -0
- hpc_runner/cli/main.py +76 -0
- hpc_runner/cli/monitor.py +30 -0
- hpc_runner/cli/run.py +292 -0
- hpc_runner/cli/status.py +66 -0
- hpc_runner/core/__init__.py +31 -0
- hpc_runner/core/config.py +177 -0
- hpc_runner/core/descriptors.py +110 -0
- hpc_runner/core/exceptions.py +38 -0
- hpc_runner/core/job.py +328 -0
- hpc_runner/core/job_array.py +58 -0
- hpc_runner/core/job_info.py +104 -0
- hpc_runner/core/resources.py +49 -0
- hpc_runner/core/result.py +161 -0
- hpc_runner/core/types.py +13 -0
- hpc_runner/py.typed +0 -0
- hpc_runner/schedulers/__init__.py +60 -0
- hpc_runner/schedulers/base.py +194 -0
- hpc_runner/schedulers/detection.py +52 -0
- hpc_runner/schedulers/local/__init__.py +5 -0
- hpc_runner/schedulers/local/scheduler.py +354 -0
- hpc_runner/schedulers/local/templates/job.sh.j2 +28 -0
- hpc_runner/schedulers/sge/__init__.py +5 -0
- hpc_runner/schedulers/sge/args.py +232 -0
- hpc_runner/schedulers/sge/parser.py +287 -0
- hpc_runner/schedulers/sge/scheduler.py +881 -0
- hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
- hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
- hpc_runner/templates/__init__.py +5 -0
- hpc_runner/templates/engine.py +55 -0
- hpc_runner/tui/__init__.py +5 -0
- hpc_runner/tui/app.py +436 -0
- hpc_runner/tui/components/__init__.py +17 -0
- hpc_runner/tui/components/detail_panel.py +187 -0
- hpc_runner/tui/components/filter_bar.py +174 -0
- hpc_runner/tui/components/filter_popup.py +345 -0
- hpc_runner/tui/components/job_table.py +260 -0
- hpc_runner/tui/providers/__init__.py +5 -0
- hpc_runner/tui/providers/jobs.py +197 -0
- hpc_runner/tui/screens/__init__.py +7 -0
- hpc_runner/tui/screens/confirm.py +67 -0
- hpc_runner/tui/screens/job_details.py +210 -0
- hpc_runner/tui/screens/log_viewer.py +170 -0
- hpc_runner/tui/snapshot.py +153 -0
- hpc_runner/tui/styles/monitor.tcss +567 -0
- hpc_runner/workflow/__init__.py +6 -0
- hpc_runner/workflow/dependency.py +20 -0
- hpc_runner/workflow/pipeline.py +180 -0
- hpc_runner-0.2.0.dist-info/METADATA +285 -0
- hpc_runner-0.2.0.dist-info/RECORD +56 -0
- hpc_runner-0.2.0.dist-info/WHEEL +4 -0
- hpc_runner-0.2.0.dist-info/entry_points.txt +2 -0
hpc_runner/cli/status.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Status command - check job status."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
import rich_click as click
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
from rich.table import Table
|
|
7
|
+
|
|
8
|
+
from hpc_runner.cli.main import Context, pass_context
|
|
9
|
+
|
|
10
|
+
console = Console()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@click.command()
@click.argument("job_id", required=False)
@click.option("--all", "all_users", is_flag=True, help="Show all users' jobs")
@click.option("--watch", is_flag=True, help="Watch mode (refresh periodically)")
@pass_context
def status(
    ctx: Context,
    job_id: str | None,
    all_users: bool,
    watch: bool,
) -> None:
    """Check job status.

    If JOB_ID is provided, show status of that specific job.
    Otherwise, list all your jobs.
    """
    # NOTE: the docstring above doubles as the command's --help text, so it is
    # user-facing output rather than internal documentation.
    # NOTE: `all_users` and `watch` are accepted but never read in this body.

    # Imported lazily, inside the command, exactly as the original did.
    from hpc_runner.schedulers import get_scheduler

    scheduler = get_scheduler(ctx.scheduler)

    if not job_id:
        # No job id given: listing is not implemented here, so only print a
        # hint pointing the user at the single-job form.
        console.print(
            "[yellow]Listing all jobs requires scheduler-specific implementation[/yellow]"
        )
        console.print("Use 'hpc status <job_id>' to check a specific job")
        return

    # Query the scheduler first, then build the two-column property table.
    job_status = scheduler.get_status(job_id)
    exit_code = scheduler.get_exit_code(job_id)

    report = Table(title=f"Job {job_id}")
    report.add_column("Property", style="cyan")
    report.add_column("Value")
    report.add_row("Status", _status_style(job_status.name))
    # The exit-code row is only shown when the scheduler reported one.
    if exit_code is not None:
        report.add_row("Exit Code", str(exit_code))

    console.print(report)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _status_style(status: str) -> str:
|
|
56
|
+
"""Apply color to status string."""
|
|
57
|
+
colors = {
|
|
58
|
+
"PENDING": "[yellow]PENDING[/yellow]",
|
|
59
|
+
"RUNNING": "[blue]RUNNING[/blue]",
|
|
60
|
+
"COMPLETED": "[green]COMPLETED[/green]",
|
|
61
|
+
"FAILED": "[red]FAILED[/red]",
|
|
62
|
+
"CANCELLED": "[magenta]CANCELLED[/magenta]",
|
|
63
|
+
"TIMEOUT": "[red]TIMEOUT[/red]",
|
|
64
|
+
"UNKNOWN": "[dim]UNKNOWN[/dim]",
|
|
65
|
+
}
|
|
66
|
+
return colors.get(status, status)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Core models and abstractions for hpc-tools."""
|
|
2
|
+
|
|
3
|
+
from .exceptions import (
|
|
4
|
+
AccountingNotAvailable,
|
|
5
|
+
ConfigError,
|
|
6
|
+
ConfigNotFoundError,
|
|
7
|
+
HPCToolsError,
|
|
8
|
+
JobNotFoundError,
|
|
9
|
+
SchedulerError,
|
|
10
|
+
SubmissionError,
|
|
11
|
+
ValidationError,
|
|
12
|
+
)
|
|
13
|
+
from .job_info import JobInfo
|
|
14
|
+
from .result import ArrayJobResult, JobResult, JobStatus
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
# Exceptions
|
|
18
|
+
"AccountingNotAvailable",
|
|
19
|
+
"ConfigError",
|
|
20
|
+
"ConfigNotFoundError",
|
|
21
|
+
"HPCToolsError",
|
|
22
|
+
"JobNotFoundError",
|
|
23
|
+
"SchedulerError",
|
|
24
|
+
"SubmissionError",
|
|
25
|
+
"ValidationError",
|
|
26
|
+
# Types
|
|
27
|
+
"JobInfo",
|
|
28
|
+
"JobResult",
|
|
29
|
+
"ArrayJobResult",
|
|
30
|
+
"JobStatus",
|
|
31
|
+
]
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Configuration loading and management."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
if sys.version_info >= (3, 11):
|
|
11
|
+
import tomllib
|
|
12
|
+
else:
|
|
13
|
+
import tomli as tomllib
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class HPCConfig:
    """Loaded configuration.

    Holds the four top-level tables that ``load_config`` reads from a TOML
    document (``defaults``, ``tools``, ``types``, ``schedulers``) and offers
    lookups that layer a named entry on top of the defaults.
    """

    # Baseline settings; every job lookup starts from a copy of this dict.
    defaults: dict[str, Any] = field(default_factory=dict)
    # Overrides keyed by tool name (matched against a command's executable).
    tools: dict[str, dict[str, Any]] = field(default_factory=dict)
    # Overrides keyed by type name; consulted before ``tools``.
    types: dict[str, dict[str, Any]] = field(default_factory=dict)
    # Per-scheduler settings keyed by scheduler name.
    schedulers: dict[str, dict[str, Any]] = field(default_factory=dict)

    # File the configuration was read from, if any (hidden from repr).
    _source_path: Path | None = field(default=None, repr=False)

    def get_job_config(self, tool_or_type: str) -> dict[str, Any]:
        """Get merged configuration for a tool or type.

        The result always starts from a copy of ``defaults``. At most one
        override table is merged on top of it:

        1. ``types[tool_or_type]`` if that key exists, otherwise
        2. ``tools[tool_or_type]`` if that key exists, otherwise
        3. nothing (the defaults are returned as-is).

        Args:
            tool_or_type: Name to look up in ``types`` and then ``tools``.

        Returns:
            A new dict; ``defaults`` itself is never returned.
        """
        config = self.defaults.copy()

        if tool_or_type in self.types:
            config = _merge(config, self.types[tool_or_type])
        elif tool_or_type in self.tools:
            config = _merge(config, self.tools[tool_or_type])

        return config

    def get_tool_config(self, command: str) -> dict[str, Any]:
        """Get configuration matching a command.

        The tool name is the first whitespace-separated word of ``command``
        with any directory part stripped, so ``"/opt/bin/sim --fast"`` is
        looked up as ``"sim"``.

        Args:
            command: Command line to derive the tool name from.

        Returns:
            The merged configuration for that tool. For an empty or
            whitespace-only command there is no tool name, so a copy of the
            defaults is returned instead of raising ``IndexError``.
        """
        words = command.split()
        if not words:
            return self.defaults.copy()

        # Strip the directory part: "/usr/bin/tool" -> "tool".
        return self.get_job_config(Path(words[0]).name)

    def get_scheduler_config(self, scheduler: str) -> dict[str, Any]:
        """Get scheduler-specific configuration.

        Args:
            scheduler: Scheduler name used as the key into ``schedulers``.

        Returns:
            The stored table for that scheduler, or an empty dict if absent.
        """
        return self.schedulers.get(scheduler, {})
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
|
|
61
|
+
"""Deep merge with override taking precedence."""
|
|
62
|
+
result = base.copy()
|
|
63
|
+
for key, value in override.items():
|
|
64
|
+
if key in result and isinstance(result[key], dict) and isinstance(value, dict):
|
|
65
|
+
result[key] = _merge(result[key], value)
|
|
66
|
+
elif key in result and isinstance(result[key], list) and isinstance(value, list):
|
|
67
|
+
# Check for list reset marker
|
|
68
|
+
if value and value[0] == "-":
|
|
69
|
+
result[key] = value[1:]
|
|
70
|
+
else:
|
|
71
|
+
result[key] = list(set(result[key] + value))
|
|
72
|
+
else:
|
|
73
|
+
result[key] = value
|
|
74
|
+
return result
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def find_config_file() -> Path | None:
|
|
78
|
+
"""Find configuration file in priority order.
|
|
79
|
+
|
|
80
|
+
Search order:
|
|
81
|
+
1. ./hpc-tools.toml (current directory)
|
|
82
|
+
2. ./pyproject.toml [tool.hpc-tools] section
|
|
83
|
+
3. Git repository root hpc-tools.toml
|
|
84
|
+
4. ~/.config/hpc-tools/config.toml
|
|
85
|
+
5. Package defaults
|
|
86
|
+
"""
|
|
87
|
+
# Current directory
|
|
88
|
+
cwd = Path.cwd()
|
|
89
|
+
if (cwd / "hpc-tools.toml").exists():
|
|
90
|
+
return cwd / "hpc-tools.toml"
|
|
91
|
+
|
|
92
|
+
if (cwd / "pyproject.toml").exists():
|
|
93
|
+
try:
|
|
94
|
+
with open(cwd / "pyproject.toml", "rb") as f:
|
|
95
|
+
pyproject = tomllib.load(f)
|
|
96
|
+
if "tool" in pyproject and "hpc-tools" in pyproject["tool"]:
|
|
97
|
+
return cwd / "pyproject.toml"
|
|
98
|
+
except Exception:
|
|
99
|
+
pass
|
|
100
|
+
|
|
101
|
+
# Git root
|
|
102
|
+
git_root = _find_git_root(cwd)
|
|
103
|
+
if git_root and (git_root / "hpc-tools.toml").exists():
|
|
104
|
+
return git_root / "hpc-tools.toml"
|
|
105
|
+
|
|
106
|
+
# User config
|
|
107
|
+
user_config = Path.home() / ".config" / "hpc-tools" / "config.toml"
|
|
108
|
+
if user_config.exists():
|
|
109
|
+
return user_config
|
|
110
|
+
|
|
111
|
+
# Package defaults
|
|
112
|
+
package_defaults = Path(__file__).parent.parent.parent.parent / "defaults" / "config.toml"
|
|
113
|
+
if package_defaults.exists():
|
|
114
|
+
return package_defaults
|
|
115
|
+
|
|
116
|
+
return None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _find_git_root(start: Path) -> Path | None:
|
|
120
|
+
"""Find git repository root."""
|
|
121
|
+
current = start.resolve()
|
|
122
|
+
while current != current.parent:
|
|
123
|
+
if (current / ".git").exists():
|
|
124
|
+
return current
|
|
125
|
+
current = current.parent
|
|
126
|
+
return None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def load_config(path: Path | str | None = None) -> HPCConfig:
    """Read a TOML configuration file into an ``HPCConfig``.

    Args:
        path: Explicit config path, or None to auto-discover one with
            ``find_config_file``.

    Returns:
        The populated configuration. When no path is given and discovery
        finds nothing, an empty ``HPCConfig`` is returned.
    """
    config_path = find_config_file() if path is None else path
    if config_path is None:
        # Nothing to read: hand back an empty configuration.
        return HPCConfig()

    config_path = Path(config_path)
    with open(config_path, "rb") as handle:
        document = tomllib.load(handle)

    # In pyproject.toml the settings live under the [tool.hpc-tools] table.
    if config_path.name == "pyproject.toml":
        document = document.get("tool", {}).get("hpc-tools", {})

    # Each section is optional and defaults to an empty table.
    sections = {
        section: document.get(section, {})
        for section in ("defaults", "tools", "types", "schedulers")
    }
    loaded = HPCConfig(**sections)
    loaded._source_path = config_path
    return loaded
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# Global config cache
|
|
162
|
+
_cached_config: HPCConfig | None = None
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def get_config() -> HPCConfig:
    """Return the process-wide configuration, loading it on first use.

    The first call runs ``load_config()`` with auto-discovery and stores the
    result in the module-level cache; later calls return that same object.
    """
    global _cached_config
    if _cached_config is not None:
        return _cached_config

    _cached_config = load_config()
    return _cached_config
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def reload_config(path: Path | str | None = None) -> HPCConfig:
    """Load the configuration afresh and replace the cached copy.

    Args:
        path: Explicit config path, or None to auto-discover.

    Returns:
        The newly loaded configuration, which is now also the cached one.
    """
    global _cached_config
    fresh = load_config(path)
    _cached_config = fresh
    return fresh
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Descriptor pattern for job attributes and scheduler arguments."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any, Generic, TypeVar
|
|
5
|
+
|
|
6
|
+
T = TypeVar("T")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# =============================================================================
|
|
10
|
+
# Job Attribute Descriptor
|
|
11
|
+
# =============================================================================
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class JobAttribute(Generic[T]):
|
|
15
|
+
"""Descriptor for Job attributes that enables iteration and rendering.
|
|
16
|
+
|
|
17
|
+
This descriptor provides:
|
|
18
|
+
- Clean attribute access on Job instances
|
|
19
|
+
- Class-level access returns the descriptor itself
|
|
20
|
+
- Support for default values
|
|
21
|
+
- Registration for iteration by schedulers
|
|
22
|
+
|
|
23
|
+
Example:
|
|
24
|
+
class Job:
|
|
25
|
+
name = JobAttribute('name')
|
|
26
|
+
cpu = JobAttribute('cpu', default=1)
|
|
27
|
+
|
|
28
|
+
job = Job()
|
|
29
|
+
job.name = "test"
|
|
30
|
+
print(job.name) # "test"
|
|
31
|
+
print(Job.name) # <JobAttribute 'name'>
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
def __init__(self, name: str, *, default: T | None = None):
|
|
35
|
+
self.public_name = name
|
|
36
|
+
self.default = default
|
|
37
|
+
self._private_name: str | None = None
|
|
38
|
+
|
|
39
|
+
def __set_name__(self, owner: type, name: str) -> None:
|
|
40
|
+
self._private_name = f"_{name}"
|
|
41
|
+
|
|
42
|
+
def __get__(self, obj: Any, objtype: type | None = None) -> T | "JobAttribute[T]":
|
|
43
|
+
if obj is None:
|
|
44
|
+
return self
|
|
45
|
+
return getattr(obj, self._private_name, self.default)
|
|
46
|
+
|
|
47
|
+
def __set__(self, obj: Any, value: T | None) -> None:
|
|
48
|
+
setattr(obj, self._private_name, value)
|
|
49
|
+
|
|
50
|
+
def __repr__(self) -> str:
|
|
51
|
+
return f"<JobAttribute '{self.public_name}'>"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# =============================================================================
|
|
55
|
+
# Scheduler Argument Base Class
|
|
56
|
+
# =============================================================================
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class SchedulerArg(ABC, Generic[T]):
    """Abstract renderer that turns one job attribute value into scheduler syntax.

    Each scheduler backend (SGE, Slurm, PBS) provides concrete subclasses
    that know how to express a value in that scheduler's own form.

    Subclasses must implement:
    - to_args(value) -> list of command-line arguments
    - to_directive(value) -> script directive string or None

    Example:
        class SGEJobNameArg(SchedulerArg[str]):
            def to_args(self, value):
                return ["-N", value] if value else []

            def to_directive(self, value):
                return f"#$ -N {value}" if value else None
    """

    def __init__(self, flag: str, *, doc: str = ""):
        """Remember the scheduler flag and an optional description.

        Args:
            flag: Flag text stored on the instance and shown in ``repr``.
            doc: Free-form description; keyword-only, empty by default.
        """
        self.flag = flag
        self.doc = doc

    @abstractmethod
    def to_args(self, value: T | None) -> list[str]:
        """Render ``value`` as command-line arguments.

        Args:
            value: The job attribute value (may be None).

        Returns:
            List of command-line argument strings; implementations are
            expected to return an empty list when ``value`` is None.
        """

    @abstractmethod
    def to_directive(self, value: T | None) -> str | None:
        """Render ``value`` as a job-script directive line.

        Args:
            value: The job attribute value (may be None).

        Returns:
            Directive string (e.g., "#$ -N jobname"); implementations are
            expected to return None when ``value`` is None.
        """

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"<{class_name} flag='{self.flag}'>"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Custom exceptions for hpc-tools."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class HPCToolsError(Exception):
    """Root of the hpc-tools exception hierarchy.

    Catching this type catches every exception defined in this module.
    """


class SchedulerError(HPCToolsError):
    """A scheduler operation failed."""


class SubmissionError(SchedulerError):
    """Submitting a job to the scheduler failed."""


class JobNotFoundError(SchedulerError):
    """The requested job ID is not known."""


class ConfigError(HPCToolsError):
    """The configuration is invalid."""


class ConfigNotFoundError(ConfigError):
    """A configuration file could not be found."""


class ValidationError(HPCToolsError):
    """Job parameters failed validation."""


class AccountingNotAvailable(SchedulerError):
    """Job accounting/history is not enabled on this cluster.

    Intended for the case where historical job data is requested (e.g. via
    qacct for SGE or sacct for Slurm) but the scheduler's accounting system
    is not configured or not accessible.
    """
|