hpc-runner 0.1.1__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/CLAUDE.md +68 -0
- hpc_runner-0.2.1/PKG-INFO +285 -0
- hpc_runner-0.2.1/README.md +241 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/defaults/config.toml +9 -0
- hpc_runner-0.2.1/docs/HPC_MONITOR_TUI_PLAN.md +796 -0
- hpc_runner-0.2.1/docs/TEXTUAL_STYLING_COOKBOOK.md +1346 -0
- hpc_runner-0.2.1/docs/cli-redesign-spec.md +786 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/pyproject.toml +1 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/sourceme +3 -9
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/_version.py +2 -2
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/cli/cancel.py +1 -1
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/cli/config.py +2 -2
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/cli/main.py +17 -13
- hpc_runner-0.2.1/src/hpc_runner/cli/monitor.py +30 -0
- hpc_runner-0.2.1/src/hpc_runner/cli/run.py +292 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/cli/status.py +6 -5
- hpc_runner-0.2.1/src/hpc_runner/core/__init__.py +31 -0
- hpc_runner-0.2.1/src/hpc_runner/core/descriptors.py +110 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/core/exceptions.py +9 -0
- hpc_runner-0.2.1/src/hpc_runner/core/job.py +328 -0
- hpc_runner-0.2.1/src/hpc_runner/core/job_info.py +104 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/core/result.py +4 -0
- hpc_runner-0.2.1/src/hpc_runner/schedulers/base.py +194 -0
- hpc_runner-0.2.1/src/hpc_runner/schedulers/detection.py +52 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/schedulers/local/scheduler.py +119 -2
- hpc_runner-0.2.1/src/hpc_runner/schedulers/sge/args.py +232 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/schedulers/sge/parser.py +106 -13
- hpc_runner-0.2.1/src/hpc_runner/schedulers/sge/scheduler.py +881 -0
- hpc_runner-0.2.1/src/hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
- hpc_runner-0.2.1/src/hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/__init__.py +5 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/app.py +436 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/components/__init__.py +17 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/components/detail_panel.py +187 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/components/filter_bar.py +174 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/components/filter_popup.py +345 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/components/job_table.py +260 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/providers/__init__.py +5 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/providers/jobs.py +197 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/screens/__init__.py +7 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/screens/confirm.py +67 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/screens/job_details.py +210 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/screens/log_viewer.py +170 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/snapshot.py +153 -0
- hpc_runner-0.2.1/src/hpc_runner/tui/styles/monitor.tcss +567 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/conftest.py +37 -1
- hpc_runner-0.2.1/tests/test_cli/test_run.py +238 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_schedulers/test_sge.py +139 -0
- hpc_runner-0.2.1/tests/test_tui/__init__.py +0 -0
- hpc_runner-0.2.1/tests/test_tui/test_app_snapshot.py +184 -0
- hpc_runner-0.2.1/tests/test_tui/test_detail_panel.py +320 -0
- hpc_runner-0.2.1/tests/test_tui/test_job_table.py +270 -0
- hpc_runner-0.1.1/PKG-INFO +0 -46
- hpc_runner-0.1.1/README.md +0 -3
- hpc_runner-0.1.1/src/hpc_runner/cli/run.py +0 -136
- hpc_runner-0.1.1/src/hpc_runner/core/__init__.py +0 -1
- hpc_runner-0.1.1/src/hpc_runner/core/descriptors.py +0 -56
- hpc_runner-0.1.1/src/hpc_runner/core/job.py +0 -149
- hpc_runner-0.1.1/src/hpc_runner/schedulers/base.py +0 -76
- hpc_runner-0.1.1/src/hpc_runner/schedulers/detection.py +0 -34
- hpc_runner-0.1.1/src/hpc_runner/schedulers/sge/args.py +0 -165
- hpc_runner-0.1.1/src/hpc_runner/schedulers/sge/scheduler.py +0 -325
- hpc_runner-0.1.1/src/hpc_runner/schedulers/sge/templates/job.sh.j2 +0 -39
- hpc_runner-0.1.1/tests/test_cli/test_run.py +0 -98
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/.github/workflows/ci.yml +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/.github/workflows/publish.yml +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/.gitignore +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/SPEC.md +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/cli/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/core/config.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/core/job_array.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/core/resources.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/core/types.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/py.typed +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/schedulers/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/schedulers/local/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/schedulers/local/templates/job.sh.j2 +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/schedulers/sge/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/templates/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/templates/engine.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/workflow/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/workflow/dependency.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/src/hpc_runner/workflow/pipeline.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_cli/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_core/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_core/test_config.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_core/test_job.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_core/test_resources.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_schedulers/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_schedulers/test_detection.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_schedulers/test_local.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_workflow/__init__.py +0 -0
- {hpc_runner-0.1.1 → hpc_runner-0.2.1}/tests/test_workflow/test_pipeline.py +0 -0
|
@@ -58,8 +58,76 @@ Uses `rich-click` for styled output. Commands: `run`, `status`, `cancel`, `confi
|
|
|
58
58
|
|
|
59
59
|
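Jinja2 templates for job scripts. Each scheduler keeps its own templates under `schedulers/{name}/templates/` (for example, `job.sh.j2` for the local scheduler, and `batch.sh.j2` plus `interactive.sh.j2` for SGE).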
|
|
60
60
|
|
|
61
|
+
### TUI (`src/hpc_runner/tui/`)
|
|
62
|
+
|
|
63
|
+
**HpcMonitorApp** - Textual-based terminal UI for monitoring HPC jobs. Entry point: `hpc monitor`.
|
|
64
|
+
|
|
65
|
+
- **app.py** - Main application with custom Nord-inspired theme
|
|
66
|
+
- **styles/monitor.tcss** - CSS styling following Rovr aesthetic (see `docs/TEXTUAL_STYLING_COOKBOOK.md`)
|
|
67
|
+
- **snapshot.py** - Visual review utility for development
|
|
68
|
+
|
|
61
69
|
## Key Design Decisions
|
|
62
70
|
|
|
63
71
|
- **Merged output by default**: stderr goes to stdout unless `--stderr` specified
|
|
64
72
|
- **Configurable SGE settings**: PE name, memory resource name, time resource name all come from config, not hardcoded
|
|
65
73
|
- **Descriptor pattern**: Scheduler arguments use Python descriptors for type-safe flag/directive generation
|
|
74
|
+
|
|
75
|
+
## TUI Development Rules
|
|
76
|
+
|
|
77
|
+
### Styling Requirements (CRITICAL)
|
|
78
|
+
|
|
79
|
+
All TUI components MUST follow these styling patterns. **Do NOT use DEFAULT_CSS in components** - put all styles in `monitor.tcss` for consistency.
|
|
80
|
+
|
|
81
|
+
**Core Principles:**
|
|
82
|
+
- **Transparent backgrounds everywhere** - use `background: transparent` on all widgets
|
|
83
|
+
- **Rounded borders** - use `border: round $border-blurred` (unfocused) or `border: round $border` (focused)
|
|
84
|
+
- **No solid colored backgrounds** except for highlighted/selected items
|
|
85
|
+
- **Border titles in $primary** - use `border-title-color: $primary`
|
|
86
|
+
|
|
87
|
+
**Standard Widget Patterns:**
|
|
88
|
+
```css
|
|
89
|
+
/* Panels and containers */
|
|
90
|
+
MyWidget {
|
|
91
|
+
background: transparent;
|
|
92
|
+
border: round $border-blurred;
|
|
93
|
+
border-title-color: $primary;
|
|
94
|
+
border-title-background: transparent;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
MyWidget:focus, MyWidget:focus-within {
|
|
98
|
+
border: round $border;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/* Buttons - transparent with border */
|
|
102
|
+
Button {
|
|
103
|
+
background: transparent;
|
|
104
|
+
border: round $border-blurred;
|
|
105
|
+
color: $foreground;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
Button:hover {
|
|
109
|
+
background: $boost;
|
|
110
|
+
border: round $border;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
/* Popups/overlays - transparent background */
|
|
114
|
+
Popup {
|
|
115
|
+
layer: overlay;
|
|
116
|
+
background: transparent;
|
|
117
|
+
border: round $primary;
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**CSS Variables (defined in monitor.tcss):**
|
|
122
|
+
- `$border-blurred` - muted border for unfocused elements
|
|
123
|
+
- `$border` - bright border for focused elements
|
|
124
|
+
- `$primary` - teal accent color (#88C0D0)
|
|
125
|
+
- `$error` - red for destructive actions
|
|
126
|
+
|
|
127
|
+
**Verification:**
|
|
128
|
+
|
|
129
|
+
After ANY edit to TUI code, verify visually that:
|
|
130
|
+
1. All backgrounds are transparent (terminal shows through)
|
|
131
|
+
2. Borders are rounded (╭╮╰╯ characters)
|
|
132
|
+
3. No solid color blocks except for selected/highlighted items
|
|
133
|
+
4. Focus states brighten borders appropriately
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hpc-runner
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Unified HPC job submission across multiple schedulers
|
|
5
|
+
Project-URL: Homepage, https://github.com/sjalloq/hpc-runner
|
|
6
|
+
Project-URL: Repository, https://github.com/sjalloq/hpc-runner
|
|
7
|
+
Author: Shareef Jalloq
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: cluster,hpc,job-submission,pbs,sge,slurm
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: System :: Clustering
|
|
21
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: jinja2>=3.0
|
|
24
|
+
Requires-Dist: rich-click>=1.7
|
|
25
|
+
Requires-Dist: textual>=6.11
|
|
26
|
+
Requires-Dist: tomli>=2.0; python_version < '3.11'
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Requires-Dist: build; extra == 'all'
|
|
29
|
+
Requires-Dist: hatch-vcs; extra == 'all'
|
|
30
|
+
Requires-Dist: mypy; extra == 'all'
|
|
31
|
+
Requires-Dist: pytest-cov; extra == 'all'
|
|
32
|
+
Requires-Dist: pytest>=7.0; extra == 'all'
|
|
33
|
+
Requires-Dist: ruff; extra == 'all'
|
|
34
|
+
Requires-Dist: twine; extra == 'all'
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: build; extra == 'dev'
|
|
37
|
+
Requires-Dist: hatch-vcs; extra == 'dev'
|
|
38
|
+
Requires-Dist: mypy; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
42
|
+
Requires-Dist: twine; extra == 'dev'
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
|
|
45
|
+
# hpc-runner
|
|
46
|
+
|
|
47
|
+
**Unified HPC job submission across multiple schedulers**
|
|
48
|
+
|
|
49
|
+
Write your jobs once, run them on any cluster - SGE, Slurm, PBS, or locally for testing.
|
|
50
|
+
|
|
51
|
+
## Features
|
|
52
|
+
|
|
53
|
+
- **Unified CLI** - Same commands on every supported scheduler (SGE and local today; Slurm and PBS are planned)
|
|
54
|
+
- **Python API** - Programmatic job submission with dependencies and pipelines
|
|
55
|
+
- **Auto-detection** - Automatically finds your cluster's scheduler
|
|
56
|
+
- **Interactive TUI** - Monitor jobs with a terminal dashboard
|
|
57
|
+
- **Job Dependencies** - Chain jobs with afterok, afterany, afternotok
|
|
58
|
+
- **Array Jobs** - Batch processing with throttling support
|
|
59
|
+
- **Virtual Environment Handling** - Automatic venv activation on compute nodes
|
|
60
|
+
- **Module Integration** - Load environment modules in job scripts
|
|
61
|
+
- **Dry-run Mode** - Preview generated scripts before submission
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install hpc-runner
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Or with uv:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
uv pip install hpc-runner
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Quick Start
|
|
76
|
+
|
|
77
|
+
### CLI
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Basic job submission
|
|
81
|
+
hpc run python train.py
|
|
82
|
+
|
|
83
|
+
# With resources
|
|
84
|
+
hpc run --cpu 4 --mem 16G --time 4:00:00 "python train.py"
|
|
85
|
+
|
|
86
|
+
# GPU job
|
|
87
|
+
hpc run --queue gpu --cpu 4 --mem 32G "python train.py --epochs 100"
|
|
88
|
+
|
|
89
|
+
# Preview without submitting
|
|
90
|
+
hpc run --dry-run --cpu 8 "make -j8"
|
|
91
|
+
|
|
92
|
+
# Interactive session
|
|
93
|
+
hpc run --interactive bash
|
|
94
|
+
|
|
95
|
+
# Array job
|
|
96
|
+
hpc run --array 1-100 "python process.py --task-id \$SGE_TASK_ID"
|
|
97
|
+
|
|
98
|
+
# Wait for completion
|
|
99
|
+
hpc run --wait python long_job.py
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Python API
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from hpc_runner import Job
|
|
106
|
+
|
|
107
|
+
# Create and submit a job
|
|
108
|
+
job = Job(
|
|
109
|
+
command="python train.py",
|
|
110
|
+
cpu=4,
|
|
111
|
+
mem="16G",
|
|
112
|
+
time="4:00:00",
|
|
113
|
+
queue="gpu",
|
|
114
|
+
)
|
|
115
|
+
result = job.submit()
|
|
116
|
+
|
|
117
|
+
# Wait for completion
|
|
118
|
+
status = result.wait()
|
|
119
|
+
print(f"Exit code: {result.returncode}")
|
|
120
|
+
|
|
121
|
+
# Read output
|
|
122
|
+
print(result.read_stdout())
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Job Dependencies
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from hpc_runner import Job
|
|
129
|
+
|
|
130
|
+
# First job
|
|
131
|
+
preprocess = Job(command="python preprocess.py", cpu=8, mem="32G")
|
|
132
|
+
result1 = preprocess.submit()
|
|
133
|
+
|
|
134
|
+
# Second job runs after first succeeds
|
|
135
|
+
train = Job(command="python train.py", cpu=4, mem="48G", queue="gpu")
|
|
136
|
+
train.after(result1, type="afterok")
|
|
137
|
+
result2 = train.submit()
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Pipelines
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from hpc_runner import Pipeline
|
|
144
|
+
|
|
145
|
+
with Pipeline("ml_workflow") as p:
|
|
146
|
+
p.add("python preprocess.py", name="preprocess", cpu=8)
|
|
147
|
+
p.add("python train.py", name="train", depends_on=["preprocess"], queue="gpu")
|
|
148
|
+
p.add("python evaluate.py", name="evaluate", depends_on=["train"])
|
|
149
|
+
|
|
150
|
+
results = p.submit()
|
|
151
|
+
p.wait()
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Scheduler Support
|
|
155
|
+
|
|
156
|
+
| Scheduler | Status | Notes |
|
|
157
|
+
|-----------|--------|-------|
|
|
158
|
+
| SGE | Fully implemented | qsub, qstat, qdel, qrsh |
|
|
159
|
+
| Local | Fully implemented | Run as subprocess (for testing) |
|
|
160
|
+
| Slurm | Planned | sbatch, squeue, scancel |
|
|
161
|
+
| PBS | Planned | qsub, qstat, qdel |
|
|
162
|
+
|
|
163
|
+
### Auto-detection Priority
|
|
164
|
+
|
|
165
|
+
1. `HPC_SCHEDULER` environment variable
|
|
166
|
+
2. SGE (`SGE_ROOT` or `qstat` available)
|
|
167
|
+
3. Slurm (`sbatch` available)
|
|
168
|
+
4. PBS (`qsub` with PBS)
|
|
169
|
+
5. Local fallback
|
|
170
|
+
|
|
171
|
+
## Configuration
|
|
172
|
+
|
|
173
|
+
hpc-runner reads its settings from TOML configuration files. Locations are listed from highest to lowest priority:
|
|
174
|
+
|
|
175
|
+
1. `--config /path/to/config.toml`
|
|
176
|
+
2. `./hpc-tools.toml`
|
|
177
|
+
3. `./pyproject.toml` under `[tool.hpc-tools]`
|
|
178
|
+
4. Git repository root `hpc-tools.toml`
|
|
179
|
+
5. `~/.config/hpc-tools/config.toml`
|
|
180
|
+
6. Package defaults
|
|
181
|
+
|
|
182
|
+
### Example Configuration
|
|
183
|
+
|
|
184
|
+
```toml
|
|
185
|
+
[defaults]
|
|
186
|
+
cpu = 1
|
|
187
|
+
mem = "4G"
|
|
188
|
+
time = "1:00:00"
|
|
189
|
+
inherit_env = true
|
|
190
|
+
|
|
191
|
+
[schedulers.sge]
|
|
192
|
+
parallel_environment = "smp"
|
|
193
|
+
memory_resource = "mem_free"
|
|
194
|
+
purge_modules = true
|
|
195
|
+
|
|
196
|
+
[types.gpu]
|
|
197
|
+
queue = "gpu"
|
|
198
|
+
resources = [{name = "gpu", value = 1}]
|
|
199
|
+
|
|
200
|
+
[types.interactive]
|
|
201
|
+
queue = "interactive"
|
|
202
|
+
time = "8:00:00"
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Use named job types:
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
hpc run --job-type gpu "python train.py"
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
## TUI Monitor
|
|
212
|
+
|
|
213
|
+
Launch the interactive job monitor:
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
hpc monitor
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
Key bindings:
|
|
220
|
+
- `q` - Quit
|
|
221
|
+
- `r` - Refresh
|
|
222
|
+
- `u` - Toggle user filter (my jobs / all)
|
|
223
|
+
- `/` - Search
|
|
224
|
+
- `Enter` - View job details
|
|
225
|
+
- `Tab` - Switch tabs
|
|
226
|
+
|
|
227
|
+
## CLI Reference
|
|
228
|
+
|
|
229
|
+
```
|
|
230
|
+
hpc run [OPTIONS] COMMAND
|
|
231
|
+
|
|
232
|
+
Options:
|
|
233
|
+
--job-name TEXT Job name
|
|
234
|
+
--cpu INTEGER Number of CPUs
|
|
235
|
+
--mem TEXT Memory (e.g., 16G, 4096M)
|
|
236
|
+
--time TEXT Time limit (e.g., 4:00:00)
|
|
237
|
+
--queue TEXT Queue/partition name
|
|
238
|
+
--directory PATH Working directory
|
|
239
|
+
--module TEXT Module to load (repeatable)
|
|
240
|
+
--array TEXT Array spec (e.g., 1-100, 1-100%5)
|
|
241
|
+
--depend TEXT Job dependencies
|
|
242
|
+
--inherit-env Inherit environment (default: true)
|
|
243
|
+
--no-inherit-env Don't inherit environment
|
|
244
|
+
--interactive Run interactively (qrsh/srun)
|
|
245
|
+
--local Run locally (no scheduler)
|
|
246
|
+
--dry-run Show script without submitting
|
|
247
|
+
--wait Wait for completion
|
|
248
|
+
--keep-script Keep job script for debugging
|
|
249
|
+
-h, --help Show help
|
|
250
|
+
|
|
251
|
+
Other commands:
|
|
252
|
+
hpc status [JOB_ID] Check job status
|
|
253
|
+
hpc cancel JOB_ID Cancel a job
|
|
254
|
+
hpc monitor Interactive TUI
|
|
255
|
+
hpc config show Show active configuration
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
## Development
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
# Setup environment
|
|
262
|
+
source sourceme
|
|
263
|
+
source sourceme --clean # Clean rebuild
|
|
264
|
+
|
|
265
|
+
# Run tests
|
|
266
|
+
pytest
|
|
267
|
+
pytest -v
|
|
268
|
+
pytest -k "test_job"
|
|
269
|
+
|
|
270
|
+
# Type checking
|
|
271
|
+
mypy src/hpc_runner
|
|
272
|
+
|
|
273
|
+
# Linting
|
|
274
|
+
ruff check src/hpc_runner
|
|
275
|
+
ruff format src/hpc_runner
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Documentation
|
|
279
|
+
|
|
280
|
+
- [Programmatic API Reference](docs/programmatic_api.md)
|
|
281
|
+
- [TUI Styling Guide](docs/TEXTUAL_STYLING_COOKBOOK.md)
|
|
282
|
+
|
|
283
|
+
## License
|
|
284
|
+
|
|
285
|
+
MIT License - see LICENSE file for details.
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
# hpc-runner
|
|
2
|
+
|
|
3
|
+
**Unified HPC job submission across multiple schedulers**
|
|
4
|
+
|
|
5
|
+
Write your jobs once, run them on any cluster - SGE, Slurm, PBS, or locally for testing.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Unified CLI** - Same commands on every supported scheduler (SGE and local today; Slurm and PBS are planned)
|
|
10
|
+
- **Python API** - Programmatic job submission with dependencies and pipelines
|
|
11
|
+
- **Auto-detection** - Automatically finds your cluster's scheduler
|
|
12
|
+
- **Interactive TUI** - Monitor jobs with a terminal dashboard
|
|
13
|
+
- **Job Dependencies** - Chain jobs with afterok, afterany, afternotok
|
|
14
|
+
- **Array Jobs** - Batch processing with throttling support
|
|
15
|
+
- **Virtual Environment Handling** - Automatic venv activation on compute nodes
|
|
16
|
+
- **Module Integration** - Load environment modules in job scripts
|
|
17
|
+
- **Dry-run Mode** - Preview generated scripts before submission
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install hpc-runner
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Or with uv:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv pip install hpc-runner
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quick Start
|
|
32
|
+
|
|
33
|
+
### CLI
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# Basic job submission
|
|
37
|
+
hpc run python train.py
|
|
38
|
+
|
|
39
|
+
# With resources
|
|
40
|
+
hpc run --cpu 4 --mem 16G --time 4:00:00 "python train.py"
|
|
41
|
+
|
|
42
|
+
# GPU job
|
|
43
|
+
hpc run --queue gpu --cpu 4 --mem 32G "python train.py --epochs 100"
|
|
44
|
+
|
|
45
|
+
# Preview without submitting
|
|
46
|
+
hpc run --dry-run --cpu 8 "make -j8"
|
|
47
|
+
|
|
48
|
+
# Interactive session
|
|
49
|
+
hpc run --interactive bash
|
|
50
|
+
|
|
51
|
+
# Array job
|
|
52
|
+
hpc run --array 1-100 "python process.py --task-id \$SGE_TASK_ID"
|
|
53
|
+
|
|
54
|
+
# Wait for completion
|
|
55
|
+
hpc run --wait python long_job.py
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Python API
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from hpc_runner import Job
|
|
62
|
+
|
|
63
|
+
# Create and submit a job
|
|
64
|
+
job = Job(
|
|
65
|
+
command="python train.py",
|
|
66
|
+
cpu=4,
|
|
67
|
+
mem="16G",
|
|
68
|
+
time="4:00:00",
|
|
69
|
+
queue="gpu",
|
|
70
|
+
)
|
|
71
|
+
result = job.submit()
|
|
72
|
+
|
|
73
|
+
# Wait for completion
|
|
74
|
+
status = result.wait()
|
|
75
|
+
print(f"Exit code: {result.returncode}")
|
|
76
|
+
|
|
77
|
+
# Read output
|
|
78
|
+
print(result.read_stdout())
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Job Dependencies
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from hpc_runner import Job
|
|
85
|
+
|
|
86
|
+
# First job
|
|
87
|
+
preprocess = Job(command="python preprocess.py", cpu=8, mem="32G")
|
|
88
|
+
result1 = preprocess.submit()
|
|
89
|
+
|
|
90
|
+
# Second job runs after first succeeds
|
|
91
|
+
train = Job(command="python train.py", cpu=4, mem="48G", queue="gpu")
|
|
92
|
+
train.after(result1, type="afterok")
|
|
93
|
+
result2 = train.submit()
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Pipelines
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from hpc_runner import Pipeline
|
|
100
|
+
|
|
101
|
+
with Pipeline("ml_workflow") as p:
|
|
102
|
+
p.add("python preprocess.py", name="preprocess", cpu=8)
|
|
103
|
+
p.add("python train.py", name="train", depends_on=["preprocess"], queue="gpu")
|
|
104
|
+
p.add("python evaluate.py", name="evaluate", depends_on=["train"])
|
|
105
|
+
|
|
106
|
+
results = p.submit()
|
|
107
|
+
p.wait()
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Scheduler Support
|
|
111
|
+
|
|
112
|
+
| Scheduler | Status | Notes |
|
|
113
|
+
|-----------|--------|-------|
|
|
114
|
+
| SGE | Fully implemented | qsub, qstat, qdel, qrsh |
|
|
115
|
+
| Local | Fully implemented | Run as subprocess (for testing) |
|
|
116
|
+
| Slurm | Planned | sbatch, squeue, scancel |
|
|
117
|
+
| PBS | Planned | qsub, qstat, qdel |
|
|
118
|
+
|
|
119
|
+
### Auto-detection Priority
|
|
120
|
+
|
|
121
|
+
1. `HPC_SCHEDULER` environment variable
|
|
122
|
+
2. SGE (`SGE_ROOT` or `qstat` available)
|
|
123
|
+
3. Slurm (`sbatch` available)
|
|
124
|
+
4. PBS (`qsub` with PBS)
|
|
125
|
+
5. Local fallback
|
|
126
|
+
|
|
127
|
+
## Configuration
|
|
128
|
+
|
|
129
|
+
hpc-runner reads its settings from TOML configuration files. Locations are listed from highest to lowest priority:
|
|
130
|
+
|
|
131
|
+
1. `--config /path/to/config.toml`
|
|
132
|
+
2. `./hpc-tools.toml`
|
|
133
|
+
3. `./pyproject.toml` under `[tool.hpc-tools]`
|
|
134
|
+
4. Git repository root `hpc-tools.toml`
|
|
135
|
+
5. `~/.config/hpc-tools/config.toml`
|
|
136
|
+
6. Package defaults
|
|
137
|
+
|
|
138
|
+
### Example Configuration
|
|
139
|
+
|
|
140
|
+
```toml
|
|
141
|
+
[defaults]
|
|
142
|
+
cpu = 1
|
|
143
|
+
mem = "4G"
|
|
144
|
+
time = "1:00:00"
|
|
145
|
+
inherit_env = true
|
|
146
|
+
|
|
147
|
+
[schedulers.sge]
|
|
148
|
+
parallel_environment = "smp"
|
|
149
|
+
memory_resource = "mem_free"
|
|
150
|
+
purge_modules = true
|
|
151
|
+
|
|
152
|
+
[types.gpu]
|
|
153
|
+
queue = "gpu"
|
|
154
|
+
resources = [{name = "gpu", value = 1}]
|
|
155
|
+
|
|
156
|
+
[types.interactive]
|
|
157
|
+
queue = "interactive"
|
|
158
|
+
time = "8:00:00"
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Use named job types:
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
hpc run --job-type gpu "python train.py"
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## TUI Monitor
|
|
168
|
+
|
|
169
|
+
Launch the interactive job monitor:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
hpc monitor
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Key bindings:
|
|
176
|
+
- `q` - Quit
|
|
177
|
+
- `r` - Refresh
|
|
178
|
+
- `u` - Toggle user filter (my jobs / all)
|
|
179
|
+
- `/` - Search
|
|
180
|
+
- `Enter` - View job details
|
|
181
|
+
- `Tab` - Switch tabs
|
|
182
|
+
|
|
183
|
+
## CLI Reference
|
|
184
|
+
|
|
185
|
+
```
|
|
186
|
+
hpc run [OPTIONS] COMMAND
|
|
187
|
+
|
|
188
|
+
Options:
|
|
189
|
+
--job-name TEXT Job name
|
|
190
|
+
--cpu INTEGER Number of CPUs
|
|
191
|
+
--mem TEXT Memory (e.g., 16G, 4096M)
|
|
192
|
+
--time TEXT Time limit (e.g., 4:00:00)
|
|
193
|
+
--queue TEXT Queue/partition name
|
|
194
|
+
--directory PATH Working directory
|
|
195
|
+
--module TEXT Module to load (repeatable)
|
|
196
|
+
--array TEXT Array spec (e.g., 1-100, 1-100%5)
|
|
197
|
+
--depend TEXT Job dependencies
|
|
198
|
+
--inherit-env Inherit environment (default: true)
|
|
199
|
+
--no-inherit-env Don't inherit environment
|
|
200
|
+
--interactive Run interactively (qrsh/srun)
|
|
201
|
+
--local Run locally (no scheduler)
|
|
202
|
+
--dry-run Show script without submitting
|
|
203
|
+
--wait Wait for completion
|
|
204
|
+
--keep-script Keep job script for debugging
|
|
205
|
+
-h, --help Show help
|
|
206
|
+
|
|
207
|
+
Other commands:
|
|
208
|
+
hpc status [JOB_ID] Check job status
|
|
209
|
+
hpc cancel JOB_ID Cancel a job
|
|
210
|
+
hpc monitor Interactive TUI
|
|
211
|
+
hpc config show Show active configuration
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## Development
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
# Setup environment
|
|
218
|
+
source sourceme
|
|
219
|
+
source sourceme --clean # Clean rebuild
|
|
220
|
+
|
|
221
|
+
# Run tests
|
|
222
|
+
pytest
|
|
223
|
+
pytest -v
|
|
224
|
+
pytest -k "test_job"
|
|
225
|
+
|
|
226
|
+
# Type checking
|
|
227
|
+
mypy src/hpc_runner
|
|
228
|
+
|
|
229
|
+
# Linting
|
|
230
|
+
ruff check src/hpc_runner
|
|
231
|
+
ruff format src/hpc_runner
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## Documentation
|
|
235
|
+
|
|
236
|
+
- [Programmatic API Reference](docs/programmatic_api.md)
|
|
237
|
+
- [TUI Styling Guide](docs/TEXTUAL_STYLING_COOKBOOK.md)
|
|
238
|
+
|
|
239
|
+
## License
|
|
240
|
+
|
|
241
|
+
MIT License - see LICENSE file for details.
|
|
@@ -16,6 +16,15 @@ memory_resource = "mem_free"
|
|
|
16
16
|
time_resource = "h_rt"
|
|
17
17
|
merge_output = true
|
|
18
18
|
|
|
19
|
+
# Module handling
|
|
20
|
+
purge_modules = false # Run 'module purge' before loading modules
|
|
21
|
+
silent_modules = false # Use -s flag for silent module operations
|
|
22
|
+
module_init_script = "" # Site-specific module init (empty = auto-detect)
|
|
23
|
+
|
|
24
|
+
# Environment handling
|
|
25
|
+
expand_makeflags = true # Expand $NSLOTS in MAKEFLAGS
|
|
26
|
+
unset_vars = [] # Environment variables to unset (e.g., ["https_proxy"])
|
|
27
|
+
|
|
19
28
|
# Slurm-specific settings (for future use)
|
|
20
29
|
[schedulers.slurm]
|
|
21
30
|
# Default Slurm settings
|