clusterpilot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. clusterpilot-0.1.0/.claude/settings.local.json +41 -0
  2. clusterpilot-0.1.0/.gitignore +222 -0
  3. clusterpilot-0.1.0/CLAUDE.md +400 -0
  4. clusterpilot-0.1.0/LICENSE +21 -0
  5. clusterpilot-0.1.0/PKG-INFO +367 -0
  6. clusterpilot-0.1.0/README.md +339 -0
  7. clusterpilot-0.1.0/clusterpilot/__init__.py +3 -0
  8. clusterpilot-0.1.0/clusterpilot/__main__.py +139 -0
  9. clusterpilot-0.1.0/clusterpilot/cluster/__init__.py +33 -0
  10. clusterpilot-0.1.0/clusterpilot/cluster/probe.py +230 -0
  11. clusterpilot-0.1.0/clusterpilot/cluster/slurm.py +174 -0
  12. clusterpilot-0.1.0/clusterpilot/config.py +197 -0
  13. clusterpilot-0.1.0/clusterpilot/db.py +286 -0
  14. clusterpilot-0.1.0/clusterpilot/jobs/__init__.py +4 -0
  15. clusterpilot-0.1.0/clusterpilot/jobs/ai_gen.py +375 -0
  16. clusterpilot-0.1.0/clusterpilot/jobs/daemon.py +328 -0
  17. clusterpilot-0.1.0/clusterpilot/jobs/env_detect.py +170 -0
  18. clusterpilot-0.1.0/clusterpilot/notify/__init__.py +19 -0
  19. clusterpilot-0.1.0/clusterpilot/notify/ntfy.py +131 -0
  20. clusterpilot-0.1.0/clusterpilot/ssh/__init__.py +15 -0
  21. clusterpilot-0.1.0/clusterpilot/ssh/connection.py +122 -0
  22. clusterpilot-0.1.0/clusterpilot/ssh/rsync.py +167 -0
  23. clusterpilot-0.1.0/clusterpilot/tui/__init__.py +3 -0
  24. clusterpilot-0.1.0/clusterpilot/tui/app.py +679 -0
  25. clusterpilot-0.1.0/clusterpilot/tui/config_view.py +87 -0
  26. clusterpilot-0.1.0/clusterpilot/tui/jobs.py +369 -0
  27. clusterpilot-0.1.0/clusterpilot/tui/submit.py +692 -0
  28. clusterpilot-0.1.0/clusterpilot/tui/widgets/__init__.py +0 -0
  29. clusterpilot-0.1.0/clusterpilot/tui/widgets/file_explorer.py +155 -0
  30. clusterpilot-0.1.0/clusterpilot-tui.jsx +679 -0
  31. clusterpilot-0.1.0/docs/screenshots/tui-submit-ssh-notruecolor.png +0 -0
  32. clusterpilot-0.1.0/docs/screenshots/tui-submit.png +0 -0
  33. clusterpilot-0.1.0/hpc-app-flow.jsx +502 -0
  34. clusterpilot-0.1.0/pyproject.toml +55 -0
  35. clusterpilot-0.1.0/tests/__init__.py +0 -0
  36. clusterpilot-0.1.0/tests/test_ai_gen.py +146 -0
  37. clusterpilot-0.1.0/tests/test_config.py +201 -0
  38. clusterpilot-0.1.0/tests/test_db.py +207 -0
  39. clusterpilot-0.1.0/tests/test_env_detect.py +198 -0
  40. clusterpilot-0.1.0/tests/test_ntfy.py +194 -0
  41. clusterpilot-0.1.0/tests/test_probe.py +248 -0
  42. clusterpilot-0.1.0/tests/test_slurm.py +175 -0
  43. clusterpilot-0.1.0/tests/test_ssh.py +120 -0
@@ -0,0 +1,41 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(/Users/juliafrank/Projects/clusterpilot/.venv/bin/python:*)",
5
+ "Bash(wc:*)",
6
+ "Bash(git add:*)",
7
+ "Bash(git commit:*)",
8
+ "Bash(git push:*)",
9
+ "Bash(git remote:*)",
10
+ "Bash(git fetch:*)",
11
+ "Bash(git rebase:*)",
12
+ "Bash(.venv/bin/pip show:*)",
13
+ "Bash(.venv/bin/python -c \"import pytest; print\\(pytest.__version__\\)\" 2>&1; .venv/bin/python -c \"import pytest_asyncio; print\\(pytest_asyncio.__version__\\)\" 2>&1; .venv/bin/python -c \"import aiosqlite; print\\(aiosqlite.__version__\\)\" 2>&1)",
14
+ "Bash(.venv/bin/pip install:*)",
15
+ "Bash(.venv/bin/python -m pytest tests/ -v 2>&1)",
16
+ "Bash(.venv/bin/python -m pytest tests/ --tb=short -q 2>&1)",
17
+ "Bash(.venv/bin/python -m pytest tests/ --tb=short -q 2>&1; echo \"EXIT: $?\")",
18
+ "Bash(.venv/bin/python -m pytest tests/test_db.py::test_get_all_jobs_newest_first tests/test_ntfy.py::TestJobEventHelpers::test_notify_failed_includes_log_excerpt tests/test_ssh.py::TestRunRemote::test_raises_ssh_error_on_timeout -v 2>&1)",
19
+ "Bash(.venv/bin/python -m pytest tests/test_ai_gen.py -v 2>&1)",
20
+ "Bash(python3 -c \"\nimport shutil, os\nsrc_dir = '/Users/juliafrank/Documents/Screenshots'\ndst_dir = '/Users/juliafrank/Projects/clusterpilot/docs/screenshots'\nos.makedirs\\(dst_dir, exist_ok=True\\)\nfor f in os.listdir\\(src_dir\\):\n if '2026-03-12' in f and f.endswith\\('.png'\\):\n print\\(repr\\(f\\)\\)\n if '2.04.43' in f:\n shutil.copy\\(os.path.join\\(src_dir, f\\), os.path.join\\(dst_dir, 'tui-submit.png'\\)\\)\n elif '2.04.46' in f:\n shutil.copy\\(os.path.join\\(src_dir, f\\), os.path.join\\(dst_dir, 'tui-submit-ssh-notruecolor.png'\\)\\)\nprint\\('done'\\)\n\")",
21
+ "Bash(python -m pytest tests/test_env_detect.py tests/test_ai_gen.py -v 2>&1)",
22
+ "Bash(python -c \"from clusterpilot.tui.widgets.file_explorer import FileExplorer, load_recent_paths, save_recent_path; print\\('import OK'\\)\")",
23
+ "Bash(python -c \"from clusterpilot.tui.app import ClusterPilotApp; print\\('app import OK'\\)\")",
24
+ "Bash(python -c \"from clusterpilot.jobs.ai_gen import generate_script; print\\('import OK'\\)\")",
25
+ "Bash(python -c \"from clusterpilot.tui.submit import SubmitView; print\\('import OK'\\)\")",
26
+ "Bash(python -c \"from clusterpilot.config import load_config, _DEFAULT_DOWNLOAD_EXCLUDES; print\\('Download excludes:', _DEFAULT_DOWNLOAD_EXCLUDES\\)\")",
27
+ "Bash(python -c \"from clusterpilot.tui.jobs import JobsView; from clusterpilot.db import delete_job; from clusterpilot.cluster.slurm import cat_log; print\\('OK'\\)\")",
28
+ "WebSearch",
29
+ "Bash(python -c \"from clusterpilot.tui.app import ClusterPilotApp; from clusterpilot.jobs.ai_gen import ApiUsage, _PRICING; from clusterpilot.db import get_total_usage; print\\('OK'\\)\")",
30
+ "Bash(python -c \"from clusterpilot.tui.jobs import JobsView; from clusterpilot.tui.submit import SubmitView; from clusterpilot.tui.app import ClusterPilotApp; print\\('All OK'\\)\")",
31
+ "Bash(python -c \"import textual; print\\(textual.__version__\\)\")",
32
+ "Bash(sqlite3 ~/.local/share/clusterpilot/jobs.db \"SELECT job_id, status, walltime, partition FROM jobs WHERE job_id = '6691110';\" 2>/dev/null || echo \"DB or job not found\")",
33
+ "Bash(python3 -c \"\nfrom clusterpilot.config import load_config\ncfg = load_config\\(\\)\nprint\\(f'Topic: {cfg.notifications.ntfy_topic!r}'\\)\nprint\\(f'Server: {cfg.notifications.ntfy_server!r}'\\)\nprint\\(f'Backend: {cfg.notifications.backend!r}'\\)\n\")",
34
+ "Bash(python3 << 'EOF'\nimport asyncio\nfrom clusterpilot.config import load_config\nfrom clusterpilot.db import JobRecord\nfrom clusterpilot.notify.ntfy import notify_started\n\nasync def test\\(\\):\n cfg = load_config\\(\\)\n job = JobRecord\\(\n job_id=\"6691110\",\n cluster_name=\"grex\",\n job_name=\"test_job\",\n partition=\"stamps\",\n status=\"RUNNING\",\n walltime=\"0-00:30:00\",\n local_dir=\"/tmp/test\",\n working_dir=\"/home/juliaf/clusterpilot_jobs/test_job\",\n log_path=\"/home/juliaf/clusterpilot_jobs/test_job/slurm.out\",\n started_at=None,\n finished_at=None,\n synced=False,\n \\)\n \n print\\(f\"Config notifications: {cfg.notifications}\"\\)\n print\\(f\"ntfy_topic: {cfg.notifications.ntfy_topic!r}\"\\)\n print\\(f\"Job data: {job}\"\\)\n \n try:\n await notify_started\\(cfg.notifications, job\\)\n print\\(\"✓ Notification sent successfully\"\\)\n except Exception as e:\n print\\(f\"✗ Error: {type\\(e\\).__name__}: {e}\"\\)\n\nasyncio.run\\(test\\(\\)\\)\nEOF)",
35
+ "Bash(python3 << 'EOF'\nimport asyncio\nimport aiosqlite\nfrom pathlib import Path\nfrom clusterpilot.config import load_config\nfrom clusterpilot.db import DB_PATH, init_db, get_active_jobs\nfrom clusterpilot.notify.ntfy import notify_started\n\nasync def test\\(\\):\n cfg = load_config\\(\\)\n \n async with aiosqlite.connect\\(DB_PATH\\) as db:\n await init_db\\(db\\)\n \n # Get the specific job from the database\n cursor = await db.execute\\(\n \"SELECT job_id, job_name, cluster_name, host, user, account, partition, script_path, working_dir, local_dir, walltime, status, started_at, finished_at, log_path, synced, input_tokens, output_tokens, model_used FROM jobs WHERE job_id = '6691110'\"\n \\)\n row = await cursor.fetchone\\(\\)\n \n if not row:\n print\\(\"Job not found in database\"\\)\n return\n \n from clusterpilot.db import JobRecord\n job = JobRecord\\(\n job_id=row[0],\n job_name=row[1],\n cluster_name=row[2],\n host=row[3],\n user=row[4],\n account=row[5],\n partition=row[6],\n script_path=row[7],\n working_dir=row[8],\n local_dir=row[9],\n walltime=row[10],\n status=row[11],\n started_at=row[12],\n finished_at=row[13],\n log_path=row[14],\n synced=bool\\(row[15]\\),\n input_tokens=row[16],\n output_tokens=row[17],\n model_used=row[18],\n \\)\n \n print\\(f\"Job: {job.job_id} {job.job_name} on {job.cluster_name}\"\\)\n print\\(f\"Config: {cfg.notifications}\"\\)\n \n try:\n await notify_started\\(cfg.notifications, job\\)\n print\\(\"✓ Notification sent successfully\"\\)\n except Exception as e:\n import traceback\n print\\(f\"✗ Error: {type\\(e\\).__name__}: {e}\"\\)\n traceback.print_exc\\(\\)\n\nasyncio.run\\(test\\(\\)\\)\nEOF)",
36
+ "Bash(python3 << 'EOF'\nwith open\\('/Users/juliafrank/Projects/clusterpilot/clusterpilot/notify/ntfy.py', 'rb'\\) as f:\n content = f.read\\(\\).decode\\('utf-8'\\)\n\nlines = content.split\\('\\\\n'\\)\nprint\\(\"=== Non-ASCII characters in notification titles ===\\\\n\"\\)\nfor i, line in enumerate\\(lines, 1\\):\n if 'title=' in line and i > 50: # After the send\\(\\) function\n print\\(f\"Line {i}: {line.strip\\(\\)}\"\\)\n for j, char in enumerate\\(line\\):\n if ord\\(char\\) > 127:\n print\\(f\" Position {j}: {char!r} \\(U+{ord\\(char\\):04X}\\)\"\\)\n print\\(\\)\nEOF)",
37
+ "Bash(python -m build --help 2>/dev/null || echo \"build not installed\")",
38
+ "Bash(.venv/bin/python -m build 2>&1)"
39
+ ]
40
+ }
41
+ }
@@ -0,0 +1,222 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # ClusterPilot: never commit credentials or private config
210
+ config.toml
211
+ *.toml.local
212
+
213
+ # ClusterPilot: cache and runtime state
214
+ *.sqlite
215
+ *.db
216
+
217
+ # macOS
218
+ .DS_Store
219
+ .AppleDouble
220
+
221
+ # Design files kept in repo (intentionally tracked)
222
+ # hpc-app-flow.jsx and clusterpilot-tui.jsx are design references
@@ -0,0 +1,400 @@
1
+ # ClusterPilot
2
+
3
+ AI-assisted HPC workflow manager for Compute Canada (DRAC) clusters
4
+ and University of Manitoba's Grex cluster.
5
+ Built by a computational physics PhD student who got tired of doing this manually.
6
+
7
+ ---
8
+
9
+ ## What this does
10
+
11
+ Automates the full local-to-cluster-and-back workflow for researchers on
12
+ supported HPC clusters:
13
+
14
+ 1. SSH authentication via ControlMaster (one interactive login, then headless)
15
+ 2. Cluster environment discovery (sinfo, module avail, sacctmgr)
16
+ 3. AI-generated SLURM scripts via the Anthropic API, contextualised to the
17
+ specific cluster's partitions, modules, accounts, and quirks
18
+ 4. File upload to the appropriate scratch/work filesystem via rsync
19
+ 5. Job submission and SQLite-backed job tracking
20
+ 6. Background poll daemon (systemd user service) -- polls squeue every 5 min,
21
+ no persistent SSH connection required
22
+ 7. Push notifications to phone on job events (started, completed, failed, ETA)
23
+ 8. Automatic result sync back to local workstation on job completion
24
+
25
+ ---
26
+
27
+ ## Supported clusters
28
+
29
+ ### 1. Compute Canada / DRAC national clusters
30
+
31
+ cedar, narval, graham, beluga.
32
+ Docs: https://docs.alliancecan.ca/wiki/Getting_started
33
+
34
+ **DRAC-specific SLURM quirks -- inject into every generation prompt:**
35
+
36
+ - `--account=def-supervisorname` is mandatory in every script; the job will
37
+ be rejected without it
38
+ - Job I/O must target `$SCRATCH`, never `$HOME` (home quota is ~50 GB;
39
+ scratch is a large fast parallel filesystem)
40
+ - Module system is Lmod: `module load julia/1.10.4 cuda/12.2`
41
+ - Use `module spider <name>` to find available versions
42
+ - GPU syntax on cedar/narval: `--gres=gpu:a100:2`
43
+ - `$SLURM_TMPDIR` is fast local node-level SSD; use for temporary files
44
+ during a run, copy results to `$SCRATCH` before job ends
45
+ - Email notifications: `--mail-type=FAIL,END` and `--mail-user=`
46
+ - Array jobs: `--array=0-N%M` (M = max simultaneous)
47
+ - GPU walltime limits: 24h on cedar gpu partition, 48h on narval gpu partition
48
+ - Always `module purge` before loading new modules to avoid conflicts
49
+
50
+ **DRAC scratch path:**
51
+ ```
52
+ $SCRATCH -> /scratch/<username>/
53
+ ```
54
+
55
+ ---
56
+
57
+ ### 2. University of Manitoba -- Grex
58
+
59
+ Grex is a community HPC cluster at UManitoba, running SLURM. Available to
60
+ UManitoba-affiliated researchers and their collaborators. It is a heterogeneous
61
+ cluster with contributed nodes, large-memory nodes, and GPU nodes.
62
+ Docs: https://um-grex.github.io/grex-docs/
63
+
64
+ **Login:**
65
+ - Hostname: `grex.hpc.umanitoba.ca`
66
+ - Login nodes: `bison.hpc.umanitoba.ca`, `yak.hpc.umanitoba.ca`
67
+ (tatanka and zebu were decommissioned in August-September 2024 -- never use these)
68
+ - MFA is required
69
+ - SSH access works from off-campus without VPN; VPN is only needed for
70
+ the OpenOnDemand web interface (ood.hpc.umanitoba.ca)
71
+
72
+ **Grex-specific SLURM quirks -- inject into every generation prompt:**
73
+
74
+ - `--account=` is NOT mandatory on Grex the way it is on DRAC; most users
75
+ submit to the community pool without specifying an account
76
+ - Partitions must always be specified explicitly; the app handles this via the
77
+ partition picker on the F2 Submit screen (populated from sinfo cache)
78
+ - The only SLURM default on Grex is `skylake` for non-contributor CPU jobs;
79
+ the app should not rely on this -- the picker always makes selection explicit
80
+ - GPU jobs MUST use a GPU partition; requesting `--gres=gpu:...` on `skylake`,
81
+ `compute`, or `largemem` will cause the job to be rejected by SLURM
82
+ - Multiple partitions can be listed: `--partition=skylake,largemem` so SLURM
83
+ picks whichever is free first; useful for CPU jobs that fit either
84
+ - `$SLURM_TMPDIR` is fast local node disk; use for temp I/O, copy results
85
+ to `$HOME` or group project storage before the job ends
86
+ - Grex does NOT have a `$SCRATCH` environment variable like DRAC; the
87
+ equivalent is `$HOME` (for smaller data) or a group project directory
88
+ - Contributed/community nodes: non-owner jobs run opportunistically and may
89
+ be preempted; do not use these partitions for long uncheckpointed runs
90
+ unless you own them
91
+ - Module system is Lmod, same syntax as DRAC: `module load <name>/<version>`
92
+ - CVMFS is available as an additional software stack source
93
+
94
+ **Grex partitions (inject relevant rows per job type):**
95
+
96
+ | Partition | Use case | Notes |
97
+ |------------|--------------------------------|------------------------------------------|
98
+ | skylake | Default short CPU jobs | Intel Skylake; auto-default for most users |
99
+ | largemem | High-memory CPU jobs | Must specify explicitly |
100
+ | compute | General CPU | Must specify explicitly |
101
+ | gpu | GPU jobs | Must specify; L40S and older GPU nodes |
102
+ | test | Short interactive/test jobs | Oversubscription enabled; quick turnaround |
103
+ | stamps-b | Contributed (owner priority) | Opportunistic for non-owners |
104
+
105
+ **Grex hardware (as of early 2025):**
106
+ - AMD Genoa CPU nodes: 30 nodes added September 2024, 5760 total cores
107
+ - GPU nodes: 2 nodes with NVIDIA L40S GPUs added in 2025
108
+ - GPU syntax: `--gres=gpu:l40s:1` (L40S nodes) or `--gres=gpu:1` (older nodes)
109
+
110
+ **Grex storage:**
111
+ ```
112
+ $HOME -> /home/<username>/ (personal, limited quota)
113
+ $SLURM_TMPDIR -> fast local node disk (vanishes when job ends)
114
+ group project -> /home/grex/<group>/ (shared group storage)
115
+ ```
116
+ There is no `$SCRATCH` on Grex. Write job outputs to `$HOME` or the group
117
+ project directory. Use `$SLURM_TMPDIR` only for within-job temporary files.
118
+
119
+ ---
120
+
121
+ ## Cluster type abstraction
122
+
123
+ The `cluster_type` field in the config drives which quirks are injected into
124
+ SLURM generation prompts. Adding a new institution means adding a new type
125
+ in `cluster/profiles.py` -- cluster-specific logic lives in one place only.
126
+
127
+ ```toml
128
+ # Example entries in ~/.config/clusterpilot/config.toml
129
+
130
+ [[clusters]]
131
+ name = "cedar"
132
+ hostname = "cedar.computecanada.ca"
133
+ username = "jfrank"
134
+ account = "def-mlafond" # mandatory for DRAC
135
+ ssh_key = "~/.ssh/id_ed25519"
136
+ scratch_path = "/scratch/jfrank"
137
+ cluster_type = "drac"
138
+
139
+ [[clusters]]
140
+ name = "narval"
141
+ hostname = "narval.computecanada.ca"
142
+ username = "jfrank"
143
+ account = "def-mlafond"
144
+ ssh_key = "~/.ssh/id_ed25519"
145
+ scratch_path = "/scratch/jfrank"
146
+ cluster_type = "drac"
147
+
148
+ [[clusters]]
149
+ name = "grex"
150
+ hostname = "grex.hpc.umanitoba.ca"
151
+ username = "jfrank"
152
+ account = "" # not required on Grex
153
+ ssh_key = "~/.ssh/id_ed25519"
154
+ scratch_path = "/home/jfrank" # or group project path; no $SCRATCH on Grex
155
+ cluster_type = "grex"
156
+ ```
157
+
158
+ ---
159
+
160
+ ## Stack
161
+
162
+ - **Language:** Python 3.11+
163
+ - **TUI:** Textual (terminal UI framework)
164
+ - **SSH:** subprocess + system ssh binary with ControlMaster (not Paramiko)
165
+ - **AI:** Anthropic Python SDK, claude-sonnet-4-6 for SLURM generation
166
+ - **Database:** SQLite via stdlib sqlite3 (zero-dependency local job history)
167
+ - **Notifications:** ntfy.sh (or any HTTP POST endpoint) via httpx
168
+ - **Daemon:** systemd user service (clusterpilot-poll.service)
169
+ - **Config:** TOML via stdlib tomllib (stdlib on Python 3.11+, so no tomli fallback needed)
170
+
171
+ ---
172
+
173
+ ## Module structure and build order
174
+
175
+ Build strictly in this order -- each layer depends on the one before it.
176
+
177
+ ```
178
+ clusterpilot/
179
+ ssh/
180
+ session.py # ControlMaster: connect, run_command, is_connected, disconnect
181
+ cluster/
182
+ probe.py # sinfo, module avail, sacctmgr -- parse and cache in SQLite
183
+ # sinfo must capture: partition name, max walltime, GPU gres,
184
+ # node count, state -- enough to populate the partition picker
185
+ cache.py # SQLite cache layer for cluster state (TTL: 24h)
186
+ profiles.py # Cluster type definitions: drac, grex. Add new types here only.
187
+ jobs/
188
+ generate.py # Anthropic API call: cluster context + user description -> script
189
+ submit.py # rsync upload + sbatch, capture job ID
190
+ state.py # State machine: PENDING -> RUNNING -> COMPLETED/FAILED
191
+ db.py # SQLite job log: insert, update, query
192
+ notify/
193
+ push.py # HTTP POST to ntfy endpoint (or any webhook)
194
+ desktop.py # libnotify via subprocess (Linux: notify-send)
195
+ daemon/
196
+ poll.py # Main poll loop: squeue -> state transitions -> notify -> rsync
197
+ service.py # systemd unit file writer and installer
198
+ tui/
199
+ app.py # Textual App root
200
+ screens/
201
+ jobs.py # Job list + detail + log + action buttons (F1)
202
+ submit.py # Job description input + partition picker + AI script generation + file list (F2)
203
+ config.py # Cluster profiles + SSH + notify + API key settings (F9)
204
+ widgets/
205
+ job_table.py # Scrollable job list with status indicators
206
+ log_panel.py # RichLog widget, auto-scroll
207
+ progress.py # Walltime progress bar
208
+ config.py # Config loader: ~/.config/clusterpilot/config.toml
209
+ cli.py # Entry point: `clusterpilot` command
210
+ ```
211
+
212
+ ---
213
+
214
+ ## SLURM script generation -- prompt structure
215
+
216
+ When calling the Anthropic API to generate a SLURM script, the system
217
+ prompt must include three layers of context:
218
+
219
+ 1. **Universal SLURM rules** -- valid for all clusters
220
+ 2. **Cluster-type quirks** -- injected based on `cluster_type` field from
221
+ `cluster/profiles.py` (the DRAC and Grex sections above are the source
222
+ of truth for these quirks)
223
+ 3. **Probed cluster state** -- actual output of `sinfo`, `module avail`,
224
+ and (for DRAC) `sacctmgr`, cached in SQLite for 24h
225
+
226
+ The user message is their plain-language job description, plus the
227
+ **user-selected partition** which is passed as a hard constraint, not a
228
+ suggestion. The AI must honour it and use the correct `--gres` syntax for
229
+ that partition's hardware.
230
+
231
+ Model: `claude-sonnet-4-6`. Opus is not needed for script generation.
232
+
233
+ ---
234
+
235
+ ## Partition picker design
236
+
237
+ Partition selection is a **required manual step** on the F2 Submit screen,
238
+ for all cluster types. The app never auto-selects a partition.
239
+
240
+ Rationale: partition access is personal -- research groups have dedicated
241
+ GPU partitions that general users cannot use, and users generally know which
242
+ partitions they are allowed on. Auto-selection would be wrong as often as it
243
+ would be right.
244
+
245
+ **UX flow on F2 Submit:**
246
+ 1. User selects cluster (dropdown, populated from config)
247
+ 2. User selects partition (dropdown, populated from the sinfo cache for that
248
+ cluster -- shows partition name, max walltime, GPU availability, node count)
249
+ 3. User types plain-language job description
250
+ 4. AI generates script with the chosen partition as a hard `--partition=` value
251
+
252
+ **What the partition picker shows** (parsed from `sinfo -o "%P %l %G %D %a"`):
253
+ - Partition name
254
+ - Max walltime (e.g. `24:00:00`)
255
+ - GPU resources if any (e.g. `gpu:a100:4`, or `(null)` for CPU-only)
256
+ - Number of nodes
257
+ - State (up / down / drain)
258
+
259
+ Show only `up` partitions. Sort GPU partitions to the top since most
260
+ ClusterPilot users are running GPU workloads.
261
+
262
+ **Do not validate** whether the user has access to a given partition -- the
263
+ app has no way to know which contributed or restricted partitions a specific
264
+ user belongs to. Users know their own access. If they pick a partition they
265
+ cannot use, sbatch will reject it with a clear error message that ClusterPilot
266
+ surfaces in the log panel.
267
+
268
+ ---
269
+
270
+ ## SSH strategy
271
+
272
+ System ssh binary via subprocess, not Paramiko. ControlMaster socket
273
+ management is more reliable through the system binary and respects the
274
+ user's existing `~/.ssh/config`.
275
+
276
+ `clusterpilot setup` writes entries like the following on first run:
277
+
278
+ ```
279
+ Host cedar
280
+ HostName cedar.computecanada.ca
281
+ ControlMaster auto
282
+ ControlPath ~/.ssh/cm_%h_%p_%r
283
+ ControlPersist 4h
284
+ ServerAliveInterval 60
285
+
286
+ Host narval
287
+ HostName narval.computecanada.ca
288
+ ControlMaster auto
289
+ ControlPath ~/.ssh/cm_%h_%p_%r
290
+ ControlPersist 4h
291
+ ServerAliveInterval 60
292
+
293
+ Host grex
294
+ HostName grex.hpc.umanitoba.ca
295
+ ControlMaster auto
296
+ ControlPath ~/.ssh/cm_%h_%p_%r
297
+ ControlPersist 4h
298
+ ServerAliveInterval 60
299
+ ```
300
+
301
+ `session.run_command(host, cmd)` opens a connection over the existing socket
302
+ (sub-second), runs the command, and closes. Each poll cycle is:
303
+ connect -> `squeue -j JOB_ID -h` -> disconnect. No persistent pipe.
304
+
305
+ ---
306
+
307
+ ## Notification design
308
+
309
+ `notify.push` sends a single HTTP POST:
310
+
311
+ ```python
312
+ httpx.post(
313
+ config.notifications.endpoint,
314
+ content=message,
315
+ headers={"Title": title, "Priority": priority, "Tags": tags},
316
+ )
317
+ ```
318
+
319
+ The endpoint string is the user's only configuration. Supported out of the box:
320
+ - ntfy.sh free hosted tier (default -- no account needed, sufficient for
321
+ typical ClusterPilot usage volumes)
322
+ - Self-hosted ntfy server (single Go binary)
323
+ - Any webhook accepting a plain POST body
324
+
325
+ Users never need a paid ntfy.sh subscription for normal usage.
326
+
327
+ ---
328
+
329
+ ## Monetisation model (open core / hosted SaaS)
330
+
331
+ ClusterPilot is MIT licensed. The full source is free to use and self-host.
332
+
333
+ ### Free (self-hosted) tier -- always fully functional
334
+ - BYOK: user sets `ANTHROPIC_API_KEY` in environment
335
+ - BYOE: user configures their own ntfy endpoint
336
+ - All features work -- nothing is paywalled in the open source version
337
+ - All supported clusters work: DRAC (cedar, narval, graham, beluga) and Grex
338
+
339
+ ### Hosted tier (future, ~$5/month)
340
+ If launched, paying users get a pooled API key, managed notification
341
+ endpoint, cloud job history, and a web dashboard.
342
+
343
+ Framing: "ClusterPilot is 100% free and open source. A hosted tier exists
344
+ for researchers who want zero setup -- subscribing also supports development."
345
+
346
+ **Do not build the hosted tier yet.** Ship a working open source v0.1 on
347
+ cedar + grex first. Early adopters are already waiting.
348
+
349
+ ---
350
+
351
+ ## Code conventions
352
+
353
+ - British English in all comments, docstrings, and user-facing strings
354
+ - Type hints on all function signatures
355
+ - Dataclasses or TypedDicts for structured data, not bare dicts
356
+ - No class where a module-level function suffices
357
+ - Each module has one clear responsibility (see structure above)
358
+ - All subprocess calls go through `ssh/session.py` -- never call ssh directly
359
+ from other modules
360
+ - All cluster-specific logic lives in `cluster/profiles.py` -- adding a new
361
+ cluster type means editing exactly one file
362
+ - Errors surface as typed exceptions, not bare strings
363
+ - Tests live in `tests/` mirroring the source structure
364
+ - Use `pytest` and mock subprocess calls -- never make real SSH calls in tests
365
+
366
+ ---
367
+
368
+ ## TUI aesthetic reference
369
+
370
+ Phosphor amber terminal aesthetic. Warm amber (`#e8a020`) on near-black
371
+ (`#0c0a06`). Status colours: green (RUNNING), amber (PENDING), cyan
372
+ (COMPLETED), red (FAILED). Box-drawing borders. Monospace throughout.
373
+ Keyboard-driven with persistent footer showing shortcuts.
374
+ See `clusterpilot-tui.jsx` in the project root for the full visual mockup.
375
+
376
+ Textual widget mapping:
377
+ - Partition picker -> `Select` widget, populated from SQLite sinfo cache
378
+ - Job list -> `ListView` with custom `ListItem` subclass
379
+ - Log output -> `RichLog` with auto_scroll=True
380
+ - Progress bars -> `ProgressBar` widget
381
+ - Layout -> `TabbedContent` for F1/F2/F9 tabs, `Horizontal`/`Vertical` containers
382
+ - Footer shortcuts -> `Footer` widget with key bindings
383
+
384
+ ---
385
+
386
+ ## Distribution
387
+
388
+ ```
389
+ pip install clusterpilot
390
+ ```
391
+
392
+ Entry point: `clusterpilot` (defined in `pyproject.toml`).
393
+ First run: `clusterpilot setup` -- interactive wizard that writes config
394
+ and installs the systemd user service.
395
+
396
+ Target: PyPI, MIT licence, public GitHub repo.
397
+ Future: conda-forge (many HPC users prefer conda).
398
+
399
+ Priority for v0.1: cedar + grex working end-to-end. Expand to graham
400
+ and beluga after initial user feedback.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Julia Frank
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.