aiondemandcluster 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiondemandcluster-0.1.0/.env.example +22 -0
- aiondemandcluster-0.1.0/.github/workflows/ci.yml +26 -0
- aiondemandcluster-0.1.0/.github/workflows/publish.yml +39 -0
- aiondemandcluster-0.1.0/.gitignore +24 -0
- aiondemandcluster-0.1.0/LICENSE +21 -0
- aiondemandcluster-0.1.0/PKG-INFO +339 -0
- aiondemandcluster-0.1.0/README.md +314 -0
- aiondemandcluster-0.1.0/aiod/__init__.py +4 -0
- aiondemandcluster-0.1.0/aiod/bench.py +173 -0
- aiondemandcluster-0.1.0/aiod/bootstrap.py +102 -0
- aiondemandcluster-0.1.0/aiod/branding.py +37 -0
- aiondemandcluster-0.1.0/aiod/ccr.py +90 -0
- aiondemandcluster-0.1.0/aiod/cli.py +1004 -0
- aiondemandcluster-0.1.0/aiod/config.py +40 -0
- aiondemandcluster-0.1.0/aiod/engine.py +143 -0
- aiondemandcluster-0.1.0/aiod/events.py +45 -0
- aiondemandcluster-0.1.0/aiod/health.py +136 -0
- aiondemandcluster-0.1.0/aiod/model_configs.py +95 -0
- aiondemandcluster-0.1.0/aiod/onboard.py +127 -0
- aiondemandcluster-0.1.0/aiod/profiles.py +139 -0
- aiondemandcluster-0.1.0/aiod/providers.py +46 -0
- aiondemandcluster-0.1.0/aiod/proxy.py +289 -0
- aiondemandcluster-0.1.0/aiod/runpod.py +203 -0
- aiondemandcluster-0.1.0/aiod/sizing.py +465 -0
- aiondemandcluster-0.1.0/aiod/state.py +69 -0
- aiondemandcluster-0.1.0/aiod/tui.py +541 -0
- aiondemandcluster-0.1.0/aiod/vast.py +379 -0
- aiondemandcluster-0.1.0/aiod/watch.py +146 -0
- aiondemandcluster-0.1.0/pyproject.toml +47 -0
- aiondemandcluster-0.1.0/tests/test_bench.py +33 -0
- aiondemandcluster-0.1.0/tests/test_bootstrap.py +65 -0
- aiondemandcluster-0.1.0/tests/test_ccr.py +49 -0
- aiondemandcluster-0.1.0/tests/test_events.py +26 -0
- aiondemandcluster-0.1.0/tests/test_model_configs.py +34 -0
- aiondemandcluster-0.1.0/tests/test_onboard.py +40 -0
- aiondemandcluster-0.1.0/tests/test_profiles.py +39 -0
- aiondemandcluster-0.1.0/tests/test_proxy.py +59 -0
- aiondemandcluster-0.1.0/tests/test_runpod.py +44 -0
- aiondemandcluster-0.1.0/tests/test_sizing.py +70 -0
- aiondemandcluster-0.1.0/tests/test_vast.py +32 -0
- aiondemandcluster-0.1.0/tests/test_watch.py +35 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# ---- Required (at least one provider) ----
|
|
2
|
+
# vast.ai API key — Account → API Keys at https://cloud.vast.ai/
|
|
3
|
+
VAST_API_KEY=
|
|
4
|
+
|
|
5
|
+
# RunPod API key — Settings → API Keys at https://www.runpod.io/console/user/settings
|
|
6
|
+
# (only needed if you use --provider runpod)
|
|
7
|
+
RUNPOD_API_KEY=
|
|
8
|
+
|
|
9
|
+
# ---- Optional ----
|
|
10
|
+
# HuggingFace token — ONLY needed for gated/private models (Llama, Gemma, etc.)
|
|
11
|
+
# Also raises the rate limit on model-metadata lookups. Public models work without it.
|
|
12
|
+
HF_TOKEN=
|
|
13
|
+
|
|
14
|
+
# Bearer token that protects your vLLM endpoint on the public internet.
|
|
15
|
+
# Leave blank to have `aiod` generate a random one per launch.
|
|
16
|
+
VLLM_API_KEY=
|
|
17
|
+
|
|
18
|
+
# ---- Defaults (override per-run on the CLI) ----
|
|
19
|
+
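# Time-to-live in hours, so a forgotten box can't bill all weekend. The TTL is tracked locally (`aiod status` warns once it is exceeded); it is not enforced on the box itself, so still run `aiod teardown` when done.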
|
|
20
|
+
AIOD_TTL_HOURS=4
|
|
21
|
+
# Hard cap on price; offers above this are never rented.
|
|
22
|
+
AIOD_MAX_PRICE=6.0
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.10", "3.12"]
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: ${{ matrix.python-version }}
|
|
19
|
+
- name: Install
|
|
20
|
+
run: |
|
|
21
|
+
python -m pip install --upgrade pip
|
|
22
|
+
pip install -e ".[dev]"
|
|
23
|
+
- name: Lint
|
|
24
|
+
run: ruff check aiod/
|
|
25
|
+
- name: Test
|
|
26
|
+
run: pytest tests/ -q
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
# Publishes when you push a version tag (e.g. v0.1.0) or create a GitHub Release.
|
|
4
|
+
on:
|
|
5
|
+
push:
|
|
6
|
+
tags: ["v*"]
|
|
7
|
+
release:
|
|
8
|
+
types: [published]
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
build:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.12"
|
|
18
|
+
- name: Build sdist + wheel
|
|
19
|
+
run: |
|
|
20
|
+
python -m pip install --upgrade build
|
|
21
|
+
python -m build
|
|
22
|
+
- uses: actions/upload-artifact@v4
|
|
23
|
+
with:
|
|
24
|
+
name: dist
|
|
25
|
+
path: dist/
|
|
26
|
+
|
|
27
|
+
publish:
|
|
28
|
+
needs: build
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
environment: pypi
|
|
31
|
+
permissions:
|
|
32
|
+
id-token: write # Trusted Publishing (OIDC) — no API token needed
|
|
33
|
+
steps:
|
|
34
|
+
- uses: actions/download-artifact@v4
|
|
35
|
+
with:
|
|
36
|
+
name: dist
|
|
37
|
+
path: dist/
|
|
38
|
+
- name: Publish to PyPI
|
|
39
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Secrets — never commit
|
|
2
|
+
.env
|
|
3
|
+
*.key
|
|
4
|
+
|
|
5
|
+
# Local runtime state (tracks the running instance)
|
|
6
|
+
.aiod/
|
|
7
|
+
state.json
|
|
8
|
+
|
|
9
|
+
# Python
|
|
10
|
+
__pycache__/
|
|
11
|
+
*.py[cod]
|
|
12
|
+
*.egg-info/
|
|
13
|
+
.eggs/
|
|
14
|
+
build/
|
|
15
|
+
dist/
|
|
16
|
+
.venv/
|
|
17
|
+
venv/
|
|
18
|
+
.pytest_cache/
|
|
19
|
+
.ruff_cache/
|
|
20
|
+
|
|
21
|
+
# OS / editor
|
|
22
|
+
.DS_Store
|
|
23
|
+
.idea/
|
|
24
|
+
.vscode/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 AIonDemandCluster contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aiondemandcluster
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: AI on Demand — spin up a HuggingFace model on vast.ai GPUs and drive it from Claude Code via Claude Code Router.
|
|
5
|
+
Project-URL: Homepage, https://github.com/jhammant/AIonDemandCluster
|
|
6
|
+
Project-URL: Repository, https://github.com/jhammant/AIonDemandCluster
|
|
7
|
+
Author: AIonDemandCluster contributors
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: claude-code,gpu,inference,llm,vast.ai,vllm
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Requires-Dist: httpx>=0.27
|
|
13
|
+
Requires-Dist: platformdirs>=4.2
|
|
14
|
+
Requires-Dist: python-dotenv>=1.0
|
|
15
|
+
Requires-Dist: pyyaml>=6.0
|
|
16
|
+
Requires-Dist: rich>=13.7
|
|
17
|
+
Requires-Dist: starlette>=0.37
|
|
18
|
+
Requires-Dist: textual>=0.79
|
|
19
|
+
Requires-Dist: typer>=0.12
|
|
20
|
+
Requires-Dist: uvicorn>=0.30
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# AI on Demand Cluster (`aiod`)
|
|
27
|
+
|
|
28
|
+
Spin up any HuggingFace model on a **vast.ai** GPU, serve it with **vLLM**, and drive
|
|
29
|
+
it from **Claude Code** through **Claude Code Router (CCR)** — all from one command.
|
|
30
|
+
|
|
31
|
+
Give it a HuggingFace link; `aiod` figures out how much VRAM the model needs, finds
|
|
32
|
+
the cheapest matching machine on vast.ai, shows you the live **$/hr** before you commit,
|
|
33
|
+
rents it, waits for the model to load, and writes your CCR config so you can run
|
|
34
|
+
`ccr code` against your own hosted model.
|
|
35
|
+
|
|
36
|
+
```text
|
|
37
|
+
HuggingFace link
|
|
38
|
+
│ (params, dtype, KV shape, context)
|
|
39
|
+
▼
|
|
40
|
+
┌─────────────┐ estimate VRAM per quant ┌──────────────────────┐
|
|
41
|
+
│ sizing │ ─────────────────────────────▶ │ live vast.ai offers │ ← real $/hr
|
|
42
|
+
└─────────────┘ └──────────────────────┘
|
|
43
|
+
│ rent cheapest fit
|
|
44
|
+
▼
|
|
45
|
+
vast.ai GPU box ──▶ vLLM (OpenAI-compatible /v1) ──▶ model weights
|
|
46
|
+
▲
|
|
47
|
+
│ http://<public-ip>:<port>/v1/chat/completions (+ bearer token)
|
|
48
|
+
│
|
|
49
|
+
Claude Code Router (local) ◀── translates Anthropic ⇄ OpenAI
|
|
50
|
+
▲
|
|
51
|
+
│
|
|
52
|
+
Claude Code (ccr code)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Why CCR sits in the middle: Claude Code speaks the **Anthropic** Messages API, while
|
|
56
|
+
vLLM serves the **OpenAI** API. CCR runs locally and translates between them, so the
|
|
57
|
+
remote box can stay a plain OpenAI-compatible server.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Prerequisites
|
|
62
|
+
|
|
63
|
+
- **Python 3.10+**
|
|
64
|
+
- A **vast.ai** account ([sign up](https://cloud.vast.ai/?ref_id=25480)) + an API key from
|
|
65
|
+
[Account → API Keys](https://cloud.vast.ai/manage-keys/) (required)
|
|
66
|
+
- A **HuggingFace token** — only for gated/private models like Llama/Gemma (optional)
|
|
67
|
+
- **Claude Code Router (classic CLI)** installed locally:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
npm install -g @musistudio/claude-code-router
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
> Note: Claude Code Router also has a Desktop app (v3) that stores config under
|
|
74
|
+
> `~/Library/Application Support/...`. `aiod` targets the **classic CLI**, whose
|
|
75
|
+
> config lives at `~/.claude-code-router/config.json` and uses `ccr start/code/stop`.
|
|
76
|
+
|
|
77
|
+
## Install
|
|
78
|
+
|
|
79
|
+
> **Early release** — install from source for now. PyPI (`pipx install aiondemandcluster`)
|
|
80
|
+
> is wired up and coming shortly.
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
git clone https://github.com/jhammant/AIonDemandCluster.git
|
|
84
|
+
cd AIonDemandCluster
|
|
85
|
+
python -m venv .venv && source .venv/bin/activate
|
|
86
|
+
pip install -e .
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Then run `aiod init` to set up your keys.
|
|
90
|
+
|
|
91
|
+
<details>
|
|
92
|
+
<summary>Once on PyPI (pipx / uv / pip)</summary>
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
pipx install aiondemandcluster # or: uv tool install aiondemandcluster / pip install aiondemandcluster
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
</details>
|
|
99
|
+
|
|
100
|
+
<details>
|
|
101
|
+
<summary>Dev setup (tests + lint)</summary>
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
git clone https://github.com/jhammant/AIonDemandCluster.git
|
|
105
|
+
cd AIonDemandCluster
|
|
106
|
+
python -m venv .venv && source .venv/bin/activate
|
|
107
|
+
pip install -e ".[dev]"
|
|
108
|
+
pytest -q && ruff check aiod/
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
</details>
|
|
112
|
+
|
|
113
|
+
## Configure
|
|
114
|
+
|
|
115
|
+
**Guided setup (recommended)** — opens the right pages, validates each key as you paste
|
|
116
|
+
it, and writes `.env` for you:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
aiod init
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Check an existing setup any time:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
aiod doctor
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
**Or configure manually:**
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
cp .env.example .env
|
|
132
|
+
# then edit .env:
|
|
133
|
+
# VAST_API_KEY=... (required for the default vast.ai provider)
|
|
134
|
+
# HF_TOKEN=... (optional — gated models only)
|
|
135
|
+
# VLLM_API_KEY= (optional — auto-generated per launch if blank)
|
|
136
|
+
# AIOD_TTL_HOURS=4 (teardown reminder window)
|
|
137
|
+
# AIOD_MAX_PRICE=6.0 (hard $/hr cap on offers)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
`.env` is gitignored — secrets never reach the public repo. Keys are read from (in
|
|
141
|
+
order) **environment variables → project `.env` → global `~/.config/aiod/.env`**, so a
|
|
142
|
+
global install (`pipx install` / `uv tool install`) finds your keys from any directory.
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## Usage
|
|
147
|
+
|
|
148
|
+
> **The TUI is the easiest way to drive everything** — start, estimate, launch, ping,
|
|
149
|
+
> and tear down from one screen: **`aiod tui`**. The CLI below is the scriptable
|
|
150
|
+
> equivalent (and what the TUI calls under the hood).
|
|
151
|
+
|
|
152
|
+
### Estimate cost before spending anything
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
aiod estimate Qwen/Qwen2.5-Coder-32B-Instruct
|
|
156
|
+
# or a full link:
|
|
157
|
+
aiod estimate https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Prints the model's size and a table of **VRAM need / cheapest GPU fit / live $/hr** for
|
|
161
|
+
each quantization option (bf16, fp8, int4).
|
|
162
|
+
|
|
163
|
+
### Spin it up
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
aiod spin Qwen/Qwen2.5-Coder-32B-Instruct # full precision (bf16)
|
|
167
|
+
aiod spin Qwen/Qwen2.5-Coder-32B-Instruct -q fp8 # ~half the VRAM
|
|
168
|
+
aiod spin <model> --max-price 3 --ttl 2 -y # cap price, 2h window, no prompt
|
|
169
|
+
aiod spin <model> --idle 20 # auto-shutdown after 20 idle min
|
|
170
|
+
aiod spin --profile coder-32b # use a saved preset (see below)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
`spin` sizes the model, finds the cheapest fitting GPU, shows the plan, rents it, waits
|
|
174
|
+
for vLLM to finish loading, then writes your CCR config. When it's ready:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
ccr restart && ccr code
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
…and Claude Code is now talking to your hosted model.
|
|
181
|
+
|
|
182
|
+
### Manage the instance
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
aiod status # provider status, endpoint, cost so far, time left on the TTL
|
|
186
|
+
aiod ping --tools # send a test prompt (+ tool call) to confirm it serves
|
|
187
|
+
aiod bench # benchmark: TTFT, tokens/sec, throughput, $/1M tokens (add -c 8)
|
|
188
|
+
aiod watch --idle 20 # foreground idle watcher (auto-destroys when idle)
|
|
189
|
+
aiod ccr-config # re-write the CCR config from the tracked instance
|
|
190
|
+
aiod teardown # destroy the instance and stop billing ← run this when done
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Engines — vLLM and llama.cpp (GGUF)
|
|
194
|
+
|
|
195
|
+
`aiod` auto-detects the model format: **safetensors / AWQ / fp8 → vLLM**, and
|
|
196
|
+
**GGUF → llama.cpp** (multi-part shards, multi-GPU, OpenAI endpoint). Force it with
|
|
197
|
+
`--engine vllm|llamacpp`, or pick it in the TUI. The right **tool-call parser** is
|
|
198
|
+
auto-selected per model family (a registry in `aiod/model_configs.py`) so function
|
|
199
|
+
calling works with Claude Code without hand-tuning flags.
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
# A 789B GGUF model on rented GPUs, benchmarked, for a couple of dollars:
|
|
203
|
+
aiod estimate huihui-ai/Huihui-GLM-5.2-abliterated-GGUF # live $/hr
|
|
204
|
+
aiod spin huihui-ai/Huihui-GLM-5.2-abliterated-GGUF -q UD-Q3_K_M --idle 30
|
|
205
|
+
aiod bench -c 4 # → tok/s + $/1M tokens
|
|
206
|
+
aiod teardown
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Profiles — named presets for spinning up a stack
|
|
210
|
+
|
|
211
|
+
A profile bundles model + quant + provider + price/idle settings under one name, so you
|
|
212
|
+
can launch a whole "architecture" by name (and pick it in the TUI).
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
aiod profile list # built-in + your presets
|
|
216
|
+
aiod profile show coder-32b
|
|
217
|
+
aiod profile add my-glm --model zai-org/GLM-4.6 --quant fp8 --idle 30 --max-price 4
|
|
218
|
+
aiod spin --profile my-glm # launch it (any flag still overrides)
|
|
219
|
+
aiod profile path # the YAML file you can hand-edit
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Profiles live in `~/.config/aiod/profiles.yaml` — adding a new one is just a YAML block.
|
|
223
|
+
Built-in starters: `coder-7b` (cheap test), `coder-32b`, `qwen3-coder-30b`, `glm-4.6`.
|
|
224
|
+
|
|
225
|
+
### Auto spin-up (on-demand) — the proxy
|
|
226
|
+
|
|
227
|
+
Run a local proxy and point Claude Code at it. The **first message spins the box up,
|
|
228
|
+
streams the live warm-up progress into the chat** (sizing → renting → booting →
|
|
229
|
+
downloading → ready), then continues with the real answer — all in one reply, no resend.
|
|
230
|
+
It auto-destroys on idle too.
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
aiod proxy --profile coder-32b --idle 20 # points CCR at the proxy automatically
|
|
234
|
+
ccr restart && ccr code # just start chatting — it spins on demand
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
Because progress streams continuously, the connection stays alive (no client timeout)
|
|
238
|
+
and you *see what's happening* instead of a silent hang. Watch it anywhere:
|
|
239
|
+
`aiod status`, the TUI, or `GET http://127.0.0.1:4000/aiod/status`. Non-streaming requests
|
|
240
|
+
(rare from Claude Code) get a short "warming up" reply to resend instead.
|
|
241
|
+
|
|
242
|
+
### Auto spin-down on idle
|
|
243
|
+
|
|
244
|
+
Pass `--idle N` to `spin` (or set it in a profile, or the TUI) and `aiod` starts a local
|
|
245
|
+
watcher that polls the box's vLLM metrics and **destroys the instance after N minutes with
|
|
246
|
+
no requests**. The TTL is a hard backstop. Note: the watcher runs on *your* machine, so if
|
|
247
|
+
that machine sleeps, the watcher pauses too — `aiod status` still shows when the TTL has been exceeded.
|
|
248
|
+
|
|
249
|
+
### Interactive TUI (the control center)
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
aiod tui
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
One screen to run the whole lifecycle: pick a **profile** (or type a HuggingFace link),
|
|
256
|
+
choose quant / provider / idle window → **Estimate** live cost → **Launch** with streaming
|
|
257
|
+
progress → watch the **running-instance panel** (status, cost so far, idle/TTL, refreshed
|
|
258
|
+
live) → **Ping** or **Teardown**. CCR config is written automatically on launch.
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## Choosing a model
|
|
263
|
+
|
|
264
|
+
Claude Code is **extremely tool-call heavy** — it lives on function calling, file edits,
|
|
265
|
+
and multi-step agentic loops. Pick models with strong tool-use support, e.g.:
|
|
266
|
+
|
|
267
|
+
| Model | Notes |
|
|
268
|
+
|---|---|
|
|
269
|
+
| `Qwen/Qwen2.5-Coder-32B-Instruct` | Great coding + tool calling; fits one 80GB GPU at int4/fp8 |
|
|
270
|
+
| `Qwen/Qwen3-Coder-30B-A3B-Instruct` | MoE, fast, strong agentic behavior |
|
|
271
|
+
| `deepseek-ai/DeepSeek-V2.5` | Strong general + tool use (large) |
|
|
272
|
+
| `zai-org/GLM-4.6` | Solid agentic/coding model |
|
|
273
|
+
|
|
274
|
+
`aiod` sets `--enable-auto-tool-choice --tool-call-parser hermes` on vLLM, which suits
|
|
275
|
+
Qwen/Hermes-style models. Other families need a different `--tool-call-parser`; the registry in `aiod/model_configs.py` selects one per model family.
|
|
276
|
+
|
|
277
|
+
## Cost & safety
|
|
278
|
+
|
|
279
|
+
- **`--max-price` / `AIOD_MAX_PRICE`** — a hard ceiling; offers above it are never rented.
|
|
280
|
+
- **TTL** — `aiod status` shows time left and warns when exceeded. **Auto-destroy is a
|
|
281
|
+
reminder, not enforced on the box** (putting your vast key on a public machine would be
|
|
282
|
+
a security risk). **Always run `aiod teardown` when you're done** — billing continues
|
|
283
|
+
until the instance is destroyed.
|
|
284
|
+
- The inference endpoint is protected by a bearer token (`--api-key`), but it is exposed
|
|
285
|
+
on a public IP. Don't serve anything sensitive, and tear down promptly.
|
|
286
|
+
|
|
287
|
+
## How it works
|
|
288
|
+
|
|
289
|
+
- **`sizing.py`** — pulls model metadata from the HF Hub (params from safetensors, KV
|
|
290
|
+
shape from `config.json`), estimates VRAM per quant (`weights × bytes + KV-cache +
|
|
291
|
+
overhead`), and maps it to a GPU plan (count × tier, power-of-two tensor parallel).
|
|
292
|
+
- **`vast.py`** — vast.ai REST client: searches offers (filtering for
|
|
293
|
+
`direct_port_count ≥ 1` so the port can be mapped), rents via `PUT /asks/{id}/`, reads
|
|
294
|
+
the public `host:port` from `ports["8000/tcp"]`, and destroys on teardown.
|
|
295
|
+
- **`bootstrap.py`** — builds the vLLM launch (`vllm/vllm-openai` image + args: model,
|
|
296
|
+
tensor-parallel size, quantization, api-key, HF token).
|
|
297
|
+
- **`health.py`** — polls `/v1/models` until the model is serving.
|
|
298
|
+
- **`ccr.py`** — merges an `aiod-vllm` provider + router into `~/.claude-code-router/config.json`
|
|
299
|
+
(preserving your other providers; backs up the old config).
|
|
300
|
+
|
|
301
|
+
## Caveats
|
|
302
|
+
|
|
303
|
+
- **int4 (awq/gptq) needs a pre-quantized checkpoint** (a `*-AWQ` / `*-GPTQ` repo). On a
|
|
304
|
+
full-precision repo, use `fp8` (works online) or `bf16`. `aiod` warns you.
|
|
305
|
+
- **Multi-GPU tensor parallel** uses NCCL over shared memory; some vast machines have a
|
|
306
|
+
small `/dev/shm`. If a 2+ GPU launch crashes on `/dev/shm`, that's why — try a
|
|
307
|
+
different host or a smaller quant that fits on one GPU.
|
|
308
|
+
- VRAM estimates are conservative (they assume concurrent sequences); real usage is often
|
|
309
|
+
lower. Tune with `--concurrency` / `--context`.
|
|
310
|
+
|
|
311
|
+
## Providers
|
|
312
|
+
|
|
313
|
+
Pick the GPU backend with `--provider` (CLI) or the dropdown (TUI); set the matching key
|
|
314
|
+
in `.env`.
|
|
315
|
+
|
|
316
|
+
| Provider | Key | Notes |
|
|
317
|
+
|---|---|---|
|
|
318
|
+
| **vast.ai** (default) | `VAST_API_KEY` | Cheapest marketplace; bid on specific offers. |
|
|
319
|
+
| **RunPod** | `RUNPOD_API_KEY` | Clean API; uses a public **TCP** port (avoids the 100s stream timeout of RunPod's HTTP proxy). |
|
|
320
|
+
|
|
321
|
+
```bash
|
|
322
|
+
aiod estimate <model> --provider runpod # live RunPod pricing (estimating costs $0)
|
|
323
|
+
aiod spin <model> --provider runpod -q fp8 # rent a RunPod pod
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
## Referral
|
|
327
|
+
|
|
328
|
+
The signup links are the maintainer's referral links, so signing up through them supports the
|
|
329
|
+
project at **no extra cost to you**:
|
|
330
|
+
|
|
331
|
+
- **vast.ai** — `https://cloud.vast.ai/?ref_id=25480` (3% of referred spend, for the life of the account)
|
|
332
|
+
- **RunPod** — `https://runpod.io?ref=p8hj7fq3` (credits on referred spend)
|
|
333
|
+
|
|
334
|
+
Forking this repo? Point them at your own links in one place:
|
|
335
|
+
[`aiod/branding.py`](aiod/branding.py) (`VAST_REFERRAL_URL` / `RUNPOD_REFERRAL_URL`).
|
|
336
|
+
|
|
337
|
+
## License
|
|
338
|
+
|
|
339
|
+
MIT — see [LICENSE](LICENSE).
|