aiondemandcluster 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiod/__init__.py +4 -0
- aiod/bench.py +173 -0
- aiod/bootstrap.py +102 -0
- aiod/branding.py +37 -0
- aiod/ccr.py +90 -0
- aiod/cli.py +1004 -0
- aiod/config.py +40 -0
- aiod/engine.py +143 -0
- aiod/events.py +45 -0
- aiod/health.py +136 -0
- aiod/model_configs.py +95 -0
- aiod/onboard.py +127 -0
- aiod/profiles.py +139 -0
- aiod/providers.py +46 -0
- aiod/proxy.py +289 -0
- aiod/runpod.py +203 -0
- aiod/sizing.py +465 -0
- aiod/state.py +69 -0
- aiod/tui.py +541 -0
- aiod/vast.py +379 -0
- aiod/watch.py +146 -0
- aiondemandcluster-0.1.0.dist-info/METADATA +339 -0
- aiondemandcluster-0.1.0.dist-info/RECORD +26 -0
- aiondemandcluster-0.1.0.dist-info/WHEEL +4 -0
- aiondemandcluster-0.1.0.dist-info/entry_points.txt +2 -0
- aiondemandcluster-0.1.0.dist-info/licenses/LICENSE +21 -0
aiod/__init__.py
ADDED
aiod/bench.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""A small benchmark for a running OpenAI-compatible endpoint.
|
|
2
|
+
|
|
3
|
+
Measures the numbers you actually want to report:
|
|
4
|
+
* TTFT — time to first token (streaming), p50/p95
|
|
5
|
+
* decode speed — output tokens/sec per request
|
|
6
|
+
* throughput — aggregate output tokens/sec under concurrency
|
|
7
|
+
* $/1M tokens — derived from the instance price and throughput
|
|
8
|
+
|
|
9
|
+
Works against vLLM or llama.cpp (both stream OpenAI-style SSE).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import time
|
|
16
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
|
|
19
|
+
import httpx
|
|
20
|
+
|
|
21
|
+
# Prompt sent with each benchmark request when the caller does not pass its own
# (it is the default value of run_benchmark's `prompt` parameter).
DEFAULT_PROMPT = (
    "Write a Python function `merge_intervals(intervals)` that merges overlapping "
    "intervals and returns the merged list, with a short docstring."
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class ReqResult:
    """Outcome of a single streamed chat-completion request."""

    ok: bool
    ttft: float | None = None  # seconds to first token
    total: float | None = None  # total wall time
    completion_tokens: int = 0
    error: str | None = None

    @property
    def decode_tok_s(self) -> float | None:
        """Output tokens per second after the first token arrived, or None.

        None is returned unless the request succeeded, a first-token time was
        recorded, and both the total time and the token count are non-zero.
        """
        if not self.ok or self.ttft is None:
            return None
        if not self.total or not self.completion_tokens:
            return None
        # Clamp the denominator so total == ttft cannot divide by zero.
        decode_seconds = max(self.total - self.ttft, 1e-6)
        return self.completion_tokens / decode_seconds
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class BenchResult:
    """Per-request results of one benchmark run plus statistics derived from them."""

    n: int
    concurrency: int
    wall_time: float
    results: list[ReqResult] = field(default_factory=list)
    price_per_hr: float | None = None

    @property
    def ok(self) -> list[ReqResult]:
        """Only the requests that succeeded."""
        succeeded = []
        for res in self.results:
            if res.ok:
                succeeded.append(res)
        return succeeded

    def _ttfts(self) -> list[float]:
        """First-token latencies of the successful requests that recorded one."""
        return [res.ttft for res in self.ok if res.ttft is not None]

    @property
    def ttft_p50(self) -> float | None:
        """Median time to first token, or None when no request recorded one."""
        return _pct(self._ttfts(), 50)

    @property
    def ttft_p95(self) -> float | None:
        """95th-percentile time to first token, or None when none was recorded."""
        return _pct(self._ttfts(), 95)

    @property
    def avg_decode_tok_s(self) -> float | None:
        """Mean per-request decode speed over requests that have a non-zero one."""
        rates = [rate for rate in (res.decode_tok_s for res in self.ok) if rate]
        if not rates:
            return None
        return sum(rates) / len(rates)

    @property
    def total_completion_tokens(self) -> int:
        """Completion tokens summed over the successful requests."""
        tokens = 0
        for res in self.ok:
            tokens += res.completion_tokens
        return tokens

    @property
    def throughput_tok_s(self) -> float | None:
        """Aggregate output tokens per second of wall time, or None if undefined."""
        produced = self.total_completion_tokens
        if self.wall_time > 0 and produced:
            return produced / self.wall_time
        return None

    @property
    def cost_per_million(self) -> float | None:
        """$/1M output tokens at this throughput."""
        rate = self.throughput_tok_s
        if not rate or not self.price_per_hr:
            return None
        # hourly price -> price per second -> price per token -> per million tokens
        return (self.price_per_hr / 3600.0) / rate * 1_000_000
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _pct(values: list[float], p: float) -> float | None:
|
|
88
|
+
if not values:
|
|
89
|
+
return None
|
|
90
|
+
s = sorted(values)
|
|
91
|
+
k = max(0, min(len(s) - 1, round((p / 100.0) * (len(s) - 1))))
|
|
92
|
+
return s[k]
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _one_request(
    base_url: str, model: str, api_key: str | None, prompt: str, max_tokens: int, timeout: float
) -> ReqResult:
    """Send one streaming chat-completion request and time it.

    POSTs to `{base_url}/chat/completions` with `stream=True` and reads the
    SSE response line by line.

    Args:
        base_url: API base; a trailing slash is tolerated.
        model: value sent as the request's `model`.
        api_key: when truthy, sent as `Authorization: Bearer <key>`.
        prompt: content of the single user message.
        max_tokens: value sent as `max_tokens`.
        timeout: passed to httpx as the request timeout.

    Returns:
        `ReqResult(ok=False, error=...)` for a non-200 status or an
        `httpx.HTTPError`; otherwise `ReqResult(ok=True, ...)` with the time to
        the first content chunk, the total duration, and the completion token
        count (the server-reported usage figure when present, else the number
        of content-bearing chunks).
    """
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.2,
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    url = f"{base_url.rstrip('/')}/chat/completions"
    # perf_counter is monotonic: measured durations cannot be skewed (or go
    # negative) when the system wall clock is adjusted mid-request.
    start = time.perf_counter()
    ttft: float | None = None
    chunk_tokens = 0
    usage_tokens = 0
    try:
        with httpx.stream("POST", url, headers=headers, json=payload, timeout=timeout) as r:
            if r.status_code != 200:
                # The body must be read before .text is available on a stream.
                r.read()
                return ReqResult(ok=False, error=f"HTTP {r.status_code}: {r.text[:160]}")
            for line in r.iter_lines():
                # SSE permits "data:" with or without one space after the colon.
                if not line.startswith("data:"):
                    continue
                data = line[len("data:"):].strip()
                if data == "[DONE]":
                    break
                try:
                    obj = json.loads(data)
                except json.JSONDecodeError:
                    continue
                # Valid JSON that is not an object carries nothing we can read;
                # skip it instead of crashing the worker on `.get`.
                if not isinstance(obj, dict):
                    continue
                choices = obj.get("choices") or []
                if choices:
                    delta = choices[0].get("delta") or {}
                    # reasoning models (GLM-5, DeepSeek-R1, ...) stream into
                    # reasoning_content; count it as generated work too.
                    if delta.get("content") or delta.get("reasoning_content"):
                        if ttft is None:
                            ttft = time.perf_counter() - start
                        chunk_tokens += 1
                usage = obj.get("usage")
                if usage and isinstance(usage, dict):
                    usage_tokens = usage.get("completion_tokens", 0)
    except httpx.HTTPError as e:
        # Some httpx errors stringify to ""; fall back to the class name so the
        # failure is never reported with an empty message.
        return ReqResult(ok=False, error=str(e) or type(e).__name__)

    total = time.perf_counter() - start
    return ReqResult(
        ok=True, ttft=ttft, total=total, completion_tokens=usage_tokens or chunk_tokens
    )
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def run_benchmark(
    base_url: str,
    model: str,
    api_key: str | None = None,
    n: int = 5,
    max_tokens: int = 256,
    concurrency: int = 1,
    prompt: str = DEFAULT_PROMPT,
    price_per_hr: float | None = None,
    timeout: float = 180.0,
) -> BenchResult:
    """Fire `n` requests `concurrency`-at-a-time and aggregate the stats.

    Args:
        base_url, model, api_key, prompt, max_tokens, timeout: forwarded
            unchanged to every `_one_request` call.
        n: number of requests to submit.
        concurrency: size of the thread pool running the requests.
        price_per_hr: stored on the result for the cost calculation.

    Returns:
        A `BenchResult` holding one `ReqResult` per request in submission
        order, and the elapsed time from before the first submit until the
        pool has shut down.

    Raises:
        ValueError: from `ThreadPoolExecutor` when `concurrency` is not positive.
    """
    # Monotonic clock: the elapsed time feeds throughput and cost, so it must
    # not be distorted by wall-clock adjustments during the run.
    wall_start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        futures = [
            pool.submit(_one_request, base_url, model, api_key, prompt, max_tokens, timeout)
            for _ in range(n)
        ]
        # Collect in submission order; result() blocks until each one finishes.
        results: list[ReqResult] = [f.result() for f in futures]
    wall = time.perf_counter() - wall_start
    return BenchResult(
        n=n, concurrency=concurrency, wall_time=wall, results=results, price_per_hr=price_per_hr
    )
|
aiod/bootstrap.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Builds the serving configuration that runs on the rented box: the Docker
|
|
2
|
+
image, the OpenAI-API server command, environment, and the published port. The
|
|
3
|
+
provider clients (vast/runpod) consume this when creating the instance.
|
|
4
|
+
|
|
5
|
+
Two engines:
|
|
6
|
+
* vllm — safetensors / AWQ / GPTQ / fp8 models (image vllm/vllm-openai)
|
|
7
|
+
* llamacpp — GGUF models (image ghcr.io/ggml-org/llama.cpp:server-cuda); loads
|
|
8
|
+
the GGUF straight from HF with -hf repo:QUANT (multi-part shards
|
|
9
|
+
auto-detected), spreads across all GPUs with -ngl 999, and enables
|
|
10
|
+
tool calling with --jinja. We pass --port 8000 so the published
|
|
11
|
+
port is identical to vLLM's.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
|
|
18
|
+
# Default container image per engine; ServerConfig falls back to these when no
# explicit image is given.
VLLM_IMAGE = "vllm/vllm-openai:latest"
LLAMACPP_IMAGE = "ghcr.io/ggml-org/llama.cpp:server-cuda"

# Port the server listens on inside the container (published to a public port).
CONTAINER_PORT = 8000

# Map our quant keys to vLLM's --quantization values. bf16 = no flag.
# A None value (bf16, fp16) means no --quantization argument is emitted.
VLLM_QUANT_FLAG = {
    "bf16": None,
    "fp16": None,
    "fp8": "fp8",
    "awq-int4": "awq",
    "gptq-int4": "gptq",
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class ServerConfig:
    """Settings from which the server image, arguments and environment are built."""

    repo_id: str
    num_gpus: int
    quant: str
    api_key: str
    engine: str = "vllm"  # "vllm" | "llamacpp"
    image: str | None = None  # defaults per engine
    port: int = CONTAINER_PORT
    max_model_len: int | None = None
    gpu_memory_utilization: float = 0.92
    tool_call_parser: str = "hermes"
    extra_args: list[str] = field(default_factory=list)
    hf_token: str | None = None
    gguf_quant: str | None = None  # llamacpp: the GGUF quant tag, e.g. "UD-IQ1_M"

    def __post_init__(self) -> None:
        """Fill in the engine's default image when none was supplied."""
        if self.image is not None:
            return
        if self.engine == "llamacpp":
            self.image = LLAMACPP_IMAGE
        else:
            self.image = VLLM_IMAGE

    def server_args(self) -> list[str]:
        """Return the server's argument list for the configured engine.

        Any engine value other than "llamacpp" yields the vLLM arguments.
        """
        build = self._llamacpp_args if self.engine == "llamacpp" else self._vllm_args
        return build()

    def _vllm_args(self) -> list[str]:
        """Arguments for the vLLM server; `extra_args` are appended last."""
        argv = ["--host", "0.0.0.0", "--port", str(self.port)]
        argv.extend(["--model", self.repo_id, "--served-model-name", self.repo_id])
        argv.extend(["--api-key", self.api_key])
        argv.extend(["--tensor-parallel-size", str(self.num_gpus)])
        argv.extend(["--gpu-memory-utilization", str(self.gpu_memory_utilization)])
        # Claude Code is extremely tool-call heavy; enable robust tool parsing.
        argv.extend(["--enable-auto-tool-choice", "--tool-call-parser", self.tool_call_parser])
        quant_flag = VLLM_QUANT_FLAG.get(self.quant)
        if quant_flag:
            argv.extend(["--quantization", quant_flag])
        if self.max_model_len:
            argv.extend(["--max-model-len", str(self.max_model_len)])
        argv.extend(self.extra_args)
        return argv

    def _llamacpp_args(self) -> list[str]:
        """Arguments for the llama.cpp server; `extra_args` are appended last."""
        # -hf repo[:quant]; the tag is a case-insensitive substring match against
        # filenames (works across subfolders) and auto-downloads all shards.
        if self.gguf_quant:
            model_ref = f"{self.repo_id}:{self.gguf_quant}"
        else:
            model_ref = self.repo_id
        context_len = self.max_model_len or 32768
        argv = ["-hf", model_ref]
        argv.extend(["--host", "0.0.0.0", "--port", str(self.port)])
        argv.extend(["--api-key", self.api_key])
        argv.extend(["-ngl", "999"])  # all layers on GPU; default layer-split spreads them
        argv.append("--jinja")  # enable OpenAI tool/function calling
        argv.extend(["-c", str(context_len)])
        argv.extend(self.extra_args)
        return argv

    def env(self) -> dict[str, str]:
        """Environment variables for the container; empty without an HF token."""
        if not self.hf_token:
            return {}
        # vLLM/huggingface_hub read both; llama.cpp reads HF_TOKEN.
        return {
            "HF_TOKEN": self.hf_token,
            "HUGGING_FACE_HUB_TOKEN": self.hf_token,
        }
|
aiod/branding.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Project links and the maintainer referral links.
|
|
2
|
+
|
|
3
|
+
To earn referral credit, paste YOUR referral link below for each provider.
|
|
4
|
+
* vast.ai -> Settings -> Referral Link (3% of referred spend, for life)
|
|
5
|
+
* RunPod -> Settings -> Referrals (credits on referred spend)
|
|
6
|
+
|
|
7
|
+
Leave an entry empty to fall back to the plain signup URL. These constants feed
|
|
8
|
+
the `aiod init` wizard and the README signup links, so you only edit them here.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
# >>> EDIT THESE: your referral links per provider <<<
VAST_REFERRAL_URL = "https://cloud.vast.ai/?ref_id=25480"
RUNPOD_REFERRAL_URL = "https://runpod.io?ref=p8hj7fq3"

# Plain signup fallbacks (used if the matching referral link above is empty).
SIGNUP_FALLBACK = {
    "vast": "https://cloud.vast.ai/",
    "runpod": "https://www.runpod.io/",
}
# Provider key -> referral link; signup_url() consults this before SIGNUP_FALLBACK.
REFERRAL_URLS = {
    "vast": VAST_REFERRAL_URL,
    "runpod": RUNPOD_REFERRAL_URL,
}

# Other links.
VAST_KEYS_URL = "https://cloud.vast.ai/manage-keys/"
RUNPOD_KEYS_URL = "https://www.runpod.io/console/user/settings"
HF_TOKENS_URL = "https://huggingface.co/settings/tokens"
# Shell command that installs Claude Code Router globally through npm.
CCR_INSTALL_CMD = "npm install -g @musistudio/claude-code-router"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def signup_url(provider: str = "vast") -> str:
    """Return the signup link for someone without an account on `provider`.

    The provider's referral link wins when it is non-blank; otherwise the plain
    signup URL is used, and an unknown provider falls back to the vast one.
    """
    referral = REFERRAL_URLS.get(provider)
    referral = referral.strip() if referral else ""
    if referral:
        return referral
    return SIGNUP_FALLBACK.get(provider, SIGNUP_FALLBACK["vast"])
|
aiod/ccr.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Generate / merge the Claude Code Router config so `ccr code` routes Claude
|
|
2
|
+
Code to our remote vLLM box.
|
|
3
|
+
|
|
4
|
+
Verified schema notes (musistudio/claude-code-router, classic CLI):
|
|
5
|
+
* config path: ~/.claude-code-router/config.json
|
|
6
|
+
* Providers[].api_base_url must be the FULL URL ending in /chat/completions
|
|
7
|
+
(NOT just the base) for a custom OpenAI-compatible endpoint like vLLM.
|
|
8
|
+
* Providers[].api_key is sent upstream as `Authorization: Bearer`.
|
|
9
|
+
* Providers[].models lists names exactly as /v1/models reports them.
|
|
10
|
+
* transformer is omitted for a vanilla OpenAI-compatible endpoint.
|
|
11
|
+
* Router values are the string "providerName,modelName".
|
|
12
|
+
|
|
13
|
+
We MERGE: existing providers and top-level settings are preserved; we only
|
|
14
|
+
replace our own provider (matched by name) and repoint the Router.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import shutil
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
# Claude Code Router config location under the user's home directory; this is
# the file write_config() reads, backs up and rewrites.
CONFIG_DIR = Path.home() / ".claude-code-router"
CONFIG_FILE = CONFIG_DIR / "config.json"
# Name of the provider entry this module owns in the config's "Providers" list;
# also the provider half of every "provider,model" Router value it writes.
PROVIDER_NAME = "aiod-vllm"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def build_provider(base_url: str, api_key: str, model: str) -> dict:
    """Build our CCR provider entry.

    base_url ends in /v1 (e.g. http://1.2.3.4:33526/v1); the entry's
    api_base_url is that base with /chat/completions appended. A falsy
    api_key is replaced by the placeholder "dummy".
    """
    endpoint = base_url.rstrip("/") + "/chat/completions"
    provider: dict = {"name": PROVIDER_NAME}
    provider["api_base_url"] = endpoint
    provider["api_key"] = api_key if api_key else "dummy"
    provider["models"] = [model]
    return provider
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def build_router(model: str) -> dict:
    """Build a Router section that sends every route to our provider and `model`."""
    target = f"{PROVIDER_NAME},{model}"
    # Insertion order matches the serialized key order.
    router: dict = dict.fromkeys(("default", "background", "think", "longContext"), target)
    router["longContextThreshold"] = 60000
    router["webSearch"] = target
    return router
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _load_existing() -> dict:
    """Return the current CCR config as a dict, or {} when it is unusable.

    "Unusable" means the file does not exist, cannot be decoded as UTF-8, is
    not valid JSON, or its top-level JSON value is not an object. Returning {}
    in the last case keeps callers that rely on dict methods from crashing on,
    for example, a config file containing `[]`.
    """
    if not CONFIG_FILE.exists():
        return {}
    try:
        # JSON is UTF-8; do not depend on the platform's locale encoding.
        loaded = json.loads(CONFIG_FILE.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError):
        return {}
    return loaded if isinstance(loaded, dict) else {}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def write_config(base_url: str, api_key: str, model: str) -> Path:
    """Merge our provider+router into the CCR config, preserving everything else.

    Backs up an existing config to config.json.bak. Returns the config path.

    Other top-level keys and other entries of "Providers" are kept as they
    are. LOG, HOST and PORT are only set when absent. Our own provider entry
    (matched by name) is replaced and the "Router" section is overwritten.
    """
    cfg = _load_existing()
    # A config whose top-level JSON value is not an object cannot be merged into.
    if not isinstance(cfg, dict):
        cfg = {}

    if CONFIG_FILE.exists():
        shutil.copy2(CONFIG_FILE, CONFIG_FILE.with_suffix(".json.bak"))

    cfg.setdefault("LOG", False)
    cfg.setdefault("HOST", "127.0.0.1")
    cfg.setdefault("PORT", 3456)

    existing = cfg.get("Providers")
    if not isinstance(existing, list):
        existing = []
    # Drop only our previous entry. Entries that are not dicts are left in
    # place rather than raising AttributeError on `.get`.
    providers = [
        p for p in existing
        if not (isinstance(p, dict) and p.get("name") == PROVIDER_NAME)
    ]
    providers.append(build_provider(base_url, api_key, model))
    cfg["Providers"] = providers

    cfg["Router"] = build_router(model)

    CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    CONFIG_FILE.write_text(json.dumps(cfg, indent=2), encoding="utf-8")
    return CONFIG_FILE
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def config_snippet(base_url: str, api_key: str, model: str) -> str:
    """Return the exact provider+router block as JSON, for printing / dry-run."""
    block = {
        "Providers": [build_provider(base_url, api_key, model)],
        "Router": build_router(model),
    }
    return json.dumps(block, indent=2)
|