aiondemandcluster 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aiod/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """AI on Demand — spin up a HuggingFace model on vast.ai GPUs and drive it from
2
+ Claude Code via Claude Code Router."""
3
+
4
+ __version__ = "0.1.0"
aiod/bench.py ADDED
@@ -0,0 +1,173 @@
1
+ """A small benchmark for a running OpenAI-compatible endpoint.
2
+
3
+ Measures the numbers you actually want to report:
4
+ * TTFT — time to first token (streaming), p50/p95
5
+ * decode speed — output tokens/sec per request
6
+ * throughput — aggregate output tokens/sec under concurrency
7
+ * $/1M tokens — derived from the instance price and throughput
8
+
9
+ Works against vLLM or llama.cpp (both stream OpenAI-style SSE).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import time
16
+ from concurrent.futures import ThreadPoolExecutor
17
+ from dataclasses import dataclass, field
18
+
19
+ import httpx
20
+
21
# Prompt used by `run_benchmark` when the caller does not pass its own `prompt`.
# It is sent unchanged as the single user message of every benchmark request.
DEFAULT_PROMPT = (
    "Write a Python function `merge_intervals(intervals)` that merges overlapping "
    "intervals and returns the merged list, with a short docstring."
)
25
+
26
+
27
@dataclass
class ReqResult:
    """Outcome of a single streamed chat-completion request."""

    ok: bool
    ttft: float | None = None  # seconds from request start to the first token
    total: float | None = None  # wall time of the whole request, in seconds
    completion_tokens: int = 0
    error: str | None = None

    @property
    def decode_tok_s(self) -> float | None:
        """Output tokens per second after the first token.

        Returns None when the request failed or when any of the timing /
        token figures needed for the division is missing or zero.
        """
        measurable = (
            self.ok
            and self.ttft is not None
            and bool(self.total)
            and bool(self.completion_tokens)
        )
        if not measurable:
            return None
        # Floor the decode window so a near-instant stream cannot divide by zero.
        decode_window = max(self.total - self.ttft, 1e-6)
        return self.completion_tokens / decode_window
41
+
42
+
43
@dataclass
class BenchResult:
    """Aggregated statistics for one benchmark run."""

    n: int
    concurrency: int
    wall_time: float
    results: list[ReqResult] = field(default_factory=list)
    price_per_hr: float | None = None

    @property
    def ok(self) -> list[ReqResult]:
        """The subset of `results` whose `ok` flag is truthy."""
        return list(filter(lambda res: res.ok, self.results))

    @property
    def ttft_p50(self) -> float | None:
        """Median time-to-first-token over successful requests."""
        samples = [res.ttft for res in self.ok if res.ttft is not None]
        return _pct(samples, 50)

    @property
    def ttft_p95(self) -> float | None:
        """95th-percentile time-to-first-token over successful requests."""
        samples = [res.ttft for res in self.ok if res.ttft is not None]
        return _pct(samples, 95)

    @property
    def avg_decode_tok_s(self) -> float | None:
        """Mean per-request decode speed, ignoring requests without one."""
        speeds = [speed for res in self.ok if (speed := res.decode_tok_s)]
        if not speeds:
            return None
        return sum(speeds) / len(speeds)

    @property
    def total_completion_tokens(self) -> int:
        """Completion tokens summed over successful requests."""
        tokens = 0
        for res in self.ok:
            tokens += res.completion_tokens
        return tokens

    @property
    def throughput_tok_s(self) -> float | None:
        """Aggregate output tokens per second across the whole run."""
        produced = self.total_completion_tokens
        if not (self.wall_time > 0 and produced):
            return None
        return produced / self.wall_time

    @property
    def cost_per_million(self) -> float | None:
        """$/1M output tokens at this throughput."""
        tps = self.throughput_tok_s
        if not (tps and self.price_per_hr):
            return None
        price_per_second = self.price_per_hr / 3600.0
        return price_per_second / tps * 1_000_000
85
+
86
+
87
+ def _pct(values: list[float], p: float) -> float | None:
88
+ if not values:
89
+ return None
90
+ s = sorted(values)
91
+ k = max(0, min(len(s) - 1, round((p / 100.0) * (len(s) - 1))))
92
+ return s[k]
93
+
94
+
95
def _one_request(
    base_url: str, model: str, api_key: str | None, prompt: str, max_tokens: int, timeout: float
) -> ReqResult:
    """Stream one chat completion from `base_url` and time it.

    Posts to `<base_url>/chat/completions` with `stream=True` and reads the
    SSE response line by line. Time-to-first-token is taken at the first delta
    carrying `content` or `reasoning_content`. The completion token count is
    the server-reported `usage.completion_tokens` when present, otherwise the
    number of non-empty deltas seen.

    Returns a `ReqResult` with `ok=False` and an `error` message on a non-200
    status or an `httpx.HTTPError`; otherwise `ok=True` with the timings.
    """
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.2,
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    url = f"{base_url.rstrip('/')}/chat/completions"
    # perf_counter() is monotonic; time.time() can jump when the system clock
    # is adjusted, which would corrupt the measured durations.
    start = time.perf_counter()
    ttft: float | None = None
    chunk_tokens = 0
    usage_tokens = 0
    try:
        with httpx.stream("POST", url, headers=headers, json=payload, timeout=timeout) as r:
            if r.status_code != 200:
                # The body must be read before .text is available on a stream.
                r.read()
                return ReqResult(ok=False, error=f"HTTP {r.status_code}: {r.text[:160]}")
            for line in r.iter_lines():
                # SSE permits "data:" with or without the single space after it.
                if not line.startswith("data:"):
                    continue
                data = line[len("data:"):].strip()
                if data == "[DONE]":
                    break
                try:
                    obj = json.loads(data)
                except json.JSONDecodeError:
                    continue
                if not isinstance(obj, dict):
                    # Not a completion chunk object; nothing to measure.
                    continue
                choices = obj.get("choices") or []
                if choices:
                    delta = choices[0].get("delta") or {}
                    # reasoning models (GLM-5, DeepSeek-R1, ...) stream into
                    # reasoning_content; count it as generated work too.
                    if delta.get("content") or delta.get("reasoning_content"):
                        if ttft is None:
                            ttft = time.perf_counter() - start
                        chunk_tokens += 1
                if obj.get("usage"):
                    usage_tokens = obj["usage"].get("completion_tokens", 0) or 0
    except httpx.HTTPError as e:
        # Some transport errors stringify to "", so fall back to the class name.
        return ReqResult(ok=False, error=str(e) or type(e).__name__)

    total = time.perf_counter() - start
    return ReqResult(
        ok=True, ttft=ttft, total=total, completion_tokens=usage_tokens or chunk_tokens
    )
147
+
148
+
149
def run_benchmark(
    base_url: str,
    model: str,
    api_key: str | None = None,
    n: int = 5,
    max_tokens: int = 256,
    concurrency: int = 1,
    prompt: str = DEFAULT_PROMPT,
    price_per_hr: float | None = None,
    timeout: float = 180.0,
) -> BenchResult:
    """Fire `n` requests `concurrency`-at-a-time and aggregate the stats.

    Each request runs `_one_request` on a thread pool of `concurrency`
    workers. Results are collected in submission order. `wall_time` covers
    the span from just before the pool is created until it has shut down.

    `concurrency` must be positive; `ThreadPoolExecutor` raises `ValueError`
    otherwise.
    """
    # perf_counter() is monotonic, so the wall time cannot be skewed by a
    # system clock adjustment during the run.
    wall_start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        futures = [
            pool.submit(_one_request, base_url, model, api_key, prompt, max_tokens, timeout)
            for _ in range(n)
        ]
        results: list[ReqResult] = [f.result() for f in futures]
    wall = time.perf_counter() - wall_start
    return BenchResult(
        n=n, concurrency=concurrency, wall_time=wall, results=results, price_per_hr=price_per_hr
    )
aiod/bootstrap.py ADDED
@@ -0,0 +1,102 @@
1
+ """Builds the serving configuration that runs on the rented box: the Docker
2
+ image, the OpenAI-API server command, environment, and the published port. The
3
+ provider clients (vast/runpod) consume this when creating the instance.
4
+
5
+ Two engines:
6
+ * vllm — safetensors / AWQ / GPTQ / fp8 models (image vllm/vllm-openai)
7
+ * llamacpp — GGUF models (image ghcr.io/ggml-org/llama.cpp:server-cuda); loads
8
+ the GGUF straight from HF with -hf repo:QUANT (multi-part shards
9
+ auto-detected), spreads across all GPUs with -ngl 999, and enables
10
+ tool calling with --jinja. We pass --port 8000 so the published
11
+ port is identical to vLLM's.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass, field
17
+
18
# Default Docker images, one per serving engine. ServerConfig picks between
# them when no explicit image is given.
VLLM_IMAGE = "vllm/vllm-openai:latest"
LLAMACPP_IMAGE = "ghcr.io/ggml-org/llama.cpp:server-cuda"

# Port the server listens on inside the container (published to a public port).
# Used as the default for ServerConfig.port.
CONTAINER_PORT = 8000

# Map our quant keys to vLLM's --quantization values. A value of None (bf16,
# fp16) means no --quantization flag is emitted.
VLLM_QUANT_FLAG = {
    "bf16": None,
    "fp16": None,
    "fp8": "fp8",
    "awq-int4": "awq",
    "gptq-int4": "gptq",
}
32
+
33
+
34
@dataclass
class ServerConfig:
    """Everything needed to start the OpenAI-compatible server on the box.

    Holds the model, engine and credentials, and renders them into the
    server's command-line arguments (`server_args`) and environment (`env`).
    """

    repo_id: str
    num_gpus: int
    quant: str
    api_key: str
    engine: str = "vllm"  # "vllm" | "llamacpp"
    image: str | None = None  # filled in per engine by __post_init__ when None
    port: int = CONTAINER_PORT
    max_model_len: int | None = None
    gpu_memory_utilization: float = 0.92
    tool_call_parser: str = "hermes"
    extra_args: list[str] = field(default_factory=list)
    hf_token: str | None = None
    gguf_quant: str | None = None  # llamacpp: the GGUF quant tag, e.g. "UD-IQ1_M"

    def __post_init__(self) -> None:
        """Pick the engine's default image unless one was given explicitly."""
        if self.image is not None:
            return
        self.image = LLAMACPP_IMAGE if self.engine == "llamacpp" else VLLM_IMAGE

    def server_args(self) -> list[str]:
        """Return the server command-line arguments for the configured engine.

        Any engine other than "llamacpp" gets the vLLM arguments.
        """
        build = self._llamacpp_args if self.engine == "llamacpp" else self._vllm_args
        return build()

    def _vllm_args(self) -> list[str]:
        """Arguments for the vLLM OpenAI server."""
        cmd = [
            "--host", "0.0.0.0",
            "--port", str(self.port),
            "--model", self.repo_id,
            "--served-model-name", self.repo_id,
            "--api-key", self.api_key,
            "--tensor-parallel-size", str(self.num_gpus),
            "--gpu-memory-utilization", str(self.gpu_memory_utilization),
            # Claude Code is extremely tool-call heavy; enable robust tool parsing.
            "--enable-auto-tool-choice",
            "--tool-call-parser", self.tool_call_parser,
        ]
        # Unknown quant keys and the None-valued ones emit no flag at all.
        quant_flag = VLLM_QUANT_FLAG.get(self.quant)
        if quant_flag:
            cmd.extend(["--quantization", quant_flag])
        if self.max_model_len:
            cmd.extend(["--max-model-len", str(self.max_model_len)])
        cmd.extend(self.extra_args)
        return cmd

    def _llamacpp_args(self) -> list[str]:
        """Arguments for the llama.cpp server."""
        # -hf repo[:quant]; the tag is a case-insensitive substring match against
        # filenames (works across subfolders) and auto-downloads all shards.
        if self.gguf_quant:
            model_ref = f"{self.repo_id}:{self.gguf_quant}"
        else:
            model_ref = self.repo_id
        context_len = self.max_model_len or 32768
        cmd = [
            "-hf", model_ref,
            "--host", "0.0.0.0",
            "--port", str(self.port),
            "--api-key", self.api_key,
            "-ngl", "999",  # all layers on GPU; default layer-split spreads them
            "--jinja",  # enable OpenAI tool/function calling
            "-c", str(context_len),
        ]
        cmd.extend(self.extra_args)
        return cmd

    def env(self) -> dict[str, str]:
        """Environment variables for the container (empty without an HF token)."""
        if not self.hf_token:
            return {}
        # vLLM/huggingface_hub read both; llama.cpp reads HF_TOKEN.
        return {
            "HF_TOKEN": self.hf_token,
            "HUGGING_FACE_HUB_TOKEN": self.hf_token,
        }
aiod/branding.py ADDED
@@ -0,0 +1,37 @@
1
+ """Project links and the maintainer referral links.
2
+
3
+ To earn referral credit, paste YOUR referral link below for each provider.
4
+ * vast.ai -> Settings -> Referral Link (3% of referred spend, for life)
5
+ * RunPod -> Settings -> Referrals (credits on referred spend)
6
+
7
+ Leave an entry empty to fall back to the plain signup URL. These constants feed
8
+ the `aiod init` wizard and the README signup links, so you only edit them here.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ # >>> EDIT THESE: your referral links per provider <<<
14
# Maintainer referral links, one per provider. An empty string makes
# `signup_url` fall back to the plain signup URL for that provider.
VAST_REFERRAL_URL = "https://cloud.vast.ai/?ref_id=25480"
RUNPOD_REFERRAL_URL = "https://runpod.io?ref=p8hj7fq3"

# Plain signup fallbacks (used if the matching referral link above is empty).
SIGNUP_FALLBACK = {
    "vast": "https://cloud.vast.ai/",
    "runpod": "https://www.runpod.io/",
}
# Provider key -> referral link; this is what `signup_url` looks up first.
REFERRAL_URLS = {
    "vast": VAST_REFERRAL_URL,
    "runpod": RUNPOD_REFERRAL_URL,
}

# Other links.
VAST_KEYS_URL = "https://cloud.vast.ai/manage-keys/"
RUNPOD_KEYS_URL = "https://www.runpod.io/console/user/settings"
HF_TOKENS_URL = "https://huggingface.co/settings/tokens"
# Shell command that installs Claude Code Router globally via npm.
CCR_INSTALL_CMD = "npm install -g @musistudio/claude-code-router"
32
+
33
+
34
def signup_url(provider: str = "vast") -> str:
    """Return the signup link to hand to someone without a `provider` account.

    Prefers the referral link configured for `provider`; when that is missing
    or blank, uses the provider's plain signup URL, and for an unknown
    provider the vast.ai one.
    """
    referral = REFERRAL_URLS.get(provider)
    referral = referral.strip() if referral else ""
    if referral:
        return referral
    return SIGNUP_FALLBACK.get(provider, SIGNUP_FALLBACK["vast"])
aiod/ccr.py ADDED
@@ -0,0 +1,90 @@
1
+ """Generate / merge the Claude Code Router config so `ccr code` routes Claude
2
+ Code to our remote vLLM box.
3
+
4
+ Verified schema notes (musistudio/claude-code-router, classic CLI):
5
+ * config path: ~/.claude-code-router/config.json
6
+ * Providers[].api_base_url must be the FULL URL ending in /chat/completions
7
+ (NOT just the base) for a custom OpenAI-compatible endpoint like vLLM.
8
+ * Providers[].api_key is sent upstream as `Authorization: Bearer`.
9
+ * Providers[].models lists names exactly as /v1/models reports them.
10
+ * transformer is omitted for a vanilla OpenAI-compatible endpoint.
11
+ * Router values are the string "providerName,modelName".
12
+
13
+ We MERGE: existing providers and top-level settings are preserved; we only
14
+ replace our own provider (matched by name) and repoint the Router.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import shutil
21
+ from pathlib import Path
22
+
23
# Location of the Claude Code Router config: ~/.claude-code-router/config.json
CONFIG_DIR = Path.home() / ".claude-code-router"
CONFIG_FILE = CONFIG_DIR / "config.json"
# Name of the provider entry this tool owns; `write_config` replaces the entry
# with this name and the Router values reference it.
PROVIDER_NAME = "aiod-vllm"
26
+
27
+
28
def build_provider(base_url: str, api_key: str, model: str) -> dict:
    """Build our CCR provider entry.

    `base_url` ends in /v1 (e.g. http://1.2.3.4:33526/v1); the entry's
    `api_base_url` is that URL with `/chat/completions` appended. An empty
    `api_key` is replaced by the placeholder "dummy".
    """
    endpoint = f"{base_url.rstrip('/')}/chat/completions"
    provider: dict = {"name": PROVIDER_NAME}
    provider["api_base_url"] = endpoint
    provider["api_key"] = api_key if api_key else "dummy"
    provider["models"] = [model]
    return provider
36
+
37
+
38
def build_router(model: str) -> dict:
    """Build a CCR Router section that sends every route to our provider.

    Each route value is the string "providerName,modelName".
    """
    target = f"{PROVIDER_NAME},{model}"
    router: dict = {
        route: target for route in ("default", "background", "think", "longContext")
    }
    router["longContextThreshold"] = 60000
    router["webSearch"] = target
    return router
48
+
49
+
50
def _load_existing() -> dict:
    """Load the current CCR config, or {} when there is nothing usable.

    Returns {} when the file does not exist, is not valid UTF-8, is not valid
    JSON, or holds JSON that is not an object. Other I/O errors (e.g.
    permissions) propagate.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        raw = CONFIG_FILE.read_text(encoding="utf-8")
    except (FileNotFoundError, UnicodeDecodeError):
        return {}
    try:
        cfg = json.loads(raw)
    except json.JSONDecodeError:
        return {}
    # Callers treat the result as a mapping; a top-level list/string/number
    # cannot be merged into, so treat it like a missing config.
    return cfg if isinstance(cfg, dict) else {}
57
+
58
+
59
def write_config(base_url: str, api_key: str, model: str) -> Path:
    """Merge our provider and router into the CCR config and write it.

    Other providers and top-level settings are preserved; LOG, HOST and PORT
    are only filled in when absent. The provider named PROVIDER_NAME is
    replaced, and the whole "Router" section is overwritten so every route
    points at `model`. An existing config is first copied to config.json.bak.

    Returns the path of the written config file.
    """
    cfg = _load_existing()
    if not isinstance(cfg, dict):
        # A non-object JSON root cannot be merged into; start from scratch
        # (the original file is still kept in the backup below).
        cfg = {}

    if CONFIG_FILE.exists():
        shutil.copy2(CONFIG_FILE, CONFIG_FILE.with_suffix(".json.bak"))

    cfg.setdefault("LOG", False)
    cfg.setdefault("HOST", "127.0.0.1")
    cfg.setdefault("PORT", 3456)

    providers = cfg.get("Providers")
    if not isinstance(providers, list):
        providers = []
    # Drop only our own previous entry. Entries that are not dicts are left
    # untouched rather than crashing on `.get`.
    providers = [
        p for p in providers
        if not (isinstance(p, dict) and p.get("name") == PROVIDER_NAME)
    ]
    providers.append(build_provider(base_url, api_key, model))
    cfg["Providers"] = providers

    cfg["Router"] = build_router(model)

    CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    CONFIG_FILE.write_text(json.dumps(cfg, indent=2), encoding="utf-8")
    return CONFIG_FILE
83
+
84
+
85
def config_snippet(base_url: str, api_key: str, model: str) -> str:
    """Render just our provider and router block as JSON, for printing / dry-run."""
    block = {
        "Providers": [build_provider(base_url, api_key, model)],
        "Router": build_router(model),
    }
    return json.dumps(block, indent=2)