vllm-i64 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +179 -0
- package/dist/index.d.mts +10 -3
- package/dist/index.d.ts +10 -3
- package/dist/index.js +14 -5
- package/dist/index.mjs +14 -5
- package/package.json +1 -1
package/README.md
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# vllm-i64
|
|
2
|
+
|
|
3
|
+
TypeScript SDK for [vllm-i64](https://github.com/Complexity-ML/vllm-i64) — the integer-first inference engine for token-routed language models.
|
|
4
|
+
|
|
5
|
+
Zero dependencies. Node >= 18.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install vllm-i64
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```ts
|
|
14
|
+
import { I64Client } from "vllm-i64";
|
|
15
|
+
|
|
16
|
+
const client = new I64Client("http://localhost:8000");
|
|
17
|
+
|
|
18
|
+
// Chat completion
|
|
19
|
+
const res = await client.chat.create([
|
|
20
|
+
{ role: "user", content: "Write a fibonacci function in Python" }
|
|
21
|
+
]);
|
|
22
|
+
console.log(res.choices[0].message.content);
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Streaming
|
|
26
|
+
|
|
27
|
+
```ts
|
|
28
|
+
for await (const chunk of client.chat.stream([
|
|
29
|
+
{ role: "user", content: "Explain transformers" }
|
|
30
|
+
])) {
|
|
31
|
+
process.stdout.write(chunk);
|
|
32
|
+
}
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Tool Calls (OpenAI-compatible)
|
|
36
|
+
|
|
37
|
+
```ts
|
|
38
|
+
const res = await client.chat.create(
|
|
39
|
+
[{ role: "user", content: "What's the weather in Paris?" }],
|
|
40
|
+
{
|
|
41
|
+
tools: [{
|
|
42
|
+
type: "function",
|
|
43
|
+
function: {
|
|
44
|
+
name: "get_weather",
|
|
45
|
+
description: "Get current weather",
|
|
46
|
+
parameters: {
|
|
47
|
+
type: "object",
|
|
48
|
+
properties: { city: { type: "string" } },
|
|
49
|
+
required: ["city"]
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}]
|
|
53
|
+
}
|
|
54
|
+
);
|
|
55
|
+
|
|
56
|
+
if (res.choices[0].message.tool_calls) {
|
|
57
|
+
console.log(res.choices[0].message.tool_calls);
|
|
58
|
+
}
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Text Completions
|
|
62
|
+
|
|
63
|
+
```ts
|
|
64
|
+
const res = await client.completions.create("def fibonacci(n):", {
|
|
65
|
+
max_tokens: 200,
|
|
66
|
+
temperature: 0.2,
|
|
67
|
+
});
|
|
68
|
+
console.log(res.choices[0].text);
|
|
69
|
+
|
|
70
|
+
// Batch — multiple prompts at once
|
|
71
|
+
const batch = await client.completions.batch(
|
|
72
|
+
["Hello", "Bonjour", "Hola"],
|
|
73
|
+
{ max_tokens: 50 }
|
|
74
|
+
);
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Monitoring
|
|
78
|
+
|
|
79
|
+
```ts
|
|
80
|
+
// Live snapshot — batch size, KV cache, tok/s, GPU
|
|
81
|
+
const snap = await client.monitor.snapshot();
|
|
82
|
+
console.log(`${snap.engine.total_tokens_generated} tokens generated`);
|
|
83
|
+
console.log(`${snap.perf?.tok_per_s} tok/s`);
|
|
84
|
+
console.log(`KV cache: ${snap.kv_cache?.usage_pct}% used`);
|
|
85
|
+
|
|
86
|
+
// Health check
|
|
87
|
+
const health = await client.monitor.health();
|
|
88
|
+
console.log(health.status); // "ok" | "degraded"
|
|
89
|
+
|
|
90
|
+
// Expert routing distribution (MoE models)
|
|
91
|
+
const experts = await client.monitor.experts();
|
|
92
|
+
console.log(`${experts.num_experts} experts, imbalance: ${experts.imbalance}`);
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## KV Cache Management
|
|
96
|
+
|
|
97
|
+
```ts
|
|
98
|
+
// Cache statistics
|
|
99
|
+
const stats = await client.cache.stats();
|
|
100
|
+
console.log(`${stats.used_blocks}/${stats.num_blocks} blocks used`);
|
|
101
|
+
console.log(`${stats.prefix_cached_blocks} prefix blocks cached`);
|
|
102
|
+
|
|
103
|
+
// Purge prefix cache (admin)
|
|
104
|
+
await client.cache.purge();
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## LoRA Hot-Swap
|
|
108
|
+
|
|
109
|
+
```ts
|
|
110
|
+
// Load an adapter at runtime
|
|
111
|
+
await client.lora.load({
|
|
112
|
+
adapter_id: 1,
|
|
113
|
+
path: "/models/lora-python-v2",
|
|
114
|
+
name: "python-specialist",
|
|
115
|
+
scaling: 0.8,
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
// List loaded adapters
|
|
119
|
+
const { adapters } = await client.lora.list();
|
|
120
|
+
console.log(adapters); // [{ id: 1, name: "python-specialist" }]
|
|
121
|
+
|
|
122
|
+
// Swap to a different adapter
|
|
123
|
+
await client.lora.load({ adapter_id: 2, path: "/models/lora-chat-v3" });
|
|
124
|
+
|
|
125
|
+
// Unload when done
|
|
126
|
+
await client.lora.unload(1);
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## RAG (Retrieval-Augmented Generation)
|
|
130
|
+
|
|
131
|
+
```ts
|
|
132
|
+
// Index documents
|
|
133
|
+
await client.rag.index({ text: "Paris is the capital of France." });
|
|
134
|
+
await client.rag.index({ file: "/data/docs/handbook.pdf" });
|
|
135
|
+
|
|
136
|
+
// Search
|
|
137
|
+
const results = await client.rag.search("capital of France", 3);
|
|
138
|
+
console.log(results.results[0].text);
|
|
139
|
+
|
|
140
|
+
// Stats
|
|
141
|
+
const ragStats = await client.rag.stats();
|
|
142
|
+
console.log(`${ragStats.total_chunks} chunks indexed`);
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Authentication
|
|
146
|
+
|
|
147
|
+
```ts
|
|
148
|
+
const client = new I64Client("http://localhost:8000", {
|
|
149
|
+
apiKey: "sk-your-api-key",
|
|
150
|
+
timeoutMs: 30_000,
|
|
151
|
+
});
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## API Reference
|
|
155
|
+
|
|
156
|
+
| Namespace | Methods |
|
|
157
|
+
|---|---|
|
|
158
|
+
| `client.chat` | `create()`, `stream()`, `streamRaw()` |
|
|
159
|
+
| `client.completions` | `create()`, `stream()`, `batch()` |
|
|
160
|
+
| `client.cache` | `stats()`, `purge()` |
|
|
161
|
+
| `client.lora` | `load()`, `unload()`, `list()` |
|
|
162
|
+
| `client.monitor` | `health()`, `isReady()`, `models()`, `snapshot()`, `metrics()`, `experts()`, `cancel()` |
|
|
163
|
+
| `client.rag` | `index()`, `search()`, `stats()` |
|
|
164
|
+
|
|
165
|
+
## What is vllm-i64?
|
|
166
|
+
|
|
167
|
+
An integer-first inference engine for token-routed Mixture-of-Experts models. Key features:
|
|
168
|
+
|
|
169
|
+
- **Token routing**: `expert_id = token_id % num_experts` — deterministic, no learned router
|
|
170
|
+
- **Continuous batching**: mixed prefill + decode in every step
|
|
171
|
+
- **Paged KV cache**: with prefix caching, LRU eviction, FP8 compression
|
|
172
|
+
- **LoRA hot-swap**: load/unload adapters at runtime without restart
|
|
173
|
+
- **OpenAI-compatible API**: works as a drop-in backend for any OpenAI client
|
|
174
|
+
|
|
175
|
+
Built by [Complexity-ML](https://github.com/Complexity-ML) / INL.
|
|
176
|
+
|
|
177
|
+
## License
|
|
178
|
+
|
|
179
|
+
Apache-2.0
|
package/dist/index.d.mts
CHANGED
|
@@ -44,7 +44,10 @@ interface ChatCompletionRequest {
|
|
|
44
44
|
};
|
|
45
45
|
};
|
|
46
46
|
stop?: string | string[];
|
|
47
|
+
min_p?: number;
|
|
48
|
+
typical_p?: number;
|
|
47
49
|
repetition_penalty?: number;
|
|
50
|
+
min_tokens?: number;
|
|
48
51
|
logprobs?: boolean;
|
|
49
52
|
}
|
|
50
53
|
interface ChatCompletionChoice {
|
|
@@ -233,7 +236,7 @@ declare class HttpClient {
|
|
|
233
236
|
private apiKey?;
|
|
234
237
|
private timeout;
|
|
235
238
|
constructor(baseUrl?: string, options?: ClientOptions);
|
|
236
|
-
fetch(path: string, init?: RequestInit): Promise<Response>;
|
|
239
|
+
fetch(path: string, init?: RequestInit, externalSignal?: AbortSignal): Promise<Response>;
|
|
237
240
|
get<T>(path: string): Promise<T>;
|
|
238
241
|
post<T>(path: string, body: unknown): Promise<T>;
|
|
239
242
|
readSSE(res: Response): AsyncGenerator<string, void, undefined>;
|
|
@@ -269,11 +272,15 @@ declare class ChatEndpoint {
|
|
|
269
272
|
* }
|
|
270
273
|
* ```
|
|
271
274
|
*/
|
|
272
|
-
stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
|
|
275
|
+
stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
|
|
276
|
+
signal?: AbortSignal;
|
|
277
|
+
}): AsyncGenerator<string, void, undefined>;
|
|
273
278
|
/**
|
|
274
279
|
* Streaming chat — yields raw SSE delta objects.
|
|
275
280
|
*/
|
|
276
|
-
streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
|
|
281
|
+
streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
|
|
282
|
+
signal?: AbortSignal;
|
|
283
|
+
}): AsyncGenerator<StreamDelta, void, undefined>;
|
|
277
284
|
}
|
|
278
285
|
|
|
279
286
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -44,7 +44,10 @@ interface ChatCompletionRequest {
|
|
|
44
44
|
};
|
|
45
45
|
};
|
|
46
46
|
stop?: string | string[];
|
|
47
|
+
min_p?: number;
|
|
48
|
+
typical_p?: number;
|
|
47
49
|
repetition_penalty?: number;
|
|
50
|
+
min_tokens?: number;
|
|
48
51
|
logprobs?: boolean;
|
|
49
52
|
}
|
|
50
53
|
interface ChatCompletionChoice {
|
|
@@ -233,7 +236,7 @@ declare class HttpClient {
|
|
|
233
236
|
private apiKey?;
|
|
234
237
|
private timeout;
|
|
235
238
|
constructor(baseUrl?: string, options?: ClientOptions);
|
|
236
|
-
fetch(path: string, init?: RequestInit): Promise<Response>;
|
|
239
|
+
fetch(path: string, init?: RequestInit, externalSignal?: AbortSignal): Promise<Response>;
|
|
237
240
|
get<T>(path: string): Promise<T>;
|
|
238
241
|
post<T>(path: string, body: unknown): Promise<T>;
|
|
239
242
|
readSSE(res: Response): AsyncGenerator<string, void, undefined>;
|
|
@@ -269,11 +272,15 @@ declare class ChatEndpoint {
|
|
|
269
272
|
* }
|
|
270
273
|
* ```
|
|
271
274
|
*/
|
|
272
|
-
stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
|
|
275
|
+
stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
|
|
276
|
+
signal?: AbortSignal;
|
|
277
|
+
}): AsyncGenerator<string, void, undefined>;
|
|
273
278
|
/**
|
|
274
279
|
* Streaming chat — yields raw SSE delta objects.
|
|
275
280
|
*/
|
|
276
|
-
streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
|
|
281
|
+
streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
|
|
282
|
+
signal?: AbortSignal;
|
|
283
|
+
}): AsyncGenerator<StreamDelta, void, undefined>;
|
|
277
284
|
}
|
|
278
285
|
|
|
279
286
|
/**
|
package/dist/index.js
CHANGED
|
@@ -42,7 +42,7 @@ var HttpClient = class {
|
|
|
42
42
|
this.apiKey = options.apiKey;
|
|
43
43
|
this.timeout = options.timeoutMs ?? 12e4;
|
|
44
44
|
}
|
|
45
|
-
async fetch(path, init = {}) {
|
|
45
|
+
async fetch(path, init = {}, externalSignal) {
|
|
46
46
|
const headers = {
|
|
47
47
|
"Content-Type": "application/json",
|
|
48
48
|
...init.headers
|
|
@@ -52,6 +52,13 @@ var HttpClient = class {
|
|
|
52
52
|
}
|
|
53
53
|
const controller = new AbortController();
|
|
54
54
|
const timer = setTimeout(() => controller.abort(), this.timeout);
|
|
55
|
+
if (externalSignal) {
|
|
56
|
+
if (externalSignal.aborted) {
|
|
57
|
+
controller.abort();
|
|
58
|
+
} else {
|
|
59
|
+
externalSignal.addEventListener("abort", () => controller.abort(), { once: true });
|
|
60
|
+
}
|
|
61
|
+
}
|
|
55
62
|
try {
|
|
56
63
|
const res = await fetch(`${this.baseUrl}${path}`, {
|
|
57
64
|
...init,
|
|
@@ -178,30 +185,32 @@ var ChatEndpoint = class {
|
|
|
178
185
|
* ```
|
|
179
186
|
*/
|
|
180
187
|
async *stream(messages, options = {}) {
|
|
188
|
+
const { signal, ...rest } = options;
|
|
181
189
|
const res = await this.http.fetch("/v1/chat/completions", {
|
|
182
190
|
method: "POST",
|
|
183
191
|
body: JSON.stringify({
|
|
184
192
|
model: "default",
|
|
185
193
|
messages,
|
|
186
|
-
...options,
|
|
194
|
+
...rest,
|
|
187
195
|
stream: true
|
|
188
196
|
})
|
|
189
|
-
});
|
|
197
|
+
}, signal);
|
|
190
198
|
yield* this.http.readSSE(res);
|
|
191
199
|
}
|
|
192
200
|
/**
|
|
193
201
|
* Streaming chat — yields raw SSE delta objects.
|
|
194
202
|
*/
|
|
195
203
|
async *streamRaw(messages, options = {}) {
|
|
204
|
+
const { signal, ...rest } = options;
|
|
196
205
|
const res = await this.http.fetch("/v1/chat/completions", {
|
|
197
206
|
method: "POST",
|
|
198
207
|
body: JSON.stringify({
|
|
199
208
|
model: "default",
|
|
200
209
|
messages,
|
|
201
|
-
...options,
|
|
210
|
+
...rest,
|
|
202
211
|
stream: true
|
|
203
212
|
})
|
|
204
|
-
});
|
|
213
|
+
}, signal);
|
|
205
214
|
yield* this.http.readSSERaw(res);
|
|
206
215
|
}
|
|
207
216
|
};
|
package/dist/index.mjs
CHANGED
|
@@ -8,7 +8,7 @@ var HttpClient = class {
|
|
|
8
8
|
this.apiKey = options.apiKey;
|
|
9
9
|
this.timeout = options.timeoutMs ?? 12e4;
|
|
10
10
|
}
|
|
11
|
-
async fetch(path, init = {}) {
|
|
11
|
+
async fetch(path, init = {}, externalSignal) {
|
|
12
12
|
const headers = {
|
|
13
13
|
"Content-Type": "application/json",
|
|
14
14
|
...init.headers
|
|
@@ -18,6 +18,13 @@ var HttpClient = class {
|
|
|
18
18
|
}
|
|
19
19
|
const controller = new AbortController();
|
|
20
20
|
const timer = setTimeout(() => controller.abort(), this.timeout);
|
|
21
|
+
if (externalSignal) {
|
|
22
|
+
if (externalSignal.aborted) {
|
|
23
|
+
controller.abort();
|
|
24
|
+
} else {
|
|
25
|
+
externalSignal.addEventListener("abort", () => controller.abort(), { once: true });
|
|
26
|
+
}
|
|
27
|
+
}
|
|
21
28
|
try {
|
|
22
29
|
const res = await fetch(`${this.baseUrl}${path}`, {
|
|
23
30
|
...init,
|
|
@@ -144,30 +151,32 @@ var ChatEndpoint = class {
|
|
|
144
151
|
* ```
|
|
145
152
|
*/
|
|
146
153
|
async *stream(messages, options = {}) {
|
|
154
|
+
const { signal, ...rest } = options;
|
|
147
155
|
const res = await this.http.fetch("/v1/chat/completions", {
|
|
148
156
|
method: "POST",
|
|
149
157
|
body: JSON.stringify({
|
|
150
158
|
model: "default",
|
|
151
159
|
messages,
|
|
152
|
-
...options,
|
|
160
|
+
...rest,
|
|
153
161
|
stream: true
|
|
154
162
|
})
|
|
155
|
-
});
|
|
163
|
+
}, signal);
|
|
156
164
|
yield* this.http.readSSE(res);
|
|
157
165
|
}
|
|
158
166
|
/**
|
|
159
167
|
* Streaming chat — yields raw SSE delta objects.
|
|
160
168
|
*/
|
|
161
169
|
async *streamRaw(messages, options = {}) {
|
|
170
|
+
const { signal, ...rest } = options;
|
|
162
171
|
const res = await this.http.fetch("/v1/chat/completions", {
|
|
163
172
|
method: "POST",
|
|
164
173
|
body: JSON.stringify({
|
|
165
174
|
model: "default",
|
|
166
175
|
messages,
|
|
167
|
-
...options,
|
|
176
|
+
...rest,
|
|
168
177
|
stream: true
|
|
169
178
|
})
|
|
170
|
-
});
|
|
179
|
+
}, signal);
|
|
171
180
|
yield* this.http.readSSERaw(res);
|
|
172
181
|
}
|
|
173
182
|
};
|