vllm-i64 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,179 @@
1
+ # vllm-i64
2
+
3
+ TypeScript SDK for [vllm-i64](https://github.com/Complexity-ML/vllm-i64) — the integer-first inference engine for token-routed language models.
4
+
5
+ Zero dependencies. Requires Node.js >= 18.
6
+
7
+ ```bash
8
+ npm install vllm-i64
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```ts
14
+ import { I64Client } from "vllm-i64";
15
+
16
+ const client = new I64Client("http://localhost:8000");
17
+
18
+ // Chat completion
19
+ const res = await client.chat.create([
20
+ { role: "user", content: "Write a fibonacci function in Python" }
21
+ ]);
22
+ console.log(res.choices[0].message.content);
23
+ ```
24
+
25
+ ## Streaming
26
+
27
+ ```ts
28
+ for await (const chunk of client.chat.stream([
29
+ { role: "user", content: "Explain transformers" }
30
+ ])) {
31
+ process.stdout.write(chunk);
32
+ }
33
+ ```
34
+
35
+ ## Tool Calls (OpenAI-compatible)
36
+
37
+ ```ts
38
+ const res = await client.chat.create(
39
+ [{ role: "user", content: "What's the weather in Paris?" }],
40
+ {
41
+ tools: [{
42
+ type: "function",
43
+ function: {
44
+ name: "get_weather",
45
+ description: "Get current weather",
46
+ parameters: {
47
+ type: "object",
48
+ properties: { city: { type: "string" } },
49
+ required: ["city"]
50
+ }
51
+ }
52
+ }]
53
+ }
54
+ );
55
+
56
+ if (res.choices[0].message.tool_calls) {
57
+ console.log(res.choices[0].message.tool_calls);
58
+ }
59
+ ```
60
+
61
+ ## Text Completions
62
+
63
+ ```ts
64
+ const res = await client.completions.create("def fibonacci(n):", {
65
+ max_tokens: 200,
66
+ temperature: 0.2,
67
+ });
68
+ console.log(res.choices[0].text);
69
+
70
+ // Batch — multiple prompts at once
71
+ const batch = await client.completions.batch(
72
+ ["Hello", "Bonjour", "Hola"],
73
+ { max_tokens: 50 }
74
+ );
75
+ ```
76
+
77
+ ## Monitoring
78
+
79
+ ```ts
80
+ // Live snapshot — batch size, KV cache, tok/s, GPU
81
+ const snap = await client.monitor.snapshot();
82
+ console.log(`${snap.engine.total_tokens_generated} tokens generated`);
83
+ console.log(`${snap.perf?.tok_per_s} tok/s`);
84
+ console.log(`KV cache: ${snap.kv_cache?.usage_pct}% used`);
85
+
86
+ // Health check
87
+ const health = await client.monitor.health();
88
+ console.log(health.status); // "ok" | "degraded"
89
+
90
+ // Expert routing distribution (MoE models)
91
+ const experts = await client.monitor.experts();
92
+ console.log(`${experts.num_experts} experts, imbalance: ${experts.imbalance}`);
93
+ ```
94
+
95
+ ## KV Cache Management
96
+
97
+ ```ts
98
+ // Cache statistics
99
+ const stats = await client.cache.stats();
100
+ console.log(`${stats.used_blocks}/${stats.num_blocks} blocks used`);
101
+ console.log(`${stats.prefix_cached_blocks} prefix blocks cached`);
102
+
103
+ // Purge prefix cache (admin)
104
+ await client.cache.purge();
105
+ ```
106
+
107
+ ## LoRA Hot-Swap
108
+
109
+ ```ts
110
+ // Load an adapter at runtime
111
+ await client.lora.load({
112
+ adapter_id: 1,
113
+ path: "/models/lora-python-v2",
114
+ name: "python-specialist",
115
+ scaling: 0.8,
116
+ });
117
+
118
+ // List loaded adapters
119
+ const { adapters } = await client.lora.list();
120
+ console.log(adapters); // [{ id: 1, name: "python-specialist" }]
121
+
122
+ // Swap to a different adapter
123
+ await client.lora.load({ adapter_id: 2, path: "/models/lora-chat-v3" });
124
+
125
+ // Unload when done
126
+ await client.lora.unload(1);
127
+ ```
128
+
129
+ ## RAG (Retrieval-Augmented Generation)
130
+
131
+ ```ts
132
+ // Index documents
133
+ await client.rag.index({ text: "Paris is the capital of France." });
134
+ await client.rag.index({ file: "/data/docs/handbook.pdf" });
135
+
136
+ // Search
137
+ const results = await client.rag.search("capital of France", 3);
138
+ console.log(results.results[0].text);
139
+
140
+ // Stats
141
+ const ragStats = await client.rag.stats();
142
+ console.log(`${ragStats.total_chunks} chunks indexed`);
143
+ ```
144
+
145
+ ## Authentication
146
+
147
+ ```ts
148
+ const client = new I64Client("http://localhost:8000", {
149
+ apiKey: "sk-your-api-key",
150
+ timeoutMs: 30_000,
151
+ });
152
+ ```
153
+
154
+ ## API Reference
155
+
156
+ | Namespace | Methods |
157
+ |---|---|
158
+ | `client.chat` | `create()`, `stream()`, `streamRaw()` |
159
+ | `client.completions` | `create()`, `stream()`, `batch()` |
160
+ | `client.cache` | `stats()`, `purge()` |
161
+ | `client.lora` | `load()`, `unload()`, `list()` |
162
+ | `client.monitor` | `health()`, `isReady()`, `models()`, `snapshot()`, `metrics()`, `experts()`, `cancel()` |
163
+ | `client.rag` | `index()`, `search()`, `stats()` |
164
+
165
+ ## What is vllm-i64?
166
+
167
+ An integer-first inference engine for token-routed Mixture-of-Experts models. Key features:
168
+
169
+ - **Token routing**: `expert_id = token_id % num_experts` — deterministic, no learned router
170
+ - **Continuous batching**: mixed prefill + decode in every step
171
+ - **Paged KV cache**: prefix caching, LRU eviction, and FP8 compression
172
+ - **LoRA hot-swap**: load/unload adapters at runtime without restart
173
+ - **OpenAI-compatible API**: a drop-in backend that works with any OpenAI client
174
+
175
+ Built by [Complexity-ML](https://github.com/Complexity-ML) / INL.
176
+
177
+ ## License
178
+
179
+ Apache-2.0
package/dist/index.d.mts CHANGED
@@ -44,7 +44,10 @@ interface ChatCompletionRequest {
44
44
  };
45
45
  };
46
46
  stop?: string | string[];
47
+ min_p?: number;
48
+ typical_p?: number;
47
49
  repetition_penalty?: number;
50
+ min_tokens?: number;
48
51
  logprobs?: boolean;
49
52
  }
50
53
  interface ChatCompletionChoice {
@@ -233,7 +236,7 @@ declare class HttpClient {
233
236
  private apiKey?;
234
237
  private timeout;
235
238
  constructor(baseUrl?: string, options?: ClientOptions);
236
- fetch(path: string, init?: RequestInit): Promise<Response>;
239
+ fetch(path: string, init?: RequestInit, externalSignal?: AbortSignal): Promise<Response>;
237
240
  get<T>(path: string): Promise<T>;
238
241
  post<T>(path: string, body: unknown): Promise<T>;
239
242
  readSSE(res: Response): AsyncGenerator<string, void, undefined>;
@@ -269,11 +272,15 @@ declare class ChatEndpoint {
269
272
  * }
270
273
  * ```
271
274
  */
272
- stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
275
+ stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
276
+ signal?: AbortSignal;
277
+ }): AsyncGenerator<string, void, undefined>;
273
278
  /**
274
279
  * Streaming chat — yields raw SSE delta objects.
275
280
  */
276
- streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
281
+ streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
282
+ signal?: AbortSignal;
283
+ }): AsyncGenerator<StreamDelta, void, undefined>;
277
284
  }
278
285
 
279
286
  /**
package/dist/index.d.ts CHANGED
@@ -44,7 +44,10 @@ interface ChatCompletionRequest {
44
44
  };
45
45
  };
46
46
  stop?: string | string[];
47
+ min_p?: number;
48
+ typical_p?: number;
47
49
  repetition_penalty?: number;
50
+ min_tokens?: number;
48
51
  logprobs?: boolean;
49
52
  }
50
53
  interface ChatCompletionChoice {
@@ -233,7 +236,7 @@ declare class HttpClient {
233
236
  private apiKey?;
234
237
  private timeout;
235
238
  constructor(baseUrl?: string, options?: ClientOptions);
236
- fetch(path: string, init?: RequestInit): Promise<Response>;
239
+ fetch(path: string, init?: RequestInit, externalSignal?: AbortSignal): Promise<Response>;
237
240
  get<T>(path: string): Promise<T>;
238
241
  post<T>(path: string, body: unknown): Promise<T>;
239
242
  readSSE(res: Response): AsyncGenerator<string, void, undefined>;
@@ -269,11 +272,15 @@ declare class ChatEndpoint {
269
272
  * }
270
273
  * ```
271
274
  */
272
- stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
275
+ stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
276
+ signal?: AbortSignal;
277
+ }): AsyncGenerator<string, void, undefined>;
273
278
  /**
274
279
  * Streaming chat — yields raw SSE delta objects.
275
280
  */
276
- streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
281
+ streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
282
+ signal?: AbortSignal;
283
+ }): AsyncGenerator<StreamDelta, void, undefined>;
277
284
  }
278
285
 
279
286
  /**
package/dist/index.js CHANGED
@@ -42,7 +42,7 @@ var HttpClient = class {
42
42
  this.apiKey = options.apiKey;
43
43
  this.timeout = options.timeoutMs ?? 12e4;
44
44
  }
45
- async fetch(path, init = {}) {
45
+ async fetch(path, init = {}, externalSignal) {
46
46
  const headers = {
47
47
  "Content-Type": "application/json",
48
48
  ...init.headers
@@ -52,6 +52,13 @@ var HttpClient = class {
52
52
  }
53
53
  const controller = new AbortController();
54
54
  const timer = setTimeout(() => controller.abort(), this.timeout);
55
+ if (externalSignal) {
56
+ if (externalSignal.aborted) {
57
+ controller.abort();
58
+ } else {
59
+ externalSignal.addEventListener("abort", () => controller.abort(), { once: true });
60
+ }
61
+ }
55
62
  try {
56
63
  const res = await fetch(`${this.baseUrl}${path}`, {
57
64
  ...init,
@@ -178,30 +185,32 @@ var ChatEndpoint = class {
178
185
  * ```
179
186
  */
180
187
  async *stream(messages, options = {}) {
188
+ const { signal, ...rest } = options;
181
189
  const res = await this.http.fetch("/v1/chat/completions", {
182
190
  method: "POST",
183
191
  body: JSON.stringify({
184
192
  model: "default",
185
193
  messages,
186
- ...options,
194
+ ...rest,
187
195
  stream: true
188
196
  })
189
- });
197
+ }, signal);
190
198
  yield* this.http.readSSE(res);
191
199
  }
192
200
  /**
193
201
  * Streaming chat — yields raw SSE delta objects.
194
202
  */
195
203
  async *streamRaw(messages, options = {}) {
204
+ const { signal, ...rest } = options;
196
205
  const res = await this.http.fetch("/v1/chat/completions", {
197
206
  method: "POST",
198
207
  body: JSON.stringify({
199
208
  model: "default",
200
209
  messages,
201
- ...options,
210
+ ...rest,
202
211
  stream: true
203
212
  })
204
- });
213
+ }, signal);
205
214
  yield* this.http.readSSERaw(res);
206
215
  }
207
216
  };
package/dist/index.mjs CHANGED
@@ -8,7 +8,7 @@ var HttpClient = class {
8
8
  this.apiKey = options.apiKey;
9
9
  this.timeout = options.timeoutMs ?? 12e4;
10
10
  }
11
- async fetch(path, init = {}) {
11
+ async fetch(path, init = {}, externalSignal) {
12
12
  const headers = {
13
13
  "Content-Type": "application/json",
14
14
  ...init.headers
@@ -18,6 +18,13 @@ var HttpClient = class {
18
18
  }
19
19
  const controller = new AbortController();
20
20
  const timer = setTimeout(() => controller.abort(), this.timeout);
21
+ if (externalSignal) {
22
+ if (externalSignal.aborted) {
23
+ controller.abort();
24
+ } else {
25
+ externalSignal.addEventListener("abort", () => controller.abort(), { once: true });
26
+ }
27
+ }
21
28
  try {
22
29
  const res = await fetch(`${this.baseUrl}${path}`, {
23
30
  ...init,
@@ -144,30 +151,32 @@ var ChatEndpoint = class {
144
151
  * ```
145
152
  */
146
153
  async *stream(messages, options = {}) {
154
+ const { signal, ...rest } = options;
147
155
  const res = await this.http.fetch("/v1/chat/completions", {
148
156
  method: "POST",
149
157
  body: JSON.stringify({
150
158
  model: "default",
151
159
  messages,
152
- ...options,
160
+ ...rest,
153
161
  stream: true
154
162
  })
155
- });
163
+ }, signal);
156
164
  yield* this.http.readSSE(res);
157
165
  }
158
166
  /**
159
167
  * Streaming chat — yields raw SSE delta objects.
160
168
  */
161
169
  async *streamRaw(messages, options = {}) {
170
+ const { signal, ...rest } = options;
162
171
  const res = await this.http.fetch("/v1/chat/completions", {
163
172
  method: "POST",
164
173
  body: JSON.stringify({
165
174
  model: "default",
166
175
  messages,
167
- ...options,
176
+ ...rest,
168
177
  stream: true
169
178
  })
170
- });
179
+ }, signal);
171
180
  yield* this.http.readSSERaw(res);
172
181
  }
173
182
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vllm-i64",
3
- "version": "0.1.0",
3
+ "version": "0.2.0",
4
4
  "description": "TypeScript SDK for vllm-i64 — integer-first inference engine",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",