vllm-i64 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +432 -0
- package/dist/index.d.ts +432 -0
- package/dist/index.js +381 -0
- package/dist/index.mjs +347 -0
- package/package.json +47 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Short local aliases for the Object reflection primitives used by the
// module-interop helpers in this bundle.
var {
  defineProperty: __defProp,
  getOwnPropertyDescriptor: __getOwnPropDesc,
  getOwnPropertyNames: __getOwnPropNames
} = Object;
var { hasOwnProperty: __hasOwnProp } = Object.prototype;
|
|
6
|
+
/**
 * Installs one enumerable accessor on `target` per key of `all`, using the
 * function stored under that key as the getter. The value is therefore
 * resolved each time the property is read, not when it is registered.
 */
var __export = (target, all) => {
  for (const exportName in all) {
    const getter = all[exportName];
    __defProp(target, exportName, { get: getter, enumerable: true });
  }
};
|
|
10
|
+
/**
 * Mirrors the own properties of `from` onto `to` as live getters.
 *
 * Keys that `to` already owns, and the single key named by `except`, are
 * skipped. Each mirrored property keeps the enumerability of its source
 * descriptor. When `from` is neither an object nor a function nothing is
 * copied. `desc` is only used as scratch storage for the current descriptor.
 * Always returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};
|
|
18
|
+
/**
 * Builds the CommonJS-facing exports object: a fresh object carrying a
 * non-enumerable `__esModule: true` marker, onto which every own property of
 * `mod` is mirrored through `__copyProps`.
 */
var __toCommonJS = (mod) => {
  const cjsExports = __defProp({}, "__esModule", { value: true });
  return __copyProps(cjsExports, mod);
};
|
|
19
|
+
|
|
20
|
+
// src/index.ts
|
|
21
|
+
// Export table for the package entry point. Each entry is a thunk, so the
// class bindings declared further down in this file are only read when a
// consumer actually accesses the corresponding export.
var index_exports = {};
__export(index_exports, {
  CacheEndpoint: () => CacheEndpoint,
  ChatEndpoint: () => ChatEndpoint,
  CompletionsEndpoint: () => CompletionsEndpoint,
  HttpClient: () => HttpClient,
  I64Client: () => I64Client,
  LoRAEndpoint: () => LoRAEndpoint,
  MonitorEndpoint: () => MonitorEndpoint,
  RAGEndpoint: () => RAGEndpoint,
  default: () => index_default
});

// Publish the table as this module's CommonJS exports.
module.exports = __toCommonJS(index_exports);
|
|
34
|
+
|
|
35
|
+
// src/client.ts
|
|
36
|
+
/**
 * Minimal HTTP transport for a vllm-i64 server: JSON requests with an optional
 * bearer token, a connect/headers timeout, and Server-Sent-Events decoding.
 */
var HttpClient = class {
  baseUrl;
  apiKey;
  timeout;

  /**
   * @param baseUrl - Server root URL; trailing slashes are stripped.
   * @param options - `apiKey` (sent as a Bearer token when set) and
   *   `timeoutMs` (defaults to 120000).
   */
  constructor(baseUrl = "http://localhost:8000", options = {}) {
    this.baseUrl = baseUrl.replace(/\/+$/, "");
    this.apiKey = options.apiKey;
    this.timeout = options.timeoutMs ?? 12e4;
  }

  /**
   * Performs a request against `baseUrl + path` and returns the Response.
   *
   * `init.headers` may be a plain object, a `Headers` instance or an array of
   * pairs. `Content-Type: application/json` is added only when the caller did
   * not supply a content type (matched case-insensitively); the configured API
   * key always wins over a caller-supplied `Authorization` header.
   *
   * The timeout only covers the time until this method settles: the timer is
   * cleared in `finally`, so reading a long streaming body afterwards is not
   * interrupted by it. Note that `init.signal` is replaced by the internal
   * timeout signal.
   *
   * @throws {Error} `vllm-i64 <status>: <message>` for non-2xx responses,
   *   where the message is `error.message` from a JSON body when present and
   *   the raw body text otherwise.
   */
  async fetch(path, init = {}) {
    // Normalising through Headers keeps Headers instances / tuple arrays intact
    // (object spread would silently drop them) and makes lookups case-insensitive.
    const headers = new Headers(init.headers);
    if (!headers.has("Content-Type")) {
      headers.set("Content-Type", "application/json");
    }
    if (this.apiKey) {
      headers.set("Authorization", `Bearer ${this.apiKey}`);
    }

    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), this.timeout);
    try {
      const res = await fetch(`${this.baseUrl}${path}`, {
        ...init,
        headers,
        signal: controller.signal
      });
      if (!res.ok) {
        const body = await res.text();
        let msg;
        try {
          msg = JSON.parse(body)?.error?.message ?? body;
        } catch {
          // Body was not JSON; report it verbatim.
          msg = body;
        }
        throw new Error(`vllm-i64 ${res.status}: ${msg}`);
      }
      return res;
    } finally {
      clearTimeout(timer);
    }
  }

  /** GETs `path` and resolves with the parsed JSON body. */
  async get(path) {
    const res = await this.fetch(path);
    return res.json();
  }

  /** POSTs `body` as JSON to `path` and resolves with the parsed JSON body. */
  async post(path, body) {
    const res = await this.fetch(path, {
      method: "POST",
      body: JSON.stringify(body)
    });
    return res.json();
  }

  // ---------------------------------------------------------------
  // SSE streaming
  // ---------------------------------------------------------------

  /**
   * Shared SSE decoder: yields the parsed JSON value of every `data:` line in
   * `res.body` until the stream ends or a `[DONE]` sentinel is seen.
   *
   * Lines whose payload is not valid JSON are skipped (best effort). Parsing
   * happens outside the `yield` so that an error thrown into the generator by
   * its consumer is not mistaken for a parse failure and swallowed.
   */
  static async *#parseEvents(res) {
    if (!res.body) return;
    const reader = res.body.getReader();
    const decoder = new TextDecoder();
    let buffer = "";
    let exhausted = false;
    try {
      while (!exhausted) {
        const { done, value } = await reader.read();
        exhausted = done;
        // On the final pass flush the decoder and treat whatever is left in the
        // buffer as a complete line, so an event that is not newline-terminated
        // is still delivered.
        buffer += done ? decoder.decode() : decoder.decode(value, { stream: true });
        const lines = buffer.split("\n");
        buffer = done ? "" : (lines.pop() ?? "");
        for (const line of lines) {
          const trimmed = line.trim();
          // The space after the colon is optional in the SSE format.
          if (!trimmed.startsWith("data:")) continue;
          const payload = trimmed.slice(5).trimStart();
          if (payload === "[DONE]") return;
          let event;
          try {
            event = JSON.parse(payload);
          } catch {
            continue;
          }
          yield event;
        }
      }
    } finally {
      // Leaving early ([DONE], consumer break, or an error) must cancel the
      // body, otherwise the underlying connection stays open.
      if (!exhausted) {
        await reader.cancel().catch(() => {});
      }
      reader.releaseLock();
    }
  }

  /**
   * Yields the text of each streamed chunk: `choices[0].delta.content` when
   * present, otherwise `choices[0].text`. Chunks without text are skipped.
   */
  async *readSSE(res) {
    for await (const event of HttpClient.#parseEvents(res)) {
      const choice = event?.choices?.[0];
      const content = choice?.delta?.content ?? choice?.text ?? "";
      if (content) yield content;
    }
  }

  /** Yields every parsed SSE `data:` payload as-is. */
  async *readSSERaw(res) {
    yield* HttpClient.#parseEvents(res);
  }
};
|
|
147
|
+
|
|
148
|
+
// src/endpoints/chat.ts
|
|
149
|
+
/** Wrapper around the `/v1/chat/completions` route. */
var ChatEndpoint = class {
  /** @param http - Transport providing `post`, `fetch`, `readSSE` and `readSSERaw`. */
  constructor(http) {
    this.http = http;
  }

  /**
   * One-shot chat completion.
   *
   * `options` is merged over the defaults (`model: "default"` and the given
   * `messages`); `stream` is always forced to `false`.
   *
   * @example
   * ```ts
   * const res = await client.chat.create([{ role: "user", content: "Hi" }]);
   * console.log(res.choices[0].message.content);
   * ```
   */
  async create(messages, options = {}) {
    const payload = { model: "default", messages, ...options, stream: false };
    return this.http.post("/v1/chat/completions", payload);
  }

  /**
   * Streaming chat completion; yields whatever the transport's `readSSE`
   * produces for the response. `stream` is always forced to `true`.
   *
   * @example
   * ```ts
   * for await (const chunk of client.chat.stream([{ role: "user", content: "Hi" }])) {
   *   process.stdout.write(chunk);
   * }
   * ```
   */
  async *stream(messages, options = {}) {
    const payload = { model: "default", messages, ...options, stream: true };
    const requestInit = { method: "POST", body: JSON.stringify(payload) };
    const response = await this.http.fetch("/v1/chat/completions", requestInit);
    yield* this.http.readSSE(response);
  }

  /**
   * Same request as `stream`, but yields whatever the transport's
   * `readSSERaw` produces for the response.
   */
  async *streamRaw(messages, options = {}) {
    const payload = { model: "default", messages, ...options, stream: true };
    const requestInit = { method: "POST", body: JSON.stringify(payload) };
    const response = await this.http.fetch("/v1/chat/completions", requestInit);
    yield* this.http.readSSERaw(response);
  }
};
|
|
208
|
+
|
|
209
|
+
// src/endpoints/completions.ts
|
|
210
|
+
/** Wrapper around the `/v1/completions` and `/v1/batch` routes. */
var CompletionsEndpoint = class {
  /** @param http - Transport providing `post`, `fetch` and `readSSE`. */
  constructor(http) {
    this.http = http;
  }

  /**
   * One-shot text completion. `options` is merged over `model: "default"` and
   * the given `prompt`; `stream` is always forced to `false`.
   */
  async create(prompt, options = {}) {
    const payload = { model: "default", prompt, ...options, stream: false };
    return this.http.post("/v1/completions", payload);
  }

  /**
   * Streaming text completion; yields whatever the transport's `readSSE`
   * produces for the response. `stream` is always forced to `true`.
   */
  async *stream(prompt, options = {}) {
    const payload = { model: "default", prompt, ...options, stream: true };
    const requestInit = { method: "POST", body: JSON.stringify(payload) };
    const response = await this.http.fetch("/v1/completions", requestInit);
    yield* this.http.readSSE(response);
  }

  /** Sends several prompts in a single `/v1/batch` request. */
  async batch(prompts, options = {}) {
    const payload = { prompts, ...options };
    return this.http.post("/v1/batch", payload);
  }
};
|
|
241
|
+
|
|
242
|
+
// src/endpoints/cache.ts
|
|
243
|
+
/** Wrapper around the `/v1/cache/*` routes. */
var CacheEndpoint = class {
  /** @param http - Transport providing `get` and `post`. */
  constructor(http) {
    this.http = http;
  }

  /** Fetches KV cache statistics from `/v1/cache/stats`. */
  async stats() {
    const { http } = this;
    return http.get("/v1/cache/stats");
  }

  /** Purges the prefix cache (admin) by POSTing an empty object to `/v1/cache/purge`. */
  async purge() {
    const { http } = this;
    const emptyBody = {};
    return http.post("/v1/cache/purge", emptyBody);
  }
};
|
|
256
|
+
|
|
257
|
+
// src/endpoints/lora.ts
|
|
258
|
+
/** Wrapper around the `/v1/lora/*` routes. */
var LoRAEndpoint = class {
  /** @param http - Transport providing `get` and `post`. */
  constructor(http) {
    this.http = http;
  }

  /** Loads a LoRA adapter (admin); `params` is sent unchanged as the request body. */
  async load(params) {
    const { http } = this;
    return http.post("/v1/lora/load", params);
  }

  /** Unloads a LoRA adapter (admin); the body is `{ adapter_id }`. */
  async unload(adapter_id) {
    const { http } = this;
    const payload = { adapter_id };
    return http.post("/v1/lora/unload", payload);
  }

  /** Lists the currently loaded adapters. */
  async list() {
    const { http } = this;
    return http.get("/v1/lora/list");
  }
};
|
|
275
|
+
|
|
276
|
+
// src/endpoints/monitor.ts
|
|
277
|
+
/** Wrapper around the health, monitoring and request-control routes. */
var MonitorEndpoint = class {
  /** @param http - Transport providing `get` and `post`. */
  constructor(http) {
    this.http = http;
  }

  /** Health check with engine stats (`GET /health`). */
  async health() {
    return this.http.get("/health");
  }

  /**
   * Check if server is reachable and ready.
   *
   * Resolves to `true` when the health payload reports status `"ok"` or
   * `"degraded"`. Any failure of the health request is deliberately turned
   * into `false` rather than propagated, so this never rejects.
   */
  async isReady() {
    try {
      const h = await this.health();
      return h.status === "ok" || h.status === "degraded";
    } catch {
      return false;
    }
  }

  /** List available models (`GET /v1/models`). */
  async models() {
    return this.http.get("/v1/models");
  }

  /** Live monitoring snapshot (batch, KV, perf, GPU) from `GET /v1/monitor`. */
  async snapshot() {
    return this.http.get("/v1/monitor");
  }

  /** Latency percentiles and request stats (`GET /v1/metrics`). */
  async metrics() {
    return this.http.get("/v1/metrics");
  }

  /** Expert routing distribution (MoE models) from `GET /v1/experts`. */
  async experts() {
    return this.http.get("/v1/experts");
  }

  /**
   * Cancel a running request.
   *
   * The id is percent-encoded before being placed in the URL path, so ids
   * containing `/`, `?`, `#` or spaces address the intended request instead of
   * changing the route or query string.
   */
  async cancel(requestId) {
    const encodedId = encodeURIComponent(String(requestId));
    return this.http.post(`/v1/cancel/${encodedId}`, {});
  }
};
|
|
315
|
+
|
|
316
|
+
// src/endpoints/rag.ts
|
|
317
|
+
/** Wrapper around the `/v1/rag/*` routes. */
var RAGEndpoint = class {
  /** @param http - Transport providing `get` and `post`. */
  constructor(http) {
    this.http = http;
  }

  /** Indexes text or a file; `params` is sent unchanged as the request body. */
  async index(params) {
    const { http } = this;
    return http.post("/v1/rag/index", params);
  }

  /** Searches indexed documents for `query`, asking for `k` results (default 3). */
  async search(query, k = 3) {
    const { http } = this;
    const payload = { query, k };
    return http.post("/v1/rag/search", payload);
  }

  /** Fetches RAG index statistics. */
  async stats() {
    const { http } = this;
    return http.get("/v1/rag/stats");
  }
};
|
|
334
|
+
|
|
335
|
+
// src/index.ts
|
|
336
|
+
/**
 * Top-level vllm-i64 client: owns one HTTP transport and exposes one endpoint
 * helper per API area, all sharing that transport.
 */
var I64Client = class {
  http;
  /** Chat completions (streaming + non-streaming, tool_calls). */
  chat;
  /** Text completions (streaming + batch). */
  completions;
  /** KV cache management (stats, purge). */
  cache;
  /** LoRA adapter management (load, unload, list). */
  lora;
  /** Monitoring, health, metrics, expert routing. */
  monitor;
  /** RAG — index, search, stats. */
  rag;

  /**
   * Create a vllm-i64 client.
   *
   * @param baseUrl - Server URL (default: http://localhost:8000)
   * @param options - Passed through to the HTTP transport (API key and timeout)
   */
  constructor(baseUrl = "http://localhost:8000", options = {}) {
    const transport = new HttpClient(baseUrl, options);
    this.http = transport;
    this.chat = new ChatEndpoint(transport);
    this.completions = new CompletionsEndpoint(transport);
    this.cache = new CacheEndpoint(transport);
    this.lora = new LoRAEndpoint(transport);
    this.monitor = new MonitorEndpoint(transport);
    this.rag = new RAGEndpoint(transport);
  }

  /** Server base URL, as held by the transport. */
  get baseUrl() {
    const { baseUrl } = this.http;
    return baseUrl;
  }
};

// Default export of the package.
var index_default = I64Client;
|
|
371
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
372
|
+
// The `0 &&` guard short-circuits, so the assignment below is never executed;
// the real exports are installed earlier through `__toCommonJS(index_exports)`.
// NOTE(review): presumably this statement exists only so that tooling can read
// the export names statically when this CommonJS file is imported from ESM —
// confirm before reformatting or removing it, since such detection may depend
// on this exact expression shape.
0 && (module.exports = {
|
|
373
|
+
CacheEndpoint,
|
|
374
|
+
ChatEndpoint,
|
|
375
|
+
CompletionsEndpoint,
|
|
376
|
+
HttpClient,
|
|
377
|
+
I64Client,
|
|
378
|
+
LoRAEndpoint,
|
|
379
|
+
MonitorEndpoint,
|
|
380
|
+
RAGEndpoint
|
|
381
|
+
});
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
// src/client.ts
|
|
2
|
+
/**
 * Minimal HTTP transport for a vllm-i64 server: JSON requests with an optional
 * bearer token, a connect/headers timeout, and Server-Sent-Events decoding.
 */
var HttpClient = class {
  baseUrl;
  apiKey;
  timeout;

  /**
   * @param baseUrl - Server root URL; trailing slashes are stripped.
   * @param options - `apiKey` (sent as a Bearer token when set) and
   *   `timeoutMs` (defaults to 120000).
   */
  constructor(baseUrl = "http://localhost:8000", options = {}) {
    this.baseUrl = baseUrl.replace(/\/+$/, "");
    this.apiKey = options.apiKey;
    this.timeout = options.timeoutMs ?? 12e4;
  }

  /**
   * Performs a request against `baseUrl + path` and returns the Response.
   *
   * `init.headers` may be a plain object, a `Headers` instance or an array of
   * pairs. `Content-Type: application/json` is added only when the caller did
   * not supply a content type (matched case-insensitively); the configured API
   * key always wins over a caller-supplied `Authorization` header.
   *
   * The timeout only covers the time until this method settles: the timer is
   * cleared in `finally`, so reading a long streaming body afterwards is not
   * interrupted by it. Note that `init.signal` is replaced by the internal
   * timeout signal.
   *
   * @throws {Error} `vllm-i64 <status>: <message>` for non-2xx responses,
   *   where the message is `error.message` from a JSON body when present and
   *   the raw body text otherwise.
   */
  async fetch(path, init = {}) {
    // Normalising through Headers keeps Headers instances / tuple arrays intact
    // (object spread would silently drop them) and makes lookups case-insensitive.
    const headers = new Headers(init.headers);
    if (!headers.has("Content-Type")) {
      headers.set("Content-Type", "application/json");
    }
    if (this.apiKey) {
      headers.set("Authorization", `Bearer ${this.apiKey}`);
    }

    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), this.timeout);
    try {
      const res = await fetch(`${this.baseUrl}${path}`, {
        ...init,
        headers,
        signal: controller.signal
      });
      if (!res.ok) {
        const body = await res.text();
        let msg;
        try {
          msg = JSON.parse(body)?.error?.message ?? body;
        } catch {
          // Body was not JSON; report it verbatim.
          msg = body;
        }
        throw new Error(`vllm-i64 ${res.status}: ${msg}`);
      }
      return res;
    } finally {
      clearTimeout(timer);
    }
  }

  /** GETs `path` and resolves with the parsed JSON body. */
  async get(path) {
    const res = await this.fetch(path);
    return res.json();
  }

  /** POSTs `body` as JSON to `path` and resolves with the parsed JSON body. */
  async post(path, body) {
    const res = await this.fetch(path, {
      method: "POST",
      body: JSON.stringify(body)
    });
    return res.json();
  }

  // ---------------------------------------------------------------
  // SSE streaming
  // ---------------------------------------------------------------

  /**
   * Shared SSE decoder: yields the parsed JSON value of every `data:` line in
   * `res.body` until the stream ends or a `[DONE]` sentinel is seen.
   *
   * Lines whose payload is not valid JSON are skipped (best effort). Parsing
   * happens outside the `yield` so that an error thrown into the generator by
   * its consumer is not mistaken for a parse failure and swallowed.
   */
  static async *#parseEvents(res) {
    if (!res.body) return;
    const reader = res.body.getReader();
    const decoder = new TextDecoder();
    let buffer = "";
    let exhausted = false;
    try {
      while (!exhausted) {
        const { done, value } = await reader.read();
        exhausted = done;
        // On the final pass flush the decoder and treat whatever is left in the
        // buffer as a complete line, so an event that is not newline-terminated
        // is still delivered.
        buffer += done ? decoder.decode() : decoder.decode(value, { stream: true });
        const lines = buffer.split("\n");
        buffer = done ? "" : (lines.pop() ?? "");
        for (const line of lines) {
          const trimmed = line.trim();
          // The space after the colon is optional in the SSE format.
          if (!trimmed.startsWith("data:")) continue;
          const payload = trimmed.slice(5).trimStart();
          if (payload === "[DONE]") return;
          let event;
          try {
            event = JSON.parse(payload);
          } catch {
            continue;
          }
          yield event;
        }
      }
    } finally {
      // Leaving early ([DONE], consumer break, or an error) must cancel the
      // body, otherwise the underlying connection stays open.
      if (!exhausted) {
        await reader.cancel().catch(() => {});
      }
      reader.releaseLock();
    }
  }

  /**
   * Yields the text of each streamed chunk: `choices[0].delta.content` when
   * present, otherwise `choices[0].text`. Chunks without text are skipped.
   */
  async *readSSE(res) {
    for await (const event of HttpClient.#parseEvents(res)) {
      const choice = event?.choices?.[0];
      const content = choice?.delta?.content ?? choice?.text ?? "";
      if (content) yield content;
    }
  }

  /** Yields every parsed SSE `data:` payload as-is. */
  async *readSSERaw(res) {
    yield* HttpClient.#parseEvents(res);
  }
};
|
|
113
|
+
|
|
114
|
+
// src/endpoints/chat.ts
|
|
115
|
+
/** Wrapper around the `/v1/chat/completions` route. */
var ChatEndpoint = class {
  /** @param http - Transport providing `post`, `fetch`, `readSSE` and `readSSERaw`. */
  constructor(http) {
    this.http = http;
  }

  /**
   * One-shot chat completion.
   *
   * `options` is merged over the defaults (`model: "default"` and the given
   * `messages`); `stream` is always forced to `false`.
   *
   * @example
   * ```ts
   * const res = await client.chat.create([{ role: "user", content: "Hi" }]);
   * console.log(res.choices[0].message.content);
   * ```
   */
  async create(messages, options = {}) {
    const payload = { model: "default", messages, ...options, stream: false };
    return this.http.post("/v1/chat/completions", payload);
  }

  /**
   * Streaming chat completion; yields whatever the transport's `readSSE`
   * produces for the response. `stream` is always forced to `true`.
   *
   * @example
   * ```ts
   * for await (const chunk of client.chat.stream([{ role: "user", content: "Hi" }])) {
   *   process.stdout.write(chunk);
   * }
   * ```
   */
  async *stream(messages, options = {}) {
    const payload = { model: "default", messages, ...options, stream: true };
    const requestInit = { method: "POST", body: JSON.stringify(payload) };
    const response = await this.http.fetch("/v1/chat/completions", requestInit);
    yield* this.http.readSSE(response);
  }

  /**
   * Same request as `stream`, but yields whatever the transport's
   * `readSSERaw` produces for the response.
   */
  async *streamRaw(messages, options = {}) {
    const payload = { model: "default", messages, ...options, stream: true };
    const requestInit = { method: "POST", body: JSON.stringify(payload) };
    const response = await this.http.fetch("/v1/chat/completions", requestInit);
    yield* this.http.readSSERaw(response);
  }
};
|
|
174
|
+
|
|
175
|
+
// src/endpoints/completions.ts
|
|
176
|
+
/** Wrapper around the `/v1/completions` and `/v1/batch` routes. */
var CompletionsEndpoint = class {
  /** @param http - Transport providing `post`, `fetch` and `readSSE`. */
  constructor(http) {
    this.http = http;
  }

  /**
   * One-shot text completion. `options` is merged over `model: "default"` and
   * the given `prompt`; `stream` is always forced to `false`.
   */
  async create(prompt, options = {}) {
    const payload = { model: "default", prompt, ...options, stream: false };
    return this.http.post("/v1/completions", payload);
  }

  /**
   * Streaming text completion; yields whatever the transport's `readSSE`
   * produces for the response. `stream` is always forced to `true`.
   */
  async *stream(prompt, options = {}) {
    const payload = { model: "default", prompt, ...options, stream: true };
    const requestInit = { method: "POST", body: JSON.stringify(payload) };
    const response = await this.http.fetch("/v1/completions", requestInit);
    yield* this.http.readSSE(response);
  }

  /** Sends several prompts in a single `/v1/batch` request. */
  async batch(prompts, options = {}) {
    const payload = { prompts, ...options };
    return this.http.post("/v1/batch", payload);
  }
};
|
|
207
|
+
|
|
208
|
+
// src/endpoints/cache.ts
|
|
209
|
+
/** Wrapper around the `/v1/cache/*` routes. */
var CacheEndpoint = class {
  /** @param http - Transport providing `get` and `post`. */
  constructor(http) {
    this.http = http;
  }

  /** Fetches KV cache statistics from `/v1/cache/stats`. */
  async stats() {
    const { http } = this;
    return http.get("/v1/cache/stats");
  }

  /** Purges the prefix cache (admin) by POSTing an empty object to `/v1/cache/purge`. */
  async purge() {
    const { http } = this;
    const emptyBody = {};
    return http.post("/v1/cache/purge", emptyBody);
  }
};
|
|
222
|
+
|
|
223
|
+
// src/endpoints/lora.ts
|
|
224
|
+
/** Wrapper around the `/v1/lora/*` routes. */
var LoRAEndpoint = class {
  /** @param http - Transport providing `get` and `post`. */
  constructor(http) {
    this.http = http;
  }

  /** Loads a LoRA adapter (admin); `params` is sent unchanged as the request body. */
  async load(params) {
    const { http } = this;
    return http.post("/v1/lora/load", params);
  }

  /** Unloads a LoRA adapter (admin); the body is `{ adapter_id }`. */
  async unload(adapter_id) {
    const { http } = this;
    const payload = { adapter_id };
    return http.post("/v1/lora/unload", payload);
  }

  /** Lists the currently loaded adapters. */
  async list() {
    const { http } = this;
    return http.get("/v1/lora/list");
  }
};
|
|
241
|
+
|
|
242
|
+
// src/endpoints/monitor.ts
|
|
243
|
+
/** Wrapper around the health, monitoring and request-control routes. */
var MonitorEndpoint = class {
  /** @param http - Transport providing `get` and `post`. */
  constructor(http) {
    this.http = http;
  }

  /** Health check with engine stats (`GET /health`). */
  async health() {
    return this.http.get("/health");
  }

  /**
   * Check if server is reachable and ready.
   *
   * Resolves to `true` when the health payload reports status `"ok"` or
   * `"degraded"`. Any failure of the health request is deliberately turned
   * into `false` rather than propagated, so this never rejects.
   */
  async isReady() {
    try {
      const h = await this.health();
      return h.status === "ok" || h.status === "degraded";
    } catch {
      return false;
    }
  }

  /** List available models (`GET /v1/models`). */
  async models() {
    return this.http.get("/v1/models");
  }

  /** Live monitoring snapshot (batch, KV, perf, GPU) from `GET /v1/monitor`. */
  async snapshot() {
    return this.http.get("/v1/monitor");
  }

  /** Latency percentiles and request stats (`GET /v1/metrics`). */
  async metrics() {
    return this.http.get("/v1/metrics");
  }

  /** Expert routing distribution (MoE models) from `GET /v1/experts`. */
  async experts() {
    return this.http.get("/v1/experts");
  }

  /**
   * Cancel a running request.
   *
   * The id is percent-encoded before being placed in the URL path, so ids
   * containing `/`, `?`, `#` or spaces address the intended request instead of
   * changing the route or query string.
   */
  async cancel(requestId) {
    const encodedId = encodeURIComponent(String(requestId));
    return this.http.post(`/v1/cancel/${encodedId}`, {});
  }
};
|
|
281
|
+
|
|
282
|
+
// src/endpoints/rag.ts
|
|
283
|
+
/** Wrapper around the `/v1/rag/*` routes. */
var RAGEndpoint = class {
  /** @param http - Transport providing `get` and `post`. */
  constructor(http) {
    this.http = http;
  }

  /** Indexes text or a file; `params` is sent unchanged as the request body. */
  async index(params) {
    const { http } = this;
    return http.post("/v1/rag/index", params);
  }

  /** Searches indexed documents for `query`, asking for `k` results (default 3). */
  async search(query, k = 3) {
    const { http } = this;
    const payload = { query, k };
    return http.post("/v1/rag/search", payload);
  }

  /** Fetches RAG index statistics. */
  async stats() {
    const { http } = this;
    return http.get("/v1/rag/stats");
  }
};
|
|
300
|
+
|
|
301
|
+
// src/index.ts
|
|
302
|
+
/**
 * Top-level vllm-i64 client: owns one HTTP transport and exposes one endpoint
 * helper per API area, all sharing that transport.
 */
var I64Client = class {
  http;
  /** Chat completions (streaming + non-streaming, tool_calls). */
  chat;
  /** Text completions (streaming + batch). */
  completions;
  /** KV cache management (stats, purge). */
  cache;
  /** LoRA adapter management (load, unload, list). */
  lora;
  /** Monitoring, health, metrics, expert routing. */
  monitor;
  /** RAG — index, search, stats. */
  rag;

  /**
   * Create a vllm-i64 client.
   *
   * @param baseUrl - Server URL (default: http://localhost:8000)
   * @param options - Passed through to the HTTP transport (API key and timeout)
   */
  constructor(baseUrl = "http://localhost:8000", options = {}) {
    const transport = new HttpClient(baseUrl, options);
    this.http = transport;
    this.chat = new ChatEndpoint(transport);
    this.completions = new CompletionsEndpoint(transport);
    this.cache = new CacheEndpoint(transport);
    this.lora = new LoRAEndpoint(transport);
    this.monitor = new MonitorEndpoint(transport);
    this.rag = new RAGEndpoint(transport);
  }

  /** Server base URL, as held by the transport. */
  get baseUrl() {
    const { baseUrl } = this.http;
    return baseUrl;
  }
};

// Default export of the package.
var index_default = I64Client;
|
|
337
|
+
export {
|
|
338
|
+
CacheEndpoint,
|
|
339
|
+
ChatEndpoint,
|
|
340
|
+
CompletionsEndpoint,
|
|
341
|
+
HttpClient,
|
|
342
|
+
I64Client,
|
|
343
|
+
LoRAEndpoint,
|
|
344
|
+
MonitorEndpoint,
|
|
345
|
+
RAGEndpoint,
|
|
346
|
+
index_default as default
|
|
347
|
+
};
|