vllm-i64 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +432 -0
- package/dist/index.d.ts +432 -0
- package/dist/index.js +381 -0
- package/dist/index.mjs +347 -0
- package/package.json +47 -0
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* vllm-i64 SDK — Type definitions
|
|
3
|
+
*
|
|
4
|
+
* All interfaces for requests, responses, and API objects.
|
|
5
|
+
*
|
|
6
|
+
* INL - 2025
|
|
7
|
+
*/
|
|
8
|
+
interface ChatMessage {
|
|
9
|
+
role: "system" | "user" | "assistant" | "tool";
|
|
10
|
+
content: string | null;
|
|
11
|
+
tool_calls?: ToolCall[];
|
|
12
|
+
tool_call_id?: string;
|
|
13
|
+
name?: string;
|
|
14
|
+
}
|
|
15
|
+
interface ToolCall {
|
|
16
|
+
id: string;
|
|
17
|
+
type: "function";
|
|
18
|
+
function: {
|
|
19
|
+
name: string;
|
|
20
|
+
arguments: string;
|
|
21
|
+
};
|
|
22
|
+
}
|
|
23
|
+
interface ToolDefinition {
|
|
24
|
+
type: "function";
|
|
25
|
+
function: {
|
|
26
|
+
name: string;
|
|
27
|
+
description: string;
|
|
28
|
+
parameters: Record<string, unknown>;
|
|
29
|
+
};
|
|
30
|
+
}
|
|
31
|
+
/** Request payload for a chat completion. */
interface ChatCompletionRequest {
    /** Model id; optional when the server hosts a single model. */
    model?: string;
    /** Conversation history, oldest first. */
    messages: ChatMessage[];
    /** Sampling temperature. */
    temperature?: number;
    /** Nucleus sampling cutoff. */
    top_p?: number;
    /** Top-k sampling cutoff. */
    top_k?: number;
    /** Maximum number of tokens to generate. */
    max_tokens?: number;
    /** When true, the server streams incremental deltas (see StreamDelta). */
    stream?: boolean;
    /** Tools the model may call. */
    tools?: ToolDefinition[];
    /** Either a mode string, or an object forcing a specific function by name. */
    tool_choice?: string | {
        type: string;
        function: {
            name: string;
        };
    };
    /** Stop sequence(s) that end generation. */
    stop?: string | string[];
    /** Penalty applied to repeated tokens. */
    repetition_penalty?: number;
    /** Request per-token log probabilities. */
    logprobs?: boolean;
}
/** One generated alternative within a ChatCompletionResponse. */
interface ChatCompletionChoice {
    /** Position of this choice in the response. */
    index: number;
    /** The generated assistant message. */
    message: ChatMessage;
    /** Why generation ended. */
    finish_reason: "stop" | "length" | "tool_calls";
    /** Log-probability payload when requested; shape is server-defined. */
    logprobs?: unknown;
}
/** Non-streaming chat completion response. */
interface ChatCompletionResponse {
    id: string;
    object: "chat.completion";
    /** Creation time (presumably a Unix timestamp — confirm against server). */
    created: number;
    model: string;
    choices: ChatCompletionChoice[];
    /** Token accounting, when the server reports it. */
    usage?: UsageInfo;
}
|
|
64
|
+
/** Request payload for a plain text completion. */
interface CompletionRequest {
    /** Model id; optional when the server hosts a single model. */
    model?: string;
    /** The prompt to complete. */
    prompt: string;
    /** Maximum number of tokens to generate. */
    max_tokens?: number;
    /** Sampling temperature. */
    temperature?: number;
    /** Nucleus sampling cutoff. */
    top_p?: number;
    /** Top-k sampling cutoff. */
    top_k?: number;
    /** When true, the server streams text chunks. */
    stream?: boolean;
    /** Stop sequence(s) that end generation. */
    stop?: string | string[];
}
/** Non-streaming text completion response. */
interface CompletionResponse {
    id: string;
    object: "text_completion";
    /** Creation time (presumably a Unix timestamp — confirm against server). */
    created: number;
    model: string;
    choices: {
        /** Generated text for this choice. */
        text: string;
        index: number;
        finish_reason: "stop" | "length";
    }[];
    /** Token accounting, when the server reports it. */
    usage?: UsageInfo;
}
/** Token usage accounting for a request. */
interface UsageInfo {
    prompt_tokens: number;
    completion_tokens: number;
    /** prompt_tokens + completion_tokens. */
    total_tokens: number;
}
|
|
91
|
+
/** One parsed SSE chunk of a streaming chat completion (yielded by readSSERaw / streamRaw). */
interface StreamDelta {
    id: string;
    choices: {
        index: number;
        /** Incremental fields; role typically appears only on the first chunk. */
        delta: {
            role?: string;
            content?: string;
        };
        /** Null until the final chunk for this choice. */
        finish_reason: string | null;
    }[];
}
/** Server health report. */
interface HealthResponse {
    status: "ok" | "degraded" | "error";
    /** Seconds since the server started. */
    uptime_seconds: number;
    /** Whether a model is loaded and able to serve. */
    model_loaded: boolean;
    /** KV cache utilization, 0-100. */
    kv_cache_usage_pct?: number;
    gpu?: GpuInfo;
    /** Server may attach additional diagnostic fields. */
    [key: string]: unknown;
}
/** GPU memory and utilization figures. */
interface GpuInfo {
    /** Free GPU memory in megabytes. */
    free_mb: number;
    /** Total GPU memory in megabytes. */
    total_mb: number;
    /** Used GPU memory in megabytes. */
    used_mb: number;
    /** GPU utilization, 0-100. */
    utilization_pct: number;
}
/** A model entry as listed by the server. */
interface ModelInfo {
    id: string;
    object: "model";
    owned_by: string;
}
|
|
121
|
+
/** KV cache block accounting (see CacheEndpoint.stats). */
interface CacheStats {
    /** Total number of KV cache blocks. */
    num_blocks: number;
    used_blocks: number;
    free_blocks: number;
    /** Tokens per block. */
    block_size: number;
    /** Sequences currently holding cache blocks. */
    active_seqs: number;
    /** Cache utilization, 0-100. */
    usage_pct: number;
    /** Blocks retained by the prefix cache, when enabled. */
    prefix_cached_blocks?: number;
    /** Distinct prefix hashes tracked, when enabled. */
    prefix_unique_hashes?: number;
    /** Cumulative LRU evictions since start, when reported. */
    lru_evictions_total?: number;
    /** Sequences swapped out of GPU memory, when reported. */
    swapped_seqs?: number;
    /** Server may attach additional counters. */
    [key: string]: unknown;
}
/** Result of purging the prefix cache (see CacheEndpoint.purge). */
interface CachePurgeResult {
    status: string;
    /** Number of blocks released by the purge. */
    purged_blocks: number;
}
|
|
138
|
+
/** Live engine monitoring snapshot (see MonitorEndpoint.snapshot). */
interface MonitorSnapshot {
    /** Snapshot time (presumably a Unix timestamp — confirm against server). */
    timestamp: number;
    /** Seconds since the server started. */
    uptime_s: number;
    /** Total requests completed since start. */
    requests_served: number;
    /** Requests currently in flight. */
    active_requests: number;
    /** Largest batch size observed. */
    peak_batch_size: number;
    /** Scheduler counters, keyed by metric name. */
    scheduler: Record<string, number>;
    engine: {
        /** Total engine steps executed. */
        total_steps: number;
        total_tokens_generated: number;
    };
    /** KV cache stats, when available. */
    kv_cache?: CacheStats;
    /** Performance figures, when available. */
    perf?: {
        /** Average engine step latency in milliseconds. */
        avg_step_ms: number;
        /** Generation throughput in tokens per second. */
        tok_per_s: number;
        /** Share of step time spent in the forward pass, 0-100. */
        forward_pct: number;
    };
    /** GPU memory/utilization, when available. */
    gpu?: {
        free_mb: number;
        total_mb: number;
        utilization_pct: number;
    };
    /** LoRA adapter status, when available. */
    lora?: {
        loaded_adapters: number;
        /** Names of the loaded adapters. */
        adapters: string[];
    };
}
/** Expert routing distribution for MoE models (see MonitorEndpoint.experts). */
interface ExpertStats {
    num_experts: number;
    /** Total tokens routed. */
    total_tokens: number;
    /** Per-expert routing fractions (parallel to counts). */
    distribution: number[];
    /** Per-expert routed token counts. */
    counts: number[];
    /** Load-imbalance measure; exact definition is server-side. */
    imbalance: number;
}
|
|
172
|
+
/** A loaded LoRA adapter as listed by the server. */
interface LoRAAdapter {
    /** Numeric adapter id used in requests. */
    id: number;
    /** Human-readable adapter name. */
    name: string;
}
/** Parameters for loading a LoRA adapter (see LoRAEndpoint.load). */
interface LoRALoadParams {
    /** Id to assign to the adapter. */
    adapter_id: number;
    /** Server-side path to the adapter weights. */
    path: string;
    /** Optional display name. */
    name?: string;
    /** Optional scaling factor applied to the adapter. */
    scaling?: number;
}
/** Result of a successful LoRA load. */
interface LoRALoadResult {
    status: string;
    adapter_id: number;
    name: string;
}
/** Result of a LoRA unload. */
interface LoRAUnloadResult {
    status: string;
    adapter_id: number;
}
/** Result of listing loaded LoRA adapters. */
interface LoRAListResult {
    adapters: LoRAAdapter[];
}
|
|
194
|
+
/** Input for RAG indexing: provide inline text or a server-side file path. */
interface RAGIndexParams {
    /** Raw text to index. */
    text?: string;
    /** Path to a file to index (resolved server-side). */
    file?: string;
}
/** Result of a RAG indexing operation. */
interface RAGIndexResult {
    status: string;
    /** Number of chunks produced from the input. */
    chunks: number;
}
/** Result of a RAG similarity search. */
interface RAGSearchResult {
    /** The query that was searched. */
    query: string;
    results: RAGResult[];
    /** Number of results returned. */
    count: number;
}
/** One retrieved chunk with its similarity score. */
interface RAGResult {
    /** Chunk text. */
    text: string;
    /** Similarity score; scale is server-defined. */
    score: number;
    /** Server may attach additional metadata. */
    [key: string]: unknown;
}
/** RAG index statistics (see RAGEndpoint.stats). */
interface RAGStatsResult {
    /** Whether RAG is enabled on the server. */
    enabled: boolean;
    /** Total chunks currently indexed. */
    total_chunks: number;
    /** Embedding dimensionality. */
    dimension: number;
}
|
|
217
|
+
|
|
218
|
+
/**
 * vllm-i64 SDK — HTTP Client core
 *
 * Handles fetch, auth, timeouts, and SSE streaming.
 * Endpoint modules use this as their base.
 *
 * INL - 2025
 */

/** Options shared by HttpClient and I64Client constructors. */
interface ClientOptions {
    /** API key sent with each request (auth scheme is client-internal). */
    apiKey?: string;
    /** Per-request timeout in milliseconds. */
    timeoutMs?: number;
}
/** Low-level HTTP transport used by all endpoint classes. */
declare class HttpClient {
    /** Server base URL this client targets. */
    readonly baseUrl: string;
    private apiKey?;
    private timeout;
    /**
     * @param baseUrl - Server URL (defaulted internally when omitted).
     * @param options - API key and timeout.
     */
    constructor(baseUrl?: string, options?: ClientOptions);
    /** Raw fetch against `baseUrl + path` with auth/timeout applied. */
    fetch(path: string, init?: RequestInit): Promise<Response>;
    /** GET `path` and parse the response as T. */
    get<T>(path: string): Promise<T>;
    /** POST `body` to `path` and parse the response as T. */
    post<T>(path: string, body: unknown): Promise<T>;
    /** Iterate an SSE response, yielding content strings. */
    readSSE(res: Response): AsyncGenerator<string, void, undefined>;
    /** Iterate an SSE response, yielding parsed raw delta objects. */
    readSSERaw(res: Response): AsyncGenerator<StreamDelta, void, undefined>;
}
|
|
242
|
+
|
|
243
|
+
/**
 * Chat completions endpoint.
 *
 * INL - 2025
 */

declare class ChatEndpoint {
    private http;
    /** @param http - Shared HTTP transport. */
    constructor(http: HttpClient);
    /**
     * Chat completion (non-streaming).
     *
     * @param messages - Conversation history, oldest first.
     * @param options - Sampling and tool options (messages/stream are set internally).
     *
     * @example
     * ```ts
     * const res = await client.chat.create([{ role: "user", content: "Hi" }]);
     * console.log(res.choices[0].message.content);
     * ```
     */
    create(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): Promise<ChatCompletionResponse>;
    /**
     * Streaming chat — yields content strings.
     *
     * @example
     * ```ts
     * for await (const chunk of client.chat.stream([{ role: "user", content: "Hi" }])) {
     *   process.stdout.write(chunk);
     * }
     * ```
     */
    stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
    /**
     * Streaming chat — yields raw SSE delta objects.
     */
    streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
}
|
|
278
|
+
|
|
279
|
+
/**
 * Text completions endpoint.
 *
 * INL - 2025
 */

declare class CompletionsEndpoint {
    private http;
    /** @param http - Shared HTTP transport. */
    constructor(http: HttpClient);
    /** Text completion (non-streaming). */
    create(prompt: string, options?: Omit<CompletionRequest, "prompt" | "stream">): Promise<CompletionResponse>;
    /** Streaming text completion — yields text chunks. */
    stream(prompt: string, options?: Omit<CompletionRequest, "prompt" | "stream">): AsyncGenerator<string, void, undefined>;
    /**
     * Submit multiple prompts at once.
     *
     * @param prompts - Prompts to complete; results are returned in order.
     * @param options - Sampling options applied to every prompt.
     */
    batch(prompts: string[], options?: {
        max_tokens?: number;
        temperature?: number;
    }): Promise<{
        results: CompletionResponse[];
    }>;
}
|
|
300
|
+
|
|
301
|
+
/**
 * KV cache management endpoints.
 *
 * INL - 2025
 */

declare class CacheEndpoint {
    private http;
    /** @param http - Shared HTTP transport. */
    constructor(http: HttpClient);
    /** Get KV cache statistics. */
    stats(): Promise<CacheStats>;
    /** Purge prefix cache (admin). */
    purge(): Promise<CachePurgeResult>;
}
|
|
315
|
+
|
|
316
|
+
/**
 * LoRA adapter management endpoints.
 *
 * INL - 2025
 */

declare class LoRAEndpoint {
    private http;
    /** @param http - Shared HTTP transport. */
    constructor(http: HttpClient);
    /** Load a LoRA adapter (admin). */
    load(params: LoRALoadParams): Promise<LoRALoadResult>;
    /** Unload a LoRA adapter by id (admin). */
    unload(adapter_id: number): Promise<LoRAUnloadResult>;
    /** List loaded adapters. */
    list(): Promise<LoRAListResult>;
}
|
|
332
|
+
|
|
333
|
+
/**
 * Monitoring, health, and expert routing endpoints.
 *
 * INL - 2025
 */

declare class MonitorEndpoint {
    private http;
    /** @param http - Shared HTTP transport. */
    constructor(http: HttpClient);
    /** Health check with engine stats. */
    health(): Promise<HealthResponse>;
    /** Check if server is reachable and ready. Resolves false rather than throwing — confirm in implementation. */
    isReady(): Promise<boolean>;
    /** List available models. */
    models(): Promise<{
        data: ModelInfo[];
    }>;
    /** Live monitoring snapshot (batch, KV, perf, GPU). */
    snapshot(): Promise<MonitorSnapshot>;
    /** Latency percentiles and request stats; shape is server-defined. */
    metrics(): Promise<Record<string, unknown>>;
    /** Expert routing distribution (MoE models). */
    experts(): Promise<ExpertStats>;
    /** Cancel a running request by its id. */
    cancel(requestId: string): Promise<{
        status: string;
    }>;
}
|
|
361
|
+
|
|
362
|
+
/**
 * RAG (Retrieval-Augmented Generation) endpoints.
 *
 * INL - 2025
 */

declare class RAGEndpoint {
    private http;
    /** @param http - Shared HTTP transport. */
    constructor(http: HttpClient);
    /** Index text or a file. */
    index(params: RAGIndexParams): Promise<RAGIndexResult>;
    /** Search indexed documents. @param k - Max results to return. */
    search(query: string, k?: number): Promise<RAGSearchResult>;
    /** Get RAG index statistics. */
    stats(): Promise<RAGStatsResult>;
}
|
|
378
|
+
|
|
379
|
+
/**
 * vllm-i64 — TypeScript SDK
 *
 * Zero-dependency client for the vllm-i64 inference engine.
 *
 * @example
 * ```ts
 * import { I64Client } from "vllm-i64";
 *
 * const client = new I64Client("http://localhost:8000");
 *
 * // Chat
 * const res = await client.chat.create([{ role: "user", content: "Hello!" }]);
 *
 * // Stream
 * for await (const chunk of client.chat.stream([{ role: "user", content: "Hi" }])) {
 *   process.stdout.write(chunk);
 * }
 *
 * // Admin
 * await client.monitor.snapshot();
 * await client.cache.purge();
 * await client.lora.load({ adapter_id: 1, path: "/models/v2" });
 * ```
 *
 * INL - 2025
 */

declare class I64Client {
    private http;
    /** Chat completions (streaming + non-streaming, tool_calls). */
    readonly chat: ChatEndpoint;
    /** Text completions (streaming + batch). */
    readonly completions: CompletionsEndpoint;
    /** KV cache management (stats, purge). */
    readonly cache: CacheEndpoint;
    /** LoRA adapter management (load, unload, list). */
    readonly lora: LoRAEndpoint;
    /** Monitoring, health, metrics, expert routing. */
    readonly monitor: MonitorEndpoint;
    /** RAG — index, search, stats. */
    readonly rag: RAGEndpoint;
    /**
     * Create a vllm-i64 client.
     *
     * @param baseUrl - Server URL (default: http://localhost:8000)
     * @param options - API key and timeout
     */
    constructor(baseUrl?: string, options?: ClientOptions);
    /** Server base URL. */
    get baseUrl(): string;
}
|
|
431
|
+
|
|
432
|
+
// Public API surface: endpoint classes are exported as values, request/response
// shapes as type-only exports; I64Client doubles as the default export.
export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };
|