vllm-i64 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,432 @@
1
+ /**
2
+ * vllm-i64 SDK — Type definitions
3
+ *
4
+ * All interfaces for requests, responses, and API objects.
5
+ *
6
+ * INL - 2025
7
+ */
8
/** One message in a chat conversation. */
interface ChatMessage {
    /** Author of the message. */
    role: "system" | "user" | "assistant" | "tool";
    /** Message text; the type allows null (presumably for assistant messages that only carry tool_calls — confirm server contract). */
    content: string | null;
    /** Tool invocations emitted by the assistant, if any. */
    tool_calls?: ToolCall[];
    /** For role "tool": id of the ToolCall this message answers. */
    tool_call_id?: string;
    /** Optional participant/tool name. */
    name?: string;
}
/** A single function call requested by the model. */
interface ToolCall {
    /** Unique call id. */
    id: string;
    /** Only "function" calls are modeled. */
    type: "function";
    function: {
        /** Name of the function to invoke. */
        name: string;
        /** Argument payload as a raw string (typically JSON-encoded). */
        arguments: string;
    };
}
/** Declaration of a callable tool, passed via ChatCompletionRequest.tools. */
interface ToolDefinition {
    type: "function";
    function: {
        name: string;
        /** Human-readable description of what the function does. */
        description: string;
        /** Parameter specification (typically a JSON Schema object). */
        parameters: Record<string, unknown>;
    };
}
31
/** Request payload for a chat completion. */
interface ChatCompletionRequest {
    /** Model id; server default applies when omitted. */
    model?: string;
    /** Conversation history, oldest first. */
    messages: ChatMessage[];
    /** Sampling temperature. */
    temperature?: number;
    /** Nucleus (top-p) sampling cutoff. */
    top_p?: number;
    /** Top-k sampling cutoff. */
    top_k?: number;
    /** Maximum number of tokens to generate. */
    max_tokens?: number;
    /** When true, the server streams SSE deltas (see StreamDelta). */
    stream?: boolean;
    /** Tools the model is allowed to call. */
    tools?: ToolDefinition[];
    /** Tool selection: either a mode string or a specific function to force. */
    tool_choice?: string | {
        type: string;
        function: {
            name: string;
        };
    };
    /** Stop sequence(s) that terminate generation. */
    stop?: string | string[];
    /** Penalty applied to repeated tokens. */
    repetition_penalty?: number;
    /** Request token logprobs in the response. */
    logprobs?: boolean;
}
/** One generated choice in a chat completion response. */
interface ChatCompletionChoice {
    /** Position of this choice in the response. */
    index: number;
    /** The generated assistant message (may carry tool_calls). */
    message: ChatMessage;
    /** Why generation stopped for this choice. */
    finish_reason: "stop" | "length" | "tool_calls";
    /** Logprob payload when requested; shape is not typed here. */
    logprobs?: unknown;
}
/** Response body of a non-streaming chat completion. */
interface ChatCompletionResponse {
    /** Request id assigned by the server. */
    id: string;
    object: "chat.completion";
    /** Creation timestamp (presumably Unix seconds — confirm). */
    created: number;
    /** Model that served the request. */
    model: string;
    choices: ChatCompletionChoice[];
    /** Token accounting, when reported by the server. */
    usage?: UsageInfo;
}
64
/** Request payload for a plain text completion. */
interface CompletionRequest {
    /** Model id; server default applies when omitted. */
    model?: string;
    /** Prompt text to complete. */
    prompt: string;
    /** Maximum number of tokens to generate. */
    max_tokens?: number;
    /** Sampling temperature. */
    temperature?: number;
    /** Nucleus (top-p) sampling cutoff. */
    top_p?: number;
    /** Top-k sampling cutoff. */
    top_k?: number;
    /** When true, the server streams results. */
    stream?: boolean;
    /** Stop sequence(s) that terminate generation. */
    stop?: string | string[];
}
/** Response body of a non-streaming text completion. */
interface CompletionResponse {
    id: string;
    object: "text_completion";
    /** Creation timestamp (presumably Unix seconds — confirm). */
    created: number;
    model: string;
    choices: {
        /** Generated text for this choice. */
        text: string;
        index: number;
        finish_reason: "stop" | "length";
    }[];
    /** Token accounting, when reported by the server. */
    usage?: UsageInfo;
}
/** Token usage accounting for a request. */
interface UsageInfo {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
}
/** One SSE event in a streaming chat response. */
interface StreamDelta {
    /** Request id; shared across all chunks of one stream. */
    id: string;
    choices: {
        index: number;
        /** Incremental update; only the fields that changed are present. */
        delta: {
            role?: string;
            content?: string;
        };
        /** Null while this choice is still streaming. */
        finish_reason: string | null;
    }[];
}
102
/** Health-check payload (MonitorEndpoint.health()). */
interface HealthResponse {
    /** Overall server state. */
    status: "ok" | "degraded" | "error";
    /** Seconds since the server started. */
    uptime_seconds: number;
    /** Whether a model is loaded and ready. */
    model_loaded: boolean;
    /** KV cache utilization percentage, when reported. */
    kv_cache_usage_pct?: number;
    /** GPU memory snapshot, when a GPU is present. */
    gpu?: GpuInfo;
    /** The server may attach additional ad-hoc fields. */
    [key: string]: unknown;
}
/** GPU memory and utilization snapshot. */
interface GpuInfo {
    free_mb: number;
    total_mb: number;
    used_mb: number;
    /** Utilization percentage. */
    utilization_pct: number;
}
/** One entry in the model list (MonitorEndpoint.models()). */
interface ModelInfo {
    /** Model identifier. */
    id: string;
    object: "model";
    /** Owner/organization string. */
    owned_by: string;
}
121
/** KV cache block statistics (CacheEndpoint.stats()). */
interface CacheStats {
    /** Total number of KV cache blocks. */
    num_blocks: number;
    used_blocks: number;
    free_blocks: number;
    /** Size of one cache block (presumably in tokens — confirm). */
    block_size: number;
    /** Sequences currently holding cache blocks. */
    active_seqs: number;
    /** Cache utilization percentage. */
    usage_pct: number;
    /** Blocks served from the prefix cache, when prefix caching is enabled. */
    prefix_cached_blocks?: number;
    /** Distinct prefix hashes tracked, when prefix caching is enabled. */
    prefix_unique_hashes?: number;
    /** Cumulative LRU evictions, when reported. */
    lru_evictions_total?: number;
    /** Sequences swapped out of GPU memory, when reported. */
    swapped_seqs?: number;
    /** The server may attach additional ad-hoc fields. */
    [key: string]: unknown;
}
/** Result of a cache purge (CacheEndpoint.purge()). */
interface CachePurgeResult {
    status: string;
    /** Number of blocks released by the purge. */
    purged_blocks: number;
}
138
/** Live engine snapshot (MonitorEndpoint.snapshot()). */
interface MonitorSnapshot {
    /** Snapshot timestamp. */
    timestamp: number;
    /** Seconds since server start. */
    uptime_s: number;
    /** Requests completed since start. */
    requests_served: number;
    /** Requests currently in flight. */
    active_requests: number;
    /** Largest batch size observed. */
    peak_batch_size: number;
    /** Scheduler counters, keyed by metric name. */
    scheduler: Record<string, number>;
    engine: {
        /** Engine steps executed since start. */
        total_steps: number;
        /** Tokens generated since start. */
        total_tokens_generated: number;
    };
    /** KV cache stats, when available. */
    kv_cache?: CacheStats;
    /** Performance counters, when collected. */
    perf?: {
        /** Average engine step duration in milliseconds. */
        avg_step_ms: number;
        /** Generation throughput in tokens per second. */
        tok_per_s: number;
        /** Share of step time spent in the forward pass. */
        forward_pct: number;
    };
    /** GPU memory snapshot, when a GPU is present. */
    gpu?: {
        free_mb: number;
        total_mb: number;
        utilization_pct: number;
    };
    /** LoRA adapter summary, when the feature is enabled. */
    lora?: {
        loaded_adapters: number;
        adapters: string[];
    };
}
/** Expert routing distribution for MoE models (MonitorEndpoint.experts()). */
interface ExpertStats {
    num_experts: number;
    /** Total tokens routed. */
    total_tokens: number;
    /** Per-expert routing distribution (presumably normalized — confirm). */
    distribution: number[];
    /** Per-expert token counts. */
    counts: number[];
    /** Load-imbalance measure across experts. */
    imbalance: number;
}
172
/** A loaded LoRA adapter as reported by the server. */
interface LoRAAdapter {
    id: number;
    name: string;
}
/** Parameters for LoRAEndpoint.load(). */
interface LoRALoadParams {
    /** Adapter slot id to load into. */
    adapter_id: number;
    /** Server-side path to the adapter weights. */
    path: string;
    /** Display name for the adapter. */
    name?: string;
    /** LoRA scaling factor. */
    scaling?: number;
}
/** Result of LoRAEndpoint.load(). */
interface LoRALoadResult {
    status: string;
    adapter_id: number;
    name: string;
}
/** Result of LoRAEndpoint.unload(). */
interface LoRAUnloadResult {
    status: string;
    adapter_id: number;
}
/** Result of LoRAEndpoint.list(). */
interface LoRAListResult {
    adapters: LoRAAdapter[];
}
194
/** Parameters for RAGEndpoint.index() — both fields are optional; supply text or a file path. */
interface RAGIndexParams {
    /** Raw text to index. */
    text?: string;
    /** Server-side file path to index. */
    file?: string;
}
/** Result of RAGEndpoint.index(). */
interface RAGIndexResult {
    status: string;
    /** Number of chunks created from the input. */
    chunks: number;
}
/** Result of RAGEndpoint.search(). */
interface RAGSearchResult {
    /** Echo of the search query. */
    query: string;
    results: RAGResult[];
    /** Number of results returned. */
    count: number;
}
/** One retrieved chunk. */
interface RAGResult {
    /** Chunk text. */
    text: string;
    /** Relevance score (ordering semantics not specified here — confirm). */
    score: number;
    /** The server may attach additional ad-hoc fields. */
    [key: string]: unknown;
}
/** Result of RAGEndpoint.stats(). */
interface RAGStatsResult {
    /** Whether RAG support is enabled on the server. */
    enabled: boolean;
    /** Total indexed chunks. */
    total_chunks: number;
    /** Embedding dimension of the index. */
    dimension: number;
}
217
+
218
+ /**
219
+ * vllm-i64 SDK — HTTP Client core
220
+ *
221
+ * Handles fetch, auth, timeouts, and SSE streaming.
222
+ * Endpoint modules use this as their base.
223
+ *
224
+ * INL - 2025
225
+ */
226
+
227
/** Construction options shared by HttpClient and I64Client. */
interface ClientOptions {
    /** API key used to authenticate requests. */
    apiKey?: string;
    /** Per-request timeout in milliseconds. */
    timeoutMs?: number;
}
/**
 * Low-level HTTP transport: fetch with auth and timeouts, plus SSE stream
 * readers. Endpoint classes (ChatEndpoint, MonitorEndpoint, ...) build on it.
 */
declare class HttpClient {
    /** Server base URL. */
    readonly baseUrl: string;
    private apiKey?;
    private timeout;
    constructor(baseUrl?: string, options?: ClientOptions);
    /** Perform a request against baseUrl + path. */
    fetch(path: string, init?: RequestInit): Promise<Response>;
    /** GET path and resolve with the decoded response body. */
    get<T>(path: string): Promise<T>;
    /** POST a body to path and resolve with the decoded response. */
    post<T>(path: string, body: unknown): Promise<T>;
    /** Read an SSE response, yielding one string per content chunk. */
    readSSE(res: Response): AsyncGenerator<string, void, undefined>;
    /** Read an SSE response, yielding parsed StreamDelta objects. */
    readSSERaw(res: Response): AsyncGenerator<StreamDelta, void, undefined>;
}
242
+
243
+ /**
244
+ * Chat completions endpoint.
245
+ *
246
+ * INL - 2025
247
+ */
248
+
249
/** Chat completions endpoint wrapper. */
declare class ChatEndpoint {
    private http;
    constructor(http: HttpClient);
    /**
     * Chat completion (non-streaming).
     *
     * @param messages - Conversation history, oldest first.
     * @param options - Request options; "messages" and "stream" are managed internally.
     * @example
     * ```ts
     * const res = await client.chat.create([{ role: "user", content: "Hi" }]);
     * console.log(res.choices[0].message.content);
     * ```
     */
    create(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): Promise<ChatCompletionResponse>;
    /**
     * Streaming chat — yields content strings as they arrive.
     *
     * @example
     * ```ts
     * for await (const chunk of client.chat.stream([{ role: "user", content: "Hi" }])) {
     *   process.stdout.write(chunk);
     * }
     * ```
     */
    stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
    /**
     * Streaming chat — yields raw SSE delta objects (role/content deltas and
     * finish_reason) instead of plain text.
     */
    streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
}
278
+
279
+ /**
280
+ * Text completions endpoint.
281
+ *
282
+ * INL - 2025
283
+ */
284
+
285
/** Text completions endpoint wrapper. */
declare class CompletionsEndpoint {
    private http;
    constructor(http: HttpClient);
    /**
     * Text completion (non-streaming).
     *
     * @param prompt - Prompt text to complete.
     * @param options - Request options; "prompt" and "stream" are managed internally.
     */
    create(prompt: string, options?: Omit<CompletionRequest, "prompt" | "stream">): Promise<CompletionResponse>;
    /** Streaming text completion — yields text chunks as they arrive. */
    stream(prompt: string, options?: Omit<CompletionRequest, "prompt" | "stream">): AsyncGenerator<string, void, undefined>;
    /** Submit multiple prompts at once; one CompletionResponse per prompt. */
    batch(prompts: string[], options?: {
        max_tokens?: number;
        temperature?: number;
    }): Promise<{
        results: CompletionResponse[];
    }>;
}
300
+
301
+ /**
302
+ * KV cache management endpoints.
303
+ *
304
+ * INL - 2025
305
+ */
306
+
307
/** KV cache management endpoint wrapper. */
declare class CacheEndpoint {
    private http;
    constructor(http: HttpClient);
    /** Get KV cache statistics (blocks, usage, prefix-cache counters). */
    stats(): Promise<CacheStats>;
    /** Purge the prefix cache (admin operation). */
    purge(): Promise<CachePurgeResult>;
}
315
+
316
+ /**
317
+ * LoRA adapter management endpoints.
318
+ *
319
+ * INL - 2025
320
+ */
321
+
322
/** LoRA adapter management endpoint wrapper. */
declare class LoRAEndpoint {
    private http;
    constructor(http: HttpClient);
    /** Load a LoRA adapter from a server-side path (admin operation). */
    load(params: LoRALoadParams): Promise<LoRALoadResult>;
    /** Unload the adapter with the given id (admin operation). */
    unload(adapter_id: number): Promise<LoRAUnloadResult>;
    /** List currently loaded adapters. */
    list(): Promise<LoRAListResult>;
}
332
+
333
+ /**
334
+ * Monitoring, health, and expert routing endpoints.
335
+ *
336
+ * INL - 2025
337
+ */
338
+
339
/** Monitoring, health, and expert-routing endpoint wrapper. */
declare class MonitorEndpoint {
    private http;
    constructor(http: HttpClient);
    /** Health check with engine stats. */
    health(): Promise<HealthResponse>;
    /** True when the server is reachable and ready (never throws on failure — confirm implementation). */
    isReady(): Promise<boolean>;
    /** List available models. */
    models(): Promise<{
        data: ModelInfo[];
    }>;
    /** Live monitoring snapshot (batch, KV cache, perf, GPU). */
    snapshot(): Promise<MonitorSnapshot>;
    /** Latency percentiles and request stats; payload shape is server-defined. */
    metrics(): Promise<Record<string, unknown>>;
    /** Expert routing distribution (MoE models). */
    experts(): Promise<ExpertStats>;
    /** Cancel a running request by its id. */
    cancel(requestId: string): Promise<{
        status: string;
    }>;
}
361
+
362
+ /**
363
+ * RAG (Retrieval-Augmented Generation) endpoints.
364
+ *
365
+ * INL - 2025
366
+ */
367
+
368
/** RAG (Retrieval-Augmented Generation) endpoint wrapper. */
declare class RAGEndpoint {
    private http;
    constructor(http: HttpClient);
    /** Index raw text or a server-side file. */
    index(params: RAGIndexParams): Promise<RAGIndexResult>;
    /**
     * Search indexed documents.
     *
     * @param query - Search query text.
     * @param k - Maximum number of results to return.
     */
    search(query: string, k?: number): Promise<RAGSearchResult>;
    /** Get RAG index statistics. */
    stats(): Promise<RAGStatsResult>;
}
378
+
379
+ /**
380
+ * vllm-i64 — TypeScript SDK
381
+ *
382
+ * Zero-dependency client for the vllm-i64 inference engine.
383
+ *
384
+ * @example
385
+ * ```ts
386
+ * import { I64Client } from "vllm-i64";
387
+ *
388
+ * const client = new I64Client("http://localhost:8000");
389
+ *
390
+ * // Chat
391
+ * const res = await client.chat.create([{ role: "user", content: "Hello!" }]);
392
+ *
393
+ * // Stream
394
+ * for await (const chunk of client.chat.stream([{ role: "user", content: "Hi" }])) {
395
+ * process.stdout.write(chunk);
396
+ * }
397
+ *
398
+ * // Admin
399
+ * await client.monitor.snapshot();
400
+ * await client.cache.purge();
401
+ * await client.lora.load({ adapter_id: 1, path: "/models/v2" });
402
+ * ```
403
+ *
404
+ * INL - 2025
405
+ */
406
+
407
/**
 * vllm-i64 — TypeScript SDK entry point.
 *
 * Zero-dependency client for the vllm-i64 inference engine. Groups all
 * endpoint wrappers behind a single shared HttpClient.
 *
 * @example
 * ```ts
 * import { I64Client } from "vllm-i64";
 *
 * const client = new I64Client("http://localhost:8000");
 *
 * // Chat
 * const res = await client.chat.create([{ role: "user", content: "Hello!" }]);
 *
 * // Stream
 * for await (const chunk of client.chat.stream([{ role: "user", content: "Hi" }])) {
 *   process.stdout.write(chunk);
 * }
 *
 * // Admin
 * await client.monitor.snapshot();
 * await client.cache.purge();
 * await client.lora.load({ adapter_id: 1, path: "/models/v2" });
 * ```
 */
declare class I64Client {
    private http;
    /** Chat completions (streaming + non-streaming, tool_calls). */
    readonly chat: ChatEndpoint;
    /** Text completions (streaming + batch). */
    readonly completions: CompletionsEndpoint;
    /** KV cache management (stats, purge). */
    readonly cache: CacheEndpoint;
    /** LoRA adapter management (load, unload, list). */
    readonly lora: LoRAEndpoint;
    /** Monitoring, health, metrics, expert routing. */
    readonly monitor: MonitorEndpoint;
    /** RAG — index, search, stats. */
    readonly rag: RAGEndpoint;
    /**
     * Create a vllm-i64 client.
     *
     * @param baseUrl - Server URL (default: http://localhost:8000)
     * @param options - API key and timeout
     */
    constructor(baseUrl?: string, options?: ClientOptions);
    /** Server base URL. */
    get baseUrl(): string;
}
431
+
432
// Public API surface: endpoint classes exported as values, all request/response
// shapes as type-only exports; I64Client doubles as the default export.
export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };