vllm-i64 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +125 -4
- package/dist/index.d.ts +125 -4
- package/dist/index.js +126 -6
- package/dist/index.mjs +124 -5
- package/package.json +5 -2
package/dist/index.d.mts
CHANGED
|
@@ -44,7 +44,10 @@ interface ChatCompletionRequest {
|
|
|
44
44
|
};
|
|
45
45
|
};
|
|
46
46
|
stop?: string | string[];
|
|
47
|
+
min_p?: number;
|
|
48
|
+
typical_p?: number;
|
|
47
49
|
repetition_penalty?: number;
|
|
50
|
+
min_tokens?: number;
|
|
48
51
|
logprobs?: boolean;
|
|
49
52
|
}
|
|
50
53
|
interface ChatCompletionChoice {
|
|
@@ -214,6 +217,55 @@ interface RAGStatsResult {
|
|
|
214
217
|
total_chunks: number;
|
|
215
218
|
dimension: number;
|
|
216
219
|
}
|
|
220
|
+
interface SearchCompletionRequest {
|
|
221
|
+
query: string;
|
|
222
|
+
max_tokens?: number;
|
|
223
|
+
temperature?: number;
|
|
224
|
+
search_count?: number;
|
|
225
|
+
user?: string;
|
|
226
|
+
stream?: boolean;
|
|
227
|
+
}
|
|
228
|
+
interface SearchSource {
|
|
229
|
+
index: number;
|
|
230
|
+
title: string;
|
|
231
|
+
url: string;
|
|
232
|
+
domain: string;
|
|
233
|
+
favicon: string;
|
|
234
|
+
}
|
|
235
|
+
interface SearchCompletionResponse {
|
|
236
|
+
id: string;
|
|
237
|
+
object: "search.completion";
|
|
238
|
+
model: string;
|
|
239
|
+
query: string;
|
|
240
|
+
choices: {
|
|
241
|
+
index: number;
|
|
242
|
+
message: {
|
|
243
|
+
role: "assistant";
|
|
244
|
+
content: string;
|
|
245
|
+
};
|
|
246
|
+
finish_reason: "stop" | "length";
|
|
247
|
+
}[];
|
|
248
|
+
sources: SearchSource[];
|
|
249
|
+
usage?: UsageInfo;
|
|
250
|
+
}
|
|
251
|
+
interface SearchHistoryEntry {
|
|
252
|
+
query: string;
|
|
253
|
+
sources: SearchSource[];
|
|
254
|
+
answer: string;
|
|
255
|
+
timestamp: number;
|
|
256
|
+
}
|
|
257
|
+
interface SearchHistoryResponse {
|
|
258
|
+
history: SearchHistoryEntry[];
|
|
259
|
+
count: number;
|
|
260
|
+
}
|
|
261
|
+
interface SearchStatsResponse {
|
|
262
|
+
enabled: boolean;
|
|
263
|
+
num_partitions: number;
|
|
264
|
+
total_keys: number;
|
|
265
|
+
total_entries: number;
|
|
266
|
+
max_per_key: number;
|
|
267
|
+
persist_dir: string | null;
|
|
268
|
+
}
|
|
217
269
|
|
|
218
270
|
/**
|
|
219
271
|
* vllm-i64 SDK — HTTP Client core
|
|
@@ -233,7 +285,7 @@ declare class HttpClient {
|
|
|
233
285
|
private apiKey?;
|
|
234
286
|
private timeout;
|
|
235
287
|
constructor(baseUrl?: string, options?: ClientOptions);
|
|
236
|
-
fetch(path: string, init?: RequestInit): Promise<Response>;
|
|
288
|
+
fetch(path: string, init?: RequestInit, externalSignal?: AbortSignal): Promise<Response>;
|
|
237
289
|
get<T>(path: string): Promise<T>;
|
|
238
290
|
post<T>(path: string, body: unknown): Promise<T>;
|
|
239
291
|
readSSE(res: Response): AsyncGenerator<string, void, undefined>;
|
|
@@ -269,11 +321,15 @@ declare class ChatEndpoint {
|
|
|
269
321
|
* }
|
|
270
322
|
* ```
|
|
271
323
|
*/
|
|
272
|
-
stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
|
|
324
|
+
stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
|
|
325
|
+
signal?: AbortSignal;
|
|
326
|
+
}): AsyncGenerator<string, void, undefined>;
|
|
273
327
|
/**
|
|
274
328
|
* Streaming chat — yields raw SSE delta objects.
|
|
275
329
|
*/
|
|
276
|
-
streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
|
|
330
|
+
streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
|
|
331
|
+
signal?: AbortSignal;
|
|
332
|
+
}): AsyncGenerator<StreamDelta, void, undefined>;
|
|
277
333
|
}
|
|
278
334
|
|
|
279
335
|
/**
|
|
@@ -376,6 +432,65 @@ declare class RAGEndpoint {
|
|
|
376
432
|
stats(): Promise<RAGStatsResult>;
|
|
377
433
|
}
|
|
378
434
|
|
|
435
|
+
/**
|
|
436
|
+
* Search endpoints — Perplexity-style search-augmented generation.
|
|
437
|
+
*
|
|
438
|
+
* Token-routed isolation: partition = sha256(api_key ∥ user_id) mod N
|
|
439
|
+
* No data leak possible. No shared cache. No session tokens.
|
|
440
|
+
*
|
|
441
|
+
* INL - 2025
|
|
442
|
+
*/
|
|
443
|
+
|
|
444
|
+
declare class SearchEndpoint {
|
|
445
|
+
private http;
|
|
446
|
+
constructor(http: HttpClient);
|
|
447
|
+
/**
|
|
448
|
+
* Search-augmented generation: query → web search → cited answer.
|
|
449
|
+
*
|
|
450
|
+
* @example
|
|
451
|
+
* ```ts
|
|
452
|
+
* const res = await client.search.create({ query: "What is MoE?" });
|
|
453
|
+
* console.log(res.choices[0].message.content);
|
|
454
|
+
* for (const src of res.sources) {
|
|
455
|
+
* console.log(`[${src.index}] ${src.title} — ${src.url}`);
|
|
456
|
+
* }
|
|
457
|
+
* ```
|
|
458
|
+
*/
|
|
459
|
+
create(params: SearchCompletionRequest): Promise<SearchCompletionResponse>;
|
|
460
|
+
/**
|
|
461
|
+
* Stream search completion — yields text chunks.
|
|
462
|
+
* Sources are sent as the final SSE event.
|
|
463
|
+
*
|
|
464
|
+
* @example
|
|
465
|
+
* ```ts
|
|
466
|
+
* const { stream, sources } = await client.search.stream({ query: "token routing" });
|
|
467
|
+
* for await (const chunk of stream) {
|
|
468
|
+
* process.stdout.write(chunk);
|
|
469
|
+
* }
|
|
470
|
+
* console.log("\nSources:", await sources);
|
|
471
|
+
* ```
|
|
472
|
+
*/
|
|
473
|
+
stream(params: SearchCompletionRequest, signal?: AbortSignal): Promise<{
|
|
474
|
+
stream: AsyncGenerator<string>;
|
|
475
|
+
sources: Promise<SearchSource[]>;
|
|
476
|
+
}>;
|
|
477
|
+
/**
|
|
478
|
+
* Get search history for the authenticated user.
|
|
479
|
+
* History is partitioned by api_key + user — no cross-user access.
|
|
480
|
+
*/
|
|
481
|
+
history(user?: string, limit?: number): Promise<SearchHistoryResponse>;
|
|
482
|
+
/**
|
|
483
|
+
* Clear search history for the authenticated user.
|
|
484
|
+
* Only clears the caller's own partition.
|
|
485
|
+
*/
|
|
486
|
+
clearHistory(user?: string): Promise<{
|
|
487
|
+
status: string;
|
|
488
|
+
removed: number;
|
|
489
|
+
}>;
|
|
490
|
+
/** Search history statistics (admin). */
|
|
491
|
+
stats(): Promise<SearchStatsResponse>;
|
|
492
|
+
}
|
|
493
|
+
|
|
379
494
|
/**
|
|
380
495
|
* vllm-i64 — TypeScript SDK
|
|
381
496
|
*
|
|
@@ -395,6 +510,10 @@ declare class RAGEndpoint {
|
|
|
395
510
|
* process.stdout.write(chunk);
|
|
396
511
|
* }
|
|
397
512
|
*
|
|
513
|
+
* // Search (Perplexity-style, token-routed isolation)
|
|
514
|
+
* const search = await client.search.create({ query: "What is MoE?" });
|
|
515
|
+
* console.log(search.sources);
|
|
516
|
+
*
|
|
398
517
|
* // Admin
|
|
399
518
|
* await client.monitor.snapshot();
|
|
400
519
|
* await client.cache.purge();
|
|
@@ -418,6 +537,8 @@ declare class I64Client {
|
|
|
418
537
|
readonly monitor: MonitorEndpoint;
|
|
419
538
|
/** RAG — index, search, stats. */
|
|
420
539
|
readonly rag: RAGEndpoint;
|
|
540
|
+
/** Web search — Perplexity-style with token-routed isolation. */
|
|
541
|
+
readonly search: SearchEndpoint;
|
|
421
542
|
/**
|
|
422
543
|
* Create a vllm-i64 client.
|
|
423
544
|
*
|
|
@@ -429,4 +550,4 @@ declare class I64Client {
|
|
|
429
550
|
get baseUrl(): string;
|
|
430
551
|
}
|
|
431
552
|
|
|
432
|
-
export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };
|
|
553
|
+
export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type SearchCompletionRequest, type SearchCompletionResponse, SearchEndpoint, type SearchHistoryEntry, type SearchHistoryResponse, type SearchSource, type SearchStatsResponse, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };
|
package/dist/index.d.ts
CHANGED
|
@@ -44,7 +44,10 @@ interface ChatCompletionRequest {
|
|
|
44
44
|
};
|
|
45
45
|
};
|
|
46
46
|
stop?: string | string[];
|
|
47
|
+
min_p?: number;
|
|
48
|
+
typical_p?: number;
|
|
47
49
|
repetition_penalty?: number;
|
|
50
|
+
min_tokens?: number;
|
|
48
51
|
logprobs?: boolean;
|
|
49
52
|
}
|
|
50
53
|
interface ChatCompletionChoice {
|
|
@@ -214,6 +217,55 @@ interface RAGStatsResult {
|
|
|
214
217
|
total_chunks: number;
|
|
215
218
|
dimension: number;
|
|
216
219
|
}
|
|
220
|
+
interface SearchCompletionRequest {
|
|
221
|
+
query: string;
|
|
222
|
+
max_tokens?: number;
|
|
223
|
+
temperature?: number;
|
|
224
|
+
search_count?: number;
|
|
225
|
+
user?: string;
|
|
226
|
+
stream?: boolean;
|
|
227
|
+
}
|
|
228
|
+
interface SearchSource {
|
|
229
|
+
index: number;
|
|
230
|
+
title: string;
|
|
231
|
+
url: string;
|
|
232
|
+
domain: string;
|
|
233
|
+
favicon: string;
|
|
234
|
+
}
|
|
235
|
+
interface SearchCompletionResponse {
|
|
236
|
+
id: string;
|
|
237
|
+
object: "search.completion";
|
|
238
|
+
model: string;
|
|
239
|
+
query: string;
|
|
240
|
+
choices: {
|
|
241
|
+
index: number;
|
|
242
|
+
message: {
|
|
243
|
+
role: "assistant";
|
|
244
|
+
content: string;
|
|
245
|
+
};
|
|
246
|
+
finish_reason: "stop" | "length";
|
|
247
|
+
}[];
|
|
248
|
+
sources: SearchSource[];
|
|
249
|
+
usage?: UsageInfo;
|
|
250
|
+
}
|
|
251
|
+
interface SearchHistoryEntry {
|
|
252
|
+
query: string;
|
|
253
|
+
sources: SearchSource[];
|
|
254
|
+
answer: string;
|
|
255
|
+
timestamp: number;
|
|
256
|
+
}
|
|
257
|
+
interface SearchHistoryResponse {
|
|
258
|
+
history: SearchHistoryEntry[];
|
|
259
|
+
count: number;
|
|
260
|
+
}
|
|
261
|
+
interface SearchStatsResponse {
|
|
262
|
+
enabled: boolean;
|
|
263
|
+
num_partitions: number;
|
|
264
|
+
total_keys: number;
|
|
265
|
+
total_entries: number;
|
|
266
|
+
max_per_key: number;
|
|
267
|
+
persist_dir: string | null;
|
|
268
|
+
}
|
|
217
269
|
|
|
218
270
|
/**
|
|
219
271
|
* vllm-i64 SDK — HTTP Client core
|
|
@@ -233,7 +285,7 @@ declare class HttpClient {
|
|
|
233
285
|
private apiKey?;
|
|
234
286
|
private timeout;
|
|
235
287
|
constructor(baseUrl?: string, options?: ClientOptions);
|
|
236
|
-
fetch(path: string, init?: RequestInit): Promise<Response>;
|
|
288
|
+
fetch(path: string, init?: RequestInit, externalSignal?: AbortSignal): Promise<Response>;
|
|
237
289
|
get<T>(path: string): Promise<T>;
|
|
238
290
|
post<T>(path: string, body: unknown): Promise<T>;
|
|
239
291
|
readSSE(res: Response): AsyncGenerator<string, void, undefined>;
|
|
@@ -269,11 +321,15 @@ declare class ChatEndpoint {
|
|
|
269
321
|
* }
|
|
270
322
|
* ```
|
|
271
323
|
*/
|
|
272
|
-
stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
|
|
324
|
+
stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
|
|
325
|
+
signal?: AbortSignal;
|
|
326
|
+
}): AsyncGenerator<string, void, undefined>;
|
|
273
327
|
/**
|
|
274
328
|
* Streaming chat — yields raw SSE delta objects.
|
|
275
329
|
*/
|
|
276
|
-
streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
|
|
330
|
+
streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
|
|
331
|
+
signal?: AbortSignal;
|
|
332
|
+
}): AsyncGenerator<StreamDelta, void, undefined>;
|
|
277
333
|
}
|
|
278
334
|
|
|
279
335
|
/**
|
|
@@ -376,6 +432,65 @@ declare class RAGEndpoint {
|
|
|
376
432
|
stats(): Promise<RAGStatsResult>;
|
|
377
433
|
}
|
|
378
434
|
|
|
435
|
+
/**
|
|
436
|
+
* Search endpoints — Perplexity-style search-augmented generation.
|
|
437
|
+
*
|
|
438
|
+
* Token-routed isolation: partition = sha256(api_key ∥ user_id) mod N
|
|
439
|
+
* No data leak possible. No shared cache. No session tokens.
|
|
440
|
+
*
|
|
441
|
+
* INL - 2025
|
|
442
|
+
*/
|
|
443
|
+
|
|
444
|
+
declare class SearchEndpoint {
|
|
445
|
+
private http;
|
|
446
|
+
constructor(http: HttpClient);
|
|
447
|
+
/**
|
|
448
|
+
* Search-augmented generation: query → web search → cited answer.
|
|
449
|
+
*
|
|
450
|
+
* @example
|
|
451
|
+
* ```ts
|
|
452
|
+
* const res = await client.search.create({ query: "What is MoE?" });
|
|
453
|
+
* console.log(res.choices[0].message.content);
|
|
454
|
+
* for (const src of res.sources) {
|
|
455
|
+
* console.log(`[${src.index}] ${src.title} — ${src.url}`);
|
|
456
|
+
* }
|
|
457
|
+
* ```
|
|
458
|
+
*/
|
|
459
|
+
create(params: SearchCompletionRequest): Promise<SearchCompletionResponse>;
|
|
460
|
+
/**
|
|
461
|
+
* Stream search completion — yields text chunks.
|
|
462
|
+
* Sources are sent as the final SSE event.
|
|
463
|
+
*
|
|
464
|
+
* @example
|
|
465
|
+
* ```ts
|
|
466
|
+
* const { stream, sources } = await client.search.stream({ query: "token routing" });
|
|
467
|
+
* for await (const chunk of stream) {
|
|
468
|
+
* process.stdout.write(chunk);
|
|
469
|
+
* }
|
|
470
|
+
* console.log("\nSources:", await sources);
|
|
471
|
+
* ```
|
|
472
|
+
*/
|
|
473
|
+
stream(params: SearchCompletionRequest, signal?: AbortSignal): Promise<{
|
|
474
|
+
stream: AsyncGenerator<string>;
|
|
475
|
+
sources: Promise<SearchSource[]>;
|
|
476
|
+
}>;
|
|
477
|
+
/**
|
|
478
|
+
* Get search history for the authenticated user.
|
|
479
|
+
* History is partitioned by api_key + user — no cross-user access.
|
|
480
|
+
*/
|
|
481
|
+
history(user?: string, limit?: number): Promise<SearchHistoryResponse>;
|
|
482
|
+
/**
|
|
483
|
+
* Clear search history for the authenticated user.
|
|
484
|
+
* Only clears the caller's own partition.
|
|
485
|
+
*/
|
|
486
|
+
clearHistory(user?: string): Promise<{
|
|
487
|
+
status: string;
|
|
488
|
+
removed: number;
|
|
489
|
+
}>;
|
|
490
|
+
/** Search history statistics (admin). */
|
|
491
|
+
stats(): Promise<SearchStatsResponse>;
|
|
492
|
+
}
|
|
493
|
+
|
|
379
494
|
/**
|
|
380
495
|
* vllm-i64 — TypeScript SDK
|
|
381
496
|
*
|
|
@@ -395,6 +510,10 @@ declare class RAGEndpoint {
|
|
|
395
510
|
* process.stdout.write(chunk);
|
|
396
511
|
* }
|
|
397
512
|
*
|
|
513
|
+
* // Search (Perplexity-style, token-routed isolation)
|
|
514
|
+
* const search = await client.search.create({ query: "What is MoE?" });
|
|
515
|
+
* console.log(search.sources);
|
|
516
|
+
*
|
|
398
517
|
* // Admin
|
|
399
518
|
* await client.monitor.snapshot();
|
|
400
519
|
* await client.cache.purge();
|
|
@@ -418,6 +537,8 @@ declare class I64Client {
|
|
|
418
537
|
readonly monitor: MonitorEndpoint;
|
|
419
538
|
/** RAG — index, search, stats. */
|
|
420
539
|
readonly rag: RAGEndpoint;
|
|
540
|
+
/** Web search — Perplexity-style with token-routed isolation. */
|
|
541
|
+
readonly search: SearchEndpoint;
|
|
421
542
|
/**
|
|
422
543
|
* Create a vllm-i64 client.
|
|
423
544
|
*
|
|
@@ -429,4 +550,4 @@ declare class I64Client {
|
|
|
429
550
|
get baseUrl(): string;
|
|
430
551
|
}
|
|
431
552
|
|
|
432
|
-
export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };
|
|
553
|
+
export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type SearchCompletionRequest, type SearchCompletionResponse, SearchEndpoint, type SearchHistoryEntry, type SearchHistoryResponse, type SearchSource, type SearchStatsResponse, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };
|
package/dist/index.js
CHANGED
|
@@ -28,6 +28,7 @@ __export(index_exports, {
|
|
|
28
28
|
LoRAEndpoint: () => LoRAEndpoint,
|
|
29
29
|
MonitorEndpoint: () => MonitorEndpoint,
|
|
30
30
|
RAGEndpoint: () => RAGEndpoint,
|
|
31
|
+
SearchEndpoint: () => SearchEndpoint,
|
|
31
32
|
default: () => index_default
|
|
32
33
|
});
|
|
33
34
|
module.exports = __toCommonJS(index_exports);
|
|
@@ -42,7 +43,7 @@ var HttpClient = class {
|
|
|
42
43
|
this.apiKey = options.apiKey;
|
|
43
44
|
this.timeout = options.timeoutMs ?? 12e4;
|
|
44
45
|
}
|
|
45
|
-
async fetch(path, init = {}) {
|
|
46
|
+
async fetch(path, init = {}, externalSignal) {
|
|
46
47
|
const headers = {
|
|
47
48
|
"Content-Type": "application/json",
|
|
48
49
|
...init.headers
|
|
@@ -52,6 +53,13 @@ var HttpClient = class {
|
|
|
52
53
|
}
|
|
53
54
|
const controller = new AbortController();
|
|
54
55
|
const timer = setTimeout(() => controller.abort(), this.timeout);
|
|
56
|
+
if (externalSignal) {
|
|
57
|
+
if (externalSignal.aborted) {
|
|
58
|
+
controller.abort();
|
|
59
|
+
} else {
|
|
60
|
+
externalSignal.addEventListener("abort", () => controller.abort(), { once: true });
|
|
61
|
+
}
|
|
62
|
+
}
|
|
55
63
|
try {
|
|
56
64
|
const res = await fetch(`${this.baseUrl}${path}`, {
|
|
57
65
|
...init,
|
|
@@ -178,30 +186,32 @@ var ChatEndpoint = class {
|
|
|
178
186
|
* ```
|
|
179
187
|
*/
|
|
180
188
|
async *stream(messages, options = {}) {
|
|
189
|
+
const { signal, ...rest } = options;
|
|
181
190
|
const res = await this.http.fetch("/v1/chat/completions", {
|
|
182
191
|
method: "POST",
|
|
183
192
|
body: JSON.stringify({
|
|
184
193
|
model: "default",
|
|
185
194
|
messages,
|
|
186
|
-
...options,
|
|
195
|
+
...rest,
|
|
187
196
|
stream: true
|
|
188
197
|
})
|
|
189
|
-
});
|
|
198
|
+
}, signal);
|
|
190
199
|
yield* this.http.readSSE(res);
|
|
191
200
|
}
|
|
192
201
|
/**
|
|
193
202
|
* Streaming chat — yields raw SSE delta objects.
|
|
194
203
|
*/
|
|
195
204
|
async *streamRaw(messages, options = {}) {
|
|
205
|
+
const { signal, ...rest } = options;
|
|
196
206
|
const res = await this.http.fetch("/v1/chat/completions", {
|
|
197
207
|
method: "POST",
|
|
198
208
|
body: JSON.stringify({
|
|
199
209
|
model: "default",
|
|
200
210
|
messages,
|
|
201
|
-
...options,
|
|
211
|
+
...rest,
|
|
202
212
|
stream: true
|
|
203
213
|
})
|
|
204
|
-
});
|
|
214
|
+
}, signal);
|
|
205
215
|
yield* this.http.readSSERaw(res);
|
|
206
216
|
}
|
|
207
217
|
};
|
|
@@ -332,6 +342,112 @@ var RAGEndpoint = class {
|
|
|
332
342
|
}
|
|
333
343
|
};
|
|
334
344
|
|
|
345
|
+
// src/endpoints/search.ts
|
|
346
|
+
var SearchEndpoint = class {
|
|
347
|
+
constructor(http) {
|
|
348
|
+
this.http = http;
|
|
349
|
+
}
|
|
350
|
+
/**
|
|
351
|
+
* Search-augmented generation: query → web search → cited answer.
|
|
352
|
+
*
|
|
353
|
+
* @example
|
|
354
|
+
* ```ts
|
|
355
|
+
* const res = await client.search.create({ query: "What is MoE?" });
|
|
356
|
+
* console.log(res.choices[0].message.content);
|
|
357
|
+
* for (const src of res.sources) {
|
|
358
|
+
* console.log(`[${src.index}] ${src.title} — ${src.url}`);
|
|
359
|
+
* }
|
|
360
|
+
* ```
|
|
361
|
+
*/
|
|
362
|
+
async create(params) {
|
|
363
|
+
return this.http.post("/v1/search/completions", { ...params, stream: false });
|
|
364
|
+
}
|
|
365
|
+
/**
|
|
366
|
+
* Stream search completion — yields text chunks.
|
|
367
|
+
* Sources are sent as the final SSE event.
|
|
368
|
+
*
|
|
369
|
+
* @example
|
|
370
|
+
* ```ts
|
|
371
|
+
* const { stream, sources } = await client.search.stream({ query: "token routing" });
|
|
372
|
+
* for await (const chunk of stream) {
|
|
373
|
+
* process.stdout.write(chunk);
|
|
374
|
+
* }
|
|
375
|
+
* console.log("\nSources:", await sources);
|
|
376
|
+
* ```
|
|
377
|
+
*/
|
|
378
|
+
async stream(params, signal) {
|
|
379
|
+
const res = await this.http.fetch(
|
|
380
|
+
"/v1/search/completions",
|
|
381
|
+
{ method: "POST", body: JSON.stringify({ ...params, stream: true }) },
|
|
382
|
+
signal
|
|
383
|
+
);
|
|
384
|
+
let resolveSourcesFn;
|
|
385
|
+
const sourcesPromise = new Promise((resolve) => {
|
|
386
|
+
resolveSourcesFn = resolve;
|
|
387
|
+
});
|
|
388
|
+
const self = this;
|
|
389
|
+
async function* readStream() {
|
|
390
|
+
if (!res.body) return;
|
|
391
|
+
const reader = res.body.getReader();
|
|
392
|
+
const decoder = new TextDecoder();
|
|
393
|
+
let buffer = "";
|
|
394
|
+
let foundSources = false;
|
|
395
|
+
try {
|
|
396
|
+
while (true) {
|
|
397
|
+
const { done, value } = await reader.read();
|
|
398
|
+
if (done) break;
|
|
399
|
+
buffer += decoder.decode(value, { stream: true });
|
|
400
|
+
const lines = buffer.split("\n");
|
|
401
|
+
buffer = lines.pop() ?? "";
|
|
402
|
+
for (const line of lines) {
|
|
403
|
+
const trimmed = line.trim();
|
|
404
|
+
if (!trimmed.startsWith("data: ")) continue;
|
|
405
|
+
const payload = trimmed.slice(6);
|
|
406
|
+
if (payload === "[DONE]") return;
|
|
407
|
+
try {
|
|
408
|
+
const data = JSON.parse(payload);
|
|
409
|
+
if (data.sources) {
|
|
410
|
+
foundSources = true;
|
|
411
|
+
resolveSourcesFn(data.sources);
|
|
412
|
+
continue;
|
|
413
|
+
}
|
|
414
|
+
const content = data.choices?.[0]?.delta?.content ?? "";
|
|
415
|
+
if (content) yield content;
|
|
416
|
+
} catch {
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
} finally {
|
|
421
|
+
reader.releaseLock();
|
|
422
|
+
if (!foundSources) resolveSourcesFn([]);
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
return { stream: readStream(), sources: sourcesPromise };
|
|
426
|
+
}
|
|
427
|
+
/**
|
|
428
|
+
* Get search history for the authenticated user.
|
|
429
|
+
* History is partitioned by api_key + user — no cross-user access.
|
|
430
|
+
*/
|
|
431
|
+
async history(user, limit = 50) {
|
|
432
|
+
const params = new URLSearchParams({ limit: String(limit) });
|
|
433
|
+
if (user) params.set("user", user);
|
|
434
|
+
return this.http.get(`/v1/search/history?${params}`);
|
|
435
|
+
}
|
|
436
|
+
/**
|
|
437
|
+
* Clear search history for the authenticated user.
|
|
438
|
+
* Only clears the caller's own partition.
|
|
439
|
+
*/
|
|
440
|
+
async clearHistory(user) {
|
|
441
|
+
const params = user ? `?user=${encodeURIComponent(user)}` : "";
|
|
442
|
+
const res = await this.http.fetch(`/v1/search/history${params}`, { method: "DELETE" });
|
|
443
|
+
return res.json();
|
|
444
|
+
}
|
|
445
|
+
/** Search history statistics (admin). */
|
|
446
|
+
async stats() {
|
|
447
|
+
return this.http.get("/v1/search/stats");
|
|
448
|
+
}
|
|
449
|
+
};
|
|
450
|
+
|
|
335
451
|
// src/index.ts
|
|
336
452
|
var I64Client = class {
|
|
337
453
|
http;
|
|
@@ -347,6 +463,8 @@ var I64Client = class {
|
|
|
347
463
|
monitor;
|
|
348
464
|
/** RAG — index, search, stats. */
|
|
349
465
|
rag;
|
|
466
|
+
/** Web search — Perplexity-style with token-routed isolation. */
|
|
467
|
+
search;
|
|
350
468
|
/**
|
|
351
469
|
* Create a vllm-i64 client.
|
|
352
470
|
*
|
|
@@ -361,6 +479,7 @@ var I64Client = class {
|
|
|
361
479
|
this.lora = new LoRAEndpoint(this.http);
|
|
362
480
|
this.monitor = new MonitorEndpoint(this.http);
|
|
363
481
|
this.rag = new RAGEndpoint(this.http);
|
|
482
|
+
this.search = new SearchEndpoint(this.http);
|
|
364
483
|
}
|
|
365
484
|
/** Server base URL. */
|
|
366
485
|
get baseUrl() {
|
|
@@ -377,5 +496,6 @@ var index_default = I64Client;
|
|
|
377
496
|
I64Client,
|
|
378
497
|
LoRAEndpoint,
|
|
379
498
|
MonitorEndpoint,
|
|
380
|
-
RAGEndpoint
|
|
499
|
+
RAGEndpoint,
|
|
500
|
+
SearchEndpoint
|
|
381
501
|
});
|
package/dist/index.mjs
CHANGED
|
@@ -8,7 +8,7 @@ var HttpClient = class {
|
|
|
8
8
|
this.apiKey = options.apiKey;
|
|
9
9
|
this.timeout = options.timeoutMs ?? 12e4;
|
|
10
10
|
}
|
|
11
|
-
async fetch(path, init = {}) {
|
|
11
|
+
async fetch(path, init = {}, externalSignal) {
|
|
12
12
|
const headers = {
|
|
13
13
|
"Content-Type": "application/json",
|
|
14
14
|
...init.headers
|
|
@@ -18,6 +18,13 @@ var HttpClient = class {
|
|
|
18
18
|
}
|
|
19
19
|
const controller = new AbortController();
|
|
20
20
|
const timer = setTimeout(() => controller.abort(), this.timeout);
|
|
21
|
+
if (externalSignal) {
|
|
22
|
+
if (externalSignal.aborted) {
|
|
23
|
+
controller.abort();
|
|
24
|
+
} else {
|
|
25
|
+
externalSignal.addEventListener("abort", () => controller.abort(), { once: true });
|
|
26
|
+
}
|
|
27
|
+
}
|
|
21
28
|
try {
|
|
22
29
|
const res = await fetch(`${this.baseUrl}${path}`, {
|
|
23
30
|
...init,
|
|
@@ -144,30 +151,32 @@ var ChatEndpoint = class {
|
|
|
144
151
|
* ```
|
|
145
152
|
*/
|
|
146
153
|
async *stream(messages, options = {}) {
|
|
154
|
+
const { signal, ...rest } = options;
|
|
147
155
|
const res = await this.http.fetch("/v1/chat/completions", {
|
|
148
156
|
method: "POST",
|
|
149
157
|
body: JSON.stringify({
|
|
150
158
|
model: "default",
|
|
151
159
|
messages,
|
|
152
|
-
...options,
|
|
160
|
+
...rest,
|
|
153
161
|
stream: true
|
|
154
162
|
})
|
|
155
|
-
});
|
|
163
|
+
}, signal);
|
|
156
164
|
yield* this.http.readSSE(res);
|
|
157
165
|
}
|
|
158
166
|
/**
|
|
159
167
|
* Streaming chat — yields raw SSE delta objects.
|
|
160
168
|
*/
|
|
161
169
|
async *streamRaw(messages, options = {}) {
|
|
170
|
+
const { signal, ...rest } = options;
|
|
162
171
|
const res = await this.http.fetch("/v1/chat/completions", {
|
|
163
172
|
method: "POST",
|
|
164
173
|
body: JSON.stringify({
|
|
165
174
|
model: "default",
|
|
166
175
|
messages,
|
|
167
|
-
...options,
|
|
176
|
+
...rest,
|
|
168
177
|
stream: true
|
|
169
178
|
})
|
|
170
|
-
});
|
|
179
|
+
}, signal);
|
|
171
180
|
yield* this.http.readSSERaw(res);
|
|
172
181
|
}
|
|
173
182
|
};
|
|
@@ -298,6 +307,112 @@ var RAGEndpoint = class {
|
|
|
298
307
|
}
|
|
299
308
|
};
|
|
300
309
|
|
|
310
|
+
// src/endpoints/search.ts
|
|
311
|
+
var SearchEndpoint = class {
|
|
312
|
+
constructor(http) {
|
|
313
|
+
this.http = http;
|
|
314
|
+
}
|
|
315
|
+
/**
|
|
316
|
+
* Search-augmented generation: query → web search → cited answer.
|
|
317
|
+
*
|
|
318
|
+
* @example
|
|
319
|
+
* ```ts
|
|
320
|
+
* const res = await client.search.create({ query: "What is MoE?" });
|
|
321
|
+
* console.log(res.choices[0].message.content);
|
|
322
|
+
* for (const src of res.sources) {
|
|
323
|
+
* console.log(`[${src.index}] ${src.title} — ${src.url}`);
|
|
324
|
+
* }
|
|
325
|
+
* ```
|
|
326
|
+
*/
|
|
327
|
+
async create(params) {
|
|
328
|
+
return this.http.post("/v1/search/completions", { ...params, stream: false });
|
|
329
|
+
}
|
|
330
|
+
/**
|
|
331
|
+
* Stream search completion — yields text chunks.
|
|
332
|
+
* Sources are sent as the final SSE event.
|
|
333
|
+
*
|
|
334
|
+
* @example
|
|
335
|
+
* ```ts
|
|
336
|
+
* const { stream, sources } = await client.search.stream({ query: "token routing" });
|
|
337
|
+
* for await (const chunk of stream) {
|
|
338
|
+
* process.stdout.write(chunk);
|
|
339
|
+
* }
|
|
340
|
+
* console.log("\nSources:", await sources);
|
|
341
|
+
* ```
|
|
342
|
+
*/
|
|
343
|
+
async stream(params, signal) {
|
|
344
|
+
const res = await this.http.fetch(
|
|
345
|
+
"/v1/search/completions",
|
|
346
|
+
{ method: "POST", body: JSON.stringify({ ...params, stream: true }) },
|
|
347
|
+
signal
|
|
348
|
+
);
|
|
349
|
+
let resolveSourcesFn;
|
|
350
|
+
const sourcesPromise = new Promise((resolve) => {
|
|
351
|
+
resolveSourcesFn = resolve;
|
|
352
|
+
});
|
|
353
|
+
const self = this;
|
|
354
|
+
async function* readStream() {
|
|
355
|
+
if (!res.body) return;
|
|
356
|
+
const reader = res.body.getReader();
|
|
357
|
+
const decoder = new TextDecoder();
|
|
358
|
+
let buffer = "";
|
|
359
|
+
let foundSources = false;
|
|
360
|
+
try {
|
|
361
|
+
while (true) {
|
|
362
|
+
const { done, value } = await reader.read();
|
|
363
|
+
if (done) break;
|
|
364
|
+
buffer += decoder.decode(value, { stream: true });
|
|
365
|
+
const lines = buffer.split("\n");
|
|
366
|
+
buffer = lines.pop() ?? "";
|
|
367
|
+
for (const line of lines) {
|
|
368
|
+
const trimmed = line.trim();
|
|
369
|
+
if (!trimmed.startsWith("data: ")) continue;
|
|
370
|
+
const payload = trimmed.slice(6);
|
|
371
|
+
if (payload === "[DONE]") return;
|
|
372
|
+
try {
|
|
373
|
+
const data = JSON.parse(payload);
|
|
374
|
+
if (data.sources) {
|
|
375
|
+
foundSources = true;
|
|
376
|
+
resolveSourcesFn(data.sources);
|
|
377
|
+
continue;
|
|
378
|
+
}
|
|
379
|
+
const content = data.choices?.[0]?.delta?.content ?? "";
|
|
380
|
+
if (content) yield content;
|
|
381
|
+
} catch {
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
} finally {
|
|
386
|
+
reader.releaseLock();
|
|
387
|
+
if (!foundSources) resolveSourcesFn([]);
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
return { stream: readStream(), sources: sourcesPromise };
|
|
391
|
+
}
|
|
392
|
+
/**
|
|
393
|
+
* Get search history for the authenticated user.
|
|
394
|
+
* History is partitioned by api_key + user — no cross-user access.
|
|
395
|
+
*/
|
|
396
|
+
async history(user, limit = 50) {
|
|
397
|
+
const params = new URLSearchParams({ limit: String(limit) });
|
|
398
|
+
if (user) params.set("user", user);
|
|
399
|
+
return this.http.get(`/v1/search/history?${params}`);
|
|
400
|
+
}
|
|
401
|
+
/**
|
|
402
|
+
* Clear search history for the authenticated user.
|
|
403
|
+
* Only clears the caller's own partition.
|
|
404
|
+
*/
|
|
405
|
+
async clearHistory(user) {
|
|
406
|
+
const params = user ? `?user=${encodeURIComponent(user)}` : "";
|
|
407
|
+
const res = await this.http.fetch(`/v1/search/history${params}`, { method: "DELETE" });
|
|
408
|
+
return res.json();
|
|
409
|
+
}
|
|
410
|
+
/** Search history statistics (admin). */
|
|
411
|
+
async stats() {
|
|
412
|
+
return this.http.get("/v1/search/stats");
|
|
413
|
+
}
|
|
414
|
+
};
|
|
415
|
+
|
|
301
416
|
// src/index.ts
|
|
302
417
|
var I64Client = class {
|
|
303
418
|
http;
|
|
@@ -313,6 +428,8 @@ var I64Client = class {
|
|
|
313
428
|
monitor;
|
|
314
429
|
/** RAG — index, search, stats. */
|
|
315
430
|
rag;
|
|
431
|
+
/** Web search — Perplexity-style with token-routed isolation. */
|
|
432
|
+
search;
|
|
316
433
|
/**
|
|
317
434
|
* Create a vllm-i64 client.
|
|
318
435
|
*
|
|
@@ -327,6 +444,7 @@ var I64Client = class {
|
|
|
327
444
|
this.lora = new LoRAEndpoint(this.http);
|
|
328
445
|
this.monitor = new MonitorEndpoint(this.http);
|
|
329
446
|
this.rag = new RAGEndpoint(this.http);
|
|
447
|
+
this.search = new SearchEndpoint(this.http);
|
|
330
448
|
}
|
|
331
449
|
/** Server base URL. */
|
|
332
450
|
get baseUrl() {
|
|
@@ -343,5 +461,6 @@ export {
|
|
|
343
461
|
LoRAEndpoint,
|
|
344
462
|
MonitorEndpoint,
|
|
345
463
|
RAGEndpoint,
|
|
464
|
+
SearchEndpoint,
|
|
346
465
|
index_default as default
|
|
347
466
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "vllm-i64",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "TypeScript SDK for vllm-i64 — integer-first inference engine",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.mjs",
|
|
@@ -28,7 +28,10 @@
|
|
|
28
28
|
"moe",
|
|
29
29
|
"openai",
|
|
30
30
|
"ai",
|
|
31
|
-
"complexity"
|
|
31
|
+
"complexity",
|
|
32
|
+
"search",
|
|
33
|
+
"perplexity",
|
|
34
|
+
"security"
|
|
32
35
|
],
|
|
33
36
|
"author": "Complexity-ML / INL",
|
|
34
37
|
"license": "Apache-2.0",
|