vllm-i64 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -44,7 +44,10 @@ interface ChatCompletionRequest {
44
44
  };
45
45
  };
46
46
  stop?: string | string[];
47
+ min_p?: number;
48
+ typical_p?: number;
47
49
  repetition_penalty?: number;
50
+ min_tokens?: number;
48
51
  logprobs?: boolean;
49
52
  }
50
53
  interface ChatCompletionChoice {
@@ -214,6 +217,55 @@ interface RAGStatsResult {
214
217
  total_chunks: number;
215
218
  dimension: number;
216
219
  }
220
+ interface SearchCompletionRequest {
221
+ query: string;
222
+ max_tokens?: number;
223
+ temperature?: number;
224
+ search_count?: number;
225
+ user?: string;
226
+ stream?: boolean;
227
+ }
228
+ interface SearchSource {
229
+ index: number;
230
+ title: string;
231
+ url: string;
232
+ domain: string;
233
+ favicon: string;
234
+ }
235
+ interface SearchCompletionResponse {
236
+ id: string;
237
+ object: "search.completion";
238
+ model: string;
239
+ query: string;
240
+ choices: {
241
+ index: number;
242
+ message: {
243
+ role: "assistant";
244
+ content: string;
245
+ };
246
+ finish_reason: "stop" | "length";
247
+ }[];
248
+ sources: SearchSource[];
249
+ usage?: UsageInfo;
250
+ }
251
+ interface SearchHistoryEntry {
252
+ query: string;
253
+ sources: SearchSource[];
254
+ answer: string;
255
+ timestamp: number;
256
+ }
257
+ interface SearchHistoryResponse {
258
+ history: SearchHistoryEntry[];
259
+ count: number;
260
+ }
261
+ interface SearchStatsResponse {
262
+ enabled: boolean;
263
+ num_partitions: number;
264
+ total_keys: number;
265
+ total_entries: number;
266
+ max_per_key: number;
267
+ persist_dir: string | null;
268
+ }
217
269
 
218
270
  /**
219
271
  * vllm-i64 SDK — HTTP Client core
@@ -233,7 +285,7 @@ declare class HttpClient {
233
285
  private apiKey?;
234
286
  private timeout;
235
287
  constructor(baseUrl?: string, options?: ClientOptions);
236
- fetch(path: string, init?: RequestInit): Promise<Response>;
288
+ fetch(path: string, init?: RequestInit, externalSignal?: AbortSignal): Promise<Response>;
237
289
  get<T>(path: string): Promise<T>;
238
290
  post<T>(path: string, body: unknown): Promise<T>;
239
291
  readSSE(res: Response): AsyncGenerator<string, void, undefined>;
@@ -269,11 +321,15 @@ declare class ChatEndpoint {
269
321
  * }
270
322
  * ```
271
323
  */
272
- stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
324
+ stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
325
+ signal?: AbortSignal;
326
+ }): AsyncGenerator<string, void, undefined>;
273
327
  /**
274
328
  * Streaming chat — yields raw SSE delta objects.
275
329
  */
276
- streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
330
+ streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
331
+ signal?: AbortSignal;
332
+ }): AsyncGenerator<StreamDelta, void, undefined>;
277
333
  }
278
334
 
279
335
  /**
@@ -376,6 +432,65 @@ declare class RAGEndpoint {
376
432
  stats(): Promise<RAGStatsResult>;
377
433
  }
378
434
 
435
+ /**
436
+ * Search endpoints — Perplexity-style search-augmented generation.
437
+ *
438
+ * Token-routed isolation: partition = sha256(api_key ∥ user_id) mod N
439
+ * No data leak possible. No shared cache. No session tokens.
440
+ *
441
+ * INL - 2025
442
+ */
443
+
444
+ declare class SearchEndpoint {
445
+ private http;
446
+ constructor(http: HttpClient);
447
+ /**
448
+ * Search-augmented generation: query → web search → cited answer.
449
+ *
450
+ * @example
451
+ * ```ts
452
+ * const res = await client.search.create({ query: "What is MoE?" });
453
+ * console.log(res.choices[0].message.content);
454
+ * for (const src of res.sources) {
455
+ * console.log(`[${src.index}] ${src.title} — ${src.url}`);
456
+ * }
457
+ * ```
458
+ */
459
+ create(params: SearchCompletionRequest): Promise<SearchCompletionResponse>;
460
+ /**
461
+ * Stream search completion — yields text chunks.
462
+ * Sources are sent as the final SSE event.
463
+ *
464
+ * @example
465
+ * ```ts
466
+ * const { stream, sources } = await client.search.stream({ query: "token routing" });
467
+ * for await (const chunk of stream) {
468
+ * process.stdout.write(chunk);
469
+ * }
470
+ * console.log("\nSources:", await sources);
471
+ * ```
472
+ */
473
+ stream(params: SearchCompletionRequest, signal?: AbortSignal): Promise<{
474
+ stream: AsyncGenerator<string>;
475
+ sources: Promise<SearchSource[]>;
476
+ }>;
477
+ /**
478
+ * Get search history for the authenticated user.
479
+ * History is partitioned by api_key + user — no cross-user access.
480
+ */
481
+ history(user?: string, limit?: number): Promise<SearchHistoryResponse>;
482
+ /**
483
+ * Clear search history for the authenticated user.
484
+ * Only clears the caller's own partition.
485
+ */
486
+ clearHistory(user?: string): Promise<{
487
+ status: string;
488
+ removed: number;
489
+ }>;
490
+ /** Search history statistics (admin). */
491
+ stats(): Promise<SearchStatsResponse>;
492
+ }
493
+
379
494
  /**
380
495
  * vllm-i64 — TypeScript SDK
381
496
  *
@@ -395,6 +510,10 @@ declare class RAGEndpoint {
395
510
  * process.stdout.write(chunk);
396
511
  * }
397
512
  *
513
+ * // Search (Perplexity-style, token-routed isolation)
514
+ * const search = await client.search.create({ query: "What is MoE?" });
515
+ * console.log(search.sources);
516
+ *
398
517
  * // Admin
399
518
  * await client.monitor.snapshot();
400
519
  * await client.cache.purge();
@@ -418,6 +537,8 @@ declare class I64Client {
418
537
  readonly monitor: MonitorEndpoint;
419
538
  /** RAG — index, search, stats. */
420
539
  readonly rag: RAGEndpoint;
540
+ /** Web search — Perplexity-style with token-routed isolation. */
541
+ readonly search: SearchEndpoint;
421
542
  /**
422
543
  * Create a vllm-i64 client.
423
544
  *
@@ -429,4 +550,4 @@ declare class I64Client {
429
550
  get baseUrl(): string;
430
551
  }
431
552
 
432
- export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };
553
+ export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type SearchCompletionRequest, type SearchCompletionResponse, SearchEndpoint, type SearchHistoryEntry, type SearchHistoryResponse, type SearchSource, type SearchStatsResponse, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };
package/dist/index.d.ts CHANGED
@@ -44,7 +44,10 @@ interface ChatCompletionRequest {
44
44
  };
45
45
  };
46
46
  stop?: string | string[];
47
+ min_p?: number;
48
+ typical_p?: number;
47
49
  repetition_penalty?: number;
50
+ min_tokens?: number;
48
51
  logprobs?: boolean;
49
52
  }
50
53
  interface ChatCompletionChoice {
@@ -214,6 +217,55 @@ interface RAGStatsResult {
214
217
  total_chunks: number;
215
218
  dimension: number;
216
219
  }
220
+ interface SearchCompletionRequest {
221
+ query: string;
222
+ max_tokens?: number;
223
+ temperature?: number;
224
+ search_count?: number;
225
+ user?: string;
226
+ stream?: boolean;
227
+ }
228
+ interface SearchSource {
229
+ index: number;
230
+ title: string;
231
+ url: string;
232
+ domain: string;
233
+ favicon: string;
234
+ }
235
+ interface SearchCompletionResponse {
236
+ id: string;
237
+ object: "search.completion";
238
+ model: string;
239
+ query: string;
240
+ choices: {
241
+ index: number;
242
+ message: {
243
+ role: "assistant";
244
+ content: string;
245
+ };
246
+ finish_reason: "stop" | "length";
247
+ }[];
248
+ sources: SearchSource[];
249
+ usage?: UsageInfo;
250
+ }
251
+ interface SearchHistoryEntry {
252
+ query: string;
253
+ sources: SearchSource[];
254
+ answer: string;
255
+ timestamp: number;
256
+ }
257
+ interface SearchHistoryResponse {
258
+ history: SearchHistoryEntry[];
259
+ count: number;
260
+ }
261
+ interface SearchStatsResponse {
262
+ enabled: boolean;
263
+ num_partitions: number;
264
+ total_keys: number;
265
+ total_entries: number;
266
+ max_per_key: number;
267
+ persist_dir: string | null;
268
+ }
217
269
 
218
270
  /**
219
271
  * vllm-i64 SDK — HTTP Client core
@@ -233,7 +285,7 @@ declare class HttpClient {
233
285
  private apiKey?;
234
286
  private timeout;
235
287
  constructor(baseUrl?: string, options?: ClientOptions);
236
- fetch(path: string, init?: RequestInit): Promise<Response>;
288
+ fetch(path: string, init?: RequestInit, externalSignal?: AbortSignal): Promise<Response>;
237
289
  get<T>(path: string): Promise<T>;
238
290
  post<T>(path: string, body: unknown): Promise<T>;
239
291
  readSSE(res: Response): AsyncGenerator<string, void, undefined>;
@@ -269,11 +321,15 @@ declare class ChatEndpoint {
269
321
  * }
270
322
  * ```
271
323
  */
272
- stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<string, void, undefined>;
324
+ stream(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
325
+ signal?: AbortSignal;
326
+ }): AsyncGenerator<string, void, undefined>;
273
327
  /**
274
328
  * Streaming chat — yields raw SSE delta objects.
275
329
  */
276
- streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream">): AsyncGenerator<StreamDelta, void, undefined>;
330
+ streamRaw(messages: ChatMessage[], options?: Omit<ChatCompletionRequest, "messages" | "stream"> & {
331
+ signal?: AbortSignal;
332
+ }): AsyncGenerator<StreamDelta, void, undefined>;
277
333
  }
278
334
 
279
335
  /**
@@ -376,6 +432,65 @@ declare class RAGEndpoint {
376
432
  stats(): Promise<RAGStatsResult>;
377
433
  }
378
434
 
435
+ /**
436
+ * Search endpoints — Perplexity-style search-augmented generation.
437
+ *
438
+ * Token-routed isolation: partition = sha256(api_key ∥ user_id) mod N
439
+ * No data leak possible. No shared cache. No session tokens.
440
+ *
441
+ * INL - 2025
442
+ */
443
+
444
+ declare class SearchEndpoint {
445
+ private http;
446
+ constructor(http: HttpClient);
447
+ /**
448
+ * Search-augmented generation: query → web search → cited answer.
449
+ *
450
+ * @example
451
+ * ```ts
452
+ * const res = await client.search.create({ query: "What is MoE?" });
453
+ * console.log(res.choices[0].message.content);
454
+ * for (const src of res.sources) {
455
+ * console.log(`[${src.index}] ${src.title} — ${src.url}`);
456
+ * }
457
+ * ```
458
+ */
459
+ create(params: SearchCompletionRequest): Promise<SearchCompletionResponse>;
460
+ /**
461
+ * Stream search completion — yields text chunks.
462
+ * Sources are sent as the final SSE event.
463
+ *
464
+ * @example
465
+ * ```ts
466
+ * const { stream, sources } = await client.search.stream({ query: "token routing" });
467
+ * for await (const chunk of stream) {
468
+ * process.stdout.write(chunk);
469
+ * }
470
+ * console.log("\nSources:", await sources);
471
+ * ```
472
+ */
473
+ stream(params: SearchCompletionRequest, signal?: AbortSignal): Promise<{
474
+ stream: AsyncGenerator<string>;
475
+ sources: Promise<SearchSource[]>;
476
+ }>;
477
+ /**
478
+ * Get search history for the authenticated user.
479
+ * History is partitioned by api_key + user — no cross-user access.
480
+ */
481
+ history(user?: string, limit?: number): Promise<SearchHistoryResponse>;
482
+ /**
483
+ * Clear search history for the authenticated user.
484
+ * Only clears the caller's own partition.
485
+ */
486
+ clearHistory(user?: string): Promise<{
487
+ status: string;
488
+ removed: number;
489
+ }>;
490
+ /** Search history statistics (admin). */
491
+ stats(): Promise<SearchStatsResponse>;
492
+ }
493
+
379
494
  /**
380
495
  * vllm-i64 — TypeScript SDK
381
496
  *
@@ -395,6 +510,10 @@ declare class RAGEndpoint {
395
510
  * process.stdout.write(chunk);
396
511
  * }
397
512
  *
513
+ * // Search (Perplexity-style, token-routed isolation)
514
+ * const search = await client.search.create({ query: "What is MoE?" });
515
+ * console.log(search.sources);
516
+ *
398
517
  * // Admin
399
518
  * await client.monitor.snapshot();
400
519
  * await client.cache.purge();
@@ -418,6 +537,8 @@ declare class I64Client {
418
537
  readonly monitor: MonitorEndpoint;
419
538
  /** RAG — index, search, stats. */
420
539
  readonly rag: RAGEndpoint;
540
+ /** Web search — Perplexity-style with token-routed isolation. */
541
+ readonly search: SearchEndpoint;
421
542
  /**
422
543
  * Create a vllm-i64 client.
423
544
  *
@@ -429,4 +550,4 @@ declare class I64Client {
429
550
  get baseUrl(): string;
430
551
  }
431
552
 
432
- export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };
553
+ export { CacheEndpoint, type CachePurgeResult, type CacheStats, type ChatCompletionChoice, type ChatCompletionRequest, type ChatCompletionResponse, ChatEndpoint, type ChatMessage, type ClientOptions, type CompletionRequest, type CompletionResponse, CompletionsEndpoint, type ExpertStats, type GpuInfo, type HealthResponse, HttpClient, I64Client, type LoRAAdapter, LoRAEndpoint, type LoRAListResult, type LoRALoadParams, type LoRALoadResult, type LoRAUnloadResult, type ModelInfo, MonitorEndpoint, type MonitorSnapshot, RAGEndpoint, type RAGIndexParams, type RAGIndexResult, type RAGResult, type RAGSearchResult, type RAGStatsResult, type SearchCompletionRequest, type SearchCompletionResponse, SearchEndpoint, type SearchHistoryEntry, type SearchHistoryResponse, type SearchSource, type SearchStatsResponse, type StreamDelta, type ToolCall, type ToolDefinition, type UsageInfo, I64Client as default };
package/dist/index.js CHANGED
@@ -28,6 +28,7 @@ __export(index_exports, {
28
28
  LoRAEndpoint: () => LoRAEndpoint,
29
29
  MonitorEndpoint: () => MonitorEndpoint,
30
30
  RAGEndpoint: () => RAGEndpoint,
31
+ SearchEndpoint: () => SearchEndpoint,
31
32
  default: () => index_default
32
33
  });
33
34
  module.exports = __toCommonJS(index_exports);
@@ -42,7 +43,7 @@ var HttpClient = class {
42
43
  this.apiKey = options.apiKey;
43
44
  this.timeout = options.timeoutMs ?? 12e4;
44
45
  }
45
- async fetch(path, init = {}) {
46
+ async fetch(path, init = {}, externalSignal) {
46
47
  const headers = {
47
48
  "Content-Type": "application/json",
48
49
  ...init.headers
@@ -52,6 +53,13 @@ var HttpClient = class {
52
53
  }
53
54
  const controller = new AbortController();
54
55
  const timer = setTimeout(() => controller.abort(), this.timeout);
56
+ if (externalSignal) {
57
+ if (externalSignal.aborted) {
58
+ controller.abort();
59
+ } else {
60
+ externalSignal.addEventListener("abort", () => controller.abort(), { once: true });
61
+ }
62
+ }
55
63
  try {
56
64
  const res = await fetch(`${this.baseUrl}${path}`, {
57
65
  ...init,
@@ -178,30 +186,32 @@ var ChatEndpoint = class {
178
186
  * ```
179
187
  */
180
188
  async *stream(messages, options = {}) {
189
+ const { signal, ...rest } = options;
181
190
  const res = await this.http.fetch("/v1/chat/completions", {
182
191
  method: "POST",
183
192
  body: JSON.stringify({
184
193
  model: "default",
185
194
  messages,
186
- ...options,
195
+ ...rest,
187
196
  stream: true
188
197
  })
189
- });
198
+ }, signal);
190
199
  yield* this.http.readSSE(res);
191
200
  }
192
201
  /**
193
202
  * Streaming chat — yields raw SSE delta objects.
194
203
  */
195
204
  async *streamRaw(messages, options = {}) {
205
+ const { signal, ...rest } = options;
196
206
  const res = await this.http.fetch("/v1/chat/completions", {
197
207
  method: "POST",
198
208
  body: JSON.stringify({
199
209
  model: "default",
200
210
  messages,
201
- ...options,
211
+ ...rest,
202
212
  stream: true
203
213
  })
204
- });
214
+ }, signal);
205
215
  yield* this.http.readSSERaw(res);
206
216
  }
207
217
  };
@@ -332,6 +342,112 @@ var RAGEndpoint = class {
332
342
  }
333
343
  };
334
344
 
345
+ // src/endpoints/search.ts
346
+ var SearchEndpoint = class {
347
+ constructor(http) {
348
+ this.http = http;
349
+ }
350
+ /**
351
+ * Search-augmented generation: query → web search → cited answer.
352
+ *
353
+ * @example
354
+ * ```ts
355
+ * const res = await client.search.create({ query: "What is MoE?" });
356
+ * console.log(res.choices[0].message.content);
357
+ * for (const src of res.sources) {
358
+ * console.log(`[${src.index}] ${src.title} — ${src.url}`);
359
+ * }
360
+ * ```
361
+ */
362
+ async create(params) {
363
+ return this.http.post("/v1/search/completions", { ...params, stream: false });
364
+ }
365
+ /**
366
+ * Stream search completion — yields text chunks.
367
+ * Sources are sent as the final SSE event.
368
+ *
369
+ * @example
370
+ * ```ts
371
+ * const { stream, sources } = await client.search.stream({ query: "token routing" });
372
+ * for await (const chunk of stream) {
373
+ * process.stdout.write(chunk);
374
+ * }
375
+ * console.log("\nSources:", await sources);
376
+ * ```
377
+ */
378
+ async stream(params, signal) {
379
+ const res = await this.http.fetch(
380
+ "/v1/search/completions",
381
+ { method: "POST", body: JSON.stringify({ ...params, stream: true }) },
382
+ signal
383
+ );
384
+ let resolveSourcesFn;
385
+ const sourcesPromise = new Promise((resolve) => {
386
+ resolveSourcesFn = resolve;
387
+ });
388
+ const self = this;
389
+ async function* readStream() {
390
+ if (!res.body) return;
391
+ const reader = res.body.getReader();
392
+ const decoder = new TextDecoder();
393
+ let buffer = "";
394
+ let foundSources = false;
395
+ try {
396
+ while (true) {
397
+ const { done, value } = await reader.read();
398
+ if (done) break;
399
+ buffer += decoder.decode(value, { stream: true });
400
+ const lines = buffer.split("\n");
401
+ buffer = lines.pop() ?? "";
402
+ for (const line of lines) {
403
+ const trimmed = line.trim();
404
+ if (!trimmed.startsWith("data: ")) continue;
405
+ const payload = trimmed.slice(6);
406
+ if (payload === "[DONE]") return;
407
+ try {
408
+ const data = JSON.parse(payload);
409
+ if (data.sources) {
410
+ foundSources = true;
411
+ resolveSourcesFn(data.sources);
412
+ continue;
413
+ }
414
+ const content = data.choices?.[0]?.delta?.content ?? "";
415
+ if (content) yield content;
416
+ } catch {
417
+ }
418
+ }
419
+ }
420
+ } finally {
421
+ reader.releaseLock();
422
+ if (!foundSources) resolveSourcesFn([]);
423
+ }
424
+ }
425
+ return { stream: readStream(), sources: sourcesPromise };
426
+ }
427
+ /**
428
+ * Get search history for the authenticated user.
429
+ * History is partitioned by api_key + user — no cross-user access.
430
+ */
431
+ async history(user, limit = 50) {
432
+ const params = new URLSearchParams({ limit: String(limit) });
433
+ if (user) params.set("user", user);
434
+ return this.http.get(`/v1/search/history?${params}`);
435
+ }
436
+ /**
437
+ * Clear search history for the authenticated user.
438
+ * Only clears the caller's own partition.
439
+ */
440
+ async clearHistory(user) {
441
+ const params = user ? `?user=${encodeURIComponent(user)}` : "";
442
+ const res = await this.http.fetch(`/v1/search/history${params}`, { method: "DELETE" });
443
+ return res.json();
444
+ }
445
+ /** Search history statistics (admin). */
446
+ async stats() {
447
+ return this.http.get("/v1/search/stats");
448
+ }
449
+ };
450
+
335
451
  // src/index.ts
336
452
  var I64Client = class {
337
453
  http;
@@ -347,6 +463,8 @@ var I64Client = class {
347
463
  monitor;
348
464
  /** RAG — index, search, stats. */
349
465
  rag;
466
+ /** Web search — Perplexity-style with token-routed isolation. */
467
+ search;
350
468
  /**
351
469
  * Create a vllm-i64 client.
352
470
  *
@@ -361,6 +479,7 @@ var I64Client = class {
361
479
  this.lora = new LoRAEndpoint(this.http);
362
480
  this.monitor = new MonitorEndpoint(this.http);
363
481
  this.rag = new RAGEndpoint(this.http);
482
+ this.search = new SearchEndpoint(this.http);
364
483
  }
365
484
  /** Server base URL. */
366
485
  get baseUrl() {
@@ -377,5 +496,6 @@ var index_default = I64Client;
377
496
  I64Client,
378
497
  LoRAEndpoint,
379
498
  MonitorEndpoint,
380
- RAGEndpoint
499
+ RAGEndpoint,
500
+ SearchEndpoint
381
501
  });
package/dist/index.mjs CHANGED
@@ -8,7 +8,7 @@ var HttpClient = class {
8
8
  this.apiKey = options.apiKey;
9
9
  this.timeout = options.timeoutMs ?? 12e4;
10
10
  }
11
- async fetch(path, init = {}) {
11
+ async fetch(path, init = {}, externalSignal) {
12
12
  const headers = {
13
13
  "Content-Type": "application/json",
14
14
  ...init.headers
@@ -18,6 +18,13 @@ var HttpClient = class {
18
18
  }
19
19
  const controller = new AbortController();
20
20
  const timer = setTimeout(() => controller.abort(), this.timeout);
21
+ if (externalSignal) {
22
+ if (externalSignal.aborted) {
23
+ controller.abort();
24
+ } else {
25
+ externalSignal.addEventListener("abort", () => controller.abort(), { once: true });
26
+ }
27
+ }
21
28
  try {
22
29
  const res = await fetch(`${this.baseUrl}${path}`, {
23
30
  ...init,
@@ -144,30 +151,32 @@ var ChatEndpoint = class {
144
151
  * ```
145
152
  */
146
153
  async *stream(messages, options = {}) {
154
+ const { signal, ...rest } = options;
147
155
  const res = await this.http.fetch("/v1/chat/completions", {
148
156
  method: "POST",
149
157
  body: JSON.stringify({
150
158
  model: "default",
151
159
  messages,
152
- ...options,
160
+ ...rest,
153
161
  stream: true
154
162
  })
155
- });
163
+ }, signal);
156
164
  yield* this.http.readSSE(res);
157
165
  }
158
166
  /**
159
167
  * Streaming chat — yields raw SSE delta objects.
160
168
  */
161
169
  async *streamRaw(messages, options = {}) {
170
+ const { signal, ...rest } = options;
162
171
  const res = await this.http.fetch("/v1/chat/completions", {
163
172
  method: "POST",
164
173
  body: JSON.stringify({
165
174
  model: "default",
166
175
  messages,
167
- ...options,
176
+ ...rest,
168
177
  stream: true
169
178
  })
170
- });
179
+ }, signal);
171
180
  yield* this.http.readSSERaw(res);
172
181
  }
173
182
  };
@@ -298,6 +307,112 @@ var RAGEndpoint = class {
298
307
  }
299
308
  };
300
309
 
310
+ // src/endpoints/search.ts
311
+ var SearchEndpoint = class {
312
+ constructor(http) {
313
+ this.http = http;
314
+ }
315
+ /**
316
+ * Search-augmented generation: query → web search → cited answer.
317
+ *
318
+ * @example
319
+ * ```ts
320
+ * const res = await client.search.create({ query: "What is MoE?" });
321
+ * console.log(res.choices[0].message.content);
322
+ * for (const src of res.sources) {
323
+ * console.log(`[${src.index}] ${src.title} — ${src.url}`);
324
+ * }
325
+ * ```
326
+ */
327
+ async create(params) {
328
+ return this.http.post("/v1/search/completions", { ...params, stream: false });
329
+ }
330
+ /**
331
+ * Stream search completion — yields text chunks.
332
+ * Sources are sent as the final SSE event.
333
+ *
334
+ * @example
335
+ * ```ts
336
+ * const { stream, sources } = await client.search.stream({ query: "token routing" });
337
+ * for await (const chunk of stream) {
338
+ * process.stdout.write(chunk);
339
+ * }
340
+ * console.log("\nSources:", await sources);
341
+ * ```
342
+ */
343
+ async stream(params, signal) {
344
+ const res = await this.http.fetch(
345
+ "/v1/search/completions",
346
+ { method: "POST", body: JSON.stringify({ ...params, stream: true }) },
347
+ signal
348
+ );
349
+ let resolveSourcesFn;
350
+ const sourcesPromise = new Promise((resolve) => {
351
+ resolveSourcesFn = resolve;
352
+ });
353
+ const self = this;
354
+ async function* readStream() {
355
+ if (!res.body) return;
356
+ const reader = res.body.getReader();
357
+ const decoder = new TextDecoder();
358
+ let buffer = "";
359
+ let foundSources = false;
360
+ try {
361
+ while (true) {
362
+ const { done, value } = await reader.read();
363
+ if (done) break;
364
+ buffer += decoder.decode(value, { stream: true });
365
+ const lines = buffer.split("\n");
366
+ buffer = lines.pop() ?? "";
367
+ for (const line of lines) {
368
+ const trimmed = line.trim();
369
+ if (!trimmed.startsWith("data: ")) continue;
370
+ const payload = trimmed.slice(6);
371
+ if (payload === "[DONE]") return;
372
+ try {
373
+ const data = JSON.parse(payload);
374
+ if (data.sources) {
375
+ foundSources = true;
376
+ resolveSourcesFn(data.sources);
377
+ continue;
378
+ }
379
+ const content = data.choices?.[0]?.delta?.content ?? "";
380
+ if (content) yield content;
381
+ } catch {
382
+ }
383
+ }
384
+ }
385
+ } finally {
386
+ reader.releaseLock();
387
+ if (!foundSources) resolveSourcesFn([]);
388
+ }
389
+ }
390
+ return { stream: readStream(), sources: sourcesPromise };
391
+ }
392
+ /**
393
+ * Get search history for the authenticated user.
394
+ * History is partitioned by api_key + user — no cross-user access.
395
+ */
396
+ async history(user, limit = 50) {
397
+ const params = new URLSearchParams({ limit: String(limit) });
398
+ if (user) params.set("user", user);
399
+ return this.http.get(`/v1/search/history?${params}`);
400
+ }
401
+ /**
402
+ * Clear search history for the authenticated user.
403
+ * Only clears the caller's own partition.
404
+ */
405
+ async clearHistory(user) {
406
+ const params = user ? `?user=${encodeURIComponent(user)}` : "";
407
+ const res = await this.http.fetch(`/v1/search/history${params}`, { method: "DELETE" });
408
+ return res.json();
409
+ }
410
+ /** Search history statistics (admin). */
411
+ async stats() {
412
+ return this.http.get("/v1/search/stats");
413
+ }
414
+ };
415
+
301
416
  // src/index.ts
302
417
  var I64Client = class {
303
418
  http;
@@ -313,6 +428,8 @@ var I64Client = class {
313
428
  monitor;
314
429
  /** RAG — index, search, stats. */
315
430
  rag;
431
+ /** Web search — Perplexity-style with token-routed isolation. */
432
+ search;
316
433
  /**
317
434
  * Create a vllm-i64 client.
318
435
  *
@@ -327,6 +444,7 @@ var I64Client = class {
327
444
  this.lora = new LoRAEndpoint(this.http);
328
445
  this.monitor = new MonitorEndpoint(this.http);
329
446
  this.rag = new RAGEndpoint(this.http);
447
+ this.search = new SearchEndpoint(this.http);
330
448
  }
331
449
  /** Server base URL. */
332
450
  get baseUrl() {
@@ -343,5 +461,6 @@ export {
343
461
  LoRAEndpoint,
344
462
  MonitorEndpoint,
345
463
  RAGEndpoint,
464
+ SearchEndpoint,
346
465
  index_default as default
347
466
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vllm-i64",
3
- "version": "0.1.1",
3
+ "version": "0.3.0",
4
4
  "description": "TypeScript SDK for vllm-i64 — integer-first inference engine",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",
@@ -28,7 +28,10 @@
28
28
  "moe",
29
29
  "openai",
30
30
  "ai",
31
- "complexity"
31
+ "complexity",
32
+ "search",
33
+ "perplexity",
34
+ "security"
32
35
  ],
33
36
  "author": "Complexity-ML / INL",
34
37
  "license": "Apache-2.0",