localm-web 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -10,6 +10,26 @@
 export declare class BackendNotAvailableError extends LocalmWebError {
 }
 
+/** Snapshot of a single cached model's metadata. */
+export declare interface CachedModelEntry {
+    /** Friendly id from the registry (e.g. `"llama-3.2-1b-int4"`). */
+    id: string;
+    /** Backend-specific id (e.g. WebLLM `webllmId`). */
+    backendId: string;
+    /** Human-readable family name. */
+    family: string;
+    /** Approx parameter count, e.g. `"1B"`. */
+    parameters: string;
+}
+
+/** Aggregate storage usage reported by the browser. */
+export declare interface CacheUsage {
+    /** Bytes used by the entire origin's storage (not just our cache). */
+    usage: number;
+    /** Bytes the browser is willing to give the origin. */
+    quota: number;
+}
+
 /**
  * Multi-turn chat task.
  *
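These two shapes are consumed by the `ModelCache` class added later in this diff. A minimal sketch of rendering them in a settings UI; the `describeCache` helper and the `"localm-web"` import path are our own illustration, everything else is declared in this file:

```ts
import { ModelCache, type CachedModelEntry, type CacheUsage } from "localm-web";

// Hypothetical UI helper, not part of the package.
function describeCache(models: CachedModelEntry[], usage: CacheUsage): string {
  const mb = (bytes: number) => (bytes / 1024 / 1024).toFixed(1);
  const names = models.map((m) => `${m.family} (${m.parameters})`).join(", ");
  return `${names || "no models cached"}: ${mb(usage.usage)} of ${mb(usage.quota)} MB used`;
}

const cache = new ModelCache();
console.log(describeCache(await cache.list(), await cache.estimateUsage()));
```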
@@ -111,6 +131,99 @@ export declare class ChatReply {
  */
 export declare function collectStream(stream: AsyncIterable<TokenChunk>): Promise<string>;
 
+/**
+ * Raw text-completion task.
+ *
+ * Unlike {@link Chat}, `Completion` does not maintain a conversation history
+ * and does not apply a chat template. The prompt is fed to the model verbatim
+ * and the model continues it. Useful for "Once upon a time…" style generation,
+ * code completion, or any scenario where chat formatting would interfere.
+ *
+ * Use {@link Completion.create} to construct an instance — the constructor is
+ * private.
+ *
+ * @example
+ * ```ts
+ * const comp = await Completion.create("qwen2.5-1.5b-int4");
+ * const result = await comp.predict("Once upon a time", { maxTokens: 50 });
+ * console.log(result.text);
+ * ```
+ *
+ * @example Streaming
+ * ```ts
+ * const controller = new AbortController();
+ * for await (const token of comp.stream("def fibonacci(n):", { signal: controller.signal })) {
+ *   process.stdout.write(token.text);
+ * }
+ * ```
+ */
+export declare class Completion extends LMTask {
+    private constructor();
+    /**
+     * Create and load a `Completion` task for the given model.
+     *
+     * @param modelId - Friendly model id from the registry (e.g. `"qwen2.5-1.5b-int4"`).
+     * @param options - Optional creation options (progress callback, engine override).
+     */
+    static create(modelId: string, options?: LMTaskCreateOptions): Promise<Completion>;
+    /**
+     * Generate a continuation for the given prompt.
+     *
+     * @param prompt - Raw text fed to the model.
+     * @param options - Generation options.
+     * @returns A {@link CompletionResult} with the generated continuation.
+     */
+    predict(prompt: string, options?: GenerationOptions): Promise<CompletionResult>;
+    /**
+     * Stream a continuation for the given prompt as an async iterable of token
+     * chunks.
+     *
+     * @param prompt - Raw text fed to the model.
+     * @param options - Generation options including an optional `signal`.
+     */
+    stream(prompt: string, options?: GenerationOptions): AsyncIterable<TokenChunk>;
+}
+
+/**
+ * Result returned by `Completion.predict()`.
+ *
+ * Holds the generated continuation text (the prompt itself is not included)
+ * plus metadata about the generation loop.
+ */
+export declare class CompletionResult {
+    /** The generated text (continuation only, prompt excluded). */
+    readonly text: string;
+    /** The original prompt that was fed to the model. */
+    readonly prompt: string;
+    /** Number of tokens generated. 0 when the engine does not report it. */
+    readonly tokensGenerated: number;
+    /** Why the generation loop stopped. */
+    readonly finishReason: FinishReason;
+    constructor(
+    /** The generated text (continuation only, prompt excluded). */
+    text: string,
+    /** The original prompt that was fed to the model. */
+    prompt: string,
+    /** Number of tokens generated. 0 when the engine does not report it. */
+    tokensGenerated: number,
+    /** Why the generation loop stopped. */
+    finishReason: FinishReason);
+}
+
+/**
+ * Spawn a new inference Web Worker.
+ *
+ * Uses Vite/webpack-friendly `new Worker(new URL(...), { type: "module" })`
+ * syntax. The bundler emits the worker as a separate ES module chunk.
+ *
+ * Consumers normally do not call this directly — `LMTask.create()` invokes it
+ * when `inWorker: true` is set. It is exported for advanced scenarios (custom
+ * worker management, pooling, lifecycle integration with a host app).
+ *
+ * @returns A {@link WorkerLike}-compatible Worker instance.
+ */
+export declare function createInferenceWorker(): WorkerLike;
+
 /**
  * Runtime-agnostic inference contract.
  *
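Where the result metadata is not needed, the new task composes with the existing `collectStream` helper shown above. A minimal sketch; the prompt string and the `"localm-web"` import path are our own assumptions:

```ts
import { Completion, collectStream } from "localm-web";

const comp = await Completion.create("qwen2.5-1.5b-int4");

// Buffer the token stream into a single string; like predict(), minus
// the CompletionResult metadata.
const text = await collectStream(comp.stream("function add(a, b) {"));
console.log(text);

await comp.unload(); // release engine resources when done
```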
@@ -148,6 +261,32 @@ export declare interface Engine {
      * @throws GenerationAbortedError if `options.signal` is triggered.
      */
     stream(messages: Message[], options?: GenerationOptions): AsyncIterable<TokenChunk>;
+    /**
+     * Generate a non-streaming raw text completion.
+     *
+     * Unlike {@link Engine.generate}, this skips the chat template and feeds the
+     * prompt to the underlying model verbatim. Useful for "Once upon a time…"
+     * style continuation.
+     *
+     * @param prompt - Raw text fed to the model.
+     * @param options - Generation options.
+     * @returns The full generated text (excluding the prompt).
+     * @throws ModelNotLoadedError if called before {@link Engine.load}.
+     * @throws GenerationAbortedError if `options.signal` is triggered.
+     */
+    complete(prompt: string, options?: GenerationOptions): Promise<string>;
+    /**
+     * Stream a raw text completion as an async iterable of token chunks.
+     *
+     * Unlike {@link Engine.stream}, this skips the chat template.
+     *
+     * @param prompt - Raw text fed to the model.
+     * @param options - Generation options.
+     * @returns Async iterable yielding token chunks. The final chunk has `done: true`.
+     * @throws ModelNotLoadedError if called before {@link Engine.load}.
+     * @throws GenerationAbortedError if `options.signal` is triggered.
+     */
+    streamCompletion(prompt: string, options?: GenerationOptions): AsyncIterable<TokenChunk>;
     /** Release any resources held by the engine. Safe to call when not loaded. */
     unload(): Promise<void>;
     /** Whether a model is currently loaded and ready for inference. */
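For callers holding a raw `Engine` reference, the abort contract mirrors the chat-side methods. A sketch of a capped preview; the `preview` helper, the 20-chunk limit, and catching by error name are our own illustration, while the throw-on-abort behavior follows the `@throws` notes above:

```ts
import type { Engine } from "localm-web";

// Stop a raw completion after 20 chunks by firing the AbortSignal.
async function preview(engine: Engine, prompt: string): Promise<string> {
  const controller = new AbortController();
  let out = "";
  let count = 0;
  try {
    for await (const chunk of engine.streamCompletion(prompt, { signal: controller.signal })) {
      out += chunk.text;
      if (++count >= 20) controller.abort();
    }
  } catch (err) {
    // Per the @throws contract, the triggered signal surfaces as GenerationAbortedError.
    if ((err as Error).name !== "GenerationAbortedError") throw err;
  }
  return out;
}
```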
@@ -213,6 +352,7 @@ export declare abstract class LMTask {
      * @param options - Task creation options.
      */
     protected static createEngine(modelId: string, options?: LMTaskCreateOptions): Promise<ResolvedEngine>;
+    private static defaultEngine;
     /** Release engine resources. Safe to call multiple times. */
     unload(): Promise<void>;
     /** Whether the underlying engine has a loaded model. */
@@ -228,6 +368,15 @@ export declare interface LMTaskCreateOptions {
      * Production callers should let the SDK pick a backend automatically.
      */
     engine?: Engine;
+    /**
+     * Run inference inside a Web Worker, isolating the UI thread from
+     * tokenization and generation. Defaults to `false` in v0.2 (opt-in) and
+     * will flip to `true` in v0.3 once the Cache API / OPFS integration
+     * (also v0.2) has been validated against worker-thread storage access.
+     *
+     * Ignored when {@link engine} is provided.
+     */
+    inWorker?: boolean;
 }
 
 /**
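A sketch of opting in. `inWorker` and the model id come from this diff; the `onProgress` option name is an assumption based on the "progress callback" wording in `create()`'s docs:

```ts
import { Completion } from "localm-web";

// Run tokenization and generation off the UI thread (opt-in in v0.2).
const comp = await Completion.create("qwen2.5-1.5b-int4", {
  inWorker: true,
  onProgress: (p) => console.log(p.phase, p.loaded, p.total), // assumed option name
});
```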
@@ -269,10 +418,110 @@ export declare interface Message {
  */
 export declare const MODEL_PRESETS: Readonly<Record<string, ModelPreset>>;
 
+/**
+ * Inspect and manage cached model weights.
+ *
+ * `localm-web` does not download or cache weights itself — that work is owned
+ * by `@mlc-ai/web-llm`, which writes to the browser Cache API. `ModelCache`
+ * is a thin wrapper that lets a consuming app surface cache state in its UI:
+ * "this model is downloaded", "you have 1.4 GB cached, free up space?",
+ * "clear all models on logout".
+ *
+ * @example
+ * ```ts
+ * const cache = new ModelCache();
+ * if (await cache.has("llama-3.2-1b-int4")) {
+ *   console.log("ready offline");
+ * }
+ * const cached = await cache.list();
+ * await cache.delete("phi-3.5-mini-int4");
+ * const usage = await cache.estimateUsage();
+ * console.log(`${usage.usage} / ${usage.quota} bytes`);
+ * ```
+ */
+export declare class ModelCache {
+    private readonly hasModelHook;
+    private readonly deleteModelHook;
+    private readonly estimateHook;
+    constructor(options?: ModelCacheOptions);
+    /**
+     * Whether the model's weights are present in the browser cache.
+     *
+     * @param modelId - Friendly id from the registry.
+     * @throws UnknownModelError if `modelId` is not in the registry.
+     */
+    has(modelId: string): Promise<boolean>;
+    /**
+     * Delete a single model's weights from the browser cache. No-op when the
+     * model is not cached.
+     *
+     * @param modelId - Friendly id from the registry.
+     * @throws UnknownModelError if `modelId` is not in the registry.
+     */
+    delete(modelId: string): Promise<void>;
+    /**
+     * List the registry models that are currently cached.
+     *
+     * Iterates `MODEL_PRESETS` and probes each one. Only returns models known
+     * to the SDK — models cached by external WebLLM calls outside our registry
+     * are not included.
+     *
+     * @returns Empty list when nothing is cached.
+     */
+    list(): Promise<CachedModelEntry[]>;
+    /**
+     * Delete every registry model from the cache. Useful for logout flows or
+     * "reset" buttons. Models cached outside the registry are not touched.
+     */
+    clear(): Promise<void>;
+    /**
+     * Aggregate storage stats from the browser. Returned numbers cover the
+     * entire origin (Cache API + IndexedDB + Service Workers + OPFS), not
+     * just our model cache — use it for "you have X of Y available" hints.
+     */
+    estimateUsage(): Promise<CacheUsage>;
+    /**
+     * Throw a descriptive error if the given id is not in the registry.
+     * Exposed for code paths that want to validate before calling other
+     * methods (those already throw on their own).
+     *
+     * @throws UnknownModelError
+     */
+    static assertKnown(modelId: string): void;
+}
+
+/**
+ * Hooks the {@link ModelCache} uses to talk to the underlying runtime and
+ * the browser. Tests inject mocks; production code leaves them undefined,
+ * letting `ModelCache` resolve the real `@mlc-ai/web-llm` helpers and
+ * `navigator.storage.estimate()` lazily.
+ */
+export declare interface ModelCacheOptions {
+    /** Override `hasModelInCache` from the runtime. */
+    hasModel?: (backendId: string) => Promise<boolean>;
+    /** Override `deleteModelInCache` from the runtime. */
+    deleteModel?: (backendId: string) => Promise<void>;
+    /** Override `navigator.storage.estimate()`. */
+    estimate?: () => Promise<CacheUsage>;
+}
+
 /** Thrown when a model fails to load (network, parsing, runtime init). */
 export declare class ModelLoadError extends LocalmWebError {
 }
 
+/**
+ * Lifecycle phase of a model load.
+ *
+ * - `downloading`: weight files are being fetched from the network or cache.
+ * - `compiling`: the runtime is preparing the model (shader compilation,
+ *   tensor allocation, KV cache setup).
+ * - `loading`: a generic "still working" phase reported by the runtime when
+ *   it has not classified the work into download or compile.
+ * - `ready`: the model is loaded and the engine is ready for inference.
+ *   Emitted exactly once, at the end of a successful load.
+ */
+export declare type ModelLoadPhase = "downloading" | "compiling" | "loading" | "ready";
+
 /** Progress event emitted while a model is loading. */
 export declare interface ModelLoadProgress {
     /** Fraction of total work completed, in [0, 1]. */
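The injection hooks make `ModelCache` testable without a real browser cache. A unit-test sketch; the backend id string and the byte figures are made up:

```ts
import { ModelCache } from "localm-web";

// Drive ModelCache with in-memory fakes instead of the real
// @mlc-ai/web-llm helpers and navigator.storage.estimate().
const cached = new Set(["Llama-3.2-1B-Instruct-q4f16_1-MLC"]); // hypothetical backendId
const cache = new ModelCache({
  hasModel: async (backendId) => cached.has(backendId),
  deleteModel: async (backendId) => { cached.delete(backendId); },
  estimate: async () => ({ usage: 1_400_000_000, quota: 10_000_000_000 }),
});
```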
@@ -283,6 +532,8 @@ export declare interface ModelLoadProgress {
     loaded: number;
     /** Total bytes to load. 0 when unavailable. */
     total: number;
+    /** Lifecycle phase classified from the runtime's status text. */
+    phase: ModelLoadPhase;
 }
 
 /** Thrown when an inference call is made before a model has loaded. */
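A sketch of mapping the new field to status copy in a loading screen; the label strings are our own, the field names come from `ModelLoadProgress` and `ModelLoadPhase` above:

```ts
import type { ModelLoadPhase, ModelLoadProgress } from "localm-web";

const labels: Record<ModelLoadPhase, string> = {
  downloading: "Downloading weights",
  compiling: "Compiling model",
  loading: "Loading",
  ready: "Ready",
};

function onProgress(p: ModelLoadProgress): void {
  // total is 0 when byte counts are unavailable; fall back to the phase alone.
  const pct = p.total > 0 ? ` (${Math.round((p.loaded / p.total) * 100)}%)` : "";
  console.log(`${labels[p.phase]}${pct}`);
}
```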
@@ -337,6 +588,14 @@ export declare function resolveModelPreset(modelId: string): ModelPreset;
 /** Conversation roles supported by chat templates. */
 export declare type Role = "system" | "user" | "assistant" | "tool";
 
+/**
+ * Subset of {@link GenerationOptions} that survives `postMessage`.
+ *
+ * `AbortSignal` cannot be cloned across the worker boundary, so it is replaced
+ * by a separate `abort` {@link WorkerRequest} message keyed on the same operation id.
+ */
+declare type SerializableGenerationOptions = Omit<GenerationOptions, "signal">;
+
 /**
  * Wrap an async iterable so that each `TokenChunk` is also passed to a
  * caller-supplied side-effect callback before being yielded downstream.
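The type itself is internal, but the pattern it documents is visible in the `WorkerRequest` union later in this diff. A sketch of the sending side; the `postStream` helper and the id bookkeeping are our own illustration:

```ts
import type { GenerationOptions, Message, WorkerLike } from "localm-web";

// Strip the non-cloneable AbortSignal before posting, then relay an abort
// as its own message keyed on the same operation id.
function postStream(worker: WorkerLike, id: number, messages: Message[], opts: GenerationOptions = {}): void {
  const { signal, ...options } = opts; // options: SerializableGenerationOptions
  worker.postMessage({ op: "stream", id, messages, options });
  signal?.addEventListener("abort", () => worker.postMessage({ op: "abort", id }));
}
```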
@@ -370,4 +629,119 @@ export declare const VERSION: string;
 export declare class WebGPUUnavailableError extends LocalmWebError {
 }
 
+/**
+ * Engine implementation that proxies all calls to a Web Worker.
+ *
+ * The worker holds the actual {@link WebLLMEngine}; this class is a thin RPC
+ * shell that serializes requests, tracks pending operations by a numeric id,
+ * and turns worker responses back into Promises and async iterables.
+ *
+ * Use {@link createInferenceWorker} to obtain a real worker. Tests can pass a
+ * {@link WorkerLike} mock implementing the same `postMessage` /
+ * `addEventListener` surface.
+ */
+export declare class WorkerEngine implements Engine {
+    private readonly worker;
+    private nextId;
+    private loaded;
+    private currentLoad;
+    private currentLoadId;
+    private currentLoadProgress;
+    private currentUnload;
+    private currentUnloadId;
+    private pendingGenerates;
+    private pendingStreams;
+    private readonly listener;
+    constructor(worker: WorkerLike);
+    isLoaded(): boolean;
+    load(modelId: string, onProgress?: ProgressCallback): Promise<void>;
+    generate(messages: Message[], options?: GenerationOptions): Promise<string>;
+    stream(messages: Message[], options?: GenerationOptions): AsyncIterable<TokenChunk>;
+    complete(prompt: string, options?: GenerationOptions): Promise<string>;
+    streamCompletion(prompt: string, options?: GenerationOptions): AsyncIterable<TokenChunk>;
+    unload(): Promise<void>;
+    /** Tear down the underlying worker. The engine is unusable after this. */
+    terminate(): void;
+    private allocateId;
+    private send;
+    private handleMessage;
+}
+
+/** Subset of `Worker` we depend on. Lets tests inject a mock. */
+export declare interface WorkerLike {
+    postMessage(message: WorkerRequest): void;
+    addEventListener(type: "message", listener: (event: MessageEvent<WorkerResponse>) => void): void;
+    removeEventListener(type: "message", listener: (event: MessageEvent<WorkerResponse>) => void): void;
+    terminate(): void;
+}
+
+/** Operation request sent from the main thread to the worker. */
+declare type WorkerRequest = {
+    op: "load";
+    id: number;
+    modelId: string;
+} | {
+    op: "generate";
+    id: number;
+    messages: Message[];
+    options: SerializableGenerationOptions;
+} | {
+    op: "stream";
+    id: number;
+    messages: Message[];
+    options: SerializableGenerationOptions;
+} | {
+    op: "complete";
+    id: number;
+    prompt: string;
+    options: SerializableGenerationOptions;
+} | {
+    op: "stream-completion";
+    id: number;
+    prompt: string;
+    options: SerializableGenerationOptions;
+} | {
+    op: "abort";
+    id: number;
+} | {
+    op: "unload";
+    id: number;
+} | {
+    op: "isLoaded";
+    id: number;
+};
+
+/** Operation response sent from the worker back to the main thread. */
+declare type WorkerResponse = {
+    op: "loaded";
+    id: number;
+} | {
+    op: "generated";
+    id: number;
+    text: string;
+} | {
+    op: "progress";
+    id: number;
+    payload: ModelLoadProgress;
+} | {
+    op: "token";
+    id: number;
+    chunk: TokenChunk;
+} | {
+    op: "stream-end";
+    id: number;
+} | {
+    op: "error";
+    id: number;
+    name: string;
+    message: string;
+} | {
+    op: "unloaded";
+    id: number;
+} | {
+    op: "is-loaded";
+    id: number;
+    value: boolean;
+};
+
 export { }
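A sketch of wiring the two advanced exports together by hand, per the "advanced scenarios" note on `createInferenceWorker`. The model id is reused from the examples above, and we assume `create()` performs the load on a supplied `engine` override, as the `LMTaskCreateOptions` docs imply:

```ts
import { Completion, WorkerEngine, createInferenceWorker } from "localm-web";

// Roughly equivalent to Completion.create(modelId, { inWorker: true }), spelled out.
const engine = new WorkerEngine(createInferenceWorker());
const comp = await Completion.create("qwen2.5-1.5b-int4", { engine });

const result = await comp.predict("Once upon a time");
console.log(result.text, result.finishReason);

await comp.unload();
engine.terminate(); // we own the worker's lifecycle when we construct it ourselves
```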