localm-web 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -10,6 +10,26 @@
 export declare class BackendNotAvailableError extends LocalmWebError {
 }
 
+/** Snapshot of a single cached model's metadata. */
+export declare interface CachedModelEntry {
+    /** Friendly id from the registry (e.g. `"llama-3.2-1b-int4"`). */
+    id: string;
+    /** Backend-specific id (e.g. WebLLM `webllmId`). */
+    backendId: string;
+    /** Human-readable family name. */
+    family: string;
+    /** Approx parameter count, e.g. `"1B"`. */
+    parameters: string;
+}
+
+/** Aggregate storage usage reported by the browser. */
+export declare interface CacheUsage {
+    /** Bytes used by the entire origin's storage (not just our cache). */
+    usage: number;
+    /** Bytes the browser is willing to give the origin. */
+    quota: number;
+}
+
 /**
  * Multi-turn chat task.
  *
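These two shapes are consumed by the `ModelCache` class added later in this diff. A minimal sketch of rendering them in a settings UI; the `describeCache` helper and the `"localm-web"` import path are our own illustration, everything else is declared in this file:

```ts
import { ModelCache, type CachedModelEntry, type CacheUsage } from "localm-web";

// Hypothetical UI helper, not part of the package.
function describeCache(models: CachedModelEntry[], usage: CacheUsage): string {
  const mb = (bytes: number) => (bytes / 1024 / 1024).toFixed(1);
  const names = models.map((m) => `${m.family} (${m.parameters})`).join(", ");
  return `${names || "no models cached"}: ${mb(usage.usage)} of ${mb(usage.quota)} MB used`;
}

const cache = new ModelCache();
console.log(describeCache(await cache.list(), await cache.estimateUsage()));
```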
@@ -111,6 +131,99 @@ export declare class ChatReply {
  */
 export declare function collectStream(stream: AsyncIterable<TokenChunk>): Promise<string>;
 
+/**
+ * Raw text-completion task.
+ *
+ * Unlike {@link Chat}, `Completion` does not maintain a conversation history
+ * and does not apply a chat template. The prompt is fed to the model verbatim
+ * and the model continues it. Useful for "Once upon a time…" style generation,
+ * code completion, or any scenario where chat formatting would interfere.
+ *
+ * Use {@link Completion.create} to construct an instance — the constructor is
+ * private.
+ *
+ * @example
+ * ```ts
+ * const comp = await Completion.create("qwen2.5-1.5b-int4");
+ * const result = await comp.predict("Once upon a time", { maxTokens: 50 });
+ * console.log(result.text);
+ * ```
+ *
+ * @example Streaming
+ * ```ts
+ * const controller = new AbortController();
+ * for await (const token of comp.stream("def fibonacci(n):", { signal: controller.signal })) {
+ *   process.stdout.write(token.text);
+ * }
+ * ```
+ */
+export declare class Completion extends LMTask {
+    private constructor();
+    /**
+     * Create and load a `Completion` task for the given model.
+     *
+     * @param modelId - Friendly model id from the registry (e.g. `"qwen2.5-1.5b-int4"`).
+     * @param options - Optional creation options (progress callback, engine override).
+     */
+    static create(modelId: string, options?: LMTaskCreateOptions): Promise<Completion>;
+    /**
+     * Generate a continuation for the given prompt.
+     *
+     * @param prompt - Raw text fed to the model.
+     * @param options - Generation options.
+     * @returns A {@link CompletionResult} with the generated continuation.
+     */
+    predict(prompt: string, options?: GenerationOptions): Promise<CompletionResult>;
+    /**
+     * Stream a continuation for the given prompt as an async iterable of token
+     * chunks.
+     *
+     * @param prompt - Raw text fed to the model.
+     * @param options - Generation options including an optional `signal`.
+     */
+    stream(prompt: string, options?: GenerationOptions): AsyncIterable<TokenChunk>;
+}
+
+/**
+ * Result returned by `Completion.predict()`.
+ *
+ * Holds the generated continuation text (the prompt itself is not included)
+ * plus metadata about the generation loop.
+ */
+export declare class CompletionResult {
+    /** The generated text (continuation only, prompt excluded). */
+    readonly text: string;
+    /** The original prompt that was fed to the model. */
+    readonly prompt: string;
+    /** Number of tokens generated. 0 when the engine does not report it. */
+    readonly tokensGenerated: number;
+    /** Why the generation loop stopped. */
+    readonly finishReason: FinishReason;
+    constructor(
+    /** The generated text (continuation only, prompt excluded). */
+    text: string,
+    /** The original prompt that was fed to the model. */
+    prompt: string,
+    /** Number of tokens generated. 0 when the engine does not report it. */
+    tokensGenerated: number,
+    /** Why the generation loop stopped. */
+    finishReason: FinishReason);
+}
+
+/**
+ * Spawn a new inference Web Worker.
+ *
+ * Uses Vite/webpack-friendly `new Worker(new URL(...), { type: "module" })`
+ * syntax. The bundler emits the worker as a separate ES module chunk.
+ *
+ * Consumers normally do not call this directly — `LMTask.create()` invokes it
+ * when `inWorker: true` is set. It is exported for advanced scenarios (custom
+ * worker management, pooling, lifecycle integration with a host app).
+ *
+ * @returns A {@link WorkerLike}-compatible Worker instance.
+ */
+export declare function createInferenceWorker(): WorkerLike;
+
 /**
  * Runtime-agnostic inference contract.
  *
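Where the result metadata is not needed, the new task composes with the existing `collectStream` helper shown above. A minimal sketch; the prompt string and the `"localm-web"` import path are our own assumptions:

```ts
import { Completion, collectStream } from "localm-web";

const comp = await Completion.create("qwen2.5-1.5b-int4");

// Buffer the token stream into a single string; like predict(), minus
// the CompletionResult metadata.
const text = await collectStream(comp.stream("function add(a, b) {"));
console.log(text);

await comp.unload(); // release engine resources when done
```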
@@ -148,6 +261,32 @@ export declare interface Engine {
      * @throws GenerationAbortedError if `options.signal` is triggered.
      */
     stream(messages: Message[], options?: GenerationOptions): AsyncIterable<TokenChunk>;
+    /**
+     * Generate a non-streaming raw text completion.
+     *
+     * Unlike {@link Engine.generate}, this skips the chat template and feeds the
+     * prompt to the underlying model verbatim. Useful for "Once upon a time…"
+     * style continuation.
+     *
+     * @param prompt - Raw text fed to the model.
+     * @param options - Generation options.
+     * @returns The full generated text (excluding the prompt).
+     * @throws ModelNotLoadedError if called before {@link Engine.load}.
+     * @throws GenerationAbortedError if `options.signal` is triggered.
+     */
+    complete(prompt: string, options?: GenerationOptions): Promise<string>;
+    /**
+     * Stream a raw text completion as an async iterable of token chunks.
+     *
+     * Unlike {@link Engine.stream}, this skips the chat template.
+     *
+     * @param prompt - Raw text fed to the model.
+     * @param options - Generation options.
+     * @returns Async iterable yielding token chunks. The final chunk has `done: true`.
+     * @throws ModelNotLoadedError if called before {@link Engine.load}.
+     * @throws GenerationAbortedError if `options.signal` is triggered.
+     */
+    streamCompletion(prompt: string, options?: GenerationOptions): AsyncIterable<TokenChunk>;
     /** Release any resources held by the engine. Safe to call when not loaded. */
     unload(): Promise<void>;
     /** Whether a model is currently loaded and ready for inference. */
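For callers holding a raw `Engine` reference, the abort contract mirrors the chat-side methods. A sketch of a capped preview; the `preview` helper, the 20-chunk limit, and catching by error name are our own illustration, while the throw-on-abort behavior follows the `@throws` notes above:

```ts
import type { Engine } from "localm-web";

// Stop a raw completion after 20 chunks by firing the AbortSignal.
async function preview(engine: Engine, prompt: string): Promise<string> {
  const controller = new AbortController();
  let out = "";
  let count = 0;
  try {
    for await (const chunk of engine.streamCompletion(prompt, { signal: controller.signal })) {
      out += chunk.text;
      if (++count >= 20) controller.abort();
    }
  } catch (err) {
    // Per the @throws contract, the triggered signal surfaces as GenerationAbortedError.
    if ((err as Error).name !== "GenerationAbortedError") throw err;
  }
  return out;
}
```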
@@ -213,6 +352,7 @@ export declare abstract class LMTask {
      * @param options - Task creation options.
      */
     protected static createEngine(modelId: string, options?: LMTaskCreateOptions): Promise<ResolvedEngine>;
+    private static defaultEngine;
     /** Release engine resources. Safe to call multiple times. */
     unload(): Promise<void>;
     /** Whether the underlying engine has a loaded model. */
@@ -228,6 +368,15 @@ export declare interface LMTaskCreateOptions {
      * Production callers should let the SDK pick a backend automatically.
      */
     engine?: Engine;
+    /**
+     * Run inference inside a Web Worker, isolating the UI thread from
+     * tokenization and generation. Defaults to `false` in v0.2 (opt-in) and
+     * will flip to `true` in v0.3 once the Cache API / OPFS integration
+     * (also v0.2) has been validated against worker-thread storage access.
+     *
+     * Ignored when {@link engine} is provided.
+     */
+    inWorker?: boolean;
 }
 
 /**
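A sketch of opting in. `inWorker` and the model id come from this diff; the `onProgress` option name is an assumption based on the "progress callback" wording in `create()`'s docs:

```ts
import { Completion } from "localm-web";

// Run tokenization and generation off the UI thread (opt-in in v0.2).
const comp = await Completion.create("qwen2.5-1.5b-int4", {
  inWorker: true,
  onProgress: (p) => console.log(p.phase, p.loaded, p.total), // assumed option name
});
```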
@@ -269,10 +418,110 @@ export declare interface Message {
  */
 export declare const MODEL_PRESETS: Readonly<Record<string, ModelPreset>>;
 
+/**
+ * Inspect and manage cached model weights.
+ *
+ * `localm-web` does not download or cache weights itself — that work is owned
+ * by `@mlc-ai/web-llm`, which writes to the browser Cache API. `ModelCache`
+ * is a thin wrapper that lets a consuming app surface cache state in its UI:
+ * "this model is downloaded", "you have 1.4 GB cached, free up space?",
+ * "clear all models on logout".
+ *
+ * @example
+ * ```ts
+ * const cache = new ModelCache();
+ * if (await cache.has("llama-3.2-1b-int4")) {
+ *   console.log("ready offline");
+ * }
+ * const cached = await cache.list();
+ * await cache.delete("phi-3.5-mini-int4");
+ * const usage = await cache.estimateUsage();
+ * console.log(`${usage.usage} / ${usage.quota} bytes`);
+ * ```
+ */
+export declare class ModelCache {
+    private readonly hasModelHook;
+    private readonly deleteModelHook;
+    private readonly estimateHook;
+    constructor(options?: ModelCacheOptions);
+    /**
+     * Whether the model's weights are present in the browser cache.
+     *
+     * @param modelId - Friendly id from the registry.
+     * @throws UnknownModelError if `modelId` is not in the registry.
+     */
+    has(modelId: string): Promise<boolean>;
+    /**
+     * Delete a single model's weights from the browser cache. No-op when the
+     * model is not cached.
+     *
+     * @param modelId - Friendly id from the registry.
+     * @throws UnknownModelError if `modelId` is not in the registry.
+     */
+    delete(modelId: string): Promise<void>;
+    /**
+     * List the registry models that are currently cached.
+     *
+     * Iterates `MODEL_PRESETS` and probes each one. Only returns models known
+     * to the SDK — models cached by external WebLLM calls outside our registry
+     * are not included.
+     *
+     * @returns Empty list when nothing is cached.
+     */
+    list(): Promise<CachedModelEntry[]>;
+    /**
+     * Delete every registry model from the cache. Useful for logout flows or
+     * "reset" buttons. Models cached outside the registry are not touched.
+     */
+    clear(): Promise<void>;
+    /**
+     * Aggregate storage stats from the browser. Returned numbers cover the
+     * entire origin (Cache API + IndexedDB + Service Workers + OPFS), not
+     * just our model cache — use it for "you have X of Y available" hints.
+     */
+    estimateUsage(): Promise<CacheUsage>;
+    /**
+     * Throw a descriptive error if the given id is not in the registry.
+     * Exposed for code paths that want to validate before calling other
+     * methods (those already throw on their own).
+     *
+     * @throws UnknownModelError
+     */
+    static assertKnown(modelId: string): void;
+}
+
+/**
+ * Hooks the {@link ModelCache} uses to talk to the underlying runtime and
+ * the browser. Tests inject mocks; production code leaves them undefined,
+ * letting `ModelCache` resolve the real `@mlc-ai/web-llm` helpers and
+ * `navigator.storage.estimate()` lazily.
+ */
+export declare interface ModelCacheOptions {
+    /** Override `hasModelInCache` from the runtime. */
+    hasModel?: (backendId: string) => Promise<boolean>;
+    /** Override `deleteModelInCache` from the runtime. */
+    deleteModel?: (backendId: string) => Promise<void>;
+    /** Override `navigator.storage.estimate()`. */
+    estimate?: () => Promise<CacheUsage>;
+}
+
 /** Thrown when a model fails to load (network, parsing, runtime init). */
 export declare class ModelLoadError extends LocalmWebError {
 }
 
+/**
+ * Lifecycle phase of a model load.
+ *
+ * - `downloading`: weight files are being fetched from the network or cache.
+ * - `compiling`: the runtime is preparing the model (shader compilation,
+ *   tensor allocation, KV cache setup).
+ * - `loading`: a generic "still working" phase reported by the runtime when
+ *   it has not classified the work into download or compile.
+ * - `ready`: the model is loaded and the engine is ready for inference.
+ *   Emitted exactly once, at the end of a successful load.
+ */
+export declare type ModelLoadPhase = "downloading" | "compiling" | "loading" | "ready";
+
 /** Progress event emitted while a model is loading. */
 export declare interface ModelLoadProgress {
     /** Fraction of total work completed, in [0, 1]. */
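The injection hooks make `ModelCache` testable without a real browser cache. A unit-test sketch; the backend id string and the byte figures are made up:

```ts
import { ModelCache } from "localm-web";

// Drive ModelCache with in-memory fakes instead of the real
// @mlc-ai/web-llm helpers and navigator.storage.estimate().
const cached = new Set(["Llama-3.2-1B-Instruct-q4f16_1-MLC"]); // hypothetical backendId
const cache = new ModelCache({
  hasModel: async (backendId) => cached.has(backendId),
  deleteModel: async (backendId) => { cached.delete(backendId); },
  estimate: async () => ({ usage: 1_400_000_000, quota: 10_000_000_000 }),
});
```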
@@ -283,6 +532,8 @@ export declare interface ModelLoadProgress {
     loaded: number;
     /** Total bytes to load. 0 when unavailable. */
     total: number;
+    /** Lifecycle phase classified from the runtime's status text. */
+    phase: ModelLoadPhase;
 }
 
 /** Thrown when an inference call is made before a model has loaded. */
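A sketch of mapping the new field to status copy in a loading screen; the label strings are our own, the field names come from `ModelLoadProgress` and `ModelLoadPhase` above:

```ts
import type { ModelLoadPhase, ModelLoadProgress } from "localm-web";

const labels: Record<ModelLoadPhase, string> = {
  downloading: "Downloading weights",
  compiling: "Compiling model",
  loading: "Loading",
  ready: "Ready",
};

function onProgress(p: ModelLoadProgress): void {
  // total is 0 when byte counts are unavailable; fall back to the phase alone.
  const pct = p.total > 0 ? ` (${Math.round((p.loaded / p.total) * 100)}%)` : "";
  console.log(`${labels[p.phase]}${pct}`);
}
```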
@@ -337,6 +588,14 @@ export declare function resolveModelPreset(modelId: string): ModelPreset;
 /** Conversation roles supported by chat templates. */
 export declare type Role = "system" | "user" | "assistant" | "tool";
 
+/**
+ * Subset of {@link GenerationOptions} that survives `postMessage`.
+ *
+ * `AbortSignal` cannot be cloned across the worker boundary, so it is replaced
+ * by a separate `abort` {@link WorkerRequest} message keyed on the same operation id.
+ */
+declare type SerializableGenerationOptions = Omit<GenerationOptions, "signal">;
+
 /**
  * Wrap an async iterable so that each `TokenChunk` is also passed to a
  * caller-supplied side-effect callback before being yielded downstream.
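The type itself is internal, but the pattern it documents is visible in the `WorkerRequest` union later in this diff. A sketch of the sending side; the `postStream` helper and the id bookkeeping are our own illustration:

```ts
import type { GenerationOptions, Message, WorkerLike } from "localm-web";

// Strip the non-cloneable AbortSignal before posting, then relay an abort
// as its own message keyed on the same operation id.
function postStream(worker: WorkerLike, id: number, messages: Message[], opts: GenerationOptions = {}): void {
  const { signal, ...options } = opts; // options: SerializableGenerationOptions
  worker.postMessage({ op: "stream", id, messages, options });
  signal?.addEventListener("abort", () => worker.postMessage({ op: "abort", id }));
}
```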
@@ -370,4 +629,119 @@ export declare const VERSION: string;
 export declare class WebGPUUnavailableError extends LocalmWebError {
 }
 
+/**
+ * Engine implementation that proxies all calls to a Web Worker.
+ *
+ * The worker holds the actual {@link WebLLMEngine}; this class is a thin RPC
+ * shell that serializes requests, tracks pending operations by a numeric id,
+ * and turns worker responses back into Promises and async iterables.
+ *
+ * Use {@link createInferenceWorker} to obtain a real worker. Tests can pass a
+ * {@link WorkerLike} mock implementing the same `postMessage` /
+ * `addEventListener` surface.
+ */
+export declare class WorkerEngine implements Engine {
+    private readonly worker;
+    private nextId;
+    private loaded;
+    private currentLoad;
+    private currentLoadId;
+    private currentLoadProgress;
+    private currentUnload;
+    private currentUnloadId;
+    private pendingGenerates;
+    private pendingStreams;
+    private readonly listener;
+    constructor(worker: WorkerLike);
+    isLoaded(): boolean;
+    load(modelId: string, onProgress?: ProgressCallback): Promise<void>;
+    generate(messages: Message[], options?: GenerationOptions): Promise<string>;
+    stream(messages: Message[], options?: GenerationOptions): AsyncIterable<TokenChunk>;
+    complete(prompt: string, options?: GenerationOptions): Promise<string>;
+    streamCompletion(prompt: string, options?: GenerationOptions): AsyncIterable<TokenChunk>;
+    unload(): Promise<void>;
+    /** Tear down the underlying worker. The engine is unusable after this. */
+    terminate(): void;
+    private allocateId;
+    private send;
+    private handleMessage;
+}
+
+/** Subset of `Worker` we depend on. Lets tests inject a mock. */
+export declare interface WorkerLike {
+    postMessage(message: WorkerRequest): void;
+    addEventListener(type: "message", listener: (event: MessageEvent<WorkerResponse>) => void): void;
+    removeEventListener(type: "message", listener: (event: MessageEvent<WorkerResponse>) => void): void;
+    terminate(): void;
+}
+
+/** Operation request sent from the main thread to the worker. */
+declare type WorkerRequest = {
+    op: "load";
+    id: number;
+    modelId: string;
+} | {
+    op: "generate";
+    id: number;
+    messages: Message[];
+    options: SerializableGenerationOptions;
+} | {
+    op: "stream";
+    id: number;
+    messages: Message[];
+    options: SerializableGenerationOptions;
+} | {
+    op: "complete";
+    id: number;
+    prompt: string;
+    options: SerializableGenerationOptions;
+} | {
+    op: "stream-completion";
+    id: number;
+    prompt: string;
+    options: SerializableGenerationOptions;
+} | {
+    op: "abort";
+    id: number;
+} | {
+    op: "unload";
+    id: number;
+} | {
+    op: "isLoaded";
+    id: number;
+};
+
+/** Operation response sent from the worker back to the main thread. */
+declare type WorkerResponse = {
+    op: "loaded";
+    id: number;
+} | {
+    op: "generated";
+    id: number;
+    text: string;
+} | {
+    op: "progress";
+    id: number;
+    payload: ModelLoadProgress;
+} | {
+    op: "token";
+    id: number;
+    chunk: TokenChunk;
+} | {
+    op: "stream-end";
+    id: number;
+} | {
+    op: "error";
+    id: number;
+    name: string;
+    message: string;
+} | {
+    op: "unloaded";
+    id: number;
+} | {
+    op: "is-loaded";
+    id: number;
+    value: boolean;
+};
+
 export { }
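A sketch of wiring the two advanced exports together by hand, per the "advanced scenarios" note on `createInferenceWorker`. The model id is reused from the examples above, and we assume `create()` performs the load on a supplied `engine` override, as the `LMTaskCreateOptions` docs imply:

```ts
import { Completion, WorkerEngine, createInferenceWorker } from "localm-web";

// Roughly equivalent to Completion.create(modelId, { inWorker: true }), spelled out.
const engine = new WorkerEngine(createInferenceWorker());
const comp = await Completion.create("qwen2.5-1.5b-int4", { engine });

const result = await comp.predict("Once upon a time");
console.log(result.text, result.finishReason);

await comp.unload();
engine.terminate(); // we own the worker's lifecycle when we construct it ourselves
```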