@sauravpanda/flare 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/demo/README.md +40 -0
- package/demo/index.html +1767 -0
- package/js/index.ts +91 -0
- package/js/types.ts +136 -0
- package/js/webtransport-loader.js +126 -0
- package/js/worker.ts +159 -0
- package/package.json +58 -0
- package/pkg/flare_web.d.ts +1164 -0
- package/pkg/flare_web.js +2790 -0
- package/pkg/flare_web_bg.wasm +0 -0
- package/pkg/flare_web_bg.wasm.d.ts +105 -0
- package/pkg/package.json +27 -0
package/js/index.ts
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module @aspect/flare
|
|
3
|
+
*
|
|
4
|
+
* WASM-first LLM inference engine with WebGPU acceleration.
|
|
5
|
+
*
|
|
6
|
+
* @example
|
|
7
|
+
* ```typescript
|
|
8
|
+
* import { Flare } from '@aspect/flare';
|
|
9
|
+
*
|
|
10
|
+
* const flare = await Flare.init();
|
|
11
|
+
* console.log('WebGPU available:', flare.webgpuAvailable);
|
|
12
|
+
* ```
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
// Re-export WASM bindings (generated by wasm-pack)
|
|
16
|
+
export { webgpu_available, device_info, init } from '../pkg/flare_web';
|
|
17
|
+
|
|
18
|
+
export interface FlareConfig {
|
|
19
|
+
/** URL to fetch the GGUF model from */
|
|
20
|
+
modelUrl?: string;
|
|
21
|
+
/** Whether to cache model weights in browser Cache API */
|
|
22
|
+
cache?: boolean;
|
|
23
|
+
/** Progress callback: (loaded bytes, total bytes) */
|
|
24
|
+
onProgress?: (loaded: number, total: number) => void;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface GenerateOptions {
|
|
28
|
+
/** Input prompt text */
|
|
29
|
+
prompt: string;
|
|
30
|
+
/** Maximum tokens to generate */
|
|
31
|
+
maxTokens?: number;
|
|
32
|
+
/** Sampling temperature (0 = greedy, higher = more random) */
|
|
33
|
+
temperature?: number;
|
|
34
|
+
/** Top-p (nucleus) sampling threshold */
|
|
35
|
+
topP?: number;
|
|
36
|
+
/** Top-k sampling limit */
|
|
37
|
+
topK?: number;
|
|
38
|
+
/** Repeat penalty for previously generated tokens */
|
|
39
|
+
repeatPenalty?: number;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export interface ChatMessage {
|
|
43
|
+
role: 'system' | 'user' | 'assistant';
|
|
44
|
+
content: string;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
export interface ChatOptions {
|
|
48
|
+
messages: ChatMessage[];
|
|
49
|
+
maxTokens?: number;
|
|
50
|
+
temperature?: number;
|
|
51
|
+
topP?: number;
|
|
52
|
+
stream?: boolean;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* High-level Flare engine interface.
|
|
57
|
+
* Wraps the low-level WASM bindings with a friendly API.
|
|
58
|
+
*/
|
|
59
|
+
export class Flare {
|
|
60
|
+
private constructor() {}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Initialize the Flare engine.
|
|
64
|
+
* Detects WebGPU availability and sets up compute pipelines.
|
|
65
|
+
*/
|
|
66
|
+
static async init(): Promise<Flare> {
|
|
67
|
+
const { init } = await import('../pkg/flare_web');
|
|
68
|
+
await init();
|
|
69
|
+
return new Flare();
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/** Check if WebGPU is available in this browser. */
|
|
73
|
+
get webgpuAvailable(): boolean {
|
|
74
|
+
try {
|
|
75
|
+
const { webgpu_available } = require('../pkg/flare_web');
|
|
76
|
+
return webgpu_available();
|
|
77
|
+
} catch {
|
|
78
|
+
return false;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/** Get device capability info as JSON. */
|
|
83
|
+
get deviceInfo(): Record<string, unknown> {
|
|
84
|
+
try {
|
|
85
|
+
const { device_info } = require('../pkg/flare_web');
|
|
86
|
+
return JSON.parse(device_info());
|
|
87
|
+
} catch {
|
|
88
|
+
return { webgpu: false };
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
package/js/types.ts
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TypeScript type definitions for the Flare WASM module.
|
|
3
|
+
* These map to the wasm-bindgen exports from flare-web.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/** Check if WebGPU is available in this browser. */
|
|
7
|
+
export declare function webgpu_available(): boolean;
|
|
8
|
+
|
|
9
|
+
/** Get JSON string with device capabilities. */
|
|
10
|
+
export declare function device_info(): string;
|
|
11
|
+
|
|
12
|
+
/** Initialize the Flare engine (async). */
|
|
13
|
+
export declare function init(): Promise<string>;
|
|
14
|
+
|
|
15
|
+
/** Progress callback: (loaded_bytes, total_bytes). total_bytes is 0 when unknown. */
|
|
16
|
+
export type ProgressCallback = (loaded: number, total: number) => void;
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* The Flare inference engine.
|
|
20
|
+
* Load a GGUF model, then use the streaming API (begin_stream / next_token)
|
|
21
|
+
* or the batch API (generate_tokens) to run inference.
|
|
22
|
+
*/
|
|
23
|
+
export declare class FlareEngine {
|
|
24
|
+
/** Load a GGUF model from raw bytes. */
|
|
25
|
+
static load(ggufBytes: Uint8Array): FlareEngine;
|
|
26
|
+
/** Reset the KV cache (start a new conversation). */
|
|
27
|
+
reset(): void;
|
|
28
|
+
/** Vocabulary size. */
|
|
29
|
+
readonly vocab_size: number;
|
|
30
|
+
/** Number of transformer layers. */
|
|
31
|
+
readonly num_layers: number;
|
|
32
|
+
/** Hidden dimension. */
|
|
33
|
+
readonly hidden_dim: number;
|
|
34
|
+
/**
|
|
35
|
+
* Auto-detected chat template name: "ChatML", "Llama3", "Alpaca", or "Raw".
|
|
36
|
+
* Detected from the GGUF tokenizer.chat_template metadata, falling back to
|
|
37
|
+
* architecture-based detection.
|
|
38
|
+
*/
|
|
39
|
+
readonly chat_template_name: string;
|
|
40
|
+
/**
|
|
41
|
+
* EOS token ID read from the GGUF model metadata, if present.
|
|
42
|
+
* The generator stops automatically when this token is produced.
|
|
43
|
+
*/
|
|
44
|
+
readonly eos_token_id: number | undefined;
|
|
45
|
+
/**
|
|
46
|
+
* Format a user message and optional system prompt using the model's chat
|
|
47
|
+
* template. Pass the result to FlareTokenizer.encode() before generating.
|
|
48
|
+
* Pass an empty string for systemMessage to omit the system turn.
|
|
49
|
+
*/
|
|
50
|
+
apply_chat_template(userMessage: string, systemMessage: string): string;
|
|
51
|
+
|
|
52
|
+
// --- Token-by-token streaming API ---
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* Prepare for token-by-token streaming. Runs the prefill pass on
|
|
56
|
+
* promptTokens and initialises internal streaming state. Call engine.reset()
|
|
57
|
+
* first to start a fresh conversation, then call next_token() in a
|
|
58
|
+
* requestAnimationFrame loop.
|
|
59
|
+
*/
|
|
60
|
+
begin_stream(promptTokens: Uint32Array, maxTokens: number): void;
|
|
61
|
+
/**
|
|
62
|
+
* Generate the next token and return its ID, or undefined when the stream is
|
|
63
|
+
* complete (EOS reached, maxTokens exhausted, or stop_stream() was called).
|
|
64
|
+
* Call inside requestAnimationFrame so the browser can update the DOM between
|
|
65
|
+
* tokens.
|
|
66
|
+
*/
|
|
67
|
+
next_token(): number | undefined;
|
|
68
|
+
/** Signal the current stream to stop after the next next_token() call. */
|
|
69
|
+
stop_stream(): void;
|
|
70
|
+
/** Whether the current stream has finished. */
|
|
71
|
+
readonly stream_done: boolean;
|
|
72
|
+
|
|
73
|
+
// --- Batch generation API (returns all tokens at once) ---
|
|
74
|
+
|
|
75
|
+
/** Generate tokens (greedy, temperature=0). Stops at EOS. Returns token ID array. */
|
|
76
|
+
generate_tokens(promptTokens: Uint32Array, maxTokens: number): Uint32Array;
|
|
77
|
+
/** Generate tokens with sampling parameters. Stops at EOS. */
|
|
78
|
+
generate_with_params(
|
|
79
|
+
promptTokens: Uint32Array,
|
|
80
|
+
maxTokens: number,
|
|
81
|
+
temperature: number,
|
|
82
|
+
topP: number
|
|
83
|
+
): Uint32Array;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* Progressive loader: fetches a GGUF model from a URL with streaming download
|
|
88
|
+
* progress, then parses and returns a FlareEngine.
|
|
89
|
+
*/
|
|
90
|
+
export declare class FlareProgressiveLoader {
|
|
91
|
+
constructor(url: string);
|
|
92
|
+
/** Fetch, stream, and parse the model. Calls onProgress as chunks arrive. */
|
|
93
|
+
load(onProgress: ProgressCallback): Promise<FlareEngine>;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* BPE tokenizer: encode text to token IDs and decode token IDs back to text.
|
|
98
|
+
* Load from a HuggingFace tokenizer.json string.
|
|
99
|
+
*/
|
|
100
|
+
export declare class FlareTokenizer {
|
|
101
|
+
/** Load from a tokenizer.json string. */
|
|
102
|
+
static from_json(json: string): FlareTokenizer;
|
|
103
|
+
/** Encode text to token IDs. */
|
|
104
|
+
encode(text: string): Uint32Array;
|
|
105
|
+
/** Decode token IDs to text. */
|
|
106
|
+
decode(tokens: Uint32Array): string;
|
|
107
|
+
/** Decode a single token ID to text (for streaming). */
|
|
108
|
+
decode_one(tokenId: number): string;
|
|
109
|
+
/** BOS token ID (may be undefined). */
|
|
110
|
+
readonly bos_token_id: number | undefined;
|
|
111
|
+
/** EOS token ID (may be undefined). */
|
|
112
|
+
readonly eos_token_id: number | undefined;
|
|
113
|
+
/** Vocabulary size. */
|
|
114
|
+
readonly vocab_size: number;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* BPE tokenizer: encode text to token IDs and decode token IDs back to text.
|
|
119
|
+
* Load from a HuggingFace tokenizer.json string.
|
|
120
|
+
*/
|
|
121
|
+
export declare class FlareTokenizer {
|
|
122
|
+
/** Load from a tokenizer.json string. */
|
|
123
|
+
static from_json(json: string): FlareTokenizer;
|
|
124
|
+
/** Encode text to token IDs. */
|
|
125
|
+
encode(text: string): Uint32Array;
|
|
126
|
+
/** Decode token IDs to text. */
|
|
127
|
+
decode(tokens: Uint32Array): string;
|
|
128
|
+
/** Decode a single token ID to text (for streaming). */
|
|
129
|
+
decode_one(tokenId: number): string;
|
|
130
|
+
/** BOS token ID (may be undefined). */
|
|
131
|
+
readonly bos_token_id: number | undefined;
|
|
132
|
+
/** EOS token ID (may be undefined). */
|
|
133
|
+
readonly eos_token_id: number | undefined;
|
|
134
|
+
/** Vocabulary size. */
|
|
135
|
+
readonly vocab_size: number;
|
|
136
|
+
}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
 * Progressive model loader using WebTransport for parallel stream downloads.
 *
 * WebTransport is built on HTTP/3 QUIC and allows multiple bidirectional
 * streams over a single connection, avoiding the head-of-line blocking that
 * plagues HTTP/1.1/2 range requests when downloading large model files.
 *
 * NOTE ON SERVER SUPPORT:
 * Parallel stream downloads require a server that (1) speaks HTTP/3 and
 * exposes a WebTransport endpoint at the model URL, and (2) accepts a
 * per-stream protocol where the client sends a byte range and the server
 * streams those bytes back on the same bidirectional stream. No such server
 * ships with Flare today, so this loader transparently falls back to
 * streaming `fetch()`, which still provides progressive load and progress
 * callbacks over HTTP/1.1 or HTTP/2.
 *
 * Usage:
 *   const loader = new WebTransportLoader('https://example.com/model.gguf', 4);
 *   const bytes = await loader.load((loaded, total) => {
 *     console.log(`${loaded} / ${total}`);
 *   });
 */
export class WebTransportLoader {
  /**
   * @param {string} url Absolute URL to the model file.
   * @param {number} numStreams Number of parallel streams to attempt when
   *                            WebTransport + a cooperating server are
   *                            available. Ignored by the fetch fallback.
   */
  constructor(url, numStreams = 4) {
    this.url = url;
    this.numStreams = numStreams;
  }

  /**
   * Load the model bytes, invoking `onProgress(loaded, total)` as data
   * arrives. Returns a Uint8Array containing the full file.
   *
   * @param {(loaded: number, total: number) => void} [onProgress]
   * @returns {Promise<Uint8Array>}
   */
  async load(onProgress) {
    // No WebTransport in this browser at all — go straight to fetch streaming.
    if (typeof WebTransport !== 'undefined') {
      try {
        const session = new WebTransport(this.url);
        await session.ready;

        // A server-side protocol for parallel range streaming is not yet
        // standardized in this project. Once a Flare WebTransport server
        // exists, this branch should:
        //   1. HEAD the resource (or open a control stream) to learn the
        //      total byte length.
        //   2. Open `this.numStreams` bidirectional streams via
        //      `session.createBidirectionalStream()`.
        //   3. Send a framed { offset, length } request on each stream.
        //   4. Reassemble the chunks in offset order as they arrive,
        //      invoking `onProgress` on every chunk.
        //
        // Until then we close the session and fall back to fetch — the QUIC
        // handshake is wasted work but harmless. Log so anyone instrumenting
        // this path can see it.
        console.info(
          'WebTransport session opened, but no parallel-range server ' +
            'protocol is implemented yet; falling back to fetch().'
        );
        await session.close();
      } catch (e) {
        console.warn('WebTransport failed, falling back to fetch:', e);
      }
    }

    return this.loadViaFetch(onProgress);
  }

  /**
   * Fallback path: stream the body via `fetch()` and report progress from
   * the Content-Length header. Works on any HTTP/1.1+ origin.
   *
   * @param {(loaded: number, total: number) => void} [onProgress]
   * @returns {Promise<Uint8Array>}
   */
  async loadViaFetch(onProgress) {
    const res = await fetch(this.url);
    if (!res.ok) {
      throw new Error(`Failed to fetch model: ${res.status} ${res.statusText}`);
    }

    // Non-streaming environment (very old browser or CORS-restricted).
    if (!res.body) {
      const whole = new Uint8Array(await res.arrayBuffer());
      if (onProgress) onProgress(whole.length, whole.length);
      return whole;
    }

    const total = parseInt(res.headers.get('content-length') || '0', 10);
    const reader = res.body.getReader();
    const parts = [];
    let received = 0;

    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      parts.push(value);
      received += value.length;
      if (onProgress) onProgress(received, total);
    }

    // Stitch the chunks into one contiguous Uint8Array.
    const out = new Uint8Array(received);
    let cursor = 0;
    for (const part of parts) {
      out.set(part, cursor);
      cursor += part.length;
    }
    return out;
  }
}
|
package/js/worker.ts
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web Worker bootstrap for running Flare inference off the main thread.
|
|
3
|
+
*
|
|
4
|
+
* Architecture:
|
|
5
|
+
* - Main thread sends messages: { type: 'init' | 'generate', ... }
|
|
6
|
+
* - Worker runs WASM inference and posts back tokens as they're generated
|
|
7
|
+
* - All GPU operations happen in the worker (WebGPU is available in workers)
|
|
8
|
+
*
|
|
9
|
+
* Usage from main thread:
|
|
10
|
+
* ```typescript
|
|
11
|
+
* const worker = new Worker(new URL('./worker.ts', import.meta.url), { type: 'module' });
|
|
12
|
+
* worker.postMessage({ type: 'init' });
|
|
13
|
+
* worker.postMessage({ type: 'generate', prompt: 'Hello', maxTokens: 128 });
|
|
14
|
+
* worker.onmessage = (e) => {
|
|
15
|
+
* if (e.data.type === 'token') console.log(e.data.text);
|
|
16
|
+
* if (e.data.type === 'done') console.log('Generation complete');
|
|
17
|
+
* };
|
|
18
|
+
* ```
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
// Message types from main thread to worker

/** Ask the worker to import and initialise the WASM module. */
interface InitMessage {
  type: 'init';
  // NOTE(review): handleInit currently ignores both optional fields —
  // confirm before relying on them from the main thread.
  modelUrl?: string;
  wasmUrl?: string;
}

/** Ask the worker to run one generation pass with the given sampling params. */
interface GenerateMessage {
  type: 'generate';
  prompt: string;
  maxTokens?: number;
  temperature?: number;
  topP?: number;
  topK?: number;
}

/** Ask the worker to stop the in-flight generation loop. */
interface AbortMessage {
  type: 'abort';
}

type IncomingMessage = InitMessage | GenerateMessage | AbortMessage;

// Message types from worker to main thread

/** Sent once after a successful init; reports WebGPU availability. */
interface ReadyMessage {
  type: 'ready';
  webgpu: boolean;
}

/** One generated token, streamed as generation progresses. */
interface TokenMessage {
  type: 'token';
  text: string;
  tokenId: number;
}

/** Generation finished; carries simple throughput stats. */
interface DoneMessage {
  type: 'done';
  totalTokens: number;
  tokensPerSecond: number;
}

/** Reported when init or generation fails. */
interface ErrorMessage {
  type: 'error';
  message: string;
}

/**
 * Model download progress in bytes.
 * NOTE(review): declared in the protocol but not currently posted anywhere
 * in this worker — presumably reserved for model loading; verify.
 */
interface ProgressMessage {
  type: 'progress';
  loaded: number;
  total: number;
}

type OutgoingMessage = ReadyMessage | TokenMessage | DoneMessage | ErrorMessage | ProgressMessage;
|
|
73
|
+
|
|
74
|
+
// Worker state
|
|
75
|
+
let initialized = false;
|
|
76
|
+
let aborted = false;
|
|
77
|
+
|
|
78
|
+
function postResult(msg: OutgoingMessage) {
|
|
79
|
+
(self as unknown as { postMessage(msg: OutgoingMessage): void }).postMessage(msg);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
async function handleInit(_msg: InitMessage) {
|
|
83
|
+
try {
|
|
84
|
+
// Import and initialize WASM module
|
|
85
|
+
// In a real build, this path comes from wasm-pack output
|
|
86
|
+
const flare = await import('../pkg/flare_web.js');
|
|
87
|
+
await flare.init();
|
|
88
|
+
|
|
89
|
+
const webgpu = flare.webgpu_available();
|
|
90
|
+
initialized = true;
|
|
91
|
+
|
|
92
|
+
postResult({ type: 'ready', webgpu });
|
|
93
|
+
} catch (err) {
|
|
94
|
+
postResult({
|
|
95
|
+
type: 'error',
|
|
96
|
+
message: `Init failed: ${err}`,
|
|
97
|
+
});
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
async function handleGenerate(msg: GenerateMessage) {
|
|
102
|
+
if (!initialized) {
|
|
103
|
+
postResult({ type: 'error', message: 'Worker not initialized. Send init message first.' });
|
|
104
|
+
return;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
aborted = false;
|
|
108
|
+
const startTime = performance.now();
|
|
109
|
+
let tokenCount = 0;
|
|
110
|
+
|
|
111
|
+
try {
|
|
112
|
+
// TODO: Wire up actual WASM inference here
|
|
113
|
+
// For now, simulate token generation to validate the worker protocol
|
|
114
|
+
const maxTokens = msg.maxTokens ?? 128;
|
|
115
|
+
|
|
116
|
+
for (let i = 0; i < maxTokens && !aborted; i++) {
|
|
117
|
+
// In real implementation: call flare WASM generate step
|
|
118
|
+
tokenCount++;
|
|
119
|
+
|
|
120
|
+
postResult({
|
|
121
|
+
type: 'token',
|
|
122
|
+
text: ' ',
|
|
123
|
+
tokenId: i,
|
|
124
|
+
});
|
|
125
|
+
|
|
126
|
+
// Yield to allow abort messages to be processed
|
|
127
|
+
await new Promise((resolve) => setTimeout(resolve, 0));
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
const elapsed = (performance.now() - startTime) / 1000;
|
|
131
|
+
postResult({
|
|
132
|
+
type: 'done',
|
|
133
|
+
totalTokens: tokenCount,
|
|
134
|
+
tokensPerSecond: tokenCount / elapsed,
|
|
135
|
+
});
|
|
136
|
+
} catch (err) {
|
|
137
|
+
postResult({
|
|
138
|
+
type: 'error',
|
|
139
|
+
message: `Generation failed: ${err}`,
|
|
140
|
+
});
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// Message handler
|
|
145
|
+
self.onmessage = async (event: MessageEvent<IncomingMessage>) => {
|
|
146
|
+
const msg = event.data;
|
|
147
|
+
|
|
148
|
+
switch (msg.type) {
|
|
149
|
+
case 'init':
|
|
150
|
+
await handleInit(msg);
|
|
151
|
+
break;
|
|
152
|
+
case 'generate':
|
|
153
|
+
await handleGenerate(msg);
|
|
154
|
+
break;
|
|
155
|
+
case 'abort':
|
|
156
|
+
aborted = true;
|
|
157
|
+
break;
|
|
158
|
+
}
|
|
159
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@sauravpanda/flare",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "WASM-first LLM inference engine with WebGPU acceleration — run LLMs in the browser with zero server costs",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "pkg/flare_web.js",
|
|
7
|
+
"module": "pkg/flare_web.js",
|
|
8
|
+
"types": "pkg/flare_web.d.ts",
|
|
9
|
+
"files": [
|
|
10
|
+
"pkg/flare_web_bg.wasm",
|
|
11
|
+
"pkg/flare_web_bg.wasm.d.ts",
|
|
12
|
+
"pkg/flare_web.js",
|
|
13
|
+
"pkg/flare_web.d.ts",
|
|
14
|
+
"pkg/package.json",
|
|
15
|
+
"js/",
|
|
16
|
+
"demo/"
|
|
17
|
+
],
|
|
18
|
+
"scripts": {
|
|
19
|
+
"build": "wasm-pack build --target web --out-dir pkg",
|
|
20
|
+
"build:node": "wasm-pack build --target nodejs --out-dir pkg-node",
|
|
21
|
+
"build:bundler": "wasm-pack build --target bundler --out-dir pkg-bundler",
|
|
22
|
+
"prepublishOnly": "npm run build"
|
|
23
|
+
},
|
|
24
|
+
"exports": {
|
|
25
|
+
".": {
|
|
26
|
+
"import": "./pkg/flare_web.js",
|
|
27
|
+
"types": "./pkg/flare_web.d.ts"
|
|
28
|
+
}
|
|
29
|
+
},
|
|
30
|
+
"keywords": [
|
|
31
|
+
"llm",
|
|
32
|
+
"wasm",
|
|
33
|
+
"webgpu",
|
|
34
|
+
"inference",
|
|
35
|
+
"ai",
|
|
36
|
+
"rust",
|
|
37
|
+
"browser",
|
|
38
|
+
"gguf",
|
|
39
|
+
"transformer"
|
|
40
|
+
],
|
|
41
|
+
"license": "MIT OR Apache-2.0",
|
|
42
|
+
"repository": {
|
|
43
|
+
"type": "git",
|
|
44
|
+
"url": "git+https://github.com/sauravpanda/flarellm.git",
|
|
45
|
+
"directory": "flare-web"
|
|
46
|
+
},
|
|
47
|
+
"bugs": {
|
|
48
|
+
"url": "https://github.com/sauravpanda/flarellm/issues"
|
|
49
|
+
},
|
|
50
|
+
"homepage": "https://github.com/sauravpanda/flarellm#readme",
|
|
51
|
+
"publishConfig": {
|
|
52
|
+
"access": "public",
|
|
53
|
+
"registry": "https://registry.npmjs.org/"
|
|
54
|
+
},
|
|
55
|
+
"engines": {
|
|
56
|
+
"node": ">=18"
|
|
57
|
+
}
|
|
58
|
+
}
|