npm - @tryhamster/gerbil - Versions diffs - 1.0.0-rc.8 → 1.0.0 - Mend

@tryhamster/gerbil 1.0.0-rc.8 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179) hide show

package/LICENSE +1 -1
package/README.md +247 -84
package/dist/architectures-C1I5V3Dt.mjs +6070 -0
package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
package/dist/browser/index.d.ts +264 -588
package/dist/browser/index.d.ts.map +1 -1
package/dist/browser/index.js +585 -2334
package/dist/browser/index.js.map +1 -1
package/dist/cli.mjs +625 -1098
package/dist/cli.mjs.map +1 -1
package/dist/defaults-9komdrbY.mjs +24 -0
package/dist/defaults-9komdrbY.mjs.map +1 -0
package/dist/frameworks/express.d.mts +1 -3
package/dist/frameworks/express.d.mts.map +1 -1
package/dist/frameworks/express.mjs +7 -7
package/dist/frameworks/express.mjs.map +1 -1
package/dist/frameworks/fastify.d.mts +1 -1
package/dist/frameworks/fastify.d.mts.map +1 -1
package/dist/frameworks/fastify.mjs +3 -3
package/dist/frameworks/fastify.mjs.map +1 -1
package/dist/frameworks/hono.d.mts +1 -1
package/dist/frameworks/hono.d.mts.map +1 -1
package/dist/frameworks/hono.mjs +4 -4
package/dist/frameworks/hono.mjs.map +1 -1
package/dist/frameworks/next.d.mts +3 -2
package/dist/frameworks/next.d.mts.map +1 -1
package/dist/frameworks/next.mjs +4 -4
package/dist/frameworks/next.mjs.map +1 -1
package/dist/frameworks/react.d.mts +1 -1
package/dist/frameworks/trpc.d.mts +1 -1
package/dist/frameworks/trpc.d.mts.map +1 -1
package/dist/frameworks/trpc.mjs +4 -4
package/dist/frameworks/trpc.mjs.map +1 -1
package/dist/gerbil-BHrJJIa4.mjs +1656 -0
package/dist/gerbil-BHrJJIa4.mjs.map +1 -0
package/dist/gerbil-BT9fCydo.d.mts +488 -0
package/dist/gerbil-BT9fCydo.d.mts.map +1 -0
package/dist/gerbil-DomNfIr1.mjs +4 -0
package/dist/gpu/hooks.d.mts +520 -0
package/dist/gpu/hooks.d.mts.map +1 -0
package/dist/gpu/hooks.mjs +1188 -0
package/dist/gpu/hooks.mjs.map +1 -0
package/dist/gpu/index.d.mts +2 -0
package/dist/gpu/index.mjs +6 -0
package/dist/gpu-33qCAtHW.mjs +3615 -0
package/dist/gpu-33qCAtHW.mjs.map +1 -0
package/dist/index-Dgmb2kE3.d.mts +245 -0
package/dist/index-Dgmb2kE3.d.mts.map +1 -0
package/dist/index-jEAL2s-A.d.mts +2022 -0
package/dist/index-jEAL2s-A.d.mts.map +1 -0
package/dist/index.d.mts +22 -487
package/dist/index.d.mts.map +1 -1
package/dist/index.mjs +13 -8
package/dist/index.mjs.map +1 -1
package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
package/dist/integrations/ai-sdk.d.mts +75 -6
package/dist/integrations/ai-sdk.d.mts.map +1 -1
package/dist/integrations/ai-sdk.mjs +131 -15
package/dist/integrations/ai-sdk.mjs.map +1 -1
package/dist/integrations/langchain.d.mts +1 -1
package/dist/integrations/langchain.d.mts.map +1 -1
package/dist/integrations/langchain.mjs +5 -5
package/dist/integrations/langchain.mjs.map +1 -1
package/dist/integrations/llamaindex.d.mts +1 -1
package/dist/integrations/llamaindex.d.mts.map +1 -1
package/dist/integrations/llamaindex.mjs +5 -5
package/dist/integrations/llamaindex.mjs.map +1 -1
package/dist/integrations/mcp-client.mjs +3 -3
package/dist/integrations/mcp-client.mjs.map +1 -1
package/dist/integrations/mcp.d.mts +3 -2
package/dist/integrations/mcp.d.mts.map +1 -1
package/dist/integrations/mcp.mjs +5 -5
package/dist/{mcp-BvbriaBy.mjs → mcp-1DaMsaBc.mjs} +4 -4
package/dist/mcp-1DaMsaBc.mjs.map +1 -0
package/dist/memory/index.d.mts +3 -0
package/dist/memory/index.mjs +6 -0
package/dist/memory-D1P7Tmda.mjs +4 -0
package/dist/memory-DVN0MnIG.mjs +132 -0
package/dist/memory-DVN0MnIG.mjs.map +1 -0
package/dist/memory-Dj0J1v88.mjs +294 -0
package/dist/memory-Dj0J1v88.mjs.map +1 -0
package/dist/moonshine-stt-BLyVoRpB.mjs +4 -0
package/dist/moonshine-stt-v_P_Ci_m.mjs +11936 -0
package/dist/moonshine-stt-v_P_Ci_m.mjs.map +1 -0
package/dist/{one-liner-s-lD8rCC.mjs → one-liner-DnQn7HJK.mjs} +14 -16
package/dist/one-liner-DnQn7HJK.mjs.map +1 -0
package/dist/repl-jV5gcJFA.mjs +9 -0
package/dist/skills/index.d.mts +270 -320
package/dist/skills/index.d.mts.map +1 -1
package/dist/skills/index.mjs +5 -5
package/dist/{skills-CD3Orlex.mjs → skills-DX8D59UH.mjs} +187 -32
package/dist/skills-DX8D59UH.mjs.map +1 -0
package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
package/dist/tools-DQ1mPUw5.mjs.map +1 -0
package/dist/{types-CiTc7ez3.d.mts → types-D6FiR_oh.d.mts} +106 -12
package/dist/types-D6FiR_oh.d.mts.map +1 -0
package/dist/types-DQBe2lFo.d.mts +165 -0
package/dist/types-DQBe2lFo.d.mts.map +1 -0
package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
package/dist/vector-B0panuy6.mjs +95 -0
package/dist/vector-B0panuy6.mjs.map +1 -0
package/docs/PROJECT-STATE.md +321 -0
package/docs/adding-a-model-family.md +280 -0
package/docs/ai-sdk.md +70 -61
package/docs/architecture/overview.md +17 -7
package/docs/browser.md +203 -8
package/docs/embeddings.md +156 -0
package/docs/gerbil-site-native-migration.md +217 -0
package/docs/gpu-engine/architectures.md +398 -0
package/docs/gpu-engine/ir.md +372 -0
package/docs/gpu-engine/kernels.md +718 -0
package/docs/gpu-engine/paper.html +1759 -0
package/docs/gpu-engine/paper.md +2109 -0
package/docs/gpu-engine/safetensors.md +312 -0
package/docs/gpu-engine/tokenizer.md +302 -0
package/docs/memory-rag.md +91 -0
package/docs/metal-safari-intel.md +190 -0
package/docs/mobile-failure-diagnosis.md +124 -0
package/docs/mobile.md +99 -0
package/docs/observability.md +230 -0
package/docs/onnx-removal-plan.md +339 -0
package/docs/research/autoresearch-portable.md +904 -0
package/docs/research/dispatch-reduction-hivemind.md +84 -0
package/docs/research/ios-safari-model-caching.md +117 -0
package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
package/docs/research/native-stt-model-selection.md +49 -0
package/docs/research/native-tts-model-selection.md +90 -0
package/docs/research/native-vs-chromium-decision.md +152 -0
package/docs/research/nemotron-mamba2-inference.md +910 -0
package/docs/research/qwen35-multimodal.md +293 -0
package/docs/research/qwen36-gemma4-targets.md +337 -0
package/docs/research/sota-embedding-models.md +179 -0
package/docs/research/sota-mobile-models-2026.md +263 -0
package/docs/research/sota-modality-models.md +202 -0
package/docs/research/tps-baselines.md +71 -0
package/docs/research/webgpu-m4-reference.md +104 -0
package/docs/site-update-plan.md +155 -0
package/docs/structured-output.md +123 -0
package/docs/stt.md +63 -446
package/docs/tts.md +77 -499
package/docs/vision.md +100 -338
package/package.json +22 -7
package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
package/dist/gerbil-CJ3ifloF.mjs +0 -4
package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
package/dist/gerbil-qOTe1nl2.d.mts +0 -431
package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
package/dist/kokoro-BNTb6egA.mjs +0 -20210
package/dist/kokoro-BNTb6egA.mjs.map +0 -1
package/dist/kokoro-DFRQ1OeM.js +0 -20212
package/dist/kokoro-DFRQ1OeM.js.map +0 -1
package/dist/mcp-BvbriaBy.mjs.map +0 -1
package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
package/dist/repl-DveXw36T.mjs +0 -9
package/dist/skills-CD3Orlex.mjs.map +0 -1
package/dist/stt-CpLYbGFd.mjs +0 -433
package/dist/stt-CpLYbGFd.mjs.map +0 -1
package/dist/stt-DRPLEEHB.mjs +0 -3
package/dist/stt-Te8Qz-Ay.js +0 -433
package/dist/stt-Te8Qz-Ay.js.map +0 -1
package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
package/dist/transformers.web-DokyH3rP.js +0 -3
package/dist/transformers.web-M6mCnEYJ.js +0 -30382
package/dist/transformers.web-M6mCnEYJ.js.map +0 -1
package/dist/tts-C0xx3CtE.js +0 -724
package/dist/tts-C0xx3CtE.js.map +0 -1
package/dist/tts-DXgsKGCe.mjs +0 -3
package/dist/tts-DeGANMNV.mjs +0 -730
package/dist/tts-DeGANMNV.mjs.map +0 -1
package/dist/types-CiTc7ez3.d.mts.map +0 -1
/package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
/package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
/package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0

package/dist/index-jEAL2s-A.d.mts ADDED Viewed

@@ -0,0 +1,2022 @@
+//#region src/gpu/ir.d.ts
+/**
+ * Gerbil WebGPU IR — the contract every component builds on.
+ *
+ * Generated at runtime from HuggingFace config.json by architecture-specific
+ * graph generators. The executor, kernels, and model loader all speak this IR.
+ */
+/** Every computation the engine can perform. */
+type OpType = "Embedding" | "EmbeddingInt4" | "MatMul" | "MatMulBias" | "MatMulInt4" | "Add" | "Mul" | "RMSNorm" | "LayerNorm" | "RoPE" | "Attention" | "Softmax" | "SiLU" | "SwiGLU" | "GELU" | "Gather" | "Reshape" | "Transpose" | "Concat" | "MoERouter" | "ExpertMatMul" | "MambaSSM" | "CausalConv1d" | "CausalConv1dSiLU" | "CausalConv1dGated" | "SigmoidGate" | "ResidualRMSNorm" | "KVCacheAppend" | "ConvStateUpdate" | "SliceLastRow" | "MeanPool" | "Scale" | "Softcap" | "L2Norm" | "ApplyRotaryEmb" | "MRoPE" | "EmbedSplice" | "AddBias" | "GeluErf" | "SliceCols" | "MulCols" | "PoolMatMul" | "ClippedMatMul" | "Conv1dFull" | "ConvTranspose1d" | "Snake1d" | "FSQDequant" | "HalfSnake1d" | "ConvTranspose1dDepthwise" | "Conv2d" | "AvgPool2d" | "CrossAttention" | "Tanh" | "GroupNorm";
+type DType = "f32" | "f16" | "i32" | "u32" | "i4"; // Element data types a tensor can declare (the type of TensorDesc.dtype).
+type TensorStorage = "constant" | "activation" | "kv_cache" | "ssm_state"; // Where a tensor lives (the type of TensorDesc.storage); safetensorsKey is only set for "constant".
+interface TensorDesc {
+  /** Unique name within the graph (e.g. "layers.0.self_attn.q_proj.weight"). */
+  name: string;
+  /**
+   * Shape dimensions. Numbers are concrete; strings are symbolic
+   * ("T" for sequence length, "L_max" for max cache length).
+   */
+  shape: (number | string)[];
+  /** Element data type. */
+  dtype: DType;
+  /** Where this tensor lives. */
+  storage: TensorStorage;
+  /**
+   * Key in the safetensors file that maps to this tensor.
+   * Only set for storage === "constant".
+   */
+  safetensorsKey?: string;
+  /**
+   * Synthetic constant fill: when set (and the tensor has no weight data in the
+   * checkpoint), the loader materializes a constant tensor of this value at the
+   * declared shape instead of fetching it. Used for parameter-free norms such as
+   * Gemma 4's `v_norm` (RMSNormNoScale), which normalizes values with an implicit
+   * all-ones gain (fillValue = 1.0).
+   */
+  fillValue?: number;
+}
+interface OpNode {
+  /** Unique node ID (e.g. "layer0_norm1", "embed", "lm_head"). */
+  id: string;
+  /** Which operation to perform. */
+  opType: OpType;
+  /** Input tensor names (order matters — matches kernel binding order). */
+  inputs: string[];
+  /** Output tensor names. */
+  outputs: string[];
+  /** Op-specific parameters (hidden_size, eps, num_heads, group_size, etc.). */
+  attributes: Record<string, unknown>;
+}
+/** KV cache memory layout. LHSd = [layer, head, seq, head_dim]. */
+type KVLayout = "LHSd";
+interface ModelCapabilities { // Capability flags carried on ModelGraph.capabilities.
+  text: true; // Literal `true` type: this declaration cannot express a model without text support.
+  vision: boolean; // NOTE(review): presumably mirrors ModelArchConfig.has_vision_tower — confirm.
+  moe: boolean; // NOTE(review): presumably mirrors ModelArchConfig.is_moe — confirm.
+}
+interface ModelArchConfig { // Resolved model dimensions; exposed as ModelGraph.config.
+  hidden_size: number;
+  num_layers: number;
+  num_heads: number;
+  num_kv_heads: number; // NOTE(review): presumably the key/value head count, which may differ from num_heads — confirm.
+  head_dim: number;
+  intermediate_size: number;
+  vocab_size: number;
+  context_length: number;
+  rms_norm_eps: number; // NOTE(review): named for RMSNorm, yet norm_type also allows "layernorm" — confirm whether it is reused there.
+  norm_type: "rmsnorm" | "layernorm";
+  rope_base: number;
+  rope_dim: number;
+  kv_layout: KVLayout; // KVLayout currently declares a single value, "LHSd" ([layer, head, seq, head_dim]).
+  is_moe: boolean;
+  num_experts?: number; // Optional; presumably only set when is_moe is true — confirm.
+  top_k_experts?: number; // Optional; presumably only set when is_moe is true — confirm.
+  has_vision_tower: boolean;
+  vision_architecture?: string; // Optional; presumably only set when has_vision_tower is true — confirm.
+  vision_patch_size?: number; // Optional; presumably only set when has_vision_tower is true — confirm.
+  vision_embed_dim?: number; // Optional; presumably only set when has_vision_tower is true — confirm.
+}
+interface ModelGraph {
+  /** HF architecture string, e.g. "Qwen2ForCausalLM". */
+  architecture: string;
+  /** Resolved model config with all dimensions. */
+  config: ModelArchConfig;
+  /** What this model can do. */
+  capabilities: ModelCapabilities;
+  /** All tensors in the graph, keyed by name. */
+  tensors: Record<string, TensorDesc>;
+  /** All computation nodes. */
+  nodes: OpNode[];
+  /** Topologically-sorted node IDs — the order the executor runs them. */
+  executionOrder: string[];
+  /** Graph input tensor names (e.g. ["input_ids"]). */
+  inputs: string[];
+  /** Graph output tensor names (e.g. ["logits"]). */
+  outputs: string[];
+}
+/**
+ * Map a HuggingFace safetensors key to a canonical IR tensor name.
+ *
+ * Different model families use different prefixes:
+ *   Qwen:  "model.embed_tokens.weight"
+ *   LLaMA: "model.embed_tokens.weight"
+ *   Phi:   "model.embed_tokens.weight" (but "model.layers.X.mlp.fc1.weight" etc.)
+ *
+ * This helper strips the common "model." prefix and handles known divergences.
+ * Architecture-specific overrides can extend the mapping.
+ */
+type HFKeyMapper = (hfKey: string) => string | null;
+//#endregion
+//#region src/gpu/architectures/index.d.ts
+/** Weight quantization mode for graph generation. */
+type GraphDType = "f32" | "q4";
+/** KV cache element type — "f16" halves memory traffic during attention. */
+type KVDType = "f32" | "f16";
+/**
+ * KV cache kernel strategy.
+ * - "f32": standard f32 buffers + f32 kernels
+ * - "native-f16": `enable f16` + `array<f16>` (Chrome/Dawn)
+ * - "packed-f16": `array<u32>` + pack2x16float/unpack2x16float (Safari-safe, no `enable f16`)
+ *
+ * Both f16 modes use the same buffer size (2 bytes/element). The difference is
+ * which WGSL kernel reads/writes the buffers.
+ */
+type KvMode = "f32" | "native-f16" | "packed-f16";
+//#endregion
+//#region src/gpu/device.d.ts
+/**
+ * WebGPU device abstraction layer.
+ *
+ * Wraps GPUDevice with helpers for buffer allocation, pipeline compilation,
+ * compute dispatch, and readback. All GPU interaction flows through here.
+ */
+interface GPUContext {
+  /** The underlying WebGPU device. */
+  device: GPUDevice;
+  /** Device limits (max buffer size, workgroup size, etc.). */
+  limits: GPUSupportedLimits;
+  /** Whether f16 is supported as a shader type. */
+  hasF16: boolean;
+  /**
+   * Whether the WebGPU `subgroups` feature is available (Chrome 134+, Safari 26+).
+   * When true, kernels may use `subgroupAdd`/`subgroupBroadcast` etc. (requires
+   * `enable subgroups;` in the shader). Absence falls back to the portable
+   * shared-memory reductions — never assume this is present.
+   */
+  hasSubgroups: boolean;
+  /** True when the WebGPU implementation is WebKit's (Safari, all iOS/iPadOS browsers). */
+  isWebKitWebGPU: boolean;
+  /** Whether the `timestamp-query` feature is available (per-pass GPU timing). Used
+   * only by the env-gated decode profiler; never on the normal inference path. */
+  hasTimestamp: boolean;
+  /** Raw adapter info string for diagnostics. */
+  adapterDescription: string;
+}
+interface InitGPUOptions {
+  /** Called when the GPU device is lost (e.g. tab backgrounded on iOS). */
+  onDeviceLost?: (reason: string, message: string) => void;
+}
+/**
+ * Initialize WebGPU and request a device with the features we need.
+ *
+ * In Node.js, initializes Dawn's WebGPU polyfill if navigator.gpu is absent.
+ * In the browser, uses the native WebGPU API directly.
+ *
+ * Throws a clear error if WebGPU is unavailable.
+ */
+declare function initGPU(options?: InitGPUOptions): Promise<GPUContext>;
+interface GPUDiagnosticResult {
+  /** Whether the basic buffer upload → readback round-trip works. */
+  bufferIntegrity: boolean;
+  /** Whether a trivial compute shader executes correctly. */
+  computeWorks: boolean;
+  /** Whether shared memory + workgroupBarrier() works. */
+  sharedMemoryWorks: boolean;
+  /** Detailed messages for each test. */
+  details: string[];
+}
+//#endregion
+//#region src/gpu/tokenizer.d.ts
+/**
+ * Pure JavaScript BPE tokenizer.
+ *
+ * Reads HuggingFace tokenizer.json — no WASM, no external dependencies.
+ * Supports encoding, decoding, and chat template application.
+ */
+interface TokenizerConfig { // Special-token and template settings, exposed read-only as Tokenizer.config.
+  bosToken: string | null; // null is allowed; presumably means no BOS token is defined — confirm.
+  eosToken: string | null; // null is allowed; presumably means no EOS token is defined — confirm.
+  bosTokenId: number | null; // NOTE(review): presumably the vocab id of bosToken — confirm.
+  eosTokenId: number | null; // NOTE(review): presumably the vocab id of eosToken — confirm.
+  chatTemplate: string | null; // null is allowed; presumably means no template was supplied — confirm.
+  addBosToken: boolean; // NOTE(review): presumably controls whether encoding prepends BOS — confirm.
+  addEosToken: boolean; // NOTE(review): presumably controls whether encoding appends EOS — confirm.
+}
+interface ChatMessage { // One conversation turn, as accepted by Tokenizer.applyChatTemplate and Tokenizer.encodeChat.
+  role: "system" | "user" | "assistant"; // Only these three roles are accepted by the type.
+  content: string; // Plain string only; this type has no structured (e.g. image) content parts.
+}
+declare class Tokenizer {
+  private vocab;
+  private vocabReverse;
+  private merges;
+  private specialTokens;
+  private addedTokens;
+  private byteFallback;
+  /**
+   * SentencePiece mode (Gemma/Llama-style). When true, vocab uses U+2581 (▁) for
+   * spaces and raw UTF-8 tokens (NOT the GPT-2 byte-to-unicode "Ġ" mapping), and
+   * raw bytes fall back to <0xHH> tokens. When false, GPT-2 byte-level BPE.
+   */
+  private spmMode;
+  readonly config: TokenizerConfig;
+  readonly vocabSize: number;
+  private constructor();
+  /**
+   * Create a tokenizer from HuggingFace JSON files.
+   */
+  static fromJSON(tokenizerJSON: any, tokenizerConfigJSON?: any): Tokenizer;
+  /**
+   * Resolve a literal token string (e.g. "<|endoftext|>") to its vocab id,
+   * or null if it isn't in the vocabulary.
+   */
+  tokenToId(token: string): number | null;
+  /**
+   * Encode text into token IDs.
+   */
+  encode(text: string): number[];
+  /**
+   * Decode token IDs back to text.
+   */
+  decode(ids: number[], skipSpecialTokens?: boolean): string;
+  /**
+   * Gemma 4 turn format: `<bos><|turn>user\n{content}<turn|>\n<|turn>model\n`.
+   * Gemma has no "system" role, so a system message is folded into the next user
+   * turn (matching the reference chat template).
+   */
+  private applyGemmaTurnTemplate;
+  /**
+   * Apply chat template to messages.
+   *
+   * For now, implements the common ChatML format used by Qwen models:
+   * <|im_start|>system\n{content}<|im_end|>\n
+   * <|im_start|>user\n{content}<|im_end|>\n
+   * <|im_start|>assistant\n
+   *
+   * TODO: Parse Jinja2 templates from tokenizer_config.json for full generality.
+   */
+  applyChatTemplate(messages: ChatMessage[], options?: {
+    addGenerationPrompt?: boolean;
+  }): string;
+  /**
+   * Encode a chat conversation into token IDs.
+   */
+  encodeChat(messages: ChatMessage[], options?: {
+    addGenerationPrompt?: boolean;
+  }): number[];
+  private splitOnSpecialTokens;
+  private preTokenize;
+  private textToTokenRepr;
+  private bpeEncode;
+  private encodeByteFallback;
+}
+//#endregion
+//#region src/gpu/weight-source.d.ts
+/** A single tensor's data + shape (the unit the executor uploads to a GPU buffer). */
+interface WeightEntry {
+  data: ArrayBufferView;
+  shape: number[];
+}
+/**
+ * Read-side view consumed by the executor's streaming `uploadWeights`. Async by
+ * design so a cache-backed store can fetch one tensor's bytes at a time.
+ */
+interface WeightSource {
+  has(name: string): boolean; // Synchronous membership check by tensor name.
+  keys(): string[]; // Synchronous; presumably the names of all available tensors — confirm.
+  readonly size: number; // NOTE(review): presumably the tensor count (keys().length) rather than a byte size — confirm.
+  /** Pull a single tensor (bytes materialized + dtype-converted on demand). */
+  get(name: string): Promise<WeightEntry | undefined>; // May resolve to undefined; presumably for an unknown name — confirm.
+  /**
+   * Release any transient backing storage (e.g. the browser transform-staging
+   * cache) once the consumer has finished uploading. No-op for the heap backend.
+   */
+  dispose?(): Promise<void>;
+}
+//#endregion
+//#region src/gpu/model-loader.d.ts
+interface LoadModelOptions {
+  /** HF repo ID (e.g. "Qwen/Qwen3.5-0.8B") or full URL. */
+  repo: string;
+  /** Progress callback: (loaded, total, message) */
+  onProgress?: (loaded: number, total: number, message: string) => void;
+  /** Custom HF key mapper (defaults to stripping "model." prefix). */
+  keyMapper?: HFKeyMapper;
+  /** HuggingFace API token for gated models. */
+  hfToken?: string;
+  /** Revision/branch (default: "main"). */
+  revision?: string;
+  /** Local cache directory for downloaded files (Node.js only). */
+  cacheDir?: string;
+  /**
+   * Weight dtype:
+   *  - "f32"  full precision (or the repo's native quantization, e.g. MLX/GPTQ q4)
+   *  - "q4"   on-the-fly INT4 quantization (~4× smaller)
+   *  - "auto" (recommended) picks q4 on mobile (iOS/Android) to fit in device
+   *           memory and f32/native on desktop. Already-quantized repos
+   *           (MLX/GPTQ 4-bit) stay q4 regardless.
+   */
+  dtype?: GraphDType | "auto";
+  /** KV cache dtype: "f16" halves memory traffic during attention. Requires GPU f16 support. */
+  kvDtype?: KVDType;
+  /**
+   * Build an embedding graph (last-token pool + L2 norm) instead of an LM head.
+   * Only valid for Qwen2/Qwen3 CausalLM architectures (e.g. Qwen3-Embedding).
+   */
+  embedding?: boolean;
+  /**
+   * Build the multimodal LM graph variant (M-RoPE + image-embedding splice) so
+   * the text model can consume spliced image tokens. Only meaningful for
+   * Qwen3_5ForConditionalGeneration. Reserves `maxVisionTokens` rows for the
+   * vision-embedding buffer. Text-only generation through this graph is
+   * numerically identical to the non-multimodal graph (M-RoPE fed linear
+   * positions == standard 1D RoPE).
+   */
+  multimodal?: {
+    maxVisionTokens: number;
+  };
+  /**
+   * Force-download and key-map the vision tower even without the multimodal LM
+   * graph. `enableVision` (via `multimodal`) already implies this; this flag is
+   * for callers that load weights directly (e.g. the vision-encoder validation
+   * scripts) and build the vision graph/executor themselves. When neither this
+   * nor `multimodal` is set, the ~201MB ViT is excluded from the download.
+   */
+  loadVisionTower?: boolean;
+}
+/**
+ * Gemma 4 Per-Layer-Embeddings (PLE) source, kept CPU-resident.
+ *
+ * The PLE table (`embed_tokens_per_layer`, [vocab, num_layers*256]) is ~1.17GB
+ * at 4-bit. Uploading it to a GPU buffer would make the model non-mobile-viable
+ * and would hit the per-binding size cap. Instead the loader hands the quantized
+ * table to the executor in JS memory; the executor gathers + dequantizes only the
+ * rows for the current input tokens each forward step (a tiny [T, width] upload).
+ */
+interface PleSource {
+  /**
+   * Flat row-major INT4 nibbles (Gerbil packing, 8 per u32). HEAP-RESIDENT path
+   * (Node/desktop). In the browser this is empty and `cache` is set instead so
+   * the ~1.17 GB table never sits in the JS heap during load.
+   */
+  packed: Uint32Array;
+  /** Per-group scales (Gerbil (nibble - zero) * scale convention). */
+  scales: Float32Array;
+  /** Per-group zero points. */
+  zeros: Float32Array;
+  /** Row width = num_layers * hidden_size_per_layer_input (E2B: 35*256 = 8960). */
+  width: number;
+  /** Dequant group size (MLX: 64). */
+  groupSize: number;
+  /** Activation tensor the per-step gathered rows are written into. */
+  targetTensor: string;
+  /**
+   * Browser only: when set, the quantized PLE table's bytes live in CacheStorage
+   * (not the heap). The executor reads the slice of nibbles/scales/zeros it needs
+   * for the current tokens on demand. Keeps peak load heap bounded.
+   */
+  cache?: {
+    cacheName: string;
+    packedKey: string;
+    scalesKey: string;
+    zerosKey: string;
+    /** packed.length (u32 count) — for bounds/Range math. */
+    packedLen: number;
+  };
+}
+interface LoadedModel {
+  /** The generated computation graph (IR). */
+  graph: ModelGraph;
+  /** The tokenizer. */
+  tokenizer: Tokenizer;
+  /**
+   * Weight tensors mapped to canonical names. A `WeightSource` so the executor
+   * can pull one tensor at a time (cache-backed in the browser, heap-backed on
+   * Node) instead of requiring the whole model to sit in heap at once. Use
+   * `get(name)` (async) to materialize a tensor's bytes on demand.
+   */
+  weights: WeightSource;
+  /** Raw config.json for reference. */
+  rawConfig: Record<string, unknown>;
+  /**
+   * CPU-resident Gemma 4 PLE table (set only for Gemma 4). Pass to
+   * `executor.setPleSource()` so the big table never becomes GPU-resident.
+   */
+  pleSource?: PleSource;
+}
+/**
+ * Load a model from HuggingFace Hub.
+ *
+ * 1. Fetch config.json -> determine architecture -> generate IR graph
+ * 2. Fetch tokenizer.json + tokenizer_config.json -> build tokenizer
+ * 3. Download safetensors -> parse headers -> extract weight data
+ * 4. Map HF tensor keys -> canonical names
+ */
+declare function loadModel(options: LoadModelOptions): Promise<LoadedModel>;
+interface LoadedMoonshine {
+  /** Canonical-named f32 weights (data + shape), shared by encoder + decoder graphs. */
+  weights: Map<string, {
+    data: ArrayBufferView;
+    shape: number[];
+  }>;
+  /** The (decode-capable) tokenizer. */
+  tokenizer: Tokenizer;
+  /** Raw config.json. */
+  rawConfig: Record<string, unknown>;
+}
+declare function loadMoonshine(options: { // Resolves to LoadedMoonshine (weights, tokenizer, raw config.json).
+  repo: string; // Required here; there is no default repo in this signature.
+  revision?: string; // NOTE(review): presumably the same meaning as LoadModelOptions.revision — confirm.
+  hfToken?: string; // NOTE(review): presumably the same meaning as LoadModelOptions.hfToken — confirm.
+  cacheDir?: string; // NOTE(review): presumably the same meaning as LoadModelOptions.cacheDir — confirm.
+  onProgress?: (loaded: number, total: number, message: string) => void; // Same callback signature as LoadModelOptions.onProgress.
+}): Promise<LoadedMoonshine>;
+interface LoadedKaniTTS {
+  /** Canonical-named f32 weights for the codec-LM backbone (LFM2 keys). */
+  backboneWeights: Map<string, {
+    data: ArrayBufferView;
+    shape: number[];
+  }>;
+  /** Folded NanoCodec decoder weights under canonical `nanocodec.*` names. */
+  codecWeights: Map<string, {
+    data: ArrayBufferView;
+    shape: number[];
+  }>;
+  /** The text tokenizer. */
+  tokenizer: Tokenizer;
+  /** Raw backbone config.json (LFM2 dims + KaniTTS2 fields). */
+  rawConfig: Record<string, unknown>;
+}
+declare function loadKaniTTS(options: { // `options` itself is required even though every field is optional; resolves to LoadedKaniTTS.
+  /** Backbone repo (default nineninesix/kani-tts-2-en). */
+  repo?: string;
+  /** NanoCodec repo (default KANI_NANOCODEC_REPO). */
+  codecRepo?: string;
+  revision?: string; // NOTE(review): unclear whether this applies to repo, codecRepo, or both — confirm.
+  hfToken?: string; // NOTE(review): presumably the same meaning as LoadModelOptions.hfToken — confirm.
+  cacheDir?: string; // NOTE(review): presumably the same meaning as LoadModelOptions.cacheDir — confirm.
+  onProgress?: (loaded: number, total: number, message: string) => void; // Same callback signature as LoadModelOptions.onProgress.
+}): Promise<LoadedKaniTTS>;
+//#endregion
+//#region src/gpu/sampler.d.ts
+/**
+ * CPU-side token sampling from logits.
+ *
+ * Applies temperature, top-k, and top-p (nucleus) filtering,
+ * then samples from the resulting probability distribution.
+ *
+ * Uses typed arrays and min-heap for zero-allocation top-K selection.
+ * For vocab_size ~152K with topK=50, this avoids creating 152K JS tuples.
+ */
+interface SamplingParams { // Every field is optional; the defaults are not visible in this declaration.
+  temperature?: number;
+  topK?: number;
+  topP?: number; // Threshold for top-p (nucleus) filtering.
+  repetitionPenalty?: number; // NOTE(review): not mentioned in the sampler header comment — confirm how and when it is applied.
+}
+//#endregion
+//#region src/gpu/vision-preprocess.d.ts
+/**
+ * Host-side vision preprocessing for the Qwen3.5 ViT.
+ *
+ * The learned position embeddings (bilinear-interpolated over the patch grid)
+ * and the 2D rotary cos/sin tables are functions of the image grid (t, h, w)
+ * ONLY — not of the model weights or pixel values. They are cheap to compute on
+ * the CPU and fed to the GPU graph as input activations, keeping the graph to
+ * weight-dependent math while staying byte-identical to HF transformers.
+ *
+ * Ports (verified against transformers 5.12 vision_utils.py + modeling_qwen3_5):
+ *   - get_vision_bilinear_indices_and_weights  → buildPosEmbeds()
+ *   - get_vision_position_ids                  → buildPositionIds()
+ *   - Qwen3_5VisionRotaryEmbedding             → buildRotaryCosSin()
+ *
+ * Patch ordering: both the pos-embed gather and the position ids reorder patches
+ * into spatial_merge_size×spatial_merge_size groups, matching the HF image
+ * processor's output ordering, so the merger's [N,h]→[N/u,h*u] reshape lines up.
+ */
+interface VisionGridConfig { // Config argument of buildPosEmbeds and buildVisionPositionTensors.
+  hiddenSize: number; // NOTE(review): presumably the `hidden` width of the [N, hidden] position embeddings — confirm.
+  numHeads: number;
+  numPositionEmbeddings: number; // NOTE(review): presumably the row count of posEmbedTable ([num_position_embeddings, hidden]) — confirm.
+  spatialMergeSize: number; // NOTE(review): presumably the spatial_merge_size used for the patch-group reordering described above — confirm.
+  ropeTheta?: number; // Optional; the default is not visible in this declaration.
+}
+interface VisionPositionTensors { // Return type of buildVisionPositionTensors.
+  /** [N, hidden_size] bilinear-interpolated learned position embeddings. */
+  posEmbeds: Float32Array;
+  /** [N, head_dim] rotary cosines. */
+  cos: Float32Array;
+  /** [N, head_dim] rotary sines. */
+  sin: Float32Array;
+  numPatches: number; // NOTE(review): presumably the N used in the shapes above — confirm.
+}
+/**
+ * Build bilinear-interpolated learned position embeddings [N, hidden].
+ * posEmbedTable is the raw pos_embed.weight [num_position_embeddings, hidden].
+ */
+declare function buildPosEmbeds(gridTHW: [number, number, number], posEmbedTable: Float32Array, cfg: VisionGridConfig): Float32Array;
+/**
+ * Build the reordered (row, col) position ids [N, 2] for rotary, matching
+ * get_vision_position_ids.
+ */
+declare function buildPositionIds(gridTHW: [number, number, number], merge: number): Int32Array;
+/**
+ * Build rotary cos/sin tables [N, head_dim] from position ids, matching
+ * Qwen3_5VisionRotaryEmbedding + the cat((rotary, rotary)) in VisionModel.forward.
+ *
+ * rotary_pos_emb(position_ids) = (position_ids[..,None] * inv_freq).flatten(1)
+ *   where inv_freq has length (head_dim/2)/2 = head_dim/4, computed over dim=head_dim/2.
+ * For each token the two position components (h, w) each produce head_dim/4 freqs,
+ * concatenated → head_dim/2, then duplicated → head_dim for cos/sin.
+ */
+declare function buildRotaryCosSin(positionIds: Int32Array, headDim: number, theta?: number): {
+  cos: Float32Array;
+  sin: Float32Array;
+  numPatches: number;
+};
+/**
+ * Build all host position tensors for a single image grid in one call.
+ */
+declare function buildVisionPositionTensors(gridTHW: [number, number, number], posEmbedTable: Float32Array, cfg: VisionGridConfig): VisionPositionTensors;
+interface Gemma4VisionGridConfig { // Config argument of buildGemma4VisionPositionTensors.
+  hiddenSize: number;
+  numHeads: number;
+  headDim: number; // Explicit field here; VisionGridConfig has no headDim member.
+  ropeTheta: number; // Required here, whereas VisionGridConfig.ropeTheta is optional.
+  poolingKernelSize: number; // NOTE(review): presumably the k of the k×k pooling documented on buildGemma4PoolMatrix — confirm.
+}
+interface Gemma4VisionPositionTensors {
+  /** [N, hidden] axial position embeddings (table[0][x] + table[1][y]). */
+  posEmbeds: Float32Array;
+  /** [N, headDim] axial rotary cosines. */
+  cos: Float32Array;
+  /** [N, headDim] axial rotary sines. */
+  sin: Float32Array;
+  /** [Np, N] average-pooling matrix (1/k² in-cell, 0 elsewhere). */
+  poolMatrix: Float32Array;
+  /** number of patches N (= gridH*gridW). */
+  numPatches: number;
+  /** number of pooled (soft) tokens Np (= ceil(gridH/k)*ceil(gridW/k)). */
+  numPooled: number;
+}
+/**
+ * Build axial learned position embeddings [N, hidden] from the [2, posSize, hidden]
+ * table: pos[p] = table[0][x_p] + table[1][y_p]. Direct lookup, no interpolation
+ * (HF F.embedding on clamped positions).
+ */
+declare function buildGemma4PosEmbeds(gridH: number, gridW: number, posEmbedTable: Float32Array,
+// [2, posSize, hidden] flattened
+hidden: number, posSize: number): Float32Array;
+/**
+ * Build the 2D axial rotary cos/sin tables [N, headDim].
+ *   spatial_dim = headDim / 2;  inv_freq[j] = 1/theta^((2j)/spatial_dim), j in [0, spatial_dim/2)
+ *   per spatial dim: f = pos * inv_freq (spatial_dim/2 values); emb = cat(f, f) (spatial_dim values)
+ *   cos/sin = cat([emb_x, emb_y]) → headDim values, layout [fx,fx,fy,fy].
+ * Applied with the global-half rotate_half kernel (ApplyRotaryEmb), which computes
+ * out = x*cos + rotate_half(x)*sin element-wise — exact for this layout.
+ */
+declare function buildGemma4RotaryCosSin(gridH: number, gridW: number, headDim: number, theta: number): {
+  cos: Float32Array;
+  sin: Float32Array;
+};
+/**
+ * Build the [Np, N] average-pooling matrix for k×k spatial pooling over the real
+ * (unpadded) grid, matching modeling_gemma4's kernel_idxs/one_hot pooling:
+ *   cell(p) = floor(x_p/k) + ceil(gridW/k) * floor(y_p/k)
+ *   poolMatrix[cell, p] = 1/k²  (so pooled = poolMatrix @ hidden = mean over the k×k block)
+ * Np = ceil(gridH/k) * ceil(gridW/k). Each pooled cell averages exactly the patches
+ * that fall in it (edge cells with fewer than k² patches still divide by k², matching
+ * HF's fixed 1/k² normalization).
+ */
+declare function buildGemma4PoolMatrix(gridH: number, gridW: number, k: number): {
+  poolMatrix: Float32Array;
+  numPooled: number;
+};
+/**
+ * Build all Gemma 4 vision host tensors for one image grid in one call.
+ * `posEmbedTable` is the raw [2, posSize, hidden] flattened table.
+ */
+declare function buildGemma4VisionPositionTensors(gridH: number, gridW: number, posEmbedTable: Float32Array, posSize: number, cfg: Gemma4VisionGridConfig): Gemma4VisionPositionTensors;
+/** Gemma 4 image processor config (from processor_config.json). */
+declare const GEMMA4_IMAGE_PROCESSOR: ImageProcessorConfig;
+interface Gemma4PreprocessedImage {
+  /** Flattened patches [N, 3·patch²], stored row-major; patches are ordered row-major over the patch grid. */
+  patches: Float32Array;
+  /** Patch grid (gridH, gridW). */
+  gridHW: [number, number];
+}
+/**
+ * Preprocess a decoded RGB image for the Gemma 4 ViT: aspect-preserving resize so
+ * the patch grid is ≤ max_soft_tokens·k² patches and H,W divisible by k·patch,
+ * rescale ×1/255 (no normalize), patchify row-major into [N, 3·patch²]
+ * (3·16·16 for a 16-pixel patch).
+ *
+ * @param pixels row-major HWC RGB (0..255), length width*height*3.
+ * @param width  source pixel width
+ * @param height source pixel height
+ * @param maxSoftTokens     the max_soft_tokens budget in the bound above (optional;
+ *   its default is not visible in this declaration)
+ * @param poolingKernelSize k in the bounds above (optional; default not visible here)
+ * @param patchSize         patch edge length in pixels, "patch" above (optional;
+ *   default not visible here)
+ */
+declare function preprocessImageGemma4(pixels: Float32Array | Uint8ClampedArray | Uint8Array, width: number, height: number, maxSoftTokens?: number, poolingKernelSize?: number, patchSize?: number): Gemma4PreprocessedImage;
+interface ImageProcessorConfig {
+  patchSize: number;
+  temporalPatchSize: number;
+  mergeSize: number;
+  imageMean: [number, number, number];
+  imageStd: [number, number, number];
+  /** rescale factor applied to raw 0..255 pixels before normalization (1/255). */
+  rescaleFactor: number;
+  /**
+   * min total pixels after resize. The value is a pixel count (height*width), not
+   * an edge length; "shortest_edge" is presumably the processor-config key it is
+   * read from — confirm against the loader.
+   */
+  minPixels: number;
+  /**
+   * max total pixels after resize. The value is a pixel count (height*width), not
+   * an edge length; "longest_edge" is presumably the processor-config key it is
+   * read from — confirm against the loader.
+   */
+  maxPixels: number;
+}
+declare const QWEN3_5_IMAGE_PROCESSOR: ImageProcessorConfig;
+interface PreprocessedImage {
+  /** Flattened patches [N, 1536] in the spatial-merge order encodeImage expects. */
+  patches: Float32Array;
+  /** (t, h, w) patch grid. t=1 for a single image, h/w in patch units. */
+  gridTHW: [number, number, number];
+}
+/**
+ * Qwen2-VL smart-resize: round H and W to multiples of factor=patch*merge,
+ * keeping aspect ratio and clamping the total pixel budget to [minPixels, maxPixels].
+ * Matches smart_resize in transformers.models.qwen2_vl.image_processing_qwen2_vl.
+ *
+ * @param height    source height in pixels
+ * @param width     source width in pixels
+ * @param factor    rounding multiple for both output dimensions (patch*merge)
+ * @param minPixels lower bound on the resized height*width
+ * @param maxPixels upper bound on the resized height*width
+ * @returns the resized dimensions as a pair — presumably [height, width], in the
+ *   same order as the parameters; confirm against the implementation.
+ */
+declare function smartResize(height: number, width: number, factor: number, minPixels: number, maxPixels: number): [number, number];
+/**
+ * Preprocess a decoded RGB image into the [N, 1536] patch tensor + grid_thw that
+ * `encodeImage()` expects, matching the HF Qwen2-VL image processor:
+ *   smart_resize → rescale (×1/255) → normalize → temporal-pair (×temporal_patch_size)
+ *   → patchify into spatial_merge×spatial_merge blocks → flatten to [N, C·T·P·P].
+ *
+ * @param pixels row-major HWC RGB, length width*height*3. Values 0..255 (default)
+ *   or already 0..1 if `rescaled` is true.
+ * @param width  source pixel width
+ * @param height source pixel height
+ */
+declare function preprocessImage(pixels: Float32Array | Uint8ClampedArray | Uint8Array, width: number, height: number, cfg?: ImageProcessorConfig, rescaled?: boolean): PreprocessedImage;
+declare function buildMRoPEPositionIds(inputIds: Int32Array | Uint32Array | number[], imageGrids: Array<[number, number, number]>, imageTokenId: number, mergeSize: number): Int32Array;
+/**
+ * Per-pair frequency→dimension assignment for interleaved M-RoPE, matching
+ * Qwen3_5TextRotaryEmbedding.apply_interleaved_mrope. For pair index i in
+ * [0, sum(section)) the position component cycles through the sections in order
+ * (T,H,W,T,H,W,...), with each component capped at its section count. Returns an
+ * array of length (rope_dim/2) with values 0=T, 1=H, 2=W.
+ *
+ * @param mropeSection per-component pair counts in [T, H, W] order (presumably
+ *   summing to rope_dim/2 — confirm).
+ */
+declare function mropeFreqDims(mropeSection: [number, number, number]): Int32Array;
+/**
+ * Build the interleaved-M-RoPE cos/sin tables [seq, rope_dim] from 3D position
+ * ids, matching Qwen3_5TextRotaryEmbedding.forward:
+ *   freqs[d][i] = pos[d] * inv_freq[i],  inv_freq[i] = 1/theta^(2i/rope_dim)
+ *   freq[i] picks component mropeFreqDims[i]; emb = cat(freqs, freqs).
+ * cos/sin have length seq*rope_dim. For text-only (all 3 pos rows equal) this
+ * reduces exactly to standard 1D partial RoPE.
+ *
+ * @param positionIds3 [3, seq] as produced by buildMRoPEPositionIds.
+ * @param seq          sequence length (the `seq` dimension of the tables above).
+ * @param ropeDim      number of rotated dims per head (head_dim * partial_factor).
+ * @param theta        RoPE base used in inv_freq above.
+ * @param mropeSection section counts that determine the per-pair component
+ *   assignment (presumably passed to mropeFreqDims — confirm).
+ * @returns `cos` and `sin`, each of length seq*rope_dim.
+ */
+declare function buildMRoPECosSin(positionIds3: Int32Array, seq: number, ropeDim: number, theta: number, mropeSection: [number, number, number]): {
+  cos: Float32Array;
+  sin: Float32Array;
+};
+//#endregion
+//#region src/gpu/defaults.d.ts
+/**
+ * Default model per capability. Kept in its own tiny module (no heavy imports)
+ * so the React hooks can resolve defaults without statically pulling in the GPU
+ * engine — they import the engine dynamically to stay light.
+ */
+declare const DEFAULT_MODELS: {
+  /** Text generation (also the vision-capable checkpoint). */
+  readonly text: "mlx-community/Qwen3.5-0.8B-4bit";
+  /** Image understanding — same checkpoint, vision tower built on demand. */
+  readonly vision: "mlx-community/Qwen3.5-0.8B-4bit";
+  /** Text embeddings. */
+  readonly embedding: "mlx-community/embeddinggemma-300m-4bit";
+  /** Text-to-speech. */
+  readonly tts: "nineninesix/kani-tts-2-en";
+  /** Speech-to-text. */
+  readonly stt: "UsefulSensors/moonshine-base";
+};
+/** Resolve the model repo for a set of options, falling back to the defaults. */
+declare function resolveDefaultRepo(opts: {
+  repo?: string;
+  embedding?: boolean;
+  enableVision?: boolean;
+}): string;
+//#endregion
+//#region src/gpu/architectures/gemma4_vision.d.ts
+interface Gemma4VisionGraphInfo {
+  hiddenSize: number;
+  numHeads: number;
+  headDim: number;
+  depth: number;
+  intermediateSize: number;
+  textHidden: number;
+  patchSize: number;
+  patchDim: number;
+  poolingKernelSize: number;
+  ropeTheta: number;
+  rmsNormEps: number;
+}
+/**
+ * Resolve the Gemma 4 vision dims from a raw HF config. Accepts either the
+ * top-level config (reads `.vision_config` + `.text_config.hidden_size`) or a
+ * bare vision_config (then `textHidden` falls back to the projector row count if
+ * present, else hidden). Family-general — no E2B constants.
+ */
+declare function resolveGemma4VisionInfo(rawConfig: Record<string, unknown>): Gemma4VisionGraphInfo;
+/**
+ * Dequantize an MLX affine-int4 weight to a plain f32 [rows, cols] matrix.
+ * MLX packs 8 int4 values per u32 (low-nibble first); each group of `groupSize`
+ * columns shares one scale + bias: w[r,c] = scale[r, c/gs] * q + bias[r, c/gs].
+ * Used for the Gemma 4 multimodal projector (`embed_vision.embedding_projection`)
+ * in MLX-4bit checkpoints, where (unlike the BF16 ViT body) the projector is int4.
+ */
+declare function dequantizeMLXProjection(packed: Uint32Array, scales: Float32Array, biases: Float32Array, rows: number, cols: number, groupSize: number): Float32Array;
+/**
+ * If the Gemma 4 multimodal projector arrived as an MLX affine-int4 triplet
+ * (`embed_vision.embedding_projection.{weight(U32), scales, biases}`), dequantize
+ * it in place to a plain f32 `embed_vision.embedding_projection.weight` and drop
+ * the scales/biases, so the vision graph's plain MatMul on the projector works for
+ * MLX-4bit checkpoints too. No-op for BF16 (HF) checkpoints (weight already f32).
+ *
+ * @param weights   tensor map keyed by tensor name; mutated by this call (the
+ *   function returns void).
+ * @param groupSize quantization group size — presumably the number of columns
+ *   sharing one scale + bias, as in dequantizeMLXProjection; confirm.
+ * @param rows      row count of the dequantized projector matrix (presumably the
+ *   [rows, cols] shape handed to dequantizeMLXProjection — confirm).
+ * @param cols      column count of the dequantized projector matrix.
+ */
+declare function dequantizeGemma4VisionProjection(weights: Map<string, {
+  data: ArrayBufferView;
+  shape: number[];
+}>, groupSize: number, rows: number, cols: number): void;
+/**
+ * Patch the ClippedMatMul nodes of a Gemma 4 vision graph with the calibrated clip
+ * scalars from the checkpoint (Gemma4ClippableLinear's per-tensor input/output
+ * min/max buffers), then drop those scalar tensors from the weights map so the
+ * vision executor doesn't try to upload them as GPU buffers. Call BEFORE
+ * VisionExecutor.uploadWeights(). Missing scalars default to ±inf (clip = identity),
+ * so a checkpoint without calibration still loads.
+ */
+declare function patchGemma4VisionClips(graph: ModelGraph, weights: Map<string, {
+  data: ArrayBufferView;
+  shape: number[];
+}>): void;
+/**
+ * Build the Gemma 4 ViT graph. Shaped by symbolic "N" (number of patches, runtime)
+ * and "Np" (number of pooled tokens = ceil(grid_h/k)·ceil(grid_w/k), runtime),
+ * resolved from input tensor dims — like the Qwen ViT's "N"/"Nm".
+ */
+declare function generateGemma4VisionGraph(rawConfig: Record<string, unknown>): ModelGraph;
+//#endregion
+//#region src/gpu/architectures/kani_tts.d.ts
+/** Parsed KaniTTS2 backbone config (the LFM2 dims + the TTS-specific fields). */
+interface KaniConfig {
+  textVocabSize: number;
+  vocabSize: number;
+  tokensPerFrame: number;
+  audioStep: number;
+  useLearnableRope: boolean;
+  alphaMin: number;
+  alphaMax: number;
+  speakerEmbDim: number;
+  audioTokensStart: number;
+  startOfSpeech: number;
+  endOfSpeech: number;
+  codebookSize: number;
+}
+declare function parseKaniConfig(rawConfig: Record<string, unknown>): KaniConfig;
+/**
+ * Convert the model's flat audio-token stream (the IDs between start/end-of-speech)
+ * into NanoCodec codes [NUM_GROUPS, T]. Mirrors NemoAudioPlayer.get_nano_codes:
+ *   reshape [-1, 4]; codes[:,c] -= codebook_size*c; codes -= audio_tokens_start;
+ *   transpose → [4, T]. Returns a Uint32Array laid out group-major ([g*T + t]).
+ */
+declare function audioTokensToCodes(audioTokenIds: number[], cfg?: KaniConfig): {
+  codes: Uint32Array;
+  numFrames: number;
+};
+interface NanoCodecGraphOptions {
+  /** Number of audio frames T (the code grid width). PCM length = T * 1764. */
+  numFrames: number;
+}
+declare function generateNanoCodecDecoderGraph(opts: NanoCodecGraphOptions): ModelGraph;
+/**
+ * Generate the KaniTTS2 codec-LM backbone graph (LFM2-350M body, full-vocab logits,
+ * per-layer learnable MRoPE). Mirrors generateLfm2Graph block-for-block; the only
+ * difference is that each attention layer rotates Q/K with the MRoPE op fed a
+ * per-layer host cos/sin table instead of the position-counter RoPE op.
+ *
+ * f32 only (the checkpoint is bf16→f32; q4 would need the codec-LM head re-validated).
+ *
+ * NOTE(review): despite "f32 only", the signature still accepts `dtype`, and
+ * `_groupSize` carries the underscore prefix that conventionally marks an unused
+ * parameter — confirm whether non-f32 dtypes are rejected or silently ignored.
+ */
+declare function generateKaniTtsGraph(rawConfig: Record<string, unknown>, dtype?: GraphDType, _groupSize?: number, kvDtype?: KVDType): ModelGraph;
+//#endregion
+//#region src/gpu/architectures/moonshine.d.ts
+/**
+ * Historical record of the executor-side work the keystone left open; all items
+ * are now implemented (see the STATUS note in the Moonshine architecture source —
+ * it is not reproduced in this declaration output). Retained as exported
+ * documentation of the dependency surface.
+ */
+declare const MOONSHINE_REMAINING_WORK: readonly ["DONE: Transpose op kernel (conv output [C,L] → encoder input [L,C]).", "DONE: GroupNorm(num_groups=1) over conv channels (weight+bias).", "DONE: Tanh elementwise op (conv1 activation).", "DONE: Interleaved-RoPE variant (Moonshine rotates adjacent dim pairs 2p/2p+1).", "DONE: No-bias LayerNorm variant (Moonshine norms are weight-only).", "DONE: Per-utterance conv graph regeneration from the concrete sample count.", "DONE: Dual-graph executor — encoder once → frozen cross-attn K/V → AR decode.", "DONE: engine.transcribe(pcm) host path (MoonshineSTT): conv → encode → AR decode → detokenize."];
+interface MoonshineDims {
+  hidden_size: number;
+  enc_layers: number;
+  dec_layers: number;
+  num_heads: number;
+  num_kv_heads: number;
+  head_dim: number;
+  rotary_dim: number;
+  intermediate_size: number;
+  vocab_size: number;
+  rope_base: number;
+  context_length: number;
+  ln_eps: number;
+}
+/** Pull and derive Moonshine dimensions from the raw HF config. */
+declare function parseMoonshineConfig(raw: Record<string, unknown>): MoonshineDims;
+/** Number of encoder frames produced by the conv frontend for n_samples PCM. */
+declare function moonshineEncoderFrames(nSamples: number): number;
+/**
+ * Encoder graph: raw-waveform conv frontend + bidirectional transformer.
+ * @param nSamples concrete PCM sample count (the conv frontend is length-static).
+ * The output tensor "encoder_out" is [T_frames, hidden] and is consumed (after
+ * per-layer K/V projection) as the frozen K/V for the decoder's cross-attention.
+ */
+declare function generateMoonshineEncoderGraph(raw: Record<string, unknown>, nSamples: number): ModelGraph;
+/**
+ * Decoder graph: AR transformer with causal self-attn (KV-cache) + cross-attn to
+ * the frozen encoder output. Built for a single decode step (T=1). The encoder K/V
+ * are supplied as graph inputs "enc_k_layer{i}" / "enc_v_layer{i}" — the host
+ * pre-projects the encoder output through each layer's encoder_attn.k_proj/v_proj
+ * ONCE and binds them frozen for the whole decode (the CrossAttention contract).
+ */
+declare function generateMoonshineDecoderGraph(raw: Record<string, unknown>, sEnc: number): ModelGraph;
+//#endregion
+//#region src/gpu/architectures/qwen3_5_vision.d.ts
+/**
+ * Build the ViT graph. The graph is shaped by symbolic "N" (number of patches),
+ * resolved at run time from the input tensor's first dim — exactly like the LM's
+ * symbolic "T".
+ */
+declare function generateQwen3_5VisionGraph(rawConfig: Record<string, unknown>): ModelGraph;
+//#endregion
+//#region src/gpu/executor.d.ts
+/**
+ * Safari/Metal workaround: shader variant alternation.
+ * Metal caches argument buffers per compiled function. When consecutive dispatches
+ * use the same WGSL code (same Metal function), Metal reuses the previous dispatch's
+ * argument buffer, ignoring setBindGroup(). We alternate between variant 0/1 of each
+ * shader (prepending `const _MV: u32 = Xu;`) to force different Metal function
+ * specializations, preventing argument buffer reuse.
+ *
+ * NOTE(review): the paragraph above describes an executor-internal workaround. None
+ * of the options declared below mentions shader variants, so it appears to have been
+ * attached to this interface by declaration emit rather than written to document it.
+ *
+ * ExecutorOptions itself is the options object taken by the Executor constructor.
+ */
+interface ExecutorOptions {
+  maxSeqLen: number;
+  /** KV cache kernel strategy. Defaults to "native-f16" when not specified. */
+  kvMode?: KvMode;
+  /**
+   * WebKit only: dispatches per command buffer, with at most one command
+   * buffer in flight (awaited). 1 (default) is the proven-correct floor on
+   * iPad; larger values are faster if this WebKit version keeps storage
+   * writes visible across dispatches within one submission. Sweepable via
+   * the ?group=N URL param.
+   */
+  webkitGroupSize?: number;
+}
+interface ForwardResult {
+  logits: Float32Array;
+}
+declare class Executor {
+  private ctx;
+  private graph;
+  private weightBuffers;
+  private activationBuffers;
+  private ssmStateBuffers;
+  private kvCacheBuffers;
+  /** Pre-allocated input_ids buffer (maxSeqLen * 4 bytes). */
+  private inputIdsBuffer;
+  /**
+   * CPU-resident Per-Layer-Embeddings (PLE) source for Gemma 4. The PLE table
+   * (`embed_tokens_per_layer`, [vocab, L*256]) is ~1.17GB at 4-bit and is kept
+   * OFF the GPU. Each forward step we gather + dequantize only the rows for the
+   * actual input tokens and upload a tiny [T, L*256] f32 buffer. See setPleSource.
+   */
+  private pleSource;
+  /** Reusable scratch for the dequantized PLE rows (resized on demand). */
+  private pleScratch;
+  /**
+   * One-time promise that materializes a cache-backed PLE table into heap. The
+   * table is read from CacheStorage on the FIRST forward — i.e. AFTER the GPU
+   * weight upload has completed, so the ~1.17 GB does not stack on top of the
+   * upload's transient allocations at the load-time memory high-water mark.
+   */
+  private pleReady;
+  /**
+   * Dummy GPU buffer bound to an otherwise-aliasing storage-read-write slot.
+   * Used by RoPE-Q-only nodes (Gemma 4 KV-shared layers) whose node lists the
+   * same tensor as input and output: the RoPE kernel always declares two
+   * read_write bindings (q, k), but with num_kv_heads=0 the k slot is never
+   * touched. WebGPU still rejects two read_write bindings aliasing one buffer,
+   * so we bind this throwaway buffer to the unused k slot. Allocated lazily.
+   */
+  private bindingScratchBuffer;
+  /** Readback buffer for logits. */
+  private logitsReadback;
+  /** GPU buffer for argmax result (1 u32). */
+  private argmaxResultBuffer;
+  /** Readback buffer for argmax result (1 u32). */
+  private argmaxReadback;
+  /** Readback ring for pipelined greedy decode (created lazily). */
+  private decodeReadbacks;
+  /**
+   * Staging buffer for uniform param updates.
+   * Safari/Metal has weaker visibility guarantees for queue.writeBuffer() to
+   * UNIFORM buffers — early writes get dropped when hundreds are queued.
+   * Instead, we pack all params into this STORAGE staging buffer (1 writeBuffer),
+   * then use encoder.copyBufferToBuffer to distribute to each uniform buffer.
+   * Copies are GPU-sequenced and guaranteed to complete before compute passes.
+   */
+  private uniformStagingBuffer;
+  private uniformStagingCapacity;
+  /** Dispatch entries for prefill (M>1): uses tiled matmul kernels. */
+  private dispatchEntries;
+  /** Dispatch entries for decode (M=1): uses K-parallel matvec kernels. */
+  private decodeEntries;
+  /** Argmax dispatch entry (created in initBindGroups). */
+  private argmaxEntry;
+  /** True when running on Safari/WebKit (needs multi-encoder submit). */
+  readonly needsMultiEncoder: boolean;
+  private maxSeqLen;
+  private kvMode;
+  private seqPos;
+  private webkitGroupSize;
+  private profileEnabled;
+  private readonly profileData;
+  private querySet;
+  private queryResolveBuf;
+  private queryReadbackBuf;
+  constructor(ctx: GPUContext, graph: ModelGraph, options: ExecutorOptions);
+  /**
+   * Register the CPU-resident Gemma 4 PLE table. The quantized table is kept in
+   * JS memory (NOT uploaded to a GPU buffer), and {@link forward} gathers +
+   * dequantizes only the rows for the current input tokens each step, uploading a
+   * small [T, width] f32 buffer into `targetTensor`. This is what keeps Gemma 4
+   * mobile-viable: resident GPU memory is just the active transformer weights.
+   */
+  setPleSource(src: {
+    packed: Uint32Array;
+    scales: Float32Array;
+    zeros: Float32Array;
+    width: number;
+    groupSize: number;
+    targetTensor: string;
+    cache?: {
+      cacheName: string;
+      packedKey: string;
+      scalesKey: string;
+      zerosKey: string;
+      packedLen: number;
+    };
+  }): void;
+  /**
+   * Materialize a cache-backed PLE table into heap exactly once. Deferred to the
+   * first forward (after GPU upload) so the ~1.17 GB table is not co-resident
+   * with the upload's transient allocations during the load-time memory peak.
+   */
+  private ensurePleLoaded;
+  /**
+   * Gather + dequantize the PLE rows for `inputIds` and upload them into the
+   * target activation buffer. Touches only T rows (T*width floats) — the full
+   * [vocab, width] quantized table never goes to the GPU.
+   */
+  private streamPleRows;
+  /**
+   * Stream weights to the GPU one tensor at a time, pulling each from the
+   * `WeightSource` only when it is about to be uploaded and dropping the
+   * reference immediately afterward.
+   *
+   * This is the property that bounds peak JS heap: with a cache-backed source
+   * (browser/mobile), only ONE tensor's bytes are materialized in heap at a time
+   * (read from CacheStorage in `source.get()`), uploaded to its GPU buffer, then
+   * released before the next tensor is fetched. The whole model is never
+   * co-resident in heap. With a heap-backed source (Node/desktop), behavior matches
+   * the old Map path (the source deletes each entry as it is consumed).
+   *
+   * Accepts either a `WeightSource` (new, streamed/async) or a plain Map
+   * (back-compat for callers that build a Map directly, e.g. Kani/Moonshine).
+   */
+  uploadWeights(source: WeightSource | Map<string, {
+    data: ArrayBufferView;
+    shape: number[];
+  }>): Promise<void>;
+  /**
+   * Synchronous Map upload (Node/desktop and the Kani/Moonshine sub-executors,
+   * which build small heap Maps). Deletes each entry as it is consumed to free
+   * the JS-side bytes once they are GPU-resident. Safe to call from a constructor.
+   */
+  uploadWeightsMap(weights: Map<string, {
+    data: ArrayBufferView;
+    shape: number[];
+  }>): void;
+  /**
+   * Build all pipelines and bind groups. Call ONCE after uploadWeights().
+   *
+   * Creates two dispatch entry arrays:
+   * - dispatchEntries: tiled matmul for prefill (any M)
+   * - decodeEntries: K-parallel matvec for decode (M=1)
+   */
+  initBindGroups(): void;
+  /**
+   * Run a forward pass. Uses matvec kernels for M=1 (decode), tiled for M>1 (prefill).
+   */
+  forward(inputIds: Uint32Array): Promise<ForwardResult>;
+  /**
+   * Profiling variant of the desktop dispatch path: one compute pass per dispatch,
+   * each bracketed by timestamp queries, so we get per-op GPU time. Accumulates
+   * into profileData by opType. Only runs under GERBIL_PROFILE with timestamp-query
+   * support — slower than the batched path (it measures relative cost, not tok/s).
+   */
+  private runProfiledDispatches;
+  /** Per-opType GPU time (ns) + dispatch count accumulated by GERBIL_PROFILE, hottest first. */
+  getProfile(): Array<{
+    opType: string;
+    ns: number;
+    count: number;
+  }>;
+  /** Clear accumulated profiler data (e.g. to drop warm-up tokens). */
+  resetProfile(): void;
+  /** GPU dispatches per decode token (post-fusion). On mobile this drives the
+   * submit-group count = ceil(dispatchCount / webkitGroupSize). */
+  get decodeDispatchCount(): number;
+  /** Device limit that gates the INT4 projection fusions (they need ≥9). If a
+   * device caps at 8 the dual/gated/swiglu-gated INT4 fusions silently fall back,
+   * inflating the decode dispatch count. */
+  get maxStorageBuffers(): number;
+  /**
+   * Profile ONE real decode step: times the actual `decodeEntries` (the kernels
+   * the pipelined greedy benchmark runs) with per-dispatch timestamps. Timing is
+   * token-independent, so pass any valid id; runs un-pipelined with a synchronous
+   * timestamp readback (measurement only, not for production decode). Argmax (one
+   * tiny dispatch) is intentionally excluded — it is not a hotspot target.
+   */
+  profileDecodeStep(tokenId: number): Promise<void>;
+  /**
+   * Run a single forward pass over `inputIds` and read back the L2-normalized
+   * embedding vector. Requires an embedding graph (one whose output tensor is
+   * "embedding", produced by the last-token-pool + L2-norm tail).
+   *
+   * Always runs in a fresh-state single pass (caller should reset() first):
+   * embeddings are non-autoregressive, so the whole sequence is one prefill.
+   */
+  embed(inputIds: Uint32Array): Promise<Float32Array>;
+  /**
+   * Generic one-shot dispatch for a non-LM graph: run every entry once over a
+   * single forward and read back `elemCount` f32 elements of the named output
+   * tensor. Used to execute the NanoCodec decoder graph (codes→PCM) — its ops use
+   * concrete lengths and it has no "logits" output, so the normal forward()
+   * logits-readback path does not apply. Caller writes any inputs (e.g. audio_codes)
+   * via writeInput() and reset()s first.
+   */
+  runGraphOutput(outputName: string, elemCount: number): Promise<Float32Array>;
+  /**
+   * Greedy decode step: forward + GPU argmax. Returns token ID directly.
+   * Always uses matvec kernels (M=1). Reads back 4 bytes instead of vocab_size*4.
+   */
+  forwardArgmax(inputIds: Uint32Array): Promise<number>;
+  /** KV-cache positions still available for decode steps. */
+  decodeCapacityRemaining(): number;
+  /** Number of decode steps that may be in flight in the pipelined path. */
+  static readonly PIPELINE_DEPTH = 2;
+  /**
+   * Pipelined greedy decode step (Dawn only — WebKit uses forwardArgmax).
+   *
+   * Encodes one full decode forward + argmax and submits WITHOUT awaiting
+   * completion. The input token is taken from `tokenId` for the first step
+   * after prefill; for subsequent steps (tokenId === null) the previous step's
+   * argmax result is copied into input_ids ON THE GPU, so the decode loop
+   * never blocks on a readback before submitting the next step. The argmax
+   * result is copied to a per-slot readback buffer, read later (one step
+   * behind) via readDecodeToken(slot).
+   *
+   * queue.writeBuffer is queue-ordered: uniform updates land after the
+   * previously submitted step's command buffer and before this one's, so
+   * shared uniform buffers are safe with multiple steps in flight.
+   */
+  submitGreedyDecodeStep(tokenId: number | null, slot: number): void;
+  /** Read back the token produced by the pipelined step that used `slot`. */
+  readDecodeToken(slot: number): Promise<number>;
+  reset(): void;
+  /**
+   * Diagnostic: dispatch ONLY the first kernel (EmbeddingInt4) using the
+   * production bind group, pipeline, and buffers — but in isolation (1 dispatch,
+   * no staging, fresh encoder). Compares against full forward pass to isolate
+   * whether the issue is the bind group/pipeline or the multi-dispatch context.
+   */
+  debugFirstDispatch(inputIds: Uint32Array): Promise<{
+    nodeId: string;
+    opType: string;
+    dispatchSize: [number, number, number];
+    output: Float32Array;
+  }>;
+  /**
+   * Diagnostic: run a single dispatch entry by index, in isolation.
+   * Call after debugFirstDispatch() to test whether entry[1] (RMSNorm)
+   * can read embed_out written by entry[0] (EmbeddingInt4).
+   */
+  debugDispatchEntry(entryIndex: number, T: number): Promise<{
+    nodeId: string;
+    opType: string;
+    output: Float32Array;
+  }>;
+  /**
+   * Diagnostic: compute the JS-side params for the first N decode entries
+   * WITHOUT dispatching. Shows what buildParams produces.
+   */
+  debugComputeParams(T: number, count?: number): Array<{
+    idx: number;
+    nodeId: string;
+    opType: string;
+    paramsU32: number[];
+    dispatchSize: [number, number, number];
+  }>;
+  /**
+   * Diagnostic: after a forward pass, read back output tensors at several points
+   * in the pipeline to find where data drops to zero.
+   */
+  debugPipelineProbe(T: number): Promise<Array<{
+    idx: number;
+    nodeId: string;
+    opType: string;
+    tensor: string;
+    sum: number;
+    first4: number[];
+    uniformParams?: number[];
+  }>>;
+  debugWriteBuffer(tensorName: string, data: ArrayBufferView): void;
+  /**
+   * Write a host-supplied activation input buffer (e.g. multimodal M-RoPE
+   * cos/sin, spliced vision embeddings, image row-map). The buffer must be a
+   * persistent activation tensor in the graph. Call before forward().
+   */
+  writeInput(tensorName: string, data: ArrayBufferView): void;
+  /** Write a host activation buffer at a byte offset (for per-row decode updates). */
+  writeInputAt(tensorName: string, data: ArrayBufferView, byteOffset: number): void;
+  /** True if the graph has a buffer with this name (multimodal-capability probe). */
+  hasBuffer(tensorName: string): boolean;
+  /** Current sequence position (number of tokens processed since reset). */
+  get currentSeqPos(): number;
+  debugReadBuffer(tensorName: string, maxElements?: number, byteOffset?: number): Promise<Float32Array>;
+  destroy(): void;
+  /**
+   * Allocate activation buffers with liveness-based reuse.
+   *
+   * One dedicated buffer per activation tensor at full maxSeqLen is ~2.3GB for
+   * Qwen3.5-0.8B at T=512 — over the iOS jetsam budget on its own. Instead,
+   * a buffer returns to a size-keyed pool once its tensor's last reader has
+   * executed, so concurrently-live tensors share a small working set.
+   *
+   * Graph outputs and tensors read before they are written (cross-forward
+   * state) keep dedicated buffers. Within a forward, dispatches execute in
+   * executionOrder on every path (single-pass Dawn, per-dispatch WebKit), and
+   * WebGPU synchronizes hazards between dispatches, so reuse is safe.
+   *
+   * Caveat: debugReadBuffer() on an intermediate tensor is only meaningful
+   * before a later op reuses its buffer (probes that stop mid-graph are fine).
+   */
+  private allocateActivationBuffers;
+  private allocateSSMStateBuffers;
+  private allocateKVCacheBuffers;
+  private resolveShapes;
+  private getBuffer;
+  /**
+   * Detect gate_proj + up_proj + SwiGLU patterns in decode entries and replace
+   * with a single fused SwiGLUMatVec dispatch. Saves 2 dispatches per MLP block.
+   */
+  private fuseSwiGLUDecodeEntries;
+  /**
+   * Fuse two adjacent INT4 projections that share the same input activation and
+   * the same K/N (e.g. q_proj+gate_proj and k_proj+v_proj in full-attention
+   * decode) into a single DualMatVecInt4 dispatch. Reads the shared input vector
+   * once and writes both projection outputs — removing one GPU round-trip (one
+   * submit+drain on Safari/iOS) per fused pair.
+   *
+   * Numerics are identical to running the two MatVecInt4 kernels separately
+   * (same dequant, same K-parallel reduction), so this is WebKit-safe: it reuses
+   * the proven INT4 matvec math and only merges two writes into one dispatch.
+   *
+   * Must run AFTER fuseSwiGLUDecodeEntries so the MLP gate+up pair (which is
+   * consumed by a SwiGLU node) is already collapsed and won't be matched here.
+   */
+  private fuseDualMatVecDecodeEntries;
+  /**
+   * Fuse the adjacent K-cache and V-cache appends in each full-attention layer
+   * into a single DualKVCacheAppend dispatch. Both are plain appends into caches
+   * sharing the same width and dst_offset (pure memcpys for f32 caches), so one
+   * dispatch with two src/dst buffers writes both — removing one GPU round-trip
+   * per layer.
+   *
+   * Supports f32, native-f16, and packed-f16 caches (the dual kernel mirrors the
+   * single-append kernel selected for the active kvMode). Numerically identical
+   * to the separate appends — WebKit-safe (the packed-f16 variant is the Safari
+   * path and uses pack2x16float, no `enable f16`).
+   */
+  private fuseDualKVCacheAppendEntries;
+  /**
+   * Fuse the attention SigmoidGate (attn_out * sigmoid(gate)) into the INT4
+   * o_proj that consumes it: a GatedMatVecInt4 reads attn_out and gate directly,
+   * applies the sigmoid gate to its input vector, and runs the projection in ONE
+   * dispatch — removing the standalone SigmoidGate (one round-trip per
+   * full-attention layer).
+   *
+   * Numerically identical to SigmoidGate→MatVecInt4 (same gate formula, same INT4
+   * dequant + reduction). Slight extra ALU: the gated input is recomputed per
+   * output column, but A reads hit L1 and the saved submit+drain dominates on
+   * mobile. WebKit risk: low — reuses the proven INT4 matvec, only the A vector
+   * is built from two reads + a sigmoid (no new reduction/barrier pattern).
+   *
+   * Runs after the dual fusions so it only sees the post-attention SigmoidGate.
+   */
+  private fuseGatedOProjDecodeEntries;
+  /**
+   * Fuse a standalone SwiGLU (silu(gate) * up) into the INT4 projection that
+   * consumes its output: a SwiGLUGatedMatVecInt4 reads gate and up directly,
+   * builds the gated input vector, and runs the projection in ONE dispatch.
+   * Targets the Mamba block's mamba_swiglu (silu(z) * norm_out) feeding out_proj
+   * — one round-trip saved per linear-attention layer.
+   *
+   * The MLP SwiGLU is already collapsed into a SwiGLUMatVec entry by
+   * fuseSwiGLUDecodeEntries (it has no surviving standalone SwiGLU node), so only
+   * the Mamba SwiGLU matches here. Numerically identical to SwiGLU→MatVecInt4;
+   * WebKit risk low (reuses the proven INT4 matvec, only the A vector changes).
+   */
+  private fuseSwiGLUGatedProjDecodeEntries;
+  /**
+   * Fuse two adjacent per-row RMSNorms sharing hidden_size + eps into a single
+   * DualRMSNorm dispatch (e.g. the per-head q_norm and k_norm in full-attention
+   * decode). One workgroup still handles one row; the fused grid just spans both
+   * inputs' rows, so each row's reduction is unchanged — numerically identical to
+   * two separate RMSNorm dispatches. One round-trip saved per fused pair.
+   *
+   * WebKit risk: low — same single-workgroup reduction as the proven RMSNorm
+   * kernel, only the row→input routing is added.
+   */
+  private fuseDualRMSNormDecodeEntries;
+  /**
+   * Gather buffer entries for a bind group, matching the kernel spec's binding layout.
+   * Uses the pre-allocated inputIdsBuffer for the "input_ids" tensor.
+   */
+  private gatherBuffers;
+  /** Lazily allocate a scratch storage buffer of at least `minBytes` bytes. */
+  private getBindingScratchBuffer;
+}
+//#endregion
+//#region src/gpu/kani-tts.d.ts
+/**
+ * KaniTTS — native text-to-speech engine for Gerbil's WebGPU backend.
+ *
+ * Kani-TTS-2 (nineninesix/kani-tts-2-en) is a two-stage TTS model that, like
+ * Moonshine, needs more than one graph:
+ *
+ *   1. CODEC-LM BACKBONE (LFM2-350M body): autoregressively emits NanoCodec audio
+ *      tokens (4 per frame) into the same vocab as text. Reuses LFM2's block math
+ *      with two KaniTTS2 deltas — frame-level position IDs (the 4 audio tokens of a
+ *      frame share a position) and learnable per-layer RoPE (α^(l)-scaled freqs) —
+ *      both folded host-side into per-layer cos/sin tables fed to the MRoPE op.
+ *   2. NANOCODEC DECODER (NVIDIA NeMo 22 kHz): FSQ dequant + causal HiFi-GAN conv
+ *      decoder → 22 kHz PCM. Validated bit-exact vs MLX (test-nanocodec-decode.mjs).
+ *
+ * The AR loop runs on the host with full-logit readback so each frame's 4 audio
+ * tokens are sampled per-codebook (constrained to the valid codebook window), then
+ * the collected codes are decoded once through the NanoCodec graph.
+ *
+ * Validated on Dawn (desktop) via scripts/engine/test-kani-speak.mjs.
+ */
+/** Options accepted by `KaniTTS.create()`. Every field is optional. */
+interface KaniTTSOptions {
+  /** Backbone repo (default nineninesix/kani-tts-2-en). */
+  repo?: string;
+  /** NanoCodec repo (default the NeMo 22 kHz MLX checkpoint). */
+  codecRepo?: string;
+  /** Repo revision to load — presumably a branch, tag, or commit; confirm against the loader. */
+  revision?: string;
+  /** Access token for the model hub — presumably a Hugging Face token; confirm against the loader. */
+  hfToken?: string;
+  /** Cache location for downloaded files — presumably a local directory; confirm against the loader. */
+  cacheDir?: string;
+  /** Max self-attn KV-cache length (prompt + generated). Default 2048. */
+  maxSeqLen?: number;
+  /**
+   * Progress callback, invoked with `(loaded, total, message)`.
+   * NOTE(review): the units of `loaded`/`total` (bytes vs. steps) are not
+   * visible in this declaration — confirm before documenting them.
+   */
+  onProgress?: (loaded: number, total: number, message: string) => void;
+}
+/** Per-call synthesis options accepted by `KaniTTS.speak()`. */
+interface SpeakOptions {
+  /** Language/accent tag, e.g. "en_us" (default). Prepended as "{tag}: {text}". */
+  languageTag?: string;
+  /** Sampling temperature (default 1.0). */
+  temperature?: number;
+  /** Top-p nucleus threshold (default 0.95). */
+  topP?: number;
+  /** Repetition penalty (default 1.1). */
+  repetitionPenalty?: number;
+  /** Max audio frames to generate (caps duration). Default unbounded (maxSeqLen). */
+  maxFrames?: number;
+  /**
+   * Override max generated tokens (default 3000).
+   * Note: `WebGPUEngine.speak()` declares its own inline options type, which
+   * does not include this field.
+   */
+  maxNewTokens?: number;
+}
+/** Value resolved by `KaniTTS.speak()`: the synthesized audio plus its size metadata. */
+interface SpeakResult {
+  /** Mono PCM in [-1, 1]. */
+  pcm: Float32Array;
+  /** Sample rate (22050). */
+  sampleRate: number;
+  /** Number of audio frames decoded. */
+  frames: number;
+  /** Audio duration in seconds. */
+  audioSeconds: number;
+}
+/**
+ * Kani-TTS-2 text-to-speech engine.
+ *
+ * The constructor is private: obtain an instance through the async
+ * `KaniTTS.create()` factory, synthesize with `speak()`, and call `destroy()`
+ * when finished. Private members are emitted without signatures in this
+ * declaration output, so only their doc comments describe them.
+ */
+declare class KaniTTS {
+  private ctx;
+  private loaded;
+  private tokenizer;
+  private cfg;
+  private rawConfig;
+  private maxSeqLen;
+  /** Backbone executor (built once; reused across speak() calls). */
+  private backboneExec;
+  /** The attention layer indices (carry learnable α) and their α values. */
+  private attnLayers;
+  private layerAlpha;
+  private headDim;
+  private ropeBase;
+  private _destroyed;
+  /** Architecture string identifying this engine. */
+  readonly architecture = "KaniTTS2ForCausalLM";
+  private constructor();
+  /** Async factory; resolves to a `KaniTTS` instance. All options are optional. */
+  static create(options?: KaniTTSOptions): Promise<KaniTTS>;
+  /** Write per-layer cos/sin for token rows [rowStart, rowStart+positions.length). */
+  private writeCosSin;
+  /**
+   * Synthesize speech for `text`. Returns 22 kHz mono PCM.
+   *
+   * Pipeline: build the [SOH]+text+[EOT,EOH] prompt → prefill the codec-LM →
+   * AR-decode 4-token frames (per-codebook constrained sampling) until end_of_speech
+   * → strip markers → codes → NanoCodec decode → PCM.
+   */
+  speak(text: string, opts?: SpeakOptions): Promise<SpeakResult>;
+  /**
+   * Autoregressive decode: from the prefill logits, emit one token per step —
+   * greedy for the structural markers ([SOA][SOS]) and per-codebook constrained
+   * sampling once in speech — collecting the audio tokens between SOS and EOS.
+   * Writes each step's per-layer cos/sin row (frame-level logical position) before
+   * the forward. Returns the collected audio tokens and whether EOS/cap was hit.
+   */
+  private runDecodeLoop;
+  /** Logical (frame-level) position of the LAST token in `seq`. */
+  private logicalPositionAt;
+  /** Greedy argmax over a logits row. */
+  private argmax;
+  /**
+   * Sample one audio token for codebook position `codebook`, constrained to that
+   * codebook's valid window [audio_tokens_start + 4032*c, +4032). Allows end_of_speech
+   * only at codebook 0 (frame boundary). Applies temperature, top-p, rep-penalty.
+   */
+  private sampleAudioToken;
+  /**
+   * Decode NanoCodec codes [groups, T] (group-major) → PCM.
+   *
+   * The decoder graph carries concrete lengths, and the upsampled conv activations
+   * for long clips overflow WebGPU's 65535 per-dimension dispatch cap. The decoder
+   * is fully causal with a small (≤ a few frames) receptive field, so we decode in
+   * frame chunks with a left-context lookback and keep only each chunk's own output
+   * samples — numerically identical to a single decode, but bounded per dispatch.
+   */
+  private decodeCodes;
+  /** Run the NanoCodec decoder graph for a single (bounded) code window → PCM. */
+  private decodeCodesWindow;
+  /**
+   * Tear down the engine.
+   * NOTE(review): presumably releases GPU resources and sets `_destroyed` —
+   * the effects are not visible in this declaration; confirm in the source.
+   */
+  destroy(): void;
+}
+//#endregion
+//#region src/gpu/moonshine-executor.d.ts
+/** Value resolved by `MoonshineEncoderExecutor.encode()`. */
+interface EncoderResult {
+  /** encoder_out [S_enc, hidden] (debug / parity). */
+  encoderOut: Float32Array;
+  /** Per-decoder-layer frozen K, indexed by layer. */
+  encK: Float32Array[];
+  /** Per-decoder-layer frozen V, indexed by layer. */
+  encV: Float32Array[];
+  /** Encoder length (frames). */
+  sEnc: number;
+}
+/**
+ * Executor for the Moonshine encoder graph: takes encoder weights via
+ * `uploadWeights()` and turns raw PCM into an `EncoderResult` via `encode()`.
+ *
+ * NOTE(review): `decLayers` is presumably the decoder layer count (the result's
+ * `encK`/`encV` are indexed by decoder layer), and `initBindGroups()` presumably
+ * must run after `uploadWeights()` and before `encode()` — neither is
+ * established by this declaration; confirm in the source.
+ */
+declare class MoonshineEncoderExecutor {
+  private ctx;
+  private graph;
+  private decLayers;
+  private hidden;
+  private weightBuffers;
+  private activationBuffers;
+  private dispatches;
+  constructor(ctx: GPUContext, graph: ModelGraph, decLayers: number);
+  /**
+   * Upload the encoder constants. `weights` holds canonical-named f32 tensors;
+   * only those referenced by the graph are uploaded (the decoder weights are
+   * uploaded into the decoder Executor separately).
+   */
+  uploadWeights(weights: Map<string, {
+    data: ArrayBufferView;
+    shape: number[];
+  }>): void;
+  initBindGroups(): void;
+  /** Run the conv frontend + encoder + K/V projection. `pcm` is raw 16kHz mono. */
+  encode(pcm: Float32Array): Promise<EncoderResult>;
+  /** Read back a named activation buffer after encode() (debug / parity checks). */
+  readActivation(name: string, maxElements?: number): Promise<Float32Array>;
+  destroy(): void;
+  /** Encoder frame count = first dim of encoder_out (resolved, numeric). */
+  private encoderFrames;
+  private resolveShapes;
+  private allocateActivationBuffers;
+  private readBack;
+  private gatherBuffers;
+}
+//#endregion
+//#region src/gpu/moonshine-stt.d.ts
+/**
+ * MoonshineSTT — native speech-to-text engine for Gerbil's WebGPU backend.
+ *
+ * Moonshine is an encoder-decoder ASR model with a raw-waveform conv frontend
+ * (no FFT / mel spectrogram). Unlike the causal-LM path, it needs two graphs:
+ *
+ *   1. ENCODER (run once per utterance): conv frontend → bidirectional transformer
+ *      → encoder hidden state, which is then projected through every decoder
+ *      layer's cross-attention k_proj/v_proj into FROZEN K/V buffers.
+ *   2. DECODER (autoregressive): causal self-attention with a growing KV-cache,
+ *      plus cross-attention into the frozen encoder K/V at every step.
+ *
+ * The conv frontend is length-static (Conv1dFull carries concrete L/Lout), so the
+ * encoder graph is regenerated per utterance from the input sample count, and the
+ * decoder graph is regenerated with the resulting encoder frame count (S_enc).
+ * Weights are uploaded once and reused across utterances via per-call executors.
+ *
+ * Validated on Dawn (desktop) via scripts/engine/test-moonshine-transcribe.mjs.
+ * The kernels are mobile-safe (≤16KB workgroup memory, clamped exp/tanh, no
+ * select(), no `enable f16`) and the executors use the WebKit submit/drain
+ * discipline, so the same path runs on iPad.
+ */
+/** Options accepted by `MoonshineSTT.create()`. Every field is optional. */
+interface MoonshineSTTOptions {
+  /** HF repo (default UsefulSensors/moonshine-base). */
+  repo?: string;
+  /** Repo revision to load — presumably a branch, tag, or commit; confirm against the loader. */
+  revision?: string;
+  /** Access token for the model hub — presumably a Hugging Face token; confirm against the loader. */
+  hfToken?: string;
+  /** Cache location for downloaded files — presumably a local directory; confirm against the loader. */
+  cacheDir?: string;
+  /** Progress callback, invoked with `(loaded, total, message)`. */
+  onProgress?: (loaded: number, total: number, message: string) => void;
+}
+/** Per-call options accepted by `MoonshineSTT.transcribe()`. */
+interface TranscribeOptions {
+  /** Stop after this many decoded tokens (default 194). */
+  maxNewTokens?: number;
+}
+/** Value resolved by `MoonshineSTT.transcribe()`. */
+interface TranscribeResult {
+  /** The detokenized transcript. */
+  text: string;
+  /** Decoded token ids (excluding the start token, including the trailing EOS). */
+  tokens: number[];
+  /** Number of encoder frames produced by the conv frontend. */
+  encoderFrames: number;
+  /** Audio duration in seconds (samples / 16000). */
+  audioSeconds: number;
+}
+/**
+ * Moonshine speech-to-text engine.
+ *
+ * The constructor is private: obtain an instance through the async
+ * `MoonshineSTT.create()` factory, then call `transcribe()` with raw 16 kHz mono
+ * PCM and `destroy()` when finished.
+ */
+declare class MoonshineSTT {
+  private ctx;
+  private weights;
+  private tokenizer;
+  private rawConfig;
+  private bosTokenId;
+  private eosTokenId;
+  private decoderStartTokenId;
+  private _destroyed;
+  /** HF architecture string, for parity with WebGPUEngine. */
+  readonly architecture = "MoonshineForConditionalGeneration";
+  private constructor();
+  /** Download + initialize a Moonshine STT engine. */
+  static create(options?: MoonshineSTTOptions): Promise<MoonshineSTT>;
+  /**
+   * Transcribe raw 16 kHz mono PCM. Runs the conv frontend + encoder once, then
+   * greedily AR-decodes with cross-attention into the frozen encoder K/V, stopping
+   * on EOS. Returns the detokenized transcript.
+   */
+  transcribe(pcm: Float32Array, opts?: TranscribeOptions): Promise<TranscribeResult>;
+  /**
+   * Tear down the engine.
+   * NOTE(review): presumably releases GPU resources — the effects are not
+   * visible in this declaration; confirm in the source.
+   */
+  destroy(): void;
+}
+//#endregion
+//#region src/gpu/vision-executor.d.ts
+/** Inputs consumed by `VisionExecutor.encode()`; N is the patch count (`numPatches`). */
+interface VisionInputs {
+  /** Flattened patches [N, patch_dim]. */
+  patches: Float32Array;
+  /** Bilinear-interpolated learned pos embeddings [N, hidden_size]. */
+  posEmbeds: Float32Array;
+  /** Precomputed rotary cos [N, head_dim]. */
+  cos: Float32Array;
+  /** Precomputed rotary sin [N, head_dim]. */
+  sin: Float32Array;
+  /** Number of patches (rows). */
+  numPatches: number;
+}
+/**
+ * Coarse stage callback for the ViT encode. Fires before each transformer layer
+ * (and the pre/post stages) so a host harness can localize a mobile GPU-process
+ * crash to a specific layer instead of seeing only "crashed after load". Kept
+ * synchronous and cheap; throwing is the caller's responsibility.
+ *
+ * `stage` names the phase being entered. `info.layer` / `info.total` are
+ * presumably the current layer index and the layer count — confirm against the
+ * executor. `WebGPUEngine.encodeImage()` declares a structurally identical
+ * inline type for its `onStage` parameter.
+ */
+type VisionStageCallback = (stage: string, info?: {
+  layer?: number;
+  total?: number;
+}) => void;
+/**
+ * Executor for a vision-tower graph. Exposes two encode entry points: `encode()`
+ * for the Qwen path and `encodeGemma4()` for the Gemma 4 ViT; the `gemma4`
+ * getter reports which tower this instance is.
+ *
+ * NOTE(review): `initBindGroups()` presumably must run after `uploadWeights()`
+ * and before either encode call — not established by this declaration; confirm.
+ */
+declare class VisionExecutor {
+  private ctx;
+  private graph;
+  private mergeUnit;
+  private weightBuffers;
+  private activationBuffers;
+  private dispatches;
+  private maxPatches;
+  /** Weight (B) names of MatMulBias nodes stored as f16 (empty without shader-f16). */
+  private f16WeightNames;
+  /** Runtime pooled-token count for the Gemma 4 ViT ("Np" dim); 0 for Qwen. */
+  private gemma4Np;
+  /** True when this graph is the Gemma 4 vision tower (uses "Np" + PoolMatMul). */
+  private readonly isGemma4;
+  constructor(ctx: GPUContext, graph: ModelGraph, maxPatches: number);
+  uploadWeights(weights: Map<string, {
+    data: ArrayBufferView;
+    shape: number[];
+  }>): void;
+  initBindGroups(): void;
+  /**
+   * Encode patches → merged image embeddings [Nm, out_hidden_size].
+   *
+   * `onStage` (optional) fires coarse phase breadcrumbs during the WebKit path so
+   * a host can localize a GPU-process crash to a specific layer.
+   */
+  encode(inputs: VisionInputs, onStage?: VisionStageCallback): Promise<{
+    embeds: Float32Array;
+    rows: number;
+    dim: number;
+  }>;
+  /**
+   * Encode patches through the Gemma 4 ViT → projected image tokens [Np, text_hidden].
+   *
+   * Distinct from the Qwen `encode()`: the Gemma graph has 5 inputs (patches,
+   * axial pos-embeds, axial rotary cos/sin, and a host-built [Np,N] pooling matrix)
+   * and its output rows (Np) are the pooled soft-token count, resolved from the
+   * pooling matrix rather than an N/mergeUnit ratio. Reuses the same dispatch
+   * machinery + WebKit per-dispatch-drain discipline.
+   */
+  encodeGemma4(inputs: {
+    patches: Float32Array;
+    posEmbeds: Float32Array;
+    cos: Float32Array;
+    sin: Float32Array;
+    poolMatrix: Float32Array;
+    numPatches: number;
+    numPooled: number;
+  }, onStage?: VisionStageCallback): Promise<{
+    embeds: Float32Array;
+    rows: number;
+    dim: number;
+  }>;
+  /** True if this executor is the Gemma 4 vision tower. */
+  get gemma4(): boolean;
+  /** Read back any named activation (debug). Must be called right after encode(). */
+  debugReadBuffer(name: string, maxElements?: number): Promise<Float32Array>;
+  destroy(): void;
+  /** Max pooled tokens for buffer sizing: maxPatches with no merge/pool ratio applied. */
+  private maxPooled;
+  private resolveShapes;
+  private allocateActivationBuffers;
+  private gatherBuffers;
+}
+//#endregion
+//#region src/gpu/index.d.ts
+/**
+ * Options accepted by `WebGPUEngine.create()`. Inherits every field of
+ * `LoadModelOptions` except `repo`, which is redeclared here as optional.
+ */
+interface WebGPUEngineOptions extends Omit<LoadModelOptions, "repo"> {
+  /**
+   * HuggingFace repo ID (e.g. "mlx-community/Qwen3.5-0.8B-4bit") or full URL.
+   * Optional — when omitted, a sensible default is chosen for the requested
+   * capability (text, vision, or embeddings). See {@link DEFAULT_MODELS}.
+   */
+  repo?: string;
+  /** Max sequence length (default: from model config, capped at 4096). */
+  maxSeqLen?: number;
+  /** Override KV mode: "f32", "native-f16", or "packed-f16". Auto-detected if omitted. */
+  kvMode?: KvMode;
+  /**
+   * Build the vision encoder (Qwen3.5 ViT) alongside the text model so
+   * `encodeImage()` can turn image patches into merged image-embedding tokens.
+   * Only valid for vision-capable checkpoints (Qwen3.5). Downloads the ~192MB
+   * vision tower. Default: false.
+   *
+   * NOTE(review): this text names only Qwen3.5, but the engine also declares a
+   * Gemma 4 multimodal path (`runMultimodalGemma4`) — confirm whether this flag
+   * covers Gemma 4 checkpoints too and update the wording if so.
+   */
+  enableVision?: boolean;
+  /** Max patches the vision encoder can process in one call (default 4096). */
+  maxVisionPatches?: number;
+}
+/** Value resolved by `WebGPUEngine.encodeImage()`. */
+interface EncodeImageResult {
+  /** Merged image-embedding tokens, row-major [rows * dim]. */
+  embeds: Float32Array;
+  /** Number of merged tokens (numPatches / spatial_merge_size^2). */
+  rows: number;
+  /** Embedding dimension (out_hidden_size, 1024 for Qwen3.5). */
+  dim: number;
+}
+/**
+ * Options accepted by `WebGPUEngine.embed()`. Which prefix fields apply depends
+ * on the embedding model: `instruction` for Qwen3-Embedding, `taskType` /
+ * `taskPrompt` for EmbeddingGemma.
+ */
+interface EmbedOptions {
+  /**
+   * Instruction prefix for query embeddings (Qwen3-Embedding convention:
+   * "Instruct: {task}\nQuery:{text}"). Omit for document embeddings.
+   */
+  instruction?: string;
+  /**
+   * EmbeddingGemma task prefix. The model is asymmetric: queries and documents
+   * use different prefixes (`task: search result | query: ` vs `title: none |
+   * text: `). Pass "query" for search queries and "document" for the corpus
+   * being searched. Defaults to "query" for EmbeddingGemma when omitted. Ignored
+   * by non-Gemma embedding models (use `instruction` for Qwen3-Embedding).
+   */
+  taskType?: "query" | "document";
+  /**
+   * Override the raw task prefix prepended to the text (EmbeddingGemma). When
+   * set, takes precedence over `taskType`. Use for non-retrieval tasks, e.g.
+   * "task: clustering | query: " or "task: classification | query: ".
+   */
+  taskPrompt?: string;
+  /** Max tokens to encode (longer inputs are truncated). Default: model context, capped at maxSeqLen. */
+  maxTokens?: number;
+}
+/**
+ * Options accepted by `WebGPUEngine.generate()` and `stream()`. Also the base of
+ * `GenerateObjectOptions` and of the options taken by `describeImage()`.
+ */
+interface GenerateOptions {
+  /** Max tokens to generate (default: 512). */
+  maxTokens?: number;
+  /** Stop generation on these strings. */
+  stopSequences?: string[];
+  /** Sampling parameters. */
+  sampling?: SamplingParams;
+  /** System prompt to prepend. */
+  systemPrompt?: string;
+  /** Callback for each generated token (for streaming). */
+  onToken?: (token: string) => void;
+}
+/**
+ * Value resolved by `WebGPUEngine.generate()` and `describeImage()`, and the
+ * return value of the `stream()` async generator.
+ */
+interface GenerateResult {
+  /** Generated text. */
+  text: string;
+  /** Number of tokens generated. */
+  tokensGenerated: number;
+  /** Tokens per second. */
+  tokensPerSecond: number;
+  /** Total generation time in ms. */
+  totalTime: number;
+  /** Why generation stopped. */
+  finishReason: "eos" | "max_tokens" | "stop_sequence";
+  /** Thinking content if model produced it (future). */
+  thinking?: string;
+}
+/**
+ * A minimal JSON-schema-ish shape used by {@link WebGPUEngine.generateObject} to
+ * validate generated output without pulling in a schema library. Only `required`
+ * and `properties` are inspected (presence of required keys). Pass a predicate
+ * function instead for arbitrary validation.
+ *
+ * Supplied through the `schema` field of `GenerateObjectOptions`, whose type
+ * (`ObjectValidator`) accepts either this shape or a predicate.
+ */
+interface ObjectSchema {
+  /** Keys that must be present on the parsed object. */
+  required?: string[];
+  /** Property descriptors (only the key set is used for validation). */
+  properties?: Record<string, unknown>;
+  /** Allow extra schema fields (type, etc.) without TS complaints. */
+  [key: string]: unknown;
+}
+/**
+ * Validator for {@link WebGPUEngine.generateObject}: a schema object or predicate.
+ * The predicate form receives the parsed value and returns `false` to reject it.
+ */
+type ObjectValidator<T = unknown> = ObjectSchema | ((o: T) => boolean);
+/**
+ * Options accepted by `WebGPUEngine.generateObject()`: every `GenerateOptions`
+ * field plus the validation target and retry budget.
+ */
+interface GenerateObjectOptions extends GenerateOptions {
+  /**
+   * Validation target. Either a predicate `(o) => boolean` or a minimal
+   * JSON-schema-ish object with `required`/`properties` (required keys must
+   * exist). Omit to only require syntactically valid JSON.
+   */
+  schema?: ObjectValidator;
+  /**
+   * Max RETRIES after the first attempt (so up to `maxRetries + 1` generations).
+   * Default: 4.
+   */
+  maxRetries?: number;
+}
+/**
+ * Value resolved by `WebGPUEngine.generateObject()`. `T` is the caller's
+ * expected object type.
+ */
+interface GenerateObjectResult<T = unknown> {
+  /** The parsed + validated object (or array). */
+  object: T;
+  /** The raw model text the object was extracted from. */
+  text: string;
+  /** How many generation attempts it took (1 = first try). */
+  attempts: number;
+}
+/**
+ * One entry of `IntegrityCheckResult.checks`, as produced by
+ * `WebGPUEngine.integrityCheck()` (which reads back key weight tensors and runs
+ * a single forward pass, returning checksums for comparison with a reference).
+ *
+ * NOTE(review): the field meanings below are inferred from names only — confirm
+ * against the integrityCheck() implementation before relying on them:
+ *  - `label` / `length`: presumably the checked tensor's name and element count.
+ *  - `sum`, `first4`, `argmax`, `maxVal`: presumably the checksum signals.
+ *  - `match`, `refSum`, `refArgmax`: presumably populated only when a reference
+ *    value is available to compare against.
+ *  - `error`, `note`: presumably free-form diagnostics.
+ */
+interface IntegrityCheckEntry {
+  label: string;
+  length: number;
+  sum: number;
+  first4: number[];
+  argmax: number;
+  maxVal: number;
+  match?: "PASS" | "FAIL";
+  refSum?: number;
+  refArgmax?: number;
+  error?: string;
+  note?: string;
+}
+/**
+ * Value resolved by `WebGPUEngine.integrityCheck()`.
+ * NOTE(review): `allPass` is presumably true only when no entry in `checks`
+ * failed — confirm how entries without a `match` value are counted.
+ */
+interface IntegrityCheckResult {
+  checks: IntegrityCheckEntry[];
+  allPass: boolean;
+}
+/**
+ * The main WebGPU inference engine.
+ *
+ * Usage:
+ *   const engine = await WebGPUEngine.create({ repo: "Qwen/Qwen3.5-0.8B" });
+ *   const result = await engine.generate("Hello!");
+ *   console.log(result.text);
+ *   engine.destroy();
+ */
+declare class WebGPUEngine {
+  private ctx;
+  private executor;
+  private tokenizer;
+  private _destroyed;
+  private _isEmbedding;
+  /** HF architecture string (e.g. "Gemma3TextModel", "Qwen3ForCausalLM"). */
+  private _architecture;
+  /** Vision encoder (built only when enableVision and the model is vision-capable). */
+  private visionExecutor;
+  /** Raw vision_config (for host preprocessing of grids). */
+  private visionConfig;
+  /** Raw pos_embed.weight table for bilinear interpolation. */
+  private visionPosEmbedTable;
+  /** True when the LM graph was built with the multimodal (M-RoPE + splice) path. */
+  private _multimodalGraph;
+  /** Raw config.json (for M-RoPE params: mrope_section, rope_theta, partial factor). */
+  private rawConfig;
+  /** Effective max sequence length (cos/sin table coverage). */
+  private maxSeqLen;
+  /** Original create() options (used to lazily spin up the Kani-TTS engine for speak()). */
+  private _createOptions;
+  /** Lazily-created Kani-TTS engine (codec-LM + NanoCodec) backing speak(). */
+  private _kaniTTS;
+  /**
+   * WebKit group-size probe state. When true, a candidate group size is being
+   * tried this page-load and must be promoted (or capped) after the FIRST
+   * successful forward produces non-corrupt logits. Goes false once handled so
+   * promotion runs at most once per session. Always false on Dawn/node.
+   */
+  private _groupProbePending;
+  /** Model capabilities (text, vision, moe). */
+  readonly capabilities: ModelCapabilities;
+  /** Model architecture config. */
+  readonly config: ModelArchConfig;
+  private constructor();
+  /** True if this engine has a vision encoder built (use encodeImage()). */
+  get hasVision(): boolean;
+  /** Per-opType decode GPU-time breakdown (only populated under GERBIL_PROFILE). */
+  getDecodeProfile(): Array<{
+    opType: string;
+    ns: number;
+    count: number;
+  }>;
+  /** Clear accumulated decode profiler data (e.g. to drop warm-up tokens). */
+  resetDecodeProfile(): void;
+  /** Profile ONE real decode step (the pipelined-greedy kernels). Token-independent
+   * timing — pass any valid id. Only meaningful under GERBIL_PROFILE. */
+  profileDecodeStep(tokenId: number): Promise<void>;
+  /** Decode dispatch count + the device's storage-buffer limit (which gates the
+   * INT4 projection fusions). Lets the iPad runner report whether fusions applied
+   * on-device or silently fell back (8 < 9 ⇒ more dispatches ⇒ more mobile drains). */
+  getDecodeStats(): {
+    dispatches: number;
+    maxStorageBuffers: number;
+  };
+  /**
+   * Write a coarse crash-phase breadcrumb that survives a GPU-process kill / page
+   * reload. The iPad harness reads `localStorage["gerbil-crash-phase"]` after a
+   * crash; without these, a describe-time crash only shows the last load phase
+   * ("engine:ready"). The describe path tags vit-encode / splice / text-decode so
+   * the next run shows WHERE it died, not just "crashed after load".
+   */
+  private setPhase;
+  /** True if this engine was loaded as an embedding model (use embed(), not generate()). */
+  get isEmbedding(): boolean;
+  /**
+   * WebKit group-size probe promotion hook. Runs at most once per session, after
+   * the FIRST forward completes without the page dying. If the page had crashed
+   * at this group size, this code never runs and the localStorage breadcrumb
+   * (left by the resolver) caps the device on the next load — that is what makes
+   * the probe survive the crash class. Here we additionally handle the
+   * wrong-output class by inspecting the first forward's logits for corruption
+   * (NaN / Inf / all-zero / all-same), reusing the same signals as integrityCheck().
+   */
+  private maybePromoteGroupProbe;
+  /**
+   * Create and initialize a WebGPUEngine.
+   *
+   * Downloads the model from HuggingFace, compiles shaders, uploads weights.
+   */
+  static create(options?: WebGPUEngineOptions): Promise<WebGPUEngine>;
+  /**
+   * Encode an image (already preprocessed into patches) into merged
+   * image-embedding tokens of dim `out_hidden_size` (1024 for Qwen3.5).
+   *
+   * This is the VISION ENCODER ONLY — it returns the image tokens; it does not
+   * splice them into a text sequence or apply M-RoPE (that is the LM-side
+   * integration phase). Requires `enableVision: true` at create() on a
+   * vision-capable checkpoint.
+   *
+   * @param patches Flattened patches, row-major [numPatches, patch_dim].
+   *   patch_dim = in_channels * temporal_patch_size * patch_size^2 (1536 for Qwen3.5).
+   *   Patches must already be ordered in spatial_merge_size×spatial_merge_size
+   *   groups (as the HF image processor emits them).
+   * @param gridTHW The (temporal, height, width) patch-grid dims for the image.
+   *   numPatches must equal t*h*w.
+   */
+  encodeImage(patches: Float32Array, gridTHW: [number, number, number], onStage?: (stage: string, info?: {
+    layer?: number;
+    total?: number;
+  }) => void): Promise<EncodeImageResult>;
+  /** Resolve M-RoPE params from rawConfig: rope_dim, theta, mrope_section. */
+  private mropeParams;
+  /**
+   * Write the M-RoPE cos/sin (token order) + image row-map for a prefill of
+   * `positionIds3` ([3, seq]). `rowMap[i]` = vision-buffer row for image tokens,
+   * -1 for text. Returns the logical position of the last token (for decode).
+   */
+  private writeMRoPEPrefill;
+  /**
+   * Write a single decode-step M-RoPE cos/sin row at table slot `seqPos` for a
+   * text token at logical position `logicalPos`, plus a -1 row-map entry.
+   */
+  private writeMRoPEDecodeStep;
+  /** Write linear-position M-RoPE inputs for a pure-text forward (no image). */
+  private writeMRoPELinearText;
+  /**
+   * Generate text from a prompt.
+   */
+  generate(prompt: string | ChatMessage[], options?: GenerateOptions): Promise<GenerateResult>;
+  /**
+   * Generate a STRUCTURED object: generate text, extract the first JSON
+   * object/array, parse it, validate it, and RETRY until it is valid (on-device
+   * tokens are free, so re-rolling a malformed JSON is cheap).
+   *
+   * Extraction is tolerant: prose, markdown, and ```json code fences are
+   * stripped, then the outermost balanced `{...}` or `[...]` is matched and
+   * `JSON.parse`d. Validation is one of:
+   *  - a predicate `(o) => boolean` (return false to reject),
+   *  - a minimal JSON-schema-ish object with `required` (those keys must exist),
+   *  - nothing (only valid JSON is required).
+   *
+   * On each retry the prompt is nudged with a terse "return ONLY valid JSON…"
+   * instruction (including the required-key shape when known). Throws a clear
+   * error if it never validates within `maxRetries + 1` attempts.
+   *
+   * ```ts
+   * const { object } = await engine.generateObject(
+   *   'Extract {name, age} from: "I am Sarah, 28"',
+   *   { schema: { required: ["name", "age"] } },
+   * );
+   * // object === { name: "Sarah", age: 28 }
+   * ```
+   *
+   * @typeParam T Expected object type (not enforced at runtime — validate via schema).
+   */
+  generateObject<T = unknown>(prompt: string, options?: GenerateObjectOptions): Promise<GenerateObjectResult<T>>;
+  /**
+   * Text-to-speech: text → 22 kHz PCM via Kani-TTS-2 (LFM2-350M codec-LM + NVIDIA
+   * NeMo NanoCodec). Returns `{ pcm: Float32Array, sampleRate: 22050 }`.
+   *
+   * Runs the full pipeline: the codec-LM backbone autoregressively emits NanoCodec
+   * audio tokens (4 per frame, frame-level positions + learnable per-layer RoPE),
+   * then the bit-exact NanoCodec decoder (FSQ + causal HiFi-GAN) turns the codes
+   * into PCM. The heavy lifting lives in {@link KaniTTS} (src/gpu/kani-tts.ts); this
+   * lazily constructs that engine on first use (downloading the NanoCodec codec
+   * checkpoint alongside the backbone).
+   *
+   * Requires a Kani-TTS-2 checkpoint (architecture "KaniTTS2ForCausalLM").
+   */
+  speak(text: string, options?: {
+    languageTag?: string;
+    temperature?: number;
+    topP?: number;
+    repetitionPenalty?: number;
+    maxFrames?: number;
+  }): Promise<{
+    pcm: Float32Array;
+    sampleRate: number;
+    frames: number;
+    audioSeconds: number;
+  }>;
+  /**
+   * Describe an image: image-in → text-out. Runs the vision encoder, splices the
+   * merged image tokens into a text prompt, applies multimodal M-RoPE, and
+   * generates a description. Requires `enableVision: true` at create().
+   *
+   * Image input forms:
+   *  - `{ pixels, width, height }` — decoded RGB (HWC, 0..255), host-preprocessed
+   *    (smart-resize/normalize/patchify) to match the HF image processor.
+   *  - `{ patches, gridTHW }` — already-built [N,1536] patch tensor + grid (e.g.
+   *    HF-exact pixel_values from a reference; skips host preprocessing).
+   */
+  describeImage(image: {
+    pixels: Float32Array | Uint8ClampedArray | Uint8Array;
+    width: number;
+    height: number;
+  } | {
+    patches: Float32Array;
+    gridTHW: [number, number, number];
+  }, prompt?: string, options?: GenerateOptions & {
+    imageProcessor?: ImageProcessorConfig;
+  }): Promise<GenerateResult>;
+  /**
+   * Prepare the multimodal prefill: upload vision embeds, build the image row-map
+   * and 3D M-RoPE cos/sin, reset state, and write all host inputs. Returns the
+   * input ids and the post-image logical cursor for decode. Does NOT run forward.
+   */
+  private prepareMultimodalPrefill;
+  /**
+   * Gemma 4 multimodal prefill + decode. Unlike Qwen3.5 (M-RoPE), Gemma 4 uses
+   * STANDARD sequential 1D RoPE computed inside each layer from the KV write
+   * position, so there are no host cos/sin inputs and decode positions are simply
+   * the running seqPos — identical to plain text generation. We only upload the
+   * merged vision embeds + an image-token row-map (EmbedSplice scatters them into
+   * the image_token rows) before the forward pass.
+   */
+  private runMultimodalGemma4;
+  /** Prepare + prefill + decode for a fully-specified multimodal token sequence. */
+  private runMultimodal;
+  /**
+   * Debug: run ONLY the multimodal prefill for an explicit token sequence and
+   * return the spliced input embeddings [seq, hidden] + first-token logits. Lets
+   * tests compare the fused text+vision stream and M-RoPE numerically vs HF
+   * without the decode loop overwriting intermediate buffers.
+   */
+  debugMultimodalPrefill(patches: Float32Array, gridTHW: [number, number, number], inputIds: number[]): Promise<{
+    splicedEmbeds: Float32Array;
+    logits: Float32Array;
+    seq: number;
+  }>;
+  /**
+   * Internal: run prefill (assumes M-RoPE/splice inputs already written) + decode,
+   * with decode logical positions starting at `decodeStartPos`. Used by
+   * describeImage so the post-image cursor is honored.
+   */
+  private generateFromPrepared;
+  /**
+   * Embed text into an L2-normalized vector. The pooling strategy depends on the
+   * model: Qwen3-Embedding uses last-token (EOS-position) pooling, while
+   * EmbeddingGemma (Gemma3 encoder) uses mean pooling over all tokens followed by
+   * a 2-layer Dense head. Requires an embedding model (loaded with
+   * { embedding: true }).
+   *
+   * The returned Float32Array has unit L2 norm, so cosine similarity reduces to a
+   * dot product. Length is the model's embedding dim (768 for EmbeddingGemma;
+   * config.hidden_size for Qwen3-Embedding).
+   *
+   * EmbeddingGemma is asymmetric — pass `{ taskType: "query" }` for search
+   * queries and `{ taskType: "document" }` for the corpus, or a raw
+   * `{ taskPrompt }` for other tasks (clustering/classification/STS).
+   */
+  embed(text: string, options?: EmbedOptions): Promise<Float32Array>;
+  /**
+   * Generate text as an async iterator (streaming).
+   *
+   * Uses the onToken callback from generate() to push tokens into a queue
+   * that the async generator yields from. The generator returns the full
+   * GenerateResult when generation completes.
+   *
+   * Usage:
+   *   const gen = engine.stream("Hello!");
+   *   for await (const token of gen) {
+   *     process.stdout.write(token);
+   *   }
+   *   const result = gen.next(); // { done: true, value: GenerateResult }
+   */
+  stream(prompt: string | ChatMessage[], options?: GenerateOptions): AsyncGenerator<string, GenerateResult, undefined>;
+  /**
+   * Debug: read back a named GPU buffer (weight or activation).
+   * Call after forward() to inspect intermediate values.
+   */
+  debugReadBuffer(tensorName: string, maxElements?: number): Promise<Float32Array>;
+  /**
+   * Run GPU diagnostics (buffer integrity, compute, shared memory).
+   * Useful for isolating Safari/WebKit-specific WebGPU issues.
+   */
+  diagnose(): Promise<GPUDiagnosticResult>;
+  /**
+   * Run GPU diagnostics without loading a model.
+   * Quick way to check if WebGPU is working correctly on this device.
+   */
+  static quickDiagnose(): Promise<GPUDiagnosticResult>;
+  /**
+   * Run a raw forward pass (no tokenization/chat template).
+   * Returns logits for the last token.
+   */
+  rawForward(inputIds: Uint32Array): Promise<{
+    logits: Float32Array;
+  }>;
+  /**
+   * Reset executor state (SSM, positions, etc.)
+   */
+  resetState(): void;
+  /**
+   * Encode text to token IDs (useful for debugging / token counting).
+   */
+  encode(text: string): number[];
+  /**
+   * Decode token IDs to text.
+   */
+  decode(ids: number[], skipSpecialTokens?: boolean): string;
+  /**
+   * Integrity check: reads back key weight tensors and runs a single forward pass,
+   * returning checksums for comparison against a known-good reference (Dawn/Node.js).
+   *
+   * Use this to isolate Safari/iPad corruption:
+   * - If weights mismatch → fetch/download pipeline is corrupt
+   * - If weights match but logits mismatch → kernel computation bug on Metal
+   *
+   * Resets executor state before and after (safe to call anytime).
+   *
+   * @returns Promise resolving to an IntegrityCheckResult carrying the
+   *   checksums described above.
+   */
+  integrityCheck(): Promise<IntegrityCheckResult>;
+  /**
+   * Destroy the engine and free all GPU resources.
+   *
+   * Synchronous; takes no arguments and returns nothing.
+   * NOTE(review): the engine is presumably unusable afterwards — confirm
+   * against the implementation.
+   */
+  destroy(): void;
+  private checkDestroyed; // Private member; its type is omitted from the emitted declaration. Presumably guards calls made after destroy() — confirm in the implementation.
+}
+//#endregion
+export { buildGemma4RotaryCosSin as $, parseMoonshineConfig as A, resolveGemma4VisionInfo as B, SpeakResult as C, ModelArchConfig as Ct, generateMoonshineDecoderGraph as D, MOONSHINE_REMAINING_WORK as E, Gemma4VisionGraphInfo as F, Gemma4VisionPositionTensors as G, resolveDefaultRepo as H, dequantizeGemma4VisionProjection as I, QWEN3_5_IMAGE_PROCESSOR as J, ImageProcessorConfig as K, dequantizeMLXProjection as L, generateKaniTtsGraph as M, generateNanoCodecDecoderGraph as N, generateMoonshineEncoderGraph as O, parseKaniConfig as P, buildGemma4PosEmbeds as Q, generateGemma4VisionGraph as R, SpeakOptions as S, KvMode as St, generateQwen3_5VisionGraph as T, GEMMA4_IMAGE_PROCESSOR as U, DEFAULT_MODELS as V, Gemma4VisionGridConfig as W, VisionPositionTensors as X, VisionGridConfig as Y, buildGemma4PoolMatrix as Z, TranscribeOptions as _, ChatMessage as _t, GenerateOptions as a, buildRotaryCosSin as at, KaniTTS as b, GraphDType as bt, IntegrityCheckResult as c, preprocessImage as ct, WebGPUEngine as d, SamplingParams as dt, buildGemma4VisionPositionTensors as et, WebGPUEngineOptions as f, LoadedKaniTTS as ft, MoonshineSTTOptions as g, loadMoonshine as gt, MoonshineSTT as h, loadModel as ht, GenerateObjectResult as i, buildPositionIds as it, audioTokensToCodes as j, moonshineEncoderFrames as k, ObjectSchema as l, preprocessImageGemma4 as lt, VisionInputs as m, loadKaniTTS as mt, EncodeImageResult as n, buildMRoPEPositionIds as nt, GenerateResult as o, buildVisionPositionTensors as ot, VisionExecutor as p, LoadedMoonshine as pt, PreprocessedImage as q, GenerateObjectOptions as r, buildPosEmbeds as rt, IntegrityCheckEntry as s, mropeFreqDims as st, EmbedOptions as t, buildMRoPECosSin as tt, ObjectValidator as u, smartResize as ut, TranscribeResult as v, GPUDiagnosticResult as vt, Executor as w, ModelCapabilities as wt, KaniTTSOptions as x, KVDType as xt, MoonshineEncoderExecutor as y, initGPU as yt, patchGemma4VisionClips as z };
+//# sourceMappingURL=index-jEAL2s-A.d.mts.map