@tryhamster/gerbil 1.0.0-rc.8 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179)
  1. package/LICENSE +1 -1
  2. package/README.md +247 -84
  3. package/dist/architectures-C1I5V3Dt.mjs +6070 -0
  4. package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
  5. package/dist/browser/index.d.ts +264 -588
  6. package/dist/browser/index.d.ts.map +1 -1
  7. package/dist/browser/index.js +585 -2334
  8. package/dist/browser/index.js.map +1 -1
  9. package/dist/cli.mjs +625 -1098
  10. package/dist/cli.mjs.map +1 -1
  11. package/dist/defaults-9komdrbY.mjs +24 -0
  12. package/dist/defaults-9komdrbY.mjs.map +1 -0
  13. package/dist/frameworks/express.d.mts +1 -3
  14. package/dist/frameworks/express.d.mts.map +1 -1
  15. package/dist/frameworks/express.mjs +7 -7
  16. package/dist/frameworks/express.mjs.map +1 -1
  17. package/dist/frameworks/fastify.d.mts +1 -1
  18. package/dist/frameworks/fastify.d.mts.map +1 -1
  19. package/dist/frameworks/fastify.mjs +3 -3
  20. package/dist/frameworks/fastify.mjs.map +1 -1
  21. package/dist/frameworks/hono.d.mts +1 -1
  22. package/dist/frameworks/hono.d.mts.map +1 -1
  23. package/dist/frameworks/hono.mjs +4 -4
  24. package/dist/frameworks/hono.mjs.map +1 -1
  25. package/dist/frameworks/next.d.mts +3 -2
  26. package/dist/frameworks/next.d.mts.map +1 -1
  27. package/dist/frameworks/next.mjs +4 -4
  28. package/dist/frameworks/next.mjs.map +1 -1
  29. package/dist/frameworks/react.d.mts +1 -1
  30. package/dist/frameworks/trpc.d.mts +1 -1
  31. package/dist/frameworks/trpc.d.mts.map +1 -1
  32. package/dist/frameworks/trpc.mjs +4 -4
  33. package/dist/frameworks/trpc.mjs.map +1 -1
  34. package/dist/gerbil-BHrJJIa4.mjs +1656 -0
  35. package/dist/gerbil-BHrJJIa4.mjs.map +1 -0
  36. package/dist/gerbil-BT9fCydo.d.mts +488 -0
  37. package/dist/gerbil-BT9fCydo.d.mts.map +1 -0
  38. package/dist/gerbil-DomNfIr1.mjs +4 -0
  39. package/dist/gpu/hooks.d.mts +520 -0
  40. package/dist/gpu/hooks.d.mts.map +1 -0
  41. package/dist/gpu/hooks.mjs +1188 -0
  42. package/dist/gpu/hooks.mjs.map +1 -0
  43. package/dist/gpu/index.d.mts +2 -0
  44. package/dist/gpu/index.mjs +6 -0
  45. package/dist/gpu-33qCAtHW.mjs +3615 -0
  46. package/dist/gpu-33qCAtHW.mjs.map +1 -0
  47. package/dist/index-Dgmb2kE3.d.mts +245 -0
  48. package/dist/index-Dgmb2kE3.d.mts.map +1 -0
  49. package/dist/index-jEAL2s-A.d.mts +2022 -0
  50. package/dist/index-jEAL2s-A.d.mts.map +1 -0
  51. package/dist/index.d.mts +22 -487
  52. package/dist/index.d.mts.map +1 -1
  53. package/dist/index.mjs +13 -8
  54. package/dist/index.mjs.map +1 -1
  55. package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
  56. package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
  57. package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
  58. package/dist/integrations/ai-sdk.d.mts +75 -6
  59. package/dist/integrations/ai-sdk.d.mts.map +1 -1
  60. package/dist/integrations/ai-sdk.mjs +131 -15
  61. package/dist/integrations/ai-sdk.mjs.map +1 -1
  62. package/dist/integrations/langchain.d.mts +1 -1
  63. package/dist/integrations/langchain.d.mts.map +1 -1
  64. package/dist/integrations/langchain.mjs +5 -5
  65. package/dist/integrations/langchain.mjs.map +1 -1
  66. package/dist/integrations/llamaindex.d.mts +1 -1
  67. package/dist/integrations/llamaindex.d.mts.map +1 -1
  68. package/dist/integrations/llamaindex.mjs +5 -5
  69. package/dist/integrations/llamaindex.mjs.map +1 -1
  70. package/dist/integrations/mcp-client.mjs +3 -3
  71. package/dist/integrations/mcp-client.mjs.map +1 -1
  72. package/dist/integrations/mcp.d.mts +3 -2
  73. package/dist/integrations/mcp.d.mts.map +1 -1
  74. package/dist/integrations/mcp.mjs +5 -5
  75. package/dist/{mcp-BvbriaBy.mjs → mcp-1DaMsaBc.mjs} +4 -4
  76. package/dist/mcp-1DaMsaBc.mjs.map +1 -0
  77. package/dist/memory/index.d.mts +3 -0
  78. package/dist/memory/index.mjs +6 -0
  79. package/dist/memory-D1P7Tmda.mjs +4 -0
  80. package/dist/memory-DVN0MnIG.mjs +132 -0
  81. package/dist/memory-DVN0MnIG.mjs.map +1 -0
  82. package/dist/memory-Dj0J1v88.mjs +294 -0
  83. package/dist/memory-Dj0J1v88.mjs.map +1 -0
  84. package/dist/moonshine-stt-BLyVoRpB.mjs +4 -0
  85. package/dist/moonshine-stt-v_P_Ci_m.mjs +11936 -0
  86. package/dist/moonshine-stt-v_P_Ci_m.mjs.map +1 -0
  87. package/dist/{one-liner-s-lD8rCC.mjs → one-liner-DnQn7HJK.mjs} +14 -16
  88. package/dist/one-liner-DnQn7HJK.mjs.map +1 -0
  89. package/dist/repl-jV5gcJFA.mjs +9 -0
  90. package/dist/skills/index.d.mts +270 -320
  91. package/dist/skills/index.d.mts.map +1 -1
  92. package/dist/skills/index.mjs +5 -5
  93. package/dist/{skills-CD3Orlex.mjs → skills-DX8D59UH.mjs} +187 -32
  94. package/dist/skills-DX8D59UH.mjs.map +1 -0
  95. package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
  96. package/dist/tools-DQ1mPUw5.mjs.map +1 -0
  97. package/dist/{types-CiTc7ez3.d.mts → types-D6FiR_oh.d.mts} +106 -12
  98. package/dist/types-D6FiR_oh.d.mts.map +1 -0
  99. package/dist/types-DQBe2lFo.d.mts +165 -0
  100. package/dist/types-DQBe2lFo.d.mts.map +1 -0
  101. package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
  102. package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
  103. package/dist/vector-B0panuy6.mjs +95 -0
  104. package/dist/vector-B0panuy6.mjs.map +1 -0
  105. package/docs/PROJECT-STATE.md +321 -0
  106. package/docs/adding-a-model-family.md +280 -0
  107. package/docs/ai-sdk.md +70 -61
  108. package/docs/architecture/overview.md +17 -7
  109. package/docs/browser.md +203 -8
  110. package/docs/embeddings.md +156 -0
  111. package/docs/gerbil-site-native-migration.md +217 -0
  112. package/docs/gpu-engine/architectures.md +398 -0
  113. package/docs/gpu-engine/ir.md +372 -0
  114. package/docs/gpu-engine/kernels.md +718 -0
  115. package/docs/gpu-engine/paper.html +1759 -0
  116. package/docs/gpu-engine/paper.md +2109 -0
  117. package/docs/gpu-engine/safetensors.md +312 -0
  118. package/docs/gpu-engine/tokenizer.md +302 -0
  119. package/docs/memory-rag.md +91 -0
  120. package/docs/metal-safari-intel.md +190 -0
  121. package/docs/mobile-failure-diagnosis.md +124 -0
  122. package/docs/mobile.md +99 -0
  123. package/docs/observability.md +230 -0
  124. package/docs/onnx-removal-plan.md +339 -0
  125. package/docs/research/autoresearch-portable.md +904 -0
  126. package/docs/research/dispatch-reduction-hivemind.md +84 -0
  127. package/docs/research/ios-safari-model-caching.md +117 -0
  128. package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
  129. package/docs/research/native-stt-model-selection.md +49 -0
  130. package/docs/research/native-tts-model-selection.md +90 -0
  131. package/docs/research/native-vs-chromium-decision.md +152 -0
  132. package/docs/research/nemotron-mamba2-inference.md +910 -0
  133. package/docs/research/qwen35-multimodal.md +293 -0
  134. package/docs/research/qwen36-gemma4-targets.md +337 -0
  135. package/docs/research/sota-embedding-models.md +179 -0
  136. package/docs/research/sota-mobile-models-2026.md +263 -0
  137. package/docs/research/sota-modality-models.md +202 -0
  138. package/docs/research/tps-baselines.md +71 -0
  139. package/docs/research/webgpu-m4-reference.md +104 -0
  140. package/docs/site-update-plan.md +155 -0
  141. package/docs/structured-output.md +123 -0
  142. package/docs/stt.md +63 -446
  143. package/docs/tts.md +77 -499
  144. package/docs/vision.md +100 -338
  145. package/package.json +22 -7
  146. package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
  147. package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
  148. package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
  149. package/dist/gerbil-CJ3ifloF.mjs +0 -4
  150. package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
  151. package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
  152. package/dist/gerbil-qOTe1nl2.d.mts +0 -431
  153. package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
  154. package/dist/kokoro-BNTb6egA.mjs +0 -20210
  155. package/dist/kokoro-BNTb6egA.mjs.map +0 -1
  156. package/dist/kokoro-DFRQ1OeM.js +0 -20212
  157. package/dist/kokoro-DFRQ1OeM.js.map +0 -1
  158. package/dist/mcp-BvbriaBy.mjs.map +0 -1
  159. package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
  160. package/dist/repl-DveXw36T.mjs +0 -9
  161. package/dist/skills-CD3Orlex.mjs.map +0 -1
  162. package/dist/stt-CpLYbGFd.mjs +0 -433
  163. package/dist/stt-CpLYbGFd.mjs.map +0 -1
  164. package/dist/stt-DRPLEEHB.mjs +0 -3
  165. package/dist/stt-Te8Qz-Ay.js +0 -433
  166. package/dist/stt-Te8Qz-Ay.js.map +0 -1
  167. package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
  168. package/dist/transformers.web-DokyH3rP.js +0 -3
  169. package/dist/transformers.web-M6mCnEYJ.js +0 -30382
  170. package/dist/transformers.web-M6mCnEYJ.js.map +0 -1
  171. package/dist/tts-C0xx3CtE.js +0 -724
  172. package/dist/tts-C0xx3CtE.js.map +0 -1
  173. package/dist/tts-DXgsKGCe.mjs +0 -3
  174. package/dist/tts-DeGANMNV.mjs +0 -730
  175. package/dist/tts-DeGANMNV.mjs.map +0 -1
  176. package/dist/types-CiTc7ez3.d.mts.map +0 -1
  177. package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
  178. package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
  179. package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0
@@ -0,0 +1,2022 @@
1
+ //#region src/gpu/ir.d.ts
2
+ /**
3
+ * Gerbil WebGPU IR — the contract every component builds on.
4
+ *
5
+ * Generated at runtime from HuggingFace config.json by architecture-specific
6
+ * graph generators. The executor, kernels, and model loader all speak this IR.
7
+ */
8
+ /** Every computation the engine can perform. */
9
+ type OpType = "Embedding" | "EmbeddingInt4" | "MatMul" | "MatMulBias" | "MatMulInt4" | "Add" | "Mul" | "RMSNorm" | "LayerNorm" | "RoPE" | "Attention" | "Softmax" | "SiLU" | "SwiGLU" | "GELU" | "Gather" | "Reshape" | "Transpose" | "Concat" | "MoERouter" | "ExpertMatMul" | "MambaSSM" | "CausalConv1d" | "CausalConv1dSiLU" | "CausalConv1dGated" | "SigmoidGate" | "ResidualRMSNorm" | "KVCacheAppend" | "ConvStateUpdate" | "SliceLastRow" | "MeanPool" | "Scale" | "Softcap" | "L2Norm" | "ApplyRotaryEmb" | "MRoPE" | "EmbedSplice" | "AddBias" | "GeluErf" | "SliceCols" | "MulCols" | "PoolMatMul" | "ClippedMatMul" | "Conv1dFull" | "ConvTranspose1d" | "Snake1d" | "FSQDequant" | "HalfSnake1d" | "ConvTranspose1dDepthwise" | "Conv2d" | "AvgPool2d" | "CrossAttention" | "Tanh" | "GroupNorm";
10
+ type DType = "f32" | "f16" | "i32" | "u32" | "i4";
11
+ type TensorStorage = "constant" | "activation" | "kv_cache" | "ssm_state";
12
+ interface TensorDesc {
13
+ /** Unique name within the graph (e.g. "layers.0.self_attn.q_proj.weight"). */
14
+ name: string;
15
+ /**
16
+ * Shape dimensions. Numbers are concrete; strings are symbolic
17
+ * ("T" for sequence length, "L_max" for max cache length).
18
+ */
19
+ shape: (number | string)[];
20
+ /** Element data type. */
21
+ dtype: DType;
22
+ /** Where this tensor lives. */
23
+ storage: TensorStorage;
24
+ /**
25
+ * Key in the safetensors file that maps to this tensor.
26
+ * Only set for storage === "constant".
27
+ */
28
+ safetensorsKey?: string;
29
+ /**
30
+ * Synthetic constant fill: when set (and the tensor has no weight data in the
31
+ * checkpoint), the loader materializes a constant tensor of this value at the
32
+ * declared shape instead of fetching it. Used for parameter-free norms such as
33
+ * Gemma 4's `v_norm` (RMSNormNoScale), which normalizes values with an implicit
34
+ * all-ones gain (fillValue = 1.0).
35
+ */
36
+ fillValue?: number;
37
+ }
38
+ interface OpNode {
39
+ /** Unique node ID (e.g. "layer0_norm1", "embed", "lm_head"). */
40
+ id: string;
41
+ /** Which operation to perform. */
42
+ opType: OpType;
43
+ /** Input tensor names (order matters — matches kernel binding order). */
44
+ inputs: string[];
45
+ /** Output tensor names. */
46
+ outputs: string[];
47
+ /** Op-specific parameters (hidden_size, eps, num_heads, group_size, etc.). */
48
+ attributes: Record<string, unknown>;
49
+ }
50
+ /** KV cache memory layout. LHSd = [layer, head, seq, head_dim]. */
51
+ type KVLayout = "LHSd";
52
+ interface ModelCapabilities {
53
+ text: true;
54
+ vision: boolean;
55
+ moe: boolean;
56
+ }
57
+ interface ModelArchConfig {
58
+ hidden_size: number;
59
+ num_layers: number;
60
+ num_heads: number;
61
+ num_kv_heads: number;
62
+ head_dim: number;
63
+ intermediate_size: number;
64
+ vocab_size: number;
65
+ context_length: number;
66
+ rms_norm_eps: number;
67
+ norm_type: "rmsnorm" | "layernorm";
68
+ rope_base: number;
69
+ rope_dim: number;
70
+ kv_layout: KVLayout;
71
+ is_moe: boolean;
72
+ num_experts?: number;
73
+ top_k_experts?: number;
74
+ has_vision_tower: boolean;
75
+ vision_architecture?: string;
76
+ vision_patch_size?: number;
77
+ vision_embed_dim?: number;
78
+ }
79
+ interface ModelGraph {
80
+ /** HF architecture string, e.g. "Qwen2ForCausalLM". */
81
+ architecture: string;
82
+ /** Resolved model config with all dimensions. */
83
+ config: ModelArchConfig;
84
+ /** What this model can do. */
85
+ capabilities: ModelCapabilities;
86
+ /** All tensors in the graph, keyed by name. */
87
+ tensors: Record<string, TensorDesc>;
88
+ /** All computation nodes. */
89
+ nodes: OpNode[];
90
+ /** Topologically-sorted node IDs — the order the executor runs them. */
91
+ executionOrder: string[];
92
+ /** Graph input tensor names (e.g. ["input_ids"]). */
93
+ inputs: string[];
94
+ /** Graph output tensor names (e.g. ["logits"]). */
95
+ outputs: string[];
96
+ }
97
+ /**
98
+ * Map a HuggingFace safetensors key to a canonical IR tensor name.
99
+ *
100
+ * Different model families use different prefixes:
101
+ * Qwen: "model.embed_tokens.weight"
102
+ * LLaMA: "model.embed_tokens.weight"
103
+ * Phi: "model.embed_tokens.weight" (but "model.layers.X.mlp.fc1.weight" etc.)
104
+ *
105
+ * This helper strips the common "model." prefix and handles known divergences.
106
+ * Architecture-specific overrides can extend the mapping.
107
+ */
108
+ type HFKeyMapper = (hfKey: string) => string | null;
109
+ //#endregion
110
+ //#region src/gpu/architectures/index.d.ts
111
+ /** Weight quantization mode for graph generation. */
112
+ type GraphDType = "f32" | "q4";
113
+ /** KV cache element type — "f16" halves memory traffic during attention. */
114
+ type KVDType = "f32" | "f16";
115
+ /**
116
+ * KV cache kernel strategy.
117
+ * - "f32": standard f32 buffers + f32 kernels
118
+ * - "native-f16": `enable f16` + `array<f16>` (Chrome/Dawn)
119
+ * - "packed-f16": `array<u32>` + pack2x16float/unpack2x16float (Safari-safe, no `enable f16`)
120
+ *
121
+ * Both f16 modes use the same buffer size (2 bytes/element). The difference is
122
+ * which WGSL kernel reads/writes the buffers.
123
+ */
124
+ type KvMode = "f32" | "native-f16" | "packed-f16";
125
+ //#endregion
126
+ //#region src/gpu/device.d.ts
127
+ /**
128
+ * WebGPU device abstraction layer.
129
+ *
130
+ * Wraps GPUDevice with helpers for buffer allocation, pipeline compilation,
131
+ * compute dispatch, and readback. All GPU interaction flows through here.
132
+ */
133
+ interface GPUContext {
134
+ /** The underlying WebGPU device. */
135
+ device: GPUDevice;
136
+ /** Device limits (max buffer size, workgroup size, etc.). */
137
+ limits: GPUSupportedLimits;
138
+ /** Whether f16 is supported as a shader type. */
139
+ hasF16: boolean;
140
+ /**
141
+ * Whether the WebGPU `subgroups` feature is available (Chrome 134+, Safari 26+).
142
+ * When true, kernels may use `subgroupAdd`/`subgroupBroadcast` etc. (requires
143
+ * `enable subgroups;` in the shader). Absence falls back to the portable
144
+ * shared-memory reductions — never assume this is present.
145
+ */
146
+ hasSubgroups: boolean;
147
+ /** True when the WebGPU implementation is WebKit's (Safari, all iOS/iPadOS browsers). */
148
+ isWebKitWebGPU: boolean;
149
+ /** Whether the `timestamp-query` feature is available (per-pass GPU timing). Used
150
+ * only by the env-gated decode profiler; never on the normal inference path. */
151
+ hasTimestamp: boolean;
152
+ /** Raw adapter info string for diagnostics. */
153
+ adapterDescription: string;
154
+ }
155
+ interface InitGPUOptions {
156
+ /** Called when the GPU device is lost (e.g. tab backgrounded on iOS). */
157
+ onDeviceLost?: (reason: string, message: string) => void;
158
+ }
159
+ /**
160
+ * Initialize WebGPU and request a device with the features we need.
161
+ *
162
+ * In Node.js, initializes Dawn's WebGPU polyfill if navigator.gpu is absent.
163
+ * In the browser, uses the native WebGPU API directly.
164
+ *
165
+ * Throws a clear error if WebGPU is unavailable.
166
+ */
167
+ declare function initGPU(options?: InitGPUOptions): Promise<GPUContext>;
168
+ interface GPUDiagnosticResult {
169
+ /** Whether the basic buffer upload → readback round-trip works. */
170
+ bufferIntegrity: boolean;
171
+ /** Whether a trivial compute shader executes correctly. */
172
+ computeWorks: boolean;
173
+ /** Whether shared memory + workgroupBarrier() works. */
174
+ sharedMemoryWorks: boolean;
175
+ /** Detailed messages for each test. */
176
+ details: string[];
177
+ }
178
+ //#endregion
179
+ //#region src/gpu/tokenizer.d.ts
180
+ /**
181
+ * Pure JavaScript BPE tokenizer.
182
+ *
183
+ * Reads HuggingFace tokenizer.json — no WASM, no external dependencies.
184
+ * Supports encoding, decoding, and chat template application.
185
+ */
186
+ interface TokenizerConfig {
187
+ bosToken: string | null;
188
+ eosToken: string | null;
189
+ bosTokenId: number | null;
190
+ eosTokenId: number | null;
191
+ chatTemplate: string | null;
192
+ addBosToken: boolean;
193
+ addEosToken: boolean;
194
+ }
195
+ interface ChatMessage {
196
+ role: "system" | "user" | "assistant";
197
+ content: string;
198
+ }
199
+ declare class Tokenizer {
200
+ private vocab;
201
+ private vocabReverse;
202
+ private merges;
203
+ private specialTokens;
204
+ private addedTokens;
205
+ private byteFallback;
206
+ /**
207
+ * SentencePiece mode (Gemma/Llama-style). When true, vocab uses U+2581 (▁) for
208
+ * spaces and raw UTF-8 tokens (NOT the GPT-2 byte-to-unicode "Ġ" mapping), and
209
+ * raw bytes fall back to <0xHH> tokens. When false, GPT-2 byte-level BPE.
210
+ */
211
+ private spmMode;
212
+ readonly config: TokenizerConfig;
213
+ readonly vocabSize: number;
214
+ private constructor();
215
+ /**
216
+ * Create a tokenizer from HuggingFace JSON files.
217
+ */
218
+ static fromJSON(tokenizerJSON: any, tokenizerConfigJSON?: any): Tokenizer;
219
+ /**
220
+ * Resolve a literal token string (e.g. "<|endoftext|>") to its vocab id,
221
+ * or null if it isn't in the vocabulary.
222
+ */
223
+ tokenToId(token: string): number | null;
224
+ /**
225
+ * Encode text into token IDs.
226
+ */
227
+ encode(text: string): number[];
228
+ /**
229
+ * Decode token IDs back to text.
230
+ */
231
+ decode(ids: number[], skipSpecialTokens?: boolean): string;
232
+ /**
233
+ * Apply chat template to messages.
234
+ *
235
+ * For now, implements the common ChatML format used by Qwen models:
236
+ * <|im_start|>system\n{content}<|im_end|>\n
237
+ * <|im_start|>user\n{content}<|im_end|>\n
238
+ * <|im_start|>assistant\n
239
+ *
240
+ * TODO: Parse Jinja2 templates from tokenizer_config.json for full generality.
241
+ */
242
+ /**
243
+ * Gemma 4 turn format: `<bos><|turn>user\n{content}<turn|>\n<|turn>model\n`.
244
+ * Gemma has no "system" role, so a system message is folded into the next user
245
+ * turn (matching the reference chat template).
246
+ */
247
+ private applyGemmaTurnTemplate;
248
+ applyChatTemplate(messages: ChatMessage[], options?: {
249
+ addGenerationPrompt?: boolean;
250
+ }): string;
251
+ /**
252
+ * Encode a chat conversation into token IDs.
253
+ */
254
+ encodeChat(messages: ChatMessage[], options?: {
255
+ addGenerationPrompt?: boolean;
256
+ }): number[];
257
+ private splitOnSpecialTokens;
258
+ private preTokenize;
259
+ private textToTokenRepr;
260
+ private bpeEncode;
261
+ private encodeByteFallback;
262
+ }
263
+ //#endregion
264
+ //#region src/gpu/weight-source.d.ts
265
+ /** A single tensor's data + shape (the unit the executor uploads to a GPU buffer). */
266
+ interface WeightEntry {
267
+ data: ArrayBufferView;
268
+ shape: number[];
269
+ }
270
+ /**
271
+ * Read-side view consumed by the executor's streaming `uploadWeights`. Async by
272
+ * design so a cache-backed store can fetch one tensor's bytes at a time.
273
+ */
274
+ interface WeightSource {
275
+ has(name: string): boolean;
276
+ keys(): string[];
277
+ readonly size: number;
278
+ /** Pull a single tensor (bytes materialized + dtype-converted on demand). */
279
+ get(name: string): Promise<WeightEntry | undefined>;
280
+ /**
281
+ * Release any transient backing storage (e.g. the browser transform-staging
282
+ * cache) once the consumer has finished uploading. No-op for the heap backend.
283
+ */
284
+ dispose?(): Promise<void>;
285
+ }
286
+ //#endregion
287
+ //#region src/gpu/model-loader.d.ts
288
+ interface LoadModelOptions {
289
+ /** HF repo ID (e.g. "Qwen/Qwen3.5-0.8B") or full URL. */
290
+ repo: string;
291
+ /** Progress callback: (loaded, total, message) */
292
+ onProgress?: (loaded: number, total: number, message: string) => void;
293
+ /** Custom HF key mapper (defaults to stripping "model." prefix). */
294
+ keyMapper?: HFKeyMapper;
295
+ /** HuggingFace API token for gated models. */
296
+ hfToken?: string;
297
+ /** Revision/branch (default: "main"). */
298
+ revision?: string;
299
+ /** Local cache directory for downloaded files (Node.js only). */
300
+ cacheDir?: string;
301
+ /**
302
+ * Weight dtype:
303
+ * - "f32" full precision (or the repo's native quantization, e.g. MLX/GPTQ q4)
304
+ * - "q4" on-the-fly INT4 quantization (~4× smaller)
305
+ * - "auto" (recommended) picks q4 on mobile (iOS/Android) to fit in device
306
+ * memory and f32/native on desktop. Already-quantized repos
307
+ * (MLX/GPTQ 4-bit) stay q4 regardless.
308
+ */
309
+ dtype?: GraphDType | "auto";
310
+ /** KV cache dtype: "f16" halves memory traffic during attention. Requires GPU f16 support. */
311
+ kvDtype?: KVDType;
312
+ /**
313
+ * Build an embedding graph (last-token pool + L2 norm) instead of an LM head.
314
+ * Only valid for Qwen2/Qwen3 CausalLM architectures (e.g. Qwen3-Embedding).
315
+ */
316
+ embedding?: boolean;
317
+ /**
318
+ * Build the multimodal LM graph variant (M-RoPE + image-embedding splice) so
319
+ * the text model can consume spliced image tokens. Only meaningful for
320
+ * Qwen3_5ForConditionalGeneration. Reserves `maxVisionTokens` rows for the
321
+ * vision-embedding buffer. Text-only generation through this graph is
322
+ * numerically identical to the non-multimodal graph (M-RoPE fed linear
323
+ * positions == standard 1D RoPE).
324
+ */
325
+ multimodal?: {
326
+ maxVisionTokens: number;
327
+ };
328
+ /**
329
+ * Force-download and key-map the vision tower even without the multimodal LM
330
+ * graph. `enableVision` (via `multimodal`) already implies this; this flag is
331
+ * for callers that load weights directly (e.g. the vision-encoder validation
332
+ * scripts) and build the vision graph/executor themselves. When neither this
333
+ * nor `multimodal` is set, the ~201MB ViT is excluded from the download.
334
+ */
335
+ loadVisionTower?: boolean;
336
+ }
337
+ /**
338
+ * Gemma 4 Per-Layer-Embeddings (PLE) source, kept CPU-resident.
339
+ *
340
+ * The PLE table (`embed_tokens_per_layer`, [vocab, num_layers*256]) is ~1.17GB
341
+ * at 4-bit. Uploading it to a GPU buffer would make the model non-mobile-viable
342
+ * and would hit the per-binding size cap. Instead the loader hands the quantized
343
+ * table to the executor in JS memory; the executor gathers + dequantizes only the
344
+ * rows for the current input tokens each forward step (a tiny [T, width] upload).
345
+ */
346
+ interface PleSource {
347
+ /**
348
+ * Flat row-major INT4 nibbles (Gerbil packing, 8 per u32). HEAP-RESIDENT path
349
+ * (Node/desktop). In the browser this is empty and `cache` is set instead so
350
+ * the ~1.17 GB table never sits in the JS heap during load.
351
+ */
352
+ packed: Uint32Array;
353
+ /** Per-group scales (Gerbil (nibble - zero) * scale convention). */
354
+ scales: Float32Array;
355
+ /** Per-group zero points. */
356
+ zeros: Float32Array;
357
+ /** Row width = num_layers * hidden_size_per_layer_input (E2B: 35*256 = 8960). */
358
+ width: number;
359
+ /** Dequant group size (MLX: 64). */
360
+ groupSize: number;
361
+ /** Activation tensor the per-step gathered rows are written into. */
362
+ targetTensor: string;
363
+ /**
364
+ * Browser only: when set, the quantized PLE table's bytes live in CacheStorage
365
+ * (not the heap). The executor reads the slice of nibbles/scales/zeros it needs
366
+ * for the current tokens on demand. Keeps peak load heap bounded.
367
+ */
368
+ cache?: {
369
+ cacheName: string;
370
+ packedKey: string;
371
+ scalesKey: string;
372
+ zerosKey: string;
373
+ /** packed.length (u32 count) — for bounds/Range math. */
374
+ packedLen: number;
375
+ };
376
+ }
377
+ interface LoadedModel {
378
+ /** The generated computation graph (IR). */
379
+ graph: ModelGraph;
380
+ /** The tokenizer. */
381
+ tokenizer: Tokenizer;
382
+ /**
383
+ * Weight tensors mapped to canonical names. A `WeightSource` so the executor
384
+ * can pull one tensor at a time (cache-backed in the browser, heap-backed on
385
+ * Node) instead of requiring the whole model to sit in heap at once. Use
386
+ * `get(name)` (async) to materialize a tensor's bytes on demand.
387
+ */
388
+ weights: WeightSource;
389
+ /** Raw config.json for reference. */
390
+ rawConfig: Record<string, unknown>;
391
+ /**
392
+ * CPU-resident Gemma 4 PLE table (set only for Gemma 4). Pass to
393
+ * `executor.setPleSource()` so the big table never becomes GPU-resident.
394
+ */
395
+ pleSource?: PleSource;
396
+ }
397
+ /**
398
+ * Load a model from HuggingFace Hub.
399
+ *
400
+ * 1. Fetch config.json -> determine architecture -> generate IR graph
401
+ * 2. Fetch tokenizer.json + tokenizer_config.json -> build tokenizer
402
+ * 3. Download safetensors -> parse headers -> extract weight data
403
+ * 4. Map HF tensor keys -> canonical names
404
+ */
405
+ declare function loadModel(options: LoadModelOptions): Promise<LoadedModel>;
406
+ interface LoadedMoonshine {
407
+ /** Canonical-named f32 weights (data + shape), shared by encoder + decoder graphs. */
408
+ weights: Map<string, {
409
+ data: ArrayBufferView;
410
+ shape: number[];
411
+ }>;
412
+ /** The (decode-capable) tokenizer. */
413
+ tokenizer: Tokenizer;
414
+ /** Raw config.json. */
415
+ rawConfig: Record<string, unknown>;
416
+ }
417
+ declare function loadMoonshine(options: {
418
+ repo: string;
419
+ revision?: string;
420
+ hfToken?: string;
421
+ cacheDir?: string;
422
+ onProgress?: (loaded: number, total: number, message: string) => void;
423
+ }): Promise<LoadedMoonshine>;
424
+ interface LoadedKaniTTS {
425
+ /** Canonical-named f32 weights for the codec-LM backbone (LFM2 keys). */
426
+ backboneWeights: Map<string, {
427
+ data: ArrayBufferView;
428
+ shape: number[];
429
+ }>;
430
+ /** Folded NanoCodec decoder weights under canonical `nanocodec.*` names. */
431
+ codecWeights: Map<string, {
432
+ data: ArrayBufferView;
433
+ shape: number[];
434
+ }>;
435
+ /** The text tokenizer. */
436
+ tokenizer: Tokenizer;
437
+ /** Raw backbone config.json (LFM2 dims + KaniTTS2 fields). */
438
+ rawConfig: Record<string, unknown>;
439
+ }
440
+ declare function loadKaniTTS(options: {
441
+ /** Backbone repo (default nineninesix/kani-tts-2-en). */
442
+ repo?: string;
443
+ /** NanoCodec repo (default KANI_NANOCODEC_REPO). */
444
+ codecRepo?: string;
445
+ revision?: string;
446
+ hfToken?: string;
447
+ cacheDir?: string;
448
+ onProgress?: (loaded: number, total: number, message: string) => void;
449
+ }): Promise<LoadedKaniTTS>;
450
+ //#endregion
451
+ //#region src/gpu/sampler.d.ts
452
+ /**
453
+ * CPU-side token sampling from logits.
454
+ *
455
+ * Applies temperature, top-k, and top-p (nucleus) filtering,
456
+ * then samples from the resulting probability distribution.
457
+ *
458
+ * Uses typed arrays and min-heap for zero-allocation top-K selection.
459
+ * For vocab_size ~152K with topK=50, this avoids creating 152K JS tuples.
460
+ */
461
+ interface SamplingParams {
462
+ temperature?: number;
463
+ topK?: number;
464
+ topP?: number;
465
+ repetitionPenalty?: number;
466
+ }
467
+ //#endregion
468
+ //#region src/gpu/vision-preprocess.d.ts
469
+ /**
470
+ * Host-side vision preprocessing for the Qwen3.5 ViT.
471
+ *
472
+ * The learned position embeddings (bilinear-interpolated over the patch grid)
473
+ * and the 2D rotary cos/sin tables are functions of the image grid (t, h, w)
474
+ * ONLY — not of the model weights or pixel values. They are cheap to compute on
475
+ * the CPU and fed to the GPU graph as input activations, keeping the graph to
476
+ * weight-dependent math while staying byte-identical to HF transformers.
477
+ *
478
+ * Ports (verified against transformers 5.12 vision_utils.py + modeling_qwen3_5):
479
+ * - get_vision_bilinear_indices_and_weights → buildPosEmbeds()
480
+ * - get_vision_position_ids → buildPositionIds()
481
+ * - Qwen3_5VisionRotaryEmbedding → buildRotaryCosSin()
482
+ *
483
+ * Patch ordering: both the pos-embed gather and the position ids reorder patches
484
+ * into spatial_merge_size×spatial_merge_size groups, matching the HF image
485
+ * processor's output ordering, so the merger's [N,h]→[N/u,h*u] reshape lines up.
486
+ */
487
+ interface VisionGridConfig {
488
+ hiddenSize: number;
489
+ numHeads: number;
490
+ numPositionEmbeddings: number;
491
+ spatialMergeSize: number;
492
+ ropeTheta?: number;
493
+ }
494
+ interface VisionPositionTensors {
495
+ /** [N, hidden_size] bilinear-interpolated learned position embeddings. */
496
+ posEmbeds: Float32Array;
497
+ /** [N, head_dim] rotary cosines. */
498
+ cos: Float32Array;
499
+ /** [N, head_dim] rotary sines. */
500
+ sin: Float32Array;
501
+ numPatches: number;
502
+ }
503
+ /**
504
+ * Build bilinear-interpolated learned position embeddings [N, hidden].
505
+ * posEmbedTable is the raw pos_embed.weight [num_position_embeddings, hidden].
506
+ */
507
+ declare function buildPosEmbeds(gridTHW: [number, number, number], posEmbedTable: Float32Array, cfg: VisionGridConfig): Float32Array;
508
+ /**
509
+ * Build the reordered (row, col) position ids [N, 2] for rotary, matching
510
+ * get_vision_position_ids.
511
+ */
512
+ declare function buildPositionIds(gridTHW: [number, number, number], merge: number): Int32Array;
513
+ /**
514
+ * Build rotary cos/sin tables [N, head_dim] from position ids, matching
515
+ * Qwen3_5VisionRotaryEmbedding + the cat((rotary, rotary)) in VisionModel.forward.
516
+ *
517
+ * rotary_pos_emb(position_ids) = (position_ids[..,None] * inv_freq).flatten(1)
518
+ * where inv_freq has length (head_dim/2)/2 = head_dim/4, computed over dim=head_dim/2.
519
+ * For each token the two position components (h, w) each produce head_dim/4 freqs,
520
+ * concatenated → head_dim/2, then duplicated → head_dim for cos/sin.
521
+ */
522
+ declare function buildRotaryCosSin(positionIds: Int32Array, headDim: number, theta?: number): {
523
+ cos: Float32Array;
524
+ sin: Float32Array;
525
+ numPatches: number;
526
+ };
527
+ /**
528
+ * Build all host position tensors for a single image grid in one call.
529
+ */
530
+ declare function buildVisionPositionTensors(gridTHW: [number, number, number], posEmbedTable: Float32Array, cfg: VisionGridConfig): VisionPositionTensors;
531
+ interface Gemma4VisionGridConfig {
532
+ hiddenSize: number;
533
+ numHeads: number;
534
+ headDim: number;
535
+ ropeTheta: number;
536
+ poolingKernelSize: number;
537
+ }
538
+ interface Gemma4VisionPositionTensors {
539
+ /** [N, hidden] axial position embeddings (table[0][x] + table[1][y]). */
540
+ posEmbeds: Float32Array;
541
+ /** [N, headDim] axial rotary cosines. */
542
+ cos: Float32Array;
543
+ /** [N, headDim] axial rotary sines. */
544
+ sin: Float32Array;
545
+ /** [Np, N] average-pooling matrix (1/k² in-cell, 0 elsewhere). */
546
+ poolMatrix: Float32Array;
547
+ /** number of patches N (= gridH*gridW). */
548
+ numPatches: number;
549
+ /** number of pooled (soft) tokens Np (= ceil(gridH/k)*ceil(gridW/k)). */
550
+ numPooled: number;
551
+ }
552
+ /**
553
+ * Build axial learned position embeddings [N, hidden] from the [2, posSize, hidden]
554
+ * table: pos[p] = table[0][x_p] + table[1][y_p]. Direct lookup, no interpolation
555
+ * (HF F.embedding on clamped positions).
556
+ */
557
+ declare function buildGemma4PosEmbeds(gridH: number, gridW: number, posEmbedTable: Float32Array,
558
+ // [2, posSize, hidden] flattened
559
+ hidden: number, posSize: number): Float32Array;
560
+ /**
561
+ * Build the 2D axial rotary cos/sin tables [N, headDim].
562
+ * spatial_dim = headDim / 2; inv_freq[j] = 1/theta^((2j)/spatial_dim), j in [0, spatial_dim/2)
563
+ * per spatial dim: f = pos * inv_freq (spatial_dim/2 values); emb = cat(f, f) (spatial_dim values)
564
+ * cos/sin = cat([emb_x, emb_y]) → headDim values, layout [fx,fx,fy,fy].
565
+ * Applied with the global-half rotate_half kernel (ApplyRotaryEmb), which computes
566
+ * out = x*cos + rotate_half(x)*sin element-wise — exact for this layout.
567
+ */
568
+ declare function buildGemma4RotaryCosSin(gridH: number, gridW: number, headDim: number, theta: number): {
569
+ cos: Float32Array;
570
+ sin: Float32Array;
571
+ };
572
+ /**
573
+ * Build the [Np, N] average-pooling matrix for k×k spatial pooling over the real
574
+ * (unpadded) grid, matching modeling_gemma4's kernel_idxs/one_hot pooling:
575
+ * cell(p) = floor(x_p/k) + ceil(gridW/k) * floor(y_p/k)
576
+ * poolMatrix[cell, p] = 1/k² (so pooled = poolMatrix @ hidden = mean over the k×k block)
577
+ * Np = ceil(gridH/k) * ceil(gridW/k). Each pooled cell sums exactly the patches
577
+ * that fall in it and scales the sum by 1/k² (edge cells with fewer than k² patches
578
+ * still divide by k², matching HF's fixed 1/k² normalization).
580
+ */
581
+ declare function buildGemma4PoolMatrix(gridH: number, gridW: number, k: number): {
582
+ poolMatrix: Float32Array;
583
+ numPooled: number;
584
+ };
585
+ /**
586
+ * Build all Gemma 4 vision host tensors for one image grid in one call.
587
+ * `posEmbedTable` is the raw [2, posSize, hidden] flattened table.
588
+ */
589
+ declare function buildGemma4VisionPositionTensors(gridH: number, gridW: number, posEmbedTable: Float32Array, posSize: number, cfg: Gemma4VisionGridConfig): Gemma4VisionPositionTensors;
590
+ /** Gemma 4 image processor config (from processor_config.json). */
591
+ declare const GEMMA4_IMAGE_PROCESSOR: ImageProcessorConfig;
592
+ interface Gemma4PreprocessedImage {
593
+ /** Flattened patches [N, 3·patch²], stored row-major, with patches ordered row-major over the patch grid. */
594
+ patches: Float32Array;
595
+ /** Patch grid (gridH, gridW). */
596
+ gridHW: [number, number];
597
+ }
598
+ /**
599
+ * Preprocess a decoded RGB image for the Gemma 4 ViT: aspect-preserving resize so
599
+ * the patch grid has ≤ max_soft_tokens·k² patches and H, W are divisible by k·patch,
600
+ * then rescale ×1/255 (no normalization) and patchify row-major into [N, 3·patch²] (3·16·16 for 16-pixel patches).
602
+ *
603
+ * @param pixels row-major HWC RGB (0..255), length width*height*3.
604
+ */
605
+ declare function preprocessImageGemma4(pixels: Float32Array | Uint8ClampedArray | Uint8Array, width: number, height: number, maxSoftTokens?: number, poolingKernelSize?: number, patchSize?: number): Gemma4PreprocessedImage;
606
+ interface ImageProcessorConfig {
607
+ patchSize: number;
608
+ temporalPatchSize: number;
609
+ mergeSize: number;
610
+ imageMean: [number, number, number];
611
+ imageStd: [number, number, number];
612
+ /** rescale factor applied to raw 0..255 pixels before normalization (1/255). */
613
+ rescaleFactor: number;
614
+ /** min total pixels after resize (shortest_edge). */
615
+ minPixels: number;
616
+ /** max total pixels after resize (longest_edge). */
617
+ maxPixels: number;
618
+ }
619
+ declare const QWEN3_5_IMAGE_PROCESSOR: ImageProcessorConfig;
620
+ interface PreprocessedImage {
621
+ /** Flattened patches [N, 1536] in the spatial-merge order encodeImage expects. */
622
+ patches: Float32Array;
623
+ /** (t, h, w) patch grid. t=1 for a single image, h/w in patch units. */
624
+ gridTHW: [number, number, number];
625
+ }
626
+ /**
627
+ * Qwen2-VL smart-resize: round H and W to multiples of factor=patch*merge,
628
+ * keeping aspect ratio and clamping the total pixel budget to [minPixels, maxPixels].
629
+ * Matches transformers.models.qwen2_vl.image_processing.smart_resize.
630
+ */
631
+ declare function smartResize(height: number, width: number, factor: number, minPixels: number, maxPixels: number): [number, number];
632
+ /**
633
+ * Preprocess a decoded RGB image into the [N, 1536] patch tensor + grid_thw that
634
+ * `encodeImage()` expects, matching the HF Qwen2-VL image processor:
635
+ * smart_resize → rescale (×1/255) → normalize → temporal-pair (×temporal_patch_size)
636
+ * → patchify into spatial_merge×spatial_merge blocks → flatten to [N, C·T·P·P].
637
+ *
638
+ * @param pixels row-major HWC RGB, length width*height*3. Values 0..255 (default)
639
+ * or already 0..1 if `rescaled` is true.
640
+ * @param width source pixel width
641
+ * @param height source pixel height
642
+ */
643
+ declare function preprocessImage(pixels: Float32Array | Uint8ClampedArray | Uint8Array, width: number, height: number, cfg?: ImageProcessorConfig, rescaled?: boolean): PreprocessedImage;
644
+ declare function buildMRoPEPositionIds(inputIds: Int32Array | Uint32Array | number[], imageGrids: Array<[number, number, number]>, imageTokenId: number, mergeSize: number): Int32Array;
645
+ /**
646
+ * Per-pair frequency→dimension assignment for interleaved M-RoPE, matching
647
+ * Qwen3_5TextRotaryEmbedding.apply_interleaved_mrope. For pair index i in
648
+ * [0, sum(section)) the position component is section-cyclic: T,H,W,T,H,W,...
649
+ * but with each component capped at its section count. Returns an array of length
650
+ * (rope_dim/2) with values 0=T, 1=H, 2=W.
651
+ */
652
+ declare function mropeFreqDims(mropeSection: [number, number, number]): Int32Array;
653
+ /**
654
+ * Build the interleaved-M-RoPE cos/sin tables [seq, rope_dim] from 3D position
655
+ * ids, matching Qwen3_5TextRotaryEmbedding.forward:
656
+ * freqs[d][i] = pos[d] * inv_freq[i], inv_freq[i] = 1/theta^(2i/rope_dim)
657
+ * freq[i] picks component mropeFreqDims[i]; emb = cat(freqs, freqs).
658
+ * cos/sin have length seq*rope_dim. For text-only (all 3 pos rows equal) this
659
+ * reduces exactly to standard 1D partial RoPE.
660
+ *
661
+ * @param positionIds3 [3, seq] as produced by buildMRoPEPositionIds.
662
+ * @param ropeDim number of rotated dims per head (head_dim * partial_factor).
663
+ */
664
+ declare function buildMRoPECosSin(positionIds3: Int32Array, seq: number, ropeDim: number, theta: number, mropeSection: [number, number, number]): {
665
+ cos: Float32Array;
666
+ sin: Float32Array;
667
+ };
668
+ //#endregion
669
+ //#region src/gpu/defaults.d.ts
670
+ /**
671
+ * Default model per capability. Kept in its own tiny module (no heavy imports)
672
+ * so the React hooks can resolve defaults without statically pulling in the GPU
673
+ * engine — they import the engine dynamically to stay light.
674
+ */
675
+ declare const DEFAULT_MODELS: {
676
+ /** Text generation (also the vision-capable checkpoint). */
677
+ readonly text: "mlx-community/Qwen3.5-0.8B-4bit";
678
+ /** Image understanding — same checkpoint, vision tower built on demand. */
679
+ readonly vision: "mlx-community/Qwen3.5-0.8B-4bit";
680
+ /** Text embeddings. */
681
+ readonly embedding: "mlx-community/embeddinggemma-300m-4bit";
682
+ /** Text-to-speech. */
683
+ readonly tts: "nineninesix/kani-tts-2-en";
684
+ /** Speech-to-text. */
685
+ readonly stt: "UsefulSensors/moonshine-base";
686
+ };
687
+ /** Resolve the model repo for a set of options, falling back to the defaults. */
688
+ declare function resolveDefaultRepo(opts: {
689
+ repo?: string;
690
+ embedding?: boolean;
691
+ enableVision?: boolean;
692
+ }): string;
693
+ //#endregion
694
+ //#region src/gpu/architectures/gemma4_vision.d.ts
695
+ interface Gemma4VisionGraphInfo {
696
+ hiddenSize: number;
697
+ numHeads: number;
698
+ headDim: number;
699
+ depth: number;
700
+ intermediateSize: number;
701
+ textHidden: number;
702
+ patchSize: number;
703
+ patchDim: number;
704
+ poolingKernelSize: number;
705
+ ropeTheta: number;
706
+ rmsNormEps: number;
707
+ }
708
+ /**
709
+ * Resolve the Gemma 4 vision dims from a raw HF config. Accepts either the
710
+ * top-level config (reads `.vision_config` + `.text_config.hidden_size`) or a
711
+ * bare vision_config (then `textHidden` falls back to the projector row count if
712
+ * present, else hidden). Family-general — no E2B constants.
713
+ */
714
+ declare function resolveGemma4VisionInfo(rawConfig: Record<string, unknown>): Gemma4VisionGraphInfo;
715
+ /**
716
+ * Dequantize an MLX affine-int4 weight to a plain f32 [rows, cols] matrix.
717
+ * MLX packs 8 int4 values per u32 (low-nibble first); each group of `groupSize`
718
+ * columns shares one scale + bias: w[r,c] = scale[r, c/gs] * q + bias[r, c/gs].
719
+ * Used for the Gemma 4 multimodal projector (`embed_vision.embedding_projection`)
720
+ * in MLX-4bit checkpoints, where (unlike the BF16 ViT body) the projector is int4.
721
+ */
722
+ declare function dequantizeMLXProjection(packed: Uint32Array, scales: Float32Array, biases: Float32Array, rows: number, cols: number, groupSize: number): Float32Array;
723
+ /**
724
+ * If the Gemma 4 multimodal projector arrived as an MLX affine-int4 triplet
725
+ * (`embed_vision.embedding_projection.{weight(U32), scales, biases}`), dequantize
726
+ * it in-place to a plain f32 `embed_vision.embedding_projection.weight` and drop
727
+ * the scales/biases, so the vision graph's plain MatMul on the projector works for
728
+ * MLX-4bit checkpoints too. No-op for BF16 (HF) checkpoints (weight already f32).
729
+ */
730
+ declare function dequantizeGemma4VisionProjection(weights: Map<string, {
731
+ data: ArrayBufferView;
732
+ shape: number[];
733
+ }>, groupSize: number, rows: number, cols: number): void;
734
+ /**
735
+ * Patch the ClippedMatMul nodes of a Gemma 4 vision graph with the calibrated clip
736
+ * scalars from the checkpoint (Gemma4ClippableLinear's per-tensor input/output
737
+ * min/max buffers), then drop those scalar tensors from the weights map so the
738
+ * vision executor doesn't try to upload them as GPU buffers. Call BEFORE
739
+ * VisionExecutor.uploadWeights(). Missing scalars default to ±inf (clip = identity),
740
+ * so a checkpoint without calibration still loads.
741
+ */
742
+ declare function patchGemma4VisionClips(graph: ModelGraph, weights: Map<string, {
743
+ data: ArrayBufferView;
744
+ shape: number[];
745
+ }>): void;
746
+ /**
747
+ * Build the Gemma 4 ViT graph. Shaped by symbolic "N" (number of patches, runtime)
748
+ * and "Np" (number of pooled tokens = ceil(grid_h/k)·ceil(grid_w/k), runtime),
749
+ * resolved from input tensor dims — like the Qwen ViT's "N"/"Nm".
750
+ */
751
+ declare function generateGemma4VisionGraph(rawConfig: Record<string, unknown>): ModelGraph;
752
+ //#endregion
753
+ //#region src/gpu/architectures/kani_tts.d.ts
754
+ /** Parsed KaniTTS2 backbone config (the LFM2 dims + the TTS-specific fields). */
755
+ interface KaniConfig {
756
+ textVocabSize: number;
757
+ vocabSize: number;
758
+ tokensPerFrame: number;
759
+ audioStep: number;
760
+ useLearnableRope: boolean;
761
+ alphaMin: number;
762
+ alphaMax: number;
763
+ speakerEmbDim: number;
764
+ audioTokensStart: number;
765
+ startOfSpeech: number;
766
+ endOfSpeech: number;
767
+ codebookSize: number;
768
+ }
769
+ declare function parseKaniConfig(rawConfig: Record<string, unknown>): KaniConfig;
770
+ /**
771
+ * Convert the model's flat audio-token stream (the IDs between start/end-of-speech)
772
+ * into NanoCodec codes [NUM_GROUPS, T]. Mirrors NemoAudioPlayer.get_nano_codes:
773
+ * reshape [-1, 4]; codes[:,c] -= codebook_size*c; codes -= audio_tokens_start;
774
+ * transpose → [4, T]. Returns a Uint32Array laid out group-major ([g*T + t]).
775
+ */
776
+ declare function audioTokensToCodes(audioTokenIds: number[], cfg?: KaniConfig): {
777
+ codes: Uint32Array;
778
+ numFrames: number;
779
+ };
780
+ interface NanoCodecGraphOptions {
781
+ /** Number of audio frames T (the code grid width). PCM length = T * 1764. */
782
+ numFrames: number;
783
+ }
784
+ declare function generateNanoCodecDecoderGraph(opts: NanoCodecGraphOptions): ModelGraph;
785
+ /**
786
+ * Generate the KaniTTS2 codec-LM backbone graph (LFM2-350M body, full-vocab logits,
787
+ * per-layer learnable MRoPE). Mirrors generateLfm2Graph block-for-block; the only
788
+ * difference is that each attention layer rotates Q/K with the MRoPE op fed a
789
+ * per-layer host cos/sin table instead of the position-counter RoPE op.
790
+ *
791
+ * f32 only (the checkpoint is bf16→f32; q4 would need the codec-LM head re-validated).
792
+ */
793
+ declare function generateKaniTtsGraph(rawConfig: Record<string, unknown>, dtype?: GraphDType, _groupSize?: number, kvDtype?: KVDType): ModelGraph;
794
+ //#endregion
795
+ //#region src/gpu/architectures/moonshine.d.ts
796
+ /**
797
+ * Historical record of the executor-side work the keystone left open; all items
798
+ * are now implemented (see the STATUS note above). Retained as exported
799
+ * documentation of the dependency surface.
800
+ */
801
+ declare const MOONSHINE_REMAINING_WORK: readonly ["DONE: Transpose op kernel (conv output [C,L] → encoder input [L,C]).", "DONE: GroupNorm(num_groups=1) over conv channels (weight+bias).", "DONE: Tanh elementwise op (conv1 activation).", "DONE: Interleaved-RoPE variant (Moonshine rotates adjacent dim pairs 2p/2p+1).", "DONE: No-bias LayerNorm variant (Moonshine norms are weight-only).", "DONE: Per-utterance conv graph regeneration from the concrete sample count.", "DONE: Dual-graph executor — encoder once → frozen cross-attn K/V → AR decode.", "DONE: engine.transcribe(pcm) host path (MoonshineSTT): conv → encode → AR decode → detokenize."];
802
+ interface MoonshineDims {
803
+ hidden_size: number;
804
+ enc_layers: number;
805
+ dec_layers: number;
806
+ num_heads: number;
807
+ num_kv_heads: number;
808
+ head_dim: number;
809
+ rotary_dim: number;
810
+ intermediate_size: number;
811
+ vocab_size: number;
812
+ rope_base: number;
813
+ context_length: number;
814
+ ln_eps: number;
815
+ }
816
+ /** Pull and derive Moonshine dimensions from the raw HF config. */
817
+ declare function parseMoonshineConfig(raw: Record<string, unknown>): MoonshineDims;
818
+ /** Number of encoder frames produced by the conv frontend for n_samples PCM. */
819
+ declare function moonshineEncoderFrames(nSamples: number): number;
820
+ /**
821
+ * Encoder graph: raw-waveform conv frontend + bidirectional transformer.
822
+ * @param nSamples concrete PCM sample count (the conv frontend is length-static).
823
+ * The output tensor "encoder_out" is [T_frames, hidden] and is consumed (after
824
+ * per-layer K/V projection) as the frozen K/V for the decoder's cross-attention.
825
+ */
826
+ declare function generateMoonshineEncoderGraph(raw: Record<string, unknown>, nSamples: number): ModelGraph;
827
+ /**
828
+ * Decoder graph: AR transformer with causal self-attn (KV-cache) + cross-attn to
829
+ * the frozen encoder output. Built for a single decode step (T=1). The encoder K/V
830
+ * are supplied as graph inputs "enc_k_layer{i}" / "enc_v_layer{i}" — the host
831
+ * pre-projects the encoder output through each layer's encoder_attn.k_proj/v_proj
832
+ * ONCE and binds them frozen for the whole decode (the CrossAttention contract).
833
+ */
834
+ declare function generateMoonshineDecoderGraph(raw: Record<string, unknown>, sEnc: number): ModelGraph;
835
+ //#endregion
836
+ //#region src/gpu/architectures/qwen3_5_vision.d.ts
837
+ /**
838
+ * Build the ViT graph. The graph is shaped by symbolic "N" (number of patches),
839
+ * resolved at run time from the input tensor's first dim — exactly like the LM's
840
+ * symbolic "T".
841
+ */
842
+ declare function generateQwen3_5VisionGraph(rawConfig: Record<string, unknown>): ModelGraph;
843
+ //#endregion
844
+ //#region src/gpu/executor.d.ts
845
+ /**
846
+ * Safari/Metal workaround: shader variant alternation.
847
+ * Metal caches argument buffers per compiled function. When consecutive dispatches
848
+ * use the same WGSL code (same Metal function), Metal reuses the previous dispatch's
849
+ * argument buffer, ignoring setBindGroup(). We alternate between variant 0/1 of each
850
+ * shader (prepending `const _MV: u32 = Xu;`) to force different Metal function
851
+ * specializations, preventing argument buffer reuse.
852
+ */
853
+ interface ExecutorOptions {
854
+ maxSeqLen: number;
855
+ /** KV cache kernel strategy. Defaults to "native-f16" when not specified. */
856
+ kvMode?: KvMode;
857
+ /**
858
+ * WebKit only: dispatches per command buffer, with at most one command
859
+ * buffer in flight (awaited). 1 (default) is the proven-correct floor on
860
+ * iPad; larger values are faster if this WebKit version keeps storage
861
+ * writes visible across dispatches within one submission. Sweepable via
862
+ * the ?group=N URL param.
863
+ */
864
+ webkitGroupSize?: number;
865
+ }
866
+ interface ForwardResult {
867
+ logits: Float32Array;
868
+ }
869
+ declare class Executor {
870
+ private ctx;
871
+ private graph;
872
+ private weightBuffers;
873
+ private activationBuffers;
874
+ private ssmStateBuffers;
875
+ private kvCacheBuffers;
876
+ /** Pre-allocated input_ids buffer (maxSeqLen * 4 bytes). */
877
+ private inputIdsBuffer;
878
+ /**
879
+ * CPU-resident Per-Layer-Embeddings (PLE) source for Gemma 4. The PLE table
880
+ * (`embed_tokens_per_layer`, [vocab, L*256]) is ~1.17GB at 4-bit and is kept
881
+ * OFF the GPU. Each forward step we gather + dequantize only the rows for the
882
+ * actual input tokens and upload a tiny [T, L*256] f32 buffer. See setPleSource.
883
+ */
884
+ private pleSource;
885
+ /** Reusable scratch for the dequantized PLE rows (resized on demand). */
886
+ private pleScratch;
887
+ /**
888
+ * One-time promise that materializes a cache-backed PLE table into heap. The
889
+ * table is read from CacheStorage on the FIRST forward — i.e. AFTER the GPU
890
+ * weight upload has completed, so the ~1.17 GB does not stack on top of the
891
+ * upload's transient allocations at the load-time memory high-water mark.
892
+ */
893
+ private pleReady;
894
+ /**
895
+ * Dummy GPU buffer bound to an otherwise-aliasing storage-read-write slot.
896
+ * Used by RoPE-Q-only nodes (Gemma 4 KV-shared layers) whose node lists the
897
+ * same tensor as input and output: the RoPE kernel always declares two
898
+ * read_write bindings (q, k), but with num_kv_heads=0 the k slot is never
899
+ * touched. WebGPU still rejects two read_write bindings aliasing one buffer,
900
+ * so we bind this throwaway buffer to the unused k slot. Allocated lazily.
901
+ */
902
+ private bindingScratchBuffer;
903
+ /** Readback buffer for logits. */
904
+ private logitsReadback;
905
+ /** GPU buffer for argmax result (1 u32). */
906
+ private argmaxResultBuffer;
907
+ /** Readback buffer for argmax result (1 u32). */
908
+ private argmaxReadback;
909
+ /** Readback ring for pipelined greedy decode (created lazily). */
910
+ private decodeReadbacks;
911
+ /**
912
+ * Staging buffer for uniform param updates.
913
+ * Safari/Metal has weaker visibility guarantees for queue.writeBuffer() to
914
+ * UNIFORM buffers — early writes get dropped when hundreds are queued.
915
+ * Instead, we pack all params into this STORAGE staging buffer (1 writeBuffer),
916
+ * then use encoder.copyBufferToBuffer to distribute to each uniform buffer.
917
+ * Copies are GPU-sequenced and guaranteed to complete before compute passes.
918
+ */
919
+ private uniformStagingBuffer;
920
+ private uniformStagingCapacity;
921
+ /** Dispatch entries for prefill (M>1): uses tiled matmul kernels. */
922
+ private dispatchEntries;
923
+ /** Dispatch entries for decode (M=1): uses K-parallel matvec kernels. */
924
+ private decodeEntries;
925
+ /** Argmax dispatch entry (created in initBindGroups). */
926
+ private argmaxEntry;
927
+ /** True when running on Safari/WebKit (needs multi-encoder submit). */
928
+ readonly needsMultiEncoder: boolean;
929
+ private maxSeqLen;
930
+ private kvMode;
931
+ private seqPos;
932
+ private webkitGroupSize;
933
+ private profileEnabled;
934
+ private readonly profileData;
935
+ private querySet;
936
+ private queryResolveBuf;
937
+ private queryReadbackBuf;
938
+ constructor(ctx: GPUContext, graph: ModelGraph, options: ExecutorOptions);
939
+ /**
940
+ * Register the CPU-resident Gemma 4 PLE table. The quantized table is kept in
941
+ * JS memory (NOT uploaded to a GPU buffer), and {@link forward} gathers +
942
+ * dequantizes only the rows for the current input tokens each step, uploading a
943
+ * small [T, width] f32 buffer into `targetTensor`. This is what keeps Gemma 4
944
+ * mobile-viable: resident GPU memory is just the active transformer weights.
945
+ */
946
+ setPleSource(src: {
947
+ packed: Uint32Array;
948
+ scales: Float32Array;
949
+ zeros: Float32Array;
950
+ width: number;
951
+ groupSize: number;
952
+ targetTensor: string;
953
+ cache?: {
954
+ cacheName: string;
955
+ packedKey: string;
956
+ scalesKey: string;
957
+ zerosKey: string;
958
+ packedLen: number;
959
+ };
960
+ }): void;
961
+ /**
962
+ * Materialize a cache-backed PLE table into heap exactly once. Deferred to the
963
+ * first forward (after GPU upload) so the ~1.17 GB table is not co-resident
964
+ * with the upload's transient allocations during the load-time memory peak.
965
+ */
966
+ private ensurePleLoaded;
967
+ /**
968
+ * Gather + dequantize the PLE rows for `inputIds` and upload them into the
969
+ * target activation buffer. Touches only T rows (T*width floats) — the full
970
+ * [vocab, width] quantized table never goes to the GPU.
971
+ */
972
+ private streamPleRows;
973
+ /**
974
+ * Stream weights to the GPU one tensor at a time, pulling each from the
975
+ * `WeightSource` only when it is about to be uploaded and dropping the
976
+ * reference immediately afterward.
977
+ *
978
+ * This is the property that bounds peak JS heap: with a cache-backed source
979
+ * (browser/mobile), only ONE tensor's bytes are materialized in heap at a time
980
+ * (read from CacheStorage in `source.get()`), uploaded to its GPU buffer, then
981
+ * released before the next tensor is fetched. The whole model is never co-
982
+ * resident in heap. With a heap-backed source (Node/desktop), behavior matches
983
+ * the old Map path (the source deletes each entry as it is consumed).
984
+ *
985
+ * Accepts either a `WeightSource` (new, streamed/async) or a plain Map
986
+ * (back-compat for callers that build a Map directly, e.g. Kani/Moonshine).
987
+ */
988
+ uploadWeights(source: WeightSource | Map<string, {
989
+ data: ArrayBufferView;
990
+ shape: number[];
991
+ }>): Promise<void>;
992
+ /**
993
+ * Synchronous Map upload (Node/desktop and the Kani/Moonshine sub-executors,
994
+ * which build small heap Maps). Deletes each entry as it is consumed to free
995
+ * the JS-side bytes once they are GPU-resident. Safe to call from a constructor.
996
+ */
997
+ uploadWeightsMap(weights: Map<string, {
998
+ data: ArrayBufferView;
999
+ shape: number[];
1000
+ }>): void;
1001
+ /**
1002
+ * Build all pipelines and bind groups. Call ONCE after uploadWeights().
1003
+ *
1004
+ * Creates two dispatch entry arrays:
1005
+ * - dispatchEntries: tiled matmul for prefill (any M)
1006
+ * - decodeEntries: K-parallel matvec for decode (M=1)
1007
+ */
1008
+ initBindGroups(): void;
1009
+ /**
1010
+ * Run a forward pass. Uses matvec kernels for M=1 (decode), tiled for M>1 (prefill).
1011
+ */
1012
+ forward(inputIds: Uint32Array): Promise<ForwardResult>;
1013
+ /**
1014
+ * Profiling variant of the desktop dispatch path: one compute pass per dispatch,
1015
+ * each bracketed by timestamp queries, so we get per-op GPU time. Accumulates
1016
+ * into profileData by opType. Only runs under GERBIL_PROFILE with timestamp-query
1017
+ * support — slower than the batched path (it measures relative cost, not tok/s).
1018
+ */
1019
+ private runProfiledDispatches;
1020
+ /** Per-opType GPU time (ns) + dispatch count accumulated by GERBIL_PROFILE, hottest first. */
1021
+ getProfile(): Array<{
1022
+ opType: string;
1023
+ ns: number;
1024
+ count: number;
1025
+ }>;
1026
+ /** Clear accumulated profiler data (e.g. to drop warm-up tokens). */
1027
+ resetProfile(): void;
1028
+ /** GPU dispatches per decode token (post-fusion). On mobile this drives the
1029
+ * submit-group count = ceil(dispatchCount / webkitGroupSize). */
1030
+ get decodeDispatchCount(): number;
1031
+ /** Device limit that gates the INT4 projection fusions (they need ≥9). If a
1032
+ * device caps at 8 the dual/gated/swiglu-gated INT4 fusions silently fall back,
1033
+ * inflating the decode dispatch count. */
1034
+ get maxStorageBuffers(): number;
1035
+ /**
1036
+ * Profile ONE real decode step: times the actual `decodeEntries` (the kernels
1037
+ * the pipelined greedy benchmark runs) with per-dispatch timestamps. Timing is
1038
+ * token-independent, so pass any valid id; runs un-pipelined with a synchronous
1039
+ * timestamp readback (measurement only, not for production decode). Argmax (one
1040
+ * tiny dispatch) is intentionally excluded — it is not a hotspot target.
1041
+ */
1042
+ profileDecodeStep(tokenId: number): Promise<void>;
1043
+ /**
1044
+ * Run a single forward pass over `inputIds` and read back the L2-normalized
1045
+ * embedding vector. Requires an embedding graph (one whose output tensor is
1046
+ * "embedding", produced by the last-token-pool + L2-norm tail).
1047
+ *
1048
+ * Always runs in a fresh-state single pass (caller should reset() first):
1049
+ * embeddings are non-autoregressive, so the whole sequence is one prefill.
1050
+ */
1051
+ embed(inputIds: Uint32Array): Promise<Float32Array>;
1052
+ /**
1053
+ * Generic one-shot dispatch for a non-LM graph: run every entry once over a
1054
+ * single forward and read back `elemCount` f32 elements of the named output
1055
+ * tensor. Used to execute the NanoCodec decoder graph (codes→PCM) — its ops use
1056
+ * concrete lengths and it has no "logits" output, so the normal forward()
1057
+ * logits-readback path does not apply. Caller writes any inputs (e.g. audio_codes)
1058
+ * via writeInput() and reset()s first.
1059
+ */
1060
+ runGraphOutput(outputName: string, elemCount: number): Promise<Float32Array>;
1061
+ /**
1062
+ * Greedy decode step: forward + GPU argmax. Returns token ID directly.
1063
+ * Always uses matvec kernels (M=1). Reads back 4 bytes instead of vocab_size*4.
1064
+ */
1065
+ forwardArgmax(inputIds: Uint32Array): Promise<number>;
1066
+ /** KV-cache positions still available for decode steps. */
1067
+ decodeCapacityRemaining(): number;
1068
+ /** Number of decode steps that may be in flight in the pipelined path. */
1069
+ static readonly PIPELINE_DEPTH = 2;
1070
+ /**
1071
+ * Pipelined greedy decode step (Dawn only — WebKit uses forwardArgmax).
1072
+ *
1073
+ * Encodes one full decode forward + argmax and submits WITHOUT awaiting
1074
+ * completion. The input token is taken from `tokenId` for the first step
1075
+ * after prefill; for subsequent steps (tokenId === null) the previous step's
1076
+ * argmax result is copied into input_ids ON THE GPU, so the decode loop
1077
+ * never blocks on a readback before submitting the next step. The argmax
1078
+ * result is copied to a per-slot readback buffer, read later (one step
1079
+ * behind) via readDecodeToken(slot).
1080
+ *
1081
+ * queue.writeBuffer is queue-ordered: uniform updates land after the
1082
+ * previously submitted step's command buffer and before this one's, so
1083
+ * shared uniform buffers are safe with multiple steps in flight.
1084
+ */
1085
+ submitGreedyDecodeStep(tokenId: number | null, slot: number): void;
1086
+ /** Read back the token produced by the pipelined step that used `slot`. */
1087
+ readDecodeToken(slot: number): Promise<number>;
1088
+ reset(): void;
1089
+ /**
1090
+ * Diagnostic: dispatch ONLY the first kernel (EmbeddingInt4) using the
1091
+ * production bind group, pipeline, and buffers — but in isolation (1 dispatch,
1092
+ * no staging, fresh encoder). Compares against full forward pass to isolate
1093
+ * whether the issue is the bind group/pipeline or the multi-dispatch context.
1094
+ */
1095
+ debugFirstDispatch(inputIds: Uint32Array): Promise<{
1096
+ nodeId: string;
1097
+ opType: string;
1098
+ dispatchSize: [number, number, number];
1099
+ output: Float32Array;
1100
+ }>;
1101
+ /**
1102
+ * Diagnostic: run a single dispatch entry by index, in isolation.
1103
+ * Call after debugFirstDispatch() to test whether entry[1] (RMSNorm)
1104
+ * can read embed_out written by entry[0] (EmbeddingInt4).
1105
+ */
1106
+ debugDispatchEntry(entryIndex: number, T: number): Promise<{
1107
+ nodeId: string;
1108
+ opType: string;
1109
+ output: Float32Array;
1110
+ }>;
1111
+ /**
1112
+ * Diagnostic: compute the JS-side params for the first N decode entries
1113
+ * WITHOUT dispatching. Shows what buildParams produces.
1114
+ */
1115
+ debugComputeParams(T: number, count?: number): Array<{
1116
+ idx: number;
1117
+ nodeId: string;
1118
+ opType: string;
1119
+ paramsU32: number[];
1120
+ dispatchSize: [number, number, number];
1121
+ }>;
1122
+ /**
1123
+ * Diagnostic: after a forward pass, read back output tensors at several points
1124
+ * in the pipeline to find where data drops to zero.
1125
+ */
1126
+ debugPipelineProbe(T: number): Promise<Array<{
1127
+ idx: number;
1128
+ nodeId: string;
1129
+ opType: string;
1130
+ tensor: string;
1131
+ sum: number;
1132
+ first4: number[];
1133
+ uniformParams?: number[];
1134
+ }>>;
1135
+ debugWriteBuffer(tensorName: string, data: ArrayBufferView): void;
1136
+ /**
1137
+ * Write a host-supplied activation input buffer (e.g. multimodal M-RoPE
1138
+ * cos/sin, spliced vision embeddings, image row-map). The buffer must be a
1139
+ * persistent activation tensor in the graph. Call before forward().
1140
+ */
1141
+ writeInput(tensorName: string, data: ArrayBufferView): void;
1142
+ /** Write a host activation buffer at a byte offset (for per-row decode updates). */
1143
+ writeInputAt(tensorName: string, data: ArrayBufferView, byteOffset: number): void;
1144
+ /** True if the graph has a buffer with this name (multimodal-capability probe). */
1145
+ hasBuffer(tensorName: string): boolean;
1146
+ /** Current sequence position (number of tokens processed since reset). */
1147
+ get currentSeqPos(): number;
1148
+ debugReadBuffer(tensorName: string, maxElements?: number, byteOffset?: number): Promise<Float32Array>;
1149
+ destroy(): void;
1150
+ /**
1151
+ * Allocate activation buffers with liveness-based reuse.
1152
+ *
1153
+ * One dedicated buffer per activation tensor at full maxSeqLen is ~2.3GB for
1154
+ * Qwen3.5-0.8B at T=512 — over the iOS jetsam budget on its own. Instead,
1155
+ * a buffer returns to a size-keyed pool once its tensor's last reader has
1156
+ * executed, so concurrently-live tensors share a small working set.
1157
+ *
1158
+ * Graph outputs and tensors read before they are written (cross-forward
1159
+ * state) keep dedicated buffers. Within a forward, dispatches execute in
1160
+ * executionOrder on every path (single-pass Dawn, per-dispatch WebKit), and
1161
+ * WebGPU synchronizes hazards between dispatches, so reuse is safe.
1162
+ *
1163
+ * Caveat: debugReadBuffer() on an intermediate tensor is only meaningful
1164
+ * before a later op reuses its buffer (probes that stop mid-graph are fine).
1165
+ */
1166
+ private allocateActivationBuffers;
1167
+ private allocateSSMStateBuffers;
1168
+ private allocateKVCacheBuffers;
1169
+ private resolveShapes;
1170
+ private getBuffer;
1171
+ /**
1172
+ * Detect gate_proj + up_proj + SwiGLU patterns in decode entries and replace
1173
+ * with a single fused SwiGLUMatVec dispatch. Saves 2 dispatches per MLP block.
1174
+ */
1175
+ private fuseSwiGLUDecodeEntries;
1176
+ /**
1177
+ * Fuse two adjacent INT4 projections that share the same input activation and
1178
+ * the same K/N (e.g. q_proj+gate_proj and k_proj+v_proj in full-attention
1179
+ * decode) into a single DualMatVecInt4 dispatch. Reads the shared input vector
1180
+ * once and writes both projection outputs — removing one GPU round-trip (one
1181
+ * submit+drain on Safari/iOS) per fused pair.
1182
+ *
1183
+ * Numerics are identical to running the two MatVecInt4 kernels separately
1184
+ * (same dequant, same K-parallel reduction), so this is WebKit-safe: it reuses
1185
+ * the proven INT4 matvec math and only merges two writes into one dispatch.
1186
+ *
1187
+ * Must run AFTER fuseSwiGLUDecodeEntries so the MLP gate+up pair (which is
1188
+ * consumed by a SwiGLU node) is already collapsed and won't be matched here.
1189
+ */
1190
+ private fuseDualMatVecDecodeEntries;
1191
+ /**
1192
+ * Fuse the adjacent K-cache and V-cache appends in each full-attention layer
1193
+ * into a single DualKVCacheAppend dispatch. Both are pure memcpys into KV
1194
+ * caches sharing the same width and dst_offset, so one dispatch with two
1195
+ * src/dst buffers writes both — removing one GPU round-trip per layer.
1196
+ *
1197
+ * Supports f32, native-f16, and packed-f16 caches (the dual kernel mirrors the
1198
+ * single-append kernel selected for the active kvMode). Numerically identical
1199
+ * to the separate appends — WebKit-safe (the packed-f16 variant is the Safari
1200
+ * path and uses pack2x16float, no `enable f16`).
1201
+ */
1202
+ private fuseDualKVCacheAppendEntries;
1203
+ /**
1204
+ * Fuse the attention SigmoidGate (attn_out * sigmoid(gate)) into the INT4
1205
+ * o_proj that consumes it: a GatedMatVecInt4 reads attn_out and gate directly,
1206
+ * applies the sigmoid gate to its input vector, and runs the projection in ONE
1207
+ * dispatch — removing the standalone SigmoidGate (one round-trip per
1208
+ * full-attention layer).
1209
+ *
1210
+ * Numerically identical to SigmoidGate→MatVecInt4 (same gate formula, same INT4
1211
+ * dequant + reduction). Slight extra ALU: the gated input is recomputed per
1212
+ * output column, but A reads hit L1 and the saved submit+drain dominates on
1213
+ * mobile. WebKit risk: low — reuses the proven INT4 matvec, only the A vector
1214
+ * is built from two reads + a sigmoid (no new reduction/barrier pattern).
1215
+ *
1216
+ * Runs after the dual fusions so it only sees the post-attention SigmoidGate.
1217
+ */
1218
+ private fuseGatedOProjDecodeEntries;
1219
+ /**
1220
+ * Fuse a standalone SwiGLU (silu(gate) * up) into the INT4 projection that
1221
+ * consumes its output: a SwiGLUGatedMatVecInt4 reads gate and up directly,
1222
+ * builds the gated input vector, and runs the projection in ONE dispatch.
1223
+ * Targets the Mamba block's mamba_swiglu (silu(z) * norm_out) feeding out_proj
1224
+ * — one round-trip saved per linear-attention layer.
1225
+ *
1226
+ * The MLP SwiGLU is already collapsed into a SwiGLUMatVec entry by
1227
+ * fuseSwiGLUDecodeEntries (it has no surviving standalone SwiGLU node), so only
1228
+ * the Mamba SwiGLU matches here. Numerically identical to SwiGLU→MatVecInt4;
1229
+ * WebKit risk low (reuses the proven INT4 matvec, only the A vector changes).
1230
+ */
1231
+ private fuseSwiGLUGatedProjDecodeEntries;
1232
+ /**
1233
+ * Fuse two adjacent per-row RMSNorms sharing hidden_size + eps into a single
1234
+ * DualRMSNorm dispatch (e.g. the per-head q_norm and k_norm in full-attention
1235
+ * decode). One workgroup still handles one row; the fused grid just spans both
1236
+ * inputs' rows, so each row's reduction is unchanged — numerically identical to
1237
+ * two separate RMSNorm dispatches. One round-trip saved per fused pair.
1238
+ *
1239
+ * WebKit risk: low — same single-workgroup reduction as the proven RMSNorm
1240
+ * kernel, only the row→input routing is added.
1241
+ */
1242
+ private fuseDualRMSNormDecodeEntries;
1243
+ /**
1244
+ * Gather buffer entries for a bind group, matching the kernel spec's binding layout.
1245
+ * Uses the pre-allocated inputIdsBuffer for the "input_ids" tensor.
1246
+ */
1247
+ private gatherBuffers;
1248
+ /** Lazily allocate a scratch storage buffer at least `minBytes` large. */
1249
+ private getBindingScratchBuffer;
1250
+ }
1251
+ //#endregion
1252
+ //#region src/gpu/kani-tts.d.ts
1253
+ /**
1254
+ * KaniTTS — native text-to-speech engine for Gerbil's WebGPU backend.
1255
+ *
1256
+ * Kani-TTS-2 (nineninesix/kani-tts-2-en) is a two-stage TTS model that, like
1257
+ * Moonshine, needs more than one graph:
1258
+ *
1259
+ * 1. CODEC-LM BACKBONE (LFM2-350M body): autoregressively emits NanoCodec audio
1260
+ * tokens (4 per frame) into the same vocab as text. Reuses LFM2's block math
1261
+ * with two KaniTTS2 deltas — frame-level position IDs (the 4 audio tokens of a
1262
+ * frame share a position) and learnable per-layer RoPE (α^(l)-scaled freqs) —
1263
+ * both folded host-side into per-layer cos/sin tables fed to the MRoPE op.
1264
+ * 2. NANOCODEC DECODER (NVIDIA NeMo 22 kHz): FSQ dequant + causal HiFi-GAN conv
1265
+ * decoder → 22 kHz PCM. Validated bit-exact vs MLX (test-nanocodec-decode.mjs).
1266
+ *
1267
+ * The AR loop runs on the host with full-logit readback so each frame's 4 audio
1268
+ * tokens are sampled per-codebook (constrained to the valid codebook window), then
1269
+ * the collected codes are decoded once through the NanoCodec graph.
1270
+ *
1271
+ * Validated on Dawn (desktop) via scripts/engine/test-kani-speak.mjs.
1272
+ */
1273
+ interface KaniTTSOptions {
1274
+ /** Backbone repo (default nineninesix/kani-tts-2-en). */
1275
+ repo?: string;
1276
+ /** NanoCodec repo (default the NeMo 22 kHz MLX checkpoint). */
1277
+ codecRepo?: string;
1278
+ revision?: string;
1279
+ hfToken?: string;
1280
+ cacheDir?: string;
1281
+ /** Max self-attn KV-cache length (prompt + generated). Default 2048. */
1282
+ maxSeqLen?: number;
1283
+ onProgress?: (loaded: number, total: number, message: string) => void;
1284
+ }
1285
+ interface SpeakOptions {
1286
+ /** Language/accent tag, e.g. "en_us" (default). Prepended as "{tag}: {text}". */
1287
+ languageTag?: string;
1288
+ /** Sampling temperature (default 1.0). */
1289
+ temperature?: number;
1290
+ /** Top-p nucleus threshold (default 0.95). */
1291
+ topP?: number;
1292
+ /** Repetition penalty (default 1.1). */
1293
+ repetitionPenalty?: number;
1294
+ /** Max audio frames to generate (caps duration). Default unbounded (maxSeqLen). */
1295
+ maxFrames?: number;
1296
+ /** Override max generated tokens (default 3000). */
1297
+ maxNewTokens?: number;
1298
+ }
1299
+ interface SpeakResult {
1300
+ /** Mono PCM in [-1, 1]. */
1301
+ pcm: Float32Array;
1302
+ /** Sample rate (22050). */
1303
+ sampleRate: number;
1304
+ /** Number of audio frames decoded. */
1305
+ frames: number;
1306
+ /** Audio duration in seconds. */
1307
+ audioSeconds: number;
1308
+ }
1309
+ declare class KaniTTS {
1310
+ private ctx;
1311
+ private loaded;
1312
+ private tokenizer;
1313
+ private cfg;
1314
+ private rawConfig;
1315
+ private maxSeqLen;
1316
+ /** Backbone executor (built once; reused across speak() calls). */
1317
+ private backboneExec;
1318
+ /** The attention layer indices (carry learnable α) and their α values. */
1319
+ private attnLayers;
1320
+ private layerAlpha;
1321
+ private headDim;
1322
+ private ropeBase;
1323
+ private _destroyed;
1324
+ readonly architecture = "KaniTTS2ForCausalLM";
1325
+ private constructor();
1326
+ static create(options?: KaniTTSOptions): Promise<KaniTTS>;
1327
+ /** Write per-layer cos/sin for token rows [rowStart, rowStart+positions.length). */
1328
+ private writeCosSin;
1329
+ /**
1330
+ * Synthesize speech for `text`. Returns 22 kHz mono PCM.
1331
+ *
1332
+ * Pipeline: build the [SOH]+text+[EOT,EOH] prompt → prefill the codec-LM →
1333
+ * AR-decode 4-token frames (per-codebook constrained sampling) until end_of_speech
1334
+ * → strip markers → codes → NanoCodec decode → PCM.
1335
+ */
1336
+ speak(text: string, opts?: SpeakOptions): Promise<SpeakResult>;
1337
+ /**
1338
+ * Autoregressive decode: from the prefill logits, emit one token per step —
1339
+ * greedy for the structural markers ([SOA][SOS]) and per-codebook constrained
1340
+ * sampling once in speech — collecting the audio tokens between SOS and EOS.
1341
+ * Writes each step's per-layer cos/sin row (frame-level logical position) before
1342
+ * the forward. Returns the collected audio tokens and whether EOS/cap was hit.
1343
+ */
1344
+ private runDecodeLoop;
1345
+ /** Logical (frame-level) position of the LAST token in `seq`. */
1346
+ private logicalPositionAt;
1347
+ /** Greedy argmax over a logits row. */
1348
+ private argmax;
1349
+ /**
1350
+ * Sample one audio token for codebook position `codebook`, constrained to that
1351
+ * codebook's valid window [audio_tokens_start + 4032*c, audio_tokens_start + 4032*(c+1)). Allows end_of_speech
1352
+ * only at codebook 0 (frame boundary). Applies temperature, top-p, rep-penalty.
1353
+ */
1354
+ private sampleAudioToken;
1355
+ /**
1356
+ * Decode NanoCodec codes [groups, T] (group-major) → PCM.
1357
+ *
1358
+ * The decoder graph carries concrete lengths, and the upsampled conv activations
1359
+ * for long clips overflow WebGPU's 65535 per-dimension dispatch cap. The decoder
1360
+ * is fully causal with a small (≤ a few frames) receptive field, so we decode in
1361
+ * frame chunks with a left-context lookback and keep only each chunk's own output
1362
+ * samples — numerically identical to a single decode, but bounded per dispatch.
1363
+ */
1364
+ private decodeCodes;
1365
+ /** Run the NanoCodec decoder graph for a single (bounded) code window → PCM. */
1366
+ private decodeCodesWindow;
1367
+ destroy(): void;
1368
+ }
1369
+ //#endregion
1370
+ //#region src/gpu/moonshine-executor.d.ts
1371
+ interface EncoderResult {
1372
+ /** encoder_out [S_enc, hidden] (debug / parity). */
1373
+ encoderOut: Float32Array;
1374
+ /** Per-decoder-layer frozen K, indexed by layer. */
1375
+ encK: Float32Array[];
1376
+ /** Per-decoder-layer frozen V, indexed by layer. */
1377
+ encV: Float32Array[];
1378
+ /** Encoder length (frames). */
1379
+ sEnc: number;
1380
+ }
1381
+ declare class MoonshineEncoderExecutor {
1382
+ private ctx;
1383
+ private graph;
1384
+ private decLayers;
1385
+ private hidden;
1386
+ private weightBuffers;
1387
+ private activationBuffers;
1388
+ private dispatches;
1389
+ constructor(ctx: GPUContext, graph: ModelGraph, decLayers: number);
1390
+ /**
1391
+ * Upload the encoder constants. `weights` holds canonical-named f32 tensors;
1392
+ * only those referenced by the graph are uploaded (the decoder weights are
1393
+ * uploaded into the decoder Executor separately).
1394
+ */
1395
+ uploadWeights(weights: Map<string, {
1396
+ data: ArrayBufferView;
1397
+ shape: number[];
1398
+ }>): void;
1399
+ initBindGroups(): void;
1400
+ /** Run the conv frontend + encoder + K/V projection. `pcm` is raw 16kHz mono. */
1401
+ encode(pcm: Float32Array): Promise<EncoderResult>;
1402
+ /** Read back a named activation buffer after encode() (debug / parity checks). */
1403
+ readActivation(name: string, maxElements?: number): Promise<Float32Array>;
1404
+ destroy(): void;
1405
+ /** Encoder frame count = first dim of encoder_out (resolved, numeric). */
1406
+ private encoderFrames;
1407
+ private resolveShapes;
1408
+ private allocateActivationBuffers;
1409
+ private readBack;
1410
+ private gatherBuffers;
1411
+ }
1412
+ //#endregion
1413
+ //#region src/gpu/moonshine-stt.d.ts
1414
+ /**
1415
+ * MoonshineSTT — native speech-to-text engine for Gerbil's WebGPU backend.
1416
+ *
1417
+ * Moonshine is an encoder-decoder ASR model with a raw-waveform conv frontend
1418
+ * (no FFT / mel spectrogram). Unlike the causal-LM path, it needs two graphs:
1419
+ *
1420
+ * 1. ENCODER (run once per utterance): conv frontend → bidirectional transformer
1421
+ * → encoder hidden state, which is then projected through every decoder
1422
+ * layer's cross-attention k_proj/v_proj into FROZEN K/V buffers.
1423
+ * 2. DECODER (autoregressive): causal self-attention with a growing KV-cache,
1424
+ * plus cross-attention into the frozen encoder K/V at every step.
1425
+ *
1426
+ * The conv frontend is length-static (Conv1dFull carries concrete L/Lout), so the
1427
+ * encoder graph is regenerated per utterance from the input sample count, and the
1428
+ * decoder graph is regenerated with the resulting encoder frame count (S_enc).
1429
+ * Weights are uploaded once and reused across utterances via per-call executors.
1430
+ *
1431
+ * Validated on Dawn (desktop) via scripts/engine/test-moonshine-transcribe.mjs.
1432
+ * The kernels are mobile-safe (≤16KB workgroup memory, clamped exp/tanh, no
1433
+ * select(), no `enable f16`) and the executors use the WebKit submit/drain
1434
+ * discipline, so the same path runs on iPad.
1435
+ */
1436
+ interface MoonshineSTTOptions {
1437
+ /** HF repo (default UsefulSensors/moonshine-base). */
1438
+ repo?: string;
1439
+ revision?: string;
1440
+ hfToken?: string;
1441
+ cacheDir?: string;
1442
+ onProgress?: (loaded: number, total: number, message: string) => void;
1443
+ }
1444
+ interface TranscribeOptions {
1445
+ /** Stop after this many decoded tokens (default 194). */
1446
+ maxNewTokens?: number;
1447
+ }
1448
+ interface TranscribeResult {
1449
+ text: string;
1450
+ /** Decoded token ids (excluding the start token, including the trailing EOS). */
1451
+ tokens: number[];
1452
+ /** Number of encoder frames produced by the conv frontend. */
1453
+ encoderFrames: number;
1454
+ /** Audio duration in seconds (samples / 16000). */
1455
+ audioSeconds: number;
1456
+ }
1457
+ declare class MoonshineSTT {
1458
+ private ctx;
1459
+ private weights;
1460
+ private tokenizer;
1461
+ private rawConfig;
1462
+ private bosTokenId;
1463
+ private eosTokenId;
1464
+ private decoderStartTokenId;
1465
+ private _destroyed;
1466
+ /** HF architecture string, for parity with WebGPUEngine. */
1467
+ readonly architecture = "MoonshineForConditionalGeneration";
1468
+ private constructor();
1469
+ /** Download + initialize a Moonshine STT engine. */
1470
+ static create(options?: MoonshineSTTOptions): Promise<MoonshineSTT>;
1471
+ /**
1472
+ * Transcribe raw 16 kHz mono PCM. Runs the conv frontend + encoder once, then
1473
+ * greedily AR-decodes with cross-attention into the frozen encoder K/V, stopping
1474
+ * on EOS. Returns the detokenized transcript.
1475
+ */
1476
+ transcribe(pcm: Float32Array, opts?: TranscribeOptions): Promise<TranscribeResult>;
1477
+ destroy(): void;
1478
+ }
1479
+ //#endregion
1480
+ //#region src/gpu/vision-executor.d.ts
1481
+ interface VisionInputs {
1482
+ /** Flattened patches [N, patch_dim]. */
1483
+ patches: Float32Array;
1484
+ /** Bilinear-interpolated learned pos embeddings [N, hidden_size]. */
1485
+ posEmbeds: Float32Array;
1486
+ /** Precomputed rotary cos [N, head_dim]. */
1487
+ cos: Float32Array;
1488
+ /** Precomputed rotary sin [N, head_dim]. */
1489
+ sin: Float32Array;
1490
+ /** Number of patches (rows). */
1491
+ numPatches: number;
1492
+ }
1493
+ /**
1494
+ * Coarse stage callback for the ViT encode. Fires before each transformer layer
1495
+ * (and the pre/post stages) so a host harness can localize a mobile GPU-process
1496
+ * crash to a specific layer instead of seeing only "crashed after load". Kept
1497
+ * synchronous and cheap; throwing is the caller's responsibility.
1498
+ */
1499
+ type VisionStageCallback = (stage: string, info?: {
1500
+ layer?: number;
1501
+ total?: number;
1502
+ }) => void;
1503
+ declare class VisionExecutor {
1504
+ private ctx;
1505
+ private graph;
1506
+ private mergeUnit;
1507
+ private weightBuffers;
1508
+ private activationBuffers;
1509
+ private dispatches;
1510
+ private maxPatches;
1511
+ /** Weight (B) names of MatMulBias nodes stored as f16 (empty without shader-f16). */
1512
+ private f16WeightNames;
1513
+ /** Runtime pooled-token count for the Gemma 4 ViT ("Np" dim); 0 for Qwen. */
1514
+ private gemma4Np;
1515
+ /** True when this graph is the Gemma 4 vision tower (uses "Np" + PoolMatMul). */
1516
+ private readonly isGemma4;
1517
+ constructor(ctx: GPUContext, graph: ModelGraph, maxPatches: number);
1518
+ uploadWeights(weights: Map<string, {
1519
+ data: ArrayBufferView;
1520
+ shape: number[];
1521
+ }>): void;
1522
+ initBindGroups(): void;
1523
+ /**
1524
+ * Encode patches → merged image embeddings [Nm, out_hidden_size].
1525
+ *
1526
+ * `onStage` (optional) fires coarse phase breadcrumbs during the WebKit path so
1527
+ * a host can localize a GPU-process crash to a specific layer.
1528
+ */
1529
+ encode(inputs: VisionInputs, onStage?: VisionStageCallback): Promise<{
1530
+ embeds: Float32Array;
1531
+ rows: number;
1532
+ dim: number;
1533
+ }>;
1534
+ /**
1535
+ * Encode patches through the Gemma 4 ViT → projected image tokens [Np, text_hidden].
1536
+ *
1537
+ * Distinct from the Qwen `encode()`: the Gemma graph has 5 inputs (patches,
1538
+ * axial pos-embeds, axial rotary cos/sin, and a host-built [Np,N] pooling matrix)
1539
+ * and its output rows (Np) are the pooled soft-token count, resolved from the
1540
+ * pooling matrix rather than an N/mergeUnit ratio. Reuses the same dispatch
1541
+ * machinery + WebKit per-dispatch-drain discipline.
1542
+ */
1543
+ encodeGemma4(inputs: {
1544
+ patches: Float32Array;
1545
+ posEmbeds: Float32Array;
1546
+ cos: Float32Array;
1547
+ sin: Float32Array;
1548
+ poolMatrix: Float32Array;
1549
+ numPatches: number;
1550
+ numPooled: number;
1551
+ }, onStage?: VisionStageCallback): Promise<{
1552
+ embeds: Float32Array;
1553
+ rows: number;
1554
+ dim: number;
1555
+ }>;
1556
+ /** True if this executor is the Gemma 4 vision tower. */
1557
+ get gemma4(): boolean;
1558
+ /** Read back any named activation (debug). Must be called right after encode(). */
1559
+ debugReadBuffer(name: string, maxElements?: number): Promise<Float32Array>;
1560
+ destroy(): void;
1561
+ /** Max pooled tokens for buffer sizing: maxPatches with no merge/pool ratio applied. */
1562
+ private maxPooled;
1563
+ private resolveShapes;
1564
+ private allocateActivationBuffers;
1565
+ private gatherBuffers;
1566
+ }
1567
+ //#endregion
1568
+ //#region src/gpu/index.d.ts
1569
+ interface WebGPUEngineOptions extends Omit<LoadModelOptions, "repo"> {
1570
+ /**
1571
+ * HuggingFace repo ID (e.g. "mlx-community/Qwen3.5-0.8B-4bit") or full URL.
1572
+ * Optional — when omitted, a sensible default is chosen for the requested
1573
+ * capability (text, vision, or embeddings). See {@link DEFAULT_MODELS}.
1574
+ */
1575
+ repo?: string;
1576
+ /** Max sequence length (default: from model config, capped at 4096). */
1577
+ maxSeqLen?: number;
1578
+ /** Override KV mode: "f32", "native-f16", or "packed-f16". Auto-detected if omitted. */
1579
+ kvMode?: KvMode;
1580
+ /**
1581
+ * Build the vision encoder (Qwen3.5 ViT) alongside the text model so
1582
+ * `encodeImage()` can turn image patches into merged image-embedding tokens.
1583
+ * Only valid for vision-capable checkpoints (Qwen3.5). Downloads the ~192MB
1584
+ * vision tower. Default: false.
1585
+ */
1586
+ enableVision?: boolean;
1587
+ /** Max patches the vision encoder can process in one call (default 4096). */
1588
+ maxVisionPatches?: number;
1589
+ }
1590
+ interface EncodeImageResult {
1591
+ /** Merged image-embedding tokens, row-major [rows * dim]. */
1592
+ embeds: Float32Array;
1593
+ /** Number of merged tokens (numPatches / spatial_merge_size^2). */
1594
+ rows: number;
1595
+ /** Embedding dimension (out_hidden_size, 1024 for Qwen3.5). */
1596
+ dim: number;
1597
+ }
1598
+ interface EmbedOptions {
1599
+ /**
1600
+ * Instruction prefix for query embeddings (Qwen3-Embedding convention:
1601
+ * "Instruct: {task}\nQuery:{text}"). Omit for document embeddings.
1602
+ */
1603
+ instruction?: string;
1604
+ /**
1605
+ * EmbeddingGemma task prefix. The model is asymmetric: queries and documents
1606
+ * use different prefixes (`task: search result | query: ` vs `title: none |
1607
+ * text: `). Pass "query" for search queries and "document" for the corpus
1608
+ * being searched. Defaults to "query" for EmbeddingGemma when omitted. Ignored
1609
+ * by non-Gemma embedding models (use `instruction` for Qwen3-Embedding).
1610
+ */
1611
+ taskType?: "query" | "document";
1612
+ /**
1613
+ * Override the raw task prefix prepended to the text (EmbeddingGemma). When
1614
+ * set, takes precedence over `taskType`. Use for non-retrieval tasks, e.g.
1615
+ * "task: clustering | query: " or "task: classification | query: ".
1616
+ */
1617
+ taskPrompt?: string;
1618
+ /** Max tokens to encode (longer inputs are truncated). Default: model context, capped at maxSeqLen. */
1619
+ maxTokens?: number;
1620
+ }
1621
+ interface GenerateOptions {
1622
+ /** Max tokens to generate (default: 512). */
1623
+ maxTokens?: number;
1624
+ /** Stop generation on these strings. */
1625
+ stopSequences?: string[];
1626
+ /** Sampling parameters. */
1627
+ sampling?: SamplingParams;
1628
+ /** System prompt to prepend. */
1629
+ systemPrompt?: string;
1630
+ /** Callback for each generated token (for streaming). */
1631
+ onToken?: (token: string) => void;
1632
+ }
1633
+ interface GenerateResult {
1634
+ /** Generated text. */
1635
+ text: string;
1636
+ /** Number of tokens generated. */
1637
+ tokensGenerated: number;
1638
+ /** Tokens per second. */
1639
+ tokensPerSecond: number;
1640
+ /** Total generation time in ms. */
1641
+ totalTime: number;
1642
+ /** Why generation stopped. */
1643
+ finishReason: "eos" | "max_tokens" | "stop_sequence";
1644
+ /** Thinking content if model produced it (future). */
1645
+ thinking?: string;
1646
+ }
1647
+ /**
1648
+ * A minimal JSON-schema-ish shape used by {@link WebGPUEngine.generateObject} to
1649
+ * validate generated output without pulling in a schema library. Only `required`
1650
+ * and `properties` are inspected (presence of required keys). Pass a predicate
1651
+ * function instead for arbitrary validation.
1652
+ */
1653
+ interface ObjectSchema {
1654
+ /** Keys that must be present on the parsed object. */
1655
+ required?: string[];
1656
+ /** Property descriptors (only the key set is used for validation). */
1657
+ properties?: Record<string, unknown>;
1658
+ /** Allow extra schema fields (type, etc.) without TS complaints. */
1659
+ [key: string]: unknown;
1660
+ }
1661
+ /** Validator for {@link WebGPUEngine.generateObject}: a schema object or predicate. */
1662
+ type ObjectValidator<T = unknown> = ObjectSchema | ((o: T) => boolean);
1663
+ interface GenerateObjectOptions extends GenerateOptions {
1664
+ /**
1665
+ * Validation target. Either a predicate `(o) => boolean` or a minimal
1666
+ * JSON-schema-ish object with `required`/`properties` (required keys must
1667
+ * exist). Omit to only require syntactically valid JSON.
1668
+ */
1669
+ schema?: ObjectValidator;
1670
+ /**
1671
+ * Max RETRIES after the first attempt (so up to `maxRetries + 1` generations).
1672
+ * Default: 4.
1673
+ */
1674
+ maxRetries?: number;
1675
+ }
1676
+ interface GenerateObjectResult<T = unknown> {
1677
+ /** The parsed + validated object (or array). */
1678
+ object: T;
1679
+ /** The raw model text the object was extracted from. */
1680
+ text: string;
1681
+ /** How many generation attempts it took (1 = first try). */
1682
+ attempts: number;
1683
+ }
1684
+ interface IntegrityCheckEntry {
1685
+ label: string;
1686
+ length: number;
1687
+ sum: number;
1688
+ first4: number[];
1689
+ argmax: number;
1690
+ maxVal: number;
1691
+ match?: "PASS" | "FAIL";
1692
+ refSum?: number;
1693
+ refArgmax?: number;
1694
+ error?: string;
1695
+ note?: string;
1696
+ }
1697
+ interface IntegrityCheckResult {
1698
+ checks: IntegrityCheckEntry[];
1699
+ allPass: boolean;
1700
+ }
1701
+ /**
1702
+ * The main WebGPU inference engine.
1703
+ *
1704
+ * Usage:
1705
+ * const engine = await WebGPUEngine.create({ repo: "Qwen/Qwen3.5-0.8B" });
1706
+ * const result = await engine.generate("Hello!");
1707
+ * console.log(result.text);
1708
+ * engine.destroy();
1709
+ */
1710
+ declare class WebGPUEngine {
1711
+ private ctx;
1712
+ private executor;
1713
+ private tokenizer;
1714
+ private _destroyed;
1715
+ private _isEmbedding;
1716
+ /** HF architecture string (e.g. "Gemma3TextModel", "Qwen3ForCausalLM"). */
1717
+ private _architecture;
1718
+ /** Vision encoder (built only when enableVision and the model is vision-capable). */
1719
+ private visionExecutor;
1720
+ /** Raw vision_config (for host preprocessing of grids). */
1721
+ private visionConfig;
1722
+ /** Raw pos_embed.weight table for bilinear interpolation. */
1723
+ private visionPosEmbedTable;
1724
+ /** True when the LM graph was built with the multimodal (M-RoPE + splice) path. */
1725
+ private _multimodalGraph;
1726
+ /** Raw config.json (for M-RoPE params: mrope_section, rope_theta, partial factor). */
1727
+ private rawConfig;
1728
+ /** Effective max sequence length (cos/sin table coverage). */
1729
+ private maxSeqLen;
1730
+ /** Original create() options (used to lazily spin up the Kani-TTS engine for speak()). */
1731
+ private _createOptions;
1732
+ /** Lazily-created Kani-TTS engine (codec-LM + NanoCodec) backing speak(). */
1733
+ private _kaniTTS;
1734
+ /**
1735
+ * WebKit group-size probe state. When true, a candidate group size is being
1736
+ * tried this page-load and must be promoted (or capped) after the FIRST
1737
+ * successful forward produces non-corrupt logits. Goes false once handled so
1738
+ * promotion runs at most once per session. Always false on Dawn/node.
1739
+ */
1740
+ private _groupProbePending;
1741
+ /** Model capabilities (text, vision, moe). */
1742
+ readonly capabilities: ModelCapabilities;
1743
+ /** Model architecture config. */
1744
+ readonly config: ModelArchConfig;
1745
+ private constructor();
1746
+ /** True if this engine has a vision encoder built (use encodeImage()). */
1747
+ get hasVision(): boolean;
1748
+ /** Per-opType decode GPU-time breakdown (only populated under GERBIL_PROFILE). */
1749
+ getDecodeProfile(): Array<{
1750
+ opType: string;
1751
+ ns: number;
1752
+ count: number;
1753
+ }>;
1754
+ /** Clear accumulated decode profiler data (e.g. to drop warm-up tokens). */
1755
+ resetDecodeProfile(): void;
1756
+ /** Profile ONE real decode step (the pipelined-greedy kernels). Token-independent
1757
+ * timing — pass any valid id. Only meaningful under GERBIL_PROFILE. */
1758
+ profileDecodeStep(tokenId: number): Promise<void>;
1759
+ /** Decode dispatch count + the device's storage-buffer limit (which gates the
1760
+ * INT4 projection fusions). Lets the iPad runner report whether fusions applied
1761
+ * on-device or silently fell back (8 < 9 ⇒ more dispatches ⇒ more mobile drains). */
1762
+ getDecodeStats(): {
1763
+ dispatches: number;
1764
+ maxStorageBuffers: number;
1765
+ };
1766
+ /**
1767
+ * Write a coarse crash-phase breadcrumb that survives a GPU-process kill / page
1768
+ * reload. The iPad harness reads `localStorage["gerbil-crash-phase"]` after a
1769
+ * crash; without these, a describe-time crash only shows the last load phase
1770
+ * ("engine:ready"). The describe path tags vit-encode / splice / text-decode so
1771
+ * the next run shows WHERE it died, not just "crashed after load".
1772
+ */
1773
+ private setPhase;
1774
+ /** True if this engine was loaded as an embedding model (use embed(), not generate()). */
1775
+ get isEmbedding(): boolean;
1776
+ /**
1777
+ * WebKit group-size probe promotion hook. Runs at most once per session, after
1778
+ * the FIRST forward completes without the page dying. If the page had crashed
1779
+ * at this group size, this code never runs and the localStorage breadcrumb
1780
+ * (left by the resolver) caps the device on the next load — that is what makes
1781
+ * the probe survive the crash class. Here we additionally handle the
1782
+ * wrong-output class by inspecting the first forward's logits for corruption
1783
+ * (NaN / Inf / all-zero / all-same), reusing the same signals as integrityCheck().
1784
+ */
1785
+ private maybePromoteGroupProbe;
1786
+ /**
1787
+ * Create and initialize a WebGPUEngine.
1788
+ *
1789
+ * Downloads the model from HuggingFace, compiles shaders, uploads weights.
1790
+ */
1791
+ static create(options?: WebGPUEngineOptions): Promise<WebGPUEngine>;
1792
+ /**
1793
+ * Encode an image (already preprocessed into patches) into merged
1794
+ * image-embedding tokens of dim `out_hidden_size` (1024 for Qwen3.5).
1795
+ *
1796
+ * This is the VISION ENCODER ONLY — it returns the image tokens; it does not
1797
+ * splice them into a text sequence or apply M-RoPE (that is the LM-side
1798
+ * integration phase). Requires `enableVision: true` at create() on a
1799
+ * vision-capable checkpoint.
1800
+ *
1801
+ * @param patches Flattened patches, row-major [numPatches, patch_dim].
1802
+ * patch_dim = in_channels * temporal_patch_size * patch_size^2 (1536 for Qwen3.5).
1803
+ * Patches must already be ordered in spatial_merge_size×spatial_merge_size
1804
+ * groups (as the HF image processor emits them).
1805
+ * @param gridTHW The (temporal, height, width) patch-grid dims for the image.
1806
+ * numPatches must equal t*h*w.
1807
+ */
1808
+ encodeImage(patches: Float32Array, gridTHW: [number, number, number], onStage?: (stage: string, info?: {
1809
+ layer?: number;
1810
+ total?: number;
1811
+ }) => void): Promise<EncodeImageResult>;
1812
+ /** Resolve M-RoPE params from rawConfig: rope_dim, theta, mrope_section. */
1813
+ private mropeParams;
1814
+ /**
1815
+ * Write the M-RoPE cos/sin (token order) + image row-map for a prefill of
1816
+ * `positionIds3` ([3, seq]). `rowMap[i]` = vision-buffer row for image tokens,
1817
+ * -1 for text. Returns the logical position of the last token (for decode).
1818
+ */
1819
+ private writeMRoPEPrefill;
1820
+ /**
1821
+ * Write a single decode-step M-RoPE cos/sin row at table slot `seqPos` for a
1822
+ * text token at logical position `logicalPos`, plus a -1 row-map entry.
1823
+ */
1824
+ private writeMRoPEDecodeStep;
1825
+ /** Write linear-position M-RoPE inputs for a pure-text forward (no image). */
1826
+ private writeMRoPELinearText;
1827
+ /**
1828
+ * Generate text from a prompt.
1829
+ */
1830
+ generate(prompt: string | ChatMessage[], options?: GenerateOptions): Promise<GenerateResult>;
1831
+ /**
1832
+ * Generate a STRUCTURED object: generate text, extract the first JSON
1833
+ * object/array, parse it, validate it, and RETRY until it is valid (on-device
1834
+ * tokens are free, so re-rolling a malformed JSON is cheap).
1835
+ *
1836
+ * Extraction is tolerant: prose, markdown, and ```json code fences are
1837
+ * stripped, then the outermost balanced `{...}` or `[...]` is matched and
1838
+ * `JSON.parse`d. Validation is one of:
1839
+ * - a predicate `(o) => boolean` (return false to reject),
1840
+ * - a minimal JSON-schema-ish object with `required` (those keys must exist),
1841
+ * - nothing (only valid JSON is required).
1842
+ *
1843
+ * On each retry the prompt is nudged with a terse "return ONLY valid JSON…"
1844
+ * instruction (including the required-key shape when known). Throws a clear
1845
+ * error if it never validates within `maxRetries + 1` attempts.
1846
+ *
1847
+ * ```ts
1848
+ * const { object } = await engine.generateObject(
1849
+ * 'Extract {name, age} from: "I am Sarah, 28"',
1850
+ * { schema: { required: ["name", "age"] } },
1851
+ * );
1852
+ * // object === { name: "Sarah", age: 28 }
1853
+ * ```
1854
+ *
1855
+ * @typeParam T Expected object type (not enforced at runtime — validate via schema).
1856
+ */
1857
+ generateObject<T = unknown>(prompt: string, options?: GenerateObjectOptions): Promise<GenerateObjectResult<T>>;
1858
+ /**
1859
+ * Text-to-speech: text → 22 kHz PCM via Kani-TTS-2 (LFM2-350M codec-LM + NVIDIA
1860
+ * NeMo NanoCodec). Returns `{ pcm: Float32Array, sampleRate: 22050, frames, audioSeconds }`.
1861
+ *
1862
+ * Runs the full pipeline: the codec-LM backbone autoregressively emits NanoCodec
1863
+ * audio tokens (4 per frame, frame-level positions + learnable per-layer RoPE),
1864
+ * then the bit-exact NanoCodec decoder (FSQ + causal HiFi-GAN) turns the codes
1865
+ * into PCM. The heavy lifting lives in {@link KaniTTS} (src/gpu/kani-tts.ts); this
1866
+ * lazily constructs that engine on first use (downloading the NanoCodec codec
1867
+ * checkpoint alongside the backbone).
1868
+ *
1869
+ * Requires a Kani-TTS-2 checkpoint (architecture "KaniTTS2ForCausalLM").
1870
+ */
1871
+ speak(text: string, options?: {
1872
+ languageTag?: string;
1873
+ temperature?: number;
1874
+ topP?: number;
1875
+ repetitionPenalty?: number;
1876
+ maxFrames?: number;
1877
+ }): Promise<{
1878
+ pcm: Float32Array;
1879
+ sampleRate: number;
1880
+ frames: number;
1881
+ audioSeconds: number;
1882
+ }>;
1883
+ /**
1884
+ * Describe an image: image-in → text-out. Runs the vision encoder, splices the
1885
+ * merged image tokens into a text prompt, applies multimodal M-RoPE, and
1886
+ * generates a description. Requires `enableVision: true` at create().
1887
+ *
1888
+ * Image input forms:
1889
+ * - `{ pixels, width, height }` — decoded RGB (HWC, 0..255), host-preprocessed
1890
+ * (smart-resize/normalize/patchify) to match the HF image processor.
1891
+ * - `{ patches, gridTHW }` — already-built [N,1536] patch tensor + grid (e.g.
1892
+ * HF-exact pixel_values from a reference; skips host preprocessing).
1893
+ */
1894
+ describeImage(image: {
1895
+ pixels: Float32Array | Uint8ClampedArray | Uint8Array;
1896
+ width: number;
1897
+ height: number;
1898
+ } | {
1899
+ patches: Float32Array;
1900
+ gridTHW: [number, number, number];
1901
+ }, prompt?: string, options?: GenerateOptions & {
1902
+ imageProcessor?: ImageProcessorConfig;
1903
+ }): Promise<GenerateResult>;
1904
+ /**
1905
+ * Prepare the multimodal prefill: upload vision embeds, build the image row-map
1906
+ * and 3D M-RoPE cos/sin, reset state, and write all host inputs. Returns the
1907
+ * input ids and the post-image logical cursor for decode. Does NOT run forward.
1908
+ */
1909
+ private prepareMultimodalPrefill;
1910
+ /**
1911
+ * Gemma 4 multimodal prefill + decode. Unlike Qwen3.5 (M-RoPE), Gemma 4 uses
1912
+ * STANDARD sequential 1D RoPE computed inside each layer from the KV write
1913
+ * position, so there are no host cos/sin inputs and decode positions are simply
1914
+ * the running seqPos — identical to plain text generation. We only upload the
1915
+ * merged vision embeds + an image-token row-map (EmbedSplice scatters them into
1916
+ * the image_token rows) before the forward pass.
1917
+ */
1918
+ private runMultimodalGemma4;
1919
+ /** Prepare + prefill + decode for a fully-specified multimodal token sequence. */
1920
+ private runMultimodal;
1921
+ /**
1922
+ * Debug: run ONLY the multimodal prefill for an explicit token sequence and
1923
+ * return the spliced input embeddings [seq, hidden] + first-token logits. Lets
1924
+ * tests compare the fused text+vision stream and M-RoPE numerically vs HF
1925
+ * without the decode loop overwriting intermediate buffers.
1926
+ */
1927
+ debugMultimodalPrefill(patches: Float32Array, gridTHW: [number, number, number], inputIds: number[]): Promise<{
1928
+ splicedEmbeds: Float32Array;
1929
+ logits: Float32Array;
1930
+ seq: number;
1931
+ }>;
1932
+ /**
1933
+ * Internal: run prefill (assumes M-RoPE/splice inputs already written) + decode,
1934
+ * with decode logical positions starting at `decodeStartPos`. Used by
1935
+ * describeImage so the post-image cursor is honored.
1936
+ */
1937
+ private generateFromPrepared;
1938
+ /**
1939
+ * Embed text into an L2-normalized vector. The pooling strategy depends on the
1940
+ * model: Qwen3-Embedding uses last-token (EOS-position) pooling, while
1941
+ * EmbeddingGemma (Gemma3 encoder) uses mean pooling over all tokens followed by
1942
+ * a 2-layer Dense head. Requires an embedding model (loaded with
1943
+ * { embedding: true }).
1944
+ *
1945
+ * The returned Float32Array has unit L2 norm, so cosine similarity reduces to a
1946
+ * dot product. Length is the model's embedding dim (768 for EmbeddingGemma;
1947
+ * config.hidden_size for Qwen3-Embedding).
1948
+ *
1949
+ * EmbeddingGemma is asymmetric — pass `{ taskType: "query" }` for search
1950
+ * queries and `{ taskType: "document" }` for the corpus, or a raw
1951
+ * `{ taskPrompt }` for other tasks (clustering/classification/STS).
1952
+ */
1953
+ embed(text: string, options?: EmbedOptions): Promise<Float32Array>;
1954
+ /**
1955
+ * Generate text as an async iterator (streaming).
1956
+ *
1957
+ * Uses the onToken callback from generate() to push tokens into a queue
1958
+ * that the async generator yields from. The generator returns the full
1959
+ * GenerateResult when generation completes.
1960
+ *
1961
+ * Usage:
1962
+ * const gen = engine.stream("Hello!");
1963
+ * for await (const token of gen) {
1964
+ * process.stdout.write(token);
1965
+ * }
1966
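+ * // NOTE: for await...of discards the generator's return value. To read the GenerateResult, drive the generator manually with `await gen.next()` until it resolves to { done: true, value: GenerateResult }.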
1967
+ */
1968
+ stream(prompt: string | ChatMessage[], options?: GenerateOptions): AsyncGenerator<string, GenerateResult, undefined>;
1969
+ /**
1970
+ * Debug: read back a named GPU buffer (weight or activation).
1971
+ * Call after forward() to inspect intermediate values.
1972
+ */
1973
+ debugReadBuffer(tensorName: string, maxElements?: number): Promise<Float32Array>;
1974
+ /**
1975
+ * Run GPU diagnostics (buffer integrity, compute, shared memory).
1976
+ * Useful for isolating Safari/WebKit-specific WebGPU issues.
1977
+ */
1978
+ diagnose(): Promise<GPUDiagnosticResult>;
1979
+ /**
1980
+ * Run GPU diagnostics without loading a model.
1981
+ * Quick way to check if WebGPU is working correctly on this device.
1982
+ */
1983
+ static quickDiagnose(): Promise<GPUDiagnosticResult>;
1984
+ /**
1985
+ * Run a raw forward pass (no tokenization/chat template).
1986
+ * Returns logits for the last token.
1987
+ */
1988
+ rawForward(inputIds: Uint32Array): Promise<{
1989
+ logits: Float32Array;
1990
+ }>;
1991
+ /**
1992
+ * Reset executor state (SSM, positions, etc.)
1993
+ */
1994
+ resetState(): void;
1995
+ /**
1996
+ * Encode text to token IDs (useful for debugging / token counting).
1997
+ */
1998
+ encode(text: string): number[];
1999
+ /**
2000
+ * Decode token IDs to text.
2001
+ */
2002
+ decode(ids: number[], skipSpecialTokens?: boolean): string;
2003
+ /**
2004
+ * Integrity check: reads back key weight tensors and runs a single forward pass,
2005
+ * returning checksums for comparison against a known-good reference (Dawn/Node.js).
2006
+ *
2007
+ * Use this to isolate Safari/iPad corruption:
2008
+ * - If weights mismatch → fetch/download pipeline is corrupt
2009
+ * - If weights match but logits mismatch → kernel computation bug on Metal
2010
+ *
2011
+ * Resets executor state before and after (safe to call anytime).
2012
+ */
2013
+ integrityCheck(): Promise<IntegrityCheckResult>;
2014
+ /**
2015
+ * Destroy the engine and free all GPU resources.
2016
+ */
2017
+ destroy(): void;
2018
+ private checkDestroyed;
2019
+ }
2020
+ //#endregion
2021
+ export { buildGemma4RotaryCosSin as $, parseMoonshineConfig as A, resolveGemma4VisionInfo as B, SpeakResult as C, ModelArchConfig as Ct, generateMoonshineDecoderGraph as D, MOONSHINE_REMAINING_WORK as E, Gemma4VisionGraphInfo as F, Gemma4VisionPositionTensors as G, resolveDefaultRepo as H, dequantizeGemma4VisionProjection as I, QWEN3_5_IMAGE_PROCESSOR as J, ImageProcessorConfig as K, dequantizeMLXProjection as L, generateKaniTtsGraph as M, generateNanoCodecDecoderGraph as N, generateMoonshineEncoderGraph as O, parseKaniConfig as P, buildGemma4PosEmbeds as Q, generateGemma4VisionGraph as R, SpeakOptions as S, KvMode as St, generateQwen3_5VisionGraph as T, GEMMA4_IMAGE_PROCESSOR as U, DEFAULT_MODELS as V, Gemma4VisionGridConfig as W, VisionPositionTensors as X, VisionGridConfig as Y, buildGemma4PoolMatrix as Z, TranscribeOptions as _, ChatMessage as _t, GenerateOptions as a, buildRotaryCosSin as at, KaniTTS as b, GraphDType as bt, IntegrityCheckResult as c, preprocessImage as ct, WebGPUEngine as d, SamplingParams as dt, buildGemma4VisionPositionTensors as et, WebGPUEngineOptions as f, LoadedKaniTTS as ft, MoonshineSTTOptions as g, loadMoonshine as gt, MoonshineSTT as h, loadModel as ht, GenerateObjectResult as i, buildPositionIds as it, audioTokensToCodes as j, moonshineEncoderFrames as k, ObjectSchema as l, preprocessImageGemma4 as lt, VisionInputs as m, loadKaniTTS as mt, EncodeImageResult as n, buildMRoPEPositionIds as nt, GenerateResult as o, buildVisionPositionTensors as ot, VisionExecutor as p, LoadedMoonshine as pt, PreprocessedImage as q, GenerateObjectOptions as r, buildPosEmbeds as rt, IntegrityCheckEntry as s, mropeFreqDims as st, EmbedOptions as t, buildMRoPECosSin as tt, ObjectValidator as u, smartResize as ut, TranscribeResult as v, GPUDiagnosticResult as vt, Executor as w, ModelCapabilities as wt, KaniTTSOptions as x, KVDType as xt, MoonshineEncoderExecutor as y, initGPU as yt, patchGemma4VisionClips as z };
2022
+ //# sourceMappingURL=index-jEAL2s-A.d.mts.map