@simulatte/doppler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1182)
  1. package/BRANDING.md +14 -0
  2. package/LICENSE +201 -0
  3. package/NOTICE +5 -0
  4. package/README.md +85 -0
  5. package/SECURITY.md +19 -0
  6. package/package.json +144 -0
  7. package/src/adapters/adapter-manager.d.ts +200 -0
  8. package/src/adapters/adapter-manager.js +509 -0
  9. package/src/adapters/adapter-manifest.d.ts +290 -0
  10. package/src/adapters/adapter-manifest.js +320 -0
  11. package/src/adapters/adapter-registry.d.ts +192 -0
  12. package/src/adapters/adapter-registry.js +466 -0
  13. package/src/adapters/index.d.ts +89 -0
  14. package/src/adapters/index.js +42 -0
  15. package/src/adapters/lora-loader.d.ts +105 -0
  16. package/src/adapters/lora-loader.js +397 -0
  17. package/src/bootstrap.d.ts +1 -0
  18. package/src/bootstrap.js +30 -0
  19. package/src/bridge/extension/background.d.ts +14 -0
  20. package/src/bridge/extension/background.js +168 -0
  21. package/src/bridge/extension/manifest.json +34 -0
  22. package/src/bridge/extension-client.d.ts +109 -0
  23. package/src/bridge/extension-client.js +369 -0
  24. package/src/bridge/index.d.ts +68 -0
  25. package/src/bridge/index.js +51 -0
  26. package/src/bridge/protocol.d.ts +96 -0
  27. package/src/bridge/protocol.js +130 -0
  28. package/src/browser/browser-converter.d.ts +71 -0
  29. package/src/browser/browser-converter.js +947 -0
  30. package/src/browser/file-picker.d.ts +63 -0
  31. package/src/browser/file-picker.js +275 -0
  32. package/src/browser/gguf-importer.d.ts +136 -0
  33. package/src/browser/gguf-importer.js +532 -0
  34. package/src/browser/gguf-parser-browser.d.ts +14 -0
  35. package/src/browser/gguf-parser-browser.js +17 -0
  36. package/src/browser/quantization.d.ts +69 -0
  37. package/src/browser/quantization.js +328 -0
  38. package/src/browser/safetensors-parser-browser.d.ts +193 -0
  39. package/src/browser/safetensors-parser-browser.js +264 -0
  40. package/src/browser/shard-io-browser.d.ts +57 -0
  41. package/src/browser/shard-io-browser.js +89 -0
  42. package/src/browser/tensor-source-download.d.ts +27 -0
  43. package/src/browser/tensor-source-download.js +239 -0
  44. package/src/browser/tensor-source-file.d.ts +26 -0
  45. package/src/browser/tensor-source-file.js +53 -0
  46. package/src/browser/tensor-source-http.d.ts +28 -0
  47. package/src/browser/tensor-source-http.js +126 -0
  48. package/src/client/doppler-provider/generation.d.ts +25 -0
  49. package/src/client/doppler-provider/generation.js +114 -0
  50. package/src/client/doppler-provider/index.d.ts +2 -0
  51. package/src/client/doppler-provider/index.js +3 -0
  52. package/src/client/doppler-provider/model-manager.d.ts +61 -0
  53. package/src/client/doppler-provider/model-manager.js +667 -0
  54. package/src/client/doppler-provider/provider.d.ts +5 -0
  55. package/src/client/doppler-provider/provider.js +102 -0
  56. package/src/client/doppler-provider/source-runtime.d.ts +22 -0
  57. package/src/client/doppler-provider/source-runtime.js +522 -0
  58. package/src/client/doppler-provider/types.d.ts +127 -0
  59. package/src/client/doppler-provider/types.js +17 -0
  60. package/src/client/doppler-provider.d.ts +46 -0
  61. package/src/client/doppler-provider.js +36 -0
  62. package/src/config/README.md +69 -0
  63. package/src/config/backward-registry-loader.d.ts +3 -0
  64. package/src/config/backward-registry-loader.js +8 -0
  65. package/src/config/index.d.ts +63 -0
  66. package/src/config/index.js +31 -0
  67. package/src/config/kernel-path-loader.d.ts +149 -0
  68. package/src/config/kernel-path-loader.js +534 -0
  69. package/src/config/kernels/backward-registry.json +99 -0
  70. package/src/config/kernels/kernel-ref-digests.d.ts +1 -0
  71. package/src/config/kernels/kernel-ref-digests.js +214 -0
  72. package/src/config/kernels/kernel-ref.d.ts +17 -0
  73. package/src/config/kernels/kernel-ref.js +75 -0
  74. package/src/config/kernels/moe/gpt-oss.paths.json +49 -0
  75. package/src/config/kernels/registry.d.ts +86 -0
  76. package/src/config/kernels/registry.js +103 -0
  77. package/src/config/kernels/registry.json +6771 -0
  78. package/src/config/loader.d.ts +57 -0
  79. package/src/config/loader.js +513 -0
  80. package/src/config/merge.d.ts +142 -0
  81. package/src/config/merge.js +389 -0
  82. package/src/config/param-categories.d.ts +17 -0
  83. package/src/config/param-categories.js +72 -0
  84. package/src/config/param-validator.d.ts +26 -0
  85. package/src/config/param-validator.js +235 -0
  86. package/src/config/platforms/amd-rdna3.json +16 -0
  87. package/src/config/platforms/apple-m1.json +16 -0
  88. package/src/config/platforms/apple-m2.json +16 -0
  89. package/src/config/platforms/apple-m3.json +16 -0
  90. package/src/config/platforms/generic.json +14 -0
  91. package/src/config/platforms/loader.d.ts +65 -0
  92. package/src/config/platforms/loader.js +153 -0
  93. package/src/config/platforms/nvidia-rtx30.json +16 -0
  94. package/src/config/platforms/nvidia-rtx40.json +16 -0
  95. package/src/config/presets/kernel-paths/embeddinggemma-f16-f32a.json +60 -0
  96. package/src/config/presets/kernel-paths/embeddinggemma-f32-f32a.json +60 -0
  97. package/src/config/presets/kernel-paths/embeddinggemma-q4k-dequant-f32a.json +60 -0
  98. package/src/config/presets/kernel-paths/gemma2-f16-f16a.json +61 -0
  99. package/src/config/presets/kernel-paths/gemma2-f16-f32a.json +60 -0
  100. package/src/config/presets/kernel-paths/gemma2-q4k-dequant-f16a.json +61 -0
  101. package/src/config/presets/kernel-paths/gemma2-q4k-dequant-f32a.json +60 -0
  102. package/src/config/presets/kernel-paths/gemma2-q4k-fused-f32a.json +57 -0
  103. package/src/config/presets/kernel-paths/gemma3-f16-fused-f16a-online.json +200 -0
  104. package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online.json +223 -0
  105. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f16a-online.json +60 -0
  106. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +61 -0
  107. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a.json +61 -0
  108. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-online.json +61 -0
  109. package/src/config/presets/kernel-paths/registry.json +103 -0
  110. package/src/config/presets/models/deepseek.json +20 -0
  111. package/src/config/presets/models/diffusion.json +10 -0
  112. package/src/config/presets/models/embeddinggemma.json +74 -0
  113. package/src/config/presets/models/functiongemma.json +31 -0
  114. package/src/config/presets/models/gemma2.json +59 -0
  115. package/src/config/presets/models/gemma3.json +75 -0
  116. package/src/config/presets/models/gpt-oss.json +68 -0
  117. package/src/config/presets/models/kimi-k2.json +25 -0
  118. package/src/config/presets/models/lfm2.json +83 -0
  119. package/src/config/presets/models/llama3.json +40 -0
  120. package/src/config/presets/models/mamba.json +34 -0
  121. package/src/config/presets/models/mixtral.json +37 -0
  122. package/src/config/presets/models/modernbert.json +32 -0
  123. package/src/config/presets/models/qwen3.json +41 -0
  124. package/src/config/presets/models/transformer.json +73 -0
  125. package/src/config/presets/models/translategemma.json +30 -0
  126. package/src/config/presets/platforms/nvidia-gb200-8gpu.json +45 -0
  127. package/src/config/presets/platforms/nvidia-gb200-nvl72.json +45 -0
  128. package/src/config/presets/platforms/nvidia-gh200-nvl2.json +44 -0
  129. package/src/config/presets/platforms/nvidia-gh200.json +44 -0
  130. package/src/config/presets/runtime/compute/f16-activations.json +30 -0
  131. package/src/config/presets/runtime/compute/f16-batched.json +32 -0
  132. package/src/config/presets/runtime/default.json +101 -0
  133. package/src/config/presets/runtime/diagnostics/debug-logits.json +53 -0
  134. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +53 -0
  135. package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +210 -0
  136. package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +39 -0
  137. package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +20 -0
  138. package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +20 -0
  139. package/src/config/presets/runtime/kernels/fused-q4k.json +20 -0
  140. package/src/config/presets/runtime/kernels/safe-q4k.json +20 -0
  141. package/src/config/presets/runtime/model/gemma2-debug.json +77 -0
  142. package/src/config/presets/runtime/model/gemma2-pipeline-debug.json +66 -0
  143. package/src/config/presets/runtime/model/gemma2-pipeline.json +75 -0
  144. package/src/config/presets/runtime/model/gemma3-layer-probe.json +85 -0
  145. package/src/config/presets/runtime/modes/bench.json +37 -0
  146. package/src/config/presets/runtime/modes/debug.json +39 -0
  147. package/src/config/presets/runtime/modes/default.json +10 -0
  148. package/src/config/presets/runtime/modes/embedding-bench.json +28 -0
  149. package/src/config/presets/runtime/modes/embedding.json +54 -0
  150. package/src/config/presets/runtime/modes/low-memory.json +40 -0
  151. package/src/config/presets/runtime/modes/production.json +48 -0
  152. package/src/config/presets/runtime/modes/simulation.json +30 -0
  153. package/src/config/presets/runtime/modes/trace-layers.json +126 -0
  154. package/src/config/presets/runtime/platform/metal-apple-q4k.json +11 -0
  155. package/src/config/runtime-merge.d.ts +5 -0
  156. package/src/config/runtime-merge.js +21 -0
  157. package/src/config/runtime.d.ts +28 -0
  158. package/src/config/runtime.js +56 -0
  159. package/src/config/schema/adapter.schema.d.ts +53 -0
  160. package/src/config/schema/adapter.schema.js +60 -0
  161. package/src/config/schema/backward-registry.schema.d.ts +14 -0
  162. package/src/config/schema/backward-registry.schema.js +46 -0
  163. package/src/config/schema/benchmark.schema.d.ts +54 -0
  164. package/src/config/schema/benchmark.schema.js +74 -0
  165. package/src/config/schema/bridge.schema.d.ts +25 -0
  166. package/src/config/schema/bridge.schema.js +22 -0
  167. package/src/config/schema/buffer-pool.schema.d.ts +92 -0
  168. package/src/config/schema/buffer-pool.schema.js +50 -0
  169. package/src/config/schema/conversion.schema.d.ts +183 -0
  170. package/src/config/schema/conversion.schema.js +13 -0
  171. package/src/config/schema/converter.schema.d.ts +123 -0
  172. package/src/config/schema/converter.schema.js +136 -0
  173. package/src/config/schema/debug.schema.d.ts +245 -0
  174. package/src/config/schema/debug.schema.js +106 -0
  175. package/src/config/schema/diffusion.schema.d.ts +88 -0
  176. package/src/config/schema/diffusion.schema.js +62 -0
  177. package/src/config/schema/distill-training.schema.d.ts +48 -0
  178. package/src/config/schema/distill-training.schema.js +139 -0
  179. package/src/config/schema/distribution.schema.d.ts +155 -0
  180. package/src/config/schema/distribution.schema.js +81 -0
  181. package/src/config/schema/doppler.schema.d.ts +75 -0
  182. package/src/config/schema/doppler.schema.js +352 -0
  183. package/src/config/schema/ecosystem.schema.d.ts +255 -0
  184. package/src/config/schema/ecosystem.schema.js +534 -0
  185. package/src/config/schema/emulation.schema.d.ts +351 -0
  186. package/src/config/schema/emulation.schema.js +299 -0
  187. package/src/config/schema/energy.schema.d.ts +102 -0
  188. package/src/config/schema/energy.schema.js +72 -0
  189. package/src/config/schema/execution-v0.schema.d.ts +187 -0
  190. package/src/config/schema/execution-v0.schema.js +55 -0
  191. package/src/config/schema/gpu-cache.schema.d.ts +26 -0
  192. package/src/config/schema/gpu-cache.schema.js +8 -0
  193. package/src/config/schema/harness.schema.d.ts +32 -0
  194. package/src/config/schema/harness.schema.js +20 -0
  195. package/src/config/schema/hotswap.schema.d.ts +55 -0
  196. package/src/config/schema/hotswap.schema.js +18 -0
  197. package/src/config/schema/index.d.ts +863 -0
  198. package/src/config/schema/index.js +471 -0
  199. package/src/config/schema/inference-defaults.schema.d.ts +276 -0
  200. package/src/config/schema/inference-defaults.schema.js +185 -0
  201. package/src/config/schema/inference.schema.d.ts +289 -0
  202. package/src/config/schema/inference.schema.js +39 -0
  203. package/src/config/schema/intent-bundle.schema.d.ts +28 -0
  204. package/src/config/schema/intent-bundle.schema.js +12 -0
  205. package/src/config/schema/kernel-path.schema.d.ts +173 -0
  206. package/src/config/schema/kernel-path.schema.js +9 -0
  207. package/src/config/schema/kernel-registry.schema.d.ts +199 -0
  208. package/src/config/schema/kernel-registry.schema.js +46 -0
  209. package/src/config/schema/kernel-thresholds.schema.d.ts +302 -0
  210. package/src/config/schema/kernel-thresholds.schema.js +187 -0
  211. package/src/config/schema/kernel-warmup.schema.d.ts +19 -0
  212. package/src/config/schema/kernel-warmup.schema.js +5 -0
  213. package/src/config/schema/kvcache.schema.d.ts +131 -0
  214. package/src/config/schema/kvcache.schema.js +31 -0
  215. package/src/config/schema/loading.schema.d.ts +153 -0
  216. package/src/config/schema/loading.schema.js +84 -0
  217. package/src/config/schema/lora.schema.d.ts +12 -0
  218. package/src/config/schema/lora.schema.js +12 -0
  219. package/src/config/schema/manifest.schema.d.ts +500 -0
  220. package/src/config/schema/manifest.schema.js +130 -0
  221. package/src/config/schema/memory-limits.schema.d.ts +107 -0
  222. package/src/config/schema/memory-limits.schema.js +57 -0
  223. package/src/config/schema/moe.schema.d.ts +78 -0
  224. package/src/config/schema/moe.schema.js +31 -0
  225. package/src/config/schema/platform.schema.d.ts +121 -0
  226. package/src/config/schema/platform.schema.js +1 -0
  227. package/src/config/schema/preset.schema.d.ts +124 -0
  228. package/src/config/schema/preset.schema.js +1 -0
  229. package/src/config/schema/quantization-defaults.schema.d.ts +34 -0
  230. package/src/config/schema/quantization-defaults.schema.js +5 -0
  231. package/src/config/schema/quantization.schema.d.ts +10 -0
  232. package/src/config/schema/quantization.schema.js +33 -0
  233. package/src/config/schema/shared-runtime.schema.d.ts +75 -0
  234. package/src/config/schema/shared-runtime.schema.js +45 -0
  235. package/src/config/schema/speculative.schema.d.ts +21 -0
  236. package/src/config/schema/speculative.schema.js +11 -0
  237. package/src/config/schema/storage.schema.d.ts +123 -0
  238. package/src/config/schema/storage.schema.js +66 -0
  239. package/src/config/schema/tooling.schema.d.ts +29 -0
  240. package/src/config/schema/tooling.schema.js +12 -0
  241. package/src/config/schema/training-metrics.schema.d.ts +89 -0
  242. package/src/config/schema/training-metrics.schema.js +374 -0
  243. package/src/config/schema/training.schema.d.ts +88 -0
  244. package/src/config/schema/training.schema.js +106 -0
  245. package/src/config/schema/tuner.schema.d.ts +39 -0
  246. package/src/config/schema/tuner.schema.js +13 -0
  247. package/src/config/schema/ul-training.schema.d.ts +61 -0
  248. package/src/config/schema/ul-training.schema.js +140 -0
  249. package/src/config/schema/units.schema.d.ts +27 -0
  250. package/src/config/schema/units.schema.js +26 -0
  251. package/src/config/training-defaults.d.ts +24 -0
  252. package/src/config/training-defaults.js +91 -0
  253. package/src/converter/conversion-plan.d.ts +64 -0
  254. package/src/converter/conversion-plan.js +472 -0
  255. package/src/converter/core.d.ts +247 -0
  256. package/src/converter/core.js +1329 -0
  257. package/src/converter/execution-v0-manifest.d.ts +15 -0
  258. package/src/converter/execution-v0-manifest.js +146 -0
  259. package/src/converter/index.d.ts +98 -0
  260. package/src/converter/index.js +59 -0
  261. package/src/converter/manifest-inference.d.ts +20 -0
  262. package/src/converter/manifest-inference.js +492 -0
  263. package/src/converter/parsers/diffusion.d.ts +50 -0
  264. package/src/converter/parsers/diffusion.js +270 -0
  265. package/src/converter/parsers/gguf.d.ts +22 -0
  266. package/src/converter/parsers/gguf.js +46 -0
  267. package/src/converter/parsers/index.d.ts +21 -0
  268. package/src/converter/parsers/index.js +12 -0
  269. package/src/converter/parsers/transformer.d.ts +16 -0
  270. package/src/converter/parsers/transformer.js +25 -0
  271. package/src/converter/quantization-info.d.ts +37 -0
  272. package/src/converter/quantization-info.js +398 -0
  273. package/src/converter/quantizer.d.ts +96 -0
  274. package/src/converter/quantizer.js +422 -0
  275. package/src/converter/rope-config.d.ts +15 -0
  276. package/src/converter/rope-config.js +218 -0
  277. package/src/converter/shard-packer.d.ts +138 -0
  278. package/src/converter/shard-packer.js +422 -0
  279. package/src/converter/tokenizer-utils.d.ts +11 -0
  280. package/src/converter/tokenizer-utils.js +87 -0
  281. package/src/debug/config.d.ts +78 -0
  282. package/src/debug/config.js +235 -0
  283. package/src/debug/history.d.ts +65 -0
  284. package/src/debug/history.js +71 -0
  285. package/src/debug/index.d.ts +268 -0
  286. package/src/debug/index.js +192 -0
  287. package/src/debug/log.d.ts +46 -0
  288. package/src/debug/log.js +132 -0
  289. package/src/debug/perf.d.ts +33 -0
  290. package/src/debug/perf.js +51 -0
  291. package/src/debug/reference/README.md +114 -0
  292. package/src/debug/reference/hf_attn_debug.py +114 -0
  293. package/src/debug/reference/hf_embed_check.py +89 -0
  294. package/src/debug/reference/hf_layer_out.py +100 -0
  295. package/src/debug/reference/hf_rope_check.py +116 -0
  296. package/src/debug/reference/hf_weights.py +75 -0
  297. package/src/debug/signals.d.ts +63 -0
  298. package/src/debug/signals.js +33 -0
  299. package/src/debug/stats.d.ts +47 -0
  300. package/src/debug/stats.js +160 -0
  301. package/src/debug/tensor.d.ts +123 -0
  302. package/src/debug/tensor.js +257 -0
  303. package/src/debug/trace.d.ts +17 -0
  304. package/src/debug/trace.js +167 -0
  305. package/src/diffusion/image-regression.d.ts +31 -0
  306. package/src/diffusion/image-regression.js +107 -0
  307. package/src/diffusion/index.d.ts +8 -0
  308. package/src/diffusion/index.js +8 -0
  309. package/src/distribution/p2p-control-plane.d.ts +52 -0
  310. package/src/distribution/p2p-control-plane.js +232 -0
  311. package/src/distribution/p2p-observability.d.ts +116 -0
  312. package/src/distribution/p2p-observability.js +267 -0
  313. package/src/distribution/p2p-transport-contract.d.ts +57 -0
  314. package/src/distribution/p2p-transport-contract.js +310 -0
  315. package/src/distribution/p2p-webrtc-browser.d.ts +37 -0
  316. package/src/distribution/p2p-webrtc-browser.js +434 -0
  317. package/src/distribution/shard-delivery.d.ts +251 -0
  318. package/src/distribution/shard-delivery.js +2096 -0
  319. package/src/energy/index.d.ts +2 -0
  320. package/src/energy/index.js +2 -0
  321. package/src/errors/doppler-error.d.ts +21 -0
  322. package/src/errors/doppler-error.js +25 -0
  323. package/src/errors/index.d.ts +1 -0
  324. package/src/errors/index.js +1 -0
  325. package/src/formats/gguf/index.d.ts +8 -0
  326. package/src/formats/gguf/index.js +4 -0
  327. package/src/formats/gguf/types.d.ts +137 -0
  328. package/src/formats/gguf/types.js +443 -0
  329. package/src/formats/index.d.ts +51 -0
  330. package/src/formats/index.js +13 -0
  331. package/src/formats/rdrr/classification.d.ts +39 -0
  332. package/src/formats/rdrr/classification.js +275 -0
  333. package/src/formats/rdrr/groups.d.ts +27 -0
  334. package/src/formats/rdrr/groups.js +76 -0
  335. package/src/formats/rdrr/index.d.ts +25 -0
  336. package/src/formats/rdrr/index.js +19 -0
  337. package/src/formats/rdrr/manifest.d.ts +32 -0
  338. package/src/formats/rdrr/manifest.js +108 -0
  339. package/src/formats/rdrr/parsing.d.ts +23 -0
  340. package/src/formats/rdrr/parsing.js +101 -0
  341. package/src/formats/rdrr/tensor-config-validator.d.ts +42 -0
  342. package/src/formats/rdrr/tensor-config-validator.js +156 -0
  343. package/src/formats/rdrr/types.d.ts +200 -0
  344. package/src/formats/rdrr/types.js +16 -0
  345. package/src/formats/rdrr/validation.d.ts +9 -0
  346. package/src/formats/rdrr/validation.js +200 -0
  347. package/src/formats/safetensors/index.d.ts +8 -0
  348. package/src/formats/safetensors/index.js +4 -0
  349. package/src/formats/safetensors/types.d.ts +67 -0
  350. package/src/formats/safetensors/types.js +102 -0
  351. package/src/formats/tokenizer/index.d.ts +5 -0
  352. package/src/formats/tokenizer/index.js +3 -0
  353. package/src/formats/tokenizer/types.d.ts +9 -0
  354. package/src/formats/tokenizer/types.js +22 -0
  355. package/src/generation/index.d.ts +18 -0
  356. package/src/generation/index.js +12 -0
  357. package/src/gpu/command-recorder.d.ts +175 -0
  358. package/src/gpu/command-recorder.js +473 -0
  359. package/src/gpu/device.d.ts +141 -0
  360. package/src/gpu/device.js +350 -0
  361. package/src/gpu/kernel-runtime.d.ts +20 -0
  362. package/src/gpu/kernel-runtime.js +37 -0
  363. package/src/gpu/kernel-selection-cache.d.ts +13 -0
  364. package/src/gpu/kernel-selection-cache.js +13 -0
  365. package/src/gpu/kernel-selection-log.d.ts +12 -0
  366. package/src/gpu/kernel-selection-log.js +28 -0
  367. package/src/gpu/kernel-selector.d.ts +11 -0
  368. package/src/gpu/kernel-selector.js +10 -0
  369. package/src/gpu/kernel-tuner/benchmarks.d.ts +144 -0
  370. package/src/gpu/kernel-tuner/benchmarks.js +892 -0
  371. package/src/gpu/kernel-tuner/cache.d.ts +55 -0
  372. package/src/gpu/kernel-tuner/cache.js +66 -0
  373. package/src/gpu/kernel-tuner/index.d.ts +59 -0
  374. package/src/gpu/kernel-tuner/index.js +38 -0
  375. package/src/gpu/kernel-tuner/tuner.d.ts +82 -0
  376. package/src/gpu/kernel-tuner/tuner.js +229 -0
  377. package/src/gpu/kernel-tuner/types.d.ts +101 -0
  378. package/src/gpu/kernel-tuner/types.js +4 -0
  379. package/src/gpu/kernel-tuner.d.ts +33 -0
  380. package/src/gpu/kernel-tuner.js +12 -0
  381. package/src/gpu/kernels/README.md +127 -0
  382. package/src/gpu/kernels/attention.d.ts +236 -0
  383. package/src/gpu/kernels/attention.js +1359 -0
  384. package/src/gpu/kernels/attention.wgsl +249 -0
  385. package/src/gpu/kernels/attention_bdpa_decode_f16.wgsl +246 -0
  386. package/src/gpu/kernels/attention_decode.wgsl +233 -0
  387. package/src/gpu/kernels/attention_decode_chunked_f16.wgsl +183 -0
  388. package/src/gpu/kernels/attention_decode_chunked_f16kv.wgsl +208 -0
  389. package/src/gpu/kernels/attention_decode_f16.wgsl +202 -0
  390. package/src/gpu/kernels/attention_decode_f16kv.wgsl +224 -0
  391. package/src/gpu/kernels/attention_decode_online_f16.wgsl +223 -0
  392. package/src/gpu/kernels/attention_decode_online_f16kv.wgsl +225 -0
  393. package/src/gpu/kernels/attention_decode_optimized.wgsl +445 -0
  394. package/src/gpu/kernels/attention_decode_paged_f16.wgsl +172 -0
  395. package/src/gpu/kernels/attention_decode_paged_f16kv.wgsl +174 -0
  396. package/src/gpu/kernels/attention_decode_subgroup.wgsl +233 -0
  397. package/src/gpu/kernels/attention_decode_tiered_f16.wgsl +218 -0
  398. package/src/gpu/kernels/attention_decode_tiered_f16kv.wgsl +220 -0
  399. package/src/gpu/kernels/attention_decode_tiered_int4_f16kv.wgsl +242 -0
  400. package/src/gpu/kernels/attention_decode_tiered_int8_f16kv.wgsl +242 -0
  401. package/src/gpu/kernels/attention_f16.wgsl +214 -0
  402. package/src/gpu/kernels/attention_f16kv.wgsl +242 -0
  403. package/src/gpu/kernels/attention_small.wgsl +260 -0
  404. package/src/gpu/kernels/attention_small_f16.wgsl +240 -0
  405. package/src/gpu/kernels/attention_small_f16kv.wgsl +266 -0
  406. package/src/gpu/kernels/attention_streaming.wgsl +149 -0
  407. package/src/gpu/kernels/attention_streaming_f16.wgsl +147 -0
  408. package/src/gpu/kernels/attention_streaming_f16kv.wgsl +151 -0
  409. package/src/gpu/kernels/backward/adam.d.ts +28 -0
  410. package/src/gpu/kernels/backward/adam.js +199 -0
  411. package/src/gpu/kernels/backward/adam.wgsl +50 -0
  412. package/src/gpu/kernels/backward/attention_backward.d.ts +22 -0
  413. package/src/gpu/kernels/backward/attention_backward.js +276 -0
  414. package/src/gpu/kernels/backward/attention_backward.wgsl +49 -0
  415. package/src/gpu/kernels/backward/bias_add_backward.d.ts +17 -0
  416. package/src/gpu/kernels/backward/bias_add_backward.js +24 -0
  417. package/src/gpu/kernels/backward/bias_add_backward.wgsl +33 -0
  418. package/src/gpu/kernels/backward/conv2d_backward.d.ts +31 -0
  419. package/src/gpu/kernels/backward/conv2d_backward.js +135 -0
  420. package/src/gpu/kernels/backward/conv2d_backward_input.wgsl +83 -0
  421. package/src/gpu/kernels/backward/conv2d_backward_weight.wgsl +70 -0
  422. package/src/gpu/kernels/backward/cross_entropy_backward.d.ts +23 -0
  423. package/src/gpu/kernels/backward/cross_entropy_backward.js +29 -0
  424. package/src/gpu/kernels/backward/cross_entropy_backward.wgsl +39 -0
  425. package/src/gpu/kernels/backward/embed_backward.d.ts +29 -0
  426. package/src/gpu/kernels/backward/embed_backward.js +118 -0
  427. package/src/gpu/kernels/backward/embed_backward.wgsl +73 -0
  428. package/src/gpu/kernels/backward/gelu_backward.d.ts +16 -0
  429. package/src/gpu/kernels/backward/gelu_backward.js +39 -0
  430. package/src/gpu/kernels/backward/gelu_backward.wgsl +38 -0
  431. package/src/gpu/kernels/backward/groupnorm_backward.d.ts +24 -0
  432. package/src/gpu/kernels/backward/groupnorm_backward.js +29 -0
  433. package/src/gpu/kernels/backward/groupnorm_backward.wgsl +143 -0
  434. package/src/gpu/kernels/backward/index.d.ts +17 -0
  435. package/src/gpu/kernels/backward/index.js +23 -0
  436. package/src/gpu/kernels/backward/layernorm_backward.d.ts +22 -0
  437. package/src/gpu/kernels/backward/layernorm_backward.js +135 -0
  438. package/src/gpu/kernels/backward/layernorm_backward.wgsl +194 -0
  439. package/src/gpu/kernels/backward/matmul_backward.d.ts +32 -0
  440. package/src/gpu/kernels/backward/matmul_backward.js +124 -0
  441. package/src/gpu/kernels/backward/matmul_backward.wgsl +90 -0
  442. package/src/gpu/kernels/backward/matmul_transpose_a.wgsl +84 -0
  443. package/src/gpu/kernels/backward/pixel_shuffle_backward.d.ts +22 -0
  444. package/src/gpu/kernels/backward/pixel_shuffle_backward.js +30 -0
  445. package/src/gpu/kernels/backward/pixel_shuffle_backward.wgsl +54 -0
  446. package/src/gpu/kernels/backward/rmsnorm_backward.d.ts +24 -0
  447. package/src/gpu/kernels/backward/rmsnorm_backward.js +101 -0
  448. package/src/gpu/kernels/backward/rmsnorm_backward.wgsl +78 -0
  449. package/src/gpu/kernels/backward/rope_backward.d.ts +25 -0
  450. package/src/gpu/kernels/backward/rope_backward.js +109 -0
  451. package/src/gpu/kernels/backward/rope_backward.wgsl +59 -0
  452. package/src/gpu/kernels/backward/scale_backward.d.ts +16 -0
  453. package/src/gpu/kernels/backward/scale_backward.js +84 -0
  454. package/src/gpu/kernels/backward/scale_backward.wgsl +27 -0
  455. package/src/gpu/kernels/backward/silu_backward.d.ts +16 -0
  456. package/src/gpu/kernels/backward/silu_backward.js +39 -0
  457. package/src/gpu/kernels/backward/silu_backward.wgsl +31 -0
  458. package/src/gpu/kernels/backward/softmax_backward.d.ts +16 -0
  459. package/src/gpu/kernels/backward/softmax_backward.js +43 -0
  460. package/src/gpu/kernels/backward/softmax_backward.wgsl +44 -0
  461. package/src/gpu/kernels/backward/upsample2d_backward.d.ts +21 -0
  462. package/src/gpu/kernels/backward/upsample2d_backward.js +30 -0
  463. package/src/gpu/kernels/backward/upsample2d_backward.wgsl +59 -0
  464. package/src/gpu/kernels/backward/utils.d.ts +45 -0
  465. package/src/gpu/kernels/backward/utils.js +371 -0
  466. package/src/gpu/kernels/bf16_to_f16.wgsl +54 -0
  467. package/src/gpu/kernels/bf16_to_f32.wgsl +70 -0
  468. package/src/gpu/kernels/bias_add.wgsl +40 -0
  469. package/src/gpu/kernels/bias_add_f16.wgsl +44 -0
  470. package/src/gpu/kernels/cast.d.ts +67 -0
  471. package/src/gpu/kernels/cast.js +422 -0
  472. package/src/gpu/kernels/cast_f16_to_f32.wgsl +31 -0
  473. package/src/gpu/kernels/cast_f32_to_f16.wgsl +36 -0
  474. package/src/gpu/kernels/check-finiteness.d.ts +15 -0
  475. package/src/gpu/kernels/check-finiteness.js +149 -0
  476. package/src/gpu/kernels/check-stop.d.ts +31 -0
  477. package/src/gpu/kernels/check-stop.js +181 -0
  478. package/src/gpu/kernels/clamp.d.ts +22 -0
  479. package/src/gpu/kernels/clamp.js +42 -0
  480. package/src/gpu/kernels/clamp.wgsl +24 -0
  481. package/src/gpu/kernels/constants.d.ts +168 -0
  482. package/src/gpu/kernels/constants.js +129 -0
  483. package/src/gpu/kernels/conv2d.d.ts +34 -0
  484. package/src/gpu/kernels/conv2d.js +81 -0
  485. package/src/gpu/kernels/conv2d.wgsl +71 -0
  486. package/src/gpu/kernels/conv2d_f16.wgsl +73 -0
  487. package/src/gpu/kernels/cross_entropy_loss.d.ts +21 -0
  488. package/src/gpu/kernels/cross_entropy_loss.js +54 -0
  489. package/src/gpu/kernels/cross_entropy_loss.wgsl +39 -0
  490. package/src/gpu/kernels/dequant.d.ts +108 -0
  491. package/src/gpu/kernels/dequant.js +524 -0
  492. package/src/gpu/kernels/dequant_f16_out.wgsl +151 -0
  493. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +149 -0
  494. package/src/gpu/kernels/dequant_f16_rowwise.wgsl +139 -0
  495. package/src/gpu/kernels/dequant_f32_rowwise.wgsl +133 -0
  496. package/src/gpu/kernels/dequant_mxfp4.wgsl +120 -0
  497. package/src/gpu/kernels/dequant_mxfp4_expert.wgsl +129 -0
  498. package/src/gpu/kernels/dequant_mxfp4_expert_f16.wgsl +105 -0
  499. package/src/gpu/kernels/dequant_mxfp4_vec4.wgsl +116 -0
  500. package/src/gpu/kernels/dequant_q6k.wgsl +140 -0
  501. package/src/gpu/kernels/dequant_q8_0.wgsl +98 -0
  502. package/src/gpu/kernels/dequant_shared.wgsl +202 -0
  503. package/src/gpu/kernels/dequant_shared_vec4.wgsl +153 -0
  504. package/src/gpu/kernels/dequant_subgroup.wgsl +202 -0
  505. package/src/gpu/kernels/dispatch.d.ts +157 -0
  506. package/src/gpu/kernels/dispatch.js +235 -0
  507. package/src/gpu/kernels/energy.d.ts +131 -0
  508. package/src/gpu/kernels/energy.js +425 -0
  509. package/src/gpu/kernels/energy_eval.wgsl +26 -0
  510. package/src/gpu/kernels/energy_eval_f16.wgsl +30 -0
  511. package/src/gpu/kernels/energy_quintel_grad.wgsl +92 -0
  512. package/src/gpu/kernels/energy_quintel_grad_f16.wgsl +96 -0
  513. package/src/gpu/kernels/energy_quintel_reduce.wgsl +112 -0
  514. package/src/gpu/kernels/energy_quintel_reduce_f16.wgsl +116 -0
  515. package/src/gpu/kernels/energy_quintel_update.wgsl +92 -0
  516. package/src/gpu/kernels/energy_quintel_update_f16.wgsl +96 -0
  517. package/src/gpu/kernels/energy_update.wgsl +25 -0
  518. package/src/gpu/kernels/energy_update_f16.wgsl +30 -0
  519. package/src/gpu/kernels/feature-check.d.ts +42 -0
  520. package/src/gpu/kernels/feature-check.js +70 -0
  521. package/src/gpu/kernels/fused_ffn.d.ts +65 -0
  522. package/src/gpu/kernels/fused_ffn.js +318 -0
  523. package/src/gpu/kernels/fused_ffn.wgsl +420 -0
  524. package/src/gpu/kernels/fused_ffn_f16.wgsl +213 -0
  525. package/src/gpu/kernels/fused_ffn_q4k.wgsl +375 -0
  526. package/src/gpu/kernels/fused_matmul_q4.wgsl +404 -0
  527. package/src/gpu/kernels/fused_matmul_q4_batched.wgsl +194 -0
  528. package/src/gpu/kernels/fused_matmul_q4_batched_f16.wgsl +170 -0
  529. package/src/gpu/kernels/fused_matmul_q4_batched_f16a.wgsl +154 -0
  530. package/src/gpu/kernels/fused_matmul_q4_f16a.wgsl +219 -0
  531. package/src/gpu/kernels/fused_matmul_q4_multicol_f16.wgsl +216 -0
  532. package/src/gpu/kernels/fused_matmul_q4_multicol_f16a.wgsl +204 -0
  533. package/src/gpu/kernels/fused_matmul_residual.d.ts +46 -0
  534. package/src/gpu/kernels/fused_matmul_residual.js +152 -0
  535. package/src/gpu/kernels/fused_matmul_rmsnorm.d.ts +64 -0
  536. package/src/gpu/kernels/fused_matmul_rmsnorm.js +273 -0
  537. package/src/gpu/kernels/fused_matmul_rmsnorm.wgsl +324 -0
  538. package/src/gpu/kernels/fused_matmul_rmsnorm_f16.wgsl +303 -0
  539. package/src/gpu/kernels/fused_swiglu.wgsl +63 -0
  540. package/src/gpu/kernels/fused_swiglu_f16.wgsl +57 -0
  541. package/src/gpu/kernels/gather.d.ts +64 -0
  542. package/src/gpu/kernels/gather.js +119 -0
  543. package/src/gpu/kernels/gather.wgsl +61 -0
  544. package/src/gpu/kernels/gather_f16.wgsl +65 -0
  545. package/src/gpu/kernels/gather_f16_f16_out.wgsl +55 -0
  546. package/src/gpu/kernels/gather_f16_out.wgsl +55 -0
  547. package/src/gpu/kernels/gather_f16_vec4.wgsl +76 -0
  548. package/src/gpu/kernels/gather_f16_vec4_f16_out.wgsl +68 -0
  549. package/src/gpu/kernels/gather_vec4.wgsl +74 -0
  550. package/src/gpu/kernels/gather_vec4_f16_out.wgsl +68 -0
  551. package/src/gpu/kernels/gelu.d.ts +33 -0
  552. package/src/gpu/kernels/gelu.js +47 -0
  553. package/src/gpu/kernels/gelu.wgsl +64 -0
  554. package/src/gpu/kernels/gelu_f16.wgsl +66 -0
  555. package/src/gpu/kernels/gptoss_mxfp4_expert_fused.wgsl +127 -0
  556. package/src/gpu/kernels/gptoss_router_topk.wgsl +119 -0
  557. package/src/gpu/kernels/groupnorm.d.ts +31 -0
  558. package/src/gpu/kernels/groupnorm.js +91 -0
  559. package/src/gpu/kernels/groupnorm_apply.wgsl +41 -0
  560. package/src/gpu/kernels/groupnorm_apply_f16.wgsl +46 -0
  561. package/src/gpu/kernels/groupnorm_stats.wgsl +76 -0
  562. package/src/gpu/kernels/groupnorm_stats_f16.wgsl +79 -0
  563. package/src/gpu/kernels/index.d.ts +336 -0
  564. package/src/gpu/kernels/index.js +284 -0
  565. package/src/gpu/kernels/kernel-base.d.ts +33 -0
  566. package/src/gpu/kernels/kernel-base.js +46 -0
  567. package/src/gpu/kernels/kernel-configs.d.ts +65 -0
  568. package/src/gpu/kernels/kernel-configs.js +50 -0
  569. package/src/gpu/kernels/kernel-tuning.d.ts +42 -0
  570. package/src/gpu/kernels/kernel-tuning.js +149 -0
  571. package/src/gpu/kernels/kv-quantize.d.ts +37 -0
  572. package/src/gpu/kernels/kv-quantize.js +138 -0
  573. package/src/gpu/kernels/kv_quantize_int4.wgsl +119 -0
  574. package/src/gpu/kernels/kv_quantize_int8.wgsl +119 -0
  575. package/src/gpu/kernels/layernorm.d.ts +37 -0
  576. package/src/gpu/kernels/layernorm.js +80 -0
  577. package/src/gpu/kernels/layernorm.wgsl +121 -0
  578. package/src/gpu/kernels/layernorm_f16.wgsl +103 -0
  579. package/src/gpu/kernels/linear-attention-core.d.ts +39 -0
  580. package/src/gpu/kernels/linear-attention-core.js +535 -0
  581. package/src/gpu/kernels/logit-merge.d.ts +110 -0
  582. package/src/gpu/kernels/logit-merge.js +392 -0
  583. package/src/gpu/kernels/matmul-dispatch.d.ts +38 -0
  584. package/src/gpu/kernels/matmul-dispatch.js +155 -0
  585. package/src/gpu/kernels/matmul-selection.d.ts +87 -0
  586. package/src/gpu/kernels/matmul-selection.js +474 -0
  587. package/src/gpu/kernels/matmul.d.ts +109 -0
  588. package/src/gpu/kernels/matmul.js +271 -0
  589. package/src/gpu/kernels/matmul_f16.wgsl +170 -0
  590. package/src/gpu/kernels/matmul_f16_tiled.wgsl +165 -0
  591. package/src/gpu/kernels/matmul_f16w_f32a.wgsl +89 -0
  592. package/src/gpu/kernels/matmul_f16w_f32a_tiled.wgsl +154 -0
  593. package/src/gpu/kernels/matmul_f32.wgsl +100 -0
  594. package/src/gpu/kernels/matmul_gemv.wgsl +80 -0
  595. package/src/gpu/kernels/matmul_gemv_f16a.wgsl +81 -0
  596. package/src/gpu/kernels/matmul_gemv_residual.wgsl +119 -0
  597. package/src/gpu/kernels/matmul_gemv_residual_f16.wgsl +78 -0
  598. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +345 -0
  599. package/src/gpu/kernels/matmul_gemv_subgroup_f16a.wgsl +514 -0
  600. package/src/gpu/kernels/modulate.d.ts +29 -0
  601. package/src/gpu/kernels/modulate.js +49 -0
  602. package/src/gpu/kernels/modulate.wgsl +40 -0
  603. package/src/gpu/kernels/modulate_f16.wgsl +43 -0
  604. package/src/gpu/kernels/moe.d.ts +164 -0
  605. package/src/gpu/kernels/moe.js +496 -0
  606. package/src/gpu/kernels/moe_gather.wgsl +170 -0
  607. package/src/gpu/kernels/moe_gather_f16.wgsl +82 -0
  608. package/src/gpu/kernels/moe_gather_vec4.wgsl +74 -0
  609. package/src/gpu/kernels/moe_offsets.wgsl +48 -0
  610. package/src/gpu/kernels/pipeline-cache.d.ts +88 -0
  611. package/src/gpu/kernels/pipeline-cache.js +305 -0
  612. package/src/gpu/kernels/pixel_shuffle.d.ts +27 -0
  613. package/src/gpu/kernels/pixel_shuffle.js +49 -0
  614. package/src/gpu/kernels/pixel_shuffle.wgsl +44 -0
  615. package/src/gpu/kernels/pixel_shuffle_f16.wgsl +47 -0
  616. package/src/gpu/kernels/residual.d.ts +74 -0
  617. package/src/gpu/kernels/residual.js +127 -0
  618. package/src/gpu/kernels/residual.wgsl +53 -0
  619. package/src/gpu/kernels/residual_f16.wgsl +35 -0
  620. package/src/gpu/kernels/residual_f16_vec4.wgsl +47 -0
  621. package/src/gpu/kernels/residual_vec4.wgsl +46 -0
  622. package/src/gpu/kernels/rmsnorm.d.ts +53 -0
  623. package/src/gpu/kernels/rmsnorm.js +140 -0
  624. package/src/gpu/kernels/rmsnorm.wgsl +417 -0
  625. package/src/gpu/kernels/rmsnorm_f16.wgsl +164 -0
  626. package/src/gpu/kernels/rope.d.ts +48 -0
  627. package/src/gpu/kernels/rope.js +53 -0
  628. package/src/gpu/kernels/rope.wgsl +328 -0
  629. package/src/gpu/kernels/rope_f16.wgsl +271 -0
  630. package/src/gpu/kernels/rule-matcher.d.ts +30 -0
  631. package/src/gpu/kernels/rule-matcher.js +42 -0
  632. package/src/gpu/kernels/rule-registry.d.ts +7 -0
  633. package/src/gpu/kernels/rule-registry.js +41 -0
  634. package/src/gpu/kernels/sample.d.ts +75 -0
  635. package/src/gpu/kernels/sample.js +578 -0
  636. package/src/gpu/kernels/sample.wgsl +377 -0
  637. package/src/gpu/kernels/sample_f16.wgsl +331 -0
  638. package/src/gpu/kernels/scale.d.ts +35 -0
  639. package/src/gpu/kernels/scale.js +37 -0
  640. package/src/gpu/kernels/scale.wgsl +38 -0
  641. package/src/gpu/kernels/scatter_add.wgsl +88 -0
  642. package/src/gpu/kernels/scatter_add_dynamic.wgsl +59 -0
  643. package/src/gpu/kernels/scatter_add_dynamic_f16.wgsl +52 -0
  644. package/src/gpu/kernels/scatter_add_dynamic_f16_weights.wgsl +50 -0
  645. package/src/gpu/kernels/scatter_add_vec4.wgsl +70 -0
  646. package/src/gpu/kernels/shader-cache.d.ts +56 -0
  647. package/src/gpu/kernels/shader-cache.js +206 -0
  648. package/src/gpu/kernels/silu.d.ts +75 -0
  649. package/src/gpu/kernels/silu.js +340 -0
  650. package/src/gpu/kernels/silu.wgsl +99 -0
  651. package/src/gpu/kernels/silu_f16.wgsl +98 -0
  652. package/src/gpu/kernels/softmax.d.ts +57 -0
  653. package/src/gpu/kernels/softmax.js +106 -0
  654. package/src/gpu/kernels/softmax.wgsl +388 -0
  655. package/src/gpu/kernels/softmax_subgroup.wgsl +175 -0
  656. package/src/gpu/kernels/split_qkv.d.ts +51 -0
  657. package/src/gpu/kernels/split_qkv.js +41 -0
  658. package/src/gpu/kernels/split_qkv.wgsl +71 -0
  659. package/src/gpu/kernels/split_qkv_f16.wgsl +75 -0
  660. package/src/gpu/kernels/topk.wgsl +243 -0
  661. package/src/gpu/kernels/topk_f16.wgsl +108 -0
  662. package/src/gpu/kernels/topk_f16_weights.wgsl +101 -0
  663. package/src/gpu/kernels/transpose.d.ts +21 -0
  664. package/src/gpu/kernels/transpose.js +30 -0
  665. package/src/gpu/kernels/transpose.wgsl +32 -0
  666. package/src/gpu/kernels/types.d.ts +21 -0
  667. package/src/gpu/kernels/types.js +4 -0
  668. package/src/gpu/kernels/uniform-utils.d.ts +48 -0
  669. package/src/gpu/kernels/uniform-utils.js +94 -0
  670. package/src/gpu/kernels/upsample2d.d.ts +25 -0
  671. package/src/gpu/kernels/upsample2d.js +58 -0
  672. package/src/gpu/kernels/upsample2d.wgsl +37 -0
  673. package/src/gpu/kernels/upsample2d_f16.wgsl +41 -0
  674. package/src/gpu/kernels/utils.d.ts +106 -0
  675. package/src/gpu/kernels/utils.js +224 -0
  676. package/src/gpu/multi-model-recorder.d.ts +21 -0
  677. package/src/gpu/multi-model-recorder.js +31 -0
  678. package/src/gpu/partitioned-buffer-pool.d.ts +28 -0
  679. package/src/gpu/partitioned-buffer-pool.js +49 -0
  680. package/src/gpu/perf-guards.d.ts +25 -0
  681. package/src/gpu/perf-guards.js +140 -0
  682. package/src/gpu/profiler.d.ts +114 -0
  683. package/src/gpu/profiler.js +391 -0
  684. package/src/gpu/submit-tracker.d.ts +111 -0
  685. package/src/gpu/submit-tracker.js +229 -0
  686. package/src/gpu/tensor.d.ts +69 -0
  687. package/src/gpu/tensor.js +75 -0
  688. package/src/gpu/uniform-cache.d.ts +108 -0
  689. package/src/gpu/uniform-cache.js +242 -0
  690. package/src/gpu/weight-buffer.d.ts +115 -0
  691. package/src/gpu/weight-buffer.js +118 -0
  692. package/src/hotswap/intent-bundle.d.ts +37 -0
  693. package/src/hotswap/intent-bundle.js +123 -0
  694. package/src/hotswap/manifest.d.ts +33 -0
  695. package/src/hotswap/manifest.js +114 -0
  696. package/src/hotswap/runtime.d.ts +31 -0
  697. package/src/hotswap/runtime.js +128 -0
  698. package/src/index-browser.d.ts +47 -0
  699. package/src/index-browser.js +53 -0
  700. package/src/index-internal.d.ts +2 -0
  701. package/src/index-internal.js +2 -0
  702. package/src/index.d.ts +102 -0
  703. package/src/index.js +75 -0
  704. package/src/inference/README.md +593 -0
  705. package/src/inference/browser-harness.d.ts +234 -0
  706. package/src/inference/browser-harness.js +2665 -0
  707. package/src/inference/decode-buffers.d.ts +108 -0
  708. package/src/inference/decode-buffers.js +181 -0
  709. package/src/inference/decode-ring.d.ts +52 -0
  710. package/src/inference/decode-ring.js +273 -0
  711. package/src/inference/expert-router.d.ts +27 -0
  712. package/src/inference/expert-router.js +55 -0
  713. package/src/inference/functiongemma.d.ts +15 -0
  714. package/src/inference/functiongemma.js +1 -0
  715. package/src/inference/kv-cache/base.d.ts +150 -0
  716. package/src/inference/kv-cache/base.js +1037 -0
  717. package/src/inference/kv-cache/basis-decomposed-paged.d.ts +50 -0
  718. package/src/inference/kv-cache/basis-decomposed-paged.js +276 -0
  719. package/src/inference/kv-cache/index.d.ts +35 -0
  720. package/src/inference/kv-cache/index.js +20 -0
  721. package/src/inference/kv-cache/sliding-window.d.ts +72 -0
  722. package/src/inference/kv-cache/sliding-window.js +243 -0
  723. package/src/inference/kv-cache/tiered.d.ts +89 -0
  724. package/src/inference/kv-cache/tiered.js +574 -0
  725. package/src/inference/kv-cache/types.d.ts +188 -0
  726. package/src/inference/kv-cache/types.js +80 -0
  727. package/src/inference/kv-cache.d.ts +36 -0
  728. package/src/inference/kv-cache.js +18 -0
  729. package/src/inference/moe-router.d.ts +212 -0
  730. package/src/inference/moe-router.js +553 -0
  731. package/src/inference/multi-model-network.d.ts +139 -0
  732. package/src/inference/multi-model-network.js +769 -0
  733. package/src/inference/multi-pipeline-pool.d.ts +62 -0
  734. package/src/inference/multi-pipeline-pool.js +161 -0
  735. package/src/inference/network-evolution.d.ts +46 -0
  736. package/src/inference/network-evolution.js +80 -0
  737. package/src/inference/pipelines/context.d.ts +18 -0
  738. package/src/inference/pipelines/context.js +44 -0
  739. package/src/inference/pipelines/diffusion/helpers.d.ts +29 -0
  740. package/src/inference/pipelines/diffusion/helpers.js +112 -0
  741. package/src/inference/pipelines/diffusion/index.d.ts +3 -0
  742. package/src/inference/pipelines/diffusion/index.js +3 -0
  743. package/src/inference/pipelines/diffusion/init.d.ts +24 -0
  744. package/src/inference/pipelines/diffusion/init.js +124 -0
  745. package/src/inference/pipelines/diffusion/pipeline.d.ts +38 -0
  746. package/src/inference/pipelines/diffusion/pipeline.js +632 -0
  747. package/src/inference/pipelines/diffusion/scheduler.d.ts +19 -0
  748. package/src/inference/pipelines/diffusion/scheduler.js +65 -0
  749. package/src/inference/pipelines/diffusion/sd3-transformer.d.ts +20 -0
  750. package/src/inference/pipelines/diffusion/sd3-transformer.js +1194 -0
  751. package/src/inference/pipelines/diffusion/sd3-weights.d.ts +21 -0
  752. package/src/inference/pipelines/diffusion/sd3-weights.js +287 -0
  753. package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +80 -0
  754. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +935 -0
  755. package/src/inference/pipelines/diffusion/text-encoder.d.ts +29 -0
  756. package/src/inference/pipelines/diffusion/text-encoder.js +178 -0
  757. package/src/inference/pipelines/diffusion/types.d.ts +112 -0
  758. package/src/inference/pipelines/diffusion/types.js +1 -0
  759. package/src/inference/pipelines/diffusion/vae.d.ts +20 -0
  760. package/src/inference/pipelines/diffusion/vae.js +675 -0
  761. package/src/inference/pipelines/diffusion/weights.d.ts +40 -0
  762. package/src/inference/pipelines/diffusion/weights.js +150 -0
  763. package/src/inference/pipelines/dream/energy-head-pipeline.d.ts +29 -0
  764. package/src/inference/pipelines/dream/energy-head-pipeline.js +6 -0
  765. package/src/inference/pipelines/dream/pipeline.d.ts +17 -0
  766. package/src/inference/pipelines/dream/pipeline.js +8 -0
  767. package/src/inference/pipelines/energy/index.d.ts +1 -0
  768. package/src/inference/pipelines/energy/index.js +1 -0
  769. package/src/inference/pipelines/energy/pipeline.d.ts +27 -0
  770. package/src/inference/pipelines/energy/pipeline.js +680 -0
  771. package/src/inference/pipelines/energy/quintel.d.ts +87 -0
  772. package/src/inference/pipelines/energy/quintel.js +207 -0
  773. package/src/inference/pipelines/energy/types.d.ts +63 -0
  774. package/src/inference/pipelines/energy/types.js +1 -0
  775. package/src/inference/pipelines/energy-head/index.d.ts +6 -0
  776. package/src/inference/pipelines/energy-head/index.js +6 -0
  777. package/src/inference/pipelines/energy-head/row-head-pipeline.d.ts +103 -0
  778. package/src/inference/pipelines/energy-head/row-head-pipeline.js +487 -0
  779. package/src/inference/pipelines/factory.d.ts +10 -0
  780. package/src/inference/pipelines/factory.js +6 -0
  781. package/src/inference/pipelines/index.d.ts +22 -0
  782. package/src/inference/pipelines/index.js +19 -0
  783. package/src/inference/pipelines/registry.d.ts +15 -0
  784. package/src/inference/pipelines/registry.js +23 -0
  785. package/src/inference/pipelines/rng.d.ts +2 -0
  786. package/src/inference/pipelines/rng.js +17 -0
  787. package/src/inference/pipelines/structured/index.d.ts +8 -0
  788. package/src/inference/pipelines/structured/index.js +8 -0
  789. package/src/inference/pipelines/structured/json-head-pipeline.d.ts +58 -0
  790. package/src/inference/pipelines/structured/json-head-pipeline.js +181 -0
  791. package/src/inference/pipelines/text/attention/index.d.ts +24 -0
  792. package/src/inference/pipelines/text/attention/index.js +17 -0
  793. package/src/inference/pipelines/text/attention/projections.d.ts +101 -0
  794. package/src/inference/pipelines/text/attention/projections.js +435 -0
  795. package/src/inference/pipelines/text/attention/record.d.ts +36 -0
  796. package/src/inference/pipelines/text/attention/record.js +613 -0
  797. package/src/inference/pipelines/text/attention/run.d.ts +38 -0
  798. package/src/inference/pipelines/text/attention/run.js +826 -0
  799. package/src/inference/pipelines/text/attention/types.d.ts +98 -0
  800. package/src/inference/pipelines/text/attention/types.js +67 -0
  801. package/src/inference/pipelines/text/attention.d.ts +23 -0
  802. package/src/inference/pipelines/text/attention.js +12 -0
  803. package/src/inference/pipelines/text/bdpa-steamroller.d.ts +22 -0
  804. package/src/inference/pipelines/text/bdpa-steamroller.js +158 -0
  805. package/src/inference/pipelines/text/buffer-types.d.ts +7 -0
  806. package/src/inference/pipelines/text/buffer-types.js +4 -0
  807. package/src/inference/pipelines/text/chat-format.d.ts +46 -0
  808. package/src/inference/pipelines/text/chat-format.js +366 -0
  809. package/src/inference/pipelines/text/config.d.ts +235 -0
  810. package/src/inference/pipelines/text/config.js +623 -0
  811. package/src/inference/pipelines/text/debug-utils/config.d.ts +144 -0
  812. package/src/inference/pipelines/text/debug-utils/config.js +156 -0
  813. package/src/inference/pipelines/text/debug-utils/index.d.ts +53 -0
  814. package/src/inference/pipelines/text/debug-utils/index.js +44 -0
  815. package/src/inference/pipelines/text/debug-utils/logging.d.ts +106 -0
  816. package/src/inference/pipelines/text/debug-utils/logging.js +152 -0
  817. package/src/inference/pipelines/text/debug-utils/tensor.d.ts +119 -0
  818. package/src/inference/pipelines/text/debug-utils/tensor.js +268 -0
  819. package/src/inference/pipelines/text/debug-utils/utils.d.ts +77 -0
  820. package/src/inference/pipelines/text/debug-utils/utils.js +139 -0
  821. package/src/inference/pipelines/text/debug-utils.d.ts +42 -0
  822. package/src/inference/pipelines/text/debug-utils.js +34 -0
  823. package/src/inference/pipelines/text/embed.d.ts +67 -0
  824. package/src/inference/pipelines/text/embed.js +461 -0
  825. package/src/inference/pipelines/text/execution-plan.d.ts +116 -0
  826. package/src/inference/pipelines/text/execution-plan.js +314 -0
  827. package/src/inference/pipelines/text/execution-v0.d.ts +66 -0
  828. package/src/inference/pipelines/text/execution-v0.js +1139 -0
  829. package/src/inference/pipelines/text/ffn/dense.d.ts +40 -0
  830. package/src/inference/pipelines/text/ffn/dense.js +759 -0
  831. package/src/inference/pipelines/text/ffn/index.d.ts +23 -0
  832. package/src/inference/pipelines/text/ffn/index.js +16 -0
  833. package/src/inference/pipelines/text/ffn/moe.d.ts +21 -0
  834. package/src/inference/pipelines/text/ffn/moe.js +49 -0
  835. package/src/inference/pipelines/text/ffn/sandwich.d.ts +25 -0
  836. package/src/inference/pipelines/text/ffn/sandwich.js +196 -0
  837. package/src/inference/pipelines/text/ffn/standard.d.ts +23 -0
  838. package/src/inference/pipelines/text/ffn/standard.js +84 -0
  839. package/src/inference/pipelines/text/ffn/types.d.ts +30 -0
  840. package/src/inference/pipelines/text/ffn/types.js +25 -0
  841. package/src/inference/pipelines/text/ffn.d.ts +31 -0
  842. package/src/inference/pipelines/text/ffn.js +18 -0
  843. package/src/inference/pipelines/text/finiteness-guard-status.d.ts +11 -0
  844. package/src/inference/pipelines/text/finiteness-guard-status.js +21 -0
  845. package/src/inference/pipelines/text/finiteness-policy.d.ts +35 -0
  846. package/src/inference/pipelines/text/finiteness-policy.js +45 -0
  847. package/src/inference/pipelines/text/generator-helpers.d.ts +34 -0
  848. package/src/inference/pipelines/text/generator-helpers.js +175 -0
  849. package/src/inference/pipelines/text/generator-runtime.d.ts +93 -0
  850. package/src/inference/pipelines/text/generator-runtime.js +373 -0
  851. package/src/inference/pipelines/text/generator-steps.d.ts +75 -0
  852. package/src/inference/pipelines/text/generator-steps.js +1078 -0
  853. package/src/inference/pipelines/text/generator.d.ts +41 -0
  854. package/src/inference/pipelines/text/generator.js +1345 -0
  855. package/src/inference/pipelines/text/index.d.ts +5 -0
  856. package/src/inference/pipelines/text/index.js +6 -0
  857. package/src/inference/pipelines/text/init.d.ts +295 -0
  858. package/src/inference/pipelines/text/init.js +965 -0
  859. package/src/inference/pipelines/text/kernel-path-auto-select.d.ts +12 -0
  860. package/src/inference/pipelines/text/kernel-path-auto-select.js +90 -0
  861. package/src/inference/pipelines/text/kernel-trace.d.ts +150 -0
  862. package/src/inference/pipelines/text/kernel-trace.js +324 -0
  863. package/src/inference/pipelines/text/layer-plan.d.ts +65 -0
  864. package/src/inference/pipelines/text/layer-plan.js +249 -0
  865. package/src/inference/pipelines/text/layer.d.ts +56 -0
  866. package/src/inference/pipelines/text/layer.js +916 -0
  867. package/src/inference/pipelines/text/linear-attention.d.ts +94 -0
  868. package/src/inference/pipelines/text/linear-attention.js +803 -0
  869. package/src/inference/pipelines/text/logits/cpu.d.ts +81 -0
  870. package/src/inference/pipelines/text/logits/cpu.js +91 -0
  871. package/src/inference/pipelines/text/logits/gpu.d.ts +113 -0
  872. package/src/inference/pipelines/text/logits/gpu.js +406 -0
  873. package/src/inference/pipelines/text/logits/index.d.ts +57 -0
  874. package/src/inference/pipelines/text/logits/index.js +305 -0
  875. package/src/inference/pipelines/text/logits/types.d.ts +46 -0
  876. package/src/inference/pipelines/text/logits/types.js +4 -0
  877. package/src/inference/pipelines/text/logits/utils.d.ts +49 -0
  878. package/src/inference/pipelines/text/logits/utils.js +59 -0
  879. package/src/inference/pipelines/text/logits.d.ts +27 -0
  880. package/src/inference/pipelines/text/logits.js +16 -0
  881. package/src/inference/pipelines/text/lora-apply.d.ts +28 -0
  882. package/src/inference/pipelines/text/lora-apply.js +58 -0
  883. package/src/inference/pipelines/text/lora-types.d.ts +39 -0
  884. package/src/inference/pipelines/text/lora-types.js +18 -0
  885. package/src/inference/pipelines/text/lora.d.ts +18 -0
  886. package/src/inference/pipelines/text/lora.js +12 -0
  887. package/src/inference/pipelines/text/model-load.d.ts +58 -0
  888. package/src/inference/pipelines/text/model-load.js +561 -0
  889. package/src/inference/pipelines/text/moe-cache.d.ts +32 -0
  890. package/src/inference/pipelines/text/moe-cache.js +107 -0
  891. package/src/inference/pipelines/text/moe-cpu-gptoss.d.ts +9 -0
  892. package/src/inference/pipelines/text/moe-cpu-gptoss.js +110 -0
  893. package/src/inference/pipelines/text/moe-cpu.d.ts +13 -0
  894. package/src/inference/pipelines/text/moe-cpu.js +116 -0
  895. package/src/inference/pipelines/text/moe-gpu.d.ts +13 -0
  896. package/src/inference/pipelines/text/moe-gpu.js +611 -0
  897. package/src/inference/pipelines/text/moe-helpers.d.ts +12 -0
  898. package/src/inference/pipelines/text/moe-helpers.js +21 -0
  899. package/src/inference/pipelines/text/moe-impl.d.ts +117 -0
  900. package/src/inference/pipelines/text/moe-impl.js +9 -0
  901. package/src/inference/pipelines/text/moe-shape-validator.d.ts +31 -0
  902. package/src/inference/pipelines/text/moe-shape-validator.js +78 -0
  903. package/src/inference/pipelines/text/ops.d.ts +167 -0
  904. package/src/inference/pipelines/text/ops.js +367 -0
  905. package/src/inference/pipelines/text/probes.d.ts +31 -0
  906. package/src/inference/pipelines/text/probes.js +170 -0
  907. package/src/inference/pipelines/text/sampling.d.ts +54 -0
  908. package/src/inference/pipelines/text/sampling.js +203 -0
  909. package/src/inference/pipelines/text/state.d.ts +112 -0
  910. package/src/inference/pipelines/text/state.js +152 -0
  911. package/src/inference/pipelines/text/types.d.ts +627 -0
  912. package/src/inference/pipelines/text/types.js +4 -0
  913. package/src/inference/pipelines/text/weights.d.ts +110 -0
  914. package/src/inference/pipelines/text/weights.js +163 -0
  915. package/src/inference/pipelines/text.d.ts +157 -0
  916. package/src/inference/pipelines/text.js +586 -0
  917. package/src/inference/speculative.d.ts +239 -0
  918. package/src/inference/speculative.js +416 -0
  919. package/src/inference/test-harness.d.ts +178 -0
  920. package/src/inference/test-harness.js +349 -0
  921. package/src/inference/tokenizer.d.ts +77 -0
  922. package/src/inference/tokenizer.js +258 -0
  923. package/src/inference/tokenizers/base.d.ts +39 -0
  924. package/src/inference/tokenizers/base.js +69 -0
  925. package/src/inference/tokenizers/bpe.d.ts +27 -0
  926. package/src/inference/tokenizers/bpe.js +171 -0
  927. package/src/inference/tokenizers/bundled.d.ts +63 -0
  928. package/src/inference/tokenizers/bundled.js +866 -0
  929. package/src/inference/tokenizers/sentencepiece.d.ts +28 -0
  930. package/src/inference/tokenizers/sentencepiece.js +389 -0
  931. package/src/inference/tokenizers/types.d.ts +166 -0
  932. package/src/inference/tokenizers/types.js +7 -0
  933. package/src/loader/doppler-loader.d.ts +134 -0
  934. package/src/loader/doppler-loader.js +1036 -0
  935. package/src/loader/dtype-utils.d.ts +40 -0
  936. package/src/loader/dtype-utils.js +102 -0
  937. package/src/loader/embedding-loader.d.ts +56 -0
  938. package/src/loader/embedding-loader.js +207 -0
  939. package/src/loader/experts/expert-cache.d.ts +156 -0
  940. package/src/loader/experts/expert-cache.js +375 -0
  941. package/src/loader/experts/expert-loader.d.ts +108 -0
  942. package/src/loader/experts/expert-loader.js +384 -0
  943. package/src/loader/final-weights-loader.d.ts +68 -0
  944. package/src/loader/final-weights-loader.js +262 -0
  945. package/src/loader/index.d.ts +150 -0
  946. package/src/loader/index.js +124 -0
  947. package/src/loader/layer-loader.d.ts +63 -0
  948. package/src/loader/layer-loader.js +417 -0
  949. package/src/loader/loader-state.d.ts +51 -0
  950. package/src/loader/loader-state.js +142 -0
  951. package/src/loader/loader-types.d.ts +236 -0
  952. package/src/loader/loader-types.js +4 -0
  953. package/src/loader/manifest-config.d.ts +97 -0
  954. package/src/loader/manifest-config.js +132 -0
  955. package/src/loader/memory-monitor.d.ts +112 -0
  956. package/src/loader/memory-monitor.js +276 -0
  957. package/src/loader/multi-model-loader.d.ts +37 -0
  958. package/src/loader/multi-model-loader.js +87 -0
  959. package/src/loader/quantization-constants.d.ts +23 -0
  960. package/src/loader/quantization-constants.js +14 -0
  961. package/src/loader/shard-cache.d.ts +60 -0
  962. package/src/loader/shard-cache.js +568 -0
  963. package/src/loader/shard-resolver.d.ts +12 -0
  964. package/src/loader/shard-resolver.js +83 -0
  965. package/src/loader/tensors/tensor-loader.d.ts +154 -0
  966. package/src/loader/tensors/tensor-loader.js +427 -0
  967. package/src/loader/tensors/tensor-reader.d.ts +22 -0
  968. package/src/loader/tensors/tensor-reader.js +56 -0
  969. package/src/loader/tensors/tensor-role.d.ts +7 -0
  970. package/src/loader/tensors/tensor-role.js +12 -0
  971. package/src/loader/weight-downcast.d.ts +62 -0
  972. package/src/loader/weight-downcast.js +213 -0
  973. package/src/loader/weights.d.ts +22 -0
  974. package/src/loader/weights.js +4 -0
  975. package/src/memory/address-table.d.ts +104 -0
  976. package/src/memory/address-table.js +114 -0
  977. package/src/memory/buffer-pool.d.ts +196 -0
  978. package/src/memory/buffer-pool.js +756 -0
  979. package/src/memory/capability.d.ts +49 -0
  980. package/src/memory/capability.js +95 -0
  981. package/src/memory/heap-manager.d.ts +104 -0
  982. package/src/memory/heap-manager.js +264 -0
  983. package/src/memory/unified-detect.d.ts +59 -0
  984. package/src/memory/unified-detect.js +192 -0
  985. package/src/rules/converter/execution.rules.json +20 -0
  986. package/src/rules/converter/tensor-roles.rules.json +13 -0
  987. package/src/rules/converter/tokenizer.rules.json +7 -0
  988. package/src/rules/inference/attention.rules.json +54 -0
  989. package/src/rules/inference/config.rules.json +58 -0
  990. package/src/rules/inference/dtype.rules.json +94 -0
  991. package/src/rules/inference/execution.rules.json +45 -0
  992. package/src/rules/inference/ffn.rules.json +35 -0
  993. package/src/rules/inference/kernel-path.rules.json +76 -0
  994. package/src/rules/inference/layer-pattern.rules.json +16 -0
  995. package/src/rules/inference/layer.rules.json +7 -0
  996. package/src/rules/inference/moe.rules.json +48 -0
  997. package/src/rules/kernels/attention.rules.json +61 -0
  998. package/src/rules/kernels/conv2d.rules.json +6 -0
  999. package/src/rules/kernels/dequant.rules.json +58 -0
  1000. package/src/rules/kernels/energy.rules.json +22 -0
  1001. package/src/rules/kernels/fused-ffn.rules.json +13 -0
  1002. package/src/rules/kernels/fused-matmul-residual.rules.json +6 -0
  1003. package/src/rules/kernels/fused-matmul-rmsnorm.rules.json +8 -0
  1004. package/src/rules/kernels/gather.rules.json +12 -0
  1005. package/src/rules/kernels/gelu.rules.json +11 -0
  1006. package/src/rules/kernels/groupnorm.rules.json +10 -0
  1007. package/src/rules/kernels/kernel-validator.d.ts +24 -0
  1008. package/src/rules/kernels/kernel-validator.js +160 -0
  1009. package/src/rules/kernels/kv_quantize.rules.json +7 -0
  1010. package/src/rules/kernels/layernorm.rules.json +6 -0
  1011. package/src/rules/kernels/matmul.rules.json +60 -0
  1012. package/src/rules/kernels/modulate.rules.json +6 -0
  1013. package/src/rules/kernels/moe.rules.gptoss.json +105 -0
  1014. package/src/rules/kernels/moe.rules.json +11 -0
  1015. package/src/rules/kernels/pixel_shuffle.rules.json +6 -0
  1016. package/src/rules/kernels/residual.rules.json +12 -0
  1017. package/src/rules/kernels/rmsnorm.rules.json +11 -0
  1018. package/src/rules/kernels/rope.rules.json +6 -0
  1019. package/src/rules/kernels/sample.rules.json +6 -0
  1020. package/src/rules/kernels/scale.rules.json +6 -0
  1021. package/src/rules/kernels/silu.rules.json +21 -0
  1022. package/src/rules/kernels/softmax.rules.json +23 -0
  1023. package/src/rules/kernels/split-qkv.rules.json +6 -0
  1024. package/src/rules/kernels/upsample2d.rules.json +6 -0
  1025. package/src/rules/loader/tensor-loader.rules.json +15 -0
  1026. package/src/rules/loader/weights.rules.json +41 -0
  1027. package/src/rules/rule-registry.d.ts +48 -0
  1028. package/src/rules/rule-registry.js +177 -0
  1029. package/src/rules/tooling/command-runtime.rules.json +38 -0
  1030. package/src/storage/backends/idb-store.d.ts +52 -0
  1031. package/src/storage/backends/idb-store.js +590 -0
  1032. package/src/storage/backends/memory-store.d.ts +36 -0
  1033. package/src/storage/backends/memory-store.js +242 -0
  1034. package/src/storage/backends/opfs-store.d.ts +41 -0
  1035. package/src/storage/backends/opfs-store.js +429 -0
  1036. package/src/storage/blake3.d.ts +17 -0
  1037. package/src/storage/blake3.js +269 -0
  1038. package/src/storage/download-types.d.ts +157 -0
  1039. package/src/storage/download-types.js +48 -0
  1040. package/src/storage/downloader.d.ts +103 -0
  1041. package/src/storage/downloader.js +839 -0
  1042. package/src/storage/emulated-vram.d.ts +264 -0
  1043. package/src/storage/emulated-vram.js +576 -0
  1044. package/src/storage/export.d.ts +20 -0
  1045. package/src/storage/export.js +159 -0
  1046. package/src/storage/index.d.ts +253 -0
  1047. package/src/storage/index.js +185 -0
  1048. package/src/storage/inventory.d.ts +26 -0
  1049. package/src/storage/inventory.js +218 -0
  1050. package/src/storage/preflight.d.ts +144 -0
  1051. package/src/storage/preflight.js +294 -0
  1052. package/src/storage/quickstart-downloader.d.ts +154 -0
  1053. package/src/storage/quickstart-downloader.js +265 -0
  1054. package/src/storage/quota.d.ts +150 -0
  1055. package/src/storage/quota.js +304 -0
  1056. package/src/storage/registry.d.ts +28 -0
  1057. package/src/storage/registry.js +125 -0
  1058. package/src/storage/reports.d.ts +20 -0
  1059. package/src/storage/reports.js +94 -0
  1060. package/src/storage/shard-manager.d.ts +137 -0
  1061. package/src/storage/shard-manager.js +801 -0
  1062. package/src/sw.d.ts +1 -0
  1063. package/src/sw.js +187 -0
  1064. package/src/tooling/browser-command-runner.d.ts +28 -0
  1065. package/src/tooling/browser-command-runner.js +82 -0
  1066. package/src/tooling/command-api.d.ts +147 -0
  1067. package/src/tooling/command-api.js +523 -0
  1068. package/src/tooling/command-envelope.d.ts +81 -0
  1069. package/src/tooling/command-envelope.js +195 -0
  1070. package/src/tooling/command-runner-shared.d.ts +73 -0
  1071. package/src/tooling/command-runner-shared.js +146 -0
  1072. package/src/tooling/command-runner.html +45 -0
  1073. package/src/tooling/node-browser-command-runner.d.ts +30 -0
  1074. package/src/tooling/node-browser-command-runner.js +868 -0
  1075. package/src/tooling/node-command-runner.d.ts +36 -0
  1076. package/src/tooling/node-command-runner.js +127 -0
  1077. package/src/tooling/node-convert-worker-pool.d.ts +16 -0
  1078. package/src/tooling/node-convert-worker-pool.js +186 -0
  1079. package/src/tooling/node-convert-worker.d.ts +1 -0
  1080. package/src/tooling/node-convert-worker.js +60 -0
  1081. package/src/tooling/node-convert.d.ts +44 -0
  1082. package/src/tooling/node-converter.d.ts +1 -0
  1083. package/src/tooling/node-converter.js +1227 -0
  1084. package/src/tooling/node-file-fetch.d.ts +1 -0
  1085. package/src/tooling/node-file-fetch.js +38 -0
  1086. package/src/tooling/node-source-runtime.d.ts +19 -0
  1087. package/src/tooling/node-source-runtime.js +469 -0
  1088. package/src/tooling/node-webgpu.d.ts +6 -0
  1089. package/src/tooling/node-webgpu.js +321 -0
  1090. package/src/tooling/opfs-cache.d.ts +11 -0
  1091. package/src/tooling/opfs-cache.js +174 -0
  1092. package/src/tooling/source-runtime-bundle.d.ts +102 -0
  1093. package/src/tooling/source-runtime-bundle.js +484 -0
  1094. package/src/tooling-exports.browser.d.ts +7 -0
  1095. package/src/tooling-exports.browser.js +2 -0
  1096. package/src/tooling-exports.d.ts +22 -0
  1097. package/src/tooling-exports.js +7 -0
  1098. package/src/tooling-exports.shared.d.ts +105 -0
  1099. package/src/tooling-exports.shared.js +92 -0
  1100. package/src/training/README.md +153 -0
  1101. package/src/training/artifacts.d.ts +160 -0
  1102. package/src/training/artifacts.js +896 -0
  1103. package/src/training/attention-backward.d.ts +30 -0
  1104. package/src/training/attention-backward.js +217 -0
  1105. package/src/training/attention-forward.d.ts +22 -0
  1106. package/src/training/attention-forward.js +82 -0
  1107. package/src/training/autograd.d.ts +51 -0
  1108. package/src/training/autograd.js +380 -0
  1109. package/src/training/checkpoint.d.ts +31 -0
  1110. package/src/training/checkpoint.js +238 -0
  1111. package/src/training/clip.d.ts +9 -0
  1112. package/src/training/clip.js +54 -0
  1113. package/src/training/dataloader.d.ts +8 -0
  1114. package/src/training/dataloader.js +44 -0
  1115. package/src/training/datasets/index.d.ts +12 -0
  1116. package/src/training/datasets/index.js +6 -0
  1117. package/src/training/datasets/jsonl.d.ts +11 -0
  1118. package/src/training/datasets/jsonl.js +50 -0
  1119. package/src/training/datasets/reploid.d.ts +3 -0
  1120. package/src/training/datasets/reploid.js +36 -0
  1121. package/src/training/datasets/text-pairs.d.ts +21 -0
  1122. package/src/training/datasets/text-pairs.js +42 -0
  1123. package/src/training/datasets/token-batch.d.ts +21 -0
  1124. package/src/training/datasets/token-batch.js +40 -0
  1125. package/src/training/datasets/translation-pairs.d.ts +34 -0
  1126. package/src/training/datasets/translation-pairs.js +49 -0
  1127. package/src/training/export.d.ts +32 -0
  1128. package/src/training/export.js +112 -0
  1129. package/src/training/index.d.ts +52 -0
  1130. package/src/training/index.js +41 -0
  1131. package/src/training/lora.d.ts +19 -0
  1132. package/src/training/lora.js +57 -0
  1133. package/src/training/loss-scaling.d.ts +21 -0
  1134. package/src/training/loss-scaling.js +80 -0
  1135. package/src/training/loss.d.ts +10 -0
  1136. package/src/training/loss.js +41 -0
  1137. package/src/training/objectives/base.d.ts +58 -0
  1138. package/src/training/objectives/base.js +38 -0
  1139. package/src/training/objectives/cross_entropy.d.ts +18 -0
  1140. package/src/training/objectives/cross_entropy.js +37 -0
  1141. package/src/training/objectives/distill_kd.d.ts +16 -0
  1142. package/src/training/objectives/distill_kd.js +369 -0
  1143. package/src/training/objectives/distill_triplet.d.ts +16 -0
  1144. package/src/training/objectives/distill_triplet.js +412 -0
  1145. package/src/training/objectives/index.d.ts +12 -0
  1146. package/src/training/objectives/index.js +6 -0
  1147. package/src/training/objectives/ul_stage1_joint.d.ts +16 -0
  1148. package/src/training/objectives/ul_stage1_joint.js +188 -0
  1149. package/src/training/objectives/ul_stage2_base.d.ts +16 -0
  1150. package/src/training/objectives/ul_stage2_base.js +222 -0
  1151. package/src/training/optimizer.d.ts +22 -0
  1152. package/src/training/optimizer.js +115 -0
  1153. package/src/training/runner.d.ts +196 -0
  1154. package/src/training/runner.js +1194 -0
  1155. package/src/training/suite.d.ts +187 -0
  1156. package/src/training/suite.js +3156 -0
  1157. package/src/training/trainer.d.ts +89 -0
  1158. package/src/training/trainer.js +301 -0
  1159. package/src/training/ul_dataset.d.ts +47 -0
  1160. package/src/training/ul_dataset.js +153 -0
  1161. package/src/training/ul_schedule.d.ts +6 -0
  1162. package/src/training/ul_schedule.js +29 -0
  1163. package/src/types/chrome.d.ts +36 -0
  1164. package/src/types/chrome.js +1 -0
  1165. package/src/types/gpu.d.ts +185 -0
  1166. package/src/types/gpu.js +5 -0
  1167. package/src/types/index.d.ts +3 -0
  1168. package/src/types/index.js +3 -0
  1169. package/src/types/inference.d.ts +197 -0
  1170. package/src/types/inference.js +5 -0
  1171. package/src/types/model.d.ts +125 -0
  1172. package/src/types/model.js +5 -0
  1173. package/src/utils/index.d.ts +7 -0
  1174. package/src/utils/index.js +7 -0
  1175. package/src/utils/load-json.d.ts +5 -0
  1176. package/src/utils/load-json.js +23 -0
  1177. package/src/utils/plain-object.d.ts +1 -0
  1178. package/src/utils/plain-object.js +3 -0
  1179. package/src/utils/sha256.d.ts +4 -0
  1180. package/src/utils/sha256.js +135 -0
  1181. package/tools/convert-safetensors-node.js +180 -0
  1182. package/tools/doppler-cli.js +1170 -0
@@ -0,0 +1,1359 @@
1
+
2
+
3
+ import { getDevice, getDeviceEpoch, getDeviceLimits, getKernelCapabilities } from '../device.js';
4
+ import { acquireBuffer } from '../../memory/buffer-pool.js';
5
+ import { createTensor } from '../tensor.js';
6
+ import { KernelBase } from './kernel-base.js';
7
+ import { TILE_SIZES } from './constants.js';
8
+ import { getKernelThresholds, padToQ4KBlock } from '../../config/schema/index.js';
9
+ import { createUniformBufferWithView, getKernelConfig, hasRequiredFeatures } from './utils.js';
10
+ import { dispatchIndirect, recordDispatchIndirect } from './dispatch.js';
11
+ import { releaseUniformBuffer } from '../uniform-cache.js';
12
+ import { log, trace } from '../../debug/index.js';
13
+ import { getKernelPathAttentionVariant, getKernelPathStrict } from '../../config/kernel-path-loader.js';
14
+ import { selectRuleValue as selectKernelRuleValue } from './rule-registry.js';
15
+ import { selectRuleValue as selectSharedRuleValue } from '../../rules/rule-registry.js';
16
+ import { logKernelSelectionOnce } from '../kernel-selection-log.js';
17
+
18
// Set once the subgroup-tier selection has been logged so repeated decode
// steps don't spam the trace output (see selectAttentionTier).
let loggedAttentionTier = false;
20
+
21
+
22
// Lazily-resolved KV-length cap for the chunked f16-KV decode kernel.
let _chunkedMaxKVLen = null;

/**
 * Maximum KV length supported by `attention.decode_chunked_f16kv`, read once
 * from the kernel config's variant metadata and cached for later calls.
 * @returns {number} the configured maxKVLen
 * @throws {Error} when the kernel config lacks a finite maxKVLen
 */
function getChunkedMaxKVLen() {
  if (_chunkedMaxKVLen !== null) {
    return _chunkedMaxKVLen;
  }
  const { variantMetadata } = getKernelConfig('attention', 'decode_chunked_f16kv');
  const limit = variantMetadata?.maxKVLen;
  if (!Number.isFinite(limit)) {
    throw new Error('Kernel config missing attention.decode_chunked_f16kv maxKVLen');
  }
  _chunkedMaxKVLen = limit;
  return _chunkedMaxKVLen;
}
36
+
37
// Lazily-resolved KV-length cap for the tiered f16 decode kernel.
let _tieredMaxKVLen = null;

/**
 * Maximum KV length supported by `attention_tiered.decode_tiered_f16`,
 * read once from the kernel config's variant metadata and cached.
 * @returns {number} the configured maxKVLen
 * @throws {Error} when the kernel config lacks a finite maxKVLen
 */
function getTieredMaxKVLen() {
  if (_tieredMaxKVLen !== null) {
    return _tieredMaxKVLen;
  }
  const { variantMetadata } = getKernelConfig('attention_tiered', 'decode_tiered_f16');
  const limit = variantMetadata?.maxKVLen;
  if (!Number.isFinite(limit)) {
    throw new Error('Kernel config missing attention_tiered.decode_tiered_f16 maxKVLen');
  }
  _tieredMaxKVLen = limit;
  return _tieredMaxKVLen;
}
50
+
51
// Lazily-resolved KV-length cap for the tiered int8/f16-KV decode kernel.
let _tieredQuantMaxKVLen = null;

/**
 * Maximum KV length supported by
 * `attention_tiered_quant.decode_tiered_int8_f16kv`, read once from the
 * kernel config's variant metadata and cached.
 * @returns {number} the configured maxKVLen
 * @throws {Error} when the kernel config lacks a finite maxKVLen
 */
function getTieredQuantMaxKVLen() {
  if (_tieredQuantMaxKVLen !== null) {
    return _tieredQuantMaxKVLen;
  }
  const { variantMetadata } = getKernelConfig('attention_tiered_quant', 'decode_tiered_int8_f16kv');
  const limit = variantMetadata?.maxKVLen;
  if (!Number.isFinite(limit)) {
    throw new Error('Kernel config missing attention_tiered_quant.decode_tiered_int8_f16kv maxKVLen');
  }
  _tieredQuantMaxKVLen = limit;
  return _tieredQuantMaxKVLen;
}
64
+
65
+
66
// Cached single-u32 zero buffer bound when no real kv-length buffer exists.
// Invalidated (recreated) whenever the device epoch changes.
let kvLenFallbackBuffer = null;
let kvLenFallbackBufferEpoch = -1;
const U32_BYTES = Uint32Array.BYTES_PER_ELEMENT;

/**
 * Returns a zero-initialized u32 storage buffer usable as a placeholder
 * kv-length binding. Cached per device epoch so device loss rebuilds it.
 * @param {GPUDevice} device - device used to (re)create the buffer
 * @returns {GPUBuffer}
 */
function getKvLenFallbackBuffer(device) {
  const currentEpoch = getDeviceEpoch();
  const stale = !kvLenFallbackBuffer || kvLenFallbackBufferEpoch !== currentEpoch;
  if (stale) {
    kvLenFallbackBuffer = device.createBuffer({
      label: 'attention_kv_len_fallback',
      size: U32_BYTES,
      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
    });
    // Explicitly zero the buffer so shaders reading it see kvLen = 0.
    device.queue.writeBuffer(kvLenFallbackBuffer, 0, new Uint32Array([0]));
    kvLenFallbackBufferEpoch = currentEpoch;
  }
  return kvLenFallbackBuffer;
}
84
+
85
// Cached single-u32 zero buffer bound when no real page table is provided.
let pageTableFallbackBuffer = null;
let pageTableFallbackBufferEpoch = -1;

/**
 * Returns a zero-initialized u32 storage buffer usable as a placeholder
 * page-table binding. Cached per device epoch so device loss rebuilds it.
 * @param {GPUDevice} device - device used to (re)create the buffer
 * @returns {GPUBuffer}
 */
function getPageTableFallbackBuffer(device) {
  const currentEpoch = getDeviceEpoch();
  const stale = !pageTableFallbackBuffer || pageTableFallbackBufferEpoch !== currentEpoch;
  if (stale) {
    pageTableFallbackBuffer = device.createBuffer({
      label: 'attention_page_table_fallback',
      size: U32_BYTES,
      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
    });
    device.queue.writeBuffer(pageTableFallbackBuffer, 0, new Uint32Array([0]));
    pageTableFallbackBufferEpoch = currentEpoch;
  }
  return pageTableFallbackBuffer;
}
101
+
102
+
103
+
104
+
105
/**
 * Thin KernelBase adapter that pins the standard `attention` pipeline family.
 */
class AttentionKernel extends KernelBase {
  /** Resolve (and cache) the compute pipeline for an attention variant. */
  async getPipeline(variant) {
    return this.getPipelineFor('attention', variant);
  }

  /** Immediately dispatch the pipeline with the given bind group. */
  dispatch(pipeline, bindGroup, workgroups) {
    this.dispatchKernel(pipeline, bindGroup, workgroups, 'attention');
  }

  /** Record the dispatch into a command recorder for deferred submission. */
  record(recorder, pipeline, bindGroup, workgroups) {
    this.recordKernel(recorder, pipeline, bindGroup, workgroups, 'attention');
  }
}
130
+
131
/**
 * Thin KernelBase adapter that pins the `attention_tiered` pipeline family
 * (hot/cold split KV cache).
 */
class AttentionTieredKernel extends KernelBase {
  /** Resolve (and cache) the compute pipeline for a tiered variant. */
  async getPipeline(variant) {
    return this.getPipelineFor('attention_tiered', variant);
  }

  /** Immediately dispatch the pipeline with the given bind group. */
  dispatch(pipeline, bindGroup, workgroups) {
    this.dispatchKernel(pipeline, bindGroup, workgroups, 'attention_tiered');
  }

  /** Record the dispatch into a command recorder for deferred submission. */
  record(recorder, pipeline, bindGroup, workgroups) {
    this.recordKernel(recorder, pipeline, bindGroup, workgroups, 'attention_tiered');
  }
}
156
+
157
/**
 * Thin KernelBase adapter that pins the `attention_tiered_quant` pipeline
 * family (quantized cold tier).
 */
class AttentionTieredQuantKernel extends KernelBase {
  /** Resolve (and cache) the compute pipeline for a tiered-quant variant. */
  async getPipeline(variant) {
    return this.getPipelineFor('attention_tiered_quant', variant);
  }

  /** Immediately dispatch the pipeline with the given bind group. */
  dispatch(pipeline, bindGroup, workgroups) {
    this.dispatchKernel(pipeline, bindGroup, workgroups, 'attention_tiered_quant');
  }

  /** Record the dispatch into a command recorder for deferred submission. */
  record(recorder, pipeline, bindGroup, workgroups) {
    this.recordKernel(recorder, pipeline, bindGroup, workgroups, 'attention_tiered_quant');
  }
}
182
+
183
/**
 * Thin KernelBase adapter that pins the `attention_bdpa` pipeline family.
 */
class AttentionBDPAKernel extends KernelBase {
  /** Resolve (and cache) the compute pipeline for a BDPA variant. */
  async getPipeline(variant) {
    return this.getPipelineFor('attention_bdpa', variant);
  }

  /** Immediately dispatch the pipeline with the given bind group. */
  dispatch(pipeline, bindGroup, workgroups) {
    this.dispatchKernel(pipeline, bindGroup, workgroups, 'attention_bdpa');
  }

  /** Record the dispatch into a command recorder for deferred submission. */
  record(recorder, pipeline, bindGroup, workgroups) {
    this.recordKernel(recorder, pipeline, bindGroup, workgroups, 'attention_bdpa');
  }
}
205
+
206
+
207
/**
 * Pick an attention kernel tier from device capabilities, or validate a
 * caller-forced tier.
 *
 * @param {number} headDim - per-head dimension
 * @param {number} seqLen - query sequence length (1 means decode)
 * @param {boolean} useF16KV - KV cache stored as f16 (halves shared-mem need)
 * @param {string|null} forcedTier - explicit tier to use, or null to auto-select
 * @param {number} sharedLimit - workgroup shared-memory limit in bytes
 * @param {object} caps - kernel capability flags (reads hasSubgroups)
 * @returns {{tier: string, reason: string}} chosen tier and selection reason
 * @throws {Error} when a forced tier is unsupported on this device
 */
function selectAttentionTier(headDim, seqLen, useF16KV, forcedTier, sharedLimit, caps) {
  const isDecode = seqLen === 1;
  const thresholds = getKernelThresholds().attention;

  // Each tier needs enough shared memory for its tile layout (f16 KV halves
  // the requirement) and has a head-dimension ceiling; the subgroup tier is
  // additionally decode-only and needs subgroup support.
  const canLarge =
    headDim <= thresholds.largeMaxHeadDim &&
    sharedLimit >= (useF16KV ? thresholds.largeSharedF16 : thresholds.largeSharedF32);
  const canSmall =
    headDim <= thresholds.smallMaxHeadDim &&
    sharedLimit >= (useF16KV ? thresholds.smallSharedF16 : thresholds.smallSharedF32);
  const canSubgroup =
    caps.hasSubgroups &&
    headDim <= thresholds.subgroupMaxHeadDim &&
    sharedLimit >= thresholds.subgroupShared &&
    isDecode;

  if (forcedTier) {
    // A forced tier is honored only when the device can actually run it.
    if (forcedTier === 'tiled_large' && !canLarge) {
      throw new Error(`Requested tiled_large but device doesn't support it (headDim=${headDim}, shared=${sharedLimit}).`);
    }
    if (forcedTier === 'tiled_small' && !canSmall) {
      throw new Error(`Requested tiled_small but device doesn't support it (headDim=${headDim}, shared=${sharedLimit}).`);
    }
    if (forcedTier === 'subgroup' && !canSubgroup) {
      throw new Error(`Requested subgroup attention but device doesn't support it (headDim=${headDim}, shared=${sharedLimit}, subgroups=${caps.hasSubgroups}).`);
    }
    return { tier: forcedTier, reason: `forced:${forcedTier}` };
  }

  // Delegate the final choice to the rule registry; derive a human-readable
  // reason from the same capability flags, in the registry's priority order.
  const tier = selectKernelRuleValue('attention', 'tier', { canSubgroup, canLarge, canSmall, isDecode });
  let reason;
  if (canSubgroup) {
    reason = 'subgroup_capable';
  } else if (canLarge) {
    reason = 'tiled_large_capable';
  } else if (canSmall) {
    reason = 'tiled_small_capable';
  } else {
    reason = isDecode ? 'decode_streaming_fallback' : 'streaming_fallback';
  }

  if (tier === 'subgroup' && !loggedAttentionTier) {
    trace.attn(0, `Using subgroup decode kernel (headDim=${headDim}, hasSubgroups=true)`);
    loggedAttentionTier = true;
  }

  return { tier, reason };
}
272
+
273
// Set once the chunked decode kernel selection has been logged, so repeated
// decode steps don't spam the trace output (see resolveAttentionVariant).
let loggedChunkedKernel = false;
275
+
276
+
277
/**
 * Map a selected tier plus runtime shape/dtype information to a concrete
 * attention kernel variant name. All final decisions are delegated to the
 * rule registry; this function only computes the candidate flags.
 *
 * @param {string} tier - tier chosen by selectAttentionTier
 * @param {boolean} isDecode - true when seqLen === 1
 * @param {boolean} useF16KV - KV cache stored as f16
 * @param {boolean} useF16Q - Q tensor is f16
 * @param {number} numHeads - number of query heads (used in the trace log)
 * @param {number} headDim - per-head dimension
 * @param {number} kvLen - current KV sequence length
 * @param {boolean} isPaged - paged KV cache layout in use
 * @param {object} caps - kernel capability flags (hasF16, hasSubgroups)
 * @param {number} sharedLimit - workgroup shared-memory limit in bytes
 * @returns {string} the resolved kernel variant name
 */
function resolveAttentionVariant(
  tier,
  isDecode,
  useF16KV,
  useF16Q,
  numHeads,
  headDim,
  kvLen,
  isPaged,
  caps,
  sharedLimit
) {
  // Base name ('prefill'/'decode' per the rule registry) and dtype suffix.
  const base = selectKernelRuleValue('attention', 'phase', { isDecode });
  const useF16 = useF16KV && useF16Q;
  const suffix = selectKernelRuleValue('attention', 'suffix', { useF16, useF16KV });

  // Check if chunked kernel is viable:
  // - Decode only (seqLen=1)
  // - F16 KV cache
  // - Large headDim (parallelizes across dimensions)
  // - KV length within shared memory limit (from kernel config)
  const chunkedMaxKVLen = getChunkedMaxKVLen();
  const minHeadDimForChunked = getKernelThresholds().attention.minHeadDimForChunked;
  const canUseChunked = isDecode && useF16KV && headDim >= minHeadDimForChunked && kvLen <= chunkedMaxKVLen;
  // The f32 decode-subgroup path reuses the chunked kernel's KV-length cap.
  const decodeSubgroupMaxKVLen = chunkedMaxKVLen;
  const decodeSubgroupMaxHeadDim = getKernelThresholds().attention.subgroupMaxHeadDim;
  const canUseDecodeSubgroup = isDecode && !useF16KV && !useF16Q && headDim <= decodeSubgroupMaxHeadDim && kvLen <= decodeSubgroupMaxKVLen;
  // Optimized decode path requires f16 KV plus shader-f16 and subgroup
  // support, and enough shared memory for the subgroup tier.
  const canUseDecodeOptimized = isDecode
    && useF16KV
    && caps.hasF16
    && caps.hasSubgroups
    && headDim <= decodeSubgroupMaxHeadDim
    && sharedLimit >= getKernelThresholds().attention.subgroupShared;
  const chunkedVariant = selectKernelRuleValue('attention', 'chunkedVariant', { useF16 });
  const pagedVariant = selectKernelRuleValue('attention', 'pagedVariant', { useF16 });
  const optimizedVariant = selectKernelRuleValue('attention', 'optimizedVariant', { useF16 });
  // Final selection: the registry sees every candidate name and flag.
  const variant = selectKernelRuleValue(
    'attention',
    'variant',
    {
      tier,
      useF16KV,
      canUseChunked,
      canUseDecodeSubgroup,
      canUseDecodeOptimized,
      base,
      suffix,
      chunkedVariant,
      pagedVariant,
      optimizedVariant,
      isPaged,
      isDecode,
    }
  );

  if (variant === chunkedVariant && !loggedChunkedKernel) {
    // NOTE(review): this prints f16kv=${!useF16Q}; presumably "f16kv" here
    // means "f16 KV with f32 Q" rather than plain useF16KV — confirm intent.
    trace.attn(0, `Using chunked decode kernel (headDim=${headDim}, numHeads=${numHeads}, f16kv=${!useF16Q})`);
    loggedChunkedKernel = true;
  }

  return variant;
}
339
+
340
+
341
/**
 * Compute the number of workgroups to dispatch for an attention tier.
 * Subgroup runs one workgroup per head; streaming runs one per (row, head);
 * tiled tiers block rows by their tile size before multiplying by heads.
 *
 * @param {string} tier - 'subgroup' | 'streaming' | 'tiled_large' | other (small)
 * @param {number} seqLen - query sequence length
 * @param {number} numHeads - number of query heads
 * @returns {number} workgroup count
 */
function calculateAttentionWorkgroups(tier, seqLen, numHeads) {
  switch (tier) {
    case 'subgroup':
      return numHeads;
    case 'streaming':
      return seqLen * numHeads;
    case 'tiled_large':
      return Math.ceil(seqLen / TILE_SIZES.ATTENTION_LARGE_BLOCK_SIZE) * numHeads;
    default:
      // Anything else is treated as the small tiled tier.
      return Math.ceil(seqLen / TILE_SIZES.ATTENTION_SMALL_BLOCK_SIZE) * numHeads;
  }
}
353
+
354
+
355
/**
 * Recover the dispatch tier implied by an explicit kernel variant name, so a
 * path-overridden variant still gets a correct workgroup count.
 *
 * @param {string} variant - normalized kernel variant name
 * @returns {string} 'subgroup' | 'tiled_large' | 'streaming' | 'tiled_small'
 */
function inferAttentionTierFromVariant(variant) {
  if (variant === 'decode_subgroup' || variant.startsWith('decode_online')) {
    return 'subgroup';
  }
  if (variant.startsWith('decode_paged')) {
    return 'tiled_large';
  }
  const streaming =
    variant.startsWith('prefill_streaming') ||
    variant.startsWith('decode_streaming') ||
    variant === 'decode_chunked_f16kv';
  if (streaming) {
    return 'streaming';
  }
  if (variant.startsWith('prefill_small') || variant.startsWith('decode_small')) {
    return 'tiled_small';
  }
  // Plain prefill/decode variants default to the large tiled tier.
  return 'tiled_large';
}
365
+
366
+
367
/**
 * Validate that an explicitly-requested attention kernel variant can run with
 * the given phase, dtypes, head dimension, KV length, and device limits.
 *
 * @param {string} variant - requested variant name (may carry stray whitespace)
 * @param {boolean} isDecode - true when seqLen === 1 (decode phase)
 * @param {boolean} useF16KV - KV cache stored as f16
 * @param {boolean} useF16Q - Q tensor is f16
 * @param {object} caps - kernel capability flags (reads hasSubgroups)
 * @param {number} headDim - per-head dimension
 * @param {number} kvLen - current KV sequence length
 * @param {number} sharedLimit - workgroup shared-memory limit in bytes
 * @returns {string} the normalized (trimmed) variant name
 * @throws {Error} on the first violated requirement
 */
function validateAttentionVariant(
  variant,
  isDecode,
  useF16KV,
  useF16Q,
  caps,
  headDim,
  kvLen,
  sharedLimit
) {
  const normalized = variant.trim();

  // The variant must exist in the kernel config registry.
  let config;
  try {
    config = getKernelConfig('attention', normalized);
  } catch {
    throw new Error(`Unknown attention kernel variant "${variant}".`);
  }

  // The device must expose every GPU feature the variant declares.
  if (!hasRequiredFeatures(config.requires, caps)) {
    throw new Error(`Attention kernel "${variant}" requires unsupported GPU features.`);
  }

  // Dtype contract is encoded in the name: '_f16' means f16 Q/K/V,
  // '_f16kv' means f32 Q with f16 KV, neither means all-f32.
  const expectsF16KV = normalized.includes('_f16kv');
  const expectsF16 = normalized.includes('_f16') && !expectsF16KV;
  if (expectsF16) {
    if (!(useF16KV && useF16Q)) {
      const kvLabel = selectSharedRuleValue('shared', 'dtype', 'f16OrF32', { useF16: useF16KV });
      const qLabel = selectSharedRuleValue('shared', 'dtype', 'f16OrF32', { useF16: useF16Q });
      throw new Error(`Attention kernel "${variant}" requires f16 Q/K/V but got Q=${qLabel}, KV=${kvLabel}.`);
    }
  } else if (expectsF16KV) {
    if (!useF16KV || useF16Q) {
      const kvLabel = selectSharedRuleValue('shared', 'dtype', 'f16OrF32', { useF16: useF16KV });
      const qLabel = selectSharedRuleValue('shared', 'dtype', 'f16OrF32', { useF16: useF16Q });
      throw new Error(`Attention kernel "${variant}" requires f32 Q with f16 KV but got Q=${qLabel}, KV=${kvLabel}.`);
    }
  } else {
    if (useF16KV || useF16Q) {
      const kvLabel = selectSharedRuleValue('shared', 'dtype', 'f16OrF32', { useF16: useF16KV });
      const qLabel = selectSharedRuleValue('shared', 'dtype', 'f16OrF32', { useF16: useF16Q });
      throw new Error(`Attention kernel "${variant}" requires f32 Q/K/V but got Q=${qLabel}, KV=${kvLabel}.`);
    }
  }

  // Phase contract: decode requests cannot run prefill-only variants and
  // vice versa.
  const isDecodeVariant = normalized.startsWith('decode');
  const isPrefillVariant = normalized.startsWith('prefill');
  if (isDecode && isPrefillVariant) {
    throw new Error(`Attention kernel "${variant}" is prefill-only but decode requested.`);
  }
  if (!isDecode && isDecodeVariant) {
    throw new Error(`Attention kernel "${variant}" is decode-only but prefill requested.`);
  }

  const thresholds = getKernelThresholds().attention;
  const chunkedMaxKVLen = getChunkedMaxKVLen();
  // Chunked decode needs a large headDim and a bounded KV length.
  const isChunked = normalized.startsWith('decode_chunked');
  if (isChunked) {
    const minHeadDimForChunked = thresholds.minHeadDimForChunked;
    if (headDim < minHeadDimForChunked) {
      throw new Error(`Attention kernel "${variant}" requires headDim >= ${minHeadDimForChunked} but got ${headDim}.`);
    }
    if (kvLen > chunkedMaxKVLen) {
      throw new Error(`Attention kernel "${variant}" requires kvLen <= ${chunkedMaxKVLen} but got ${kvLen}.`);
    }
  }

  // Subgroup decode: subgroup feature, headDim/kvLen caps, shared memory.
  // Note: it shares the chunked kernel's KV-length cap.
  if (normalized === 'decode_subgroup') {
    if (!caps.hasSubgroups) {
      throw new Error(`Attention kernel "${variant}" requires subgroup support.`);
    }
    if (headDim > thresholds.subgroupMaxHeadDim) {
      throw new Error(`Attention kernel "${variant}" requires headDim <= ${thresholds.subgroupMaxHeadDim} but got ${headDim}.`);
    }
    if (kvLen > chunkedMaxKVLen) {
      throw new Error(`Attention kernel "${variant}" requires kvLen <= ${chunkedMaxKVLen} but got ${kvLen}.`);
    }
    if (sharedLimit < thresholds.subgroupShared) {
      throw new Error(`Attention kernel "${variant}" requires shared >= ${thresholds.subgroupShared} but got ${sharedLimit}.`);
    }
  }

  // Online decode: like subgroup decode but with no KV-length cap.
  if (normalized.startsWith('decode_online')) {
    if (!caps.hasSubgroups) {
      throw new Error(`Attention kernel "${variant}" requires subgroup support.`);
    }
    if (headDim > thresholds.subgroupMaxHeadDim) {
      throw new Error(`Attention kernel "${variant}" requires headDim <= ${thresholds.subgroupMaxHeadDim} but got ${headDim}.`);
    }
    if (sharedLimit < thresholds.subgroupShared) {
      throw new Error(`Attention kernel "${variant}" requires shared >= ${thresholds.subgroupShared} but got ${sharedLimit}.`);
    }
  }

  // Tiled variants (anything not streaming/subgroup/online/chunked) must fit
  // their tile's shared-memory footprint and head-dimension ceiling; f16 KV
  // halves the shared-memory requirement.
  if (normalized.startsWith('prefill') || normalized.startsWith('decode')) {
    const isSmall = normalized.includes('_small');
    const isStreaming = normalized.includes('_streaming');
    const isTiled = !isStreaming
      && !normalized.startsWith('decode_subgroup')
      && !normalized.startsWith('decode_online')
      && !isChunked;
    if (isTiled) {
      const requiredShared = isSmall
        ? (useF16KV ? thresholds.smallSharedF16 : thresholds.smallSharedF32)
        : (useF16KV ? thresholds.largeSharedF16 : thresholds.largeSharedF32);
      const maxHeadDim = isSmall ? thresholds.smallMaxHeadDim : thresholds.largeMaxHeadDim;
      if (headDim > maxHeadDim) {
        throw new Error(`Attention kernel "${variant}" requires headDim <= ${maxHeadDim} but got ${headDim}.`);
      }
      if (sharedLimit < requiredShared) {
        throw new Error(`Attention kernel "${variant}" requires shared >= ${requiredShared} but got ${sharedLimit}.`);
      }
    }
  }

  return normalized;
}
484
+
485
+
486
/**
 * Resolve the attention execution plan for one layer: which tier and kernel
 * variant to run and how many workgroups to dispatch.
 *
 * Precedence:
 *   1. An explicit kernel-path override (validated; under non-strict paths an
 *      invalid override falls back to capability-based selection).
 *   2. Capability-based tier selection + rule-registry variant resolution.
 *
 * @param {number} seqLen - query sequence length (1 means decode)
 * @param {number} kvLen - current KV sequence length
 * @param {number} headDim - per-head dimension
 * @param {number} numHeads - number of query heads
 * @param {string} kvDtype - KV cache dtype ('f16' enables f16-KV kernels)
 * @param {string} qDtype - Q dtype ('f16' enables full-f16 kernels)
 * @param {number} sharedLimit - workgroup shared-memory limit in bytes
 * @param {object} caps - kernel capability flags
 * @param {number} layerIdx - layer index for per-layer kernel-path overrides
 * @param {boolean} isPaged - paged KV cache layout in use
 * @param {*} kernelPath - kernel-path override source passed through to the loader
 * @returns {{tier: string, variant: string, workgroups: number,
 *            useF16KV: boolean, isDecode: boolean}}
 * @throws {Error} when a strict kernel-path override fails validation
 */
function resolveAttentionPlan(
  seqLen,
  kvLen,
  headDim,
  numHeads,
  kvDtype,
  qDtype,
  sharedLimit,
  caps,
  layerIdx,
  isPaged,
  kernelPath
) {
  const useF16KV = kvDtype === 'f16';
  const useF16Q = qDtype === 'f16';
  const isDecode = seqLen === 1;
  const phase = selectKernelRuleValue('attention', 'phase', { isDecode });
  const pathVariant = getKernelPathAttentionVariant(phase, layerIdx, kernelPath);
  const strictPath = getKernelPathStrict();

  if (pathVariant) {
    let variantOverride;
    try {
      variantOverride = validateAttentionVariant(
        pathVariant,
        isDecode,
        useF16KV,
        useF16Q,
        caps,
        headDim,
        kvLen,
        sharedLimit
      );
    } catch (error) {
      if (strictPath) {
        throw error;
      }
      const reason = error instanceof Error ? error.message : String(error);
      log.warn(
        'Attention',
        `Kernel path override "${pathVariant}" rejected; falling back to capability selection: ${reason}`
      );
      const adaptiveSelection = selectAttentionTier(headDim, seqLen, useF16KV, null, sharedLimit, caps);
      // BUGFIX: this call previously passed (numHeads, kvLen, caps, headDim,
      // sharedLimit) after the dtype flags, shifting every argument after
      // numHeads into the wrong parameter (headDim received kvLen, kvLen
      // received caps, isPaged received headDim, caps received sharedLimit,
      // and sharedLimit was undefined). Arguments now follow the declared
      // order of resolveAttentionVariant, matching the non-override path.
      const adaptiveVariant = resolveAttentionVariant(
        adaptiveSelection.tier,
        isDecode,
        useF16KV,
        useF16Q,
        numHeads,
        headDim,
        kvLen,
        isPaged,
        caps,
        sharedLimit
      );
      const workgroups = calculateAttentionWorkgroups(adaptiveSelection.tier, seqLen, numHeads);
      logKernelSelectionOnce('attention', {
        variant: adaptiveVariant,
        reason: `path_override_fallback:${adaptiveSelection.tier}`,
      });
      return {
        tier: adaptiveSelection.tier,
        variant: adaptiveVariant,
        workgroups,
        useF16KV,
        isDecode,
      };
    }
    let selectionReason = 'path_override';

    // Short prefills (seqLen <= 64) prefer the tiled prefill kernels: swap a
    // streaming override for its tiled equivalent when that variant also
    // validates on this device; otherwise keep the override untouched.
    if (!isDecode && variantOverride.startsWith('prefill_streaming') && seqLen <= 64) {
      const adaptivePrefillVariant = variantOverride.endsWith('_f16kv')
        ? 'prefill_f16kv'
        : variantOverride.endsWith('_f16')
          ? 'prefill_f16'
          : 'prefill';
      try {
        const validatedAdaptive = validateAttentionVariant(
          adaptivePrefillVariant,
          isDecode,
          useF16KV,
          useF16Q,
          caps,
          headDim,
          kvLen,
          sharedLimit
        );
        if (validatedAdaptive !== variantOverride) {
          variantOverride = validatedAdaptive;
          selectionReason = 'path_override_adaptive_prefill';
        }
      } catch {
        // Keep original strict-path variant when adaptive fallback is not valid.
      }
    }

    const tier = inferAttentionTierFromVariant(variantOverride);
    const workgroups = calculateAttentionWorkgroups(tier, seqLen, numHeads);
    logKernelSelectionOnce('attention', {
      variant: variantOverride,
      reason: `${selectionReason}:${tier}`,
    });
    return { tier, variant: variantOverride, workgroups, useF16KV, isDecode };
  }

  // No override: capability-based selection, then resolve and validate.
  const selection = selectAttentionTier(headDim, seqLen, useF16KV, null, sharedLimit, caps);
  const tier = selection.tier;
  const variant = resolveAttentionVariant(
    tier,
    isDecode,
    useF16KV,
    useF16Q,
    numHeads,
    headDim,
    kvLen,
    isPaged,
    caps,
    sharedLimit
  );
  const validatedVariant = validateAttentionVariant(
    variant,
    isDecode,
    useF16KV,
    useF16Q,
    caps,
    headDim,
    kvLen,
    sharedLimit
  );
  const workgroups = calculateAttentionWorkgroups(tier, seqLen, numHeads);

  logKernelSelectionOnce('attention', {
    variant: validatedVariant,
    reason: selection.reason,
  });

  return { tier, variant: validatedVariant, workgroups, useF16KV, isDecode };
}
623
+
624
/**
 * Test-only export that forwards directly to resolveAttentionPlan, exposing
 * the internal planner without exporting it for production use.
 * @see resolveAttentionPlan for parameter semantics
 */
export function resolveAttentionPlanForTest(
  seqLen,
  kvLen,
  headDim,
  numHeads,
  kvDtype,
  qDtype,
  sharedLimit,
  caps,
  layerIdx,
  isPaged = false,
  kernelPath = null
) {
  return resolveAttentionPlan(
    seqLen, kvLen, headDim, numHeads,
    kvDtype, qDtype, sharedLimit, caps,
    layerIdx, isPaged, kernelPath
  );
}
651
+
652
+
653
/**
 * Build the 64-byte uniform buffer for the standard attention kernels.
 * Layout must match the WGSL uniform struct field-for-field (little-endian).
 *
 * @param {GPUDevice} device - device the uniform is created on
 * @param {object|null} recorder - command recorder, when recording deferred work
 * @param {object} params - attention parameters to serialize
 * @returns {*} whatever createUniformBufferWithView returns for this kernel
 */
function createAttentionUniformBuffer(device, recorder, params) {
  return createUniformBufferWithView(
    'attention_uniforms',
    64, // 60 bytes used + 4 padding for 16-byte alignment
    (view) => {
      const u32 = (offset, value) => view.setUint32(offset, value, true);
      const f32 = (offset, value) => view.setFloat32(offset, value, true);
      u32(0, params.numHeads);
      u32(4, params.numKVHeads);
      u32(8, params.headDim);
      u32(12, params.kvLen);
      u32(16, params.seqLen);
      f32(20, params.scale);
      u32(24, params.causal ? 1 : 0);
      u32(28, params.startPos);
      f32(32, params.attnSoftcap); // Gemma 2: 50.0, 0 = disabled
      u32(36, params.slidingWindow); // Sliding window size, 0 = disabled
      u32(40, params.kvLenSource); // 0 = uniform kvLen, 1 = buffer
      u32(44, params.kvStart ?? 0);
      u32(48, params.pageSize ?? 0);
      u32(52, params.kvLayout ?? 0);
      u32(56, 0); // padding
    },
    recorder,
    device
  );
}
682
+
683
/**
 * Build the 80-byte uniform buffer for the tiered (hot/cold KV) attention
 * kernels. Layout must match the WGSL uniform struct (little-endian).
 *
 * @param {GPUDevice} device - device the uniform is created on
 * @param {object|null} recorder - command recorder, when recording deferred work
 * @param {object} params - tiered attention parameters to serialize
 * @returns {*} whatever createUniformBufferWithView returns for this kernel
 */
function createTieredAttentionUniformBuffer(device, recorder, params) {
  return createUniformBufferWithView(
    'attention_tiered_uniforms',
    80, // 68 bytes of fields; remainder pads the struct to a 16-byte multiple
    (view) => {
      const u32 = (offset, value) => view.setUint32(offset, value, true);
      const f32 = (offset, value) => view.setFloat32(offset, value, true);
      u32(0, params.numHeads);
      u32(4, params.numKVHeads);
      u32(8, params.headDim);
      u32(12, params.coldLen);
      u32(16, params.hotLen);
      u32(20, params.seqLen);
      f32(24, params.scale);
      u32(28, params.causal ? 1 : 0);
      u32(32, params.startPos);
      f32(36, params.attnSoftcap);
      u32(40, params.slidingWindow);
      u32(44, params.hotWindow);
      u32(48, params.hotStart);
      u32(52, params.coldPageSize);
      u32(56, params.coldLayout ?? 0);
      u32(60, params.hotLayout ?? 1);
      u32(64, 0); // padding
    },
    recorder,
    device
  );
}
714
+
715
/**
 * Build the 64-byte uniform buffer for the quantized tiered attention
 * kernels. Layout must match the WGSL uniform struct (little-endian).
 *
 * @param {GPUDevice} device - device the uniform is created on
 * @param {object|null} recorder - command recorder, when recording deferred work
 * @param {object} params - tiered-quant attention parameters to serialize
 * @returns {*} whatever createUniformBufferWithView returns for this kernel
 */
function createTieredQuantAttentionUniformBuffer(device, recorder, params) {
  return createUniformBufferWithView(
    'attention_tiered_quant_uniforms',
    64,
    (view) => {
      const u32 = (offset, value) => view.setUint32(offset, value, true);
      const f32 = (offset, value) => view.setFloat32(offset, value, true);
      u32(0, params.numHeads);
      u32(4, params.numKVHeads);
      u32(8, params.headDim);
      u32(12, params.coldLen);
      u32(16, params.hotLen);
      u32(20, params.seqLen);
      f32(24, params.scale);
      u32(28, params.causal ? 1 : 0);
      u32(32, params.startPos);
      f32(36, params.attnSoftcap);
      u32(40, params.slidingWindow);
      u32(44, params.hotWindow);
      u32(48, params.hotStart);
      u32(52, params.packedStride);
      u32(56, 0); // padding
    },
    recorder,
    device
  );
}
744
+
745
/**
 * Build the 64-byte uniform buffer for the BDPA attention kernels. Only the
 * first 40 bytes carry fields; the rest is explicit zero padding to match the
 * WGSL uniform struct size (little-endian).
 *
 * @param {GPUDevice} device - device the uniform is created on
 * @param {object|null} recorder - command recorder, when recording deferred work
 * @param {object} params - BDPA attention parameters to serialize
 * @returns {*} whatever createUniformBufferWithView returns for this kernel
 */
function createBDPAAttentionUniformBuffer(device, recorder, params) {
  return createUniformBufferWithView(
    'attention_bdpa_uniforms',
    64,
    (view) => {
      const u32 = (offset, value) => view.setUint32(offset, value, true);
      const f32 = (offset, value) => view.setFloat32(offset, value, true);
      u32(0, params.numHeads);
      u32(4, params.numKVHeads);
      u32(8, params.headDim);
      u32(12, params.kvLen);
      u32(16, params.seqLen);
      f32(20, params.scale);
      u32(24, params.causal ? 1 : 0);
      u32(28, params.startPos);
      f32(32, params.attnSoftcap);
      u32(36, params.slidingWindow);
      // Zero the remaining six words of padding explicitly.
      for (let offset = 40; offset <= 60; offset += 4) {
        u32(offset, 0);
      }
    },
    recorder,
    device
  );
}
775
+
776
/**
 * Normalize the optional recorder argument into an execution context:
 * the recorder (or null) plus the device to create resources on. When the
 * recorder carries no device, fall back to the global device.
 *
 * @param {object|null|undefined} recorder - optional command recorder
 * @returns {{recorder: object|null, device: GPUDevice}}
 */
function resolveAttentionExecution(recorder) {
  const device = recorder?.device || getDevice();
  return { recorder: recorder || null, device };
}
782
+
783
/**
 * Return an attention uniform buffer to the uniform cache, but only for the
 * immediate-dispatch path; when a recorder is present, the recorder owns the
 * uniform's lifetime and release is deferred.
 *
 * @param {{recorder: object|null}} execution - execution context
 * @param {GPUBuffer} uniformBuffer - uniform buffer to release
 */
function releaseAttentionUniform(execution, uniformBuffer) {
  if (execution.recorder) {
    return; // deferred path: recorder manages the uniform's lifetime
  }
  releaseUniformBuffer(uniformBuffer);
}
788
+
789
/**
 * Run an attention kernel either by recording into the execution's command
 * recorder (deferred) or by dispatching immediately when none is present.
 *
 * @param {{recorder: object|null}} execution - execution context
 * @param {object} kernel - kernel wrapper exposing record()/dispatch()
 * @param {*} pipeline - resolved compute pipeline
 * @param {*} bindGroup - bind group for this dispatch
 * @param {number} workgroups - workgroup count
 */
function dispatchAttentionKernel(execution, kernel, pipeline, bindGroup, workgroups) {
  const { recorder } = execution;
  if (recorder) {
    kernel.record(recorder, pipeline, bindGroup, workgroups);
  } else {
    kernel.dispatch(pipeline, bindGroup, workgroups);
  }
}
796
+
797
/**
 * Shared implementation behind runAttentionBDPA / recordAttentionBDPA:
 * basis-decomposed paged attention. Decode-only — seqLen must be 1.
 *
 * Validates dtypes and kvLen against the kernel config, builds the uniform
 * and bind group, dispatches one workgroup per head, and returns the output
 * as a tensor of shape [seqLen, numHeads, headDim].
 *
 * @param {object|null} recorder - Command recorder; null for immediate dispatch.
 * @param {object} Q - Query tensor (must be f16; has .dtype and .buffer).
 * @param {object} basisK - Basis K tensor (f16).
 * @param {object} basisV - Basis V tensor (f16).
 * @param {GPUBuffer} pagedK - Paged K coefficient buffer.
 * @param {GPUBuffer} pagedV - Paged V coefficient buffer.
 * @param {GPUBuffer} index - Page index buffer.
 * @param {number} numHeads - Number of query heads.
 * @param {number} headDim - Per-head dimension.
 * @param {object} [options] - See destructuring below; ropeCos/ropeSin are required GPUBuffers.
 * @returns {Promise<object>} Output tensor wrapping the result buffer.
 * @throws {Error} On seqLen != 1, non-f16 inputs, missing RoPE buffers,
 *   unsupported GPU features, kvLen over the variant limit, or a kernel
 *   config without an outputDtype.
 */
async function executeAttentionBDPA(
  recorder,
  Q,
  basisK,
  basisV,
  pagedK,
  pagedV,
  index,
  numHeads,
  headDim,
  options = {}
) {
  const execution = resolveAttentionExecution(recorder);
  const {
    seqLen = 1,
    kvLen = seqLen,
    numKVHeads = numHeads,
    scale = 1.0 / Math.sqrt(headDim),
    causal = true,
    startPos = 0,
    outputBuffer = null,
    attnSoftcap = 0,
    slidingWindow = 0,
    ropeCos = null,
    ropeSin = null,
  } = options;

  // This variant only implements single-token decode.
  if (seqLen !== 1) {
    throw new Error(`BDPA attention currently supports decode only (seqLen=1), got seqLen=${seqLen}.`);
  }
  if (Q.dtype !== 'f16' || basisK.dtype !== 'f16' || basisV.dtype !== 'f16') {
    throw new Error(`BDPA attention requires f16 Q/basis tensors; got Q=${Q.dtype}, basisK=${basisK.dtype}, basisV=${basisV.dtype}.`);
  }
  // RoPE tables must already be resident on the GPU (the shader applies RoPE itself).
  if (!(ropeCos instanceof GPUBuffer) || !(ropeSin instanceof GPUBuffer)) {
    throw new Error('BDPA attention requires GPU ropeCos/ropeSin buffers.');
  }

  const variant = 'decode_bdpa_f16';
  const caps = getKernelCapabilities();
  const config = getKernelConfig('attention_bdpa', variant);
  if (!hasRequiredFeatures(config.requires, caps)) {
    throw new Error(`BDPA attention kernel "${variant}" requires unsupported GPU features.`);
  }
  // Optional per-variant KV length ceiling from the kernel metadata.
  const maxKVLen = config.variantMetadata?.maxKVLen;
  if (Number.isFinite(maxKVLen) && kvLen > maxKVLen) {
    throw new Error(`BDPA attention requires kvLen <= ${maxKVLen} but got ${kvLen}.`);
  }

  const kernel = new AttentionBDPAKernel(execution.device);
  const pipeline = await kernel.getPipeline(variant);

  const outputDtype = config.outputDtype;
  if (!outputDtype) {
    throw new Error(`Kernel config missing outputDtype for attention_bdpa variant "${variant}".`);
  }
  const bytesPerElement = outputDtype === 'f16' ? 2 : 4;
  // Output rows are padded to the Q4K block size so downstream quantized
  // matmuls can consume the buffer directly.
  const paddedHiddenSize = padToQ4KBlock(numHeads * headDim);
  const outputSize = seqLen * paddedHiddenSize * bytesPerElement;
  const outputBuf = outputBuffer || acquireBuffer(outputSize, undefined, 'attention_bdpa_output');

  const uniformBuffer = createBDPAAttentionUniformBuffer(execution.device, execution.recorder, {
    numHeads,
    numKVHeads,
    headDim,
    kvLen,
    seqLen,
    scale,
    causal,
    startPos,
    attnSoftcap,
    slidingWindow,
  });

  // Binding order must match the WGSL @binding declarations for this kernel.
  const bindGroup = execution.device.createBindGroup({
    label: 'attention_bdpa_bind_group',
    layout: pipeline.getBindGroupLayout(0),
    entries: [
      { binding: 0, resource: { buffer: uniformBuffer } },
      { binding: 1, resource: { buffer: Q.buffer } },
      { binding: 2, resource: { buffer: basisK.buffer } },
      { binding: 3, resource: { buffer: basisV.buffer } },
      { binding: 4, resource: { buffer: pagedK } },
      { binding: 5, resource: { buffer: pagedV } },
      { binding: 6, resource: { buffer: index } },
      { binding: 7, resource: { buffer: ropeCos } },
      { binding: 8, resource: { buffer: ropeSin } },
      { binding: 9, resource: { buffer: outputBuf } },
    ],
  });

  // One workgroup per query head.
  dispatchAttentionKernel(execution, kernel, pipeline, bindGroup, numHeads);
  releaseAttentionUniform(execution, uniformBuffer);

  return createTensor(outputBuf, outputDtype, [seqLen, numHeads, headDim], 'attention_bdpa_output');
}
892
+
893
/**
 * Shared implementation behind runAttention / recordAttention: standard
 * (contiguous, ring, or paged KV) attention. Resolves a kernel plan from
 * the shapes/dtypes/device limits, builds uniforms and bindings, then
 * dispatches either directly, via recorder, or via an indirect buffer.
 *
 * @param {object|null} recorder - Command recorder; null for immediate dispatch.
 * @param {object} Q - Query tensor (has .dtype and .buffer).
 * @param {object} K - Key tensor.
 * @param {object} V - Value tensor.
 * @param {*} mask - NOTE(review): accepted but never used in this body;
 *   presumably kept for API compatibility — confirm before removing.
 * @param {number} numHeads - Number of query heads.
 * @param {number} headDim - Per-head dimension.
 * @param {object} [options] - See destructuring below.
 * @returns {Promise<object>} Output tensor of shape [seqLen, numHeads, headDim].
 * @throws {Error} If the kernel config lacks an outputDtype, or a direct
 *   dispatch would exceed maxComputeWorkgroupsPerDimension.
 */
async function executeAttention(
  recorder,
  Q,
  K,
  V,
  mask,
  numHeads,
  headDim,
  options = {}
) {
  const execution = resolveAttentionExecution(recorder);
  const {
    seqLen = 1,
    kvLen = seqLen,
    numKVHeads = numHeads,
    scale = 1.0 / Math.sqrt(headDim),
    causal = true,
    startPos = 0,
    layerIdx,
    outputBuffer = null,
    attnSoftcap = 0,
    slidingWindow = 0,
    kvLenBuffer = null,
    indirectBuffer = null,
    indirectOffset = 0,
    kvStart = 0,
    kvLayout = 'contiguous',
    kvPageTable = null,
    kvPageSize = 0,
    kernelPath = null,
  } = options;

  const limits = getDeviceLimits();
  // Shared-memory budget feeds variant selection in the plan resolver.
  const sharedLimit = limits?.maxComputeWorkgroupStorageSize ?? Infinity;
  const caps = getKernelCapabilities();

  const kvDtype = K.dtype;
  const qDtype = Q.dtype;
  const isPaged = kvLayout === 'paged';
  const plan = resolveAttentionPlan(
    seqLen,
    kvLen,
    headDim,
    numHeads,
    kvDtype,
    qDtype,
    sharedLimit,
    caps,
    layerIdx,
    isPaged,
    kernelPath
  );

  // Trace only on the recorded path to keep immediate dispatch quiet.
  if (execution.recorder) {
    trace.attn(0, `recordAttention: isDecode=${plan.isDecode}, tier=${plan.tier}, variant=${plan.variant}, seqLen=${seqLen}, kvLen=${kvLen}, numHeads=${numHeads}, headDim=${headDim}, useF16KV=${plan.useF16KV}`);
  }

  const kernel = new AttentionKernel(execution.device);
  const pipeline = await kernel.getPipeline(plan.variant);

  const outputConfig = getKernelConfig('attention', plan.variant);
  const outputDtype = outputConfig.outputDtype;
  if (!outputDtype) {
    // NOTE(review): the recorded and immediate paths throw differently-worded
    // errors for the same condition — looks like a merge artifact; consider
    // unifying the message.
    if (execution.recorder) {
      throw new Error(`Kernel config missing outputDtype for attention variant "${plan.variant}".`);
    }
    throw new Error(`[Attention] outputDtype is required for variant "${plan.variant}".`);
  }
  const bytesPerElement = outputDtype === 'f16' ? 2 : 4;
  // Output rows padded to the Q4K block size for downstream quantized consumers.
  const paddedHiddenSize = padToQ4KBlock(numHeads * headDim);
  const outputSize = seqLen * paddedHiddenSize * bytesPerElement;
  const outputBuf = outputBuffer || acquireBuffer(outputSize, undefined, 'attention_output');

  const uniformBuffer = createAttentionUniformBuffer(execution.device, execution.recorder, {
    numHeads,
    numKVHeads,
    headDim,
    kvLen,
    seqLen,
    scale,
    causal,
    startPos,
    attnSoftcap,
    slidingWindow,
    // 1 = read the effective KV length from a GPU buffer instead of the uniform.
    kvLenSource: kvLenBuffer ? 1 : 0,
    kvStart,
    pageSize: kvPageSize,
    // Layout enum: 0 = contiguous, 1 = ring, 2 = paged.
    kvLayout: kvLayout === 'paged' ? 2 : (kvLayout === 'ring' ? 1 : 0),
  });

  // Fallback buffers keep the bind group layout uniform across variants even
  // when the optional kvLen / page-table bindings are unused.
  const kvLenBinding = kvLenBuffer || getKvLenFallbackBuffer(execution.device);
  const pageTableBinding = kvPageTable || getPageTableFallbackBuffer(execution.device);
  const bindGroup = execution.device.createBindGroup({
    label: 'attention_bind_group',
    layout: pipeline.getBindGroupLayout(0),
    entries: [
      { binding: 0, resource: { buffer: uniformBuffer } },
      { binding: 1, resource: { buffer: Q.buffer } },
      { binding: 2, resource: { buffer: K.buffer } },
      { binding: 3, resource: { buffer: V.buffer } },
      { binding: 4, resource: { buffer: outputBuf } },
      { binding: 5, resource: { buffer: kvLenBinding } },
      { binding: 6, resource: { buffer: pageTableBinding } },
    ],
  });

  // The workgroup-count limit only applies to direct dispatch; indirect
  // dispatch reads its counts from the GPU buffer at execution time.
  if (!indirectBuffer && limits && plan.workgroups > limits.maxComputeWorkgroupsPerDimension) {
    throw new Error(
      `Attention dispatch requires ${plan.workgroups} workgroups but device limit is ` +
      `${limits.maxComputeWorkgroupsPerDimension}. Reduce prompt length or use streaming attention.`
    );
  }

  if (indirectBuffer) {
    if (execution.recorder) {
      recordDispatchIndirect(execution.recorder, pipeline, bindGroup, indirectBuffer, indirectOffset, 'attention');
    } else {
      dispatchIndirect(execution.device, pipeline, bindGroup, indirectBuffer, indirectOffset, 'attention');
    }
  } else {
    dispatchAttentionKernel(execution, kernel, pipeline, bindGroup, plan.workgroups);
  }

  releaseAttentionUniform(execution, uniformBuffer);

  return createTensor(outputBuf, outputDtype, [seqLen, numHeads, headDim], 'attention_output');
}
1020
+
1021
/**
 * Shared implementation behind runAttentionTiered / recordAttentionTiered:
 * attention over a two-tier KV cache — a "hot" recent window plus a "cold"
 * (optionally paged) history — both held as f16.
 *
 * @param {object|null} recorder - Command recorder; null for immediate dispatch.
 * @param {object} Q - Query tensor (has .dtype and .buffer).
 * @param {object} hotK - Hot-tier K tensor (must be f16).
 * @param {object} hotV - Hot-tier V tensor.
 * @param {object} coldK - Cold-tier K tensor (must be f16).
 * @param {object} coldV - Cold-tier V tensor.
 * @param {number} numHeads - Number of query heads.
 * @param {number} headDim - Per-head dimension.
 * @param {object} [options] - See destructuring below. coldLayout defaults
 *   to 2 and hotLayout to 1 — presumably the paged/ring layout enum codes
 *   used elsewhere in this file; confirm against the shader.
 * @returns {Promise<object>} Output tensor of shape [seqLen, numHeads, headDim].
 * @throws {Error} If coldLen + hotLen exceeds the tiered limit, the GPU
 *   lacks required features, KV tensors are not f16, or the kernel config
 *   lacks an outputDtype.
 */
async function executeAttentionTiered(
  recorder,
  Q,
  hotK,
  hotV,
  coldK,
  coldV,
  numHeads,
  headDim,
  options = {}
) {
  const execution = resolveAttentionExecution(recorder);
  const {
    seqLen = 1,
    coldLen = 0,
    hotLen = 0,
    numKVHeads = numHeads,
    scale = 1.0 / Math.sqrt(headDim),
    causal = true,
    startPos = 0,
    outputBuffer = null,
    attnSoftcap = 0,
    slidingWindow = 0,
    hotWindow = hotLen,
    hotStart = 0,
    coldPageTable = null,
    coldPageSize = 0,
    coldLayout = 2,
    hotLayout = 1,
  } = options;

  const totalLen = coldLen + hotLen;
  const maxKVLen = getTieredMaxKVLen();
  if (totalLen > maxKVLen) {
    throw new Error(`Tiered attention requires total KV len <= ${maxKVLen} but got ${totalLen}.`);
  }

  // useF16 drives variant selection (Q dtype included); useF16KV is the
  // hard requirement enforced below.
  const useF16 = Q.dtype === 'f16' && hotK.dtype === 'f16' && coldK.dtype === 'f16';
  const useF16KV = hotK.dtype === 'f16' && coldK.dtype === 'f16';
  const variant = selectKernelRuleValue('attention', 'tieredVariant', { useF16 });
  const caps = getKernelCapabilities();
  const config = getKernelConfig('attention_tiered', variant);
  if (!hasRequiredFeatures(config.requires, caps)) {
    throw new Error(`Tiered attention kernel "${variant}" requires unsupported GPU features.`);
  }
  if (!useF16KV) {
    throw new Error('Tiered attention requires f16 KV buffers.');
  }

  const kernel = new AttentionTieredKernel(execution.device);
  const pipeline = await kernel.getPipeline(variant);

  const outputDtype = config.outputDtype;
  if (!outputDtype) {
    throw new Error(`Kernel config missing outputDtype for attention_tiered variant "${variant}".`);
  }
  const bytesPerElement = outputDtype === 'f16' ? 2 : 4;
  // Output rows padded to the Q4K block size for downstream quantized consumers.
  const paddedHiddenSize = padToQ4KBlock(numHeads * headDim);
  const outputSize = seqLen * paddedHiddenSize * bytesPerElement;
  const outputBuf = outputBuffer || acquireBuffer(outputSize, undefined, 'attention_tiered_output');

  const uniformBuffer = createTieredAttentionUniformBuffer(execution.device, execution.recorder, {
    numHeads,
    numKVHeads,
    headDim,
    coldLen,
    hotLen,
    seqLen,
    scale,
    causal,
    startPos,
    attnSoftcap,
    slidingWindow,
    hotWindow,
    hotStart,
    coldPageSize,
    coldLayout,
    hotLayout,
  });

  // Fallback keeps the page-table binding valid when the cold tier is not paged.
  const pageTableBinding = coldPageTable || getPageTableFallbackBuffer(execution.device);
  const bindGroup = execution.device.createBindGroup({
    label: 'attention_tiered_bind_group',
    layout: pipeline.getBindGroupLayout(0),
    entries: [
      { binding: 0, resource: { buffer: uniformBuffer } },
      { binding: 1, resource: { buffer: Q.buffer } },
      { binding: 2, resource: { buffer: hotK.buffer } },
      { binding: 3, resource: { buffer: hotV.buffer } },
      { binding: 4, resource: { buffer: coldK.buffer } },
      { binding: 5, resource: { buffer: coldV.buffer } },
      { binding: 6, resource: { buffer: outputBuf } },
      { binding: 7, resource: { buffer: pageTableBinding } },
    ],
  });

  // One workgroup per query head.
  dispatchAttentionKernel(execution, kernel, pipeline, bindGroup, numHeads);
  releaseAttentionUniform(execution, uniformBuffer);

  return createTensor(outputBuf, outputDtype, [seqLen, numHeads, headDim], 'attention_tiered_output');
}
1122
+
1123
/**
 * Shared implementation behind runAttentionTieredQuant /
 * recordAttentionTieredQuant: tiered attention with a quantized cold tier
 * (packed values + per-block scales) and an f32 query.
 *
 * @param {object|null} recorder - Command recorder; null for immediate dispatch.
 * @param {object} Q - Query tensor (must be f32; has .dtype and .buffer).
 * @param {object} hotK - Hot-tier K tensor.
 * @param {object} hotV - Hot-tier V tensor.
 * @param {GPUBuffer} coldPackedK - Quantized/packed cold-tier K buffer.
 * @param {GPUBuffer} coldPackedV - Quantized/packed cold-tier V buffer.
 * @param {GPUBuffer} coldScalesK - Dequant scales for cold K.
 * @param {GPUBuffer} coldScalesV - Dequant scales for cold V.
 * @param {number} numHeads - Number of query heads.
 * @param {number} headDim - Per-head dimension.
 * @param {object} [options] - See destructuring below; packedStride (> 0)
 *   is required. mode (default 'int8') selects the quant variant.
 * @returns {Promise<object>} Output tensor of shape [seqLen, numHeads, headDim].
 * @throws {Error} On KV length over the quant-tier limit, invalid
 *   packedStride, non-f32 Q, unsupported GPU features, or a kernel config
 *   without an outputDtype.
 */
async function executeAttentionTieredQuant(
  recorder,
  Q,
  hotK,
  hotV,
  coldPackedK,
  coldPackedV,
  coldScalesK,
  coldScalesV,
  numHeads,
  headDim,
  options = {}
) {
  const execution = resolveAttentionExecution(recorder);
  const {
    seqLen = 1,
    coldLen = 0,
    hotLen = 0,
    numKVHeads = numHeads,
    scale = 1.0 / Math.sqrt(headDim),
    causal = true,
    startPos = 0,
    outputBuffer = null,
    attnSoftcap = 0,
    slidingWindow = 0,
    hotWindow = hotLen,
    hotStart = 0,
    packedStride = 0,
    mode = 'int8',
  } = options;

  const totalLen = coldLen + hotLen;
  const maxKVLen = getTieredQuantMaxKVLen();
  if (totalLen > maxKVLen) {
    throw new Error(`Tiered quant attention requires total KV len <= ${maxKVLen} but got ${totalLen}.`);
  }
  // packedStride defaults to 0, so callers must supply a real stride.
  if (!Number.isFinite(packedStride) || packedStride <= 0) {
    throw new Error('Tiered quant attention requires packedStride > 0.');
  }

  if (Q.dtype !== 'f32') {
    throw new Error('Tiered quant attention requires f32 Q.');
  }

  const variant = selectKernelRuleValue('attention', 'tieredQuantVariant', { mode });
  const caps = getKernelCapabilities();
  const config = getKernelConfig('attention_tiered_quant', variant);
  if (!hasRequiredFeatures(config.requires, caps)) {
    throw new Error(`Tiered quant attention kernel "${variant}" requires unsupported GPU features.`);
  }

  const kernel = new AttentionTieredQuantKernel(execution.device);
  const pipeline = await kernel.getPipeline(variant);

  const outputDtype = config.outputDtype;
  if (!outputDtype) {
    throw new Error(`Kernel config missing outputDtype for attention_tiered_quant variant "${variant}".`);
  }
  const bytesPerElement = outputDtype === 'f16' ? 2 : 4;
  // Output rows padded to the Q4K block size for downstream quantized consumers.
  const paddedHiddenSize = padToQ4KBlock(numHeads * headDim);
  const outputSize = seqLen * paddedHiddenSize * bytesPerElement;
  const outputBuf = outputBuffer || acquireBuffer(outputSize, undefined, 'attention_tiered_quant_output');

  const uniformBuffer = createTieredQuantAttentionUniformBuffer(execution.device, execution.recorder, {
    numHeads,
    numKVHeads,
    headDim,
    coldLen,
    hotLen,
    seqLen,
    scale,
    causal,
    startPos,
    attnSoftcap,
    slidingWindow,
    hotWindow,
    hotStart,
    packedStride,
  });

  // Binding order must match the WGSL @binding declarations for this kernel.
  const bindGroup = execution.device.createBindGroup({
    label: 'attention_tiered_quant_bind_group',
    layout: pipeline.getBindGroupLayout(0),
    entries: [
      { binding: 0, resource: { buffer: uniformBuffer } },
      { binding: 1, resource: { buffer: Q.buffer } },
      { binding: 2, resource: { buffer: hotK.buffer } },
      { binding: 3, resource: { buffer: hotV.buffer } },
      { binding: 4, resource: { buffer: coldPackedK } },
      { binding: 5, resource: { buffer: coldPackedV } },
      { binding: 6, resource: { buffer: coldScalesK } },
      { binding: 7, resource: { buffer: coldScalesV } },
      { binding: 8, resource: { buffer: outputBuf } },
    ],
  });

  // One workgroup per query head.
  dispatchAttentionKernel(execution, kernel, pipeline, bindGroup, numHeads);
  releaseAttentionUniform(execution, uniformBuffer);

  return createTensor(outputBuf, outputDtype, [seqLen, numHeads, headDim], 'attention_tiered_quant_output');
}
1224
+
1225
/**
 * Immediate-mode entry point for BDPA attention: dispatches without a
 * command recorder. See executeAttentionBDPA for parameter semantics.
 */
export async function runAttentionBDPA(Q, basisK, basisV, pagedK, pagedV, index, numHeads, headDim, options = {}) {
  return executeAttentionBDPA(null, Q, basisK, basisV, pagedK, pagedV, index, numHeads, headDim, options);
}
1238
+
1239
/**
 * Recorded-mode entry point for BDPA attention: captures the dispatch onto
 * the given command recorder. See executeAttentionBDPA for semantics.
 */
export async function recordAttentionBDPA(recorder, Q, basisK, basisV, pagedK, pagedV, index, numHeads, headDim, options = {}) {
  return executeAttentionBDPA(recorder, Q, basisK, basisV, pagedK, pagedV, index, numHeads, headDim, options);
}
1253
+
1254
/**
 * Immediate-mode entry point for standard attention: dispatches without a
 * command recorder. See executeAttention for parameter semantics.
 */
export async function runAttention(Q, K, V, mask, numHeads, headDim, options = {}) {
  return executeAttention(null, Q, K, V, mask, numHeads, headDim, options);
}
1265
+
1266
/**
 * Recorded-mode entry point for standard attention: captures the dispatch
 * onto the given command recorder. See executeAttention for semantics.
 */
export async function recordAttention(recorder, Q, K, V, mask, numHeads, headDim, options = {}) {
  return executeAttention(recorder, Q, K, V, mask, numHeads, headDim, options);
}
1278
+
1279
/**
 * Immediate-mode entry point for tiered (hot/cold KV) attention.
 * See executeAttentionTiered for parameter semantics.
 */
export async function runAttentionTiered(Q, hotK, hotV, coldK, coldV, numHeads, headDim, options = {}) {
  return executeAttentionTiered(null, Q, hotK, hotV, coldK, coldV, numHeads, headDim, options);
}
1291
+
1292
/**
 * Recorded-mode entry point for tiered (hot/cold KV) attention: captures
 * the dispatch onto the given recorder. See executeAttentionTiered.
 */
export async function recordAttentionTiered(recorder, Q, hotK, hotV, coldK, coldV, numHeads, headDim, options = {}) {
  return executeAttentionTiered(recorder, Q, hotK, hotV, coldK, coldV, numHeads, headDim, options);
}
1305
+
1306
/**
 * Immediate-mode entry point for tiered attention with a quantized cold
 * tier. See executeAttentionTieredQuant for parameter semantics.
 */
export async function runAttentionTieredQuant(
  Q,
  hotK,
  hotV,
  coldPackedK,
  coldPackedV,
  coldScalesK,
  coldScalesV,
  numHeads,
  headDim,
  options = {}
) {
  return executeAttentionTieredQuant(
    null, Q, hotK, hotV,
    coldPackedK, coldPackedV, coldScalesK, coldScalesV,
    numHeads, headDim, options
  );
}
1332
+
1333
/**
 * Recorded-mode entry point for tiered attention with a quantized cold
 * tier: captures the dispatch onto the given recorder. See
 * executeAttentionTieredQuant for parameter semantics.
 */
export async function recordAttentionTieredQuant(
  recorder,
  Q,
  hotK,
  hotV,
  coldPackedK,
  coldPackedV,
  coldScalesK,
  coldScalesV,
  numHeads,
  headDim,
  options = {}
) {
  return executeAttentionTieredQuant(
    recorder, Q, hotK, hotV,
    coldPackedK, coldPackedV, coldScalesK, coldScalesV,
    numHeads, headDim, options
  );
}