@simulatte/doppler 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (355)
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +16 -23
  3. package/package.json +30 -32
  4. package/src/adapters/adapter-registry.js +12 -1
  5. package/src/adapters/lora-loader.js +23 -6
  6. package/src/bridge/extension-client.d.ts +5 -0
  7. package/src/bridge/extension-client.js +40 -0
  8. package/src/bridge/index.d.ts +2 -1
  9. package/src/bridge/index.js +6 -4
  10. package/src/browser/browser-converter.js +31 -1
  11. package/src/browser/file-picker.js +6 -0
  12. package/src/browser/safetensors-parser-browser.js +84 -1
  13. package/src/browser/shard-io-browser.js +2 -2
  14. package/src/browser/tensor-source-download.js +8 -2
  15. package/src/browser/tensor-source-http.d.ts +1 -0
  16. package/src/browser/tensor-source-http.js +5 -1
  17. package/src/client/doppler-api.browser.js +20 -4
  18. package/src/client/doppler-api.js +19 -3
  19. package/src/client/doppler-provider/generation.js +12 -0
  20. package/src/client/doppler-provider/model-manager.d.ts +10 -0
  21. package/src/client/doppler-provider/model-manager.js +91 -19
  22. package/src/client/doppler-provider/source-runtime.d.ts +2 -1
  23. package/src/client/doppler-provider/source-runtime.js +132 -13
  24. package/src/client/doppler-registry.json +5 -20
  25. package/src/config/backward-registry-loader.js +17 -2
  26. package/src/config/execution-v0-contract-check.js +113 -15
  27. package/src/config/kernel-path-contract-check.js +57 -29
  28. package/src/config/kernel-path-loader.d.ts +5 -0
  29. package/src/config/kernel-path-loader.js +18 -36
  30. package/src/config/kernels/kernel-ref-digests.js +1 -1
  31. package/src/config/kernels/registry.js +14 -1
  32. package/src/config/kernels/registry.json +81 -5
  33. package/src/config/loader.d.ts +1 -1
  34. package/src/config/loader.js +15 -2
  35. package/src/config/merge-contract-check.js +66 -4
  36. package/src/config/merge-helpers.js +128 -7
  37. package/src/config/merge.d.ts +1 -0
  38. package/src/config/merge.js +10 -0
  39. package/src/config/param-validator.js +47 -2
  40. package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
  41. package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
  42. package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
  43. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  44. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  45. package/src/config/presets/kernel-paths/registry.json +43 -8
  46. package/src/config/presets/models/gemma2.json +3 -2
  47. package/src/config/presets/models/gemma3.json +2 -0
  48. package/src/config/presets/models/qwen3.json +4 -3
  49. package/src/config/presets/models/qwen3_5.json +16 -0
  50. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
  51. package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
  52. package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
  53. package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
  54. package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
  55. package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
  56. package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
  57. package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
  58. package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
  59. package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
  60. package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
  61. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  62. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  63. package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
  64. package/src/config/runtime.js +6 -1
  65. package/src/config/schema/conversion.schema.d.ts +1 -0
  66. package/src/config/schema/debug.schema.d.ts +5 -0
  67. package/src/config/schema/doppler.schema.js +16 -21
  68. package/src/config/schema/inference-defaults.schema.js +3 -3
  69. package/src/config/schema/kernel-path.schema.d.ts +5 -1
  70. package/src/config/schema/kernel-thresholds.schema.js +12 -4
  71. package/src/config/schema/manifest.schema.d.ts +3 -2
  72. package/src/config/schema/manifest.schema.js +17 -4
  73. package/src/config/schema/storage.schema.js +1 -1
  74. package/src/config/training-defaults.js +30 -22
  75. package/src/converter/conversion-plan.js +104 -11
  76. package/src/converter/core.d.ts +7 -0
  77. package/src/converter/core.js +16 -9
  78. package/src/converter/execution-v0-manifest.js +4 -1
  79. package/src/converter/index.d.ts +1 -0
  80. package/src/converter/index.js +1 -0
  81. package/src/converter/manifest-inference.js +50 -29
  82. package/src/converter/parsers/diffusion.js +0 -3
  83. package/src/converter/parsers/transformer.js +4 -0
  84. package/src/converter/quantization-info.js +40 -16
  85. package/src/converter/quantizer.js +19 -12
  86. package/src/converter/rope-config.js +8 -6
  87. package/src/converter/shard-packer.d.ts +1 -1
  88. package/src/converter/shard-packer.js +4 -1
  89. package/src/converter/tokenizer-utils.d.ts +1 -0
  90. package/src/converter/tokenizer-utils.js +4 -1
  91. package/src/debug/config.js +123 -11
  92. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  93. package/src/debug/signals.js +7 -1
  94. package/src/debug/tensor.d.ts +2 -0
  95. package/src/debug/tensor.js +13 -2
  96. package/src/distribution/p2p-control-plane.js +52 -12
  97. package/src/distribution/p2p-observability.js +43 -7
  98. package/src/distribution/p2p-webrtc-browser.js +20 -0
  99. package/src/distribution/shard-delivery.js +83 -27
  100. package/src/formats/gguf/types.js +33 -16
  101. package/src/formats/rdrr/groups.d.ts +12 -4
  102. package/src/formats/rdrr/groups.js +3 -6
  103. package/src/formats/rdrr/parsing.d.ts +4 -0
  104. package/src/formats/rdrr/parsing.js +53 -3
  105. package/src/formats/rdrr/types.d.ts +2 -1
  106. package/src/gpu/command-recorder.js +86 -61
  107. package/src/gpu/device.d.ts +1 -0
  108. package/src/gpu/device.js +73 -19
  109. package/src/gpu/kernel-tuner/benchmarks.js +326 -316
  110. package/src/gpu/kernel-tuner/cache.js +71 -4
  111. package/src/gpu/kernel-tuner/tuner.js +22 -4
  112. package/src/gpu/kernels/attention.js +15 -34
  113. package/src/gpu/kernels/backward/adam.js +62 -58
  114. package/src/gpu/kernels/backward/attention_backward.js +257 -169
  115. package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
  116. package/src/gpu/kernels/cast.js +191 -149
  117. package/src/gpu/kernels/check-stop.js +33 -44
  118. package/src/gpu/kernels/conv2d.js +27 -17
  119. package/src/gpu/kernels/cross_entropy_loss.js +21 -15
  120. package/src/gpu/kernels/depthwise_conv2d.js +36 -26
  121. package/src/gpu/kernels/dequant.js +178 -126
  122. package/src/gpu/kernels/energy.d.ts +3 -21
  123. package/src/gpu/kernels/energy.js +111 -88
  124. package/src/gpu/kernels/feature-check.js +1 -1
  125. package/src/gpu/kernels/fused_ffn.js +84 -65
  126. package/src/gpu/kernels/fused_matmul_residual.js +56 -33
  127. package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
  128. package/src/gpu/kernels/gather.js +33 -15
  129. package/src/gpu/kernels/gelu.js +19 -11
  130. package/src/gpu/kernels/grouped_pointwise_conv2d.js +33 -23
  131. package/src/gpu/kernels/groupnorm.js +34 -23
  132. package/src/gpu/kernels/index.d.ts +8 -0
  133. package/src/gpu/kernels/index.js +6 -0
  134. package/src/gpu/kernels/kv-quantize.js +5 -2
  135. package/src/gpu/kernels/layernorm.js +35 -19
  136. package/src/gpu/kernels/logit-merge.js +5 -3
  137. package/src/gpu/kernels/matmul-selection.js +47 -4
  138. package/src/gpu/kernels/matmul.d.ts +2 -0
  139. package/src/gpu/kernels/matmul.js +59 -40
  140. package/src/gpu/kernels/modulate.js +23 -15
  141. package/src/gpu/kernels/moe.js +221 -175
  142. package/src/gpu/kernels/pixel_shuffle.js +22 -14
  143. package/src/gpu/kernels/relu.js +18 -10
  144. package/src/gpu/kernels/repeat_channels.js +25 -17
  145. package/src/gpu/kernels/residual.js +37 -27
  146. package/src/gpu/kernels/rmsnorm.js +66 -43
  147. package/src/gpu/kernels/rope.js +3 -0
  148. package/src/gpu/kernels/sample.js +27 -38
  149. package/src/gpu/kernels/sana_linear_attention.js +18 -10
  150. package/src/gpu/kernels/scale.js +18 -11
  151. package/src/gpu/kernels/shader-cache.js +4 -2
  152. package/src/gpu/kernels/silu.js +120 -72
  153. package/src/gpu/kernels/softmax.js +44 -25
  154. package/src/gpu/kernels/split_qg.d.ts +50 -0
  155. package/src/gpu/kernels/split_qg.js +46 -0
  156. package/src/gpu/kernels/split_qg.wgsl +58 -0
  157. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  158. package/src/gpu/kernels/split_qkv.js +23 -13
  159. package/src/gpu/kernels/transpose.js +18 -10
  160. package/src/gpu/kernels/transpose.wgsl +5 -3
  161. package/src/gpu/kernels/upsample2d.js +21 -13
  162. package/src/gpu/kernels/utils.js +20 -13
  163. package/src/gpu/partitioned-buffer-pool.js +10 -2
  164. package/src/gpu/perf-guards.js +2 -9
  165. package/src/gpu/profiler.js +27 -22
  166. package/src/gpu/readback-utils.d.ts +16 -0
  167. package/src/gpu/readback-utils.js +41 -0
  168. package/src/gpu/submit-tracker.js +13 -0
  169. package/src/gpu/uniform-cache.d.ts +1 -0
  170. package/src/gpu/uniform-cache.js +30 -9
  171. package/src/gpu/weight-buffer.d.ts +1 -1
  172. package/src/gpu/weight-buffer.js +1 -1
  173. package/src/hotswap/intent-bundle.js +6 -0
  174. package/src/hotswap/manifest.d.ts +10 -1
  175. package/src/hotswap/manifest.js +12 -2
  176. package/src/hotswap/runtime.js +30 -8
  177. package/src/index-browser.d.ts +44 -0
  178. package/src/index-browser.js +14 -0
  179. package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
  180. package/src/inference/browser-harness-contract-helpers.js +28 -0
  181. package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
  182. package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
  183. package/src/inference/browser-harness-model-helpers.d.ts +16 -0
  184. package/src/inference/browser-harness-model-helpers.js +217 -0
  185. package/src/inference/browser-harness-report-helpers.d.ts +7 -0
  186. package/src/inference/browser-harness-report-helpers.js +42 -0
  187. package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
  188. package/src/inference/browser-harness-runtime-helpers.js +415 -0
  189. package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
  190. package/src/inference/browser-harness-suite-helpers.js +268 -0
  191. package/src/inference/browser-harness-text-helpers.d.ts +27 -0
  192. package/src/inference/browser-harness-text-helpers.js +788 -0
  193. package/src/inference/browser-harness.d.ts +8 -0
  194. package/src/inference/browser-harness.js +149 -1996
  195. package/src/inference/kv-cache/base.js +140 -94
  196. package/src/inference/kv-cache/tiered.js +5 -3
  197. package/src/inference/moe-router.js +88 -56
  198. package/src/inference/multi-model-network.js +5 -3
  199. package/src/inference/network-evolution.d.ts +11 -2
  200. package/src/inference/network-evolution.js +20 -21
  201. package/src/inference/pipelines/context.d.ts +3 -0
  202. package/src/inference/pipelines/context.js +142 -2
  203. package/src/inference/pipelines/diffusion/helpers.js +10 -2
  204. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  205. package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
  206. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
  207. package/src/inference/pipelines/diffusion/vae.js +3 -7
  208. package/src/inference/pipelines/energy/pipeline.js +27 -21
  209. package/src/inference/pipelines/energy/quintel.d.ts +5 -0
  210. package/src/inference/pipelines/energy/quintel.js +11 -0
  211. package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
  212. package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
  213. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  214. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  215. package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
  216. package/src/inference/pipelines/text/attention/projections.js +192 -112
  217. package/src/inference/pipelines/text/attention/record.js +77 -14
  218. package/src/inference/pipelines/text/attention/run.js +112 -14
  219. package/src/inference/pipelines/text/config.js +17 -4
  220. package/src/inference/pipelines/text/embed.js +2 -8
  221. package/src/inference/pipelines/text/execution-plan.js +46 -23
  222. package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
  223. package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
  224. package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
  225. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
  226. package/src/inference/pipelines/text/execution-v0.js +62 -1013
  227. package/src/inference/pipelines/text/generator-runtime.js +5 -0
  228. package/src/inference/pipelines/text/generator-steps.d.ts +52 -0
  229. package/src/inference/pipelines/text/generator-steps.js +340 -221
  230. package/src/inference/pipelines/text/generator.js +56 -40
  231. package/src/inference/pipelines/text/init.d.ts +13 -0
  232. package/src/inference/pipelines/text/init.js +94 -25
  233. package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
  234. package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
  235. package/src/inference/pipelines/text/kernel-trace.js +6 -0
  236. package/src/inference/pipelines/text/layer.js +4 -9
  237. package/src/inference/pipelines/text/linear-attention.d.ts +15 -0
  238. package/src/inference/pipelines/text/linear-attention.js +113 -9
  239. package/src/inference/pipelines/text/logits/gpu.js +12 -7
  240. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  241. package/src/inference/pipelines/text/logits/index.js +13 -12
  242. package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
  243. package/src/inference/pipelines/text/logits/utils.js +9 -0
  244. package/src/inference/pipelines/text/lora-apply.js +50 -32
  245. package/src/inference/pipelines/text/model-load.js +282 -104
  246. package/src/inference/pipelines/text/moe-cache.js +5 -4
  247. package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
  248. package/src/inference/pipelines/text/moe-cpu.js +42 -38
  249. package/src/inference/pipelines/text/moe-gpu.js +110 -86
  250. package/src/inference/pipelines/text/ops.js +90 -90
  251. package/src/inference/pipelines/text/probes.js +9 -9
  252. package/src/inference/pipelines/text/sampling.js +52 -6
  253. package/src/inference/pipelines/text/weights.js +17 -7
  254. package/src/inference/pipelines/text.js +13 -1
  255. package/src/inference/speculative.d.ts +2 -2
  256. package/src/inference/speculative.js +4 -18
  257. package/src/inference/test-harness.d.ts +1 -1
  258. package/src/inference/test-harness.js +17 -7
  259. package/src/inference/tokenizer.d.ts +0 -5
  260. package/src/inference/tokenizer.js +4 -23
  261. package/src/inference/tokenizers/bpe.js +9 -0
  262. package/src/inference/tokenizers/bundled.js +20 -0
  263. package/src/inference/tokenizers/sentencepiece.js +12 -0
  264. package/src/loader/doppler-loader.js +38 -22
  265. package/src/loader/dtype-utils.js +3 -44
  266. package/src/loader/embedding-loader.js +7 -3
  267. package/src/loader/experts/expert-cache.js +13 -6
  268. package/src/loader/experts/expert-loader.js +10 -6
  269. package/src/loader/final-weights-loader.js +10 -4
  270. package/src/loader/layer-loader.js +2 -1
  271. package/src/loader/loader-state.js +2 -2
  272. package/src/loader/memory-monitor.js +8 -0
  273. package/src/loader/multi-model-loader.d.ts +14 -0
  274. package/src/loader/multi-model-loader.js +70 -24
  275. package/src/loader/shard-cache.js +84 -14
  276. package/src/loader/shard-resolver.js +25 -3
  277. package/src/loader/tensors/tensor-loader.js +214 -144
  278. package/src/loader/tensors/tensor-reader.js +76 -19
  279. package/src/loader/weight-downcast.js +1 -1
  280. package/src/memory/buffer-pool.d.ts +9 -1
  281. package/src/memory/buffer-pool.js +109 -44
  282. package/src/memory/unified-detect.js +1 -1
  283. package/src/rules/inference/dtype.rules.json +5 -0
  284. package/src/rules/inference/kernel-path.rules.json +24 -8
  285. package/src/rules/kernels/split-qg.rules.json +6 -0
  286. package/src/rules/rule-registry.js +27 -1
  287. package/src/storage/backends/opfs-store.js +68 -24
  288. package/src/storage/downloader.js +365 -83
  289. package/src/storage/index.d.ts +3 -0
  290. package/src/storage/index.js +3 -0
  291. package/src/storage/preflight.d.ts +2 -2
  292. package/src/storage/preflight.js +24 -2
  293. package/src/storage/quickstart-downloader.js +11 -5
  294. package/src/storage/registry.js +10 -4
  295. package/src/storage/reports.js +1 -1
  296. package/src/storage/shard-manager.d.ts +15 -1
  297. package/src/storage/shard-manager.js +55 -6
  298. package/src/storage/source-artifact-store.d.ts +52 -0
  299. package/src/storage/source-artifact-store.js +234 -0
  300. package/src/tooling/command-api-constants.d.ts +9 -0
  301. package/src/tooling/command-api-constants.js +9 -0
  302. package/src/tooling/command-api-family-normalizers.d.ts +9 -0
  303. package/src/tooling/command-api-family-normalizers.js +343 -0
  304. package/src/tooling/command-api-helpers.d.ts +25 -0
  305. package/src/tooling/command-api-helpers.js +262 -0
  306. package/src/tooling/command-api.js +16 -602
  307. package/src/tooling/command-envelope.js +4 -1
  308. package/src/tooling/command-runner-shared.js +52 -18
  309. package/src/tooling/conversion-config-materializer.js +3 -5
  310. package/src/tooling/lean-execution-contract.js +150 -3
  311. package/src/tooling/node-browser-command-runner.js +161 -271
  312. package/src/tooling/node-command-runner.js +29 -3
  313. package/src/tooling/node-converter.js +30 -1
  314. package/src/tooling/node-source-runtime.d.ts +1 -1
  315. package/src/tooling/node-source-runtime.js +120 -3
  316. package/src/tooling/node-webgpu.js +24 -21
  317. package/src/tooling/opfs-cache.js +21 -4
  318. package/src/tooling/runtime-input-composition.d.ts +38 -0
  319. package/src/tooling/runtime-input-composition.js +86 -0
  320. package/src/tooling/source-runtime-bundle.d.ts +40 -5
  321. package/src/tooling/source-runtime-bundle.js +261 -34
  322. package/src/tooling/source-runtime-materializer.d.ts +6 -0
  323. package/src/tooling/source-runtime-materializer.js +93 -0
  324. package/src/training/attention-backward.js +32 -17
  325. package/src/training/autograd.js +80 -52
  326. package/src/training/checkpoint-watch.d.ts +2 -1
  327. package/src/training/checkpoint-watch.js +39 -6
  328. package/src/training/checkpoint.js +40 -11
  329. package/src/training/clip.js +2 -1
  330. package/src/training/datasets/token-batch.js +20 -8
  331. package/src/training/distillation/checkpoint-watch.js +1 -0
  332. package/src/training/distillation/student-fixture.d.ts +22 -0
  333. package/src/training/distillation/student-fixture.js +846 -0
  334. package/src/training/distillation/suite-data.d.ts +45 -0
  335. package/src/training/distillation/suite-data.js +189 -0
  336. package/src/training/lora-pipeline.js +4 -7
  337. package/src/training/lora.js +26 -12
  338. package/src/training/loss.js +5 -6
  339. package/src/training/objectives/cross_entropy.js +2 -5
  340. package/src/training/objectives/distill_kd.js +4 -8
  341. package/src/training/objectives/distill_triplet.js +4 -8
  342. package/src/training/objectives/ul_stage2_base.js +4 -8
  343. package/src/training/operator-command.js +2 -0
  344. package/src/training/optimizer.js +19 -7
  345. package/src/training/runner.js +2 -1
  346. package/src/training/suite.js +18 -978
  347. package/src/training/tensor-factory.d.ts +9 -0
  348. package/src/training/tensor-factory.js +13 -0
  349. package/src/training/trainer.js +3 -5
  350. package/src/training/ul_dataset.js +3 -5
  351. package/src/training/workloads.js +70 -79
  352. package/src/types/model.d.ts +5 -0
  353. package/src/version.js +1 -1
  354. package/tools/convert-safetensors-node.js +22 -16
  355. package/tools/doppler-cli.js +50 -26
@@ -28,10 +28,12 @@ import { runProbes } from '../probes.js';
28
28
  import { SlidingWindowKVCache } from '../../../kv-cache.js';
29
29
  import {
30
30
  recordAttentionInputs,
31
+ shouldForceF32AttentionProjectionForRoPE,
31
32
  resolveAttentionProjectionOutputDtype,
32
33
  projectAttentionQKV,
33
34
  applyAttentionQKNorm,
34
35
  } from './projections.js';
36
+ import { prepareAttentionProjectionInput } from './output-projection.js';
35
37
 
36
38
  import {
37
39
  shouldDebugLayer,
@@ -97,9 +99,20 @@ export async function runLayerAttentionGPU(
97
99
  const allowF16Attention = wantsF16Output && kvCacheDtype === 'f16';
98
100
  let attentionInput = input;
99
101
  let attentionInputTemp = false;
102
+ let normed = attentionInput;
103
+ let qTensor = null;
104
+ let qGateTensor = null;
105
+ let kTensor = null;
106
+ let vTensor = null;
107
+ let attnOutput = null;
108
+ let attnForProjection = null;
109
+ let output = null;
110
+ let finalOutput = null;
111
+ let oProjInputTemp = null;
100
112
  if (wantsF16Output && !allowF16Attention) {
101
113
  attentionInput = await castF16ToF32(input);
102
114
  attentionInputTemp = true;
115
+ normed = attentionInput;
103
116
  }
104
117
 
105
118
  // Debug: attention input for configured layers
@@ -123,7 +136,7 @@ export async function runLayerAttentionGPU(
123
136
 
124
137
  // 1. Input norm
125
138
 
126
- let normed = attentionInput;
139
+ try {
127
140
  if (!skipInputNorm && layerWeights.inputNorm && getNormWeightBuffer) {
128
141
  const normWeightBuf = getNormWeightBuffer(layerWeights.inputNorm, 'input_norm');
129
142
 
@@ -182,8 +195,16 @@ export async function runLayerAttentionGPU(
182
195
  }
183
196
 
184
197
  // 2. Q/K/V projections
185
- const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype);
186
- let { qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
198
+ const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
199
+ forceF32: shouldForceF32AttentionProjectionForRoPE({
200
+ attentionInputDtype: desiredOutputDtype,
201
+ headDim,
202
+ rotaryDim: config.ropeRotaryDim,
203
+ interleaved: config.ropeInterleaved,
204
+ }),
205
+ });
206
+ let usedFusedQKV = false;
207
+ ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
187
208
  recorder: null,
188
209
  normed,
189
210
  layerWeights,
@@ -204,7 +225,7 @@ export async function runLayerAttentionGPU(
204
225
  trace.attn(layerIdx, `Using fused QKV path: ${qSizeFused}+${kSizeFused}+${vSizeFused}=${totalSize}`);
205
226
  }
206
227
  : null,
207
- });
228
+ }));
208
229
 
209
230
  // Trace Q/K/V projections
210
231
  if (kernelTrace.enabled) {
@@ -212,6 +233,27 @@ export async function runLayerAttentionGPU(
212
233
  await traceStep('matmul', `L${layerIdx}.k_proj`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
213
234
  await traceStep('matmul', `L${layerIdx}.v_proj`, layerIdx, vTensor.buffer, [numTokens, numKVHeads * headDim]);
214
235
  }
236
+ await runProbes('q_proj', qTensor.buffer, {
237
+ layerIdx,
238
+ numTokens,
239
+ hiddenSize: numHeads * headDim,
240
+ probes: state.debugProbes,
241
+ dtype: qTensor.dtype,
242
+ });
243
+ await runProbes('k_proj', kTensor.buffer, {
244
+ layerIdx,
245
+ numTokens,
246
+ hiddenSize: numKVHeads * headDim,
247
+ probes: state.debugProbes,
248
+ dtype: kTensor.dtype,
249
+ });
250
+ await runProbes('v_proj', vTensor.buffer, {
251
+ layerIdx,
252
+ numTokens,
253
+ hiddenSize: numKVHeads * headDim,
254
+ probes: state.debugProbes,
255
+ dtype: vTensor.dtype,
256
+ });
215
257
 
216
258
  // Kernel step debug: Q/K/V projections
217
259
  if (isKernelDebugEnabled(layerIdx)) {
@@ -319,6 +361,20 @@ export async function runLayerAttentionGPU(
319
361
  await traceStep('rope', `L${layerIdx}.k_rope`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
320
362
  }
321
363
  }
364
+ await runProbes('q_rope', qTensor.buffer, {
365
+ layerIdx,
366
+ numTokens,
367
+ hiddenSize: numHeads * headDim,
368
+ probes: state.debugProbes,
369
+ dtype: qTensor.dtype,
370
+ });
371
+ await runProbes('k_rope', kTensor.buffer, {
372
+ layerIdx,
373
+ numTokens,
374
+ hiddenSize: numKVHeads * headDim,
375
+ probes: state.debugProbes,
376
+ dtype: kTensor.dtype,
377
+ });
322
378
  if (isKernelDebugEnabled(layerIdx)) {
323
379
  logKernelStep('rope', { layerIdx, label: `startPos=${currentSeqLen}` });
324
380
  await dumpTokenVector(qTensor.buffer, 'Q_rope', {
@@ -669,7 +725,7 @@ export async function runLayerAttentionGPU(
669
725
  throw new Error(`Unsupported attention kernel variant "${attentionKernelVariant}" at layer ${layerIdx}`);
670
726
  }
671
727
 
672
- const attnOutput = await runAttentionKernel();
728
+ attnOutput = await runAttentionKernel();
673
729
 
674
730
  // Trace attention output
675
731
  if (kernelTrace.enabled) {
@@ -692,7 +748,7 @@ export async function runLayerAttentionGPU(
692
748
  await debugCheckBuffer(attnOutput.buffer, `L${layerIdx} attention output (before o_proj, GPU)`, numTokens, numHeads * headDim);
693
749
  }
694
750
 
695
- let attnForProjection = attnOutput;
751
+ attnForProjection = attnOutput;
696
752
  if (qGateTensor) {
697
753
  attnForProjection = await runSiLU(attnOutput, {
698
754
  size: numTokens * numHeads * headDim,
@@ -706,19 +762,19 @@ export async function runLayerAttentionGPU(
706
762
 
707
763
  // 6. Output projection (with optional fused residual for decode)
708
764
 
709
- let output;
765
+ output = null;
710
766
  let residualFused = false;
711
767
  let oProjInput = attnForProjection;
712
- let oProjInputTemp = null;
768
+ oProjInputTemp = null;
713
769
  if (layerWeights.oProj && getWeightBuffer) {
770
+ ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
771
+ attnForProjection,
772
+ matmulOutputDtype,
773
+ castF32ToF16
774
+ ));
714
775
  const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
715
776
  const loraO = getLoRAModule(lora, layerIdx, 'o_proj');
716
777
 
717
- if (matmulOutputDtype === 'f16' && attnOutput.dtype !== 'f16') {
718
- oProjInput = await castF32ToF16(attnOutput);
719
- oProjInputTemp = oProjInput;
720
- }
721
-
722
778
  // Use fused o_proj + residual for decode when possible
723
779
  // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
724
780
  const oProjDtype = getWeightDtype(oProjBuf);
@@ -807,7 +863,7 @@ export async function runLayerAttentionGPU(
807
863
  await debugCheckBuffer(output.buffer, `L${layerIdx} attention output (after o_proj, GPU)`, numTokens, hiddenSize);
808
864
  }
809
865
 
810
- let finalOutput = output;
866
+ finalOutput = output;
811
867
 
812
868
  const buffersToRelease = [];
813
869
  if (output.buffer !== attnForProjection.buffer) {
@@ -832,4 +888,46 @@ export async function runLayerAttentionGPU(
832
888
  }
833
889
 
834
890
  return { output: finalOutput, residualFused };
891
+ } catch (error) {
892
+ const released = new Set();
893
+ const releaseOnce = (buffer) => {
894
+ if (!buffer || released.has(buffer)) return;
895
+ released.add(buffer);
896
+ releaseBuffer(buffer);
897
+ };
898
+ if (finalOutput?.buffer && finalOutput.buffer !== output?.buffer) {
899
+ releaseOnce(finalOutput.buffer);
900
+ }
901
+ if (output?.buffer && output.buffer !== attnForProjection?.buffer) {
902
+ releaseOnce(output.buffer);
903
+ }
904
+ if (oProjInputTemp?.buffer) {
905
+ releaseOnce(oProjInputTemp.buffer);
906
+ }
907
+ if (attnForProjection?.buffer && attnForProjection.buffer !== attnOutput?.buffer) {
908
+ releaseOnce(attnForProjection.buffer);
909
+ }
910
+ if (attnOutput?.buffer) {
911
+ releaseOnce(attnOutput.buffer);
912
+ }
913
+ if (qGateTensor?.buffer) {
914
+ releaseOnce(qGateTensor.buffer);
915
+ }
916
+ if (qTensor?.buffer) {
917
+ releaseOnce(qTensor.buffer);
918
+ }
919
+ if (kTensor?.buffer) {
920
+ releaseOnce(kTensor.buffer);
921
+ }
922
+ if (vTensor?.buffer) {
923
+ releaseOnce(vTensor.buffer);
924
+ }
925
+ if (normed?.buffer && normed.buffer !== attentionInput?.buffer) {
926
+ releaseOnce(normed.buffer);
927
+ }
928
+ if (attentionInputTemp && attentionInput?.buffer) {
929
+ releaseOnce(attentionInput.buffer);
930
+ }
931
+ throw error;
932
+ }
835
933
  }
@@ -134,11 +134,10 @@ function resolveIntermediateSizeForRuntime(manifest, inf, arch, modelId) {
134
134
  if (inferred == null || inferred === fromArch) {
135
135
  return fromArch;
136
136
  }
137
- log.warn(
138
- 'Config',
139
- `Manifest "${modelId}" has intermediateSize=${fromArch}, inferred ${inferred} from FFN tensor shapes; using inferred value.`
137
+ throw new Error(
138
+ `Manifest "${modelId}" has intermediateSize=${fromArch}, but FFN tensors imply ${inferred}. ` +
139
+ 'Re-convert the model so manifest architecture matches the weights.'
140
140
  );
141
- return inferred;
142
141
  }
143
142
 
144
143
  // =============================================================================
@@ -483,6 +482,20 @@ export function toParsedConfigFromMerged(merged, manifest) {
483
482
  const queryPreAttnScalar = inf.attention.queryPreAttnScalar;
484
483
  const causalAttention = inf.attention.causal;
485
484
 
485
+ // Cross-field sanity: queryPreAttnScalar should typically equal headDim.
486
+ // A value of sqrt(headDim) indicates a known converter bug that produces
487
+ // attnScale = 1/sqrt(sqrt(headDim)) instead of the correct 1/sqrt(headDim).
488
+ if (queryPreAttnScalar != null && headDim != null
489
+ && queryPreAttnScalar !== headDim
490
+ && Math.abs(queryPreAttnScalar - Math.sqrt(headDim)) < 0.01) {
491
+ throw new Error(
492
+ `Model "${merged.modelId}": queryPreAttnScalar (${queryPreAttnScalar}) ` +
493
+ `equals sqrt(headDim) instead of headDim (${headDim}). ` +
494
+ `This is a known converter bug — the manifest must be regenerated ` +
495
+ `with the corrected converter.`
496
+ );
497
+ }
498
+
486
499
  // Get stop token IDs (cast to Manifest for compatibility)
487
500
  const stopTokenIds = getStopTokenIds(manifest);
488
501
 
@@ -319,14 +319,8 @@ export async function embed(tokenIds, embedBuffer, config) {
319
319
  const firstTokenId = tokenIdArray[0];
320
320
  const bytesPerElement = useF16 ? 2 : 4;
321
321
  const sampleSize = Math.min(32 * bytesPerElement, hiddenSize * bytesPerElement);
322
- const staging = device.createBuffer({ size: sampleSize, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ });
323
- const enc = device.createCommandEncoder();
324
- enc.copyBufferToBuffer(gatherOutput.buffer, 0, staging, 0, sampleSize);
325
- device.queue.submit([enc.finish()]);
326
- await staging.mapAsync(GPUMapMode.READ);
327
- const data = decodeReadback(staging.getMappedRange().slice(0), gatherOptions.outputDtype);
328
- staging.unmap();
329
- staging.destroy();
322
+ const readback = await readBuffer(gatherOutput.buffer, sampleSize);
323
+ const data = decodeReadback(readback, gatherOptions.outputDtype);
330
324
 
331
325
  // Compute statistics
332
326
  let sum = 0, sumSq = 0;
@@ -1,4 +1,3 @@
1
- import { log } from '../../../debug/index.js';
2
1
  import { resolveKernelPath } from '../../../config/kernel-path-loader.js';
3
2
  import { selectRuleValue } from '../../../rules/rule-registry.js';
4
3
  import {
@@ -9,19 +8,36 @@ import {
9
8
  export const PRIMARY_EXECUTION_PLAN_ID = 'primary';
10
9
  export const FINITENESS_FALLBACK_EXECUTION_PLAN_ID = 'finiteness_fallback';
11
10
 
12
- function normalizePositiveInt(value, fallback, label) {
13
- if (!Number.isFinite(value)) return fallback;
14
- const normalized = Math.floor(value);
15
- if (normalized >= 1) return normalized;
16
- log.warn('Pipeline', `[ExecutionPlan] ${label}=${value} is invalid; using ${fallback}.`);
17
- return fallback;
11
+ function assertOptionalBoolean(value, label) {
12
+ if (value === undefined) {
13
+ return undefined;
14
+ }
15
+ if (typeof value !== 'boolean') {
16
+ throw new Error(`[ExecutionPlan] ${label} must be boolean when provided; got ${JSON.stringify(value)}.`);
17
+ }
18
+ return value;
18
19
  }
19
20
 
20
- function normalizeStopCheckMode(value, fallback) {
21
- if (value === 'batch' || value === 'per-token') {
22
- return value;
21
+ function assertOptionalPositiveInt(value, label) {
22
+ if (value === undefined) {
23
+ return undefined;
23
24
  }
24
- return fallback;
25
+ if (!Number.isInteger(value) || value < 1) {
26
+ throw new Error(`[ExecutionPlan] ${label} must be a positive integer when provided; got ${JSON.stringify(value)}.`);
27
+ }
28
+ return value;
29
+ }
30
+
31
+ function assertOptionalStopCheckMode(value) {
32
+ if (value === undefined) {
33
+ return undefined;
34
+ }
35
+ if (value !== 'batch' && value !== 'per-token') {
36
+ throw new Error(
37
+ `[ExecutionPlan] stopCheckMode must be "batch" or "per-token" when provided; got ${JSON.stringify(value)}.`
38
+ );
39
+ }
40
+ return value;
25
41
  }
26
42
 
27
43
  function resolveFallbackActivationDtype(primaryActivationDtype) {
@@ -42,10 +58,11 @@ function resolveFallbackActivationDtype(primaryActivationDtype) {
42
58
  function resolveFallbackKernelPath(primaryKernelPath) {
43
59
  const primaryKernelPathId = primaryKernelPath?.id ?? null;
44
60
  if (!primaryKernelPathId) {
45
- throw new Error(
46
- '[ExecutionPlan] F16 finiteness fallback requires a primary kernel path with a stable id. ' +
47
- 'Add a registered kernelPath id and a finiteness fallback rule.'
48
- );
61
+ return {
62
+ kernelPath: null,
63
+ kernelPathId: null,
64
+ kernelPathSource: 'none',
65
+ };
49
66
  }
50
67
 
51
68
  const explicitFallbackKernelPathId = typeof primaryKernelPath?.finitenessFallbackKernelPathId === 'string'
@@ -244,11 +261,17 @@ export function activateFallbackExecutionPlan(container) {
244
261
 
245
262
  function resolveExecutionOverrides(options = {}) {
246
263
  return {
247
- disableCommandBatching: options.disableCommandBatching,
248
- disableMultiTokenDecode: options.disableMultiTokenDecode,
249
- batchSize: options.batchSize,
250
- stopCheckMode: options.stopCheckMode,
251
- maxTokens: options.maxTokens,
264
+ disableCommandBatching: assertOptionalBoolean(
265
+ options.disableCommandBatching,
266
+ 'disableCommandBatching'
267
+ ),
268
+ disableMultiTokenDecode: assertOptionalBoolean(
269
+ options.disableMultiTokenDecode,
270
+ 'disableMultiTokenDecode'
271
+ ),
272
+ batchSize: assertOptionalPositiveInt(options.batchSize, 'batchSize'),
273
+ stopCheckMode: assertOptionalStopCheckMode(options.stopCheckMode),
274
+ maxTokens: assertOptionalPositiveInt(options.maxTokens, 'maxTokens'),
252
275
  };
253
276
  }
254
277
 
@@ -268,9 +291,9 @@ export function resolveExecutionSessionPlan(container, options = {}) {
268
291
  deferredRoundingWindowTokens: activePlan.deferredRoundingWindowTokens,
269
292
  disableCommandBatching: overrides.disableCommandBatching ?? activePlan.defaultDisableCommandBatching,
270
293
  disableMultiTokenDecode: overrides.disableMultiTokenDecode ?? activePlan.defaultDisableMultiTokenDecode,
271
- batchSize: normalizePositiveInt(overrides.batchSize, activePlan.defaultBatchSize, 'batchSize'),
272
- stopCheckMode: normalizeStopCheckMode(overrides.stopCheckMode, activePlan.defaultStopCheckMode),
273
- maxTokens: normalizePositiveInt(overrides.maxTokens, activePlan.defaultMaxTokens, 'maxTokens'),
294
+ batchSize: overrides.batchSize ?? activePlan.defaultBatchSize,
295
+ stopCheckMode: overrides.stopCheckMode ?? activePlan.defaultStopCheckMode,
296
+ maxTokens: overrides.maxTokens ?? activePlan.defaultMaxTokens,
274
297
  readbackInterval: activePlan.readbackInterval,
275
298
  ringTokens: activePlan.ringTokens,
276
299
  ringStop: activePlan.ringStop,
@@ -0,0 +1,59 @@
1
+ export declare function cloneJson<T>(value: T): T;
2
+ export declare function validateManifestSessionDefaultsContract(manifestInference: Record<string, unknown> | null): void;
3
+ export declare function isPhaseMatch(phase: string, targetPhase: string): boolean;
4
+ export declare function stepHasLayer(step: Record<string, unknown>, layerIdx: number): boolean;
5
+ export declare function normalizePhase(value: unknown, label: string): string;
6
+ export declare function normalizeSection(value: unknown, label: string): string;
7
+ export declare function normalizeSlot(value: unknown, label: string): string;
8
+ export declare function createSourceTrace(): { session: Record<string, unknown>; steps: Record<string, unknown> };
9
+ export declare function setSourceTrace(trace: Record<string, unknown>, path: string, source: string): void;
10
+ export declare function collectLeafPaths(value: unknown, prefix?: string[], out?: string[][]): string[][];
11
+ export declare function hasDefinedPath(root: unknown, pathSegments: string[]): boolean;
12
+ export declare function validateStepShape(step: Record<string, unknown>, index: number): void;
13
+ export declare function assertExecutionRuntimeOverlay(runtimeInference: Record<string, unknown> | null | undefined): void;
14
+ export declare function validateUniqueStepIds(steps: Array<Record<string, unknown>>): void;
15
+ export declare function hasExecutionV0(manifestInference: Record<string, unknown> | null | undefined): boolean;
16
+ export declare function assertExecutionV0Schema(manifestInference: Record<string, unknown> | null | undefined): void;
17
+ export declare function applyExecutionPatchAtomic(
18
+ baseSteps: Array<Record<string, unknown>>,
19
+ patch: Record<string, unknown> | null | undefined
20
+ ): Array<Record<string, unknown>>;
21
+ export declare function indexRuntimePatchMeta(
22
+ patch: Record<string, unknown> | null | undefined
23
+ ): {
24
+ addedSteps: Set<string>;
25
+ precisionFieldsByStep: Map<string, Set<string>>;
26
+ kvIOFieldsByStep: Set<string>;
27
+ };
28
+ export declare function requireSessionActivationDtype(
29
+ sessionDefaults: Record<string, unknown> | null | undefined,
30
+ label?: string
31
+ ): string;
32
+ export declare function createInitialSlotDtypes(sessionDefaults: Record<string, unknown>): Map<string, string>;
33
+ export declare function resolvePhaseSteps(
34
+ phase: string,
35
+ steps: Array<Record<string, unknown>>,
36
+ sessionDefaults: Record<string, unknown>,
37
+ profileIndex: Map<string, unknown>,
38
+ policies: Record<string, unknown>,
39
+ options?: Record<string, unknown>
40
+ ): {
41
+ steps: Array<Record<string, unknown>>;
42
+ finalSlotDtypes: Map<string, string>;
43
+ };
44
+ export declare function normalizeRuntimeSessionForExecutionV0(
45
+ runtimeSession: Record<string, unknown> | null | undefined,
46
+ manifestInference: Record<string, unknown> | null | undefined,
47
+ defaultComputeDefaults: Record<string, unknown>
48
+ ): Record<string, unknown> | null | undefined;
49
+ export declare function validatePhaseBoundaryCompatibility(options: Record<string, unknown>): void;
50
+ export declare function assertKVLayoutExecutionCompatibility(
51
+ steps: Array<Record<string, unknown>>,
52
+ sessionDefaults: Record<string, unknown>
53
+ ): void;
54
+ export declare const buildKernelProfileKey: (
55
+ kernelRef: Record<string, unknown> | null | undefined,
56
+ step?: Record<string, unknown> | null | undefined
57
+ ) => string;
58
+ export declare const indexKernelProfiles: (sessionDefaults: Record<string, unknown>) => Map<string, unknown>;
59
+ export declare const normalizeDtype: (value: unknown, label: string) => string;