npm - @simulatte/doppler - Versions diffs - 0.1.5 → 0.1.7 - Mend

@simulatte/doppler 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (392)

package/CHANGELOG.md +126 -0
package/README.md +25 -17
package/package.json +20 -4
package/src/adapters/adapter-registry.js +12 -1
package/src/adapters/lora-loader.js +23 -6
package/src/bridge/extension-client.d.ts +5 -0
package/src/bridge/extension-client.js +40 -0
package/src/bridge/index.d.ts +2 -1
package/src/bridge/index.js +6 -4
package/src/browser/browser-converter.js +26 -1
package/src/browser/file-picker.js +6 -0
package/src/browser/safetensors-parser-browser.js +84 -1
package/src/browser/shard-io-browser.js +2 -2
package/src/browser/tensor-source-download.js +8 -2
package/src/browser/tensor-source-http.d.ts +1 -0
package/src/browser/tensor-source-http.js +5 -1
package/src/client/doppler-api.browser.js +20 -4
package/src/client/doppler-api.js +19 -3
package/src/client/doppler-provider/generation.js +12 -0
package/src/client/doppler-provider/model-manager.d.ts +10 -0
package/src/client/doppler-provider/model-manager.js +91 -19
package/src/client/doppler-provider/source-runtime.d.ts +2 -1
package/src/client/doppler-provider/source-runtime.js +132 -13
package/src/client/doppler-registry.json +8 -7
package/src/config/backward-registry-loader.js +17 -2
package/src/config/execution-v0-contract-check.js +113 -15
package/src/config/kernel-path-contract-check.js +57 -29
package/src/config/kernel-path-loader.js +5 -36
package/src/config/kernels/kernel-ref-digests.js +39 -39
package/src/config/kernels/registry.js +14 -1
package/src/config/kernels/registry.json +49 -7
package/src/config/loader.d.ts +1 -1
package/src/config/loader.js +43 -4
package/src/config/merge-contract-check.js +59 -4
package/src/config/merge-helpers.js +128 -7
package/src/config/merge.d.ts +1 -0
package/src/config/merge.js +28 -0
package/src/config/param-validator.js +47 -2
package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
package/src/config/presets/kernel-paths/registry.json +29 -8
package/src/config/presets/models/gemma2.json +2 -2
package/src/config/presets/models/qwen3.json +9 -2
package/src/config/presets/models/transformer.json +5 -0
package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
package/src/config/required-inference-fields-contract-check.js +6 -0
package/src/config/runtime.js +6 -1
package/src/config/schema/debug.schema.d.ts +5 -0
package/src/config/schema/doppler.schema.js +16 -21
package/src/config/schema/inference-defaults.schema.js +6 -3
package/src/config/schema/inference.schema.d.ts +9 -0
package/src/config/schema/kernel-path.schema.d.ts +11 -1
package/src/config/schema/kernel-thresholds.schema.js +12 -4
package/src/config/schema/manifest.schema.d.ts +8 -1
package/src/config/schema/manifest.schema.js +19 -3
package/src/config/training-defaults.js +30 -22
package/src/converter/conversion-plan.js +94 -9
package/src/converter/core.d.ts +7 -0
package/src/converter/core.js +14 -9
package/src/converter/execution-v0-manifest.js +4 -1
package/src/converter/index.d.ts +1 -0
package/src/converter/index.js +1 -0
package/src/converter/manifest-inference.js +43 -12
package/src/converter/parsers/diffusion.js +0 -3
package/src/converter/quantization-info.js +35 -15
package/src/converter/rope-config.js +42 -0
package/src/converter/shard-packer.d.ts +1 -1
package/src/converter/shard-packer.js +4 -1
package/src/debug/config.js +123 -11
package/src/debug/signals.js +7 -1
package/src/debug/tensor.d.ts +2 -0
package/src/debug/tensor.js +13 -2
package/src/distribution/p2p-control-plane.js +52 -12
package/src/distribution/p2p-observability.js +43 -7
package/src/distribution/p2p-webrtc-browser.js +20 -0
package/src/distribution/shard-delivery.js +77 -26
package/src/formats/gguf/types.js +33 -16
package/src/formats/rdrr/groups.d.ts +12 -4
package/src/formats/rdrr/groups.js +3 -6
package/src/formats/rdrr/parsing.js +39 -2
package/src/formats/rdrr/types.d.ts +2 -1
package/src/gpu/command-recorder.js +86 -61
package/src/gpu/device.d.ts +1 -0
package/src/gpu/device.js +131 -19
package/src/gpu/kernel-tuner/benchmarks.js +326 -316
package/src/gpu/kernel-tuner/cache.js +71 -4
package/src/gpu/kernel-tuner/tuner.js +22 -4
package/src/gpu/kernels/attention.js +113 -34
package/src/gpu/kernels/backward/adam.js +62 -58
package/src/gpu/kernels/backward/attention_backward.js +257 -169
package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
package/src/gpu/kernels/bias_add.wgsl +8 -6
package/src/gpu/kernels/bias_add_f16.wgsl +8 -5
package/src/gpu/kernels/cast.js +191 -149
package/src/gpu/kernels/check-stop.js +33 -44
package/src/gpu/kernels/conv2d.js +27 -17
package/src/gpu/kernels/conv2d.wgsl +7 -8
package/src/gpu/kernels/conv2d_f16.wgsl +7 -8
package/src/gpu/kernels/cross_entropy_loss.js +21 -15
package/src/gpu/kernels/depthwise_conv2d.js +37 -26
package/src/gpu/kernels/depthwise_conv2d.wgsl +6 -9
package/src/gpu/kernels/depthwise_conv2d_f16.wgsl +6 -9
package/src/gpu/kernels/dequant.js +178 -126
package/src/gpu/kernels/energy.d.ts +3 -21
package/src/gpu/kernels/energy.js +111 -88
package/src/gpu/kernels/feature-check.js +1 -1
package/src/gpu/kernels/fused_ffn.js +84 -65
package/src/gpu/kernels/fused_matmul_residual.js +56 -33
package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
package/src/gpu/kernels/gather.js +33 -15
package/src/gpu/kernels/gelu.js +19 -11
package/src/gpu/kernels/grouped_pointwise_conv2d.js +34 -23
package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl +6 -9
package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl +6 -9
package/src/gpu/kernels/groupnorm.js +34 -23
package/src/gpu/kernels/kv-quantize.js +5 -2
package/src/gpu/kernels/layernorm.js +35 -19
package/src/gpu/kernels/logit-merge.js +5 -3
package/src/gpu/kernels/matmul.js +83 -39
package/src/gpu/kernels/modulate.js +23 -15
package/src/gpu/kernels/moe.js +221 -175
package/src/gpu/kernels/pixel_shuffle.js +22 -14
package/src/gpu/kernels/pixel_shuffle.wgsl +4 -5
package/src/gpu/kernels/pixel_shuffle_f16.wgsl +4 -5
package/src/gpu/kernels/relu.js +31 -10
package/src/gpu/kernels/relu.wgsl +2 -1
package/src/gpu/kernels/relu_f16.wgsl +2 -1
package/src/gpu/kernels/repeat_channels.js +25 -17
package/src/gpu/kernels/repeat_channels.wgsl +4 -5
package/src/gpu/kernels/repeat_channels_f16.wgsl +4 -5
package/src/gpu/kernels/residual.js +69 -23
package/src/gpu/kernels/residual.wgsl +6 -3
package/src/gpu/kernels/residual_f16.wgsl +2 -1
package/src/gpu/kernels/residual_f16_vec4.wgsl +2 -1
package/src/gpu/kernels/residual_vec4.wgsl +2 -1
package/src/gpu/kernels/rmsnorm.js +96 -28
package/src/gpu/kernels/rmsnorm.wgsl +14 -6
package/src/gpu/kernels/rmsnorm_f16.wgsl +10 -2
package/src/gpu/kernels/rope.d.ts +2 -0
package/src/gpu/kernels/rope.js +14 -1
package/src/gpu/kernels/rope.wgsl +56 -40
package/src/gpu/kernels/sample.js +27 -38
package/src/gpu/kernels/sana_linear_attention.js +19 -12
package/src/gpu/kernels/sana_linear_attention_apply.wgsl +4 -5
package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl +4 -5
package/src/gpu/kernels/sana_linear_attention_summary.wgsl +4 -0
package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl +4 -0
package/src/gpu/kernels/scale.js +18 -11
package/src/gpu/kernels/shader-cache.js +4 -2
package/src/gpu/kernels/silu.d.ts +1 -0
package/src/gpu/kernels/silu.js +148 -82
package/src/gpu/kernels/silu.wgsl +19 -9
package/src/gpu/kernels/silu_f16.wgsl +19 -9
package/src/gpu/kernels/softmax.js +44 -25
package/src/gpu/kernels/split_qkv.js +23 -13
package/src/gpu/kernels/transpose.js +31 -10
package/src/gpu/kernels/transpose.wgsl +6 -5
package/src/gpu/kernels/upsample2d.js +22 -13
package/src/gpu/kernels/upsample2d.wgsl +6 -9
package/src/gpu/kernels/upsample2d_f16.wgsl +6 -9
package/src/gpu/kernels/utils.js +35 -13
package/src/gpu/partitioned-buffer-pool.js +10 -2
package/src/gpu/perf-guards.js +2 -9
package/src/gpu/profiler.js +27 -22
package/src/gpu/readback-utils.d.ts +16 -0
package/src/gpu/readback-utils.js +41 -0
package/src/gpu/submit-tracker.js +13 -0
package/src/gpu/uniform-cache.d.ts +1 -0
package/src/gpu/uniform-cache.js +30 -9
package/src/hotswap/intent-bundle.js +6 -0
package/src/hotswap/manifest.d.ts +10 -1
package/src/hotswap/manifest.js +12 -2
package/src/hotswap/runtime.js +30 -8
package/src/index-browser.d.ts +44 -0
package/src/index-browser.js +14 -0
package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
package/src/inference/browser-harness-contract-helpers.js +28 -0
package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
package/src/inference/browser-harness-model-helpers.d.ts +16 -0
package/src/inference/browser-harness-model-helpers.js +217 -0
package/src/inference/browser-harness-report-helpers.d.ts +7 -0
package/src/inference/browser-harness-report-helpers.js +42 -0
package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
package/src/inference/browser-harness-runtime-helpers.js +415 -0
package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
package/src/inference/browser-harness-suite-helpers.js +268 -0
package/src/inference/browser-harness-text-helpers.d.ts +27 -0
package/src/inference/browser-harness-text-helpers.js +788 -0
package/src/inference/browser-harness.d.ts +6 -0
package/src/inference/browser-harness.js +130 -1950
package/src/inference/kv-cache/base.js +140 -94
package/src/inference/kv-cache/tiered.js +5 -3
package/src/inference/moe-router.js +88 -56
package/src/inference/multi-model-network.js +5 -3
package/src/inference/network-evolution.d.ts +11 -2
package/src/inference/network-evolution.js +20 -21
package/src/inference/pipelines/context.d.ts +3 -0
package/src/inference/pipelines/context.js +142 -2
package/src/inference/pipelines/diffusion/helpers.js +7 -2
package/src/inference/pipelines/diffusion/pipeline.js +17 -7
package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +5 -0
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +27 -15
package/src/inference/pipelines/diffusion/vae.js +3 -7
package/src/inference/pipelines/energy/pipeline.js +27 -21
package/src/inference/pipelines/energy/quintel.d.ts +5 -0
package/src/inference/pipelines/energy/quintel.js +11 -0
package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
package/src/inference/pipelines/text/attention/projections.js +151 -101
package/src/inference/pipelines/text/attention/record.js +73 -10
package/src/inference/pipelines/text/attention/run.js +73 -10
package/src/inference/pipelines/text/chat-format.js +25 -1
package/src/inference/pipelines/text/config.d.ts +4 -0
package/src/inference/pipelines/text/config.js +71 -5
package/src/inference/pipelines/text/embed.js +2 -8
package/src/inference/pipelines/text/execution-plan.js +64 -50
package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
package/src/inference/pipelines/text/execution-v0.js +78 -1002
package/src/inference/pipelines/text/ffn/standard.js +3 -0
package/src/inference/pipelines/text/generator-steps.d.ts +46 -0
package/src/inference/pipelines/text/generator-steps.js +298 -207
package/src/inference/pipelines/text/generator.js +6 -23
package/src/inference/pipelines/text/init.d.ts +4 -0
package/src/inference/pipelines/text/init.js +134 -29
package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
package/src/inference/pipelines/text/kernel-trace.js +6 -0
package/src/inference/pipelines/text/layer.js +14 -9
package/src/inference/pipelines/text/linear-attention.d.ts +10 -0
package/src/inference/pipelines/text/linear-attention.js +80 -6
package/src/inference/pipelines/text/logits/gpu.js +10 -5
package/src/inference/pipelines/text/logits/index.js +10 -11
package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
package/src/inference/pipelines/text/logits/utils.js +9 -0
package/src/inference/pipelines/text/lora-apply.js +50 -32
package/src/inference/pipelines/text/model-load.js +279 -104
package/src/inference/pipelines/text/moe-cache.js +5 -4
package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
package/src/inference/pipelines/text/moe-cpu.js +42 -38
package/src/inference/pipelines/text/moe-gpu.js +110 -86
package/src/inference/pipelines/text/ops.js +90 -90
package/src/inference/pipelines/text/probes.js +9 -9
package/src/inference/pipelines/text/weights.js +17 -7
package/src/inference/pipelines/text.js +17 -1
package/src/inference/speculative.d.ts +2 -2
package/src/inference/speculative.js +4 -18
package/src/inference/test-harness.d.ts +1 -1
package/src/inference/test-harness.js +15 -5
package/src/inference/tokenizer.d.ts +0 -5
package/src/inference/tokenizer.js +4 -23
package/src/inference/tokenizers/bpe.js +9 -0
package/src/inference/tokenizers/bundled.js +176 -33
package/src/inference/tokenizers/sentencepiece.js +12 -0
package/src/loader/doppler-loader.js +38 -22
package/src/loader/dtype-utils.js +3 -44
package/src/loader/embedding-loader.js +7 -3
package/src/loader/experts/expert-cache.js +13 -6
package/src/loader/experts/expert-loader.js +10 -6
package/src/loader/final-weights-loader.js +8 -4
package/src/loader/layer-loader.js +2 -1
package/src/loader/loader-state.js +2 -2
package/src/loader/memory-monitor.js +8 -0
package/src/loader/multi-model-loader.d.ts +14 -0
package/src/loader/multi-model-loader.js +70 -24
package/src/loader/shard-cache.js +81 -12
package/src/loader/shard-resolver.js +25 -3
package/src/loader/tensors/tensor-loader.js +209 -144
package/src/loader/tensors/tensor-reader.js +76 -19
package/src/loader/weight-downcast.js +1 -1
package/src/memory/buffer-pool.d.ts +9 -1
package/src/memory/buffer-pool.js +109 -44
package/src/memory/unified-detect.js +1 -1
package/src/rules/inference/kernel-path.rules.json +24 -8
package/src/rules/rule-registry.js +25 -1
package/src/rules/tooling/command-runtime.rules.json +18 -0
package/src/storage/backends/opfs-store.js +68 -24
package/src/storage/downloader.js +364 -83
package/src/storage/index.d.ts +3 -0
package/src/storage/index.js +3 -0
package/src/storage/preflight.d.ts +2 -2
package/src/storage/preflight.js +24 -2
package/src/storage/quickstart-downloader.js +11 -5
package/src/storage/registry.js +10 -4
package/src/storage/reports.js +1 -1
package/src/storage/shard-manager.d.ts +15 -1
package/src/storage/shard-manager.js +51 -3
package/src/storage/source-artifact-store.d.ts +52 -0
package/src/storage/source-artifact-store.js +234 -0
package/src/tooling/command-api-constants.d.ts +9 -0
package/src/tooling/command-api-constants.js +9 -0
package/src/tooling/command-api-family-normalizers.d.ts +9 -0
package/src/tooling/command-api-family-normalizers.js +343 -0
package/src/tooling/command-api-helpers.d.ts +25 -0
package/src/tooling/command-api-helpers.js +262 -0
package/src/tooling/command-api.d.ts +27 -1
package/src/tooling/command-api.js +26 -473
package/src/tooling/command-envelope.js +4 -1
package/src/tooling/command-runner-shared.js +52 -18
package/src/tooling/lean-execution-contract.js +150 -3
package/src/tooling/node-browser-command-runner.d.ts +4 -0
package/src/tooling/node-browser-command-runner.js +218 -273
package/src/tooling/node-command-runner.js +44 -3
package/src/tooling/node-converter.js +27 -1
package/src/tooling/node-source-runtime.d.ts +1 -1
package/src/tooling/node-source-runtime.js +84 -3
package/src/tooling/node-webgpu.js +30 -105
package/src/tooling/opfs-cache.js +21 -4
package/src/tooling/runtime-input-composition.d.ts +38 -0
package/src/tooling/runtime-input-composition.js +86 -0
package/src/tooling/source-runtime-bundle.d.ts +40 -5
package/src/tooling/source-runtime-bundle.js +261 -34
package/src/tooling/source-runtime-materializer.d.ts +6 -0
package/src/tooling/source-runtime-materializer.js +93 -0
package/src/training/attention-backward.js +32 -17
package/src/training/autograd.js +80 -52
package/src/training/checkpoint-watch.d.ts +8 -0
package/src/training/checkpoint-watch.js +139 -0
package/src/training/checkpoint.d.ts +6 -1
package/src/training/checkpoint.js +46 -7
package/src/training/clip.js +2 -1
package/src/training/datasets/token-batch.js +20 -8
package/src/training/distillation/artifacts.d.ts +71 -0
package/src/training/distillation/artifacts.js +132 -0
package/src/training/distillation/checkpoint-watch.d.ts +10 -0
package/src/training/distillation/checkpoint-watch.js +58 -0
package/src/training/distillation/dataset.d.ts +59 -0
package/src/training/distillation/dataset.js +337 -0
package/src/training/distillation/eval.d.ts +34 -0
package/src/training/distillation/eval.js +310 -0
package/src/training/distillation/index.d.ts +29 -0
package/src/training/distillation/index.js +29 -0
package/src/training/distillation/runtime.d.ts +20 -0
package/src/training/distillation/runtime.js +121 -0
package/src/training/distillation/scoreboard.d.ts +6 -0
package/src/training/distillation/scoreboard.js +8 -0
package/src/training/distillation/stage-a.d.ts +45 -0
package/src/training/distillation/stage-a.js +338 -0
package/src/training/distillation/stage-b.d.ts +24 -0
package/src/training/distillation/stage-b.js +20 -0
package/src/training/distillation/student-fixture.d.ts +22 -0
package/src/training/distillation/student-fixture.js +846 -0
package/src/training/distillation/suite-data.d.ts +45 -0
package/src/training/distillation/suite-data.js +189 -0
package/src/training/index.d.ts +10 -0
package/src/training/index.js +10 -0
package/src/training/lora-pipeline.d.ts +40 -0
package/src/training/lora-pipeline.js +793 -0
package/src/training/lora.js +26 -12
package/src/training/loss.js +5 -6
package/src/training/objectives/cross_entropy.js +2 -5
package/src/training/objectives/distill_kd.js +4 -8
package/src/training/objectives/distill_triplet.js +4 -8
package/src/training/objectives/ul_stage2_base.js +4 -8
package/src/training/operator-artifacts.d.ts +62 -0
package/src/training/operator-artifacts.js +140 -0
package/src/training/operator-command.d.ts +5 -0
package/src/training/operator-command.js +455 -0
package/src/training/operator-eval.d.ts +48 -0
package/src/training/operator-eval.js +230 -0
package/src/training/operator-scoreboard.d.ts +5 -0
package/src/training/operator-scoreboard.js +44 -0
package/src/training/optimizer.js +19 -7
package/src/training/runner.d.ts +52 -0
package/src/training/runner.js +31 -5
package/src/training/suite.d.ts +112 -0
package/src/training/suite.js +24 -984
package/src/training/tensor-factory.d.ts +9 -0
package/src/training/tensor-factory.js +13 -0
package/src/training/trainer.js +3 -5
package/src/training/ul_dataset.js +3 -5
package/src/training/workloads.d.ts +164 -0
package/src/training/workloads.js +530 -0
package/src/version.js +1 -1
package/tools/convert-safetensors-node.js +22 -16
package/tools/doppler-cli.js +179 -63

package/src/inference/pipelines/text/generator.js — CHANGED

@@ -1043,18 +1043,9 @@ export class PipelineGenerator {
           if (allowReadback(`pipeline.prefill.layer-${l}`)) {
             try {
               const sampleSize = config.hiddenSize * activationBytes;
-              const staging = device.createBuffer({
-                size: sampleSize,
-                usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
-              });
-              const enc = device.createCommandEncoder();
               const lastTokenOffset = (numTokens - 1) * config.hiddenSize * activationBytes;
-              enc.copyBufferToBuffer(currentHiddenBuffer, lastTokenOffset, staging, 0, sampleSize);
-              device.queue.submit([enc.finish()]);
-              await staging.mapAsync(GPUMapMode.READ);
-              const data = decodeReadback(staging.getMappedRange().slice(0), activationDtype);
-              staging.unmap();
-              staging.destroy();
+              const readback = await readBufferSlice(currentHiddenBuffer, lastTokenOffset, sampleSize);
+              const data = decodeReadback(readback, activationDtype);
               let min = Infinity;
               let max = -Infinity;
               let maxAbs = 0;
@@ -1112,20 +1103,12 @@ export class PipelineGenerator {
     if (opts.debug) {
       log.debug('Pipeline', `LAYER_LOOP_DONE, currentHiddenBuffer type=${currentHiddenBuffer?.constructor?.name}`);
       if (currentHiddenBuffer && allowReadback('pipeline.prefill.final-hidden')) {
-        const device = getDevice();
         const lastTokenOffset = (numTokens - 1) * config.hiddenSize * activationBytes;
         const sampleSize = config.hiddenSize * activationBytes;
-        const staging = device.createBuffer({
-          size: sampleSize,
-          usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
-        });
-        const enc = device.createCommandEncoder();
-        enc.copyBufferToBuffer(currentHiddenBuffer, lastTokenOffset, staging, 0, sampleSize);
-        device.queue.submit([enc.finish()]);
-        await staging.mapAsync(GPUMapMode.READ);
-        const data = decodeReadback(staging.getMappedRange().slice(0), activationDtype);
-        staging.unmap();
-        staging.destroy();
+        const data = decodeReadback(
+          await readBufferSlice(currentHiddenBuffer, lastTokenOffset, sampleSize),
+          activationDtype
+        );
         const nanCount = Array.from(data).filter(x => !Number.isFinite(x)).length;
         const nonZero = Array.from(data).filter(x => Number.isFinite(x) && x !== 0).slice(0, 5);
         log.debug('Pipeline', `FINAL_HIDDEN[pos=${numTokens - 1}]: nan=${nanCount}/${data.length}, sample=[${nonZero.map(x => x.toFixed(4)).join(', ')}]`);

package/src/inference/pipelines/text/init.d.ts — CHANGED

@@ -71,9 +71,13 @@ export interface PipelineContexts {
  */
 export interface RoPEConfig {
   headDim: number;
+  rotaryDim?: number;
   maxSeqLen: number;
   ropeTheta: number;
   ropeLocalTheta?: number | null;
+  mropeInterleaved?: boolean;
+  mropeSection?: number[] | null;
+  partialRotaryFactor?: number | null;
   ropeScale: number;
   ropeLocalScale?: number;
   ropeScalingType?: string | null;

package/src/inference/pipelines/text/init.js — CHANGED

@@ -2,7 +2,7 @@
 import { parseModelConfig } from './config.js';
 import { getDevice, getDeviceLimits, getKernelCapabilities } from '../../../gpu/device.js';
-import { acquireBuffer } from '../../../memory/buffer-pool.js';
+import { acquireBuffer, releaseBuffer } from '../../../memory/buffer-pool.js';
 import { KVCache, SlidingWindowKVCache, TieredKVCache, BasisDecomposedPagedCache } from '../../kv-cache.js';
 import { Tokenizer } from '../../tokenizer.js';
 import { MoERouter } from '../../moe-router.js';
@@ -14,6 +14,10 @@ import { PAGED_LAYOUT_SEQ_LEN_THRESHOLD } from '../../../config/schema/index.js'
 import { isKernelPathFusedQ4K } from '../../../config/kernel-path-loader.js';
 import { createWeightBuffer, getWeightDtype, isWeightBuffer } from '../../../gpu/weight-buffer.js';
 import { selectRuleValue } from '../../../rules/rule-registry.js';
+import {
+  createSourceStorageContext,
+  getSourceRuntimeMetadata,
+} from '../../../tooling/source-runtime-bundle.js';
 function resolveErrorMessage(error) {
   if (error && typeof error === 'object' && typeof error.message === 'string') {
@@ -56,12 +60,61 @@ function normalizeBaseUrl(baseUrl) {
   return baseUrl.replace(/\/$/, '');
 }
+async function fetchBytes(url, offset = null, length = null) {
+  const headers = {};
+  if (Number.isFinite(offset) && Number.isFinite(length) && length > 0) {
+    const start = Math.max(0, Math.floor(offset));
+    const end = start + Math.max(0, Math.floor(length)) - 1;
+    headers.Range = `bytes=${start}-${end}`;
+  }
+  const response = await fetch(url, { headers });
+  if (!response.ok) {
+    throw new Error(`Failed to fetch ${url}: ${response.status}`);
+  }
+  return new Uint8Array(await response.arrayBuffer());
+}
 function createRemoteStorageContext(baseUrl, manifest) {
   const root = normalizeBaseUrl(baseUrl);
   if (!root || !isRDRRManifest(manifest)) {
     return null;
   }
+  const sourceRuntime = getSourceRuntimeMetadata(manifest);
+  if (sourceRuntime) {
+    const readRange = async (relativePath, offset, length) => {
+      const filename = String(relativePath || '').replace(/^\/+/, '');
+      if (!filename) {
+        throw new Error('Direct-source artifact path is required.');
+      }
+      const url = `${root}/${filename}`;
+      return fetchBytes(url, offset, length);
+    };
+    const readText = async (relativePath) => {
+      const filename = String(relativePath || '').replace(/^\/+/, '');
+      if (!filename) return null;
+      const response = await fetch(`${root}/${filename}`);
+      if (!response.ok) {
+        throw new Error(`Failed to fetch ${filename} from ${root}: ${response.status}`);
+      }
+      return response.text();
+    };
+    const readBinary = async (relativePath) => {
+      const filename = String(relativePath || '').replace(/^\/+/, '');
+      if (!filename) {
+        throw new Error('Direct-source binary asset path is required.');
+      }
+      return fetchBytes(`${root}/${filename}`);
+    };
+    return createSourceStorageContext({
+      manifest,
+      readRange,
+      readText,
+      readBinary,
+      verifyHashes: true,
+    });
+  }
   return {
     async loadShard(index) {
       const shard = manifest.shards[index];
@@ -69,11 +122,7 @@ function createRemoteStorageContext(baseUrl, manifest) {
       if (!filename) {
         throw new Error(`Manifest shard ${index} is missing filename.`);
       }
-      const response = await fetch(`${root}/${filename.replace(/^\/+/, '')}`);
-      if (!response.ok) {
-        throw new Error(`Failed to fetch shard ${index} from ${root}: ${response.status}`);
-      }
-      return new Uint8Array(await response.arrayBuffer());
+      return fetchBytes(`${root}/${filename.replace(/^\/+/, '')}`);
     },
   };
 }
@@ -206,13 +255,45 @@ function isSameRoPEScalingConfig(
       === (rightScaling?.original_max_position_embeddings ?? null);
 }
+function resolveRotaryDim(headDim, rotaryDim, partialRotaryFactor) {
+  if (rotaryDim != null) {
+    if (!Number.isFinite(rotaryDim) || rotaryDim <= 0 || (rotaryDim % 2) !== 0) {
+      throw new Error(`RoPE rotary dim must be a positive even integer; got "${rotaryDim}".`);
+    }
+    if (rotaryDim > headDim) {
+      throw new Error(`RoPE rotary dim ${rotaryDim} cannot exceed headDim ${headDim}.`);
+    }
+    return rotaryDim;
+  }
+  if (partialRotaryFactor == null) {
+    return headDim;
+  }
+  if (!Number.isFinite(partialRotaryFactor) || partialRotaryFactor <= 0 || partialRotaryFactor > 1) {
+    throw new Error(
+      `RoPE partialRotaryFactor must be a number in (0, 1]; got "${partialRotaryFactor}".`
+    );
+  }
+  const resolved = Math.trunc(headDim * partialRotaryFactor);
+  if (resolved <= 0 || (resolved % 2) !== 0) {
+    throw new Error(
+      `RoPE partialRotaryFactor=${partialRotaryFactor} with headDim=${headDim} resolves ` +
+      `to rotaryDim=${resolved}, but rotaryDim must be a positive even integer.`
+    );
+  }
+  return resolved;
+}
 export async function initRoPEFrequencies(config, useGPU) {
   const {
     headDim,
+    rotaryDim,
     maxSeqLen,
     ropeTheta,
     ropeLocalTheta,
+    mropeInterleaved,
+    mropeSection,
+    partialRotaryFactor,
     ropeScale,
     ropeLocalScale,
     ropeScalingType,
@@ -230,14 +311,23 @@ export async function initRoPEFrequencies(config, useGPU) {
   const resolvedLocalTheta = ropeLocalTheta ?? ropeTheta;
   const resolvedLocalScalingType = ropeLocalScalingType ?? ropeScalingType;
   const resolvedLocalScaling = ropeLocalScaling ?? ropeScaling;
+  const resolvedRotaryDim = resolveRotaryDim(headDim, rotaryDim, partialRotaryFactor);
+  const halfDim = resolvedRotaryDim / 2;
+  if (mropeInterleaved === true && Array.isArray(mropeSection)) {
+    const expandedDim = mropeSection.reduce((sum, entry) => sum + entry, 0) * 2;
+    if (expandedDim !== resolvedRotaryDim) {
+      throw new Error(
+        `RoPE mropeSection expands to ${expandedDim} dims, but rotaryDim is ${resolvedRotaryDim}.`
+      );
+    }
+  }
-  const halfDim = headDim / 2;
   const isYarn = ropeScalingType === 'yarn';
   const isLocalYarn = resolvedLocalScalingType === 'yarn';
   // Compute global (full_attention) frequencies
   const globalFreqs = computeRoPEFreqsForTheta(
-    ropeTheta, headDim, maxSeqLen, ropeScale, ropeScalingType, ropeScaling
+    ropeTheta, resolvedRotaryDim, maxSeqLen, ropeScale, ropeScalingType, ropeScaling
   );
   // Compute local (sliding_attention) frequencies if different from global.
@@ -256,7 +346,7 @@ export async function initRoPEFrequencies(config, useGPU) {
   if (hasDistinctLocalTheta || hasDistinctLocalScaling) {
     localFreqs = computeRoPEFreqsForTheta(
       resolvedLocalTheta,
-      headDim,
+      resolvedRotaryDim,
       maxSeqLen,
       resolvedLocalScale,
       resolvedLocalScalingType,
@@ -285,27 +375,37 @@ export async function initRoPEFrequencies(config, useGPU) {
   // Upload to GPU if available
   const device = getDevice();
   if (device && useGPU) {
-    const cosBuffer = acquireBuffer(globalFreqs.cos.byteLength, undefined, 'rope_cos');
-    const sinBuffer = acquireBuffer(globalFreqs.sin.byteLength, undefined, 'rope_sin');
-    device.queue.writeBuffer(cosBuffer, 0, globalFreqs.cos.buffer, globalFreqs.cos.byteOffset, globalFreqs.cos.byteLength);
-    device.queue.writeBuffer(sinBuffer, 0, globalFreqs.sin.buffer, globalFreqs.sin.byteOffset, globalFreqs.sin.byteLength);
-    let localCosBuffer;
-    let localSinBuffer;
-    if (localFreqs) {
-      localCosBuffer = acquireBuffer(localFreqs.cos.byteLength, undefined, 'rope_local_cos');
-      localSinBuffer = acquireBuffer(localFreqs.sin.byteLength, undefined, 'rope_local_sin');
-      device.queue.writeBuffer(localCosBuffer, 0, localFreqs.cos.buffer, localFreqs.cos.byteOffset, localFreqs.cos.byteLength);
-      device.queue.writeBuffer(localSinBuffer, 0, localFreqs.sin.buffer, localFreqs.sin.byteOffset, localFreqs.sin.byteLength);
+    let cosBuffer = null;
+    let sinBuffer = null;
+    let localCosBuffer = null;
+    let localSinBuffer = null;
+    try {
+      cosBuffer = acquireBuffer(globalFreqs.cos.byteLength, undefined, 'rope_cos');
+      sinBuffer = acquireBuffer(globalFreqs.sin.byteLength, undefined, 'rope_sin');
+      device.queue.writeBuffer(cosBuffer, 0, globalFreqs.cos.buffer, globalFreqs.cos.byteOffset, globalFreqs.cos.byteLength);
+      device.queue.writeBuffer(sinBuffer, 0, globalFreqs.sin.buffer, globalFreqs.sin.byteOffset, globalFreqs.sin.byteLength);
+      if (localFreqs) {
+        localCosBuffer = acquireBuffer(localFreqs.cos.byteLength, undefined, 'rope_local_cos');
+        localSinBuffer = acquireBuffer(localFreqs.sin.byteLength, undefined, 'rope_local_sin');
+        device.queue.writeBuffer(localCosBuffer, 0, localFreqs.cos.buffer, localFreqs.cos.byteOffset, localFreqs.cos.byteLength);
+        device.queue.writeBuffer(localSinBuffer, 0, localFreqs.sin.buffer, localFreqs.sin.byteOffset, localFreqs.sin.byteLength);
+      }
+    } catch (error) {
+      for (const buffer of [cosBuffer, sinBuffer, localCosBuffer, localSinBuffer]) {
+        if (buffer) {
+          releaseBuffer(buffer);
+        }
+      }
+      throw error;
     }
     log.debug(
       'Pipeline',
-      `RoPE frequencies initialized (GPU): ${maxSeqLen} positions, dim=${halfDim}, headDim=${headDim}, ` +
+      `RoPE frequencies initialized (GPU): ${maxSeqLen} positions, dim=${halfDim}, headDim=${headDim}, rotaryDim=${resolvedRotaryDim}, ` +
       `theta=${ropeTheta}${hasDistinctLocalTheta ? `, localTheta=${resolvedLocalTheta}` : ''}, ` +
-      `scaling=${ropeScalingType ?? 'none'}:${ropeScale}${hasDistinctLocalScaling ? `, localScaling=${resolvedLocalScalingType ?? 'none'}:${resolvedLocalScale}` : ''}`
+      `scaling=${ropeScalingType ?? 'none'}:${ropeScale}${hasDistinctLocalScaling ? `, localScaling=${resolvedLocalScalingType ?? 'none'}:${resolvedLocalScale}` : ''}, ` +
+      `interleaved=${mropeInterleaved === true}`
     );
     return {
@@ -318,9 +418,10 @@ export async function initRoPEFrequencies(config, useGPU) {
   log.debug(
     'Pipeline',
-    `RoPE frequencies initialized (CPU): ${maxSeqLen} positions, dim=${halfDim}, headDim=${headDim}, ` +
+    `RoPE frequencies initialized (CPU): ${maxSeqLen} positions, dim=${halfDim}, headDim=${headDim}, rotaryDim=${resolvedRotaryDim}, ` +
     `theta=${ropeTheta}${hasDistinctLocalTheta ? `, localTheta=${resolvedLocalTheta}` : ''}, ` +
-    `scaling=${ropeScalingType ?? 'none'}:${ropeScale}${hasDistinctLocalScaling ? `, localScaling=${resolvedLocalScalingType ?? 'none'}:${resolvedLocalScale}` : ''}`
+    `scaling=${ropeScalingType ?? 'none'}:${ropeScale}${hasDistinctLocalScaling ? `, localScaling=${resolvedLocalScalingType ?? 'none'}:${resolvedLocalScale}` : ''}, ` +
+    `interleaved=${mropeInterleaved === true}`
   );
   return {
@@ -688,6 +789,10 @@ function applyChatMLTemplate(prompt) {
   return `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
 }
+function applyQwenTemplate(prompt) {
+  return `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n`;
+}
 function applyTranslateGemmaTemplate() {
   throw new Error(
     'TranslateGemma template requires structured messages. ' +
@@ -702,7 +807,7 @@ const PROMPT_TEMPLATES = {
   'llama3': applyHeaderBasedTemplate,
   'gpt-oss': applyChannelBasedTemplate,
   'chatml': applyChatMLTemplate,
-  'qwen': applyChatMLTemplate,  // Qwen uses ChatML format
+  'qwen': applyQwenTemplate,
   'translategemma': applyTranslateGemmaTemplate,
 };
@@ -721,7 +826,7 @@ export function applyChatTemplate(prompt, templateType) {
 export const applyGemmaChatTemplate = applyTurnBasedTemplate;
 export const applyLlama3ChatTemplate = applyHeaderBasedTemplate;
 export const applyGptOssChatTemplate = applyChannelBasedTemplate;
-export const applyQwenChatTemplate = applyChatMLTemplate;
+export const applyQwenChatTemplate = applyQwenTemplate;
 export function isStopToken(token, stopTokenIds, eosTokenId) {

package/src/inference/pipelines/text/kernel-path-auto-select.js CHANGED Viewed

@@ -78,6 +78,7 @@ export function resolveCapabilityKernelPathRef(configuredKernelPathRef, kernelPa
   const normalizedPolicy = resolveKernelPathPolicy(kernelPathPolicy);
   const hasSubgroups = capabilities?.hasSubgroups === true;
+  const hasF16 = capabilities?.hasF16 === true;
   const normalizedSource = normalizeKernelPathSource(kernelPathSource);
   const allowCapabilityAutoSelection = normalizedPolicy.mode === 'capability-aware'
     && normalizedPolicy.sourceScope.includes(normalizedSource);
@@ -85,6 +86,7 @@ export function resolveCapabilityKernelPathRef(configuredKernelPathRef, kernelPa
   return selectRuleValue('inference', 'kernelPath', 'autoSelect', {
     kernelPathRef: configuredKernelPathRef,
     hasSubgroups,
+    hasF16,
     allowCapabilityAutoSelection,
   });
 }

package/src/inference/pipelines/text/kernel-trace.d.ts CHANGED Viewed

@@ -12,6 +12,8 @@
  * Snapshot of a tensor's statistics (no full data, just stats).
  */
 export interface TensorSnapshot {
+  ok: boolean;
+  error: string | null;
   shape: number[];
   dtype: string;
   stats: {

package/src/inference/pipelines/text/kernel-trace.js CHANGED Viewed

@@ -283,6 +283,9 @@ export async function traceStep(name, label, layer, outputBuffer, outputShape, o
   if (layer >= 0 && !kernelTrace.shouldTraceLayer(layer)) return;
   const output = await snapshotTensor(outputBuffer, outputShape);
+  if (!output.ok) {
+    throw new Error(`[TRACE] Failed to snapshot output for ${label}: ${output.error}`);
+  }
   // Snapshot inputs if provided (expensive - only do if tracing)
@@ -290,6 +293,9 @@ export async function traceStep(name, label, layer, outputBuffer, outputShape, o
   if (options?.inputs && options?.inputShapes) {
     for (let i = 0; i < options.inputs.length; i++) {
       const snap = await snapshotTensor(options.inputs[i], options.inputShapes[i]);
+      if (!snap.ok) {
+        throw new Error(`[TRACE] Failed to snapshot input ${i} for ${label}: ${snap.error}`);
+      }
       inputs.push(snap);
     }
   }

package/src/inference/pipelines/text/layer.js CHANGED Viewed

@@ -2,7 +2,7 @@
 import { log, trace } from '../../../debug/index.js';
 import { getDevice } from '../../../gpu/device.js';
-import { releaseBuffer } from '../../../memory/buffer-pool.js';
+import { releaseBuffer, readBuffer } from '../../../memory/buffer-pool.js';
 import { allowReadback } from '../../../gpu/perf-guards.js';
 import { createTensor } from '../../../gpu/tensor.js';
 import {
@@ -228,6 +228,7 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
       linearRuntime: context.linearAttentionRuntime ?? null,
       getWeightBuffer: (weight, label) => getWeightBuffer(weight, label),
       getNormWeightBuffer: (weight, label) => getNormWeightBuffer(weight, label, weightConfig, debugFlags),
+      debugProbes: context.debugProbes,
       recorder: recorder ?? null,
     });
   } else {
@@ -259,6 +260,8 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
       attentionOutputGate: config.attentionOutputGate,
       causalAttention: config.causalAttention,
       rmsNormWeightOffset: config.rmsNormWeightOffset,
+      ropeRotaryDim: config.ropeRotaryDim,
+      ropeInterleaved: config.ropeInterleaved,
       tokenIds: context.currentTokenIds ?? null,
       kernelPath: context.kernelPath ?? null,
       disableRoPE,
@@ -312,14 +315,7 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
       if (allowReadback(`layer.attn-out.${layerIdx}`)) {
         try {
           const sampleSize = Math.min(128, attnOutput.buffer.size);
-          const staging = device.createBuffer({ size: sampleSize, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ });
-          const enc = device.createCommandEncoder();
-          enc.copyBufferToBuffer(attnOutput.buffer, 0, staging, 0, sampleSize);
-          device.queue.submit([enc.finish()]);
-          await staging.mapAsync(GPUMapMode.READ);
-          const data = new Float32Array(staging.getMappedRange().slice(0));
-          staging.unmap();
-          staging.destroy();
+          const data = new Float32Array(await readBuffer(attnOutput.buffer, sampleSize));
           let maxAbs = 0;
           for (let i = 0; i < data.length; i++) {
             const abs = Math.abs(data[i]);
@@ -661,6 +657,8 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
             attentionOutputGate: config.attentionOutputGate,
             causalAttention: config.causalAttention,
             rmsNormWeightOffset: config.rmsNormWeightOffset,
+            ropeRotaryDim: config.ropeRotaryDim,
+            ropeInterleaved: config.ropeInterleaved,
             tokenIds: context.currentTokenIds ?? null,
             skipInputNorm: step.skipInputNorm === true,
             activationDtype,
@@ -690,6 +688,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
               hiddenSize,
               probes: context.debugProbes,
               recorder,
+              dtype: outputDtype,
             });
           }
           break;
@@ -733,6 +732,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
               hiddenSize,
               probes: context.debugProbes,
               recorder,
+              dtype: outputDtype,
             });
           }
           break;
@@ -767,6 +767,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
               hiddenSize,
               probes: context.debugProbes,
               recorder,
+              dtype: outputDtype,
             });
           }
           break;
@@ -801,6 +802,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
               hiddenSize,
               probes: context.debugProbes,
               recorder,
+              dtype: outputDtype,
             });
           }
           break;
@@ -825,6 +827,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
               hiddenSize,
               probes: context.debugProbes,
               recorder,
+              dtype: outputDtype,
             });
           }
           break;
@@ -851,6 +854,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
               hiddenSize,
               probes: context.debugProbes,
               recorder,
+              dtype: toDtype,
             });
           }
           break;
@@ -880,6 +884,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
     hiddenSize,
     probes: context.debugProbes,
     recorder,
+    dtype: getSlotDtype('state') ?? activationDtype,
   });
   const computeConfig = context.runtimeComputeConfig ?? null;

package/src/inference/pipelines/text/linear-attention.d.ts CHANGED Viewed

@@ -3,6 +3,7 @@ import type { Tensor } from '../../../gpu/tensor.js';
 import type { WeightBuffer } from '../../../gpu/weight-buffer.js';
 import type { CommandRecorder } from '../../../gpu/command-recorder.js';
 import type { LinearNormMode } from '../../../config/schema/index.js';
+import type { ProbeConfigSchema } from '../../../config/schema/index.js';
 export interface LinearLayerRuntimeState {
   layerIdx: number;
@@ -67,6 +68,7 @@ export interface RunLinearAttentionLayerOptions {
     weight: GPUBuffer | Float32Array | ArrayBuffer,
     label: string
   ) => GPUBuffer;
+  debugProbes?: ProbeConfigSchema[] | null;
   recorder?: CommandRecorder | null;
 }
@@ -74,6 +76,14 @@ export declare function hasLinearAttentionLayers(layerTypes: unknown): boolean;
 export declare function createLinearAttentionRuntime(): LinearAttentionRuntime;
+export declare function inferLinearNormMode(
+  weight: { size?: number; dtype?: string } | GPUBuffer | WeightBuffer | ArrayBufferView | ArrayBuffer | null | undefined,
+  projectionLayout: {
+    headVDim: number;
+    valueDim: number;
+  }
+): LinearNormMode | null;
 export declare function resetLinearAttentionRuntime(
   runtime: LinearAttentionRuntime | null | undefined
 ): LinearAttentionRuntime;

package/src/inference/pipelines/text/linear-attention.js CHANGED Viewed

@@ -4,6 +4,7 @@ import { readBuffer, releaseBuffer, uploadData, acquireBuffer } from '../../../m
 import { log } from '../../../debug/index.js';
 import { decodeReadback } from './debug-utils/index.js';
 import { runLinearAttentionCoreGPU } from '../../../gpu/kernels/linear-attention-core.js';
+import { runProbes } from './probes.js';
 const LINEAR_RUNTIME_SCHEMA_VERSION = 1;
 const QK_L2NORM_EPS = 1e-6;
@@ -173,9 +174,22 @@ function inferLinearNormModeFromWeight(weight, projectionLayout) {
   if (weight instanceof ArrayBuffer) {
     return classify(Math.trunc(weight.byteLength / Float32Array.BYTES_PER_ELEMENT));
   }
+  const explicitDtype = typeof weight?.dtype === 'string' ? weight.dtype.toLowerCase() : null;
+  const trackedDtype = isGpuBuffer(weight) ? String(getBufferDtype(weight) ?? '').toLowerCase() : '';
+  const bytesPerElement = bytesFromDtype(explicitDtype || trackedDtype || null);
+  const sizedElements = Number.isFinite(weight?.size)
+    ? Math.trunc(Number(weight.size) / bytesPerElement)
+    : null;
+  if (sizedElements && Number(weight.size) % bytesPerElement === 0) {
+    return classify(sizedElements);
+  }
   return null;
 }
+export function inferLinearNormMode(weight, projectionLayout) {
+  return inferLinearNormModeFromWeight(weight, projectionLayout);
+}
 function resolveLinearNormMode(configNormMode, normWeight, projectionLayout, layerIdx) {
   const configuredMode = normalizeLinearNormMode(configNormMode);
   const inferredMode = inferLinearNormModeFromWeight(normWeight, projectionLayout);
@@ -185,7 +199,15 @@ function resolveLinearNormMode(configNormMode, normWeight, projectionLayout, lay
       `but norm.weight shape implies "${inferredMode}".`
     );
   }
-  return configuredMode ?? inferredMode ?? 'shared';
+  if (configuredMode) {
+    return configuredMode;
+  }
+  if (inferredMode) {
+    return inferredMode;
+  }
+  throw new Error(
+    `linear_attention layer ${layerIdx} requires explicit linearNormMode or a norm.weight shape that resolves it.`
+  );
 }
 async function readWeightAsF32(weight, expectedElements, label) {
@@ -395,10 +417,17 @@ async function createLayerRuntimeState(
   let convKernelSize = toPositiveInt(config.linearConvKernelDim) ?? null;
   if (isWeightBuffer(convKernel) && Array.isArray(convKernel.shape) && convKernel.shape.length >= 3) {
-    convKernelSize = toPositiveInt(convKernel.shape[2]) ?? convKernelSize;
+    const shapeKernelSize = toPositiveInt(convKernel.shape[2]) ?? null;
+    if (convKernelSize != null && shapeKernelSize != null && convKernelSize !== shapeKernelSize) {
+      throw new Error(
+        `linear_attention layer ${layerIdx} declares linearConvKernelDim=${convKernelSize}, ` +
+        `but conv1d weight shape implies ${shapeKernelSize}.`
+      );
+    }
+    convKernelSize = shapeKernelSize ?? convKernelSize;
   }
   if (!convKernelSize) {
-    convKernelSize = 4;
+    throw new Error(`linear_attention layer ${layerIdx} requires linearConvKernelDim.`);
   }
   const convWeight = await readWeightAsF32(
@@ -435,6 +464,11 @@ async function createLayerRuntimeState(
   const recurrentState = new Float32Array(
     projectionLayout.numVHeads * projectionLayout.headKDim * projectionLayout.headVDim
   );
+  const rmsNormEps = Number(config.rmsNormEps);
+  if (!Number.isFinite(rmsNormEps) || rmsNormEps <= 0) {
+    throw new Error(`linear_attention layer ${layerIdx} requires a positive rmsNormEps.`);
+  }
   const layerState = {
     layerIdx,
     seqLen: currentSeqLen,
@@ -452,7 +486,7 @@ async function createLayerRuntimeState(
     vSize: projectionLayout.vSize,
     qRep: projectionLayout.qRep,
     normMode,
-    rmsNormEps: Number(config.rmsNormEps) || 1e-6,
+    rmsNormEps,
     convWeight,
     dtBias,
     aNegExp,
@@ -681,13 +715,13 @@ export async function runLinearAttentionLayer(inputTensor, layerWeights, options
     const normWeightBuffer = getNormWeightBuffer(layerWeights.inputNorm, `L${layerIdx}.linear_input_norm`);
     try {
       if (recorder) {
-        normedTensor = await recordRMSNorm(recorder, inputTensor, normWeightBuffer, Number(config.rmsNormEps) || 1e-6, {
+        normedTensor = await recordRMSNorm(recorder, inputTensor, normWeightBuffer, layerState.rmsNormEps, {
           batchSize: numTokens,
           hiddenSize,
           rmsNormWeightOffset: config.rmsNormWeightOffset,
         });
       } else {
-        normedTensor = await runRMSNorm(inputTensor, normWeightBuffer, Number(config.rmsNormEps) || 1e-6, {
+        normedTensor = await runRMSNorm(inputTensor, normWeightBuffer, layerState.rmsNormEps, {
           batchSize: numTokens,
           hiddenSize,
           rmsNormWeightOffset: config.rmsNormWeightOffset,
@@ -755,6 +789,38 @@ export async function runLinearAttentionLayer(inputTensor, layerWeights, options
   });
   try {
+    await runProbes('linear_qkv_proj', qkvTensor.buffer, {
+      layerIdx,
+      numTokens,
+      hiddenSize: projectionLayout.convDim,
+      probes: options.debugProbes,
+      recorder,
+      dtype: qkvTensor.dtype,
+    });
+    await runProbes('linear_z_proj', zTensor.buffer, {
+      layerIdx,
+      numTokens,
+      hiddenSize: projectionLayout.valueDim,
+      probes: options.debugProbes,
+      recorder,
+      dtype: zTensor.dtype,
+    });
+    await runProbes('linear_a_proj', aTensor.buffer, {
+      layerIdx,
+      numTokens,
+      hiddenSize: projectionLayout.numVHeads,
+      probes: options.debugProbes,
+      recorder,
+      dtype: aTensor.dtype,
+    });
+    await runProbes('linear_b_proj', bTensor.buffer, {
+      layerIdx,
+      numTokens,
+      hiddenSize: projectionLayout.numVHeads,
+      probes: options.debugProbes,
+      recorder,
+      dtype: bTensor.dtype,
+    });
     const coreTensor = await runLinearAttentionCoreGPU(
       qkvTensor,
       zTensor,
@@ -768,6 +834,14 @@ export async function runLinearAttentionLayer(inputTensor, layerWeights, options
         recorder,
       }
     );
+    await runProbes('linear_core_out', coreTensor.buffer, {
+      layerIdx,
+      numTokens,
+      hiddenSize: projectionLayout.valueDim,
+      probes: options.debugProbes,
+      recorder,
+      dtype: coreTensor.dtype,
+    });
     layerState.seqLen = currentSeqLen + numTokens;
     const outProjWeight = getWeightBuffer(layerWeights.oProj, `L${layerIdx}.linear_out_proj`);
     try {