npm - @simulatte/doppler - Versions diffs - 0.1.5 → 0.1.7 - Mend

@simulatte/doppler 0.1.5 → 0.1.7

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (392)

package/CHANGELOG.md +126 -0
package/README.md +25 -17
package/package.json +20 -4
package/src/adapters/adapter-registry.js +12 -1
package/src/adapters/lora-loader.js +23 -6
package/src/bridge/extension-client.d.ts +5 -0
package/src/bridge/extension-client.js +40 -0
package/src/bridge/index.d.ts +2 -1
package/src/bridge/index.js +6 -4
package/src/browser/browser-converter.js +26 -1
package/src/browser/file-picker.js +6 -0
package/src/browser/safetensors-parser-browser.js +84 -1
package/src/browser/shard-io-browser.js +2 -2
package/src/browser/tensor-source-download.js +8 -2
package/src/browser/tensor-source-http.d.ts +1 -0
package/src/browser/tensor-source-http.js +5 -1
package/src/client/doppler-api.browser.js +20 -4
package/src/client/doppler-api.js +19 -3
package/src/client/doppler-provider/generation.js +12 -0
package/src/client/doppler-provider/model-manager.d.ts +10 -0
package/src/client/doppler-provider/model-manager.js +91 -19
package/src/client/doppler-provider/source-runtime.d.ts +2 -1
package/src/client/doppler-provider/source-runtime.js +132 -13
package/src/client/doppler-registry.json +8 -7
package/src/config/backward-registry-loader.js +17 -2
package/src/config/execution-v0-contract-check.js +113 -15
package/src/config/kernel-path-contract-check.js +57 -29
package/src/config/kernel-path-loader.js +5 -36
package/src/config/kernels/kernel-ref-digests.js +39 -39
package/src/config/kernels/registry.js +14 -1
package/src/config/kernels/registry.json +49 -7
package/src/config/loader.d.ts +1 -1
package/src/config/loader.js +43 -4
package/src/config/merge-contract-check.js +59 -4
package/src/config/merge-helpers.js +128 -7
package/src/config/merge.d.ts +1 -0
package/src/config/merge.js +28 -0
package/src/config/param-validator.js +47 -2
package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
package/src/config/presets/kernel-paths/registry.json +29 -8
package/src/config/presets/models/gemma2.json +2 -2
package/src/config/presets/models/qwen3.json +9 -2
package/src/config/presets/models/transformer.json +5 -0
package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
package/src/config/required-inference-fields-contract-check.js +6 -0
package/src/config/runtime.js +6 -1
package/src/config/schema/debug.schema.d.ts +5 -0
package/src/config/schema/doppler.schema.js +16 -21
package/src/config/schema/inference-defaults.schema.js +6 -3
package/src/config/schema/inference.schema.d.ts +9 -0
package/src/config/schema/kernel-path.schema.d.ts +11 -1
package/src/config/schema/kernel-thresholds.schema.js +12 -4
package/src/config/schema/manifest.schema.d.ts +8 -1
package/src/config/schema/manifest.schema.js +19 -3
package/src/config/training-defaults.js +30 -22
package/src/converter/conversion-plan.js +94 -9
package/src/converter/core.d.ts +7 -0
package/src/converter/core.js +14 -9
package/src/converter/execution-v0-manifest.js +4 -1
package/src/converter/index.d.ts +1 -0
package/src/converter/index.js +1 -0
package/src/converter/manifest-inference.js +43 -12
package/src/converter/parsers/diffusion.js +0 -3
package/src/converter/quantization-info.js +35 -15
package/src/converter/rope-config.js +42 -0
package/src/converter/shard-packer.d.ts +1 -1
package/src/converter/shard-packer.js +4 -1
package/src/debug/config.js +123 -11
package/src/debug/signals.js +7 -1
package/src/debug/tensor.d.ts +2 -0
package/src/debug/tensor.js +13 -2
package/src/distribution/p2p-control-plane.js +52 -12
package/src/distribution/p2p-observability.js +43 -7
package/src/distribution/p2p-webrtc-browser.js +20 -0
package/src/distribution/shard-delivery.js +77 -26
package/src/formats/gguf/types.js +33 -16
package/src/formats/rdrr/groups.d.ts +12 -4
package/src/formats/rdrr/groups.js +3 -6
package/src/formats/rdrr/parsing.js +39 -2
package/src/formats/rdrr/types.d.ts +2 -1
package/src/gpu/command-recorder.js +86 -61
package/src/gpu/device.d.ts +1 -0
package/src/gpu/device.js +131 -19
package/src/gpu/kernel-tuner/benchmarks.js +326 -316
package/src/gpu/kernel-tuner/cache.js +71 -4
package/src/gpu/kernel-tuner/tuner.js +22 -4
package/src/gpu/kernels/attention.js +113 -34
package/src/gpu/kernels/backward/adam.js +62 -58
package/src/gpu/kernels/backward/attention_backward.js +257 -169
package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
package/src/gpu/kernels/bias_add.wgsl +8 -6
package/src/gpu/kernels/bias_add_f16.wgsl +8 -5
package/src/gpu/kernels/cast.js +191 -149
package/src/gpu/kernels/check-stop.js +33 -44
package/src/gpu/kernels/conv2d.js +27 -17
package/src/gpu/kernels/conv2d.wgsl +7 -8
package/src/gpu/kernels/conv2d_f16.wgsl +7 -8
package/src/gpu/kernels/cross_entropy_loss.js +21 -15
package/src/gpu/kernels/depthwise_conv2d.js +37 -26
package/src/gpu/kernels/depthwise_conv2d.wgsl +6 -9
package/src/gpu/kernels/depthwise_conv2d_f16.wgsl +6 -9
package/src/gpu/kernels/dequant.js +178 -126
package/src/gpu/kernels/energy.d.ts +3 -21
package/src/gpu/kernels/energy.js +111 -88
package/src/gpu/kernels/feature-check.js +1 -1
package/src/gpu/kernels/fused_ffn.js +84 -65
package/src/gpu/kernels/fused_matmul_residual.js +56 -33
package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
package/src/gpu/kernels/gather.js +33 -15
package/src/gpu/kernels/gelu.js +19 -11
package/src/gpu/kernels/grouped_pointwise_conv2d.js +34 -23
package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl +6 -9
package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl +6 -9
package/src/gpu/kernels/groupnorm.js +34 -23
package/src/gpu/kernels/kv-quantize.js +5 -2
package/src/gpu/kernels/layernorm.js +35 -19
package/src/gpu/kernels/logit-merge.js +5 -3
package/src/gpu/kernels/matmul.js +83 -39
package/src/gpu/kernels/modulate.js +23 -15
package/src/gpu/kernels/moe.js +221 -175
package/src/gpu/kernels/pixel_shuffle.js +22 -14
package/src/gpu/kernels/pixel_shuffle.wgsl +4 -5
package/src/gpu/kernels/pixel_shuffle_f16.wgsl +4 -5
package/src/gpu/kernels/relu.js +31 -10
package/src/gpu/kernels/relu.wgsl +2 -1
package/src/gpu/kernels/relu_f16.wgsl +2 -1
package/src/gpu/kernels/repeat_channels.js +25 -17
package/src/gpu/kernels/repeat_channels.wgsl +4 -5
package/src/gpu/kernels/repeat_channels_f16.wgsl +4 -5
package/src/gpu/kernels/residual.js +69 -23
package/src/gpu/kernels/residual.wgsl +6 -3
package/src/gpu/kernels/residual_f16.wgsl +2 -1
package/src/gpu/kernels/residual_f16_vec4.wgsl +2 -1
package/src/gpu/kernels/residual_vec4.wgsl +2 -1
package/src/gpu/kernels/rmsnorm.js +96 -28
package/src/gpu/kernels/rmsnorm.wgsl +14 -6
package/src/gpu/kernels/rmsnorm_f16.wgsl +10 -2
package/src/gpu/kernels/rope.d.ts +2 -0
package/src/gpu/kernels/rope.js +14 -1
package/src/gpu/kernels/rope.wgsl +56 -40
package/src/gpu/kernels/sample.js +27 -38
package/src/gpu/kernels/sana_linear_attention.js +19 -12
package/src/gpu/kernels/sana_linear_attention_apply.wgsl +4 -5
package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl +4 -5
package/src/gpu/kernels/sana_linear_attention_summary.wgsl +4 -0
package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl +4 -0
package/src/gpu/kernels/scale.js +18 -11
package/src/gpu/kernels/shader-cache.js +4 -2
package/src/gpu/kernels/silu.d.ts +1 -0
package/src/gpu/kernels/silu.js +148 -82
package/src/gpu/kernels/silu.wgsl +19 -9
package/src/gpu/kernels/silu_f16.wgsl +19 -9
package/src/gpu/kernels/softmax.js +44 -25
package/src/gpu/kernels/split_qkv.js +23 -13
package/src/gpu/kernels/transpose.js +31 -10
package/src/gpu/kernels/transpose.wgsl +6 -5
package/src/gpu/kernels/upsample2d.js +22 -13
package/src/gpu/kernels/upsample2d.wgsl +6 -9
package/src/gpu/kernels/upsample2d_f16.wgsl +6 -9
package/src/gpu/kernels/utils.js +35 -13
package/src/gpu/partitioned-buffer-pool.js +10 -2
package/src/gpu/perf-guards.js +2 -9
package/src/gpu/profiler.js +27 -22
package/src/gpu/readback-utils.d.ts +16 -0
package/src/gpu/readback-utils.js +41 -0
package/src/gpu/submit-tracker.js +13 -0
package/src/gpu/uniform-cache.d.ts +1 -0
package/src/gpu/uniform-cache.js +30 -9
package/src/hotswap/intent-bundle.js +6 -0
package/src/hotswap/manifest.d.ts +10 -1
package/src/hotswap/manifest.js +12 -2
package/src/hotswap/runtime.js +30 -8
package/src/index-browser.d.ts +44 -0
package/src/index-browser.js +14 -0
package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
package/src/inference/browser-harness-contract-helpers.js +28 -0
package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
package/src/inference/browser-harness-model-helpers.d.ts +16 -0
package/src/inference/browser-harness-model-helpers.js +217 -0
package/src/inference/browser-harness-report-helpers.d.ts +7 -0
package/src/inference/browser-harness-report-helpers.js +42 -0
package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
package/src/inference/browser-harness-runtime-helpers.js +415 -0
package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
package/src/inference/browser-harness-suite-helpers.js +268 -0
package/src/inference/browser-harness-text-helpers.d.ts +27 -0
package/src/inference/browser-harness-text-helpers.js +788 -0
package/src/inference/browser-harness.d.ts +6 -0
package/src/inference/browser-harness.js +130 -1950
package/src/inference/kv-cache/base.js +140 -94
package/src/inference/kv-cache/tiered.js +5 -3
package/src/inference/moe-router.js +88 -56
package/src/inference/multi-model-network.js +5 -3
package/src/inference/network-evolution.d.ts +11 -2
package/src/inference/network-evolution.js +20 -21
package/src/inference/pipelines/context.d.ts +3 -0
package/src/inference/pipelines/context.js +142 -2
package/src/inference/pipelines/diffusion/helpers.js +7 -2
package/src/inference/pipelines/diffusion/pipeline.js +17 -7
package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +5 -0
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +27 -15
package/src/inference/pipelines/diffusion/vae.js +3 -7
package/src/inference/pipelines/energy/pipeline.js +27 -21
package/src/inference/pipelines/energy/quintel.d.ts +5 -0
package/src/inference/pipelines/energy/quintel.js +11 -0
package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
package/src/inference/pipelines/text/attention/projections.js +151 -101
package/src/inference/pipelines/text/attention/record.js +73 -10
package/src/inference/pipelines/text/attention/run.js +73 -10
package/src/inference/pipelines/text/chat-format.js +25 -1
package/src/inference/pipelines/text/config.d.ts +4 -0
package/src/inference/pipelines/text/config.js +71 -5
package/src/inference/pipelines/text/embed.js +2 -8
package/src/inference/pipelines/text/execution-plan.js +64 -50
package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
package/src/inference/pipelines/text/execution-v0.js +78 -1002
package/src/inference/pipelines/text/ffn/standard.js +3 -0
package/src/inference/pipelines/text/generator-steps.d.ts +46 -0
package/src/inference/pipelines/text/generator-steps.js +298 -207
package/src/inference/pipelines/text/generator.js +6 -23
package/src/inference/pipelines/text/init.d.ts +4 -0
package/src/inference/pipelines/text/init.js +134 -29
package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
package/src/inference/pipelines/text/kernel-trace.js +6 -0
package/src/inference/pipelines/text/layer.js +14 -9
package/src/inference/pipelines/text/linear-attention.d.ts +10 -0
package/src/inference/pipelines/text/linear-attention.js +80 -6
package/src/inference/pipelines/text/logits/gpu.js +10 -5
package/src/inference/pipelines/text/logits/index.js +10 -11
package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
package/src/inference/pipelines/text/logits/utils.js +9 -0
package/src/inference/pipelines/text/lora-apply.js +50 -32
package/src/inference/pipelines/text/model-load.js +279 -104
package/src/inference/pipelines/text/moe-cache.js +5 -4
package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
package/src/inference/pipelines/text/moe-cpu.js +42 -38
package/src/inference/pipelines/text/moe-gpu.js +110 -86
package/src/inference/pipelines/text/ops.js +90 -90
package/src/inference/pipelines/text/probes.js +9 -9
package/src/inference/pipelines/text/weights.js +17 -7
package/src/inference/pipelines/text.js +17 -1
package/src/inference/speculative.d.ts +2 -2
package/src/inference/speculative.js +4 -18
package/src/inference/test-harness.d.ts +1 -1
package/src/inference/test-harness.js +15 -5
package/src/inference/tokenizer.d.ts +0 -5
package/src/inference/tokenizer.js +4 -23
package/src/inference/tokenizers/bpe.js +9 -0
package/src/inference/tokenizers/bundled.js +176 -33
package/src/inference/tokenizers/sentencepiece.js +12 -0
package/src/loader/doppler-loader.js +38 -22
package/src/loader/dtype-utils.js +3 -44
package/src/loader/embedding-loader.js +7 -3
package/src/loader/experts/expert-cache.js +13 -6
package/src/loader/experts/expert-loader.js +10 -6
package/src/loader/final-weights-loader.js +8 -4
package/src/loader/layer-loader.js +2 -1
package/src/loader/loader-state.js +2 -2
package/src/loader/memory-monitor.js +8 -0
package/src/loader/multi-model-loader.d.ts +14 -0
package/src/loader/multi-model-loader.js +70 -24
package/src/loader/shard-cache.js +81 -12
package/src/loader/shard-resolver.js +25 -3
package/src/loader/tensors/tensor-loader.js +209 -144
package/src/loader/tensors/tensor-reader.js +76 -19
package/src/loader/weight-downcast.js +1 -1
package/src/memory/buffer-pool.d.ts +9 -1
package/src/memory/buffer-pool.js +109 -44
package/src/memory/unified-detect.js +1 -1
package/src/rules/inference/kernel-path.rules.json +24 -8
package/src/rules/rule-registry.js +25 -1
package/src/rules/tooling/command-runtime.rules.json +18 -0
package/src/storage/backends/opfs-store.js +68 -24
package/src/storage/downloader.js +364 -83
package/src/storage/index.d.ts +3 -0
package/src/storage/index.js +3 -0
package/src/storage/preflight.d.ts +2 -2
package/src/storage/preflight.js +24 -2
package/src/storage/quickstart-downloader.js +11 -5
package/src/storage/registry.js +10 -4
package/src/storage/reports.js +1 -1
package/src/storage/shard-manager.d.ts +15 -1
package/src/storage/shard-manager.js +51 -3
package/src/storage/source-artifact-store.d.ts +52 -0
package/src/storage/source-artifact-store.js +234 -0
package/src/tooling/command-api-constants.d.ts +9 -0
package/src/tooling/command-api-constants.js +9 -0
package/src/tooling/command-api-family-normalizers.d.ts +9 -0
package/src/tooling/command-api-family-normalizers.js +343 -0
package/src/tooling/command-api-helpers.d.ts +25 -0
package/src/tooling/command-api-helpers.js +262 -0
package/src/tooling/command-api.d.ts +27 -1
package/src/tooling/command-api.js +26 -473
package/src/tooling/command-envelope.js +4 -1
package/src/tooling/command-runner-shared.js +52 -18
package/src/tooling/lean-execution-contract.js +150 -3
package/src/tooling/node-browser-command-runner.d.ts +4 -0
package/src/tooling/node-browser-command-runner.js +218 -273
package/src/tooling/node-command-runner.js +44 -3
package/src/tooling/node-converter.js +27 -1
package/src/tooling/node-source-runtime.d.ts +1 -1
package/src/tooling/node-source-runtime.js +84 -3
package/src/tooling/node-webgpu.js +30 -105
package/src/tooling/opfs-cache.js +21 -4
package/src/tooling/runtime-input-composition.d.ts +38 -0
package/src/tooling/runtime-input-composition.js +86 -0
package/src/tooling/source-runtime-bundle.d.ts +40 -5
package/src/tooling/source-runtime-bundle.js +261 -34
package/src/tooling/source-runtime-materializer.d.ts +6 -0
package/src/tooling/source-runtime-materializer.js +93 -0
package/src/training/attention-backward.js +32 -17
package/src/training/autograd.js +80 -52
package/src/training/checkpoint-watch.d.ts +8 -0
package/src/training/checkpoint-watch.js +139 -0
package/src/training/checkpoint.d.ts +6 -1
package/src/training/checkpoint.js +46 -7
package/src/training/clip.js +2 -1
package/src/training/datasets/token-batch.js +20 -8
package/src/training/distillation/artifacts.d.ts +71 -0
package/src/training/distillation/artifacts.js +132 -0
package/src/training/distillation/checkpoint-watch.d.ts +10 -0
package/src/training/distillation/checkpoint-watch.js +58 -0
package/src/training/distillation/dataset.d.ts +59 -0
package/src/training/distillation/dataset.js +337 -0
package/src/training/distillation/eval.d.ts +34 -0
package/src/training/distillation/eval.js +310 -0
package/src/training/distillation/index.d.ts +29 -0
package/src/training/distillation/index.js +29 -0
package/src/training/distillation/runtime.d.ts +20 -0
package/src/training/distillation/runtime.js +121 -0
package/src/training/distillation/scoreboard.d.ts +6 -0
package/src/training/distillation/scoreboard.js +8 -0
package/src/training/distillation/stage-a.d.ts +45 -0
package/src/training/distillation/stage-a.js +338 -0
package/src/training/distillation/stage-b.d.ts +24 -0
package/src/training/distillation/stage-b.js +20 -0
package/src/training/distillation/student-fixture.d.ts +22 -0
package/src/training/distillation/student-fixture.js +846 -0
package/src/training/distillation/suite-data.d.ts +45 -0
package/src/training/distillation/suite-data.js +189 -0
package/src/training/index.d.ts +10 -0
package/src/training/index.js +10 -0
package/src/training/lora-pipeline.d.ts +40 -0
package/src/training/lora-pipeline.js +793 -0
package/src/training/lora.js +26 -12
package/src/training/loss.js +5 -6
package/src/training/objectives/cross_entropy.js +2 -5
package/src/training/objectives/distill_kd.js +4 -8
package/src/training/objectives/distill_triplet.js +4 -8
package/src/training/objectives/ul_stage2_base.js +4 -8
package/src/training/operator-artifacts.d.ts +62 -0
package/src/training/operator-artifacts.js +140 -0
package/src/training/operator-command.d.ts +5 -0
package/src/training/operator-command.js +455 -0
package/src/training/operator-eval.d.ts +48 -0
package/src/training/operator-eval.js +230 -0
package/src/training/operator-scoreboard.d.ts +5 -0
package/src/training/operator-scoreboard.js +44 -0
package/src/training/optimizer.js +19 -7
package/src/training/runner.d.ts +52 -0
package/src/training/runner.js +31 -5
package/src/training/suite.d.ts +112 -0
package/src/training/suite.js +24 -984
package/src/training/tensor-factory.d.ts +9 -0
package/src/training/tensor-factory.js +13 -0
package/src/training/trainer.js +3 -5
package/src/training/ul_dataset.js +3 -5
package/src/training/workloads.d.ts +164 -0
package/src/training/workloads.js +530 -0
package/src/version.js +1 -1
package/tools/convert-safetensors-node.js +22 -16
package/tools/doppler-cli.js +179 -63

package/src/gpu/kernels/rope.wgsl CHANGED Viewed

@@ -26,8 +26,8 @@ struct Uniforms {
     start_pos: u32,        // Starting position (for decode)
     rope_base: f32,        // Base frequency (default 10000)
     rope_scale: f32,       // Scaling factor for extended context
-    _pad0: u32,
-    _pad1: u32,
+    rotary_dim: u32,       // Rotary slice within head_dim
+    interleaved: u32,      // 1 = adjacent pairs, 0 = rotate-half
 }
 @group(0) @binding(0) var<uniform> u: Uniforms;
@@ -46,7 +46,8 @@ fn main(
     let start_pos = u.start_pos;
     // Global thread index (one thread per complex pair)
-    let half_dim = head_dim / 2u;
+    let rotary_dim = u.rotary_dim;
+    let half_dim = rotary_dim / 2u;
     let total_pairs = seq_len * num_heads * half_dim;
     let idx = global_id.x;
@@ -68,16 +69,18 @@ fn main(
     // Apply "rotate-half" layout: pair (x[i], x[i + half_dim])
     let base_idx = pos * num_heads * head_dim + head_idx * head_dim;
-    let x0 = input[base_idx + pair_idx];
-    let x1 = input[base_idx + pair_idx + half_dim];
+    let first_idx = select(pair_idx, pair_idx * 2u, u.interleaved == 1u);
+    let second_idx = select(pair_idx + half_dim, pair_idx * 2u + 1u, u.interleaved == 1u);
+    let x0 = input[base_idx + first_idx];
+    let x1 = input[base_idx + second_idx];
     // Apply rotation
     let y0 = x0 * cos_val - x1 * sin_val;
     let y1 = x0 * sin_val + x1 * cos_val;
     // Write back
-    input[base_idx + pair_idx] = y0;
-    input[base_idx + pair_idx + half_dim] = y1;
+    input[base_idx + first_idx] = y0;
+    input[base_idx + second_idx] = y1;
 }
 // Compute frequencies on-the-fly (no precomputation needed)
@@ -91,9 +94,10 @@ fn rope_compute_freqs(
     let start_pos = u.start_pos;
     let rope_base = u.rope_base;
     let rope_scale = u.rope_scale;
+    let rotary_dim = u.rotary_dim;
     let idx = global_id.x;
-    let half_dim = head_dim / 2u;
+    let half_dim = rotary_dim / 2u;
     let total_pairs = seq_len * num_heads * half_dim;
     if (idx >= total_pairs) {
@@ -109,7 +113,7 @@ fn rope_compute_freqs(
     let actual_pos = f32(start_pos + pos) / rope_scale;
     // Compute frequency: 1 / (base^(2*pair_idx/head_dim))
-    let exponent = f32(pair_idx * 2u) / f32(head_dim);
+    let exponent = f32(pair_idx * 2u) / f32(rotary_dim);
     let freq = 1.0 / pow(rope_base, exponent);
     let theta = actual_pos * freq;
@@ -118,12 +122,14 @@ fn rope_compute_freqs(
     // Apply "rotate-half" layout: pair (x[i], x[i + half_dim])
     let base_idx = pos * num_heads * head_dim + head_idx * head_dim;
-    let x0 = input[base_idx + pair_idx];
-    let x1 = input[base_idx + pair_idx + half_dim];
+    let first_idx = select(pair_idx, pair_idx * 2u, u.interleaved == 1u);
+    let second_idx = select(pair_idx + half_dim, pair_idx * 2u + 1u, u.interleaved == 1u);
+    let x0 = input[base_idx + first_idx];
+    let x1 = input[base_idx + second_idx];
     // Apply rotation
-    input[base_idx + pair_idx] = x0 * cos_val - x1 * sin_val;
-    input[base_idx + pair_idx + half_dim] = x0 * sin_val + x1 * cos_val;
+    input[base_idx + first_idx] = x0 * cos_val - x1 * sin_val;
+    input[base_idx + second_idx] = x0 * sin_val + x1 * cos_val;
 }
 // Apply RoPE to both Q and K in one pass
@@ -138,10 +144,11 @@ fn rope_qk(
     let start_pos = u.start_pos;
     let rope_base = u.rope_base;
     let rope_scale = u.rope_scale;
+    let rotary_dim = u.rotary_dim;
     let idx = global_id.x;
     // Each thread handles one Q-K pair at one dimension pair
-    let half_dim = head_dim / 2u;
+    let half_dim = rotary_dim / 2u;
     let total_pairs = seq_len * num_heads * half_dim;
     if (idx >= total_pairs) {
@@ -156,7 +163,7 @@ fn rope_qk(
     let actual_pos = f32(start_pos + pos) / rope_scale;
     // Compute frequency
-    let exponent = f32(pair_idx * 2u) / f32(head_dim);
+    let exponent = f32(pair_idx * 2u) / f32(rotary_dim);
     let freq = 1.0 / pow(rope_base, exponent);
     let theta = actual_pos * freq;
@@ -168,16 +175,18 @@ fn rope_qk(
     let k_base_idx = q_base_idx + head_dim;  // K starts after Q
     // Process Q
-    let q0 = input[q_base_idx + pair_idx];
-    let q1 = input[q_base_idx + pair_idx + half_dim];
-    input[q_base_idx + pair_idx] = q0 * cos_val - q1 * sin_val;
-    input[q_base_idx + pair_idx + half_dim] = q0 * sin_val + q1 * cos_val;
+    let first_idx = select(pair_idx, pair_idx * 2u, u.interleaved == 1u);
+    let second_idx = select(pair_idx + half_dim, pair_idx * 2u + 1u, u.interleaved == 1u);
+    let q0 = input[q_base_idx + first_idx];
+    let q1 = input[q_base_idx + second_idx];
+    input[q_base_idx + first_idx] = q0 * cos_val - q1 * sin_val;
+    input[q_base_idx + second_idx] = q0 * sin_val + q1 * cos_val;
     // Process K
-    let k0 = input[k_base_idx + pair_idx];
-    let k1 = input[k_base_idx + pair_idx + half_dim];
-    input[k_base_idx + pair_idx] = k0 * cos_val - k1 * sin_val;
-    input[k_base_idx + pair_idx + half_dim] = k0 * sin_val + k1 * cos_val;
+    let k0 = input[k_base_idx + first_idx];
+    let k1 = input[k_base_idx + second_idx];
+    input[k_base_idx + first_idx] = k0 * cos_val - k1 * sin_val;
+    input[k_base_idx + second_idx] = k0 * sin_val + k1 * cos_val;
 }
 // Precompute frequency table (run once at init)
@@ -190,9 +199,10 @@ fn precompute_freqs(
     let seq_len = u.seq_len;  // maxSeqLen for precomputation
     let rope_base = u.rope_base;
     let rope_scale = u.rope_scale;
+    let rotary_dim = u.rotary_dim;
     let idx = global_id.x;
-    let half_dim = head_dim / 2u;
+    let half_dim = rotary_dim / 2u;
     let total_elements = seq_len * half_dim;
     if (idx >= total_elements) {
@@ -203,7 +213,7 @@ fn precompute_freqs(
     let dim_idx = idx % half_dim;
     let actual_pos = f32(pos) / rope_scale;
-    let exponent = f32(dim_idx * 2u) / f32(head_dim);
+    let exponent = f32(dim_idx * 2u) / f32(rotary_dim);
     let freq = 1.0 / pow(rope_base, exponent);
     let theta = actual_pos * freq;
@@ -218,6 +228,7 @@ fn rope_ntk_scaled(
     @builtin(global_invocation_id) global_id: vec3<u32>
 ) {
     let head_dim = u.head_dim;
+    let rotary_dim = u.rotary_dim;
     let num_heads = u.num_heads;
     let seq_len = u.seq_len;
     let start_pos = u.start_pos;
@@ -225,7 +236,7 @@ fn rope_ntk_scaled(
     let rope_scale = u.rope_scale;
     let idx = global_id.x;
-    let half_dim = head_dim / 2u;
+    let half_dim = rotary_dim / 2u;
     let total_pairs = seq_len * num_heads * half_dim;
     if (idx >= total_pairs) {
@@ -234,7 +245,7 @@ fn rope_ntk_scaled(
     // NTK scaling: increase base proportionally to scale factor
     // This preserves high-frequency components better than linear interpolation
-    rope_base = rope_base * pow(rope_scale, f32(head_dim) / (f32(head_dim) - 2.0));
+    rope_base = rope_base * pow(rope_scale, f32(rotary_dim) / (f32(rotary_dim) - 2.0));
     let pos = idx / (num_heads * half_dim);
     let remainder = idx % (num_heads * half_dim);
@@ -243,7 +254,7 @@ fn rope_ntk_scaled(
     let actual_pos = f32(start_pos + pos);
-    let exponent = f32(pair_idx * 2u) / f32(head_dim);
+    let exponent = f32(pair_idx * 2u) / f32(rotary_dim);
     let freq = 1.0 / pow(rope_base, exponent);
     let theta = actual_pos * freq;
@@ -251,11 +262,13 @@ fn rope_ntk_scaled(
     let sin_val = sin(theta);
     let base_idx = pos * num_heads * head_dim + head_idx * head_dim;
-    let x0 = input[base_idx + pair_idx];
-    let x1 = input[base_idx + pair_idx + half_dim];
+    let first_idx = select(pair_idx, pair_idx * 2u, u.interleaved == 1u);
+    let second_idx = select(pair_idx + half_dim, pair_idx * 2u + 1u, u.interleaved == 1u);
+    let x0 = input[base_idx + first_idx];
+    let x1 = input[base_idx + second_idx];
-    input[base_idx + pair_idx] = x0 * cos_val - x1 * sin_val;
-    input[base_idx + pair_idx + half_dim] = x0 * sin_val + x1 * cos_val;
+    input[base_idx + first_idx] = x0 * cos_val - x1 * sin_val;
+    input[base_idx + second_idx] = x0 * sin_val + x1 * cos_val;
 }
 // YaRN-style RoPE with attention scaling
@@ -265,6 +278,7 @@ fn rope_yarn(
     @builtin(global_invocation_id) global_id: vec3<u32>
 ) {
     let head_dim = u.head_dim;
+    let rotary_dim = u.rotary_dim;
     let num_heads = u.num_heads;
     let seq_len = u.seq_len;
     let start_pos = u.start_pos;
@@ -272,7 +286,7 @@ fn rope_yarn(
     let rope_scale = u.rope_scale;
     let idx = global_id.x;
-    let half_dim = head_dim / 2u;
+    let half_dim = rotary_dim / 2u;
     let total_pairs = seq_len * num_heads * half_dim;
     if (idx >= total_pairs) {
@@ -292,7 +306,7 @@ fn rope_yarn(
     let alpha: f32 = 1.0;
     // Compute original frequency
-    let exponent = f32(pair_idx * 2u) / f32(head_dim);
+    let exponent = f32(pair_idx * 2u) / f32(rotary_dim);
     let orig_freq = 1.0 / pow(rope_base, exponent);
     // Compute wavelength
@@ -300,8 +314,8 @@ fn rope_yarn(
     // Interpolation factor based on wavelength
     var ramp: f32;
-    let low_wavelength = f32(head_dim) / beta_fast;
-    let high_wavelength = f32(head_dim) / beta_slow;
+    let low_wavelength = f32(rotary_dim) / beta_fast;
+    let high_wavelength = f32(rotary_dim) / beta_slow;
     if (wavelength < low_wavelength) {
         ramp = 0.0;  // No interpolation for high frequencies
@@ -320,9 +334,11 @@ fn rope_yarn(
     let sin_val = sin(theta);
     let base_idx = pos * num_heads * head_dim + head_idx * head_dim;
-    let x0 = input[base_idx + pair_idx];
-    let x1 = input[base_idx + pair_idx + half_dim];
+    let first_idx = select(pair_idx, pair_idx * 2u, u.interleaved == 1u);
+    let second_idx = select(pair_idx + half_dim, pair_idx * 2u + 1u, u.interleaved == 1u);
+    let x0 = input[base_idx + first_idx];
+    let x1 = input[base_idx + second_idx];
-    input[base_idx + pair_idx] = x0 * cos_val - x1 * sin_val;
-    input[base_idx + pair_idx + half_dim] = x0 * sin_val + x1 * cos_val;
+    input[base_idx + first_idx] = x0 * cos_val - x1 * sin_val;
+    input[base_idx + second_idx] = x0 * sin_val + x1 * cos_val;
 }

package/src/gpu/kernels/sample.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { getDevice, getKernelCapabilities } from '../device.js';
-import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
+import { acquireBuffer, readBufferSlice, releaseBuffer } from '../../memory/buffer-pool.js';
 import { WORKGROUP_SIZES } from './constants.js';
 import { createPipeline, createUniformBufferWithView, getOrCreateBindGroupLayout } from './utils.js';
 import { allowReadback } from '../perf-guards.js';
@@ -156,18 +156,19 @@ function ensureOutputBufferSize(outputBuffer, minBytes, label) {
   }
 }
-function readTokenFromOutput(device, outputBuffer, outputIndex, label) {
-  const stagingBuffer = device.createBuffer({
-    label,
-    size: 4,
-    usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
-  });
-  const copyEncoder = device.createCommandEncoder({ label: `${label}_copy` });
-  copyEncoder.copyBufferToBuffer(outputBuffer, outputIndex * 4, stagingBuffer, 0, 4);
-  device.queue.submit([copyEncoder.finish()]);
+async function readTokenFromOutput(outputBuffer, outputIndex) {
+  return new Uint32Array(await readBufferSlice(outputBuffer, outputIndex * 4, 4))[0];
+}
-  return stagingBuffer;
+function cleanupRunResources(uniformBuffer, ownedBuffers) {
+  if (uniformBuffer) {
+    uniformBuffer.destroy();
+  }
+  for (const buffer of ownedBuffers) {
+    if (buffer) {
+      releaseBuffer(buffer);
+    }
+  }
 }
 async function executeArgmaxRun(logits, vocabSize, options) {
@@ -238,20 +239,14 @@ async function executeArgmaxRun(logits, vocabSize, options) {
   device.queue.submit([encoder.finish()]);
-  const stagingBuffer = readTokenFromOutput(device, outputBuffer, outputIndex, 'argmax_staging');
-  await stagingBuffer.mapAsync(GPUMapMode.READ);
-  const tokenId = new Uint32Array(stagingBuffer.getMappedRange())[0];
-  stagingBuffer.unmap();
-  stagingBuffer.destroy();
-  uniformBuffer.destroy();
-  releaseBuffer(tempLogits);
-  releaseBuffer(tempIndices);
-  if (ownsOutputBuffer) {
-    releaseBuffer(outputBuffer);
+  try {
+    return await readTokenFromOutput(outputBuffer, outputIndex);
+  } finally {
+    cleanupRunResources(
+      uniformBuffer,
+      [tempLogits, tempIndices, ownsOutputBuffer ? outputBuffer : null]
+    );
   }
-  return tokenId;
 }
 async function executeArgmaxRecord(recorder, logits, vocabSize, options) {
@@ -428,20 +423,14 @@ export async function runGPUSample(
   device.queue.submit([encoder.finish()]);
-  const stagingBuffer = readTokenFromOutput(device, outputBuffer, outputIndex, 'sample_staging');
-  await stagingBuffer.mapAsync(GPUMapMode.READ);
-  const tokenId = new Uint32Array(stagingBuffer.getMappedRange())[0];
-  stagingBuffer.unmap();
-  stagingBuffer.destroy();
-  uniformBuffer.destroy();
-  releaseBuffer(topkLogits);
-  releaseBuffer(topkIndices);
-  if (ownsOutputBuffer) {
-    releaseBuffer(outputBuffer);
+  try {
+    return await readTokenFromOutput(outputBuffer, outputIndex);
+  } finally {
+    cleanupRunResources(
+      uniformBuffer,
+      [topkLogits, topkIndices, ownsOutputBuffer ? outputBuffer : null]
+    );
   }
-  return tokenId;
 }

package/src/gpu/kernels/sana_linear_attention.js CHANGED Viewed

@@ -29,7 +29,6 @@ async function runSummary(target, query, key, value, summaryBuffer, uniforms, va
 }
 async function runApply(target, query, summaryBuffer, outputBuffer, uniforms, variant) {
-  const outputSize = uniforms.num_tokens * uniforms.hidden_size;
   await unifiedKernelWrapper(
     'sana_linear_attention_apply',
     target,
@@ -45,7 +44,7 @@ async function runApply(target, query, summaryBuffer, outputBuffer, uniforms, va
       _pad1: 0,
       _pad2: 0,
     },
-    Math.ceil(outputSize / WORKGROUP_SIZES.DEFAULT)
+    [Math.ceil(uniforms.hidden_size / WORKGROUP_SIZES.DEFAULT), uniforms.num_tokens, 1]
   );
 }
@@ -65,6 +64,8 @@ async function _sanaLinearAttention(target, query, key, value, options = {}) {
     outputBuffer = null,
     summaryBuffer = null,
   } = options;
+  const ownsSummary = summaryBuffer == null;
+  const ownsOutput = outputBuffer == null;
   if (
     !Number.isFinite(numHeads) ||
@@ -99,18 +100,24 @@ async function _sanaLinearAttention(target, query, key, value, options = {}) {
     eps,
   };
-  await runSummary(target, query, key, value, temporarySummary, uniforms, variant);
-  await runApply(target, query, temporarySummary, output, uniforms, variant);
-  if (!summaryBuffer) {
-    if (recorder) {
-      recorder.trackTemporaryBuffer(temporarySummary);
-    } else {
-      releaseBuffer(temporarySummary);
+  try {
+    await runSummary(target, query, key, value, temporarySummary, uniforms, variant);
+    await runApply(target, query, temporarySummary, output, uniforms, variant);
+    return createTensor(output, query.dtype, [numTokens, hiddenSize], 'sana_linear_attention_output');
+  } catch (error) {
+    if (ownsOutput) {
+      releaseBuffer(output);
+    }
+    throw error;
+  } finally {
+    if (ownsSummary) {
+      if (recorder) {
+        recorder.trackTemporaryBuffer(temporarySummary);
+      } else {
+        releaseBuffer(temporarySummary);
+      }
     }
   }
-  return createTensor(output, query.dtype, [numTokens, hiddenSize], 'sana_linear_attention_output');
 }
 export async function runSanaLinearAttention(query, key, value, options = {}) {

package/src/gpu/kernels/sana_linear_attention_apply.wgsl CHANGED Viewed

@@ -18,14 +18,13 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
-    let total = u.num_tokens * u.hidden_size;
-    if (idx >= total) {
+    let hidden = gid.x;
+    let token = gid.y;
+    if (token >= u.num_tokens || hidden >= u.hidden_size) {
         return;
     }
-    let token = idx / u.hidden_size;
-    let hidden = idx - token * u.hidden_size;
+    let idx = token * u.hidden_size + hidden;
     let head = hidden / u.head_dim;
     let dim = hidden - head * u.head_dim;
     let rows_per_head = u.head_dim + 1u;

package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl CHANGED Viewed

@@ -20,14 +20,13 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
-    let total = u.num_tokens * u.hidden_size;
-    if (idx >= total) {
+    let hidden = gid.x;
+    let token = gid.y;
+    if (token >= u.num_tokens || hidden >= u.hidden_size) {
         return;
     }
-    let token = idx / u.hidden_size;
-    let hidden = idx - token * u.hidden_size;
+    let idx = token * u.hidden_size + hidden;
     let head = hidden / u.head_dim;
     let dim = hidden - head * u.head_dim;
     let rows_per_head = u.head_dim + 1u;

package/src/gpu/kernels/sana_linear_attention_summary.wgsl CHANGED Viewed

@@ -33,6 +33,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     var acc: f32 = 0.0;
     for (var token: u32 = 0u; token < u.num_tokens; token = token + 1u) {
+        let query_value = query[token * u.hidden_size + hidden_base + col];
         let key_idx = token * u.hidden_size + hidden_base + col;
         let key_value = max(key[key_idx], 0.0);
         let value_value = select(
@@ -40,6 +41,9 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
             1.0,
             row == u.head_dim
         );
+        if (u.hidden_size == 0u) {
+            acc = acc + query_value;
+        }
         acc = acc + value_value * key_value;
     }

package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl CHANGED Viewed

@@ -35,6 +35,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     var acc: f32 = 0.0;
     for (var token: u32 = 0u; token < u.num_tokens; token = token + 1u) {
+        let query_value = f32(query[token * u.hidden_size + hidden_base + col]);
         let key_idx = token * u.hidden_size + hidden_base + col;
         let key_value = max(f32(key[key_idx]), 0.0);
         let value_value = select(
@@ -42,6 +43,9 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
             1.0,
             row == u.head_dim
         );
+        if (u.hidden_size == 0u) {
+            acc = acc + query_value;
+        }
         acc = acc + value_value * key_value;
     }

package/src/gpu/kernels/scale.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { acquireBuffer } from '../../memory/buffer-pool.js';
+import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
 import { createTensor, dtypeBytes } from '../tensor.js';
 import { WORKGROUP_SIZES } from './constants.js';
 import { unifiedKernelWrapper } from './utils.js';
@@ -6,6 +6,7 @@ import { selectRuleValue } from './rule-registry.js';
 async function _scale(target, input, scale, options = {}) {
   const { count, outputBuffer = null, inplace = false } = options;
+  const ownsOutput = !inplace && outputBuffer == null;
   const bytesPerElement = dtypeBytes(input.dtype);
   const inferredCount = count ?? Math.floor(input.buffer.size / bytesPerElement);
@@ -16,16 +17,22 @@ async function _scale(target, input, scale, options = {}) {
   const bindings = inplace ? [outputBuf, outputBuf] : [input, outputBuf];
-  await unifiedKernelWrapper(
-    'scale',
-    target,
-    variant,
-    bindings,
-    { size: inferredCount, scale },
-    Math.ceil(inferredCount / WORKGROUP_SIZES.DEFAULT)
-  );
-  return createTensor(outputBuf, input.dtype, [...input.shape], 'scale_output');
+  try {
+    await unifiedKernelWrapper(
+      'scale',
+      target,
+      variant,
+      bindings,
+      { size: inferredCount, scale },
+      Math.ceil(inferredCount / WORKGROUP_SIZES.DEFAULT)
+    );
+    return createTensor(outputBuf, input.dtype, [...input.shape], 'scale_output');
+  } catch (error) {
+    if (ownsOutput) {
+      releaseBuffer(outputBuf);
+    }
+    throw error;
+  }
 }
 export async function runScale(input, scale, options = {}) {

package/src/gpu/kernels/shader-cache.js CHANGED Viewed

@@ -138,8 +138,10 @@ export async function compileShader(
     code: source,
   });
-  // Check for compilation errors
-  const compilationInfo = await module.getCompilationInfo();
+  // Check for compilation errors (getCompilationInfo not available in all WebGPU providers)
+  const compilationInfo = typeof module.getCompilationInfo === 'function'
+    ? await module.getCompilationInfo()
+    : { messages: [] };
   if (compilationInfo.messages.length > 0) {
     for (const msg of compilationInfo.messages) {
       if (msg.type === 'error') {

package/src/gpu/kernels/silu.d.ts CHANGED Viewed

@@ -16,6 +16,7 @@ export interface SiLUOptions extends OutputBufferOptions {
   size?: number | null;
   gate?: Tensor | null;
   gateActivation?: 'silu' | 'sigmoid';
+  inputActivation?: 'silu' | 'identity';
   useVec4?: boolean;
   biasOffset?: number;
   swigluLimit: number | null;