@simulatte/doppler 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +16 -23
- package/package.json +30 -32
- package/src/adapters/adapter-registry.js +12 -1
- package/src/adapters/lora-loader.js +23 -6
- package/src/bridge/extension-client.d.ts +5 -0
- package/src/bridge/extension-client.js +40 -0
- package/src/bridge/index.d.ts +2 -1
- package/src/bridge/index.js +6 -4
- package/src/browser/browser-converter.js +31 -1
- package/src/browser/file-picker.js +6 -0
- package/src/browser/safetensors-parser-browser.js +84 -1
- package/src/browser/shard-io-browser.js +2 -2
- package/src/browser/tensor-source-download.js +8 -2
- package/src/browser/tensor-source-http.d.ts +1 -0
- package/src/browser/tensor-source-http.js +5 -1
- package/src/client/doppler-api.browser.js +20 -4
- package/src/client/doppler-api.js +19 -3
- package/src/client/doppler-provider/generation.js +12 -0
- package/src/client/doppler-provider/model-manager.d.ts +10 -0
- package/src/client/doppler-provider/model-manager.js +91 -19
- package/src/client/doppler-provider/source-runtime.d.ts +2 -1
- package/src/client/doppler-provider/source-runtime.js +132 -13
- package/src/client/doppler-registry.json +5 -20
- package/src/config/backward-registry-loader.js +17 -2
- package/src/config/execution-v0-contract-check.js +113 -15
- package/src/config/kernel-path-contract-check.js +57 -29
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +18 -36
- package/src/config/kernels/kernel-ref-digests.js +1 -1
- package/src/config/kernels/registry.js +14 -1
- package/src/config/kernels/registry.json +81 -5
- package/src/config/loader.d.ts +1 -1
- package/src/config/loader.js +15 -2
- package/src/config/merge-contract-check.js +66 -4
- package/src/config/merge-helpers.js +128 -7
- package/src/config/merge.d.ts +1 -0
- package/src/config/merge.js +10 -0
- package/src/config/param-validator.js +47 -2
- package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
- package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +43 -8
- package/src/config/presets/models/gemma2.json +3 -2
- package/src/config/presets/models/gemma3.json +2 -0
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
- package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
- package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
- package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
- package/src/config/runtime.js +6 -1
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +5 -0
- package/src/config/schema/doppler.schema.js +16 -21
- package/src/config/schema/inference-defaults.schema.js +3 -3
- package/src/config/schema/kernel-path.schema.d.ts +5 -1
- package/src/config/schema/kernel-thresholds.schema.js +12 -4
- package/src/config/schema/manifest.schema.d.ts +3 -2
- package/src/config/schema/manifest.schema.js +17 -4
- package/src/config/schema/storage.schema.js +1 -1
- package/src/config/training-defaults.js +30 -22
- package/src/converter/conversion-plan.js +104 -11
- package/src/converter/core.d.ts +7 -0
- package/src/converter/core.js +16 -9
- package/src/converter/execution-v0-manifest.js +4 -1
- package/src/converter/index.d.ts +1 -0
- package/src/converter/index.js +1 -0
- package/src/converter/manifest-inference.js +50 -29
- package/src/converter/parsers/diffusion.js +0 -3
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +40 -16
- package/src/converter/quantizer.js +19 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/shard-packer.d.ts +1 -1
- package/src/converter/shard-packer.js +4 -1
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/config.js +123 -11
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/debug/signals.js +7 -1
- package/src/debug/tensor.d.ts +2 -0
- package/src/debug/tensor.js +13 -2
- package/src/distribution/p2p-control-plane.js +52 -12
- package/src/distribution/p2p-observability.js +43 -7
- package/src/distribution/p2p-webrtc-browser.js +20 -0
- package/src/distribution/shard-delivery.js +83 -27
- package/src/formats/gguf/types.js +33 -16
- package/src/formats/rdrr/groups.d.ts +12 -4
- package/src/formats/rdrr/groups.js +3 -6
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +53 -3
- package/src/formats/rdrr/types.d.ts +2 -1
- package/src/gpu/command-recorder.js +86 -61
- package/src/gpu/device.d.ts +1 -0
- package/src/gpu/device.js +73 -19
- package/src/gpu/kernel-tuner/benchmarks.js +326 -316
- package/src/gpu/kernel-tuner/cache.js +71 -4
- package/src/gpu/kernel-tuner/tuner.js +22 -4
- package/src/gpu/kernels/attention.js +15 -34
- package/src/gpu/kernels/backward/adam.js +62 -58
- package/src/gpu/kernels/backward/attention_backward.js +257 -169
- package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
- package/src/gpu/kernels/cast.js +191 -149
- package/src/gpu/kernels/check-stop.js +33 -44
- package/src/gpu/kernels/conv2d.js +27 -17
- package/src/gpu/kernels/cross_entropy_loss.js +21 -15
- package/src/gpu/kernels/depthwise_conv2d.js +36 -26
- package/src/gpu/kernels/dequant.js +178 -126
- package/src/gpu/kernels/energy.d.ts +3 -21
- package/src/gpu/kernels/energy.js +111 -88
- package/src/gpu/kernels/feature-check.js +1 -1
- package/src/gpu/kernels/fused_ffn.js +84 -65
- package/src/gpu/kernels/fused_matmul_residual.js +56 -33
- package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
- package/src/gpu/kernels/gather.js +33 -15
- package/src/gpu/kernels/gelu.js +19 -11
- package/src/gpu/kernels/grouped_pointwise_conv2d.js +33 -23
- package/src/gpu/kernels/groupnorm.js +34 -23
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/kv-quantize.js +5 -2
- package/src/gpu/kernels/layernorm.js +35 -19
- package/src/gpu/kernels/logit-merge.js +5 -3
- package/src/gpu/kernels/matmul-selection.js +47 -4
- package/src/gpu/kernels/matmul.d.ts +2 -0
- package/src/gpu/kernels/matmul.js +59 -40
- package/src/gpu/kernels/modulate.js +23 -15
- package/src/gpu/kernels/moe.js +221 -175
- package/src/gpu/kernels/pixel_shuffle.js +22 -14
- package/src/gpu/kernels/relu.js +18 -10
- package/src/gpu/kernels/repeat_channels.js +25 -17
- package/src/gpu/kernels/residual.js +37 -27
- package/src/gpu/kernels/rmsnorm.js +66 -43
- package/src/gpu/kernels/rope.js +3 -0
- package/src/gpu/kernels/sample.js +27 -38
- package/src/gpu/kernels/sana_linear_attention.js +18 -10
- package/src/gpu/kernels/scale.js +18 -11
- package/src/gpu/kernels/shader-cache.js +4 -2
- package/src/gpu/kernels/silu.js +120 -72
- package/src/gpu/kernels/softmax.js +44 -25
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/kernels/split_qkv.js +23 -13
- package/src/gpu/kernels/transpose.js +18 -10
- package/src/gpu/kernels/transpose.wgsl +5 -3
- package/src/gpu/kernels/upsample2d.js +21 -13
- package/src/gpu/kernels/utils.js +20 -13
- package/src/gpu/partitioned-buffer-pool.js +10 -2
- package/src/gpu/perf-guards.js +2 -9
- package/src/gpu/profiler.js +27 -22
- package/src/gpu/readback-utils.d.ts +16 -0
- package/src/gpu/readback-utils.js +41 -0
- package/src/gpu/submit-tracker.js +13 -0
- package/src/gpu/uniform-cache.d.ts +1 -0
- package/src/gpu/uniform-cache.js +30 -9
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/hotswap/intent-bundle.js +6 -0
- package/src/hotswap/manifest.d.ts +10 -1
- package/src/hotswap/manifest.js +12 -2
- package/src/hotswap/runtime.js +30 -8
- package/src/index-browser.d.ts +44 -0
- package/src/index-browser.js +14 -0
- package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
- package/src/inference/browser-harness-contract-helpers.js +28 -0
- package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
- package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
- package/src/inference/browser-harness-model-helpers.d.ts +16 -0
- package/src/inference/browser-harness-model-helpers.js +217 -0
- package/src/inference/browser-harness-report-helpers.d.ts +7 -0
- package/src/inference/browser-harness-report-helpers.js +42 -0
- package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
- package/src/inference/browser-harness-runtime-helpers.js +415 -0
- package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
- package/src/inference/browser-harness-suite-helpers.js +268 -0
- package/src/inference/browser-harness-text-helpers.d.ts +27 -0
- package/src/inference/browser-harness-text-helpers.js +788 -0
- package/src/inference/browser-harness.d.ts +8 -0
- package/src/inference/browser-harness.js +149 -1996
- package/src/inference/kv-cache/base.js +140 -94
- package/src/inference/kv-cache/tiered.js +5 -3
- package/src/inference/moe-router.js +88 -56
- package/src/inference/multi-model-network.js +5 -3
- package/src/inference/network-evolution.d.ts +11 -2
- package/src/inference/network-evolution.js +20 -21
- package/src/inference/pipelines/context.d.ts +3 -0
- package/src/inference/pipelines/context.js +142 -2
- package/src/inference/pipelines/diffusion/helpers.js +10 -2
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
- package/src/inference/pipelines/diffusion/vae.js +3 -7
- package/src/inference/pipelines/energy/pipeline.js +27 -21
- package/src/inference/pipelines/energy/quintel.d.ts +5 -0
- package/src/inference/pipelines/energy/quintel.js +11 -0
- package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
- package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
- package/src/inference/pipelines/text/attention/projections.js +192 -112
- package/src/inference/pipelines/text/attention/record.js +77 -14
- package/src/inference/pipelines/text/attention/run.js +112 -14
- package/src/inference/pipelines/text/config.js +17 -4
- package/src/inference/pipelines/text/embed.js +2 -8
- package/src/inference/pipelines/text/execution-plan.js +46 -23
- package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
- package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
- package/src/inference/pipelines/text/execution-v0.js +62 -1013
- package/src/inference/pipelines/text/generator-runtime.js +5 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +52 -0
- package/src/inference/pipelines/text/generator-steps.js +340 -221
- package/src/inference/pipelines/text/generator.js +56 -40
- package/src/inference/pipelines/text/init.d.ts +13 -0
- package/src/inference/pipelines/text/init.js +94 -25
- package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
- package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
- package/src/inference/pipelines/text/kernel-trace.js +6 -0
- package/src/inference/pipelines/text/layer.js +4 -9
- package/src/inference/pipelines/text/linear-attention.d.ts +15 -0
- package/src/inference/pipelines/text/linear-attention.js +113 -9
- package/src/inference/pipelines/text/logits/gpu.js +12 -7
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +13 -12
- package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
- package/src/inference/pipelines/text/logits/utils.js +9 -0
- package/src/inference/pipelines/text/lora-apply.js +50 -32
- package/src/inference/pipelines/text/model-load.js +282 -104
- package/src/inference/pipelines/text/moe-cache.js +5 -4
- package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
- package/src/inference/pipelines/text/moe-cpu.js +42 -38
- package/src/inference/pipelines/text/moe-gpu.js +110 -86
- package/src/inference/pipelines/text/ops.js +90 -90
- package/src/inference/pipelines/text/probes.js +9 -9
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/weights.js +17 -7
- package/src/inference/pipelines/text.js +13 -1
- package/src/inference/speculative.d.ts +2 -2
- package/src/inference/speculative.js +4 -18
- package/src/inference/test-harness.d.ts +1 -1
- package/src/inference/test-harness.js +17 -7
- package/src/inference/tokenizer.d.ts +0 -5
- package/src/inference/tokenizer.js +4 -23
- package/src/inference/tokenizers/bpe.js +9 -0
- package/src/inference/tokenizers/bundled.js +20 -0
- package/src/inference/tokenizers/sentencepiece.js +12 -0
- package/src/loader/doppler-loader.js +38 -22
- package/src/loader/dtype-utils.js +3 -44
- package/src/loader/embedding-loader.js +7 -3
- package/src/loader/experts/expert-cache.js +13 -6
- package/src/loader/experts/expert-loader.js +10 -6
- package/src/loader/final-weights-loader.js +10 -4
- package/src/loader/layer-loader.js +2 -1
- package/src/loader/loader-state.js +2 -2
- package/src/loader/memory-monitor.js +8 -0
- package/src/loader/multi-model-loader.d.ts +14 -0
- package/src/loader/multi-model-loader.js +70 -24
- package/src/loader/shard-cache.js +84 -14
- package/src/loader/shard-resolver.js +25 -3
- package/src/loader/tensors/tensor-loader.js +214 -144
- package/src/loader/tensors/tensor-reader.js +76 -19
- package/src/loader/weight-downcast.js +1 -1
- package/src/memory/buffer-pool.d.ts +9 -1
- package/src/memory/buffer-pool.js +109 -44
- package/src/memory/unified-detect.js +1 -1
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +24 -8
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.js +27 -1
- package/src/storage/backends/opfs-store.js +68 -24
- package/src/storage/downloader.js +365 -83
- package/src/storage/index.d.ts +3 -0
- package/src/storage/index.js +3 -0
- package/src/storage/preflight.d.ts +2 -2
- package/src/storage/preflight.js +24 -2
- package/src/storage/quickstart-downloader.js +11 -5
- package/src/storage/registry.js +10 -4
- package/src/storage/reports.js +1 -1
- package/src/storage/shard-manager.d.ts +15 -1
- package/src/storage/shard-manager.js +55 -6
- package/src/storage/source-artifact-store.d.ts +52 -0
- package/src/storage/source-artifact-store.js +234 -0
- package/src/tooling/command-api-constants.d.ts +9 -0
- package/src/tooling/command-api-constants.js +9 -0
- package/src/tooling/command-api-family-normalizers.d.ts +9 -0
- package/src/tooling/command-api-family-normalizers.js +343 -0
- package/src/tooling/command-api-helpers.d.ts +25 -0
- package/src/tooling/command-api-helpers.js +262 -0
- package/src/tooling/command-api.js +16 -602
- package/src/tooling/command-envelope.js +4 -1
- package/src/tooling/command-runner-shared.js +52 -18
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/lean-execution-contract.js +150 -3
- package/src/tooling/node-browser-command-runner.js +161 -271
- package/src/tooling/node-command-runner.js +29 -3
- package/src/tooling/node-converter.js +30 -1
- package/src/tooling/node-source-runtime.d.ts +1 -1
- package/src/tooling/node-source-runtime.js +120 -3
- package/src/tooling/node-webgpu.js +24 -21
- package/src/tooling/opfs-cache.js +21 -4
- package/src/tooling/runtime-input-composition.d.ts +38 -0
- package/src/tooling/runtime-input-composition.js +86 -0
- package/src/tooling/source-runtime-bundle.d.ts +40 -5
- package/src/tooling/source-runtime-bundle.js +261 -34
- package/src/tooling/source-runtime-materializer.d.ts +6 -0
- package/src/tooling/source-runtime-materializer.js +93 -0
- package/src/training/attention-backward.js +32 -17
- package/src/training/autograd.js +80 -52
- package/src/training/checkpoint-watch.d.ts +2 -1
- package/src/training/checkpoint-watch.js +39 -6
- package/src/training/checkpoint.js +40 -11
- package/src/training/clip.js +2 -1
- package/src/training/datasets/token-batch.js +20 -8
- package/src/training/distillation/checkpoint-watch.js +1 -0
- package/src/training/distillation/student-fixture.d.ts +22 -0
- package/src/training/distillation/student-fixture.js +846 -0
- package/src/training/distillation/suite-data.d.ts +45 -0
- package/src/training/distillation/suite-data.js +189 -0
- package/src/training/lora-pipeline.js +4 -7
- package/src/training/lora.js +26 -12
- package/src/training/loss.js +5 -6
- package/src/training/objectives/cross_entropy.js +2 -5
- package/src/training/objectives/distill_kd.js +4 -8
- package/src/training/objectives/distill_triplet.js +4 -8
- package/src/training/objectives/ul_stage2_base.js +4 -8
- package/src/training/operator-command.js +2 -0
- package/src/training/optimizer.js +19 -7
- package/src/training/runner.js +2 -1
- package/src/training/suite.js +18 -978
- package/src/training/tensor-factory.d.ts +9 -0
- package/src/training/tensor-factory.js +13 -0
- package/src/training/trainer.js +3 -5
- package/src/training/ul_dataset.js +3 -5
- package/src/training/workloads.js +70 -79
- package/src/types/model.d.ts +5 -0
- package/src/version.js +1 -1
- package/tools/convert-safetensors-node.js +22 -16
- package/tools/doppler-cli.js +50 -26
|
@@ -28,10 +28,12 @@ import { runProbes } from '../probes.js';
|
|
|
28
28
|
import { SlidingWindowKVCache } from '../../../kv-cache.js';
|
|
29
29
|
import {
|
|
30
30
|
recordAttentionInputs,
|
|
31
|
+
shouldForceF32AttentionProjectionForRoPE,
|
|
31
32
|
resolveAttentionProjectionOutputDtype,
|
|
32
33
|
projectAttentionQKV,
|
|
33
34
|
applyAttentionQKNorm,
|
|
34
35
|
} from './projections.js';
|
|
36
|
+
import { prepareAttentionProjectionInput } from './output-projection.js';
|
|
35
37
|
|
|
36
38
|
import {
|
|
37
39
|
shouldDebugLayer,
|
|
@@ -97,9 +99,20 @@ export async function runLayerAttentionGPU(
|
|
|
97
99
|
const allowF16Attention = wantsF16Output && kvCacheDtype === 'f16';
|
|
98
100
|
let attentionInput = input;
|
|
99
101
|
let attentionInputTemp = false;
|
|
102
|
+
let normed = attentionInput;
|
|
103
|
+
let qTensor = null;
|
|
104
|
+
let qGateTensor = null;
|
|
105
|
+
let kTensor = null;
|
|
106
|
+
let vTensor = null;
|
|
107
|
+
let attnOutput = null;
|
|
108
|
+
let attnForProjection = null;
|
|
109
|
+
let output = null;
|
|
110
|
+
let finalOutput = null;
|
|
111
|
+
let oProjInputTemp = null;
|
|
100
112
|
if (wantsF16Output && !allowF16Attention) {
|
|
101
113
|
attentionInput = await castF16ToF32(input);
|
|
102
114
|
attentionInputTemp = true;
|
|
115
|
+
normed = attentionInput;
|
|
103
116
|
}
|
|
104
117
|
|
|
105
118
|
// Debug: attention input for configured layers
|
|
@@ -123,7 +136,7 @@ export async function runLayerAttentionGPU(
|
|
|
123
136
|
|
|
124
137
|
// 1. Input norm
|
|
125
138
|
|
|
126
|
-
|
|
139
|
+
try {
|
|
127
140
|
if (!skipInputNorm && layerWeights.inputNorm && getNormWeightBuffer) {
|
|
128
141
|
const normWeightBuf = getNormWeightBuffer(layerWeights.inputNorm, 'input_norm');
|
|
129
142
|
|
|
@@ -182,8 +195,16 @@ export async function runLayerAttentionGPU(
|
|
|
182
195
|
}
|
|
183
196
|
|
|
184
197
|
// 2. Q/K/V projections
|
|
185
|
-
const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype
|
|
186
|
-
|
|
198
|
+
const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
|
|
199
|
+
forceF32: shouldForceF32AttentionProjectionForRoPE({
|
|
200
|
+
attentionInputDtype: desiredOutputDtype,
|
|
201
|
+
headDim,
|
|
202
|
+
rotaryDim: config.ropeRotaryDim,
|
|
203
|
+
interleaved: config.ropeInterleaved,
|
|
204
|
+
}),
|
|
205
|
+
});
|
|
206
|
+
let usedFusedQKV = false;
|
|
207
|
+
({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
|
|
187
208
|
recorder: null,
|
|
188
209
|
normed,
|
|
189
210
|
layerWeights,
|
|
@@ -204,7 +225,7 @@ export async function runLayerAttentionGPU(
|
|
|
204
225
|
trace.attn(layerIdx, `Using fused QKV path: ${qSizeFused}+${kSizeFused}+${vSizeFused}=${totalSize}`);
|
|
205
226
|
}
|
|
206
227
|
: null,
|
|
207
|
-
});
|
|
228
|
+
}));
|
|
208
229
|
|
|
209
230
|
// Trace Q/K/V projections
|
|
210
231
|
if (kernelTrace.enabled) {
|
|
@@ -212,6 +233,27 @@ export async function runLayerAttentionGPU(
|
|
|
212
233
|
await traceStep('matmul', `L${layerIdx}.k_proj`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
|
|
213
234
|
await traceStep('matmul', `L${layerIdx}.v_proj`, layerIdx, vTensor.buffer, [numTokens, numKVHeads * headDim]);
|
|
214
235
|
}
|
|
236
|
+
await runProbes('q_proj', qTensor.buffer, {
|
|
237
|
+
layerIdx,
|
|
238
|
+
numTokens,
|
|
239
|
+
hiddenSize: numHeads * headDim,
|
|
240
|
+
probes: state.debugProbes,
|
|
241
|
+
dtype: qTensor.dtype,
|
|
242
|
+
});
|
|
243
|
+
await runProbes('k_proj', kTensor.buffer, {
|
|
244
|
+
layerIdx,
|
|
245
|
+
numTokens,
|
|
246
|
+
hiddenSize: numKVHeads * headDim,
|
|
247
|
+
probes: state.debugProbes,
|
|
248
|
+
dtype: kTensor.dtype,
|
|
249
|
+
});
|
|
250
|
+
await runProbes('v_proj', vTensor.buffer, {
|
|
251
|
+
layerIdx,
|
|
252
|
+
numTokens,
|
|
253
|
+
hiddenSize: numKVHeads * headDim,
|
|
254
|
+
probes: state.debugProbes,
|
|
255
|
+
dtype: vTensor.dtype,
|
|
256
|
+
});
|
|
215
257
|
|
|
216
258
|
// Kernel step debug: Q/K/V projections
|
|
217
259
|
if (isKernelDebugEnabled(layerIdx)) {
|
|
@@ -319,6 +361,20 @@ export async function runLayerAttentionGPU(
|
|
|
319
361
|
await traceStep('rope', `L${layerIdx}.k_rope`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
|
|
320
362
|
}
|
|
321
363
|
}
|
|
364
|
+
await runProbes('q_rope', qTensor.buffer, {
|
|
365
|
+
layerIdx,
|
|
366
|
+
numTokens,
|
|
367
|
+
hiddenSize: numHeads * headDim,
|
|
368
|
+
probes: state.debugProbes,
|
|
369
|
+
dtype: qTensor.dtype,
|
|
370
|
+
});
|
|
371
|
+
await runProbes('k_rope', kTensor.buffer, {
|
|
372
|
+
layerIdx,
|
|
373
|
+
numTokens,
|
|
374
|
+
hiddenSize: numKVHeads * headDim,
|
|
375
|
+
probes: state.debugProbes,
|
|
376
|
+
dtype: kTensor.dtype,
|
|
377
|
+
});
|
|
322
378
|
if (isKernelDebugEnabled(layerIdx)) {
|
|
323
379
|
logKernelStep('rope', { layerIdx, label: `startPos=${currentSeqLen}` });
|
|
324
380
|
await dumpTokenVector(qTensor.buffer, 'Q_rope', {
|
|
@@ -669,7 +725,7 @@ export async function runLayerAttentionGPU(
|
|
|
669
725
|
throw new Error(`Unsupported attention kernel variant "${attentionKernelVariant}" at layer ${layerIdx}`);
|
|
670
726
|
}
|
|
671
727
|
|
|
672
|
-
|
|
728
|
+
attnOutput = await runAttentionKernel();
|
|
673
729
|
|
|
674
730
|
// Trace attention output
|
|
675
731
|
if (kernelTrace.enabled) {
|
|
@@ -692,7 +748,7 @@ export async function runLayerAttentionGPU(
|
|
|
692
748
|
await debugCheckBuffer(attnOutput.buffer, `L${layerIdx} attention output (before o_proj, GPU)`, numTokens, numHeads * headDim);
|
|
693
749
|
}
|
|
694
750
|
|
|
695
|
-
|
|
751
|
+
attnForProjection = attnOutput;
|
|
696
752
|
if (qGateTensor) {
|
|
697
753
|
attnForProjection = await runSiLU(attnOutput, {
|
|
698
754
|
size: numTokens * numHeads * headDim,
|
|
@@ -706,19 +762,19 @@ export async function runLayerAttentionGPU(
|
|
|
706
762
|
|
|
707
763
|
// 6. Output projection (with optional fused residual for decode)
|
|
708
764
|
|
|
709
|
-
|
|
765
|
+
output = null;
|
|
710
766
|
let residualFused = false;
|
|
711
767
|
let oProjInput = attnForProjection;
|
|
712
|
-
|
|
768
|
+
oProjInputTemp = null;
|
|
713
769
|
if (layerWeights.oProj && getWeightBuffer) {
|
|
770
|
+
({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
|
|
771
|
+
attnForProjection,
|
|
772
|
+
matmulOutputDtype,
|
|
773
|
+
castF32ToF16
|
|
774
|
+
));
|
|
714
775
|
const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
|
|
715
776
|
const loraO = getLoRAModule(lora, layerIdx, 'o_proj');
|
|
716
777
|
|
|
717
|
-
if (matmulOutputDtype === 'f16' && attnOutput.dtype !== 'f16') {
|
|
718
|
-
oProjInput = await castF32ToF16(attnOutput);
|
|
719
|
-
oProjInputTemp = oProjInput;
|
|
720
|
-
}
|
|
721
|
-
|
|
722
778
|
// Use fused o_proj + residual for decode when possible
|
|
723
779
|
// Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
|
|
724
780
|
const oProjDtype = getWeightDtype(oProjBuf);
|
|
@@ -807,7 +863,7 @@ export async function runLayerAttentionGPU(
|
|
|
807
863
|
await debugCheckBuffer(output.buffer, `L${layerIdx} attention output (after o_proj, GPU)`, numTokens, hiddenSize);
|
|
808
864
|
}
|
|
809
865
|
|
|
810
|
-
|
|
866
|
+
finalOutput = output;
|
|
811
867
|
|
|
812
868
|
const buffersToRelease = [];
|
|
813
869
|
if (output.buffer !== attnForProjection.buffer) {
|
|
@@ -832,4 +888,46 @@ export async function runLayerAttentionGPU(
|
|
|
832
888
|
}
|
|
833
889
|
|
|
834
890
|
return { output: finalOutput, residualFused };
|
|
891
|
+
} catch (error) {
|
|
892
|
+
const released = new Set();
|
|
893
|
+
const releaseOnce = (buffer) => {
|
|
894
|
+
if (!buffer || released.has(buffer)) return;
|
|
895
|
+
released.add(buffer);
|
|
896
|
+
releaseBuffer(buffer);
|
|
897
|
+
};
|
|
898
|
+
if (finalOutput?.buffer && finalOutput.buffer !== output?.buffer) {
|
|
899
|
+
releaseOnce(finalOutput.buffer);
|
|
900
|
+
}
|
|
901
|
+
if (output?.buffer && output.buffer !== attnForProjection?.buffer) {
|
|
902
|
+
releaseOnce(output.buffer);
|
|
903
|
+
}
|
|
904
|
+
if (oProjInputTemp?.buffer) {
|
|
905
|
+
releaseOnce(oProjInputTemp.buffer);
|
|
906
|
+
}
|
|
907
|
+
if (attnForProjection?.buffer && attnForProjection.buffer !== attnOutput?.buffer) {
|
|
908
|
+
releaseOnce(attnForProjection.buffer);
|
|
909
|
+
}
|
|
910
|
+
if (attnOutput?.buffer) {
|
|
911
|
+
releaseOnce(attnOutput.buffer);
|
|
912
|
+
}
|
|
913
|
+
if (qGateTensor?.buffer) {
|
|
914
|
+
releaseOnce(qGateTensor.buffer);
|
|
915
|
+
}
|
|
916
|
+
if (qTensor?.buffer) {
|
|
917
|
+
releaseOnce(qTensor.buffer);
|
|
918
|
+
}
|
|
919
|
+
if (kTensor?.buffer) {
|
|
920
|
+
releaseOnce(kTensor.buffer);
|
|
921
|
+
}
|
|
922
|
+
if (vTensor?.buffer) {
|
|
923
|
+
releaseOnce(vTensor.buffer);
|
|
924
|
+
}
|
|
925
|
+
if (normed?.buffer && normed.buffer !== attentionInput?.buffer) {
|
|
926
|
+
releaseOnce(normed.buffer);
|
|
927
|
+
}
|
|
928
|
+
if (attentionInputTemp && attentionInput?.buffer) {
|
|
929
|
+
releaseOnce(attentionInput.buffer);
|
|
930
|
+
}
|
|
931
|
+
throw error;
|
|
932
|
+
}
|
|
835
933
|
}
|
|
@@ -134,11 +134,10 @@ function resolveIntermediateSizeForRuntime(manifest, inf, arch, modelId) {
|
|
|
134
134
|
if (inferred == null || inferred === fromArch) {
|
|
135
135
|
return fromArch;
|
|
136
136
|
}
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
137
|
+
throw new Error(
|
|
138
|
+
`Manifest "${modelId}" has intermediateSize=${fromArch}, but FFN tensors imply ${inferred}. ` +
|
|
139
|
+
'Re-convert the model so manifest architecture matches the weights.'
|
|
140
140
|
);
|
|
141
|
-
return inferred;
|
|
142
141
|
}
|
|
143
142
|
|
|
144
143
|
// =============================================================================
|
|
@@ -483,6 +482,20 @@ export function toParsedConfigFromMerged(merged, manifest) {
|
|
|
483
482
|
const queryPreAttnScalar = inf.attention.queryPreAttnScalar;
|
|
484
483
|
const causalAttention = inf.attention.causal;
|
|
485
484
|
|
|
485
|
+
// Cross-field sanity: queryPreAttnScalar should typically equal headDim.
|
|
486
|
+
// A value of sqrt(headDim) indicates a known converter bug that produces
|
|
487
|
+
// attnScale = 1/sqrt(sqrt(headDim)) instead of the correct 1/sqrt(headDim).
|
|
488
|
+
if (queryPreAttnScalar != null && headDim != null
|
|
489
|
+
&& queryPreAttnScalar !== headDim
|
|
490
|
+
&& Math.abs(queryPreAttnScalar - Math.sqrt(headDim)) < 0.01) {
|
|
491
|
+
throw new Error(
|
|
492
|
+
`Model "${merged.modelId}": queryPreAttnScalar (${queryPreAttnScalar}) ` +
|
|
493
|
+
`equals sqrt(headDim) instead of headDim (${headDim}). ` +
|
|
494
|
+
`This is a known converter bug — the manifest must be regenerated ` +
|
|
495
|
+
`with the corrected converter.`
|
|
496
|
+
);
|
|
497
|
+
}
|
|
498
|
+
|
|
486
499
|
// Get stop token IDs (cast to Manifest for compatibility)
|
|
487
500
|
const stopTokenIds = getStopTokenIds(manifest);
|
|
488
501
|
|
|
@@ -319,14 +319,8 @@ export async function embed(tokenIds, embedBuffer, config) {
|
|
|
319
319
|
const firstTokenId = tokenIdArray[0];
|
|
320
320
|
const bytesPerElement = useF16 ? 2 : 4;
|
|
321
321
|
const sampleSize = Math.min(32 * bytesPerElement, hiddenSize * bytesPerElement);
|
|
322
|
-
const
|
|
323
|
-
const
|
|
324
|
-
enc.copyBufferToBuffer(gatherOutput.buffer, 0, staging, 0, sampleSize);
|
|
325
|
-
device.queue.submit([enc.finish()]);
|
|
326
|
-
await staging.mapAsync(GPUMapMode.READ);
|
|
327
|
-
const data = decodeReadback(staging.getMappedRange().slice(0), gatherOptions.outputDtype);
|
|
328
|
-
staging.unmap();
|
|
329
|
-
staging.destroy();
|
|
322
|
+
const readback = await readBuffer(gatherOutput.buffer, sampleSize);
|
|
323
|
+
const data = decodeReadback(readback, gatherOptions.outputDtype);
|
|
330
324
|
|
|
331
325
|
// Compute statistics
|
|
332
326
|
let sum = 0, sumSq = 0;
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { log } from '../../../debug/index.js';
|
|
2
1
|
import { resolveKernelPath } from '../../../config/kernel-path-loader.js';
|
|
3
2
|
import { selectRuleValue } from '../../../rules/rule-registry.js';
|
|
4
3
|
import {
|
|
@@ -9,19 +8,36 @@ import {
|
|
|
9
8
|
export const PRIMARY_EXECUTION_PLAN_ID = 'primary';
|
|
10
9
|
export const FINITENESS_FALLBACK_EXECUTION_PLAN_ID = 'finiteness_fallback';
|
|
11
10
|
|
|
12
|
-
function
|
|
13
|
-
if (
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
11
|
+
function assertOptionalBoolean(value, label) {
|
|
12
|
+
if (value === undefined) {
|
|
13
|
+
return undefined;
|
|
14
|
+
}
|
|
15
|
+
if (typeof value !== 'boolean') {
|
|
16
|
+
throw new Error(`[ExecutionPlan] ${label} must be boolean when provided; got ${JSON.stringify(value)}.`);
|
|
17
|
+
}
|
|
18
|
+
return value;
|
|
18
19
|
}
|
|
19
20
|
|
|
20
|
-
function
|
|
21
|
-
if (value ===
|
|
22
|
-
return
|
|
21
|
+
function assertOptionalPositiveInt(value, label) {
|
|
22
|
+
if (value === undefined) {
|
|
23
|
+
return undefined;
|
|
23
24
|
}
|
|
24
|
-
|
|
25
|
+
if (!Number.isInteger(value) || value < 1) {
|
|
26
|
+
throw new Error(`[ExecutionPlan] ${label} must be a positive integer when provided; got ${JSON.stringify(value)}.`);
|
|
27
|
+
}
|
|
28
|
+
return value;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
function assertOptionalStopCheckMode(value) {
|
|
32
|
+
if (value === undefined) {
|
|
33
|
+
return undefined;
|
|
34
|
+
}
|
|
35
|
+
if (value !== 'batch' && value !== 'per-token') {
|
|
36
|
+
throw new Error(
|
|
37
|
+
`[ExecutionPlan] stopCheckMode must be "batch" or "per-token" when provided; got ${JSON.stringify(value)}.`
|
|
38
|
+
);
|
|
39
|
+
}
|
|
40
|
+
return value;
|
|
25
41
|
}
|
|
26
42
|
|
|
27
43
|
function resolveFallbackActivationDtype(primaryActivationDtype) {
|
|
@@ -42,10 +58,11 @@ function resolveFallbackActivationDtype(primaryActivationDtype) {
|
|
|
42
58
|
function resolveFallbackKernelPath(primaryKernelPath) {
|
|
43
59
|
const primaryKernelPathId = primaryKernelPath?.id ?? null;
|
|
44
60
|
if (!primaryKernelPathId) {
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
61
|
+
return {
|
|
62
|
+
kernelPath: null,
|
|
63
|
+
kernelPathId: null,
|
|
64
|
+
kernelPathSource: 'none',
|
|
65
|
+
};
|
|
49
66
|
}
|
|
50
67
|
|
|
51
68
|
const explicitFallbackKernelPathId = typeof primaryKernelPath?.finitenessFallbackKernelPathId === 'string'
|
|
@@ -244,11 +261,17 @@ export function activateFallbackExecutionPlan(container) {
|
|
|
244
261
|
|
|
245
262
|
function resolveExecutionOverrides(options = {}) {
|
|
246
263
|
return {
|
|
247
|
-
disableCommandBatching:
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
264
|
+
disableCommandBatching: assertOptionalBoolean(
|
|
265
|
+
options.disableCommandBatching,
|
|
266
|
+
'disableCommandBatching'
|
|
267
|
+
),
|
|
268
|
+
disableMultiTokenDecode: assertOptionalBoolean(
|
|
269
|
+
options.disableMultiTokenDecode,
|
|
270
|
+
'disableMultiTokenDecode'
|
|
271
|
+
),
|
|
272
|
+
batchSize: assertOptionalPositiveInt(options.batchSize, 'batchSize'),
|
|
273
|
+
stopCheckMode: assertOptionalStopCheckMode(options.stopCheckMode),
|
|
274
|
+
maxTokens: assertOptionalPositiveInt(options.maxTokens, 'maxTokens'),
|
|
252
275
|
};
|
|
253
276
|
}
|
|
254
277
|
|
|
@@ -268,9 +291,9 @@ export function resolveExecutionSessionPlan(container, options = {}) {
|
|
|
268
291
|
deferredRoundingWindowTokens: activePlan.deferredRoundingWindowTokens,
|
|
269
292
|
disableCommandBatching: overrides.disableCommandBatching ?? activePlan.defaultDisableCommandBatching,
|
|
270
293
|
disableMultiTokenDecode: overrides.disableMultiTokenDecode ?? activePlan.defaultDisableMultiTokenDecode,
|
|
271
|
-
batchSize:
|
|
272
|
-
stopCheckMode:
|
|
273
|
-
maxTokens:
|
|
294
|
+
batchSize: overrides.batchSize ?? activePlan.defaultBatchSize,
|
|
295
|
+
stopCheckMode: overrides.stopCheckMode ?? activePlan.defaultStopCheckMode,
|
|
296
|
+
maxTokens: overrides.maxTokens ?? activePlan.defaultMaxTokens,
|
|
274
297
|
readbackInterval: activePlan.readbackInterval,
|
|
275
298
|
ringTokens: activePlan.ringTokens,
|
|
276
299
|
ringStop: activePlan.ringStop,
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
export declare function cloneJson<T>(value: T): T;
|
|
2
|
+
export declare function validateManifestSessionDefaultsContract(manifestInference: Record<string, unknown> | null): void;
|
|
3
|
+
export declare function isPhaseMatch(phase: string, targetPhase: string): boolean;
|
|
4
|
+
export declare function stepHasLayer(step: Record<string, unknown>, layerIdx: number): boolean;
|
|
5
|
+
export declare function normalizePhase(value: unknown, label: string): string;
|
|
6
|
+
export declare function normalizeSection(value: unknown, label: string): string;
|
|
7
|
+
export declare function normalizeSlot(value: unknown, label: string): string;
|
|
8
|
+
export declare function createSourceTrace(): { session: Record<string, unknown>; steps: Record<string, unknown> };
|
|
9
|
+
export declare function setSourceTrace(trace: Record<string, unknown>, path: string, source: string): void;
|
|
10
|
+
export declare function collectLeafPaths(value: unknown, prefix?: string[], out?: string[][]): string[][];
|
|
11
|
+
export declare function hasDefinedPath(root: unknown, pathSegments: string[]): boolean;
|
|
12
|
+
export declare function validateStepShape(step: Record<string, unknown>, index: number): void;
|
|
13
|
+
export declare function assertExecutionRuntimeOverlay(runtimeInference: Record<string, unknown> | null | undefined): void;
|
|
14
|
+
export declare function validateUniqueStepIds(steps: Array<Record<string, unknown>>): void;
|
|
15
|
+
export declare function hasExecutionV0(manifestInference: Record<string, unknown> | null | undefined): boolean;
|
|
16
|
+
export declare function assertExecutionV0Schema(manifestInference: Record<string, unknown> | null | undefined): void;
|
|
17
|
+
export declare function applyExecutionPatchAtomic(
|
|
18
|
+
baseSteps: Array<Record<string, unknown>>,
|
|
19
|
+
patch: Record<string, unknown> | null | undefined
|
|
20
|
+
): Array<Record<string, unknown>>;
|
|
21
|
+
export declare function indexRuntimePatchMeta(
|
|
22
|
+
patch: Record<string, unknown> | null | undefined
|
|
23
|
+
): {
|
|
24
|
+
addedSteps: Set<string>;
|
|
25
|
+
precisionFieldsByStep: Map<string, Set<string>>;
|
|
26
|
+
kvIOFieldsByStep: Set<string>;
|
|
27
|
+
};
|
|
28
|
+
export declare function requireSessionActivationDtype(
|
|
29
|
+
sessionDefaults: Record<string, unknown> | null | undefined,
|
|
30
|
+
label?: string
|
|
31
|
+
): string;
|
|
32
|
+
export declare function createInitialSlotDtypes(sessionDefaults: Record<string, unknown>): Map<string, string>;
|
|
33
|
+
export declare function resolvePhaseSteps(
|
|
34
|
+
phase: string,
|
|
35
|
+
steps: Array<Record<string, unknown>>,
|
|
36
|
+
sessionDefaults: Record<string, unknown>,
|
|
37
|
+
profileIndex: Map<string, unknown>,
|
|
38
|
+
policies: Record<string, unknown>,
|
|
39
|
+
options?: Record<string, unknown>
|
|
40
|
+
): {
|
|
41
|
+
steps: Array<Record<string, unknown>>;
|
|
42
|
+
finalSlotDtypes: Map<string, string>;
|
|
43
|
+
};
|
|
44
|
+
export declare function normalizeRuntimeSessionForExecutionV0(
|
|
45
|
+
runtimeSession: Record<string, unknown> | null | undefined,
|
|
46
|
+
manifestInference: Record<string, unknown> | null | undefined,
|
|
47
|
+
defaultComputeDefaults: Record<string, unknown>
|
|
48
|
+
): Record<string, unknown> | null | undefined;
|
|
49
|
+
export declare function validatePhaseBoundaryCompatibility(options: Record<string, unknown>): void;
|
|
50
|
+
export declare function assertKVLayoutExecutionCompatibility(
|
|
51
|
+
steps: Array<Record<string, unknown>>,
|
|
52
|
+
sessionDefaults: Record<string, unknown>
|
|
53
|
+
): void;
|
|
54
|
+
export declare const buildKernelProfileKey: (
|
|
55
|
+
kernelRef: Record<string, unknown> | null | undefined,
|
|
56
|
+
step?: Record<string, unknown> | null | undefined
|
|
57
|
+
) => string;
|
|
58
|
+
export declare const indexKernelProfiles: (sessionDefaults: Record<string, unknown>) => Map<string, unknown>;
|
|
59
|
+
export declare const normalizeDtype: (value: unknown, label: string) => string;
|