@simulatte/doppler 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +126 -0
- package/README.md +25 -17
- package/package.json +20 -4
- package/src/adapters/adapter-registry.js +12 -1
- package/src/adapters/lora-loader.js +23 -6
- package/src/bridge/extension-client.d.ts +5 -0
- package/src/bridge/extension-client.js +40 -0
- package/src/bridge/index.d.ts +2 -1
- package/src/bridge/index.js +6 -4
- package/src/browser/browser-converter.js +26 -1
- package/src/browser/file-picker.js +6 -0
- package/src/browser/safetensors-parser-browser.js +84 -1
- package/src/browser/shard-io-browser.js +2 -2
- package/src/browser/tensor-source-download.js +8 -2
- package/src/browser/tensor-source-http.d.ts +1 -0
- package/src/browser/tensor-source-http.js +5 -1
- package/src/client/doppler-api.browser.js +20 -4
- package/src/client/doppler-api.js +19 -3
- package/src/client/doppler-provider/generation.js +12 -0
- package/src/client/doppler-provider/model-manager.d.ts +10 -0
- package/src/client/doppler-provider/model-manager.js +91 -19
- package/src/client/doppler-provider/source-runtime.d.ts +2 -1
- package/src/client/doppler-provider/source-runtime.js +132 -13
- package/src/client/doppler-registry.json +8 -7
- package/src/config/backward-registry-loader.js +17 -2
- package/src/config/execution-v0-contract-check.js +113 -15
- package/src/config/kernel-path-contract-check.js +57 -29
- package/src/config/kernel-path-loader.js +5 -36
- package/src/config/kernels/kernel-ref-digests.js +39 -39
- package/src/config/kernels/registry.js +14 -1
- package/src/config/kernels/registry.json +49 -7
- package/src/config/loader.d.ts +1 -1
- package/src/config/loader.js +43 -4
- package/src/config/merge-contract-check.js +59 -4
- package/src/config/merge-helpers.js +128 -7
- package/src/config/merge.d.ts +1 -0
- package/src/config/merge.js +28 -0
- package/src/config/param-validator.js +47 -2
- package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
- package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/registry.json +29 -8
- package/src/config/presets/models/gemma2.json +2 -2
- package/src/config/presets/models/qwen3.json +9 -2
- package/src/config/presets/models/transformer.json +5 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
- package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
- package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
- package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
- package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
- package/src/config/required-inference-fields-contract-check.js +6 -0
- package/src/config/runtime.js +6 -1
- package/src/config/schema/debug.schema.d.ts +5 -0
- package/src/config/schema/doppler.schema.js +16 -21
- package/src/config/schema/inference-defaults.schema.js +6 -3
- package/src/config/schema/inference.schema.d.ts +9 -0
- package/src/config/schema/kernel-path.schema.d.ts +11 -1
- package/src/config/schema/kernel-thresholds.schema.js +12 -4
- package/src/config/schema/manifest.schema.d.ts +8 -1
- package/src/config/schema/manifest.schema.js +19 -3
- package/src/config/training-defaults.js +30 -22
- package/src/converter/conversion-plan.js +94 -9
- package/src/converter/core.d.ts +7 -0
- package/src/converter/core.js +14 -9
- package/src/converter/execution-v0-manifest.js +4 -1
- package/src/converter/index.d.ts +1 -0
- package/src/converter/index.js +1 -0
- package/src/converter/manifest-inference.js +43 -12
- package/src/converter/parsers/diffusion.js +0 -3
- package/src/converter/quantization-info.js +35 -15
- package/src/converter/rope-config.js +42 -0
- package/src/converter/shard-packer.d.ts +1 -1
- package/src/converter/shard-packer.js +4 -1
- package/src/debug/config.js +123 -11
- package/src/debug/signals.js +7 -1
- package/src/debug/tensor.d.ts +2 -0
- package/src/debug/tensor.js +13 -2
- package/src/distribution/p2p-control-plane.js +52 -12
- package/src/distribution/p2p-observability.js +43 -7
- package/src/distribution/p2p-webrtc-browser.js +20 -0
- package/src/distribution/shard-delivery.js +77 -26
- package/src/formats/gguf/types.js +33 -16
- package/src/formats/rdrr/groups.d.ts +12 -4
- package/src/formats/rdrr/groups.js +3 -6
- package/src/formats/rdrr/parsing.js +39 -2
- package/src/formats/rdrr/types.d.ts +2 -1
- package/src/gpu/command-recorder.js +86 -61
- package/src/gpu/device.d.ts +1 -0
- package/src/gpu/device.js +131 -19
- package/src/gpu/kernel-tuner/benchmarks.js +326 -316
- package/src/gpu/kernel-tuner/cache.js +71 -4
- package/src/gpu/kernel-tuner/tuner.js +22 -4
- package/src/gpu/kernels/attention.js +113 -34
- package/src/gpu/kernels/backward/adam.js +62 -58
- package/src/gpu/kernels/backward/attention_backward.js +257 -169
- package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
- package/src/gpu/kernels/bias_add.wgsl +8 -6
- package/src/gpu/kernels/bias_add_f16.wgsl +8 -5
- package/src/gpu/kernels/cast.js +191 -149
- package/src/gpu/kernels/check-stop.js +33 -44
- package/src/gpu/kernels/conv2d.js +27 -17
- package/src/gpu/kernels/conv2d.wgsl +7 -8
- package/src/gpu/kernels/conv2d_f16.wgsl +7 -8
- package/src/gpu/kernels/cross_entropy_loss.js +21 -15
- package/src/gpu/kernels/depthwise_conv2d.js +37 -26
- package/src/gpu/kernels/depthwise_conv2d.wgsl +6 -9
- package/src/gpu/kernels/depthwise_conv2d_f16.wgsl +6 -9
- package/src/gpu/kernels/dequant.js +178 -126
- package/src/gpu/kernels/energy.d.ts +3 -21
- package/src/gpu/kernels/energy.js +111 -88
- package/src/gpu/kernels/feature-check.js +1 -1
- package/src/gpu/kernels/fused_ffn.js +84 -65
- package/src/gpu/kernels/fused_matmul_residual.js +56 -33
- package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
- package/src/gpu/kernels/gather.js +33 -15
- package/src/gpu/kernels/gelu.js +19 -11
- package/src/gpu/kernels/grouped_pointwise_conv2d.js +34 -23
- package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl +6 -9
- package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl +6 -9
- package/src/gpu/kernels/groupnorm.js +34 -23
- package/src/gpu/kernels/kv-quantize.js +5 -2
- package/src/gpu/kernels/layernorm.js +35 -19
- package/src/gpu/kernels/logit-merge.js +5 -3
- package/src/gpu/kernels/matmul.js +83 -39
- package/src/gpu/kernels/modulate.js +23 -15
- package/src/gpu/kernels/moe.js +221 -175
- package/src/gpu/kernels/pixel_shuffle.js +22 -14
- package/src/gpu/kernels/pixel_shuffle.wgsl +4 -5
- package/src/gpu/kernels/pixel_shuffle_f16.wgsl +4 -5
- package/src/gpu/kernels/relu.js +31 -10
- package/src/gpu/kernels/relu.wgsl +2 -1
- package/src/gpu/kernels/relu_f16.wgsl +2 -1
- package/src/gpu/kernels/repeat_channels.js +25 -17
- package/src/gpu/kernels/repeat_channels.wgsl +4 -5
- package/src/gpu/kernels/repeat_channels_f16.wgsl +4 -5
- package/src/gpu/kernels/residual.js +69 -23
- package/src/gpu/kernels/residual.wgsl +6 -3
- package/src/gpu/kernels/residual_f16.wgsl +2 -1
- package/src/gpu/kernels/residual_f16_vec4.wgsl +2 -1
- package/src/gpu/kernels/residual_vec4.wgsl +2 -1
- package/src/gpu/kernels/rmsnorm.js +96 -28
- package/src/gpu/kernels/rmsnorm.wgsl +14 -6
- package/src/gpu/kernels/rmsnorm_f16.wgsl +10 -2
- package/src/gpu/kernels/rope.d.ts +2 -0
- package/src/gpu/kernels/rope.js +14 -1
- package/src/gpu/kernels/rope.wgsl +56 -40
- package/src/gpu/kernels/sample.js +27 -38
- package/src/gpu/kernels/sana_linear_attention.js +19 -12
- package/src/gpu/kernels/sana_linear_attention_apply.wgsl +4 -5
- package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl +4 -5
- package/src/gpu/kernels/sana_linear_attention_summary.wgsl +4 -0
- package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl +4 -0
- package/src/gpu/kernels/scale.js +18 -11
- package/src/gpu/kernels/shader-cache.js +4 -2
- package/src/gpu/kernels/silu.d.ts +1 -0
- package/src/gpu/kernels/silu.js +148 -82
- package/src/gpu/kernels/silu.wgsl +19 -9
- package/src/gpu/kernels/silu_f16.wgsl +19 -9
- package/src/gpu/kernels/softmax.js +44 -25
- package/src/gpu/kernels/split_qkv.js +23 -13
- package/src/gpu/kernels/transpose.js +31 -10
- package/src/gpu/kernels/transpose.wgsl +6 -5
- package/src/gpu/kernels/upsample2d.js +22 -13
- package/src/gpu/kernels/upsample2d.wgsl +6 -9
- package/src/gpu/kernels/upsample2d_f16.wgsl +6 -9
- package/src/gpu/kernels/utils.js +35 -13
- package/src/gpu/partitioned-buffer-pool.js +10 -2
- package/src/gpu/perf-guards.js +2 -9
- package/src/gpu/profiler.js +27 -22
- package/src/gpu/readback-utils.d.ts +16 -0
- package/src/gpu/readback-utils.js +41 -0
- package/src/gpu/submit-tracker.js +13 -0
- package/src/gpu/uniform-cache.d.ts +1 -0
- package/src/gpu/uniform-cache.js +30 -9
- package/src/hotswap/intent-bundle.js +6 -0
- package/src/hotswap/manifest.d.ts +10 -1
- package/src/hotswap/manifest.js +12 -2
- package/src/hotswap/runtime.js +30 -8
- package/src/index-browser.d.ts +44 -0
- package/src/index-browser.js +14 -0
- package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
- package/src/inference/browser-harness-contract-helpers.js +28 -0
- package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
- package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
- package/src/inference/browser-harness-model-helpers.d.ts +16 -0
- package/src/inference/browser-harness-model-helpers.js +217 -0
- package/src/inference/browser-harness-report-helpers.d.ts +7 -0
- package/src/inference/browser-harness-report-helpers.js +42 -0
- package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
- package/src/inference/browser-harness-runtime-helpers.js +415 -0
- package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
- package/src/inference/browser-harness-suite-helpers.js +268 -0
- package/src/inference/browser-harness-text-helpers.d.ts +27 -0
- package/src/inference/browser-harness-text-helpers.js +788 -0
- package/src/inference/browser-harness.d.ts +6 -0
- package/src/inference/browser-harness.js +130 -1950
- package/src/inference/kv-cache/base.js +140 -94
- package/src/inference/kv-cache/tiered.js +5 -3
- package/src/inference/moe-router.js +88 -56
- package/src/inference/multi-model-network.js +5 -3
- package/src/inference/network-evolution.d.ts +11 -2
- package/src/inference/network-evolution.js +20 -21
- package/src/inference/pipelines/context.d.ts +3 -0
- package/src/inference/pipelines/context.js +142 -2
- package/src/inference/pipelines/diffusion/helpers.js +7 -2
- package/src/inference/pipelines/diffusion/pipeline.js +17 -7
- package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
- package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +5 -0
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +27 -15
- package/src/inference/pipelines/diffusion/vae.js +3 -7
- package/src/inference/pipelines/energy/pipeline.js +27 -21
- package/src/inference/pipelines/energy/quintel.d.ts +5 -0
- package/src/inference/pipelines/energy/quintel.js +11 -0
- package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
- package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
- package/src/inference/pipelines/text/attention/projections.js +151 -101
- package/src/inference/pipelines/text/attention/record.js +73 -10
- package/src/inference/pipelines/text/attention/run.js +73 -10
- package/src/inference/pipelines/text/chat-format.js +25 -1
- package/src/inference/pipelines/text/config.d.ts +4 -0
- package/src/inference/pipelines/text/config.js +71 -5
- package/src/inference/pipelines/text/embed.js +2 -8
- package/src/inference/pipelines/text/execution-plan.js +64 -50
- package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
- package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
- package/src/inference/pipelines/text/execution-v0.js +78 -1002
- package/src/inference/pipelines/text/ffn/standard.js +3 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +46 -0
- package/src/inference/pipelines/text/generator-steps.js +298 -207
- package/src/inference/pipelines/text/generator.js +6 -23
- package/src/inference/pipelines/text/init.d.ts +4 -0
- package/src/inference/pipelines/text/init.js +134 -29
- package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
- package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
- package/src/inference/pipelines/text/kernel-trace.js +6 -0
- package/src/inference/pipelines/text/layer.js +14 -9
- package/src/inference/pipelines/text/linear-attention.d.ts +10 -0
- package/src/inference/pipelines/text/linear-attention.js +80 -6
- package/src/inference/pipelines/text/logits/gpu.js +10 -5
- package/src/inference/pipelines/text/logits/index.js +10 -11
- package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
- package/src/inference/pipelines/text/logits/utils.js +9 -0
- package/src/inference/pipelines/text/lora-apply.js +50 -32
- package/src/inference/pipelines/text/model-load.js +279 -104
- package/src/inference/pipelines/text/moe-cache.js +5 -4
- package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
- package/src/inference/pipelines/text/moe-cpu.js +42 -38
- package/src/inference/pipelines/text/moe-gpu.js +110 -86
- package/src/inference/pipelines/text/ops.js +90 -90
- package/src/inference/pipelines/text/probes.js +9 -9
- package/src/inference/pipelines/text/weights.js +17 -7
- package/src/inference/pipelines/text.js +17 -1
- package/src/inference/speculative.d.ts +2 -2
- package/src/inference/speculative.js +4 -18
- package/src/inference/test-harness.d.ts +1 -1
- package/src/inference/test-harness.js +15 -5
- package/src/inference/tokenizer.d.ts +0 -5
- package/src/inference/tokenizer.js +4 -23
- package/src/inference/tokenizers/bpe.js +9 -0
- package/src/inference/tokenizers/bundled.js +176 -33
- package/src/inference/tokenizers/sentencepiece.js +12 -0
- package/src/loader/doppler-loader.js +38 -22
- package/src/loader/dtype-utils.js +3 -44
- package/src/loader/embedding-loader.js +7 -3
- package/src/loader/experts/expert-cache.js +13 -6
- package/src/loader/experts/expert-loader.js +10 -6
- package/src/loader/final-weights-loader.js +8 -4
- package/src/loader/layer-loader.js +2 -1
- package/src/loader/loader-state.js +2 -2
- package/src/loader/memory-monitor.js +8 -0
- package/src/loader/multi-model-loader.d.ts +14 -0
- package/src/loader/multi-model-loader.js +70 -24
- package/src/loader/shard-cache.js +81 -12
- package/src/loader/shard-resolver.js +25 -3
- package/src/loader/tensors/tensor-loader.js +209 -144
- package/src/loader/tensors/tensor-reader.js +76 -19
- package/src/loader/weight-downcast.js +1 -1
- package/src/memory/buffer-pool.d.ts +9 -1
- package/src/memory/buffer-pool.js +109 -44
- package/src/memory/unified-detect.js +1 -1
- package/src/rules/inference/kernel-path.rules.json +24 -8
- package/src/rules/rule-registry.js +25 -1
- package/src/rules/tooling/command-runtime.rules.json +18 -0
- package/src/storage/backends/opfs-store.js +68 -24
- package/src/storage/downloader.js +364 -83
- package/src/storage/index.d.ts +3 -0
- package/src/storage/index.js +3 -0
- package/src/storage/preflight.d.ts +2 -2
- package/src/storage/preflight.js +24 -2
- package/src/storage/quickstart-downloader.js +11 -5
- package/src/storage/registry.js +10 -4
- package/src/storage/reports.js +1 -1
- package/src/storage/shard-manager.d.ts +15 -1
- package/src/storage/shard-manager.js +51 -3
- package/src/storage/source-artifact-store.d.ts +52 -0
- package/src/storage/source-artifact-store.js +234 -0
- package/src/tooling/command-api-constants.d.ts +9 -0
- package/src/tooling/command-api-constants.js +9 -0
- package/src/tooling/command-api-family-normalizers.d.ts +9 -0
- package/src/tooling/command-api-family-normalizers.js +343 -0
- package/src/tooling/command-api-helpers.d.ts +25 -0
- package/src/tooling/command-api-helpers.js +262 -0
- package/src/tooling/command-api.d.ts +27 -1
- package/src/tooling/command-api.js +26 -473
- package/src/tooling/command-envelope.js +4 -1
- package/src/tooling/command-runner-shared.js +52 -18
- package/src/tooling/lean-execution-contract.js +150 -3
- package/src/tooling/node-browser-command-runner.d.ts +4 -0
- package/src/tooling/node-browser-command-runner.js +218 -273
- package/src/tooling/node-command-runner.js +44 -3
- package/src/tooling/node-converter.js +27 -1
- package/src/tooling/node-source-runtime.d.ts +1 -1
- package/src/tooling/node-source-runtime.js +84 -3
- package/src/tooling/node-webgpu.js +30 -105
- package/src/tooling/opfs-cache.js +21 -4
- package/src/tooling/runtime-input-composition.d.ts +38 -0
- package/src/tooling/runtime-input-composition.js +86 -0
- package/src/tooling/source-runtime-bundle.d.ts +40 -5
- package/src/tooling/source-runtime-bundle.js +261 -34
- package/src/tooling/source-runtime-materializer.d.ts +6 -0
- package/src/tooling/source-runtime-materializer.js +93 -0
- package/src/training/attention-backward.js +32 -17
- package/src/training/autograd.js +80 -52
- package/src/training/checkpoint-watch.d.ts +8 -0
- package/src/training/checkpoint-watch.js +139 -0
- package/src/training/checkpoint.d.ts +6 -1
- package/src/training/checkpoint.js +46 -7
- package/src/training/clip.js +2 -1
- package/src/training/datasets/token-batch.js +20 -8
- package/src/training/distillation/artifacts.d.ts +71 -0
- package/src/training/distillation/artifacts.js +132 -0
- package/src/training/distillation/checkpoint-watch.d.ts +10 -0
- package/src/training/distillation/checkpoint-watch.js +58 -0
- package/src/training/distillation/dataset.d.ts +59 -0
- package/src/training/distillation/dataset.js +337 -0
- package/src/training/distillation/eval.d.ts +34 -0
- package/src/training/distillation/eval.js +310 -0
- package/src/training/distillation/index.d.ts +29 -0
- package/src/training/distillation/index.js +29 -0
- package/src/training/distillation/runtime.d.ts +20 -0
- package/src/training/distillation/runtime.js +121 -0
- package/src/training/distillation/scoreboard.d.ts +6 -0
- package/src/training/distillation/scoreboard.js +8 -0
- package/src/training/distillation/stage-a.d.ts +45 -0
- package/src/training/distillation/stage-a.js +338 -0
- package/src/training/distillation/stage-b.d.ts +24 -0
- package/src/training/distillation/stage-b.js +20 -0
- package/src/training/distillation/student-fixture.d.ts +22 -0
- package/src/training/distillation/student-fixture.js +846 -0
- package/src/training/distillation/suite-data.d.ts +45 -0
- package/src/training/distillation/suite-data.js +189 -0
- package/src/training/index.d.ts +10 -0
- package/src/training/index.js +10 -0
- package/src/training/lora-pipeline.d.ts +40 -0
- package/src/training/lora-pipeline.js +793 -0
- package/src/training/lora.js +26 -12
- package/src/training/loss.js +5 -6
- package/src/training/objectives/cross_entropy.js +2 -5
- package/src/training/objectives/distill_kd.js +4 -8
- package/src/training/objectives/distill_triplet.js +4 -8
- package/src/training/objectives/ul_stage2_base.js +4 -8
- package/src/training/operator-artifacts.d.ts +62 -0
- package/src/training/operator-artifacts.js +140 -0
- package/src/training/operator-command.d.ts +5 -0
- package/src/training/operator-command.js +455 -0
- package/src/training/operator-eval.d.ts +48 -0
- package/src/training/operator-eval.js +230 -0
- package/src/training/operator-scoreboard.d.ts +5 -0
- package/src/training/operator-scoreboard.js +44 -0
- package/src/training/optimizer.js +19 -7
- package/src/training/runner.d.ts +52 -0
- package/src/training/runner.js +31 -5
- package/src/training/suite.d.ts +112 -0
- package/src/training/suite.js +24 -984
- package/src/training/tensor-factory.d.ts +9 -0
- package/src/training/tensor-factory.js +13 -0
- package/src/training/trainer.js +3 -5
- package/src/training/ul_dataset.js +3 -5
- package/src/training/workloads.d.ts +164 -0
- package/src/training/workloads.js +530 -0
- package/src/version.js +1 -1
- package/tools/convert-safetensors-node.js +22 -16
- package/tools/doppler-cli.js +179 -63
|
@@ -15,10 +15,14 @@ import { KERNEL_CONFIGS } from '../../../gpu/kernels/kernel-configs.js';
|
|
|
15
15
|
import { resolveCapabilityKernelPathRef, resolveKernelPathPolicy } from './kernel-path-auto-select.js';
|
|
16
16
|
import { initTokenizer } from './init.js';
|
|
17
17
|
import { selectRuleValue } from '../../../rules/rule-registry.js';
|
|
18
|
+
import { mergeRuntimeValues } from '../../../config/runtime-merge.js';
|
|
18
19
|
import {
|
|
19
20
|
DEFAULT_BATCHING_DEFAULTS,
|
|
21
|
+
DEFAULT_COMPUTE_DEFAULTS,
|
|
20
22
|
DEFAULT_GENERATION_CONFIG,
|
|
21
23
|
} from '../../../config/schema/inference-defaults.schema.js';
|
|
24
|
+
import { DEFAULT_KVCACHE_CONFIG } from '../../../config/schema/kvcache.schema.js';
|
|
25
|
+
import { DEFAULT_EXECUTION_V0_SESSION_DEFAULTS } from '../../../config/schema/execution-v0.schema.js';
|
|
22
26
|
|
|
23
27
|
function validateKernelWarmupMode(mode) {
|
|
24
28
|
if (mode !== 'parallel' && mode !== 'sequential') {
|
|
@@ -48,23 +52,97 @@ function normalizeBoolean(value) {
|
|
|
48
52
|
return typeof value === 'boolean' ? value : null;
|
|
49
53
|
}
|
|
50
54
|
|
|
55
|
+
function parseManifestDecodeLoopOptionalPositiveInt(value, label, modelId) {
|
|
56
|
+
if (value === undefined) {
|
|
57
|
+
return undefined;
|
|
58
|
+
}
|
|
59
|
+
if (value === null) {
|
|
60
|
+
return null;
|
|
61
|
+
}
|
|
62
|
+
const normalized = normalizePositiveInt(value);
|
|
63
|
+
if (normalized == null) {
|
|
64
|
+
throw new Error(
|
|
65
|
+
`Manifest "${modelId}" inference.sessionDefaults.decodeLoop.${label} must be a positive integer or null.`
|
|
66
|
+
);
|
|
67
|
+
}
|
|
68
|
+
return normalized;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
function parseManifestDecodeLoopOptionalBoolean(value, label, modelId) {
|
|
72
|
+
if (value === undefined) {
|
|
73
|
+
return undefined;
|
|
74
|
+
}
|
|
75
|
+
if (typeof value !== 'boolean') {
|
|
76
|
+
throw new Error(
|
|
77
|
+
`Manifest "${modelId}" inference.sessionDefaults.decodeLoop.${label} must be a boolean when provided.`
|
|
78
|
+
);
|
|
79
|
+
}
|
|
80
|
+
return value;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
function requireGlobalBatchingDefault(value, label) {
|
|
84
|
+
const normalized = normalizePositiveInt(value);
|
|
85
|
+
if (normalized == null) {
|
|
86
|
+
throw new Error(`${label} must be a positive integer.`);
|
|
87
|
+
}
|
|
88
|
+
return normalized;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
function requireGlobalStopCheckMode(value, label) {
|
|
92
|
+
const normalized = normalizeStopCheckMode(value);
|
|
93
|
+
if (normalized == null) {
|
|
94
|
+
throw new Error(`${label} must be "batch" or "per-token".`);
|
|
95
|
+
}
|
|
96
|
+
return normalized;
|
|
97
|
+
}
|
|
98
|
+
|
|
51
99
|
const GLOBAL_DEFAULT_BATCHING = Object.freeze({
|
|
52
|
-
batchSize:
|
|
53
|
-
|
|
54
|
-
|
|
100
|
+
batchSize: requireGlobalBatchingDefault(
|
|
101
|
+
DEFAULT_BATCHING_DEFAULTS.batchSize,
|
|
102
|
+
'DEFAULT_BATCHING_DEFAULTS.batchSize'
|
|
103
|
+
),
|
|
104
|
+
stopCheckMode: requireGlobalStopCheckMode(
|
|
105
|
+
DEFAULT_BATCHING_DEFAULTS.stopCheckMode,
|
|
106
|
+
'DEFAULT_BATCHING_DEFAULTS.stopCheckMode'
|
|
107
|
+
),
|
|
108
|
+
readbackInterval: requireGlobalBatchingDefault(
|
|
109
|
+
DEFAULT_BATCHING_DEFAULTS.readbackInterval,
|
|
110
|
+
'DEFAULT_BATCHING_DEFAULTS.readbackInterval'
|
|
111
|
+
),
|
|
112
|
+
ringTokens: requireGlobalBatchingDefault(
|
|
113
|
+
DEFAULT_BATCHING_DEFAULTS.ringTokens,
|
|
114
|
+
'DEFAULT_BATCHING_DEFAULTS.ringTokens'
|
|
115
|
+
),
|
|
116
|
+
ringStop: requireGlobalBatchingDefault(
|
|
117
|
+
DEFAULT_BATCHING_DEFAULTS.ringStop,
|
|
118
|
+
'DEFAULT_BATCHING_DEFAULTS.ringStop'
|
|
119
|
+
),
|
|
120
|
+
ringStaging: requireGlobalBatchingDefault(
|
|
121
|
+
DEFAULT_BATCHING_DEFAULTS.ringStaging,
|
|
122
|
+
'DEFAULT_BATCHING_DEFAULTS.ringStaging'
|
|
123
|
+
),
|
|
55
124
|
});
|
|
56
125
|
|
|
57
126
|
const GLOBAL_DEFAULT_GENERATION = Object.freeze({
|
|
58
127
|
disableCommandBatching: DEFAULT_GENERATION_CONFIG.disableCommandBatching === true,
|
|
59
128
|
});
|
|
60
129
|
|
|
130
|
+
const GLOBAL_DEFAULT_KERNEL_PATH_DTYPES = Object.freeze({
|
|
131
|
+
activationDtype: DEFAULT_COMPUTE_DEFAULTS.activationDtype,
|
|
132
|
+
kvDtype: DEFAULT_KVCACHE_CONFIG.kvDtype,
|
|
133
|
+
outputDtype: DEFAULT_EXECUTION_V0_SESSION_DEFAULTS.compute.defaults.outputDtype,
|
|
134
|
+
});
|
|
135
|
+
|
|
61
136
|
function isRuntimeBatchingAtGlobalDefaults(batching) {
|
|
62
137
|
if (!batching || typeof batching !== 'object') {
|
|
63
138
|
return false;
|
|
64
139
|
}
|
|
65
140
|
return normalizePositiveInt(batching.batchSize) === GLOBAL_DEFAULT_BATCHING.batchSize
|
|
66
141
|
&& normalizeStopCheckMode(batching.stopCheckMode) === GLOBAL_DEFAULT_BATCHING.stopCheckMode
|
|
67
|
-
&& normalizeReadbackInterval(batching.readbackInterval) === GLOBAL_DEFAULT_BATCHING.readbackInterval
|
|
142
|
+
&& normalizeReadbackInterval(batching.readbackInterval) === GLOBAL_DEFAULT_BATCHING.readbackInterval
|
|
143
|
+
&& normalizeReadbackInterval(batching.ringTokens) === GLOBAL_DEFAULT_BATCHING.ringTokens
|
|
144
|
+
&& normalizeReadbackInterval(batching.ringStop) === GLOBAL_DEFAULT_BATCHING.ringStop
|
|
145
|
+
&& normalizeReadbackInterval(batching.ringStaging) === GLOBAL_DEFAULT_BATCHING.ringStaging;
|
|
68
146
|
}
|
|
69
147
|
|
|
70
148
|
function isRuntimeGenerationAtGlobalDefaults(generation) {
|
|
@@ -74,98 +152,127 @@ function isRuntimeGenerationAtGlobalDefaults(generation) {
|
|
|
74
152
|
return (generation.disableCommandBatching === true) === GLOBAL_DEFAULT_GENERATION.disableCommandBatching;
|
|
75
153
|
}
|
|
76
154
|
|
|
77
|
-
function
|
|
78
|
-
const
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
155
|
+
function requireManifestDecodeLoopPositiveInt(value, label, modelId) {
|
|
156
|
+
const normalized = normalizePositiveInt(value);
|
|
157
|
+
if (normalized == null) {
|
|
158
|
+
throw new Error(`Manifest "${modelId}" inference.sessionDefaults.decodeLoop.${label} must be a positive integer.`);
|
|
159
|
+
}
|
|
160
|
+
return normalized;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
function requireManifestDecodeLoopStopCheckMode(value, modelId) {
|
|
164
|
+
const normalized = normalizeStopCheckMode(value);
|
|
165
|
+
if (normalized == null) {
|
|
166
|
+
throw new Error(
|
|
167
|
+
`Manifest "${modelId}" inference.sessionDefaults.decodeLoop.stopCheckMode must be "batch" or "per-token".`
|
|
168
|
+
);
|
|
169
|
+
}
|
|
170
|
+
return normalized;
|
|
87
171
|
}
|
|
88
172
|
|
|
89
|
-
function
|
|
173
|
+
function buildManifestDecodeLoopRuntimePatch(manifest) {
|
|
90
174
|
const decodeLoop = manifest?.inference?.sessionDefaults?.decodeLoop;
|
|
91
|
-
if (
|
|
175
|
+
if (decodeLoop == null) {
|
|
92
176
|
return null;
|
|
93
177
|
}
|
|
94
|
-
const
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
return null;
|
|
178
|
+
const modelId = String(manifest?.modelId ?? 'unknown').trim() || 'unknown';
|
|
179
|
+
if (typeof decodeLoop !== 'object') {
|
|
180
|
+
throw new Error(
|
|
181
|
+
`Manifest "${modelId}" inference.sessionDefaults.decodeLoop must be an object when provided.`
|
|
182
|
+
);
|
|
100
183
|
}
|
|
101
|
-
|
|
184
|
+
const batchSize = requireManifestDecodeLoopPositiveInt(decodeLoop.batchSize, 'batchSize', modelId);
|
|
185
|
+
const stopCheckMode = requireManifestDecodeLoopStopCheckMode(decodeLoop.stopCheckMode, modelId);
|
|
186
|
+
const readbackInterval = requireManifestDecodeLoopPositiveInt(
|
|
187
|
+
decodeLoop.readbackInterval,
|
|
188
|
+
'readbackInterval',
|
|
189
|
+
modelId
|
|
190
|
+
);
|
|
191
|
+
const disableCommandBatching = parseManifestDecodeLoopOptionalBoolean(
|
|
192
|
+
decodeLoop.disableCommandBatching,
|
|
193
|
+
'disableCommandBatching',
|
|
194
|
+
modelId
|
|
195
|
+
);
|
|
196
|
+
|
|
197
|
+
const batchingPatch = {
|
|
102
198
|
batchSize,
|
|
103
199
|
stopCheckMode,
|
|
104
200
|
readbackInterval,
|
|
105
|
-
|
|
201
|
+
};
|
|
202
|
+
const ringTokens = parseManifestDecodeLoopOptionalPositiveInt(
|
|
203
|
+
decodeLoop.ringTokens,
|
|
204
|
+
'ringTokens',
|
|
205
|
+
modelId
|
|
206
|
+
);
|
|
207
|
+
if (ringTokens !== undefined) {
|
|
208
|
+
batchingPatch.ringTokens = ringTokens;
|
|
209
|
+
}
|
|
210
|
+
const ringStop = parseManifestDecodeLoopOptionalPositiveInt(
|
|
211
|
+
decodeLoop.ringStop,
|
|
212
|
+
'ringStop',
|
|
213
|
+
modelId
|
|
214
|
+
);
|
|
215
|
+
if (ringStop !== undefined) {
|
|
216
|
+
batchingPatch.ringStop = ringStop;
|
|
217
|
+
}
|
|
218
|
+
const ringStaging = parseManifestDecodeLoopOptionalPositiveInt(
|
|
219
|
+
decodeLoop.ringStaging,
|
|
220
|
+
'ringStaging',
|
|
221
|
+
modelId
|
|
222
|
+
);
|
|
223
|
+
if (ringStaging !== undefined) {
|
|
224
|
+
batchingPatch.ringStaging = ringStaging;
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
return {
|
|
228
|
+
batching: batchingPatch,
|
|
229
|
+
generation: disableCommandBatching == null
|
|
230
|
+
? null
|
|
231
|
+
: { disableCommandBatching: disableCommandBatching === true },
|
|
106
232
|
};
|
|
107
233
|
}
|
|
108
234
|
|
|
109
235
|
export function applyModelBatchingRuntimeDefaults(runtimeConfig, manifest, modelConfig) {
|
|
236
|
+
void modelConfig;
|
|
110
237
|
const batching = runtimeConfig?.inference?.batching;
|
|
111
238
|
const generation = runtimeConfig?.inference?.generation;
|
|
112
239
|
const runtimeBatchingAtDefaults = isRuntimeBatchingAtGlobalDefaults(batching);
|
|
113
240
|
const runtimeGenerationAtDefaults = isRuntimeGenerationAtGlobalDefaults(generation);
|
|
114
241
|
|
|
115
|
-
const
|
|
116
|
-
|
|
117
|
-
if (!defaults || typeof defaults !== 'object') {
|
|
242
|
+
const patch = buildManifestDecodeLoopRuntimePatch(manifest);
|
|
243
|
+
if (!patch) {
|
|
118
244
|
return runtimeConfig;
|
|
119
245
|
}
|
|
120
246
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
if (runtimeBatchingAtDefaults) {
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
nextBatching = {
|
|
129
|
-
...batching,
|
|
130
|
-
batchSize: nextBatchSize,
|
|
131
|
-
stopCheckMode: nextStopCheckMode,
|
|
132
|
-
readbackInterval: nextReadbackInterval,
|
|
133
|
-
};
|
|
134
|
-
appliedBatching = true;
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
const shouldApplyDisableCommandBatching = runtimeGenerationAtDefaults
|
|
139
|
-
&& normalizeBoolean(defaults.disableCommandBatching) != null;
|
|
140
|
-
const nextGeneration = shouldApplyDisableCommandBatching
|
|
141
|
-
? {
|
|
142
|
-
...generation,
|
|
143
|
-
disableCommandBatching: defaults.disableCommandBatching === true,
|
|
144
|
-
}
|
|
145
|
-
: generation;
|
|
146
|
-
|
|
147
|
-
if (!appliedBatching && !shouldApplyDisableCommandBatching) {
|
|
148
|
-
return runtimeConfig;
|
|
247
|
+
const runtimeDisableCommandBatching = generation?.disableCommandBatching === true;
|
|
248
|
+
const manifestDisableCommandBatching = patch.generation?.disableCommandBatching === true;
|
|
249
|
+
if (!runtimeBatchingAtDefaults) {
|
|
250
|
+
throw new Error(
|
|
251
|
+
'Manifest decodeLoop defaults cannot be merged after runtime batching overrides were already resolved. ' +
|
|
252
|
+
'Set runtime.inference.batching explicitly to the desired final values, or remove manifest.inference.sessionDefaults.decodeLoop.'
|
|
253
|
+
);
|
|
149
254
|
}
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
'
|
|
154
|
-
`Model defaults applied (${manifest?.inference?.presetId ?? 'unknown'}): ` +
|
|
155
|
-
`batchSize=${nextBatching.batchSize}, stopCheckMode=${nextBatching.stopCheckMode}, ` +
|
|
156
|
-
`readbackInterval=${nextBatching.readbackInterval}, ` +
|
|
157
|
-
`disableCommandBatching=${nextGeneration.disableCommandBatching === true}`
|
|
255
|
+
if (patch.generation && !runtimeGenerationAtDefaults && runtimeDisableCommandBatching !== manifestDisableCommandBatching) {
|
|
256
|
+
throw new Error(
|
|
257
|
+
'Manifest decodeLoop.disableCommandBatching conflicts with runtime.inference.generation.disableCommandBatching. ' +
|
|
258
|
+
'Choose one explicit source of truth.'
|
|
158
259
|
);
|
|
159
260
|
}
|
|
160
261
|
|
|
161
|
-
|
|
162
|
-
...runtimeConfig,
|
|
262
|
+
const nextRuntimeConfig = mergeRuntimeValues(runtimeConfig, {
|
|
163
263
|
inference: {
|
|
164
|
-
|
|
165
|
-
...(
|
|
166
|
-
...(shouldApplyDisableCommandBatching ? { generation: nextGeneration } : {}),
|
|
264
|
+
batching: patch.batching,
|
|
265
|
+
...(patch.generation ? { generation: patch.generation } : {}),
|
|
167
266
|
},
|
|
168
|
-
};
|
|
267
|
+
});
|
|
268
|
+
log.info(
|
|
269
|
+
'Pipeline',
|
|
270
|
+
`Manifest decodeLoop applied (${manifest?.modelId ?? 'unknown'}): ` +
|
|
271
|
+
`batchSize=${patch.batching.batchSize}, stopCheckMode=${patch.batching.stopCheckMode}, ` +
|
|
272
|
+
`readbackInterval=${patch.batching.readbackInterval}, ` +
|
|
273
|
+
`disableCommandBatching=${patch.generation?.disableCommandBatching === true}`
|
|
274
|
+
);
|
|
275
|
+
return nextRuntimeConfig;
|
|
169
276
|
}
|
|
170
277
|
|
|
171
278
|
export async function runKernelWarmup(options) {
|
|
@@ -206,7 +313,7 @@ function normalizeKernelPathSourceHint(value) {
|
|
|
206
313
|
function resolveKernelPathSource(runtimeConfigKernelPath, runtimeKernelPathSourceHint, modelKernelPath) {
|
|
207
314
|
if (runtimeConfigKernelPath) {
|
|
208
315
|
const sourceHint = normalizeKernelPathSourceHint(runtimeKernelPathSourceHint);
|
|
209
|
-
if (sourceHint
|
|
316
|
+
if (sourceHint !== 'none') return sourceHint;
|
|
210
317
|
return 'config';
|
|
211
318
|
}
|
|
212
319
|
if (modelKernelPath) return 'model';
|
|
@@ -334,7 +441,7 @@ function assertKernelPathFeatureCompatibility(
|
|
|
334
441
|
|
|
335
442
|
if (kernelPathSource === 'execution-v0' && typeof effectiveKernelPathRef !== 'string') {
|
|
336
443
|
const remediation = policyAllowsSource
|
|
337
|
-
? 'Execution-v0 inline kernel paths are not auto-remapped yet. Use subgroup/f16-compatible execution steps, or set runtime.inference.kernelPath to a compatible string preset (for example "gemma2-q4k-dequant-f32a").'
|
|
444
|
+
? 'Execution-v0 inline kernel paths are not auto-remapped yet. Use subgroup/f16-compatible execution steps, or set runtime.inference.kernelPath to a compatible string preset (for example "gemma2-q4k-dequant-f32a-nosubgroups").'
|
|
338
445
|
: 'Enable runtime.inference.kernelPathPolicy.sourceScope to include "execution-v0", then use compatible execution steps or a compatible preset id.';
|
|
339
446
|
throw new Error(
|
|
340
447
|
`[ExecutionV0] Inline kernelPath requires unsupported GPU features. ` +
|
|
@@ -366,6 +473,55 @@ function normalizeKernelDtype(value) {
|
|
|
366
473
|
});
|
|
367
474
|
}
|
|
368
475
|
|
|
476
|
+
function buildKernelPathDtypeContract(resolvedKernelPath) {
|
|
477
|
+
if (!resolvedKernelPath) {
|
|
478
|
+
return null;
|
|
479
|
+
}
|
|
480
|
+
const activationDtype = normalizeKernelDtype(getKernelPathActivationDtype(resolvedKernelPath));
|
|
481
|
+
const outputDtype = normalizeKernelDtype(
|
|
482
|
+
getKernelPathOutputDtype(resolvedKernelPath) ?? activationDtype
|
|
483
|
+
);
|
|
484
|
+
const kvDtype = normalizeKernelDtype(getKernelPathKVDtype(resolvedKernelPath) ?? activationDtype);
|
|
485
|
+
if (!activationDtype && !outputDtype && !kvDtype) {
|
|
486
|
+
return null;
|
|
487
|
+
}
|
|
488
|
+
return {
|
|
489
|
+
activationDtype,
|
|
490
|
+
outputDtype,
|
|
491
|
+
kvDtype,
|
|
492
|
+
};
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
function isGlobalKernelPathDtypeDefault(currentValue, key) {
|
|
496
|
+
if (currentValue == null) {
|
|
497
|
+
return true;
|
|
498
|
+
}
|
|
499
|
+
return currentValue === GLOBAL_DEFAULT_KERNEL_PATH_DTYPES[key];
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
function describeKernelPathDtypeMismatch(contract, current) {
|
|
503
|
+
const mismatches = [];
|
|
504
|
+
if (contract.activationDtype && current.activationDtype !== contract.activationDtype) {
|
|
505
|
+
mismatches.push(
|
|
506
|
+
`runtime.inference.compute.activationDtype=${current.activationDtype ?? 'unset'} ` +
|
|
507
|
+
`(expected ${contract.activationDtype})`
|
|
508
|
+
);
|
|
509
|
+
}
|
|
510
|
+
if (contract.kvDtype && current.kvDtype !== contract.kvDtype) {
|
|
511
|
+
mismatches.push(
|
|
512
|
+
`runtime.inference.kvcache.kvDtype=${current.kvDtype ?? 'unset'} ` +
|
|
513
|
+
`(expected ${contract.kvDtype})`
|
|
514
|
+
);
|
|
515
|
+
}
|
|
516
|
+
if (contract.outputDtype && current.outputDtype !== contract.outputDtype) {
|
|
517
|
+
mismatches.push(
|
|
518
|
+
`runtime.inference.session.compute.defaults.outputDtype=${current.outputDtype ?? 'unset'} ` +
|
|
519
|
+
`(expected ${contract.outputDtype})`
|
|
520
|
+
);
|
|
521
|
+
}
|
|
522
|
+
return mismatches;
|
|
523
|
+
}
|
|
524
|
+
|
|
369
525
|
function assertManifestKernelPathDtypeCompatibility(manifest, resolvedKernelPath, kernelPathSource) {
|
|
370
526
|
if (!resolvedKernelPath) return;
|
|
371
527
|
if (kernelPathSource === 'config') return;
|
|
@@ -376,16 +532,6 @@ function assertManifestKernelPathDtypeCompatibility(manifest, resolvedKernelPath
|
|
|
376
532
|
if (!manifestCompute || !kernelActivation) return;
|
|
377
533
|
if (manifestCompute === kernelActivation) return;
|
|
378
534
|
|
|
379
|
-
const presetId = String(manifest?.inference?.presetId ?? '').trim().toLowerCase();
|
|
380
|
-
if (presetId === 'lfm2' && manifestCompute === 'f32' && kernelActivation === 'f16') {
|
|
381
|
-
log.warn(
|
|
382
|
-
'Pipeline',
|
|
383
|
-
`Manifest "${manifest?.modelId ?? 'unknown'}" uses quantizationInfo.compute=f32 ` +
|
|
384
|
-
`with kernelPath activationDtype=f16 (${resolvedKernelPath.id}); continuing for LFM2 mixed-precision compatibility.`
|
|
385
|
-
);
|
|
386
|
-
return;
|
|
387
|
-
}
|
|
388
|
-
|
|
389
535
|
throw new Error(
|
|
390
536
|
`Manifest kernel path dtype mismatch for "${manifest?.modelId ?? 'unknown'}": ` +
|
|
391
537
|
`quantizationInfo.compute=${manifestCompute} but ` +
|
|
@@ -402,17 +548,45 @@ function getKernelCapabilitiesSafe() {
|
|
|
402
548
|
}
|
|
403
549
|
}
|
|
404
550
|
|
|
405
|
-
function
|
|
406
|
-
const
|
|
407
|
-
|
|
408
|
-
const kernelPathKVDtype = getKernelPathKVDtype(resolvedKernelPath);
|
|
409
|
-
if (!kernelPathActivationDtype && !kernelPathOutputDtype && !kernelPathKVDtype) {
|
|
551
|
+
function applyKernelPathRuntimeDtypeContract(resolvedKernelPath, runtimeConfig, kernelPathSource, modelId) {
|
|
552
|
+
const contract = buildKernelPathDtypeContract(resolvedKernelPath);
|
|
553
|
+
if (!contract) {
|
|
410
554
|
return runtimeConfig;
|
|
411
555
|
}
|
|
412
556
|
|
|
413
|
-
const
|
|
414
|
-
|
|
415
|
-
|
|
557
|
+
const current = {
|
|
558
|
+
activationDtype: normalizeKernelDtype(runtimeConfig.inference?.compute?.activationDtype),
|
|
559
|
+
kvDtype: normalizeKernelDtype(runtimeConfig.inference?.kvcache?.kvDtype),
|
|
560
|
+
outputDtype: normalizeKernelDtype(runtimeConfig.inference?.session?.compute?.defaults?.outputDtype),
|
|
561
|
+
};
|
|
562
|
+
const mismatches = describeKernelPathDtypeMismatch(contract, current);
|
|
563
|
+
if (mismatches.length === 0) {
|
|
564
|
+
return runtimeConfig;
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
if (kernelPathSource === 'config' || kernelPathSource === 'execution-v0') {
|
|
568
|
+
throw new Error(
|
|
569
|
+
`KernelPath "${resolvedKernelPath?.id ?? 'unknown'}" selected from ${kernelPathSource} ` +
|
|
570
|
+
`requires explicit matching runtime dtypes for "${modelId}". ` +
|
|
571
|
+
`Mismatches: ${mismatches.join('; ')}. ` +
|
|
572
|
+
'Set runtime.inference.compute.activationDtype, runtime.inference.kvcache.kvDtype, ' +
|
|
573
|
+
'and runtime.inference.session.compute.defaults.outputDtype to match the kernel path.'
|
|
574
|
+
);
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
const canApplyManifestDefaults = (
|
|
578
|
+
(contract.activationDtype == null || isGlobalKernelPathDtypeDefault(current.activationDtype, 'activationDtype'))
|
|
579
|
+
&& (contract.kvDtype == null || isGlobalKernelPathDtypeDefault(current.kvDtype, 'kvDtype'))
|
|
580
|
+
&& (contract.outputDtype == null || isGlobalKernelPathDtypeDefault(current.outputDtype, 'outputDtype'))
|
|
581
|
+
);
|
|
582
|
+
if (!canApplyManifestDefaults) {
|
|
583
|
+
throw new Error(
|
|
584
|
+
`Manifest/model kernelPath "${resolvedKernelPath?.id ?? 'unknown'}" for "${modelId}" ` +
|
|
585
|
+
`conflicts with runtime dtype overrides. Mismatches: ${mismatches.join('; ')}. ` +
|
|
586
|
+
'Either remove the runtime dtype override or set it to match the kernel path.'
|
|
587
|
+
);
|
|
588
|
+
}
|
|
589
|
+
|
|
416
590
|
const nextInference = {
|
|
417
591
|
...runtimeConfig.inference,
|
|
418
592
|
compute: { ...runtimeConfig.inference.compute },
|
|
@@ -420,37 +594,33 @@ function applyKernelPathRuntimeDtypeOverrides(resolvedKernelPath, runtimeConfig)
|
|
|
420
594
|
};
|
|
421
595
|
const dtypeChanges = [];
|
|
422
596
|
|
|
423
|
-
if (
|
|
424
|
-
nextInference.compute.activationDtype =
|
|
425
|
-
dtypeChanges.push(`activation=${
|
|
597
|
+
if (contract.activationDtype && current.activationDtype !== contract.activationDtype) {
|
|
598
|
+
nextInference.compute.activationDtype = contract.activationDtype;
|
|
599
|
+
dtypeChanges.push(`activation=${current.activationDtype ?? 'unset'}->${contract.activationDtype}`);
|
|
426
600
|
}
|
|
427
601
|
|
|
428
|
-
if (
|
|
429
|
-
nextInference.kvcache.kvDtype =
|
|
430
|
-
dtypeChanges.push(`kv=${
|
|
602
|
+
if (contract.kvDtype && current.kvDtype !== contract.kvDtype) {
|
|
603
|
+
nextInference.kvcache.kvDtype = contract.kvDtype;
|
|
604
|
+
dtypeChanges.push(`kv=${current.kvDtype ?? 'unset'}->${contract.kvDtype}`);
|
|
431
605
|
}
|
|
432
606
|
|
|
433
|
-
if (
|
|
607
|
+
if (contract.outputDtype && current.outputDtype !== contract.outputDtype) {
|
|
434
608
|
nextInference.session = {
|
|
435
609
|
...(nextInference.session ?? {}),
|
|
436
610
|
compute: {
|
|
437
611
|
...(nextInference.session?.compute ?? {}),
|
|
438
612
|
defaults: {
|
|
439
613
|
...(nextInference.session?.compute?.defaults ?? {}),
|
|
440
|
-
outputDtype:
|
|
614
|
+
outputDtype: contract.outputDtype,
|
|
441
615
|
},
|
|
442
616
|
},
|
|
443
617
|
};
|
|
444
|
-
dtypeChanges.push(`session.outputDtype=${
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
if (dtypeChanges.length === 0) {
|
|
448
|
-
return runtimeConfig;
|
|
618
|
+
dtypeChanges.push(`session.outputDtype=${current.outputDtype ?? 'unset'}->${contract.outputDtype}`);
|
|
449
619
|
}
|
|
450
620
|
|
|
451
621
|
log.info(
|
|
452
622
|
'Pipeline',
|
|
453
|
-
`KernelPath ${resolvedKernelPath?.id ?? 'unknown'} runtime dtype
|
|
623
|
+
`KernelPath ${resolvedKernelPath?.id ?? 'unknown'} applied manifest/model runtime dtype defaults: ${dtypeChanges.join(', ')}`
|
|
454
624
|
);
|
|
455
625
|
return { ...runtimeConfig, inference: nextInference };
|
|
456
626
|
}
|
|
@@ -521,7 +691,12 @@ export function resolveKernelPathState(options) {
|
|
|
521
691
|
log.info('Pipeline', 'KernelPath: none (no kernel path configured)');
|
|
522
692
|
}
|
|
523
693
|
|
|
524
|
-
const nextRuntimeConfig =
|
|
694
|
+
const nextRuntimeConfig = applyKernelPathRuntimeDtypeContract(
|
|
695
|
+
resolvedKernelPath,
|
|
696
|
+
runtimeConfig,
|
|
697
|
+
kernelPathSource,
|
|
698
|
+
String(manifest?.modelId ?? 'unknown').trim() || 'unknown'
|
|
699
|
+
);
|
|
525
700
|
return {
|
|
526
701
|
resolvedKernelPath,
|
|
527
702
|
kernelPathSource,
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { getRuntimeConfig } from '../../../config/runtime.js';
|
|
2
2
|
import { QK_K } from '../../../config/schema/index.js';
|
|
3
|
+
import { releaseBuffer } from '../../../memory/buffer-pool.js';
|
|
3
4
|
|
|
4
5
|
const dequantCache = new Map();
|
|
5
6
|
let dequantCacheMaxEntriesOverride = null;
|
|
@@ -73,8 +74,8 @@ export function setCachedDequant(layerIdx, expertIdx, outputDtype, gateUp, down)
|
|
|
73
74
|
if (oldestKey) {
|
|
74
75
|
const evicted = dequantCache.get(oldestKey);
|
|
75
76
|
if (evicted) {
|
|
76
|
-
evicted.gateUp
|
|
77
|
-
evicted.down
|
|
77
|
+
releaseBuffer(evicted.gateUp);
|
|
78
|
+
releaseBuffer(evicted.down);
|
|
78
79
|
}
|
|
79
80
|
dequantCache.delete(oldestKey);
|
|
80
81
|
}
|
|
@@ -85,8 +86,8 @@ export function setCachedDequant(layerIdx, expertIdx, outputDtype, gateUp, down)
|
|
|
85
86
|
|
|
86
87
|
export function clearDequantCache() {
|
|
87
88
|
for (const cached of dequantCache.values()) {
|
|
88
|
-
cached.gateUp
|
|
89
|
-
cached.down
|
|
89
|
+
releaseBuffer(cached.gateUp);
|
|
90
|
+
releaseBuffer(cached.down);
|
|
90
91
|
}
|
|
91
92
|
dequantCache.clear();
|
|
92
93
|
dequantCacheHits = 0;
|