@simulatte/doppler 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +16 -23
- package/package.json +30 -32
- package/src/adapters/adapter-registry.js +12 -1
- package/src/adapters/lora-loader.js +23 -6
- package/src/bridge/extension-client.d.ts +5 -0
- package/src/bridge/extension-client.js +40 -0
- package/src/bridge/index.d.ts +2 -1
- package/src/bridge/index.js +6 -4
- package/src/browser/browser-converter.js +31 -1
- package/src/browser/file-picker.js +6 -0
- package/src/browser/safetensors-parser-browser.js +84 -1
- package/src/browser/shard-io-browser.js +2 -2
- package/src/browser/tensor-source-download.js +8 -2
- package/src/browser/tensor-source-http.d.ts +1 -0
- package/src/browser/tensor-source-http.js +5 -1
- package/src/client/doppler-api.browser.js +20 -4
- package/src/client/doppler-api.js +19 -3
- package/src/client/doppler-provider/generation.js +12 -0
- package/src/client/doppler-provider/model-manager.d.ts +10 -0
- package/src/client/doppler-provider/model-manager.js +91 -19
- package/src/client/doppler-provider/source-runtime.d.ts +2 -1
- package/src/client/doppler-provider/source-runtime.js +132 -13
- package/src/client/doppler-registry.json +5 -20
- package/src/config/backward-registry-loader.js +17 -2
- package/src/config/execution-v0-contract-check.js +113 -15
- package/src/config/kernel-path-contract-check.js +57 -29
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +18 -36
- package/src/config/kernels/kernel-ref-digests.js +1 -1
- package/src/config/kernels/registry.js +14 -1
- package/src/config/kernels/registry.json +81 -5
- package/src/config/loader.d.ts +1 -1
- package/src/config/loader.js +15 -2
- package/src/config/merge-contract-check.js +66 -4
- package/src/config/merge-helpers.js +128 -7
- package/src/config/merge.d.ts +1 -0
- package/src/config/merge.js +10 -0
- package/src/config/param-validator.js +47 -2
- package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
- package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +43 -8
- package/src/config/presets/models/gemma2.json +3 -2
- package/src/config/presets/models/gemma3.json +2 -0
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
- package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
- package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
- package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
- package/src/config/runtime.js +6 -1
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +5 -0
- package/src/config/schema/doppler.schema.js +16 -21
- package/src/config/schema/inference-defaults.schema.js +3 -3
- package/src/config/schema/kernel-path.schema.d.ts +5 -1
- package/src/config/schema/kernel-thresholds.schema.js +12 -4
- package/src/config/schema/manifest.schema.d.ts +3 -2
- package/src/config/schema/manifest.schema.js +17 -4
- package/src/config/schema/storage.schema.js +1 -1
- package/src/config/training-defaults.js +30 -22
- package/src/converter/conversion-plan.js +104 -11
- package/src/converter/core.d.ts +7 -0
- package/src/converter/core.js +16 -9
- package/src/converter/execution-v0-manifest.js +4 -1
- package/src/converter/index.d.ts +1 -0
- package/src/converter/index.js +1 -0
- package/src/converter/manifest-inference.js +50 -29
- package/src/converter/parsers/diffusion.js +0 -3
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +40 -16
- package/src/converter/quantizer.js +19 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/shard-packer.d.ts +1 -1
- package/src/converter/shard-packer.js +4 -1
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/config.js +123 -11
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/debug/signals.js +7 -1
- package/src/debug/tensor.d.ts +2 -0
- package/src/debug/tensor.js +13 -2
- package/src/distribution/p2p-control-plane.js +52 -12
- package/src/distribution/p2p-observability.js +43 -7
- package/src/distribution/p2p-webrtc-browser.js +20 -0
- package/src/distribution/shard-delivery.js +83 -27
- package/src/formats/gguf/types.js +33 -16
- package/src/formats/rdrr/groups.d.ts +12 -4
- package/src/formats/rdrr/groups.js +3 -6
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +53 -3
- package/src/formats/rdrr/types.d.ts +2 -1
- package/src/gpu/command-recorder.js +86 -61
- package/src/gpu/device.d.ts +1 -0
- package/src/gpu/device.js +73 -19
- package/src/gpu/kernel-tuner/benchmarks.js +326 -316
- package/src/gpu/kernel-tuner/cache.js +71 -4
- package/src/gpu/kernel-tuner/tuner.js +22 -4
- package/src/gpu/kernels/attention.js +15 -34
- package/src/gpu/kernels/backward/adam.js +62 -58
- package/src/gpu/kernels/backward/attention_backward.js +257 -169
- package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
- package/src/gpu/kernels/cast.js +191 -149
- package/src/gpu/kernels/check-stop.js +33 -44
- package/src/gpu/kernels/conv2d.js +27 -17
- package/src/gpu/kernels/cross_entropy_loss.js +21 -15
- package/src/gpu/kernels/depthwise_conv2d.js +36 -26
- package/src/gpu/kernels/dequant.js +178 -126
- package/src/gpu/kernels/energy.d.ts +3 -21
- package/src/gpu/kernels/energy.js +111 -88
- package/src/gpu/kernels/feature-check.js +1 -1
- package/src/gpu/kernels/fused_ffn.js +84 -65
- package/src/gpu/kernels/fused_matmul_residual.js +56 -33
- package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
- package/src/gpu/kernels/gather.js +33 -15
- package/src/gpu/kernels/gelu.js +19 -11
- package/src/gpu/kernels/grouped_pointwise_conv2d.js +33 -23
- package/src/gpu/kernels/groupnorm.js +34 -23
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/kv-quantize.js +5 -2
- package/src/gpu/kernels/layernorm.js +35 -19
- package/src/gpu/kernels/logit-merge.js +5 -3
- package/src/gpu/kernels/matmul-selection.js +47 -4
- package/src/gpu/kernels/matmul.d.ts +2 -0
- package/src/gpu/kernels/matmul.js +59 -40
- package/src/gpu/kernels/modulate.js +23 -15
- package/src/gpu/kernels/moe.js +221 -175
- package/src/gpu/kernels/pixel_shuffle.js +22 -14
- package/src/gpu/kernels/relu.js +18 -10
- package/src/gpu/kernels/repeat_channels.js +25 -17
- package/src/gpu/kernels/residual.js +37 -27
- package/src/gpu/kernels/rmsnorm.js +66 -43
- package/src/gpu/kernels/rope.js +3 -0
- package/src/gpu/kernels/sample.js +27 -38
- package/src/gpu/kernels/sana_linear_attention.js +18 -10
- package/src/gpu/kernels/scale.js +18 -11
- package/src/gpu/kernels/shader-cache.js +4 -2
- package/src/gpu/kernels/silu.js +120 -72
- package/src/gpu/kernels/softmax.js +44 -25
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/kernels/split_qkv.js +23 -13
- package/src/gpu/kernels/transpose.js +18 -10
- package/src/gpu/kernels/transpose.wgsl +5 -3
- package/src/gpu/kernels/upsample2d.js +21 -13
- package/src/gpu/kernels/utils.js +20 -13
- package/src/gpu/partitioned-buffer-pool.js +10 -2
- package/src/gpu/perf-guards.js +2 -9
- package/src/gpu/profiler.js +27 -22
- package/src/gpu/readback-utils.d.ts +16 -0
- package/src/gpu/readback-utils.js +41 -0
- package/src/gpu/submit-tracker.js +13 -0
- package/src/gpu/uniform-cache.d.ts +1 -0
- package/src/gpu/uniform-cache.js +30 -9
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/hotswap/intent-bundle.js +6 -0
- package/src/hotswap/manifest.d.ts +10 -1
- package/src/hotswap/manifest.js +12 -2
- package/src/hotswap/runtime.js +30 -8
- package/src/index-browser.d.ts +44 -0
- package/src/index-browser.js +14 -0
- package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
- package/src/inference/browser-harness-contract-helpers.js +28 -0
- package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
- package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
- package/src/inference/browser-harness-model-helpers.d.ts +16 -0
- package/src/inference/browser-harness-model-helpers.js +217 -0
- package/src/inference/browser-harness-report-helpers.d.ts +7 -0
- package/src/inference/browser-harness-report-helpers.js +42 -0
- package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
- package/src/inference/browser-harness-runtime-helpers.js +415 -0
- package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
- package/src/inference/browser-harness-suite-helpers.js +268 -0
- package/src/inference/browser-harness-text-helpers.d.ts +27 -0
- package/src/inference/browser-harness-text-helpers.js +788 -0
- package/src/inference/browser-harness.d.ts +8 -0
- package/src/inference/browser-harness.js +149 -1996
- package/src/inference/kv-cache/base.js +140 -94
- package/src/inference/kv-cache/tiered.js +5 -3
- package/src/inference/moe-router.js +88 -56
- package/src/inference/multi-model-network.js +5 -3
- package/src/inference/network-evolution.d.ts +11 -2
- package/src/inference/network-evolution.js +20 -21
- package/src/inference/pipelines/context.d.ts +3 -0
- package/src/inference/pipelines/context.js +142 -2
- package/src/inference/pipelines/diffusion/helpers.js +10 -2
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
- package/src/inference/pipelines/diffusion/vae.js +3 -7
- package/src/inference/pipelines/energy/pipeline.js +27 -21
- package/src/inference/pipelines/energy/quintel.d.ts +5 -0
- package/src/inference/pipelines/energy/quintel.js +11 -0
- package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
- package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
- package/src/inference/pipelines/text/attention/projections.js +192 -112
- package/src/inference/pipelines/text/attention/record.js +77 -14
- package/src/inference/pipelines/text/attention/run.js +112 -14
- package/src/inference/pipelines/text/config.js +17 -4
- package/src/inference/pipelines/text/embed.js +2 -8
- package/src/inference/pipelines/text/execution-plan.js +46 -23
- package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
- package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
- package/src/inference/pipelines/text/execution-v0.js +62 -1013
- package/src/inference/pipelines/text/generator-runtime.js +5 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +52 -0
- package/src/inference/pipelines/text/generator-steps.js +340 -221
- package/src/inference/pipelines/text/generator.js +56 -40
- package/src/inference/pipelines/text/init.d.ts +13 -0
- package/src/inference/pipelines/text/init.js +94 -25
- package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
- package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
- package/src/inference/pipelines/text/kernel-trace.js +6 -0
- package/src/inference/pipelines/text/layer.js +4 -9
- package/src/inference/pipelines/text/linear-attention.d.ts +15 -0
- package/src/inference/pipelines/text/linear-attention.js +113 -9
- package/src/inference/pipelines/text/logits/gpu.js +12 -7
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +13 -12
- package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
- package/src/inference/pipelines/text/logits/utils.js +9 -0
- package/src/inference/pipelines/text/lora-apply.js +50 -32
- package/src/inference/pipelines/text/model-load.js +282 -104
- package/src/inference/pipelines/text/moe-cache.js +5 -4
- package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
- package/src/inference/pipelines/text/moe-cpu.js +42 -38
- package/src/inference/pipelines/text/moe-gpu.js +110 -86
- package/src/inference/pipelines/text/ops.js +90 -90
- package/src/inference/pipelines/text/probes.js +9 -9
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/weights.js +17 -7
- package/src/inference/pipelines/text.js +13 -1
- package/src/inference/speculative.d.ts +2 -2
- package/src/inference/speculative.js +4 -18
- package/src/inference/test-harness.d.ts +1 -1
- package/src/inference/test-harness.js +17 -7
- package/src/inference/tokenizer.d.ts +0 -5
- package/src/inference/tokenizer.js +4 -23
- package/src/inference/tokenizers/bpe.js +9 -0
- package/src/inference/tokenizers/bundled.js +20 -0
- package/src/inference/tokenizers/sentencepiece.js +12 -0
- package/src/loader/doppler-loader.js +38 -22
- package/src/loader/dtype-utils.js +3 -44
- package/src/loader/embedding-loader.js +7 -3
- package/src/loader/experts/expert-cache.js +13 -6
- package/src/loader/experts/expert-loader.js +10 -6
- package/src/loader/final-weights-loader.js +10 -4
- package/src/loader/layer-loader.js +2 -1
- package/src/loader/loader-state.js +2 -2
- package/src/loader/memory-monitor.js +8 -0
- package/src/loader/multi-model-loader.d.ts +14 -0
- package/src/loader/multi-model-loader.js +70 -24
- package/src/loader/shard-cache.js +84 -14
- package/src/loader/shard-resolver.js +25 -3
- package/src/loader/tensors/tensor-loader.js +214 -144
- package/src/loader/tensors/tensor-reader.js +76 -19
- package/src/loader/weight-downcast.js +1 -1
- package/src/memory/buffer-pool.d.ts +9 -1
- package/src/memory/buffer-pool.js +109 -44
- package/src/memory/unified-detect.js +1 -1
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +24 -8
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.js +27 -1
- package/src/storage/backends/opfs-store.js +68 -24
- package/src/storage/downloader.js +365 -83
- package/src/storage/index.d.ts +3 -0
- package/src/storage/index.js +3 -0
- package/src/storage/preflight.d.ts +2 -2
- package/src/storage/preflight.js +24 -2
- package/src/storage/quickstart-downloader.js +11 -5
- package/src/storage/registry.js +10 -4
- package/src/storage/reports.js +1 -1
- package/src/storage/shard-manager.d.ts +15 -1
- package/src/storage/shard-manager.js +55 -6
- package/src/storage/source-artifact-store.d.ts +52 -0
- package/src/storage/source-artifact-store.js +234 -0
- package/src/tooling/command-api-constants.d.ts +9 -0
- package/src/tooling/command-api-constants.js +9 -0
- package/src/tooling/command-api-family-normalizers.d.ts +9 -0
- package/src/tooling/command-api-family-normalizers.js +343 -0
- package/src/tooling/command-api-helpers.d.ts +25 -0
- package/src/tooling/command-api-helpers.js +262 -0
- package/src/tooling/command-api.js +16 -602
- package/src/tooling/command-envelope.js +4 -1
- package/src/tooling/command-runner-shared.js +52 -18
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/lean-execution-contract.js +150 -3
- package/src/tooling/node-browser-command-runner.js +161 -271
- package/src/tooling/node-command-runner.js +29 -3
- package/src/tooling/node-converter.js +30 -1
- package/src/tooling/node-source-runtime.d.ts +1 -1
- package/src/tooling/node-source-runtime.js +120 -3
- package/src/tooling/node-webgpu.js +24 -21
- package/src/tooling/opfs-cache.js +21 -4
- package/src/tooling/runtime-input-composition.d.ts +38 -0
- package/src/tooling/runtime-input-composition.js +86 -0
- package/src/tooling/source-runtime-bundle.d.ts +40 -5
- package/src/tooling/source-runtime-bundle.js +261 -34
- package/src/tooling/source-runtime-materializer.d.ts +6 -0
- package/src/tooling/source-runtime-materializer.js +93 -0
- package/src/training/attention-backward.js +32 -17
- package/src/training/autograd.js +80 -52
- package/src/training/checkpoint-watch.d.ts +2 -1
- package/src/training/checkpoint-watch.js +39 -6
- package/src/training/checkpoint.js +40 -11
- package/src/training/clip.js +2 -1
- package/src/training/datasets/token-batch.js +20 -8
- package/src/training/distillation/checkpoint-watch.js +1 -0
- package/src/training/distillation/student-fixture.d.ts +22 -0
- package/src/training/distillation/student-fixture.js +846 -0
- package/src/training/distillation/suite-data.d.ts +45 -0
- package/src/training/distillation/suite-data.js +189 -0
- package/src/training/lora-pipeline.js +4 -7
- package/src/training/lora.js +26 -12
- package/src/training/loss.js +5 -6
- package/src/training/objectives/cross_entropy.js +2 -5
- package/src/training/objectives/distill_kd.js +4 -8
- package/src/training/objectives/distill_triplet.js +4 -8
- package/src/training/objectives/ul_stage2_base.js +4 -8
- package/src/training/operator-command.js +2 -0
- package/src/training/optimizer.js +19 -7
- package/src/training/runner.js +2 -1
- package/src/training/suite.js +18 -978
- package/src/training/tensor-factory.d.ts +9 -0
- package/src/training/tensor-factory.js +13 -0
- package/src/training/trainer.js +3 -5
- package/src/training/ul_dataset.js +3 -5
- package/src/training/workloads.js +70 -79
- package/src/types/model.d.ts +5 -0
- package/src/version.js +1 -1
- package/tools/convert-safetensors-node.js +22 -16
- package/tools/doppler-cli.js +50 -26
|
@@ -15,10 +15,14 @@ import { KERNEL_CONFIGS } from '../../../gpu/kernels/kernel-configs.js';
|
|
|
15
15
|
import { resolveCapabilityKernelPathRef, resolveKernelPathPolicy } from './kernel-path-auto-select.js';
|
|
16
16
|
import { initTokenizer } from './init.js';
|
|
17
17
|
import { selectRuleValue } from '../../../rules/rule-registry.js';
|
|
18
|
+
import { mergeRuntimeValues } from '../../../config/runtime-merge.js';
|
|
18
19
|
import {
|
|
19
20
|
DEFAULT_BATCHING_DEFAULTS,
|
|
21
|
+
DEFAULT_COMPUTE_DEFAULTS,
|
|
20
22
|
DEFAULT_GENERATION_CONFIG,
|
|
21
23
|
} from '../../../config/schema/inference-defaults.schema.js';
|
|
24
|
+
import { DEFAULT_KVCACHE_CONFIG } from '../../../config/schema/kvcache.schema.js';
|
|
25
|
+
import { DEFAULT_EXECUTION_V0_SESSION_DEFAULTS } from '../../../config/schema/execution-v0.schema.js';
|
|
22
26
|
|
|
23
27
|
function validateKernelWarmupMode(mode) {
|
|
24
28
|
if (mode !== 'parallel' && mode !== 'sequential') {
|
|
@@ -48,23 +52,97 @@ function normalizeBoolean(value) {
|
|
|
48
52
|
return typeof value === 'boolean' ? value : null;
|
|
49
53
|
}
|
|
50
54
|
|
|
55
|
+
function parseManifestDecodeLoopOptionalPositiveInt(value, label, modelId) {
|
|
56
|
+
if (value === undefined) {
|
|
57
|
+
return undefined;
|
|
58
|
+
}
|
|
59
|
+
if (value === null) {
|
|
60
|
+
return null;
|
|
61
|
+
}
|
|
62
|
+
const normalized = normalizePositiveInt(value);
|
|
63
|
+
if (normalized == null) {
|
|
64
|
+
throw new Error(
|
|
65
|
+
`Manifest "${modelId}" inference.sessionDefaults.decodeLoop.${label} must be a positive integer or null.`
|
|
66
|
+
);
|
|
67
|
+
}
|
|
68
|
+
return normalized;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
function parseManifestDecodeLoopOptionalBoolean(value, label, modelId) {
|
|
72
|
+
if (value === undefined) {
|
|
73
|
+
return undefined;
|
|
74
|
+
}
|
|
75
|
+
if (typeof value !== 'boolean') {
|
|
76
|
+
throw new Error(
|
|
77
|
+
`Manifest "${modelId}" inference.sessionDefaults.decodeLoop.${label} must be a boolean when provided.`
|
|
78
|
+
);
|
|
79
|
+
}
|
|
80
|
+
return value;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
function requireGlobalBatchingDefault(value, label) {
|
|
84
|
+
const normalized = normalizePositiveInt(value);
|
|
85
|
+
if (normalized == null) {
|
|
86
|
+
throw new Error(`${label} must be a positive integer.`);
|
|
87
|
+
}
|
|
88
|
+
return normalized;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
function requireGlobalStopCheckMode(value, label) {
|
|
92
|
+
const normalized = normalizeStopCheckMode(value);
|
|
93
|
+
if (normalized == null) {
|
|
94
|
+
throw new Error(`${label} must be "batch" or "per-token".`);
|
|
95
|
+
}
|
|
96
|
+
return normalized;
|
|
97
|
+
}
|
|
98
|
+
|
|
51
99
|
const GLOBAL_DEFAULT_BATCHING = Object.freeze({
|
|
52
|
-
batchSize:
|
|
53
|
-
|
|
54
|
-
|
|
100
|
+
batchSize: requireGlobalBatchingDefault(
|
|
101
|
+
DEFAULT_BATCHING_DEFAULTS.batchSize,
|
|
102
|
+
'DEFAULT_BATCHING_DEFAULTS.batchSize'
|
|
103
|
+
),
|
|
104
|
+
stopCheckMode: requireGlobalStopCheckMode(
|
|
105
|
+
DEFAULT_BATCHING_DEFAULTS.stopCheckMode,
|
|
106
|
+
'DEFAULT_BATCHING_DEFAULTS.stopCheckMode'
|
|
107
|
+
),
|
|
108
|
+
readbackInterval: requireGlobalBatchingDefault(
|
|
109
|
+
DEFAULT_BATCHING_DEFAULTS.readbackInterval,
|
|
110
|
+
'DEFAULT_BATCHING_DEFAULTS.readbackInterval'
|
|
111
|
+
),
|
|
112
|
+
ringTokens: requireGlobalBatchingDefault(
|
|
113
|
+
DEFAULT_BATCHING_DEFAULTS.ringTokens,
|
|
114
|
+
'DEFAULT_BATCHING_DEFAULTS.ringTokens'
|
|
115
|
+
),
|
|
116
|
+
ringStop: requireGlobalBatchingDefault(
|
|
117
|
+
DEFAULT_BATCHING_DEFAULTS.ringStop,
|
|
118
|
+
'DEFAULT_BATCHING_DEFAULTS.ringStop'
|
|
119
|
+
),
|
|
120
|
+
ringStaging: requireGlobalBatchingDefault(
|
|
121
|
+
DEFAULT_BATCHING_DEFAULTS.ringStaging,
|
|
122
|
+
'DEFAULT_BATCHING_DEFAULTS.ringStaging'
|
|
123
|
+
),
|
|
55
124
|
});
|
|
56
125
|
|
|
57
126
|
const GLOBAL_DEFAULT_GENERATION = Object.freeze({
|
|
58
127
|
disableCommandBatching: DEFAULT_GENERATION_CONFIG.disableCommandBatching === true,
|
|
59
128
|
});
|
|
60
129
|
|
|
130
|
+
const GLOBAL_DEFAULT_KERNEL_PATH_DTYPES = Object.freeze({
|
|
131
|
+
activationDtype: DEFAULT_COMPUTE_DEFAULTS.activationDtype,
|
|
132
|
+
kvDtype: DEFAULT_KVCACHE_CONFIG.kvDtype,
|
|
133
|
+
outputDtype: DEFAULT_EXECUTION_V0_SESSION_DEFAULTS.compute.defaults.outputDtype,
|
|
134
|
+
});
|
|
135
|
+
|
|
61
136
|
function isRuntimeBatchingAtGlobalDefaults(batching) {
|
|
62
137
|
if (!batching || typeof batching !== 'object') {
|
|
63
138
|
return false;
|
|
64
139
|
}
|
|
65
140
|
return normalizePositiveInt(batching.batchSize) === GLOBAL_DEFAULT_BATCHING.batchSize
|
|
66
141
|
&& normalizeStopCheckMode(batching.stopCheckMode) === GLOBAL_DEFAULT_BATCHING.stopCheckMode
|
|
67
|
-
&& normalizeReadbackInterval(batching.readbackInterval) === GLOBAL_DEFAULT_BATCHING.readbackInterval
|
|
142
|
+
&& normalizeReadbackInterval(batching.readbackInterval) === GLOBAL_DEFAULT_BATCHING.readbackInterval
|
|
143
|
+
&& normalizeReadbackInterval(batching.ringTokens) === GLOBAL_DEFAULT_BATCHING.ringTokens
|
|
144
|
+
&& normalizeReadbackInterval(batching.ringStop) === GLOBAL_DEFAULT_BATCHING.ringStop
|
|
145
|
+
&& normalizeReadbackInterval(batching.ringStaging) === GLOBAL_DEFAULT_BATCHING.ringStaging;
|
|
68
146
|
}
|
|
69
147
|
|
|
70
148
|
function isRuntimeGenerationAtGlobalDefaults(generation) {
|
|
@@ -74,98 +152,130 @@ function isRuntimeGenerationAtGlobalDefaults(generation) {
|
|
|
74
152
|
return (generation.disableCommandBatching === true) === GLOBAL_DEFAULT_GENERATION.disableCommandBatching;
|
|
75
153
|
}
|
|
76
154
|
|
|
77
|
-
function
|
|
78
|
-
const
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
modelType: modelType || null,
|
|
84
|
-
numLayers: Number(modelConfig?.numLayers ?? 0),
|
|
85
|
-
hiddenSize: Number(modelConfig?.hiddenSize ?? 0),
|
|
86
|
-
});
|
|
155
|
+
function requireManifestDecodeLoopPositiveInt(value, label, modelId) {
|
|
156
|
+
const normalized = normalizePositiveInt(value);
|
|
157
|
+
if (normalized == null) {
|
|
158
|
+
throw new Error(`Manifest "${modelId}" inference.sessionDefaults.decodeLoop.${label} must be a positive integer.`);
|
|
159
|
+
}
|
|
160
|
+
return normalized;
|
|
87
161
|
}
|
|
88
162
|
|
|
89
|
-
function
|
|
163
|
+
function requireManifestDecodeLoopStopCheckMode(value, modelId) {
|
|
164
|
+
const normalized = normalizeStopCheckMode(value);
|
|
165
|
+
if (normalized == null) {
|
|
166
|
+
throw new Error(
|
|
167
|
+
`Manifest "${modelId}" inference.sessionDefaults.decodeLoop.stopCheckMode must be "batch" or "per-token".`
|
|
168
|
+
);
|
|
169
|
+
}
|
|
170
|
+
return normalized;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
function buildManifestDecodeLoopRuntimePatch(manifest) {
|
|
90
174
|
const decodeLoop = manifest?.inference?.sessionDefaults?.decodeLoop;
|
|
91
|
-
if (
|
|
175
|
+
if (decodeLoop == null) {
|
|
92
176
|
return null;
|
|
93
177
|
}
|
|
94
|
-
const
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
return null;
|
|
178
|
+
const modelId = String(manifest?.modelId ?? 'unknown').trim() || 'unknown';
|
|
179
|
+
if (typeof decodeLoop !== 'object') {
|
|
180
|
+
throw new Error(
|
|
181
|
+
`Manifest "${modelId}" inference.sessionDefaults.decodeLoop must be an object when provided.`
|
|
182
|
+
);
|
|
100
183
|
}
|
|
101
|
-
|
|
184
|
+
const batchSize = requireManifestDecodeLoopPositiveInt(decodeLoop.batchSize, 'batchSize', modelId);
|
|
185
|
+
const stopCheckMode = requireManifestDecodeLoopStopCheckMode(decodeLoop.stopCheckMode, modelId);
|
|
186
|
+
const readbackInterval = requireManifestDecodeLoopPositiveInt(
|
|
187
|
+
decodeLoop.readbackInterval,
|
|
188
|
+
'readbackInterval',
|
|
189
|
+
modelId
|
|
190
|
+
);
|
|
191
|
+
const disableCommandBatching = parseManifestDecodeLoopOptionalBoolean(
|
|
192
|
+
decodeLoop.disableCommandBatching,
|
|
193
|
+
'disableCommandBatching',
|
|
194
|
+
modelId
|
|
195
|
+
);
|
|
196
|
+
|
|
197
|
+
const batchingPatch = {
|
|
102
198
|
batchSize,
|
|
103
199
|
stopCheckMode,
|
|
104
200
|
readbackInterval,
|
|
105
|
-
|
|
201
|
+
};
|
|
202
|
+
const ringTokens = parseManifestDecodeLoopOptionalPositiveInt(
|
|
203
|
+
decodeLoop.ringTokens,
|
|
204
|
+
'ringTokens',
|
|
205
|
+
modelId
|
|
206
|
+
);
|
|
207
|
+
if (ringTokens !== undefined) {
|
|
208
|
+
batchingPatch.ringTokens = ringTokens;
|
|
209
|
+
}
|
|
210
|
+
const ringStop = parseManifestDecodeLoopOptionalPositiveInt(
|
|
211
|
+
decodeLoop.ringStop,
|
|
212
|
+
'ringStop',
|
|
213
|
+
modelId
|
|
214
|
+
);
|
|
215
|
+
if (ringStop !== undefined) {
|
|
216
|
+
batchingPatch.ringStop = ringStop;
|
|
217
|
+
}
|
|
218
|
+
const ringStaging = parseManifestDecodeLoopOptionalPositiveInt(
|
|
219
|
+
decodeLoop.ringStaging,
|
|
220
|
+
'ringStaging',
|
|
221
|
+
modelId
|
|
222
|
+
);
|
|
223
|
+
if (ringStaging !== undefined) {
|
|
224
|
+
batchingPatch.ringStaging = ringStaging;
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
return {
|
|
228
|
+
batching: batchingPatch,
|
|
229
|
+
generation: disableCommandBatching == null
|
|
230
|
+
? null
|
|
231
|
+
: { disableCommandBatching: disableCommandBatching === true },
|
|
106
232
|
};
|
|
107
233
|
}
|
|
108
234
|
|
|
109
235
|
export function applyModelBatchingRuntimeDefaults(runtimeConfig, manifest, modelConfig) {
|
|
236
|
+
void modelConfig;
|
|
237
|
+
if (manifest?.inference?.schema === 'doppler.execution/v0') {
|
|
238
|
+
return runtimeConfig;
|
|
239
|
+
}
|
|
110
240
|
const batching = runtimeConfig?.inference?.batching;
|
|
111
241
|
const generation = runtimeConfig?.inference?.generation;
|
|
112
242
|
const runtimeBatchingAtDefaults = isRuntimeBatchingAtGlobalDefaults(batching);
|
|
113
243
|
const runtimeGenerationAtDefaults = isRuntimeGenerationAtGlobalDefaults(generation);
|
|
114
244
|
|
|
115
|
-
const
|
|
116
|
-
|
|
117
|
-
if (!defaults || typeof defaults !== 'object') {
|
|
245
|
+
const patch = buildManifestDecodeLoopRuntimePatch(manifest);
|
|
246
|
+
if (!patch) {
|
|
118
247
|
return runtimeConfig;
|
|
119
248
|
}
|
|
120
249
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
if (runtimeBatchingAtDefaults) {
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
nextBatching = {
|
|
129
|
-
...batching,
|
|
130
|
-
batchSize: nextBatchSize,
|
|
131
|
-
stopCheckMode: nextStopCheckMode,
|
|
132
|
-
readbackInterval: nextReadbackInterval,
|
|
133
|
-
};
|
|
134
|
-
appliedBatching = true;
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
const shouldApplyDisableCommandBatching = runtimeGenerationAtDefaults
|
|
139
|
-
&& normalizeBoolean(defaults.disableCommandBatching) != null;
|
|
140
|
-
const nextGeneration = shouldApplyDisableCommandBatching
|
|
141
|
-
? {
|
|
142
|
-
...generation,
|
|
143
|
-
disableCommandBatching: defaults.disableCommandBatching === true,
|
|
144
|
-
}
|
|
145
|
-
: generation;
|
|
146
|
-
|
|
147
|
-
if (!appliedBatching && !shouldApplyDisableCommandBatching) {
|
|
148
|
-
return runtimeConfig;
|
|
250
|
+
const runtimeDisableCommandBatching = generation?.disableCommandBatching === true;
|
|
251
|
+
const manifestDisableCommandBatching = patch.generation?.disableCommandBatching === true;
|
|
252
|
+
if (!runtimeBatchingAtDefaults) {
|
|
253
|
+
throw new Error(
|
|
254
|
+
'Manifest decodeLoop defaults cannot be merged after runtime batching overrides were already resolved. ' +
|
|
255
|
+
'Set runtime.inference.batching explicitly to the desired final values, or remove manifest.inference.sessionDefaults.decodeLoop.'
|
|
256
|
+
);
|
|
149
257
|
}
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
'
|
|
154
|
-
`Model defaults applied (${manifest?.inference?.presetId ?? 'unknown'}): ` +
|
|
155
|
-
`batchSize=${nextBatching.batchSize}, stopCheckMode=${nextBatching.stopCheckMode}, ` +
|
|
156
|
-
`readbackInterval=${nextBatching.readbackInterval}, ` +
|
|
157
|
-
`disableCommandBatching=${nextGeneration.disableCommandBatching === true}`
|
|
258
|
+
if (patch.generation && !runtimeGenerationAtDefaults && runtimeDisableCommandBatching !== manifestDisableCommandBatching) {
|
|
259
|
+
throw new Error(
|
|
260
|
+
'Manifest decodeLoop.disableCommandBatching conflicts with runtime.inference.generation.disableCommandBatching. ' +
|
|
261
|
+
'Choose one explicit source of truth.'
|
|
158
262
|
);
|
|
159
263
|
}
|
|
160
264
|
|
|
161
|
-
|
|
162
|
-
...runtimeConfig,
|
|
265
|
+
const nextRuntimeConfig = mergeRuntimeValues(runtimeConfig, {
|
|
163
266
|
inference: {
|
|
164
|
-
|
|
165
|
-
...(
|
|
166
|
-
...(shouldApplyDisableCommandBatching ? { generation: nextGeneration } : {}),
|
|
267
|
+
batching: patch.batching,
|
|
268
|
+
...(patch.generation ? { generation: patch.generation } : {}),
|
|
167
269
|
},
|
|
168
|
-
};
|
|
270
|
+
});
|
|
271
|
+
log.info(
|
|
272
|
+
'Pipeline',
|
|
273
|
+
`Manifest decodeLoop applied (${manifest?.modelId ?? 'unknown'}): ` +
|
|
274
|
+
`batchSize=${patch.batching.batchSize}, stopCheckMode=${patch.batching.stopCheckMode}, ` +
|
|
275
|
+
`readbackInterval=${patch.batching.readbackInterval}, ` +
|
|
276
|
+
`disableCommandBatching=${patch.generation?.disableCommandBatching === true}`
|
|
277
|
+
);
|
|
278
|
+
return nextRuntimeConfig;
|
|
169
279
|
}
|
|
170
280
|
|
|
171
281
|
export async function runKernelWarmup(options) {
|
|
@@ -206,7 +316,7 @@ function normalizeKernelPathSourceHint(value) {
|
|
|
206
316
|
function resolveKernelPathSource(runtimeConfigKernelPath, runtimeKernelPathSourceHint, modelKernelPath) {
|
|
207
317
|
if (runtimeConfigKernelPath) {
|
|
208
318
|
const sourceHint = normalizeKernelPathSourceHint(runtimeKernelPathSourceHint);
|
|
209
|
-
if (sourceHint
|
|
319
|
+
if (sourceHint !== 'none') return sourceHint;
|
|
210
320
|
return 'config';
|
|
211
321
|
}
|
|
212
322
|
if (modelKernelPath) return 'model';
|
|
@@ -334,7 +444,7 @@ function assertKernelPathFeatureCompatibility(
|
|
|
334
444
|
|
|
335
445
|
if (kernelPathSource === 'execution-v0' && typeof effectiveKernelPathRef !== 'string') {
|
|
336
446
|
const remediation = policyAllowsSource
|
|
337
|
-
? 'Execution-v0 inline kernel paths are not auto-remapped yet. Use subgroup/f16-compatible execution steps, or set runtime.inference.kernelPath to a compatible string preset (for example "gemma2-q4k-dequant-f32a").'
|
|
447
|
+
? 'Execution-v0 inline kernel paths are not auto-remapped yet. Use subgroup/f16-compatible execution steps, or set runtime.inference.kernelPath to a compatible string preset (for example "gemma2-q4k-dequant-f32a-nosubgroups").'
|
|
338
448
|
: 'Enable runtime.inference.kernelPathPolicy.sourceScope to include "execution-v0", then use compatible execution steps or a compatible preset id.';
|
|
339
449
|
throw new Error(
|
|
340
450
|
`[ExecutionV0] Inline kernelPath requires unsupported GPU features. ` +
|
|
@@ -366,6 +476,55 @@ function normalizeKernelDtype(value) {
|
|
|
366
476
|
});
|
|
367
477
|
}
|
|
368
478
|
|
|
479
|
+
/**
 * Derives the dtype contract (activation / output / KV) declared by a kernel path.
 *
 * The output and KV dtypes fall back to the activation dtype when the kernel
 * path getter yields null/undefined for them. Every value is passed through
 * normalizeKernelDtype before being returned.
 *
 * @param {object|null|undefined} resolvedKernelPath - Resolved kernel path, if any.
 * @returns {{activationDtype: *, outputDtype: *, kvDtype: *}|null} The contract,
 *   or null when no kernel path was given or none of the three dtypes is truthy
 *   after normalization.
 */
function buildKernelPathDtypeContract(resolvedKernelPath) {
  if (!resolvedKernelPath) return null;

  const activationDtype = normalizeKernelDtype(getKernelPathActivationDtype(resolvedKernelPath));

  // Output and KV dtypes inherit the activation dtype when not declared.
  const declaredOutput = getKernelPathOutputDtype(resolvedKernelPath);
  const outputDtype = normalizeKernelDtype(declaredOutput ?? activationDtype);

  const declaredKv = getKernelPathKVDtype(resolvedKernelPath);
  const kvDtype = normalizeKernelDtype(declaredKv ?? activationDtype);

  // A contract with nothing to enforce is reported as "no contract".
  const declaresAnyDtype = Boolean(activationDtype || outputDtype || kvDtype);
  return declaresAnyDtype ? { activationDtype, outputDtype, kvDtype } : null;
}
|
|
497
|
+
|
|
498
|
+
/**
 * Reports whether a runtime dtype value is still at its global default.
 *
 * A null/undefined value counts as "at default"; otherwise the value must be
 * strictly equal to the entry for `key` in GLOBAL_DEFAULT_KERNEL_PATH_DTYPES.
 *
 * @param {*} currentValue - Current runtime dtype value.
 * @param {string} key - Key into GLOBAL_DEFAULT_KERNEL_PATH_DTYPES.
 * @returns {boolean} True when the value is unset or equals the global default.
 */
function isGlobalKernelPathDtypeDefault(currentValue, key) {
  // Short-circuit keeps the defaults table untouched for unset values.
  return currentValue == null || currentValue === GLOBAL_DEFAULT_KERNEL_PATH_DTYPES[key];
}
|
|
504
|
+
|
|
505
|
+
/**
 * Lists the runtime dtype settings that disagree with a kernel-path dtype contract.
 *
 * A contract entry is only checked when it is truthy; a checked entry is a
 * mismatch when the current value is not strictly equal to it. Messages are
 * emitted in the fixed order activation, KV, output.
 *
 * @param {{activationDtype?: *, kvDtype?: *, outputDtype?: *}} contract - Required dtypes.
 * @param {{activationDtype?: *, kvDtype?: *, outputDtype?: *}} current - Current runtime dtypes.
 * @returns {string[]} One "<setting>=<current> (expected <required>)" message per
 *   mismatch; a null/undefined current value is rendered as "unset".
 */
function describeKernelPathDtypeMismatch(contract, current) {
  // [contract/current key, runtime config path shown in the message]
  const checkedSettings = [
    ['activationDtype', 'runtime.inference.compute.activationDtype'],
    ['kvDtype', 'runtime.inference.kvcache.kvDtype'],
    ['outputDtype', 'runtime.inference.session.compute.defaults.outputDtype'],
  ];

  const mismatches = [];
  for (const [key, settingPath] of checkedSettings) {
    const expected = contract[key];
    const actual = current[key];
    if (!expected || actual === expected) {
      continue;
    }
    mismatches.push(`${settingPath}=${actual ?? 'unset'} (expected ${expected})`);
  }
  return mismatches;
}
|
|
527
|
+
|
|
369
528
|
function assertManifestKernelPathDtypeCompatibility(manifest, resolvedKernelPath, kernelPathSource) {
|
|
370
529
|
if (!resolvedKernelPath) return;
|
|
371
530
|
if (kernelPathSource === 'config') return;
|
|
@@ -376,16 +535,6 @@ function assertManifestKernelPathDtypeCompatibility(manifest, resolvedKernelPath
|
|
|
376
535
|
if (!manifestCompute || !kernelActivation) return;
|
|
377
536
|
if (manifestCompute === kernelActivation) return;
|
|
378
537
|
|
|
379
|
-
const presetId = String(manifest?.inference?.presetId ?? '').trim().toLowerCase();
|
|
380
|
-
if (presetId === 'lfm2' && manifestCompute === 'f32' && kernelActivation === 'f16') {
|
|
381
|
-
log.warn(
|
|
382
|
-
'Pipeline',
|
|
383
|
-
`Manifest "${manifest?.modelId ?? 'unknown'}" uses quantizationInfo.compute=f32 ` +
|
|
384
|
-
`with kernelPath activationDtype=f16 (${resolvedKernelPath.id}); continuing for LFM2 mixed-precision compatibility.`
|
|
385
|
-
);
|
|
386
|
-
return;
|
|
387
|
-
}
|
|
388
|
-
|
|
389
538
|
throw new Error(
|
|
390
539
|
`Manifest kernel path dtype mismatch for "${manifest?.modelId ?? 'unknown'}": ` +
|
|
391
540
|
`quantizationInfo.compute=${manifestCompute} but ` +
|
|
@@ -402,17 +551,45 @@ function getKernelCapabilitiesSafe() {
|
|
|
402
551
|
}
|
|
403
552
|
}
|
|
404
553
|
|
|
405
|
-
function
|
|
406
|
-
const
|
|
407
|
-
|
|
408
|
-
const kernelPathKVDtype = getKernelPathKVDtype(resolvedKernelPath);
|
|
409
|
-
if (!kernelPathActivationDtype && !kernelPathOutputDtype && !kernelPathKVDtype) {
|
|
554
|
+
function applyKernelPathRuntimeDtypeContract(resolvedKernelPath, runtimeConfig, kernelPathSource, modelId) {
|
|
555
|
+
const contract = buildKernelPathDtypeContract(resolvedKernelPath);
|
|
556
|
+
if (!contract) {
|
|
410
557
|
return runtimeConfig;
|
|
411
558
|
}
|
|
412
559
|
|
|
413
|
-
const
|
|
414
|
-
|
|
415
|
-
|
|
560
|
+
const current = {
|
|
561
|
+
activationDtype: normalizeKernelDtype(runtimeConfig.inference?.compute?.activationDtype),
|
|
562
|
+
kvDtype: normalizeKernelDtype(runtimeConfig.inference?.kvcache?.kvDtype),
|
|
563
|
+
outputDtype: normalizeKernelDtype(runtimeConfig.inference?.session?.compute?.defaults?.outputDtype),
|
|
564
|
+
};
|
|
565
|
+
const mismatches = describeKernelPathDtypeMismatch(contract, current);
|
|
566
|
+
if (mismatches.length === 0) {
|
|
567
|
+
return runtimeConfig;
|
|
568
|
+
}
|
|
569
|
+
|
|
570
|
+
if (kernelPathSource === 'config' || kernelPathSource === 'execution-v0') {
|
|
571
|
+
throw new Error(
|
|
572
|
+
`KernelPath "${resolvedKernelPath?.id ?? 'unknown'}" selected from ${kernelPathSource} ` +
|
|
573
|
+
`requires explicit matching runtime dtypes for "${modelId}". ` +
|
|
574
|
+
`Mismatches: ${mismatches.join('; ')}. ` +
|
|
575
|
+
'Set runtime.inference.compute.activationDtype, runtime.inference.kvcache.kvDtype, ' +
|
|
576
|
+
'and runtime.inference.session.compute.defaults.outputDtype to match the kernel path.'
|
|
577
|
+
);
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
const canApplyManifestDefaults = (
|
|
581
|
+
(contract.activationDtype == null || isGlobalKernelPathDtypeDefault(current.activationDtype, 'activationDtype'))
|
|
582
|
+
&& (contract.kvDtype == null || isGlobalKernelPathDtypeDefault(current.kvDtype, 'kvDtype'))
|
|
583
|
+
&& (contract.outputDtype == null || isGlobalKernelPathDtypeDefault(current.outputDtype, 'outputDtype'))
|
|
584
|
+
);
|
|
585
|
+
if (!canApplyManifestDefaults) {
|
|
586
|
+
throw new Error(
|
|
587
|
+
`Manifest/model kernelPath "${resolvedKernelPath?.id ?? 'unknown'}" for "${modelId}" ` +
|
|
588
|
+
`conflicts with runtime dtype overrides. Mismatches: ${mismatches.join('; ')}. ` +
|
|
589
|
+
'Either remove the runtime dtype override or set it to match the kernel path.'
|
|
590
|
+
);
|
|
591
|
+
}
|
|
592
|
+
|
|
416
593
|
const nextInference = {
|
|
417
594
|
...runtimeConfig.inference,
|
|
418
595
|
compute: { ...runtimeConfig.inference.compute },
|
|
@@ -420,37 +597,33 @@ function applyKernelPathRuntimeDtypeOverrides(resolvedKernelPath, runtimeConfig)
|
|
|
420
597
|
};
|
|
421
598
|
const dtypeChanges = [];
|
|
422
599
|
|
|
423
|
-
if (
|
|
424
|
-
nextInference.compute.activationDtype =
|
|
425
|
-
dtypeChanges.push(`activation=${
|
|
600
|
+
if (contract.activationDtype && current.activationDtype !== contract.activationDtype) {
|
|
601
|
+
nextInference.compute.activationDtype = contract.activationDtype;
|
|
602
|
+
dtypeChanges.push(`activation=${current.activationDtype ?? 'unset'}->${contract.activationDtype}`);
|
|
426
603
|
}
|
|
427
604
|
|
|
428
|
-
if (
|
|
429
|
-
nextInference.kvcache.kvDtype =
|
|
430
|
-
dtypeChanges.push(`kv=${
|
|
605
|
+
if (contract.kvDtype && current.kvDtype !== contract.kvDtype) {
|
|
606
|
+
nextInference.kvcache.kvDtype = contract.kvDtype;
|
|
607
|
+
dtypeChanges.push(`kv=${current.kvDtype ?? 'unset'}->${contract.kvDtype}`);
|
|
431
608
|
}
|
|
432
609
|
|
|
433
|
-
if (
|
|
610
|
+
if (contract.outputDtype && current.outputDtype !== contract.outputDtype) {
|
|
434
611
|
nextInference.session = {
|
|
435
612
|
...(nextInference.session ?? {}),
|
|
436
613
|
compute: {
|
|
437
614
|
...(nextInference.session?.compute ?? {}),
|
|
438
615
|
defaults: {
|
|
439
616
|
...(nextInference.session?.compute?.defaults ?? {}),
|
|
440
|
-
outputDtype:
|
|
617
|
+
outputDtype: contract.outputDtype,
|
|
441
618
|
},
|
|
442
619
|
},
|
|
443
620
|
};
|
|
444
|
-
dtypeChanges.push(`session.outputDtype=${
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
if (dtypeChanges.length === 0) {
|
|
448
|
-
return runtimeConfig;
|
|
621
|
+
dtypeChanges.push(`session.outputDtype=${current.outputDtype ?? 'unset'}->${contract.outputDtype}`);
|
|
449
622
|
}
|
|
450
623
|
|
|
451
624
|
log.info(
|
|
452
625
|
'Pipeline',
|
|
453
|
-
`KernelPath ${resolvedKernelPath?.id ?? 'unknown'} runtime dtype
|
|
626
|
+
`KernelPath ${resolvedKernelPath?.id ?? 'unknown'} applied manifest/model runtime dtype defaults: ${dtypeChanges.join(', ')}`
|
|
454
627
|
);
|
|
455
628
|
return { ...runtimeConfig, inference: nextInference };
|
|
456
629
|
}
|
|
@@ -521,7 +694,12 @@ export function resolveKernelPathState(options) {
|
|
|
521
694
|
log.info('Pipeline', 'KernelPath: none (no kernel path configured)');
|
|
522
695
|
}
|
|
523
696
|
|
|
524
|
-
const nextRuntimeConfig =
|
|
697
|
+
const nextRuntimeConfig = applyKernelPathRuntimeDtypeContract(
|
|
698
|
+
resolvedKernelPath,
|
|
699
|
+
runtimeConfig,
|
|
700
|
+
kernelPathSource,
|
|
701
|
+
String(manifest?.modelId ?? 'unknown').trim() || 'unknown'
|
|
702
|
+
);
|
|
525
703
|
return {
|
|
526
704
|
resolvedKernelPath,
|
|
527
705
|
kernelPathSource,
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { getRuntimeConfig } from '../../../config/runtime.js';
|
|
2
2
|
import { QK_K } from '../../../config/schema/index.js';
|
|
3
|
+
import { releaseBuffer } from '../../../memory/buffer-pool.js';
|
|
3
4
|
|
|
4
5
|
const dequantCache = new Map();
|
|
5
6
|
let dequantCacheMaxEntriesOverride = null;
|
|
@@ -73,8 +74,8 @@ export function setCachedDequant(layerIdx, expertIdx, outputDtype, gateUp, down)
|
|
|
73
74
|
if (oldestKey) {
|
|
74
75
|
const evicted = dequantCache.get(oldestKey);
|
|
75
76
|
if (evicted) {
|
|
76
|
-
evicted.gateUp
|
|
77
|
-
evicted.down
|
|
77
|
+
releaseBuffer(evicted.gateUp);
|
|
78
|
+
releaseBuffer(evicted.down);
|
|
78
79
|
}
|
|
79
80
|
dequantCache.delete(oldestKey);
|
|
80
81
|
}
|
|
@@ -85,8 +86,8 @@ export function setCachedDequant(layerIdx, expertIdx, outputDtype, gateUp, down)
|
|
|
85
86
|
|
|
86
87
|
export function clearDequantCache() {
|
|
87
88
|
for (const cached of dequantCache.values()) {
|
|
88
|
-
cached.gateUp
|
|
89
|
-
cached.down
|
|
89
|
+
releaseBuffer(cached.gateUp);
|
|
90
|
+
releaseBuffer(cached.down);
|
|
90
91
|
}
|
|
91
92
|
dequantCache.clear();
|
|
92
93
|
dequantCacheHits = 0;
|