@simulatte/doppler 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +16 -23
- package/package.json +30 -32
- package/src/adapters/adapter-registry.js +12 -1
- package/src/adapters/lora-loader.js +23 -6
- package/src/bridge/extension-client.d.ts +5 -0
- package/src/bridge/extension-client.js +40 -0
- package/src/bridge/index.d.ts +2 -1
- package/src/bridge/index.js +6 -4
- package/src/browser/browser-converter.js +31 -1
- package/src/browser/file-picker.js +6 -0
- package/src/browser/safetensors-parser-browser.js +84 -1
- package/src/browser/shard-io-browser.js +2 -2
- package/src/browser/tensor-source-download.js +8 -2
- package/src/browser/tensor-source-http.d.ts +1 -0
- package/src/browser/tensor-source-http.js +5 -1
- package/src/client/doppler-api.browser.js +20 -4
- package/src/client/doppler-api.js +19 -3
- package/src/client/doppler-provider/generation.js +12 -0
- package/src/client/doppler-provider/model-manager.d.ts +10 -0
- package/src/client/doppler-provider/model-manager.js +91 -19
- package/src/client/doppler-provider/source-runtime.d.ts +2 -1
- package/src/client/doppler-provider/source-runtime.js +132 -13
- package/src/client/doppler-registry.json +5 -20
- package/src/config/backward-registry-loader.js +17 -2
- package/src/config/execution-v0-contract-check.js +113 -15
- package/src/config/kernel-path-contract-check.js +57 -29
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +18 -36
- package/src/config/kernels/kernel-ref-digests.js +1 -1
- package/src/config/kernels/registry.js +14 -1
- package/src/config/kernels/registry.json +81 -5
- package/src/config/loader.d.ts +1 -1
- package/src/config/loader.js +15 -2
- package/src/config/merge-contract-check.js +66 -4
- package/src/config/merge-helpers.js +128 -7
- package/src/config/merge.d.ts +1 -0
- package/src/config/merge.js +10 -0
- package/src/config/param-validator.js +47 -2
- package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
- package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +43 -8
- package/src/config/presets/models/gemma2.json +3 -2
- package/src/config/presets/models/gemma3.json +2 -0
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
- package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
- package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
- package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
- package/src/config/runtime.js +6 -1
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +5 -0
- package/src/config/schema/doppler.schema.js +16 -21
- package/src/config/schema/inference-defaults.schema.js +3 -3
- package/src/config/schema/kernel-path.schema.d.ts +5 -1
- package/src/config/schema/kernel-thresholds.schema.js +12 -4
- package/src/config/schema/manifest.schema.d.ts +3 -2
- package/src/config/schema/manifest.schema.js +17 -4
- package/src/config/schema/storage.schema.js +1 -1
- package/src/config/training-defaults.js +30 -22
- package/src/converter/conversion-plan.js +104 -11
- package/src/converter/core.d.ts +7 -0
- package/src/converter/core.js +16 -9
- package/src/converter/execution-v0-manifest.js +4 -1
- package/src/converter/index.d.ts +1 -0
- package/src/converter/index.js +1 -0
- package/src/converter/manifest-inference.js +50 -29
- package/src/converter/parsers/diffusion.js +0 -3
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +40 -16
- package/src/converter/quantizer.js +19 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/shard-packer.d.ts +1 -1
- package/src/converter/shard-packer.js +4 -1
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/config.js +123 -11
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/debug/signals.js +7 -1
- package/src/debug/tensor.d.ts +2 -0
- package/src/debug/tensor.js +13 -2
- package/src/distribution/p2p-control-plane.js +52 -12
- package/src/distribution/p2p-observability.js +43 -7
- package/src/distribution/p2p-webrtc-browser.js +20 -0
- package/src/distribution/shard-delivery.js +83 -27
- package/src/formats/gguf/types.js +33 -16
- package/src/formats/rdrr/groups.d.ts +12 -4
- package/src/formats/rdrr/groups.js +3 -6
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +53 -3
- package/src/formats/rdrr/types.d.ts +2 -1
- package/src/gpu/command-recorder.js +86 -61
- package/src/gpu/device.d.ts +1 -0
- package/src/gpu/device.js +73 -19
- package/src/gpu/kernel-tuner/benchmarks.js +326 -316
- package/src/gpu/kernel-tuner/cache.js +71 -4
- package/src/gpu/kernel-tuner/tuner.js +22 -4
- package/src/gpu/kernels/attention.js +15 -34
- package/src/gpu/kernels/backward/adam.js +62 -58
- package/src/gpu/kernels/backward/attention_backward.js +257 -169
- package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
- package/src/gpu/kernels/cast.js +191 -149
- package/src/gpu/kernels/check-stop.js +33 -44
- package/src/gpu/kernels/conv2d.js +27 -17
- package/src/gpu/kernels/cross_entropy_loss.js +21 -15
- package/src/gpu/kernels/depthwise_conv2d.js +36 -26
- package/src/gpu/kernels/dequant.js +178 -126
- package/src/gpu/kernels/energy.d.ts +3 -21
- package/src/gpu/kernels/energy.js +111 -88
- package/src/gpu/kernels/feature-check.js +1 -1
- package/src/gpu/kernels/fused_ffn.js +84 -65
- package/src/gpu/kernels/fused_matmul_residual.js +56 -33
- package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
- package/src/gpu/kernels/gather.js +33 -15
- package/src/gpu/kernels/gelu.js +19 -11
- package/src/gpu/kernels/grouped_pointwise_conv2d.js +33 -23
- package/src/gpu/kernels/groupnorm.js +34 -23
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/kv-quantize.js +5 -2
- package/src/gpu/kernels/layernorm.js +35 -19
- package/src/gpu/kernels/logit-merge.js +5 -3
- package/src/gpu/kernels/matmul-selection.js +47 -4
- package/src/gpu/kernels/matmul.d.ts +2 -0
- package/src/gpu/kernels/matmul.js +59 -40
- package/src/gpu/kernels/modulate.js +23 -15
- package/src/gpu/kernels/moe.js +221 -175
- package/src/gpu/kernels/pixel_shuffle.js +22 -14
- package/src/gpu/kernels/relu.js +18 -10
- package/src/gpu/kernels/repeat_channels.js +25 -17
- package/src/gpu/kernels/residual.js +37 -27
- package/src/gpu/kernels/rmsnorm.js +66 -43
- package/src/gpu/kernels/rope.js +3 -0
- package/src/gpu/kernels/sample.js +27 -38
- package/src/gpu/kernels/sana_linear_attention.js +18 -10
- package/src/gpu/kernels/scale.js +18 -11
- package/src/gpu/kernels/shader-cache.js +4 -2
- package/src/gpu/kernels/silu.js +120 -72
- package/src/gpu/kernels/softmax.js +44 -25
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/kernels/split_qkv.js +23 -13
- package/src/gpu/kernels/transpose.js +18 -10
- package/src/gpu/kernels/transpose.wgsl +5 -3
- package/src/gpu/kernels/upsample2d.js +21 -13
- package/src/gpu/kernels/utils.js +20 -13
- package/src/gpu/partitioned-buffer-pool.js +10 -2
- package/src/gpu/perf-guards.js +2 -9
- package/src/gpu/profiler.js +27 -22
- package/src/gpu/readback-utils.d.ts +16 -0
- package/src/gpu/readback-utils.js +41 -0
- package/src/gpu/submit-tracker.js +13 -0
- package/src/gpu/uniform-cache.d.ts +1 -0
- package/src/gpu/uniform-cache.js +30 -9
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/hotswap/intent-bundle.js +6 -0
- package/src/hotswap/manifest.d.ts +10 -1
- package/src/hotswap/manifest.js +12 -2
- package/src/hotswap/runtime.js +30 -8
- package/src/index-browser.d.ts +44 -0
- package/src/index-browser.js +14 -0
- package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
- package/src/inference/browser-harness-contract-helpers.js +28 -0
- package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
- package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
- package/src/inference/browser-harness-model-helpers.d.ts +16 -0
- package/src/inference/browser-harness-model-helpers.js +217 -0
- package/src/inference/browser-harness-report-helpers.d.ts +7 -0
- package/src/inference/browser-harness-report-helpers.js +42 -0
- package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
- package/src/inference/browser-harness-runtime-helpers.js +415 -0
- package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
- package/src/inference/browser-harness-suite-helpers.js +268 -0
- package/src/inference/browser-harness-text-helpers.d.ts +27 -0
- package/src/inference/browser-harness-text-helpers.js +788 -0
- package/src/inference/browser-harness.d.ts +8 -0
- package/src/inference/browser-harness.js +149 -1996
- package/src/inference/kv-cache/base.js +140 -94
- package/src/inference/kv-cache/tiered.js +5 -3
- package/src/inference/moe-router.js +88 -56
- package/src/inference/multi-model-network.js +5 -3
- package/src/inference/network-evolution.d.ts +11 -2
- package/src/inference/network-evolution.js +20 -21
- package/src/inference/pipelines/context.d.ts +3 -0
- package/src/inference/pipelines/context.js +142 -2
- package/src/inference/pipelines/diffusion/helpers.js +10 -2
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
- package/src/inference/pipelines/diffusion/vae.js +3 -7
- package/src/inference/pipelines/energy/pipeline.js +27 -21
- package/src/inference/pipelines/energy/quintel.d.ts +5 -0
- package/src/inference/pipelines/energy/quintel.js +11 -0
- package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
- package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
- package/src/inference/pipelines/text/attention/projections.js +192 -112
- package/src/inference/pipelines/text/attention/record.js +77 -14
- package/src/inference/pipelines/text/attention/run.js +112 -14
- package/src/inference/pipelines/text/config.js +17 -4
- package/src/inference/pipelines/text/embed.js +2 -8
- package/src/inference/pipelines/text/execution-plan.js +46 -23
- package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
- package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
- package/src/inference/pipelines/text/execution-v0.js +62 -1013
- package/src/inference/pipelines/text/generator-runtime.js +5 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +52 -0
- package/src/inference/pipelines/text/generator-steps.js +340 -221
- package/src/inference/pipelines/text/generator.js +56 -40
- package/src/inference/pipelines/text/init.d.ts +13 -0
- package/src/inference/pipelines/text/init.js +94 -25
- package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
- package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
- package/src/inference/pipelines/text/kernel-trace.js +6 -0
- package/src/inference/pipelines/text/layer.js +4 -9
- package/src/inference/pipelines/text/linear-attention.d.ts +15 -0
- package/src/inference/pipelines/text/linear-attention.js +113 -9
- package/src/inference/pipelines/text/logits/gpu.js +12 -7
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +13 -12
- package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
- package/src/inference/pipelines/text/logits/utils.js +9 -0
- package/src/inference/pipelines/text/lora-apply.js +50 -32
- package/src/inference/pipelines/text/model-load.js +282 -104
- package/src/inference/pipelines/text/moe-cache.js +5 -4
- package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
- package/src/inference/pipelines/text/moe-cpu.js +42 -38
- package/src/inference/pipelines/text/moe-gpu.js +110 -86
- package/src/inference/pipelines/text/ops.js +90 -90
- package/src/inference/pipelines/text/probes.js +9 -9
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/weights.js +17 -7
- package/src/inference/pipelines/text.js +13 -1
- package/src/inference/speculative.d.ts +2 -2
- package/src/inference/speculative.js +4 -18
- package/src/inference/test-harness.d.ts +1 -1
- package/src/inference/test-harness.js +17 -7
- package/src/inference/tokenizer.d.ts +0 -5
- package/src/inference/tokenizer.js +4 -23
- package/src/inference/tokenizers/bpe.js +9 -0
- package/src/inference/tokenizers/bundled.js +20 -0
- package/src/inference/tokenizers/sentencepiece.js +12 -0
- package/src/loader/doppler-loader.js +38 -22
- package/src/loader/dtype-utils.js +3 -44
- package/src/loader/embedding-loader.js +7 -3
- package/src/loader/experts/expert-cache.js +13 -6
- package/src/loader/experts/expert-loader.js +10 -6
- package/src/loader/final-weights-loader.js +10 -4
- package/src/loader/layer-loader.js +2 -1
- package/src/loader/loader-state.js +2 -2
- package/src/loader/memory-monitor.js +8 -0
- package/src/loader/multi-model-loader.d.ts +14 -0
- package/src/loader/multi-model-loader.js +70 -24
- package/src/loader/shard-cache.js +84 -14
- package/src/loader/shard-resolver.js +25 -3
- package/src/loader/tensors/tensor-loader.js +214 -144
- package/src/loader/tensors/tensor-reader.js +76 -19
- package/src/loader/weight-downcast.js +1 -1
- package/src/memory/buffer-pool.d.ts +9 -1
- package/src/memory/buffer-pool.js +109 -44
- package/src/memory/unified-detect.js +1 -1
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +24 -8
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.js +27 -1
- package/src/storage/backends/opfs-store.js +68 -24
- package/src/storage/downloader.js +365 -83
- package/src/storage/index.d.ts +3 -0
- package/src/storage/index.js +3 -0
- package/src/storage/preflight.d.ts +2 -2
- package/src/storage/preflight.js +24 -2
- package/src/storage/quickstart-downloader.js +11 -5
- package/src/storage/registry.js +10 -4
- package/src/storage/reports.js +1 -1
- package/src/storage/shard-manager.d.ts +15 -1
- package/src/storage/shard-manager.js +55 -6
- package/src/storage/source-artifact-store.d.ts +52 -0
- package/src/storage/source-artifact-store.js +234 -0
- package/src/tooling/command-api-constants.d.ts +9 -0
- package/src/tooling/command-api-constants.js +9 -0
- package/src/tooling/command-api-family-normalizers.d.ts +9 -0
- package/src/tooling/command-api-family-normalizers.js +343 -0
- package/src/tooling/command-api-helpers.d.ts +25 -0
- package/src/tooling/command-api-helpers.js +262 -0
- package/src/tooling/command-api.js +16 -602
- package/src/tooling/command-envelope.js +4 -1
- package/src/tooling/command-runner-shared.js +52 -18
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/lean-execution-contract.js +150 -3
- package/src/tooling/node-browser-command-runner.js +161 -271
- package/src/tooling/node-command-runner.js +29 -3
- package/src/tooling/node-converter.js +30 -1
- package/src/tooling/node-source-runtime.d.ts +1 -1
- package/src/tooling/node-source-runtime.js +120 -3
- package/src/tooling/node-webgpu.js +24 -21
- package/src/tooling/opfs-cache.js +21 -4
- package/src/tooling/runtime-input-composition.d.ts +38 -0
- package/src/tooling/runtime-input-composition.js +86 -0
- package/src/tooling/source-runtime-bundle.d.ts +40 -5
- package/src/tooling/source-runtime-bundle.js +261 -34
- package/src/tooling/source-runtime-materializer.d.ts +6 -0
- package/src/tooling/source-runtime-materializer.js +93 -0
- package/src/training/attention-backward.js +32 -17
- package/src/training/autograd.js +80 -52
- package/src/training/checkpoint-watch.d.ts +2 -1
- package/src/training/checkpoint-watch.js +39 -6
- package/src/training/checkpoint.js +40 -11
- package/src/training/clip.js +2 -1
- package/src/training/datasets/token-batch.js +20 -8
- package/src/training/distillation/checkpoint-watch.js +1 -0
- package/src/training/distillation/student-fixture.d.ts +22 -0
- package/src/training/distillation/student-fixture.js +846 -0
- package/src/training/distillation/suite-data.d.ts +45 -0
- package/src/training/distillation/suite-data.js +189 -0
- package/src/training/lora-pipeline.js +4 -7
- package/src/training/lora.js +26 -12
- package/src/training/loss.js +5 -6
- package/src/training/objectives/cross_entropy.js +2 -5
- package/src/training/objectives/distill_kd.js +4 -8
- package/src/training/objectives/distill_triplet.js +4 -8
- package/src/training/objectives/ul_stage2_base.js +4 -8
- package/src/training/operator-command.js +2 -0
- package/src/training/optimizer.js +19 -7
- package/src/training/runner.js +2 -1
- package/src/training/suite.js +18 -978
- package/src/training/tensor-factory.d.ts +9 -0
- package/src/training/tensor-factory.js +13 -0
- package/src/training/trainer.js +3 -5
- package/src/training/ul_dataset.js +3 -5
- package/src/training/workloads.js +70 -79
- package/src/types/model.d.ts +5 -0
- package/src/version.js +1 -1
- package/tools/convert-safetensors-node.js +22 -16
- package/tools/doppler-cli.js +50 -26
|
@@ -175,103 +175,103 @@ export async function doConv(
|
|
|
175
175
|
}
|
|
176
176
|
|
|
177
177
|
// Use the first 2x hidden projection channels as a gated conv-state projection.
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
swigluLimit: options.swigluLimit ?? null,
|
|
198
|
-
label: `${label}.activation`,
|
|
199
|
-
layerIdx,
|
|
200
|
-
}, recorder);
|
|
201
|
-
|
|
202
|
-
if (recorder) {
|
|
203
|
-
recorder.trackTemporaryBuffer(inProj.buffer);
|
|
204
|
-
} else {
|
|
205
|
-
releaseBuffer(inProj.buffer);
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
// Optional generic conv2d stage when explicit shape metadata is provided.
|
|
209
|
-
// LFM2 depthwise conv kernels use model-specific packing, so this path is best-effort only.
|
|
210
|
-
let convInput = activated;
|
|
211
|
-
if (convKernel && options.conv2d && options.conv2d.enabled === true) {
|
|
212
|
-
const convTensorInput = createTensor(activated.buffer, activated.dtype, [
|
|
213
|
-
options.conv2d.inChannels,
|
|
214
|
-
options.conv2d.height,
|
|
215
|
-
options.conv2d.width,
|
|
216
|
-
], `${label}.conv_input`);
|
|
217
|
-
const convOptions = {
|
|
218
|
-
inChannels: options.conv2d.inChannels,
|
|
219
|
-
outChannels: options.conv2d.outChannels,
|
|
220
|
-
height: options.conv2d.height,
|
|
221
|
-
width: options.conv2d.width,
|
|
222
|
-
kernelH: options.conv2d.kernelH,
|
|
223
|
-
kernelW: options.conv2d.kernelW,
|
|
224
|
-
stride: options.conv2d.stride ?? 1,
|
|
225
|
-
pad: options.conv2d.pad ?? 0,
|
|
226
|
-
};
|
|
227
|
-
const convResult = recorder
|
|
228
|
-
? await recordConv2D(recorder, convTensorInput, convKernel, null, convOptions)
|
|
229
|
-
: await runConv2D(convTensorInput, convKernel, null, convOptions);
|
|
230
|
-
convInput = createTensor(
|
|
231
|
-
convResult.buffer,
|
|
232
|
-
convResult.dtype,
|
|
233
|
-
[numTokens, hiddenSize],
|
|
234
|
-
`${label}.conv_output`
|
|
178
|
+
let inProj = null;
|
|
179
|
+
let activated = null;
|
|
180
|
+
let convInput = null;
|
|
181
|
+
let outProj = null;
|
|
182
|
+
try {
|
|
183
|
+
inProj = await doMatmul(
|
|
184
|
+
inputTensor,
|
|
185
|
+
convInProj,
|
|
186
|
+
numTokens,
|
|
187
|
+
hiddenSize * 2,
|
|
188
|
+
hiddenSize,
|
|
189
|
+
{
|
|
190
|
+
transposeB: 'auto',
|
|
191
|
+
label: `${label}.in_proj`,
|
|
192
|
+
layerIdx,
|
|
193
|
+
kernelPath,
|
|
194
|
+
role: 'conv_in_proj',
|
|
195
|
+
},
|
|
196
|
+
recorder
|
|
235
197
|
);
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
198
|
+
activated = await doSiLURowSplit(inProj, {
|
|
199
|
+
numTokens,
|
|
200
|
+
dim: hiddenSize,
|
|
201
|
+
activation: 'silu',
|
|
202
|
+
swigluLimit: options.swigluLimit ?? null,
|
|
203
|
+
label: `${label}.activation`,
|
|
204
|
+
layerIdx,
|
|
205
|
+
}, recorder);
|
|
206
|
+
|
|
207
|
+
releaseOrTrack(recorder, inProj.buffer);
|
|
208
|
+
inProj = null;
|
|
209
|
+
|
|
210
|
+
convInput = activated;
|
|
211
|
+
if (convKernel && options.conv2d && options.conv2d.enabled === true) {
|
|
212
|
+
const convTensorInput = createTensor(activated.buffer, activated.dtype, [
|
|
213
|
+
options.conv2d.inChannels,
|
|
214
|
+
options.conv2d.height,
|
|
215
|
+
options.conv2d.width,
|
|
216
|
+
], `${label}.conv_input`);
|
|
217
|
+
const convOptions = {
|
|
218
|
+
inChannels: options.conv2d.inChannels,
|
|
219
|
+
outChannels: options.conv2d.outChannels,
|
|
220
|
+
height: options.conv2d.height,
|
|
221
|
+
width: options.conv2d.width,
|
|
222
|
+
kernelH: options.conv2d.kernelH,
|
|
223
|
+
kernelW: options.conv2d.kernelW,
|
|
224
|
+
stride: options.conv2d.stride ?? 1,
|
|
225
|
+
pad: options.conv2d.pad ?? 0,
|
|
226
|
+
};
|
|
227
|
+
const convResult = recorder
|
|
228
|
+
? await recordConv2D(recorder, convTensorInput, convKernel, null, convOptions)
|
|
229
|
+
: await runConv2D(convTensorInput, convKernel, null, convOptions);
|
|
230
|
+
convInput = createTensor(
|
|
231
|
+
convResult.buffer,
|
|
232
|
+
convResult.dtype,
|
|
233
|
+
[numTokens, hiddenSize],
|
|
234
|
+
`${label}.conv_output`
|
|
235
|
+
);
|
|
236
|
+
releaseOrTrack(recorder, activated.buffer);
|
|
237
|
+
activated = null;
|
|
240
238
|
}
|
|
241
|
-
}
|
|
242
239
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
240
|
+
outProj = await doMatmul(
|
|
241
|
+
convInput,
|
|
242
|
+
convOutProj,
|
|
243
|
+
numTokens,
|
|
244
|
+
hiddenSize,
|
|
245
|
+
hiddenSize,
|
|
246
|
+
{
|
|
247
|
+
transposeB: 'auto',
|
|
248
|
+
label: `${label}.out_proj`,
|
|
249
|
+
layerIdx,
|
|
250
|
+
kernelPath,
|
|
251
|
+
role: 'conv_out_proj',
|
|
252
|
+
},
|
|
253
|
+
recorder
|
|
254
|
+
);
|
|
258
255
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
} else {
|
|
263
|
-
|
|
256
|
+
if (convInput && (!activated || convInput.buffer !== activated.buffer)) {
|
|
257
|
+
releaseOrTrack(recorder, convInput.buffer);
|
|
258
|
+
convInput = null;
|
|
259
|
+
} else if (activated) {
|
|
260
|
+
releaseOrTrack(recorder, activated.buffer);
|
|
261
|
+
activated = null;
|
|
264
262
|
}
|
|
265
|
-
} else if (recorder) {
|
|
266
|
-
recorder.trackTemporaryBuffer(activated.buffer);
|
|
267
|
-
} else {
|
|
268
|
-
releaseBuffer(activated.buffer);
|
|
269
|
-
}
|
|
270
263
|
|
|
271
|
-
|
|
272
|
-
|
|
264
|
+
if (kernelTrace.enabled && !recorder) {
|
|
265
|
+
await traceStep('conv', label, layerIdx, outProj.buffer, [numTokens, hiddenSize]);
|
|
266
|
+
}
|
|
267
|
+
return outProj;
|
|
268
|
+
} catch (error) {
|
|
269
|
+
if (outProj) releaseOrTrack(recorder, outProj.buffer);
|
|
270
|
+
if (convInput && (!activated || convInput.buffer !== activated.buffer)) releaseOrTrack(recorder, convInput.buffer);
|
|
271
|
+
if (activated) releaseOrTrack(recorder, activated.buffer);
|
|
272
|
+
if (inProj) releaseOrTrack(recorder, inProj.buffer);
|
|
273
|
+
throw error;
|
|
273
274
|
}
|
|
274
|
-
return outProj;
|
|
275
275
|
}
|
|
276
276
|
|
|
277
277
|
export async function doCast(input, toDtype, recorder) {
|
|
@@ -4,6 +4,7 @@ import { trace } from '../../../debug/index.js';
|
|
|
4
4
|
import { getDevice } from '../../../gpu/device.js';
|
|
5
5
|
import { allowReadback } from '../../../gpu/perf-guards.js';
|
|
6
6
|
import { f16ToF32 } from '../../../loader/dtype-utils.js';
|
|
7
|
+
import { readBufferSlice } from '../../../memory/buffer-pool.js';
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
const STAGE_DEFAULT_CATEGORY = {
|
|
@@ -11,6 +12,11 @@ const STAGE_DEFAULT_CATEGORY = {
|
|
|
11
12
|
// Attention stages (per-layer)
|
|
12
13
|
attn_input: 'attn',
|
|
13
14
|
attn_normed: 'attn',
|
|
15
|
+
linear_qkv_proj: 'attn',
|
|
16
|
+
linear_z_proj: 'attn',
|
|
17
|
+
linear_a_proj: 'attn',
|
|
18
|
+
linear_b_proj: 'attn',
|
|
19
|
+
linear_core_out: 'attn',
|
|
14
20
|
q_proj: 'attn',
|
|
15
21
|
k_proj: 'attn',
|
|
16
22
|
v_proj: 'attn',
|
|
@@ -139,22 +145,16 @@ export async function runProbes(stage, buffer, options) {
|
|
|
139
145
|
const alignedOffset = Math.floor(byteOffset / 4) * 4;
|
|
140
146
|
const offsetWithinRead = byteOffset - alignedOffset;
|
|
141
147
|
const readSize = 4; // Always read 4 bytes (aligned)
|
|
142
|
-
const
|
|
143
|
-
const enc = (device).createCommandEncoder();
|
|
144
|
-
enc.copyBufferToBuffer( (buffer), alignedOffset, staging, 0, readSize);
|
|
145
|
-
(device).queue.submit([enc.finish()]);
|
|
146
|
-
await staging.mapAsync(GPUMapMode.READ);
|
|
148
|
+
const readback = await readBufferSlice(buffer, alignedOffset, readSize);
|
|
147
149
|
let value;
|
|
148
150
|
if (dtype === 'f16') {
|
|
149
151
|
// offsetWithinRead is 0 or 2 for F16 - extract correct u16
|
|
150
|
-
const u16Array = new Uint16Array(
|
|
152
|
+
const u16Array = new Uint16Array(readback);
|
|
151
153
|
const u16Index = offsetWithinRead / 2;
|
|
152
154
|
value = f16ToF32(u16Array[u16Index]);
|
|
153
155
|
} else {
|
|
154
|
-
value = new Float32Array(
|
|
156
|
+
value = new Float32Array(readback)[0];
|
|
155
157
|
}
|
|
156
|
-
staging.unmap();
|
|
157
|
-
staging.destroy();
|
|
158
158
|
values.push(`${dimIdx}=${value.toFixed(4)}`);
|
|
159
159
|
}
|
|
160
160
|
|
|
@@ -58,6 +58,30 @@ export function softmax(logits) {
|
|
|
58
58
|
return exps;
|
|
59
59
|
}
|
|
60
60
|
|
|
61
|
+
function countFiniteCandidates(logits, padTokenId) {
|
|
62
|
+
let finiteCandidateCount = 0;
|
|
63
|
+
for (let i = 0; i < logits.length; i++) {
|
|
64
|
+
if (padTokenId != null && i === padTokenId) {
|
|
65
|
+
continue;
|
|
66
|
+
}
|
|
67
|
+
if (Number.isFinite(logits[i])) {
|
|
68
|
+
finiteCandidateCount += 1;
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
return finiteCandidateCount;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
function assertFiniteSamplingCandidates(logits, padTokenId, label) {
|
|
75
|
+
const finiteCandidateCount = countFiniteCandidates(logits, padTokenId);
|
|
76
|
+
if (finiteCandidateCount > 0) {
|
|
77
|
+
return;
|
|
78
|
+
}
|
|
79
|
+
throw new Error(
|
|
80
|
+
`[Sampling] ${label} has no finite candidate logits after masking the pad token. ` +
|
|
81
|
+
'Upstream decode likely produced NaN/Inf or an all-masked distribution.'
|
|
82
|
+
);
|
|
83
|
+
}
|
|
84
|
+
|
|
61
85
|
|
|
62
86
|
export function sample(logits, opts) {
|
|
63
87
|
const { temperature, topP, topK, decode, debug = false, padTokenId, seed } = opts;
|
|
@@ -66,16 +90,28 @@ export function sample(logits, opts) {
|
|
|
66
90
|
logits[padTokenId] = -Infinity;
|
|
67
91
|
}
|
|
68
92
|
|
|
93
|
+
assertFiniteSamplingCandidates(logits, padTokenId, 'Logits');
|
|
94
|
+
|
|
69
95
|
// Greedy (argmax) when temperature = 0
|
|
70
96
|
if (temperature === 0) {
|
|
71
|
-
let maxIdx =
|
|
72
|
-
let maxVal =
|
|
73
|
-
for (let i =
|
|
74
|
-
|
|
75
|
-
|
|
97
|
+
let maxIdx = -1;
|
|
98
|
+
let maxVal = -Infinity;
|
|
99
|
+
for (let i = 0; i < logits.length; i++) {
|
|
100
|
+
const value = logits[i];
|
|
101
|
+
if (!Number.isFinite(value)) {
|
|
102
|
+
continue;
|
|
103
|
+
}
|
|
104
|
+
if (value > maxVal) {
|
|
105
|
+
maxVal = value;
|
|
76
106
|
maxIdx = i;
|
|
77
107
|
}
|
|
78
108
|
}
|
|
109
|
+
if (maxIdx < 0) {
|
|
110
|
+
throw new Error(
|
|
111
|
+
'[Sampling] Greedy sampling could not find a finite candidate logit. ' +
|
|
112
|
+
'Upstream decode likely produced NaN/Inf.'
|
|
113
|
+
);
|
|
114
|
+
}
|
|
79
115
|
if (debug) {
|
|
80
116
|
const text = decode?.([maxIdx]) ?? '?';
|
|
81
117
|
trace.sample(`Greedy: id=${maxIdx} "${text}" logit=${maxVal.toFixed(4)}`);
|
|
@@ -96,7 +132,17 @@ export function sample(logits, opts) {
|
|
|
96
132
|
|
|
97
133
|
let candidates = [];
|
|
98
134
|
for (let i = 0; i < probs.length; i++) {
|
|
99
|
-
|
|
135
|
+
const probability = probs[i];
|
|
136
|
+
if (!Number.isFinite(probability) || probability <= 0) {
|
|
137
|
+
continue;
|
|
138
|
+
}
|
|
139
|
+
candidates.push({ token: i, prob: probability });
|
|
140
|
+
}
|
|
141
|
+
if (candidates.length === 0) {
|
|
142
|
+
throw new Error(
|
|
143
|
+
'[Sampling] Softmax produced no finite candidate probabilities. ' +
|
|
144
|
+
'Upstream decode likely produced NaN/Inf logits.'
|
|
145
|
+
);
|
|
100
146
|
}
|
|
101
147
|
candidates.sort((a, b) => b.prob - a.prob);
|
|
102
148
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
|
|
2
2
|
|
|
3
3
|
import { getDevice } from '../../../gpu/device.js';
|
|
4
|
-
import { acquireBuffer } from '../../../memory/buffer-pool.js';
|
|
4
|
+
import { acquireBuffer, releaseBuffer } from '../../../memory/buffer-pool.js';
|
|
5
5
|
import { log } from '../../../debug/index.js';
|
|
6
6
|
import { isWeightBuffer, isCpuWeightBuffer, tagBufferDtype } from '../../../gpu/weight-buffer.js';
|
|
7
7
|
|
|
@@ -53,9 +53,14 @@ export function getWeightBuffer(weight, label) {
|
|
|
53
53
|
}
|
|
54
54
|
|
|
55
55
|
const buf = acquireBuffer(data.byteLength, undefined, label);
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
56
|
+
try {
|
|
57
|
+
device.queue.writeBuffer(buf, 0, ( (data)));
|
|
58
|
+
tagBufferDtype(buf, bufferDtype);
|
|
59
|
+
return buf;
|
|
60
|
+
} catch (error) {
|
|
61
|
+
releaseBuffer(buf);
|
|
62
|
+
throw error;
|
|
63
|
+
}
|
|
59
64
|
}
|
|
60
65
|
|
|
61
66
|
|
|
@@ -92,9 +97,14 @@ export function getNormWeightBuffer(weight, label, config, debugFlags) {
|
|
|
92
97
|
}
|
|
93
98
|
|
|
94
99
|
const buf = acquireBuffer(data.byteLength, undefined, label);
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
100
|
+
try {
|
|
101
|
+
device.queue.writeBuffer(buf, 0, ( (data)));
|
|
102
|
+
tagBufferDtype(buf, 'f32');
|
|
103
|
+
return buf;
|
|
104
|
+
} catch (error) {
|
|
105
|
+
releaseBuffer(buf);
|
|
106
|
+
throw error;
|
|
107
|
+
}
|
|
98
108
|
}
|
|
99
109
|
|
|
100
110
|
|
|
@@ -6,7 +6,7 @@ import { configurePerfGuards } from '../../gpu/perf-guards.js';
|
|
|
6
6
|
import { MoERouter } from '../moe-router.js';
|
|
7
7
|
import { DecodeBufferManager } from '../decode-buffers.js';
|
|
8
8
|
import { DecodeRing } from '../decode-ring.js';
|
|
9
|
-
import { applyPipelineContexts } from './context.js';
|
|
9
|
+
import { applyPipelineContexts, restorePipelineContexts } from './context.js';
|
|
10
10
|
import { createInitializedPipeline } from './factory.js';
|
|
11
11
|
|
|
12
12
|
// Pipeline sub-modules
|
|
@@ -44,6 +44,11 @@ import { getDopplerLoader } from '../../loader/doppler-loader.js';
|
|
|
44
44
|
import { registerPipeline, getPipelineFactory } from './registry.js';
|
|
45
45
|
import { selectRuleValue } from '../../rules/rule-registry.js';
|
|
46
46
|
|
|
47
|
+
function destroyMoERouter(router) {
|
|
48
|
+
if (router && typeof router.destroy === 'function') {
|
|
49
|
+
router.destroy();
|
|
50
|
+
}
|
|
51
|
+
}
|
|
47
52
|
|
|
48
53
|
|
|
49
54
|
// ============================================================================
|
|
@@ -102,6 +107,8 @@ export class InferencePipeline extends PipelineState {
|
|
|
102
107
|
this.manifest = manifest;
|
|
103
108
|
this.decodeRing?.release();
|
|
104
109
|
this.linearAttentionRuntime = resetLinearAttentionRuntime(this.linearAttentionRuntime);
|
|
110
|
+
destroyMoERouter(this.moeRouter);
|
|
111
|
+
this.moeRouter = null;
|
|
105
112
|
|
|
106
113
|
const executionV0Runtime = applyExecutionV0RuntimeConfig({
|
|
107
114
|
runtimeConfig: this.runtimeConfig,
|
|
@@ -490,12 +497,15 @@ export class InferencePipeline extends PipelineState {
|
|
|
490
497
|
this.expertWeights.clear();
|
|
491
498
|
this.linearAttentionRuntime = resetLinearAttentionRuntime(this.linearAttentionRuntime);
|
|
492
499
|
this.lora = null;
|
|
500
|
+
destroyMoERouter(this.moeRouter);
|
|
501
|
+
this.moeRouter = null;
|
|
493
502
|
if (this.finitenessBuffer) {
|
|
494
503
|
this.finitenessBuffer.destroy();
|
|
495
504
|
this.finitenessBuffer = null;
|
|
496
505
|
}
|
|
497
506
|
this.isLoaded = false;
|
|
498
507
|
this.currentSeqLen = 0;
|
|
508
|
+
restorePipelineContexts(this);
|
|
499
509
|
log.info('Pipeline', 'Unloaded');
|
|
500
510
|
}
|
|
501
511
|
|
|
@@ -533,6 +543,8 @@ export class InferencePipeline extends PipelineState {
|
|
|
533
543
|
releaseGPUResources() {
|
|
534
544
|
this.decodeBuffers?.release();
|
|
535
545
|
this.decodeRing?.release();
|
|
546
|
+
destroyMoERouter(this.moeRouter);
|
|
547
|
+
this.moeRouter = null;
|
|
536
548
|
if (this.finitenessBuffer) {
|
|
537
549
|
this.finitenessBuffer.destroy();
|
|
538
550
|
this.finitenessBuffer = null;
|
|
@@ -66,8 +66,8 @@ export interface SpeculativeConfig {
|
|
|
66
66
|
enableTreeDraft: boolean;
|
|
67
67
|
/** Temperature for draft sampling */
|
|
68
68
|
temperature: number;
|
|
69
|
-
/**
|
|
70
|
-
randomSeed
|
|
69
|
+
/** Deterministic seed for speculative sampling */
|
|
70
|
+
randomSeed: number;
|
|
71
71
|
}
|
|
72
72
|
|
|
73
73
|
/**
|
|
@@ -10,22 +10,6 @@ function createRng(seed) {
|
|
|
10
10
|
};
|
|
11
11
|
}
|
|
12
12
|
|
|
13
|
-
function createUnseededRng() {
|
|
14
|
-
let fallbackState = ((Date.now() >>> 0) ^ 0xa341316c) >>> 0;
|
|
15
|
-
return () => {
|
|
16
|
-
const cryptoApi = typeof globalThis !== 'undefined' ? globalThis.crypto : null;
|
|
17
|
-
if (cryptoApi && typeof cryptoApi.getRandomValues === 'function') {
|
|
18
|
-
const random = new Uint32Array(1);
|
|
19
|
-
cryptoApi.getRandomValues(random);
|
|
20
|
-
return random[0] / 4294967296;
|
|
21
|
-
}
|
|
22
|
-
fallbackState = (fallbackState + 0x6d2b79f5) | 0;
|
|
23
|
-
let t = Math.imul(fallbackState ^ (fallbackState >>> 15), 1 | fallbackState);
|
|
24
|
-
t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
|
|
25
|
-
return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
|
|
26
|
-
};
|
|
27
|
-
}
|
|
28
|
-
|
|
29
13
|
function coerceLogitsVector(value, label) {
|
|
30
14
|
if (value instanceof Float32Array) {
|
|
31
15
|
if (value.length === 0) {
|
|
@@ -110,6 +94,9 @@ export class SpeculativeDecoder {
|
|
|
110
94
|
if (config.temperature == null) {
|
|
111
95
|
throw new Error('SpeculativeDecoder requires temperature.');
|
|
112
96
|
}
|
|
97
|
+
if (!Number.isFinite(config.randomSeed)) {
|
|
98
|
+
throw new Error('SpeculativeDecoder requires randomSeed.');
|
|
99
|
+
}
|
|
113
100
|
|
|
114
101
|
assertTemperature(config.temperature, 'temperature');
|
|
115
102
|
this.numDraftTokens = config.numDraftTokens;
|
|
@@ -117,8 +104,7 @@ export class SpeculativeDecoder {
|
|
|
117
104
|
this.enableTreeDraft = config.enableTreeDraft;
|
|
118
105
|
this.temperature = config.temperature;
|
|
119
106
|
|
|
120
|
-
|
|
121
|
-
this.random = seed === null ? createUnseededRng() : createRng(seed);
|
|
107
|
+
this.random = createRng(Math.floor(config.randomSeed));
|
|
122
108
|
}
|
|
123
109
|
|
|
124
110
|
setDraftModel(model) {
|
|
@@ -74,7 +74,7 @@ export interface InitializeResult {
|
|
|
74
74
|
/**
|
|
75
75
|
* Discover available models from the catalog.json endpoint.
|
|
76
76
|
*
|
|
77
|
-
* @param fallbackModels -
|
|
77
|
+
* @param fallbackModels - Explicit fallback models to use when catalog fetch is unavailable
|
|
78
78
|
* @returns Array of model info objects
|
|
79
79
|
*/
|
|
80
80
|
export declare function discoverModels(
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
|
|
2
2
|
|
|
3
3
|
import { initDevice, getDevice, getKernelCapabilities } from '../gpu/device.js';
|
|
4
|
-
import { parseManifest } from '../formats/rdrr/index.js';
|
|
4
|
+
import { parseManifest, getExpectedShardHash } from '../formats/rdrr/index.js';
|
|
5
5
|
import { createPipeline } from './pipelines/text.js';
|
|
6
6
|
import { log as debugLog } from '../debug/index.js';
|
|
7
7
|
import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
|
|
@@ -25,7 +25,7 @@ import {
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
export async function discoverModels(
|
|
28
|
-
fallbackModels
|
|
28
|
+
fallbackModels
|
|
29
29
|
) {
|
|
30
30
|
try {
|
|
31
31
|
const resp = await fetch('/models/catalog.json');
|
|
@@ -40,10 +40,13 @@ export async function discoverModels(
|
|
|
40
40
|
}));
|
|
41
41
|
}
|
|
42
42
|
}
|
|
43
|
-
} catch (e) {
|
|
44
|
-
|
|
43
|
+
} catch (e) {}
|
|
44
|
+
|
|
45
|
+
if (Array.isArray(fallbackModels) && fallbackModels.length > 0) {
|
|
46
|
+
return fallbackModels.map((id) => ({ id, name: id }));
|
|
45
47
|
}
|
|
46
|
-
|
|
48
|
+
|
|
49
|
+
throw new Error('discoverModels: failed to fetch /models/catalog.json and no explicit fallback model list was provided.');
|
|
47
50
|
}
|
|
48
51
|
|
|
49
52
|
// ============================================================================
|
|
@@ -165,7 +168,7 @@ export function createHttpShardLoader(baseUrl, manifest, log) {
|
|
|
165
168
|
distributionConfig,
|
|
166
169
|
algorithm,
|
|
167
170
|
requiredEncoding,
|
|
168
|
-
expectedHash: shard
|
|
171
|
+
expectedHash: getExpectedShardHash(shard, algorithm) || null,
|
|
169
172
|
expectedSize: Number.isFinite(shard.size) ? Math.floor(shard.size) : null,
|
|
170
173
|
expectedManifestVersionSet: manifestVersionSet,
|
|
171
174
|
writeToStore: false,
|
|
@@ -238,7 +241,13 @@ export async function initializeInference(modelUrl, options = {}) {
|
|
|
238
241
|
onProgress('hotswap', 0.05, 'Loading hot-swap manifest...');
|
|
239
242
|
log(`Hot-swap: loading manifest ${hotSwapConfig.manifestUrl}`);
|
|
240
243
|
const hotSwapManifest = await fetchHotSwapManifest(hotSwapConfig.manifestUrl);
|
|
241
|
-
const verification = await verifyHotSwapManifest(hotSwapManifest, hotSwapConfig
|
|
244
|
+
const verification = await verifyHotSwapManifest(hotSwapManifest, hotSwapConfig, {
|
|
245
|
+
source: {
|
|
246
|
+
kind: 'remote',
|
|
247
|
+
isLocal: false,
|
|
248
|
+
url: hotSwapConfig.manifestUrl,
|
|
249
|
+
},
|
|
250
|
+
});
|
|
242
251
|
if (!verification.ok) {
|
|
243
252
|
throw new Error(`Hot-swap manifest rejected: ${verification.reason}`);
|
|
244
253
|
}
|
|
@@ -309,6 +318,7 @@ export async function initializeInference(modelUrl, options = {}) {
|
|
|
309
318
|
const pipeline = await createPipeline( ( (manifest)), {
|
|
310
319
|
storage: { loadShard },
|
|
311
320
|
gpu: { device },
|
|
321
|
+
runtime,
|
|
312
322
|
baseUrl: modelUrl,
|
|
313
323
|
onProgress: ( progress) => {
|
|
314
324
|
const pct = 0.2 + progress.percent * 0.8;
|
|
@@ -46,11 +46,6 @@ export declare class Tokenizer {
|
|
|
46
46
|
*/
|
|
47
47
|
initialize(manifest: ModelManifest, options?: TokenizerInitOptions): Promise<void>;
|
|
48
48
|
|
|
49
|
-
/**
|
|
50
|
-
* Infer HuggingFace model ID from manifest architecture
|
|
51
|
-
*/
|
|
52
|
-
private _inferHuggingFaceModel(manifest: ModelManifest): string | null;
|
|
53
|
-
|
|
54
49
|
/**
|
|
55
50
|
* Encode text to token IDs
|
|
56
51
|
*/
|
|
@@ -130,14 +130,12 @@ export class Tokenizer {
|
|
|
130
130
|
);
|
|
131
131
|
}
|
|
132
132
|
|
|
133
|
-
let hfModel = tokenizerConfig.hfModel;
|
|
133
|
+
let hfModel = tokenizerConfig.hfModel ?? tokenizerConfig.modelId ?? null;
|
|
134
134
|
const allowArchFallback = tokenizerConfig.allowArchFallback === true;
|
|
135
135
|
if (allowArchFallback && !hfModel) {
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
log.warn('Tokenizer', `Using inferred HuggingFace model: ${inferred}`);
|
|
140
|
-
}
|
|
136
|
+
throw new Error(
|
|
137
|
+
`[Tokenizer] tokenizer.allowArchFallback requires explicit tokenizer.hfModel or tokenizer.modelId for model "${modelId}".`
|
|
138
|
+
);
|
|
141
139
|
}
|
|
142
140
|
|
|
143
141
|
if (hfModel) {
|
|
@@ -212,23 +210,6 @@ export class Tokenizer {
|
|
|
212
210
|
|
|
213
211
|
this.config = tokenizerConfig;
|
|
214
212
|
}
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
_inferHuggingFaceModel(manifest) {
|
|
218
|
-
const tokenizer = manifest?.tokenizer ?? {};
|
|
219
|
-
if (typeof tokenizer.modelId === 'string' && tokenizer.modelId.length > 0) {
|
|
220
|
-
return tokenizer.modelId;
|
|
221
|
-
}
|
|
222
|
-
if (typeof tokenizer.hfModel === 'string' && tokenizer.hfModel.length > 0) {
|
|
223
|
-
return tokenizer.hfModel;
|
|
224
|
-
}
|
|
225
|
-
if (typeof manifest?.modelId === 'string' && manifest.modelId.length > 0) {
|
|
226
|
-
return manifest.modelId;
|
|
227
|
-
}
|
|
228
|
-
return null;
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
|
|
232
213
|
encode(text) {
|
|
233
214
|
if (!this.backend) {
|
|
234
215
|
throw new Error('Tokenizer not initialized');
|
|
@@ -21,8 +21,17 @@ export class BPETokenizer extends BaseTokenizer {
|
|
|
21
21
|
});
|
|
22
22
|
}
|
|
23
23
|
|
|
24
|
+
#resetState() {
|
|
25
|
+
this.#vocab.clear();
|
|
26
|
+
this.#reverseVocab.clear();
|
|
27
|
+
this.#merges = [];
|
|
28
|
+
this.#mergeRanks.clear();
|
|
29
|
+
this.vocabSize = 0;
|
|
30
|
+
}
|
|
31
|
+
|
|
24
32
|
|
|
25
33
|
load(vocab, merges) {
|
|
34
|
+
this.#resetState();
|
|
26
35
|
// Build vocab maps
|
|
27
36
|
for (const [token, id] of Object.entries(vocab)) {
|
|
28
37
|
this.#vocab.set(token, id);
|
|
@@ -230,6 +230,25 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
230
230
|
});
|
|
231
231
|
}
|
|
232
232
|
|
|
233
|
+
#resetState() {
|
|
234
|
+
this.#vocab.clear();
|
|
235
|
+
this.#reverseVocab.clear();
|
|
236
|
+
this.#merges = [];
|
|
237
|
+
this.#mergeRanks.clear();
|
|
238
|
+
this.#scores = [];
|
|
239
|
+
this.#tokenTypes = [];
|
|
240
|
+
this.#type = null;
|
|
241
|
+
this.#byteTokens.clear();
|
|
242
|
+
this.#specialTokenPatterns = [];
|
|
243
|
+
this.#specialTokenIds = new Set();
|
|
244
|
+
this.#addSpacePrefix = true;
|
|
245
|
+
this.#spacePrefixChar = '▁';
|
|
246
|
+
this.#byteDecoder = null;
|
|
247
|
+
this.#byteEncoder = null;
|
|
248
|
+
this.#useByteLevelEncoding = false;
|
|
249
|
+
this.vocabSize = 0;
|
|
250
|
+
}
|
|
251
|
+
|
|
233
252
|
|
|
234
253
|
isSpecialToken(tokenId) {
|
|
235
254
|
if (this.#specialTokenIds.size > 0) {
|
|
@@ -283,6 +302,7 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
283
302
|
|
|
284
303
|
|
|
285
304
|
load(tokenizerJson) {
|
|
305
|
+
this.#resetState();
|
|
286
306
|
// Detect format: HuggingFace has model.vocab, bundled has top-level vocab
|
|
287
307
|
const isHuggingFace = 'model' in tokenizerJson && tokenizerJson.model?.vocab !== undefined;
|
|
288
308
|
|