@simulatte/doppler 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +126 -0
- package/README.md +25 -17
- package/package.json +20 -4
- package/src/adapters/adapter-registry.js +12 -1
- package/src/adapters/lora-loader.js +23 -6
- package/src/bridge/extension-client.d.ts +5 -0
- package/src/bridge/extension-client.js +40 -0
- package/src/bridge/index.d.ts +2 -1
- package/src/bridge/index.js +6 -4
- package/src/browser/browser-converter.js +26 -1
- package/src/browser/file-picker.js +6 -0
- package/src/browser/safetensors-parser-browser.js +84 -1
- package/src/browser/shard-io-browser.js +2 -2
- package/src/browser/tensor-source-download.js +8 -2
- package/src/browser/tensor-source-http.d.ts +1 -0
- package/src/browser/tensor-source-http.js +5 -1
- package/src/client/doppler-api.browser.js +20 -4
- package/src/client/doppler-api.js +19 -3
- package/src/client/doppler-provider/generation.js +12 -0
- package/src/client/doppler-provider/model-manager.d.ts +10 -0
- package/src/client/doppler-provider/model-manager.js +91 -19
- package/src/client/doppler-provider/source-runtime.d.ts +2 -1
- package/src/client/doppler-provider/source-runtime.js +132 -13
- package/src/client/doppler-registry.json +8 -7
- package/src/config/backward-registry-loader.js +17 -2
- package/src/config/execution-v0-contract-check.js +113 -15
- package/src/config/kernel-path-contract-check.js +57 -29
- package/src/config/kernel-path-loader.js +5 -36
- package/src/config/kernels/kernel-ref-digests.js +39 -39
- package/src/config/kernels/registry.js +14 -1
- package/src/config/kernels/registry.json +49 -7
- package/src/config/loader.d.ts +1 -1
- package/src/config/loader.js +43 -4
- package/src/config/merge-contract-check.js +59 -4
- package/src/config/merge-helpers.js +128 -7
- package/src/config/merge.d.ts +1 -0
- package/src/config/merge.js +28 -0
- package/src/config/param-validator.js +47 -2
- package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
- package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
- package/src/config/presets/kernel-paths/registry.json +29 -8
- package/src/config/presets/models/gemma2.json +2 -2
- package/src/config/presets/models/qwen3.json +9 -2
- package/src/config/presets/models/transformer.json +5 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
- package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
- package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
- package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
- package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
- package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
- package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
- package/src/config/required-inference-fields-contract-check.js +6 -0
- package/src/config/runtime.js +6 -1
- package/src/config/schema/debug.schema.d.ts +5 -0
- package/src/config/schema/doppler.schema.js +16 -21
- package/src/config/schema/inference-defaults.schema.js +6 -3
- package/src/config/schema/inference.schema.d.ts +9 -0
- package/src/config/schema/kernel-path.schema.d.ts +11 -1
- package/src/config/schema/kernel-thresholds.schema.js +12 -4
- package/src/config/schema/manifest.schema.d.ts +8 -1
- package/src/config/schema/manifest.schema.js +19 -3
- package/src/config/training-defaults.js +30 -22
- package/src/converter/conversion-plan.js +94 -9
- package/src/converter/core.d.ts +7 -0
- package/src/converter/core.js +14 -9
- package/src/converter/execution-v0-manifest.js +4 -1
- package/src/converter/index.d.ts +1 -0
- package/src/converter/index.js +1 -0
- package/src/converter/manifest-inference.js +43 -12
- package/src/converter/parsers/diffusion.js +0 -3
- package/src/converter/quantization-info.js +35 -15
- package/src/converter/rope-config.js +42 -0
- package/src/converter/shard-packer.d.ts +1 -1
- package/src/converter/shard-packer.js +4 -1
- package/src/debug/config.js +123 -11
- package/src/debug/signals.js +7 -1
- package/src/debug/tensor.d.ts +2 -0
- package/src/debug/tensor.js +13 -2
- package/src/distribution/p2p-control-plane.js +52 -12
- package/src/distribution/p2p-observability.js +43 -7
- package/src/distribution/p2p-webrtc-browser.js +20 -0
- package/src/distribution/shard-delivery.js +77 -26
- package/src/formats/gguf/types.js +33 -16
- package/src/formats/rdrr/groups.d.ts +12 -4
- package/src/formats/rdrr/groups.js +3 -6
- package/src/formats/rdrr/parsing.js +39 -2
- package/src/formats/rdrr/types.d.ts +2 -1
- package/src/gpu/command-recorder.js +86 -61
- package/src/gpu/device.d.ts +1 -0
- package/src/gpu/device.js +131 -19
- package/src/gpu/kernel-tuner/benchmarks.js +326 -316
- package/src/gpu/kernel-tuner/cache.js +71 -4
- package/src/gpu/kernel-tuner/tuner.js +22 -4
- package/src/gpu/kernels/attention.js +113 -34
- package/src/gpu/kernels/backward/adam.js +62 -58
- package/src/gpu/kernels/backward/attention_backward.js +257 -169
- package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
- package/src/gpu/kernels/bias_add.wgsl +8 -6
- package/src/gpu/kernels/bias_add_f16.wgsl +8 -5
- package/src/gpu/kernels/cast.js +191 -149
- package/src/gpu/kernels/check-stop.js +33 -44
- package/src/gpu/kernels/conv2d.js +27 -17
- package/src/gpu/kernels/conv2d.wgsl +7 -8
- package/src/gpu/kernels/conv2d_f16.wgsl +7 -8
- package/src/gpu/kernels/cross_entropy_loss.js +21 -15
- package/src/gpu/kernels/depthwise_conv2d.js +37 -26
- package/src/gpu/kernels/depthwise_conv2d.wgsl +6 -9
- package/src/gpu/kernels/depthwise_conv2d_f16.wgsl +6 -9
- package/src/gpu/kernels/dequant.js +178 -126
- package/src/gpu/kernels/energy.d.ts +3 -21
- package/src/gpu/kernels/energy.js +111 -88
- package/src/gpu/kernels/feature-check.js +1 -1
- package/src/gpu/kernels/fused_ffn.js +84 -65
- package/src/gpu/kernels/fused_matmul_residual.js +56 -33
- package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
- package/src/gpu/kernels/gather.js +33 -15
- package/src/gpu/kernels/gelu.js +19 -11
- package/src/gpu/kernels/grouped_pointwise_conv2d.js +34 -23
- package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl +6 -9
- package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl +6 -9
- package/src/gpu/kernels/groupnorm.js +34 -23
- package/src/gpu/kernels/kv-quantize.js +5 -2
- package/src/gpu/kernels/layernorm.js +35 -19
- package/src/gpu/kernels/logit-merge.js +5 -3
- package/src/gpu/kernels/matmul.js +83 -39
- package/src/gpu/kernels/modulate.js +23 -15
- package/src/gpu/kernels/moe.js +221 -175
- package/src/gpu/kernels/pixel_shuffle.js +22 -14
- package/src/gpu/kernels/pixel_shuffle.wgsl +4 -5
- package/src/gpu/kernels/pixel_shuffle_f16.wgsl +4 -5
- package/src/gpu/kernels/relu.js +31 -10
- package/src/gpu/kernels/relu.wgsl +2 -1
- package/src/gpu/kernels/relu_f16.wgsl +2 -1
- package/src/gpu/kernels/repeat_channels.js +25 -17
- package/src/gpu/kernels/repeat_channels.wgsl +4 -5
- package/src/gpu/kernels/repeat_channels_f16.wgsl +4 -5
- package/src/gpu/kernels/residual.js +69 -23
- package/src/gpu/kernels/residual.wgsl +6 -3
- package/src/gpu/kernels/residual_f16.wgsl +2 -1
- package/src/gpu/kernels/residual_f16_vec4.wgsl +2 -1
- package/src/gpu/kernels/residual_vec4.wgsl +2 -1
- package/src/gpu/kernels/rmsnorm.js +96 -28
- package/src/gpu/kernels/rmsnorm.wgsl +14 -6
- package/src/gpu/kernels/rmsnorm_f16.wgsl +10 -2
- package/src/gpu/kernels/rope.d.ts +2 -0
- package/src/gpu/kernels/rope.js +14 -1
- package/src/gpu/kernels/rope.wgsl +56 -40
- package/src/gpu/kernels/sample.js +27 -38
- package/src/gpu/kernels/sana_linear_attention.js +19 -12
- package/src/gpu/kernels/sana_linear_attention_apply.wgsl +4 -5
- package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl +4 -5
- package/src/gpu/kernels/sana_linear_attention_summary.wgsl +4 -0
- package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl +4 -0
- package/src/gpu/kernels/scale.js +18 -11
- package/src/gpu/kernels/shader-cache.js +4 -2
- package/src/gpu/kernels/silu.d.ts +1 -0
- package/src/gpu/kernels/silu.js +148 -82
- package/src/gpu/kernels/silu.wgsl +19 -9
- package/src/gpu/kernels/silu_f16.wgsl +19 -9
- package/src/gpu/kernels/softmax.js +44 -25
- package/src/gpu/kernels/split_qkv.js +23 -13
- package/src/gpu/kernels/transpose.js +31 -10
- package/src/gpu/kernels/transpose.wgsl +6 -5
- package/src/gpu/kernels/upsample2d.js +22 -13
- package/src/gpu/kernels/upsample2d.wgsl +6 -9
- package/src/gpu/kernels/upsample2d_f16.wgsl +6 -9
- package/src/gpu/kernels/utils.js +35 -13
- package/src/gpu/partitioned-buffer-pool.js +10 -2
- package/src/gpu/perf-guards.js +2 -9
- package/src/gpu/profiler.js +27 -22
- package/src/gpu/readback-utils.d.ts +16 -0
- package/src/gpu/readback-utils.js +41 -0
- package/src/gpu/submit-tracker.js +13 -0
- package/src/gpu/uniform-cache.d.ts +1 -0
- package/src/gpu/uniform-cache.js +30 -9
- package/src/hotswap/intent-bundle.js +6 -0
- package/src/hotswap/manifest.d.ts +10 -1
- package/src/hotswap/manifest.js +12 -2
- package/src/hotswap/runtime.js +30 -8
- package/src/index-browser.d.ts +44 -0
- package/src/index-browser.js +14 -0
- package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
- package/src/inference/browser-harness-contract-helpers.js +28 -0
- package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
- package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
- package/src/inference/browser-harness-model-helpers.d.ts +16 -0
- package/src/inference/browser-harness-model-helpers.js +217 -0
- package/src/inference/browser-harness-report-helpers.d.ts +7 -0
- package/src/inference/browser-harness-report-helpers.js +42 -0
- package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
- package/src/inference/browser-harness-runtime-helpers.js +415 -0
- package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
- package/src/inference/browser-harness-suite-helpers.js +268 -0
- package/src/inference/browser-harness-text-helpers.d.ts +27 -0
- package/src/inference/browser-harness-text-helpers.js +788 -0
- package/src/inference/browser-harness.d.ts +6 -0
- package/src/inference/browser-harness.js +130 -1950
- package/src/inference/kv-cache/base.js +140 -94
- package/src/inference/kv-cache/tiered.js +5 -3
- package/src/inference/moe-router.js +88 -56
- package/src/inference/multi-model-network.js +5 -3
- package/src/inference/network-evolution.d.ts +11 -2
- package/src/inference/network-evolution.js +20 -21
- package/src/inference/pipelines/context.d.ts +3 -0
- package/src/inference/pipelines/context.js +142 -2
- package/src/inference/pipelines/diffusion/helpers.js +7 -2
- package/src/inference/pipelines/diffusion/pipeline.js +17 -7
- package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
- package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +5 -0
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +27 -15
- package/src/inference/pipelines/diffusion/vae.js +3 -7
- package/src/inference/pipelines/energy/pipeline.js +27 -21
- package/src/inference/pipelines/energy/quintel.d.ts +5 -0
- package/src/inference/pipelines/energy/quintel.js +11 -0
- package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
- package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
- package/src/inference/pipelines/text/attention/projections.js +151 -101
- package/src/inference/pipelines/text/attention/record.js +73 -10
- package/src/inference/pipelines/text/attention/run.js +73 -10
- package/src/inference/pipelines/text/chat-format.js +25 -1
- package/src/inference/pipelines/text/config.d.ts +4 -0
- package/src/inference/pipelines/text/config.js +71 -5
- package/src/inference/pipelines/text/embed.js +2 -8
- package/src/inference/pipelines/text/execution-plan.js +64 -50
- package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
- package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
- package/src/inference/pipelines/text/execution-v0.js +78 -1002
- package/src/inference/pipelines/text/ffn/standard.js +3 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +46 -0
- package/src/inference/pipelines/text/generator-steps.js +298 -207
- package/src/inference/pipelines/text/generator.js +6 -23
- package/src/inference/pipelines/text/init.d.ts +4 -0
- package/src/inference/pipelines/text/init.js +134 -29
- package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
- package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
- package/src/inference/pipelines/text/kernel-trace.js +6 -0
- package/src/inference/pipelines/text/layer.js +14 -9
- package/src/inference/pipelines/text/linear-attention.d.ts +10 -0
- package/src/inference/pipelines/text/linear-attention.js +80 -6
- package/src/inference/pipelines/text/logits/gpu.js +10 -5
- package/src/inference/pipelines/text/logits/index.js +10 -11
- package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
- package/src/inference/pipelines/text/logits/utils.js +9 -0
- package/src/inference/pipelines/text/lora-apply.js +50 -32
- package/src/inference/pipelines/text/model-load.js +279 -104
- package/src/inference/pipelines/text/moe-cache.js +5 -4
- package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
- package/src/inference/pipelines/text/moe-cpu.js +42 -38
- package/src/inference/pipelines/text/moe-gpu.js +110 -86
- package/src/inference/pipelines/text/ops.js +90 -90
- package/src/inference/pipelines/text/probes.js +9 -9
- package/src/inference/pipelines/text/weights.js +17 -7
- package/src/inference/pipelines/text.js +17 -1
- package/src/inference/speculative.d.ts +2 -2
- package/src/inference/speculative.js +4 -18
- package/src/inference/test-harness.d.ts +1 -1
- package/src/inference/test-harness.js +15 -5
- package/src/inference/tokenizer.d.ts +0 -5
- package/src/inference/tokenizer.js +4 -23
- package/src/inference/tokenizers/bpe.js +9 -0
- package/src/inference/tokenizers/bundled.js +176 -33
- package/src/inference/tokenizers/sentencepiece.js +12 -0
- package/src/loader/doppler-loader.js +38 -22
- package/src/loader/dtype-utils.js +3 -44
- package/src/loader/embedding-loader.js +7 -3
- package/src/loader/experts/expert-cache.js +13 -6
- package/src/loader/experts/expert-loader.js +10 -6
- package/src/loader/final-weights-loader.js +8 -4
- package/src/loader/layer-loader.js +2 -1
- package/src/loader/loader-state.js +2 -2
- package/src/loader/memory-monitor.js +8 -0
- package/src/loader/multi-model-loader.d.ts +14 -0
- package/src/loader/multi-model-loader.js +70 -24
- package/src/loader/shard-cache.js +81 -12
- package/src/loader/shard-resolver.js +25 -3
- package/src/loader/tensors/tensor-loader.js +209 -144
- package/src/loader/tensors/tensor-reader.js +76 -19
- package/src/loader/weight-downcast.js +1 -1
- package/src/memory/buffer-pool.d.ts +9 -1
- package/src/memory/buffer-pool.js +109 -44
- package/src/memory/unified-detect.js +1 -1
- package/src/rules/inference/kernel-path.rules.json +24 -8
- package/src/rules/rule-registry.js +25 -1
- package/src/rules/tooling/command-runtime.rules.json +18 -0
- package/src/storage/backends/opfs-store.js +68 -24
- package/src/storage/downloader.js +364 -83
- package/src/storage/index.d.ts +3 -0
- package/src/storage/index.js +3 -0
- package/src/storage/preflight.d.ts +2 -2
- package/src/storage/preflight.js +24 -2
- package/src/storage/quickstart-downloader.js +11 -5
- package/src/storage/registry.js +10 -4
- package/src/storage/reports.js +1 -1
- package/src/storage/shard-manager.d.ts +15 -1
- package/src/storage/shard-manager.js +51 -3
- package/src/storage/source-artifact-store.d.ts +52 -0
- package/src/storage/source-artifact-store.js +234 -0
- package/src/tooling/command-api-constants.d.ts +9 -0
- package/src/tooling/command-api-constants.js +9 -0
- package/src/tooling/command-api-family-normalizers.d.ts +9 -0
- package/src/tooling/command-api-family-normalizers.js +343 -0
- package/src/tooling/command-api-helpers.d.ts +25 -0
- package/src/tooling/command-api-helpers.js +262 -0
- package/src/tooling/command-api.d.ts +27 -1
- package/src/tooling/command-api.js +26 -473
- package/src/tooling/command-envelope.js +4 -1
- package/src/tooling/command-runner-shared.js +52 -18
- package/src/tooling/lean-execution-contract.js +150 -3
- package/src/tooling/node-browser-command-runner.d.ts +4 -0
- package/src/tooling/node-browser-command-runner.js +218 -273
- package/src/tooling/node-command-runner.js +44 -3
- package/src/tooling/node-converter.js +27 -1
- package/src/tooling/node-source-runtime.d.ts +1 -1
- package/src/tooling/node-source-runtime.js +84 -3
- package/src/tooling/node-webgpu.js +30 -105
- package/src/tooling/opfs-cache.js +21 -4
- package/src/tooling/runtime-input-composition.d.ts +38 -0
- package/src/tooling/runtime-input-composition.js +86 -0
- package/src/tooling/source-runtime-bundle.d.ts +40 -5
- package/src/tooling/source-runtime-bundle.js +261 -34
- package/src/tooling/source-runtime-materializer.d.ts +6 -0
- package/src/tooling/source-runtime-materializer.js +93 -0
- package/src/training/attention-backward.js +32 -17
- package/src/training/autograd.js +80 -52
- package/src/training/checkpoint-watch.d.ts +8 -0
- package/src/training/checkpoint-watch.js +139 -0
- package/src/training/checkpoint.d.ts +6 -1
- package/src/training/checkpoint.js +46 -7
- package/src/training/clip.js +2 -1
- package/src/training/datasets/token-batch.js +20 -8
- package/src/training/distillation/artifacts.d.ts +71 -0
- package/src/training/distillation/artifacts.js +132 -0
- package/src/training/distillation/checkpoint-watch.d.ts +10 -0
- package/src/training/distillation/checkpoint-watch.js +58 -0
- package/src/training/distillation/dataset.d.ts +59 -0
- package/src/training/distillation/dataset.js +337 -0
- package/src/training/distillation/eval.d.ts +34 -0
- package/src/training/distillation/eval.js +310 -0
- package/src/training/distillation/index.d.ts +29 -0
- package/src/training/distillation/index.js +29 -0
- package/src/training/distillation/runtime.d.ts +20 -0
- package/src/training/distillation/runtime.js +121 -0
- package/src/training/distillation/scoreboard.d.ts +6 -0
- package/src/training/distillation/scoreboard.js +8 -0
- package/src/training/distillation/stage-a.d.ts +45 -0
- package/src/training/distillation/stage-a.js +338 -0
- package/src/training/distillation/stage-b.d.ts +24 -0
- package/src/training/distillation/stage-b.js +20 -0
- package/src/training/distillation/student-fixture.d.ts +22 -0
- package/src/training/distillation/student-fixture.js +846 -0
- package/src/training/distillation/suite-data.d.ts +45 -0
- package/src/training/distillation/suite-data.js +189 -0
- package/src/training/index.d.ts +10 -0
- package/src/training/index.js +10 -0
- package/src/training/lora-pipeline.d.ts +40 -0
- package/src/training/lora-pipeline.js +793 -0
- package/src/training/lora.js +26 -12
- package/src/training/loss.js +5 -6
- package/src/training/objectives/cross_entropy.js +2 -5
- package/src/training/objectives/distill_kd.js +4 -8
- package/src/training/objectives/distill_triplet.js +4 -8
- package/src/training/objectives/ul_stage2_base.js +4 -8
- package/src/training/operator-artifacts.d.ts +62 -0
- package/src/training/operator-artifacts.js +140 -0
- package/src/training/operator-command.d.ts +5 -0
- package/src/training/operator-command.js +455 -0
- package/src/training/operator-eval.d.ts +48 -0
- package/src/training/operator-eval.js +230 -0
- package/src/training/operator-scoreboard.d.ts +5 -0
- package/src/training/operator-scoreboard.js +44 -0
- package/src/training/optimizer.js +19 -7
- package/src/training/runner.d.ts +52 -0
- package/src/training/runner.js +31 -5
- package/src/training/suite.d.ts +112 -0
- package/src/training/suite.js +24 -984
- package/src/training/tensor-factory.d.ts +9 -0
- package/src/training/tensor-factory.js +13 -0
- package/src/training/trainer.js +3 -5
- package/src/training/ul_dataset.js +3 -5
- package/src/training/workloads.d.ts +164 -0
- package/src/training/workloads.js +530 -0
- package/src/version.js +1 -1
- package/tools/convert-safetensors-node.js +22 -16
- package/tools/doppler-cli.js +179 -63
|
@@ -64,6 +64,68 @@ function resolveSpecialTokens(specialTokensRaw, fallbackTokens, vocab) {
|
|
|
64
64
|
return resolved;
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
+
/**
 * Inspect a tokenizer.json `pre_tokenizer` node and report whether it uses
 * byte-level pre-tokenization.
 *
 * A node of type `ByteLevel` matches directly. A node of type `Sequence` is
 * searched in order (recursively) and the first byte-level entry wins.
 *
 * @param {unknown} preTokenizer - The `pre_tokenizer` value; may be missing or malformed.
 * @returns {{useByteLevel: boolean, addPrefixSpace: (boolean|null)}}
 *   `addPrefixSpace` is `null` when no byte-level pre-tokenizer was found,
 *   otherwise `true` only if `add_prefix_space` is strictly `true`.
 */
function resolveByteLevelPretokenizerConfig(preTokenizer) {
  // A new object per call so callers never share a mutable result.
  const notByteLevel = () => ({ useByteLevel: false, addPrefixSpace: null });

  if (typeof preTokenizer !== 'object' || !preTokenizer) {
    return notByteLevel();
  }

  switch (preTokenizer.type) {
    case 'ByteLevel':
      return {
        useByteLevel: true,
        addPrefixSpace: preTokenizer.add_prefix_space === true,
      };

    case 'Sequence': {
      const children = preTokenizer.pretokenizers;
      if (Array.isArray(children)) {
        for (let index = 0; index < children.length; index += 1) {
          const candidate = resolveByteLevelPretokenizerConfig(children[index]);
          if (candidate.useByteLevel) {
            return candidate;
          }
        }
      }
      break;
    }

    default:
      break;
  }

  return notByteLevel();
}
|
|
96
|
+
|
|
97
|
+
/**
 * Register tokenizer.json `added_tokens` entries into the tokenizer tables.
 *
 * For every well-formed entry (non-empty string `content`, finite numeric `id`):
 *  - the token is inserted into `vocab` / `reverseVocab` unless `content` is
 *    already present in `vocab` (existing mappings are never overwritten);
 *  - tokens longer than one character are appended to `patterns` so the caller
 *    can split input text on them;
 *  - tokens flagged `special` have their id added to `specialTokenIds`, and,
 *    when `derivedSpecialTokens` is provided, the first matching special token
 *    fills the still-empty `bos` / `eos` / `pad` / `unk` slot.
 *
 * Malformed entries (null, missing or non-string content, non-numeric id) are
 * skipped silently.
 *
 * @param {Iterable<object>|null|undefined} addedTokens - Entries to register; nullish is treated as empty.
 * @param {Map<string, number>} vocab - Token text to id; mutated in place.
 * @param {Map<number, string>} reverseVocab - Id to token text; mutated in place.
 * @param {Array<{content: string, id: number}>} patterns - Receives multi-character tokens.
 * @param {Set<number>} specialTokenIds - Receives ids of tokens flagged `special`.
 * @param {{bos: ?number, eos: ?number, pad: ?number, unk: ?number}|null} [derivedSpecialTokens=null]
 *   Optional holder for special-token ids inferred from token text.
 * @returns {number} The largest id seen among accepted entries, or -1 if none were accepted.
 */
function registerAddedTokens(addedTokens, vocab, reverseVocab, patterns, specialTokenIds, derivedSpecialTokens = null) {
  let maxId = -1;
  for (const token of addedTokens ?? []) {
    const content = token?.content;
    const id = typeof token?.id === 'number' ? token.id : Number.parseInt(token?.id, 10);
    // Content must be a non-empty string: the role heuristics below call
    // `content.includes`, which would throw on a number or object.
    if (!Number.isFinite(id) || typeof content !== 'string' || content.length === 0) continue;

    if (!vocab.has(content)) {
      vocab.set(content, id);
      reverseVocab.set(id, content);
    }
    if (id > maxId) maxId = id;
    if (content.length > 1) {
      patterns.push({ content, id });
    }

    if (!token.special) continue;
    specialTokenIds.add(id);
    if (!derivedSpecialTokens) continue;

    // First match wins per role; a token is assigned to at most one role.
    if (derivedSpecialTokens.bos == null && (content === '<bos>' || content === '<s>' || content.includes('bos'))) {
      derivedSpecialTokens.bos = id;
    } else if (derivedSpecialTokens.eos == null && (content === '<eos>' || content === '</s>' || content.includes('eos'))) {
      derivedSpecialTokens.eos = id;
    } else if (derivedSpecialTokens.pad == null && (content === '<pad>' || content.includes('pad'))) {
      derivedSpecialTokens.pad = id;
    } else if (derivedSpecialTokens.unk == null && (content === '<unk>' || content.includes('unk'))) {
      derivedSpecialTokens.unk = id;
    }
  }
  return maxId;
}
|
|
128
|
+
|
|
67
129
|
|
|
68
130
|
export class TransformersTokenizer extends BaseTokenizer {
|
|
69
131
|
|
|
@@ -156,6 +218,10 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
156
218
|
|
|
157
219
|
#byteDecoder = null;
|
|
158
220
|
|
|
221
|
+
#byteEncoder = null;
|
|
222
|
+
|
|
223
|
+
#useByteLevelEncoding = false;
|
|
224
|
+
|
|
159
225
|
|
|
160
226
|
constructor(config = {}) {
|
|
161
227
|
// BundledTokenizer gets vocabSize from load(), so defer validation
|
|
@@ -164,6 +230,25 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
164
230
|
});
|
|
165
231
|
}
|
|
166
232
|
|
|
233
|
+
#resetState() {
|
|
234
|
+
this.#vocab.clear();
|
|
235
|
+
this.#reverseVocab.clear();
|
|
236
|
+
this.#merges = [];
|
|
237
|
+
this.#mergeRanks.clear();
|
|
238
|
+
this.#scores = [];
|
|
239
|
+
this.#tokenTypes = [];
|
|
240
|
+
this.#type = null;
|
|
241
|
+
this.#byteTokens.clear();
|
|
242
|
+
this.#specialTokenPatterns = [];
|
|
243
|
+
this.#specialTokenIds = new Set();
|
|
244
|
+
this.#addSpacePrefix = true;
|
|
245
|
+
this.#spacePrefixChar = '▁';
|
|
246
|
+
this.#byteDecoder = null;
|
|
247
|
+
this.#byteEncoder = null;
|
|
248
|
+
this.#useByteLevelEncoding = false;
|
|
249
|
+
this.vocabSize = 0;
|
|
250
|
+
}
|
|
251
|
+
|
|
167
252
|
|
|
168
253
|
isSpecialToken(tokenId) {
|
|
169
254
|
if (this.#specialTokenIds.size > 0) {
|
|
@@ -199,13 +284,25 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
199
284
|
}
|
|
200
285
|
|
|
201
286
|
this.#byteDecoder = new Map();
|
|
287
|
+
this.#byteEncoder = new Map();
|
|
202
288
|
for (let i = 0; i < base.length; i++) {
|
|
203
289
|
this.#byteDecoder.set(String.fromCodePoint(chars[i]), base[i]);
|
|
290
|
+
this.#byteEncoder.set(base[i], String.fromCodePoint(chars[i]));
|
|
204
291
|
}
|
|
205
292
|
}
|
|
206
293
|
|
|
294
|
+
#encodeByteLevelText(text) {
|
|
295
|
+
const bytes = new TextEncoder().encode(text);
|
|
296
|
+
let out = '';
|
|
297
|
+
for (const byte of bytes) {
|
|
298
|
+
out += this.#byteEncoder?.get(byte) ?? String.fromCharCode(byte);
|
|
299
|
+
}
|
|
300
|
+
return out;
|
|
301
|
+
}
|
|
302
|
+
|
|
207
303
|
|
|
208
304
|
load(tokenizerJson) {
|
|
305
|
+
this.#resetState();
|
|
209
306
|
// Detect format: HuggingFace has model.vocab, bundled has top-level vocab
|
|
210
307
|
const isHuggingFace = 'model' in tokenizerJson && tokenizerJson.model?.vocab !== undefined;
|
|
211
308
|
|
|
@@ -290,30 +387,16 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
290
387
|
eos: null,
|
|
291
388
|
unk: null,
|
|
292
389
|
};
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
specialTokenIds.add(id);
|
|
304
|
-
if (content.length > 1) {
|
|
305
|
-
specialTokenPatterns.push({ content, id });
|
|
306
|
-
}
|
|
307
|
-
if (derivedSpecialTokens.bos == null && (content === '<bos>' || content === '<s>' || content.includes('bos'))) {
|
|
308
|
-
derivedSpecialTokens.bos = id;
|
|
309
|
-
} else if (derivedSpecialTokens.eos == null && (content === '<eos>' || content === '</s>' || content.includes('eos'))) {
|
|
310
|
-
derivedSpecialTokens.eos = id;
|
|
311
|
-
} else if (derivedSpecialTokens.pad == null && (content === '<pad>' || content.includes('pad'))) {
|
|
312
|
-
derivedSpecialTokens.pad = id;
|
|
313
|
-
} else if (derivedSpecialTokens.unk == null && (content === '<unk>' || content.includes('unk'))) {
|
|
314
|
-
derivedSpecialTokens.unk = id;
|
|
315
|
-
}
|
|
316
|
-
}
|
|
390
|
+
const addedMaxId = registerAddedTokens(
|
|
391
|
+
addedTokens,
|
|
392
|
+
this.#vocab,
|
|
393
|
+
this.#reverseVocab,
|
|
394
|
+
specialTokenPatterns,
|
|
395
|
+
specialTokenIds,
|
|
396
|
+
derivedSpecialTokens
|
|
397
|
+
);
|
|
398
|
+
if (addedMaxId > maxId) {
|
|
399
|
+
maxId = addedMaxId;
|
|
317
400
|
}
|
|
318
401
|
|
|
319
402
|
const specialTokensRaw = hf.special_tokens_map || hf.specialTokens || hf.special_tokens || null;
|
|
@@ -351,6 +434,7 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
351
434
|
|
|
352
435
|
// Handle behavior flags (use HF config if present, else runtime defaults)
|
|
353
436
|
const runtimeDefaults = getRuntimeConfig().inference.tokenizer;
|
|
437
|
+
const byteLevelPretokenizer = resolveByteLevelPretokenizerConfig(hf.pre_tokenizer);
|
|
354
438
|
const configuredAddBosToken = this.addBosToken;
|
|
355
439
|
const configuredAddEosToken = this.addEosToken;
|
|
356
440
|
this.addBosToken =
|
|
@@ -378,9 +462,16 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
378
462
|
// - runtime config addSpacePrefix (user override or null for auto-detect)
|
|
379
463
|
const decoderPrepend = hf.decoder?.prepend_scheme === 'always' || hf.decoder?.add_prefix_space === true;
|
|
380
464
|
const normalizerPrepend = hf.normalizer?.prepend_scheme === 'always' || hf.normalizer?.add_prefix_space === true;
|
|
465
|
+
this.#useByteLevelEncoding = byteLevelPretokenizer.useByteLevel;
|
|
381
466
|
const runtimeSpacePrefix = runtimeDefaults.addSpacePrefix;
|
|
382
467
|
// Use explicit runtime config if set (non-null), otherwise auto-detect from tokenizer.json
|
|
383
|
-
this.#addSpacePrefix = runtimeSpacePrefix
|
|
468
|
+
this.#addSpacePrefix = runtimeSpacePrefix
|
|
469
|
+
?? byteLevelPretokenizer.addPrefixSpace
|
|
470
|
+
?? model.add_prefix_space
|
|
471
|
+
?? model.add_dummy_prefix
|
|
472
|
+
?? decoderPrepend
|
|
473
|
+
?? normalizerPrepend
|
|
474
|
+
?? false;
|
|
384
475
|
log.debug('Tokenizer', `addSpacePrefix=${this.#addSpacePrefix} (runtime=${runtimeSpacePrefix}, model=${model.add_prefix_space ?? model.add_dummy_prefix}, decoder=${decoderPrepend}, normalizer=${normalizerPrepend})`);
|
|
385
476
|
|
|
386
477
|
// Detect space prefix style by checking which WORD tokens exist in vocab
|
|
@@ -469,11 +560,47 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
469
560
|
this.#tokenTypes = tokenizerJson.tokenTypes;
|
|
470
561
|
}
|
|
471
562
|
|
|
563
|
+
let maxId = -1;
|
|
564
|
+
for (const id of this.#vocab.values()) {
|
|
565
|
+
if (Number.isFinite(id) && id > maxId) {
|
|
566
|
+
maxId = id;
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
|
|
570
|
+
const addedTokens = Array.isArray(tokenizerJson.added_tokens) ? tokenizerJson.added_tokens : [];
|
|
571
|
+
const tokenPatterns = [];
|
|
572
|
+
const specialTokenIds = new Set();
|
|
573
|
+
const derivedSpecialTokens = {
|
|
574
|
+
pad: null,
|
|
575
|
+
bos: null,
|
|
576
|
+
eos: null,
|
|
577
|
+
unk: null,
|
|
578
|
+
};
|
|
579
|
+
const addedMaxId = registerAddedTokens(
|
|
580
|
+
addedTokens,
|
|
581
|
+
this.#vocab,
|
|
582
|
+
this.#reverseVocab,
|
|
583
|
+
tokenPatterns,
|
|
584
|
+
specialTokenIds,
|
|
585
|
+
derivedSpecialTokens
|
|
586
|
+
);
|
|
587
|
+
if (addedMaxId > maxId) {
|
|
588
|
+
maxId = addedMaxId;
|
|
589
|
+
}
|
|
590
|
+
|
|
472
591
|
// Set special tokens - support both camelCase and snake_case formats
|
|
473
592
|
const specialTokensRaw = (tokenizerJson.specialTokens || (tokenizerJson).special_tokens);
|
|
474
|
-
this.specialTokens = resolveSpecialTokens(
|
|
593
|
+
this.specialTokens = resolveSpecialTokens(
|
|
594
|
+
specialTokensRaw,
|
|
595
|
+
{
|
|
596
|
+
...derivedSpecialTokens,
|
|
597
|
+
...this.specialTokens,
|
|
598
|
+
},
|
|
599
|
+
this.#vocab
|
|
600
|
+
);
|
|
475
601
|
log.debug('Tokenizer', `Special tokens: BOS=${this.specialTokens.bos}, EOS=${this.specialTokens.eos}`);
|
|
476
|
-
this.#specialTokenIds =
|
|
602
|
+
this.#specialTokenIds = specialTokenIds;
|
|
603
|
+
this.#specialTokenPatterns = tokenPatterns;
|
|
477
604
|
const builtinSpecials = [
|
|
478
605
|
this.specialTokens.pad,
|
|
479
606
|
this.specialTokens.bos,
|
|
@@ -485,8 +612,13 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
485
612
|
this.#specialTokenIds.add(id);
|
|
486
613
|
}
|
|
487
614
|
}
|
|
615
|
+
this.#specialTokenPatterns.sort((a, b) => b.content.length - a.content.length);
|
|
616
|
+
if (maxId >= 0) {
|
|
617
|
+
this.vocabSize = Math.max(this.vocabSize, maxId + 1);
|
|
618
|
+
}
|
|
488
619
|
|
|
489
620
|
const runtimeDefaults = getRuntimeConfig().inference.tokenizer;
|
|
621
|
+
const byteLevelPretokenizer = resolveByteLevelPretokenizerConfig(tokenizerJson.pre_tokenizer);
|
|
490
622
|
const configuredAddBosToken = this.addBosToken;
|
|
491
623
|
const configuredAddEosToken = this.addEosToken;
|
|
492
624
|
this.addBosToken =
|
|
@@ -505,9 +637,11 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
505
637
|
if (this.addEosToken && this.specialTokens.eos == null) {
|
|
506
638
|
throw new Error('[Tokenizer] addEosToken is enabled but eos token is missing.');
|
|
507
639
|
}
|
|
640
|
+
this.#useByteLevelEncoding = byteLevelPretokenizer.useByteLevel;
|
|
508
641
|
// NOTE: Default to FALSE - first word shouldn't get space prefix
|
|
509
642
|
// Space prefixes are only for words that follow a space in original text
|
|
510
|
-
this.#addSpacePrefix = tokenizerJson.addSpacePrefix === true
|
|
643
|
+
this.#addSpacePrefix = tokenizerJson.addSpacePrefix === true
|
|
644
|
+
|| byteLevelPretokenizer.addPrefixSpace === true;
|
|
511
645
|
|
|
512
646
|
// Detect space prefix style based on vocab tokens
|
|
513
647
|
// GPT-style uses 'Ġ' (U+0120), SentencePiece uses '▁' (U+2581)
|
|
@@ -548,7 +682,8 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
548
682
|
ids.push(this.specialTokens.bos);
|
|
549
683
|
}
|
|
550
684
|
|
|
551
|
-
// Split text around
|
|
685
|
+
// Split text around literal added tokens and special tokens, then tokenize
|
|
686
|
+
// the remaining plain-text segments normally.
|
|
552
687
|
const segments = this.#splitOnSpecialTokens(text);
|
|
553
688
|
for (const seg of segments) {
|
|
554
689
|
if (seg.isSpecial && seg.id !== undefined) {
|
|
@@ -690,11 +825,19 @@ export class BundledTokenizer extends BaseTokenizer {
|
|
|
690
825
|
if (text.length === 0) return [];
|
|
691
826
|
|
|
692
827
|
let normalized = text;
|
|
693
|
-
|
|
694
|
-
|
|
828
|
+
let prefixed;
|
|
829
|
+
if (this.#useByteLevelEncoding) {
|
|
830
|
+
if (this.#addSpacePrefix && !normalized.startsWith(' ')) {
|
|
831
|
+
normalized = ` ${normalized}`;
|
|
832
|
+
}
|
|
833
|
+
prefixed = this.#encodeByteLevelText(normalized);
|
|
834
|
+
} else {
|
|
835
|
+
if (this.#addSpacePrefix && !normalized.startsWith(' ')) {
|
|
836
|
+
normalized = ` ${normalized}`;
|
|
837
|
+
}
|
|
838
|
+
const sp = this.#spacePrefixChar;
|
|
839
|
+
prefixed = normalized.replace(/ /g, sp);
|
|
695
840
|
}
|
|
696
|
-
const sp = this.#spacePrefixChar;
|
|
697
|
-
const prefixed = normalized.replace(/ /g, sp);
|
|
698
841
|
|
|
699
842
|
if (this.#mergeRanks.size === 0) {
|
|
700
843
|
return this.#encodeBPEGreedy(prefixed);
|
|
@@ -31,8 +31,18 @@ export class SentencePieceTokenizer extends BaseTokenizer {
|
|
|
31
31
|
});
|
|
32
32
|
}
|
|
33
33
|
|
|
34
|
+
#resetState() {
|
|
35
|
+
this.#modelData = null;
|
|
36
|
+
this.#pieces.clear();
|
|
37
|
+
this.#reverseVocab.clear();
|
|
38
|
+
this.#algorithm = 'unigram';
|
|
39
|
+
this.#byteTokens.clear();
|
|
40
|
+
this.vocabSize = 0;
|
|
41
|
+
}
|
|
42
|
+
|
|
34
43
|
|
|
35
44
|
async load(modelData) {
|
|
45
|
+
this.#resetState();
|
|
36
46
|
this.#modelData = modelData;
|
|
37
47
|
|
|
38
48
|
try {
|
|
@@ -42,6 +52,8 @@ export class SentencePieceTokenizer extends BaseTokenizer {
|
|
|
42
52
|
} catch (err) {
|
|
43
53
|
const message = err instanceof Error ? err.message : String(err);
|
|
44
54
|
log.warn('Tokenizer', `Failed to parse model, using byte fallback: ${message}`);
|
|
55
|
+
this.#resetState();
|
|
56
|
+
this.#modelData = modelData;
|
|
45
57
|
this.#initByteFallback();
|
|
46
58
|
}
|
|
47
59
|
}
|
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
verifyIntegrity,
|
|
10
10
|
loadManifestFromStore,
|
|
11
11
|
} from '../storage/shard-manager.js';
|
|
12
|
-
import { parseManifest } from '../formats/rdrr/index.js';
|
|
12
|
+
import { clearManifest, parseManifest, setManifest as setCurrentManifest } from '../formats/rdrr/index.js';
|
|
13
13
|
import { initDevice, getDevice, getKernelCapabilities } from '../gpu/device.js';
|
|
14
14
|
import { acquireBuffer, releaseBuffer, forceBufferPoolReclaim } from '../memory/buffer-pool.js';
|
|
15
15
|
import { getExpertCache } from './experts/expert-cache.js';
|
|
@@ -50,6 +50,10 @@ function hasExpertGroups(manifest) {
|
|
|
50
50
|
return Object.keys(manifest.groups).some((groupId) => groupId.includes('.expert.'));
|
|
51
51
|
}
|
|
52
52
|
|
|
53
|
+
function isGpuBufferInstance(value) {
|
|
54
|
+
return typeof GPUBuffer !== 'undefined' && value instanceof GPUBuffer;
|
|
55
|
+
}
|
|
56
|
+
|
|
53
57
|
// Re-export types for backward compatibility
|
|
54
58
|
export {
|
|
55
59
|
// Types are in .d.ts file
|
|
@@ -252,6 +256,7 @@ export class DopplerLoader {
|
|
|
252
256
|
|
|
253
257
|
setManifest(manifest) {
|
|
254
258
|
this.manifest = manifest;
|
|
259
|
+
setCurrentManifest(manifest);
|
|
255
260
|
const moeConfig = manifest.moeConfig;
|
|
256
261
|
this.isMoE = moeConfig != null && (moeConfig.numExperts ?? 0) > 1;
|
|
257
262
|
if (!this.isMoE && hasExpertGroups(manifest)) {
|
|
@@ -259,6 +264,7 @@ export class DopplerLoader {
|
|
|
259
264
|
`Manifest "${manifest.modelId ?? 'unknown'}" missing moeConfig for MoE model. Re-convert with moeConfig.`
|
|
260
265
|
);
|
|
261
266
|
}
|
|
267
|
+
this.shardCache.setManifest(this.manifest);
|
|
262
268
|
this.shardCache.configureForModel(this.manifest, this.shardCache.hasCustomLoader);
|
|
263
269
|
debugTrace.loader('Manifest set externally');
|
|
264
270
|
}
|
|
@@ -679,7 +685,7 @@ export class DopplerLoader {
|
|
|
679
685
|
const device = getDevice();
|
|
680
686
|
if (!device) {
|
|
681
687
|
log.warn('Loader', 'GPU device not available; falling back to CPU');
|
|
682
|
-
if (shardData
|
|
688
|
+
if (isGpuBufferInstance(shardData)) {
|
|
683
689
|
releaseBuffer(shardData);
|
|
684
690
|
shardData = await this.#assembleShardData(location, name);
|
|
685
691
|
}
|
|
@@ -708,7 +714,7 @@ export class DopplerLoader {
|
|
|
708
714
|
return result.data;
|
|
709
715
|
}
|
|
710
716
|
|
|
711
|
-
if (shardData
|
|
717
|
+
if (isGpuBufferInstance(shardData)) {
|
|
712
718
|
// Shouldn't happen (streaming is only used for toGPU), but keep this leak-proof.
|
|
713
719
|
releaseBuffer(shardData);
|
|
714
720
|
shardData = await this.#assembleShardData(location, name);
|
|
@@ -751,31 +757,40 @@ export class DopplerLoader {
|
|
|
751
757
|
// queue.writeBuffer requires 4-byte aligned sizes; we pad the buffer.
|
|
752
758
|
const alignedSize = Math.ceil(location.size / 4) * 4;
|
|
753
759
|
const raw = acquireBuffer(alignedSize, undefined, `raw_${name}`);
|
|
760
|
+
let complete = false;
|
|
754
761
|
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
762
|
+
try {
|
|
763
|
+
let dstOffset = 0;
|
|
764
|
+
const uploadChunk = (bytes) => {
|
|
765
|
+
device.queue.writeBuffer(raw, dstOffset, bytes, bytes.byteOffset, bytes.byteLength);
|
|
766
|
+
dstOffset += bytes.byteLength;
|
|
767
|
+
};
|
|
768
|
+
const streamRange = (idx, offset, length) => this.shardCache.streamRange(idx, offset, length, { chunkBytes });
|
|
761
769
|
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
770
|
+
if (location.spans) {
|
|
771
|
+
for (const span of location.spans) {
|
|
772
|
+
for await (const chunk of streamRange(span.shardIndex, span.offset, span.size)) {
|
|
773
|
+
uploadChunk(chunk);
|
|
774
|
+
}
|
|
775
|
+
}
|
|
776
|
+
} else {
|
|
777
|
+
for await (const chunk of streamRange(location.shardIndex, location.offset, location.size)) {
|
|
765
778
|
uploadChunk(chunk);
|
|
766
779
|
}
|
|
767
780
|
}
|
|
768
|
-
} else {
|
|
769
|
-
for await (const chunk of streamRange(location.shardIndex, location.offset, location.size)) {
|
|
770
|
-
uploadChunk(chunk);
|
|
771
|
-
}
|
|
772
|
-
}
|
|
773
781
|
|
|
774
|
-
|
|
775
|
-
|
|
782
|
+
if (dstOffset !== location.size) {
|
|
783
|
+
throw new Error(
|
|
784
|
+
`Stream upload short read for "${name}": got=${dstOffset}, expected=${location.size}.`
|
|
785
|
+
);
|
|
786
|
+
}
|
|
787
|
+
complete = true;
|
|
788
|
+
return raw;
|
|
789
|
+
} finally {
|
|
790
|
+
if (!complete) {
|
|
791
|
+
releaseBuffer(raw);
|
|
792
|
+
}
|
|
776
793
|
}
|
|
777
|
-
|
|
778
|
-
return raw;
|
|
779
794
|
}
|
|
780
795
|
|
|
781
796
|
|
|
@@ -950,7 +965,7 @@ export class DopplerLoader {
|
|
|
950
965
|
if (!value) return;
|
|
951
966
|
const gpuBuffer = isWeightBuffer(value)
|
|
952
967
|
? value.buffer
|
|
953
|
-
: (value
|
|
968
|
+
: (isGpuBufferInstance(value) ? value : null);
|
|
954
969
|
if (!gpuBuffer) return;
|
|
955
970
|
try {
|
|
956
971
|
releaseBuffer(gpuBuffer);
|
|
@@ -990,6 +1005,7 @@ export class DopplerLoader {
|
|
|
990
1005
|
this.lmHead = null;
|
|
991
1006
|
this.finalNorm = null;
|
|
992
1007
|
this.manifest = null;
|
|
1008
|
+
clearManifest();
|
|
993
1009
|
this.modelId = null;
|
|
994
1010
|
this.loadedShards.clear();
|
|
995
1011
|
this.isLoaded = false;
|
|
@@ -1,7 +1,4 @@
|
|
|
1
1
|
|
|
2
|
-
|
|
3
|
-
import { getDevice } from '../gpu/device.js';
|
|
4
|
-
import { isTraceEnabled, log, trace as debugTrace } from '../debug/index.js';
|
|
5
2
|
import { selectRuleValue } from '../rules/rule-registry.js';
|
|
6
3
|
import { tagBufferDtype } from '../gpu/weight-buffer.js';
|
|
7
4
|
|
|
@@ -26,46 +23,8 @@ export function f16ToF32(h) {
|
|
|
26
23
|
|
|
27
24
|
|
|
28
25
|
export async function convertBF16ToF32GPU(srcBuffer, numElements, name) {
|
|
29
|
-
|
|
30
|
-
const castModule = await import('../gpu/kernels/cast.js');
|
|
31
|
-
debugTrace.loader(`[BF16->F32] castModule keys:`, Object.keys(castModule));
|
|
32
|
-
const { runBF16ToF32 } = castModule;
|
|
33
|
-
debugTrace.loader(`[BF16->F32] runBF16ToF32 type: ${typeof runBF16ToF32}`);
|
|
26
|
+
const { runBF16ToF32 } = await import('../gpu/kernels/cast.js');
|
|
34
27
|
const resultTensor = await runBF16ToF32(srcBuffer, [numElements], name);
|
|
35
|
-
debugTrace.loader(`[BF16->F32] runBF16ToF32 returned, result.size=${resultTensor.buffer?.size}`);
|
|
36
|
-
|
|
37
|
-
// Debug: Verify conversion produced non-zero values
|
|
38
|
-
const shouldCheckEmbed = isTraceEnabled('loader') &&
|
|
39
|
-
name.includes('embed') &&
|
|
40
|
-
name.includes('embed_tokens');
|
|
41
|
-
if (shouldCheckEmbed) {
|
|
42
|
-
try {
|
|
43
|
-
debugTrace.loader(`[BF16->F32] Checking embed buffer for non-zeros...`);
|
|
44
|
-
const device = getDevice();
|
|
45
|
-
const sampleSize = Math.min(1024, resultTensor.buffer.size);
|
|
46
|
-
debugTrace.loader(`[BF16->F32] Creating staging buffer size=${sampleSize}`);
|
|
47
|
-
const stagingBuffer = device.createBuffer({
|
|
48
|
-
size: sampleSize,
|
|
49
|
-
usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
|
|
50
|
-
});
|
|
51
|
-
debugTrace.loader(`[BF16->F32] Copying to staging buffer...`);
|
|
52
|
-
const encoder = device.createCommandEncoder();
|
|
53
|
-
encoder.copyBufferToBuffer(resultTensor.buffer, 0, stagingBuffer, 0, sampleSize);
|
|
54
|
-
device.queue.submit([encoder.finish()]);
|
|
55
|
-
debugTrace.loader(`[BF16->F32] Mapping staging buffer...`);
|
|
56
|
-
await stagingBuffer.mapAsync(GPUMapMode.READ);
|
|
57
|
-
debugTrace.loader(`[BF16->F32] Reading data...`);
|
|
58
|
-
const data = new Float32Array(stagingBuffer.getMappedRange().slice(0));
|
|
59
|
-
stagingBuffer.unmap();
|
|
60
|
-
stagingBuffer.destroy();
|
|
61
|
-
const nonZero = Array.from(data).filter(x => x !== 0);
|
|
62
|
-
const nanCount = data.filter(x => !Number.isFinite(x)).length;
|
|
63
|
-
debugTrace.loader(`[BF16->F32] nonZero=${nonZero.length}/${data.length}, nan=${nanCount}, sample=[${nonZero.slice(0, 5).map(x => x.toFixed(4)).join(', ')}]`);
|
|
64
|
-
} catch (err) {
|
|
65
|
-
log.error('Loader', 'BF16->F32 embed buffer check error:', (err).message);
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
|
|
69
28
|
return resultTensor.buffer;
|
|
70
29
|
}
|
|
71
30
|
|
|
@@ -84,11 +43,11 @@ function normalizeBufferDtype(locationDtype, outputDtype) {
|
|
|
84
43
|
if (explicit) {
|
|
85
44
|
return explicit;
|
|
86
45
|
}
|
|
87
|
-
const location = typeof locationDtype === 'string' ? locationDtype.
|
|
46
|
+
const location = typeof locationDtype === 'string' ? locationDtype.toUpperCase() : null;
|
|
88
47
|
if (!location) {
|
|
89
48
|
return null;
|
|
90
49
|
}
|
|
91
|
-
return selectRuleValue('loader', 'weights', 'floatLocationDtype', { locationDtype:
|
|
50
|
+
return selectRuleValue('loader', 'weights', 'floatLocationDtype', { locationDtype: location });
|
|
92
51
|
}
|
|
93
52
|
|
|
94
53
|
export function applyBufferLayout(buffer, location, outputDtype = null) {
|
|
@@ -23,6 +23,10 @@ import { releaseBuffer } from '../memory/buffer-pool.js';
|
|
|
23
23
|
const EMBEDDING_ROLE = 'embedding';
|
|
24
24
|
const EMBEDDING_GROUP = 'embed';
|
|
25
25
|
|
|
26
|
+
function isGpuBufferInstance(value) {
|
|
27
|
+
return typeof GPUBuffer !== 'undefined' && value instanceof GPUBuffer;
|
|
28
|
+
}
|
|
29
|
+
|
|
26
30
|
// ============================================================================
|
|
27
31
|
// Main Function
|
|
28
32
|
// ============================================================================
|
|
@@ -59,7 +63,7 @@ export async function loadEmbeddings(ctx) {
|
|
|
59
63
|
}
|
|
60
64
|
|
|
61
65
|
// Handle valid tensor types
|
|
62
|
-
if (tensor
|
|
66
|
+
if (isGpuBufferInstance(tensor) || isWeightBuffer(tensor) || tensor instanceof Float32Array) {
|
|
63
67
|
const result = await processEmbeddingTensor(ctx, tensor, name, loc, shouldStream);
|
|
64
68
|
if (result) {
|
|
65
69
|
return result;
|
|
@@ -107,7 +111,7 @@ async function processEmbeddingTensor(ctx, tensor, name, loc, shouldStream) {
|
|
|
107
111
|
}
|
|
108
112
|
|
|
109
113
|
// Raw GPUBuffer - wrap with dtype/layout metadata
|
|
110
|
-
if (promoted
|
|
114
|
+
if (isGpuBufferInstance(promoted) && loc?.shape && loc.shape.length === 2) {
|
|
111
115
|
const layout = ctx.resolveWeightLayout(loc);
|
|
112
116
|
|
|
113
117
|
const dtype = selectRuleValue('loader', 'weights', 'floatLocationDtype', {
|
|
@@ -140,7 +144,7 @@ async function maybePromoteEmbeddingsToF32(ctx, current, name, loc) {
|
|
|
140
144
|
return wrapped;
|
|
141
145
|
}
|
|
142
146
|
|
|
143
|
-
if (!(current
|
|
147
|
+
if (!isGpuBufferInstance(current)) return current;
|
|
144
148
|
|
|
145
149
|
const sourceDtype = selectRuleValue('loader', 'weights', 'floatLocationDtype', {
|
|
146
150
|
locationDtype: loc?.dtype,
|
|
@@ -3,6 +3,11 @@
|
|
|
3
3
|
import { releaseBuffer } from '../../memory/buffer-pool.js';
|
|
4
4
|
import { log, trace } from '../../debug/index.js';
|
|
5
5
|
import { getRuntimeConfig } from '../../config/runtime.js';
|
|
6
|
+
import { isWeightBuffer } from '../../gpu/weight-buffer.js';
|
|
7
|
+
|
|
8
|
+
function isGpuBufferInstance(value) {
|
|
9
|
+
return typeof GPUBuffer !== 'undefined' && value instanceof GPUBuffer;
|
|
10
|
+
}
|
|
6
11
|
|
|
7
12
|
|
|
8
13
|
|
|
@@ -256,12 +261,14 @@ export class ExpertCache {
|
|
|
256
261
|
];
|
|
257
262
|
|
|
258
263
|
for (const buf of buffers) {
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
264
|
+
const gpuBuffer = isWeightBuffer(buf)
|
|
265
|
+
? buf.buffer
|
|
266
|
+
: (isGpuBufferInstance(buf) ? buf : null);
|
|
267
|
+
if (!gpuBuffer) continue;
|
|
268
|
+
try {
|
|
269
|
+
releaseBuffer(gpuBuffer);
|
|
270
|
+
} catch (e) {
|
|
271
|
+
// Buffer may already be released
|
|
265
272
|
}
|
|
266
273
|
}
|
|
267
274
|
}
|
|
@@ -18,7 +18,7 @@ import { releaseBuffer } from '../../memory/buffer-pool.js';
|
|
|
18
18
|
|
|
19
19
|
export async function preloadShardsForExpert(ctx, layerIdx, expertIdx, options) {
|
|
20
20
|
// Get required shards from manifest mapping
|
|
21
|
-
const shardIndices = getShardsForExpert(layerIdx, expertIdx);
|
|
21
|
+
const shardIndices = getShardsForExpert(layerIdx, expertIdx, ctx.manifest);
|
|
22
22
|
if (shardIndices.length === 0) {
|
|
23
23
|
// No mapping available, fall back to loading all shards on demand
|
|
24
24
|
return;
|
|
@@ -69,6 +69,10 @@ export function predictNextLayerExperts(currentExperts) {
|
|
|
69
69
|
return currentExperts;
|
|
70
70
|
}
|
|
71
71
|
|
|
72
|
+
function isGpuBufferInstance(value) {
|
|
73
|
+
return typeof GPUBuffer !== 'undefined' && value instanceof GPUBuffer;
|
|
74
|
+
}
|
|
75
|
+
|
|
72
76
|
// ============================================================================
|
|
73
77
|
// Expert Loading
|
|
74
78
|
// ============================================================================
|
|
@@ -95,7 +99,7 @@ export async function loadExpert(ctx, layerIdx, expertIdx) {
|
|
|
95
99
|
await preloadShardsForExpert(ctx, layerIdx, expertIdx);
|
|
96
100
|
|
|
97
101
|
// Get tensor names from manifest if available (for logging/debugging)
|
|
98
|
-
const tensorNames = getTensorsForExpert(layerIdx, expertIdx);
|
|
102
|
+
const tensorNames = getTensorsForExpert(layerIdx, expertIdx, ctx.manifest);
|
|
99
103
|
if (tensorNames.length > 0) {
|
|
100
104
|
debugTrace.loader(`Expert ${layerIdx}_${expertIdx} tensors: ${tensorNames.length}`);
|
|
101
105
|
}
|
|
@@ -260,7 +264,7 @@ function getGpuBuffer(value) {
|
|
|
260
264
|
if (isWeightBuffer(value)) {
|
|
261
265
|
return value.buffer;
|
|
262
266
|
}
|
|
263
|
-
if (value
|
|
267
|
+
if (isGpuBufferInstance(value)) {
|
|
264
268
|
return value;
|
|
265
269
|
}
|
|
266
270
|
return null;
|
|
@@ -342,7 +346,7 @@ async function downcastExpertWeights(ctx, weights) {
|
|
|
342
346
|
if (!buf) continue;
|
|
343
347
|
|
|
344
348
|
// Only downcast GPUBuffer or WeightBuffer (not Float32Array)
|
|
345
|
-
if (!(buf
|
|
349
|
+
if (!isGpuBufferInstance(buf) && !isWeightBuffer(buf)) {
|
|
346
350
|
continue;
|
|
347
351
|
}
|
|
348
352
|
|
|
@@ -369,13 +373,13 @@ function calculateExpertSize(weights) {
|
|
|
369
373
|
const buf = weights[k];
|
|
370
374
|
if (isWeightBuffer(buf)) {
|
|
371
375
|
sizeBytes += buf.buffer.size;
|
|
372
|
-
} else if (buf
|
|
376
|
+
} else if (isGpuBufferInstance(buf)) {
|
|
373
377
|
sizeBytes += buf.size;
|
|
374
378
|
}
|
|
375
379
|
}
|
|
376
380
|
|
|
377
381
|
// Use manifest-provided expert size if available, otherwise use calculated
|
|
378
|
-
const manifestBytes = getExpertBytes();
|
|
382
|
+
const manifestBytes = getExpertBytes(ctx.manifest);
|
|
379
383
|
if (manifestBytes > 0) {
|
|
380
384
|
sizeBytes = manifestBytes;
|
|
381
385
|
}
|