npm - @simulatte/doppler - Versions diffs - 0.1.6 → 0.1.8 - Mend

@simulatte/doppler 0.1.6 → 0.1.8

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (355) hide show

package/CHANGELOG.md +145 -0
package/README.md +16 -23
package/package.json +30 -32
package/src/adapters/adapter-registry.js +12 -1
package/src/adapters/lora-loader.js +23 -6
package/src/bridge/extension-client.d.ts +5 -0
package/src/bridge/extension-client.js +40 -0
package/src/bridge/index.d.ts +2 -1
package/src/bridge/index.js +6 -4
package/src/browser/browser-converter.js +31 -1
package/src/browser/file-picker.js +6 -0
package/src/browser/safetensors-parser-browser.js +84 -1
package/src/browser/shard-io-browser.js +2 -2
package/src/browser/tensor-source-download.js +8 -2
package/src/browser/tensor-source-http.d.ts +1 -0
package/src/browser/tensor-source-http.js +5 -1
package/src/client/doppler-api.browser.js +20 -4
package/src/client/doppler-api.js +19 -3
package/src/client/doppler-provider/generation.js +12 -0
package/src/client/doppler-provider/model-manager.d.ts +10 -0
package/src/client/doppler-provider/model-manager.js +91 -19
package/src/client/doppler-provider/source-runtime.d.ts +2 -1
package/src/client/doppler-provider/source-runtime.js +132 -13
package/src/client/doppler-registry.json +5 -20
package/src/config/backward-registry-loader.js +17 -2
package/src/config/execution-v0-contract-check.js +113 -15
package/src/config/kernel-path-contract-check.js +57 -29
package/src/config/kernel-path-loader.d.ts +5 -0
package/src/config/kernel-path-loader.js +18 -36
package/src/config/kernels/kernel-ref-digests.js +1 -1
package/src/config/kernels/registry.js +14 -1
package/src/config/kernels/registry.json +81 -5
package/src/config/loader.d.ts +1 -1
package/src/config/loader.js +15 -2
package/src/config/merge-contract-check.js +66 -4
package/src/config/merge-helpers.js +128 -7
package/src/config/merge.d.ts +1 -0
package/src/config/merge.js +10 -0
package/src/config/param-validator.js +47 -2
package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
package/src/config/presets/kernel-paths/registry.json +43 -8
package/src/config/presets/models/gemma2.json +3 -2
package/src/config/presets/models/gemma3.json +2 -0
package/src/config/presets/models/qwen3.json +4 -3
package/src/config/presets/models/qwen3_5.json +16 -0
package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
package/src/config/runtime.js +6 -1
package/src/config/schema/conversion.schema.d.ts +1 -0
package/src/config/schema/debug.schema.d.ts +5 -0
package/src/config/schema/doppler.schema.js +16 -21
package/src/config/schema/inference-defaults.schema.js +3 -3
package/src/config/schema/kernel-path.schema.d.ts +5 -1
package/src/config/schema/kernel-thresholds.schema.js +12 -4
package/src/config/schema/manifest.schema.d.ts +3 -2
package/src/config/schema/manifest.schema.js +17 -4
package/src/config/schema/storage.schema.js +1 -1
package/src/config/training-defaults.js +30 -22
package/src/converter/conversion-plan.js +104 -11
package/src/converter/core.d.ts +7 -0
package/src/converter/core.js +16 -9
package/src/converter/execution-v0-manifest.js +4 -1
package/src/converter/index.d.ts +1 -0
package/src/converter/index.js +1 -0
package/src/converter/manifest-inference.js +50 -29
package/src/converter/parsers/diffusion.js +0 -3
package/src/converter/parsers/transformer.js +4 -0
package/src/converter/quantization-info.js +40 -16
package/src/converter/quantizer.js +19 -12
package/src/converter/rope-config.js +8 -6
package/src/converter/shard-packer.d.ts +1 -1
package/src/converter/shard-packer.js +4 -1
package/src/converter/tokenizer-utils.d.ts +1 -0
package/src/converter/tokenizer-utils.js +4 -1
package/src/debug/config.js +123 -11
package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
package/src/debug/signals.js +7 -1
package/src/debug/tensor.d.ts +2 -0
package/src/debug/tensor.js +13 -2
package/src/distribution/p2p-control-plane.js +52 -12
package/src/distribution/p2p-observability.js +43 -7
package/src/distribution/p2p-webrtc-browser.js +20 -0
package/src/distribution/shard-delivery.js +83 -27
package/src/formats/gguf/types.js +33 -16
package/src/formats/rdrr/groups.d.ts +12 -4
package/src/formats/rdrr/groups.js +3 -6
package/src/formats/rdrr/parsing.d.ts +4 -0
package/src/formats/rdrr/parsing.js +53 -3
package/src/formats/rdrr/types.d.ts +2 -1
package/src/gpu/command-recorder.js +86 -61
package/src/gpu/device.d.ts +1 -0
package/src/gpu/device.js +73 -19
package/src/gpu/kernel-tuner/benchmarks.js +326 -316
package/src/gpu/kernel-tuner/cache.js +71 -4
package/src/gpu/kernel-tuner/tuner.js +22 -4
package/src/gpu/kernels/attention.js +15 -34
package/src/gpu/kernels/backward/adam.js +62 -58
package/src/gpu/kernels/backward/attention_backward.js +257 -169
package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
package/src/gpu/kernels/cast.js +191 -149
package/src/gpu/kernels/check-stop.js +33 -44
package/src/gpu/kernels/conv2d.js +27 -17
package/src/gpu/kernels/cross_entropy_loss.js +21 -15
package/src/gpu/kernels/depthwise_conv2d.js +36 -26
package/src/gpu/kernels/dequant.js +178 -126
package/src/gpu/kernels/energy.d.ts +3 -21
package/src/gpu/kernels/energy.js +111 -88
package/src/gpu/kernels/feature-check.js +1 -1
package/src/gpu/kernels/fused_ffn.js +84 -65
package/src/gpu/kernels/fused_matmul_residual.js +56 -33
package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
package/src/gpu/kernels/gather.js +33 -15
package/src/gpu/kernels/gelu.js +19 -11
package/src/gpu/kernels/grouped_pointwise_conv2d.js +33 -23
package/src/gpu/kernels/groupnorm.js +34 -23
package/src/gpu/kernels/index.d.ts +8 -0
package/src/gpu/kernels/index.js +6 -0
package/src/gpu/kernels/kv-quantize.js +5 -2
package/src/gpu/kernels/layernorm.js +35 -19
package/src/gpu/kernels/logit-merge.js +5 -3
package/src/gpu/kernels/matmul-selection.js +47 -4
package/src/gpu/kernels/matmul.d.ts +2 -0
package/src/gpu/kernels/matmul.js +59 -40
package/src/gpu/kernels/modulate.js +23 -15
package/src/gpu/kernels/moe.js +221 -175
package/src/gpu/kernels/pixel_shuffle.js +22 -14
package/src/gpu/kernels/relu.js +18 -10
package/src/gpu/kernels/repeat_channels.js +25 -17
package/src/gpu/kernels/residual.js +37 -27
package/src/gpu/kernels/rmsnorm.js +66 -43
package/src/gpu/kernels/rope.js +3 -0
package/src/gpu/kernels/sample.js +27 -38
package/src/gpu/kernels/sana_linear_attention.js +18 -10
package/src/gpu/kernels/scale.js +18 -11
package/src/gpu/kernels/shader-cache.js +4 -2
package/src/gpu/kernels/silu.js +120 -72
package/src/gpu/kernels/softmax.js +44 -25
package/src/gpu/kernels/split_qg.d.ts +50 -0
package/src/gpu/kernels/split_qg.js +46 -0
package/src/gpu/kernels/split_qg.wgsl +58 -0
package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
package/src/gpu/kernels/split_qkv.js +23 -13
package/src/gpu/kernels/transpose.js +18 -10
package/src/gpu/kernels/transpose.wgsl +5 -3
package/src/gpu/kernels/upsample2d.js +21 -13
package/src/gpu/kernels/utils.js +20 -13
package/src/gpu/partitioned-buffer-pool.js +10 -2
package/src/gpu/perf-guards.js +2 -9
package/src/gpu/profiler.js +27 -22
package/src/gpu/readback-utils.d.ts +16 -0
package/src/gpu/readback-utils.js +41 -0
package/src/gpu/submit-tracker.js +13 -0
package/src/gpu/uniform-cache.d.ts +1 -0
package/src/gpu/uniform-cache.js +30 -9
package/src/gpu/weight-buffer.d.ts +1 -1
package/src/gpu/weight-buffer.js +1 -1
package/src/hotswap/intent-bundle.js +6 -0
package/src/hotswap/manifest.d.ts +10 -1
package/src/hotswap/manifest.js +12 -2
package/src/hotswap/runtime.js +30 -8
package/src/index-browser.d.ts +44 -0
package/src/index-browser.js +14 -0
package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
package/src/inference/browser-harness-contract-helpers.js +28 -0
package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
package/src/inference/browser-harness-model-helpers.d.ts +16 -0
package/src/inference/browser-harness-model-helpers.js +217 -0
package/src/inference/browser-harness-report-helpers.d.ts +7 -0
package/src/inference/browser-harness-report-helpers.js +42 -0
package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
package/src/inference/browser-harness-runtime-helpers.js +415 -0
package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
package/src/inference/browser-harness-suite-helpers.js +268 -0
package/src/inference/browser-harness-text-helpers.d.ts +27 -0
package/src/inference/browser-harness-text-helpers.js +788 -0
package/src/inference/browser-harness.d.ts +8 -0
package/src/inference/browser-harness.js +149 -1996
package/src/inference/kv-cache/base.js +140 -94
package/src/inference/kv-cache/tiered.js +5 -3
package/src/inference/moe-router.js +88 -56
package/src/inference/multi-model-network.js +5 -3
package/src/inference/network-evolution.d.ts +11 -2
package/src/inference/network-evolution.js +20 -21
package/src/inference/pipelines/context.d.ts +3 -0
package/src/inference/pipelines/context.js +142 -2
package/src/inference/pipelines/diffusion/helpers.js +10 -2
package/src/inference/pipelines/diffusion/pipeline.js +2 -1
package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
package/src/inference/pipelines/diffusion/vae.js +3 -7
package/src/inference/pipelines/energy/pipeline.js +27 -21
package/src/inference/pipelines/energy/quintel.d.ts +5 -0
package/src/inference/pipelines/energy/quintel.js +11 -0
package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
package/src/inference/pipelines/text/attention/output-projection.js +8 -0
package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
package/src/inference/pipelines/text/attention/projections.js +192 -112
package/src/inference/pipelines/text/attention/record.js +77 -14
package/src/inference/pipelines/text/attention/run.js +112 -14
package/src/inference/pipelines/text/config.js +17 -4
package/src/inference/pipelines/text/embed.js +2 -8
package/src/inference/pipelines/text/execution-plan.js +46 -23
package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
package/src/inference/pipelines/text/execution-v0.js +62 -1013
package/src/inference/pipelines/text/generator-runtime.js +5 -0
package/src/inference/pipelines/text/generator-steps.d.ts +52 -0
package/src/inference/pipelines/text/generator-steps.js +340 -221
package/src/inference/pipelines/text/generator.js +56 -40
package/src/inference/pipelines/text/init.d.ts +13 -0
package/src/inference/pipelines/text/init.js +94 -25
package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
package/src/inference/pipelines/text/kernel-trace.js +6 -0
package/src/inference/pipelines/text/layer.js +4 -9
package/src/inference/pipelines/text/linear-attention.d.ts +15 -0
package/src/inference/pipelines/text/linear-attention.js +113 -9
package/src/inference/pipelines/text/logits/gpu.js +12 -7
package/src/inference/pipelines/text/logits/index.d.ts +6 -1
package/src/inference/pipelines/text/logits/index.js +13 -12
package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
package/src/inference/pipelines/text/logits/utils.js +9 -0
package/src/inference/pipelines/text/lora-apply.js +50 -32
package/src/inference/pipelines/text/model-load.js +282 -104
package/src/inference/pipelines/text/moe-cache.js +5 -4
package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
package/src/inference/pipelines/text/moe-cpu.js +42 -38
package/src/inference/pipelines/text/moe-gpu.js +110 -86
package/src/inference/pipelines/text/ops.js +90 -90
package/src/inference/pipelines/text/probes.js +9 -9
package/src/inference/pipelines/text/sampling.js +52 -6
package/src/inference/pipelines/text/weights.js +17 -7
package/src/inference/pipelines/text.js +13 -1
package/src/inference/speculative.d.ts +2 -2
package/src/inference/speculative.js +4 -18
package/src/inference/test-harness.d.ts +1 -1
package/src/inference/test-harness.js +17 -7
package/src/inference/tokenizer.d.ts +0 -5
package/src/inference/tokenizer.js +4 -23
package/src/inference/tokenizers/bpe.js +9 -0
package/src/inference/tokenizers/bundled.js +20 -0
package/src/inference/tokenizers/sentencepiece.js +12 -0
package/src/loader/doppler-loader.js +38 -22
package/src/loader/dtype-utils.js +3 -44
package/src/loader/embedding-loader.js +7 -3
package/src/loader/experts/expert-cache.js +13 -6
package/src/loader/experts/expert-loader.js +10 -6
package/src/loader/final-weights-loader.js +10 -4
package/src/loader/layer-loader.js +2 -1
package/src/loader/loader-state.js +2 -2
package/src/loader/memory-monitor.js +8 -0
package/src/loader/multi-model-loader.d.ts +14 -0
package/src/loader/multi-model-loader.js +70 -24
package/src/loader/shard-cache.js +84 -14
package/src/loader/shard-resolver.js +25 -3
package/src/loader/tensors/tensor-loader.js +214 -144
package/src/loader/tensors/tensor-reader.js +76 -19
package/src/loader/weight-downcast.js +1 -1
package/src/memory/buffer-pool.d.ts +9 -1
package/src/memory/buffer-pool.js +109 -44
package/src/memory/unified-detect.js +1 -1
package/src/rules/inference/dtype.rules.json +5 -0
package/src/rules/inference/kernel-path.rules.json +24 -8
package/src/rules/kernels/split-qg.rules.json +6 -0
package/src/rules/rule-registry.js +27 -1
package/src/storage/backends/opfs-store.js +68 -24
package/src/storage/downloader.js +365 -83
package/src/storage/index.d.ts +3 -0
package/src/storage/index.js +3 -0
package/src/storage/preflight.d.ts +2 -2
package/src/storage/preflight.js +24 -2
package/src/storage/quickstart-downloader.js +11 -5
package/src/storage/registry.js +10 -4
package/src/storage/reports.js +1 -1
package/src/storage/shard-manager.d.ts +15 -1
package/src/storage/shard-manager.js +55 -6
package/src/storage/source-artifact-store.d.ts +52 -0
package/src/storage/source-artifact-store.js +234 -0
package/src/tooling/command-api-constants.d.ts +9 -0
package/src/tooling/command-api-constants.js +9 -0
package/src/tooling/command-api-family-normalizers.d.ts +9 -0
package/src/tooling/command-api-family-normalizers.js +343 -0
package/src/tooling/command-api-helpers.d.ts +25 -0
package/src/tooling/command-api-helpers.js +262 -0
package/src/tooling/command-api.js +16 -602
package/src/tooling/command-envelope.js +4 -1
package/src/tooling/command-runner-shared.js +52 -18
package/src/tooling/conversion-config-materializer.js +3 -5
package/src/tooling/lean-execution-contract.js +150 -3
package/src/tooling/node-browser-command-runner.js +161 -271
package/src/tooling/node-command-runner.js +29 -3
package/src/tooling/node-converter.js +30 -1
package/src/tooling/node-source-runtime.d.ts +1 -1
package/src/tooling/node-source-runtime.js +120 -3
package/src/tooling/node-webgpu.js +24 -21
package/src/tooling/opfs-cache.js +21 -4
package/src/tooling/runtime-input-composition.d.ts +38 -0
package/src/tooling/runtime-input-composition.js +86 -0
package/src/tooling/source-runtime-bundle.d.ts +40 -5
package/src/tooling/source-runtime-bundle.js +261 -34
package/src/tooling/source-runtime-materializer.d.ts +6 -0
package/src/tooling/source-runtime-materializer.js +93 -0
package/src/training/attention-backward.js +32 -17
package/src/training/autograd.js +80 -52
package/src/training/checkpoint-watch.d.ts +2 -1
package/src/training/checkpoint-watch.js +39 -6
package/src/training/checkpoint.js +40 -11
package/src/training/clip.js +2 -1
package/src/training/datasets/token-batch.js +20 -8
package/src/training/distillation/checkpoint-watch.js +1 -0
package/src/training/distillation/student-fixture.d.ts +22 -0
package/src/training/distillation/student-fixture.js +846 -0
package/src/training/distillation/suite-data.d.ts +45 -0
package/src/training/distillation/suite-data.js +189 -0
package/src/training/lora-pipeline.js +4 -7
package/src/training/lora.js +26 -12
package/src/training/loss.js +5 -6
package/src/training/objectives/cross_entropy.js +2 -5
package/src/training/objectives/distill_kd.js +4 -8
package/src/training/objectives/distill_triplet.js +4 -8
package/src/training/objectives/ul_stage2_base.js +4 -8
package/src/training/operator-command.js +2 -0
package/src/training/optimizer.js +19 -7
package/src/training/runner.js +2 -1
package/src/training/suite.js +18 -978
package/src/training/tensor-factory.d.ts +9 -0
package/src/training/tensor-factory.js +13 -0
package/src/training/trainer.js +3 -5
package/src/training/ul_dataset.js +3 -5
package/src/training/workloads.js +70 -79
package/src/types/model.d.ts +5 -0
package/src/version.js +1 -1
package/tools/convert-safetensors-node.js +22 -16
package/tools/doppler-cli.js +50 -26

package/src/loader/tensors/tensor-loader.js CHANGED Viewed

@@ -16,6 +16,21 @@ import { selectRuleValue } from '../../rules/rule-registry.js';
 let loggedF32UpcastNonMatmul = false;
+function isGpuBufferInstance(value) {
+  return typeof GPUBuffer !== 'undefined' && value instanceof GPUBuffer;
+}
+function isReleasableBuffer(value) {
+  return typeof value === 'object' && value !== null && 'size' in value;
+}
+function releaseOwnedGpuBuffer(buffer, owned) {
+  if (!owned || !isReleasableBuffer(buffer)) {
+    return;
+  }
+  releaseBuffer(buffer);
+}
 function logF32UpcastNonMatmul(name, numElements, bufferSize) {
   if (loggedF32UpcastNonMatmul) {
     return;
@@ -152,66 +167,80 @@ export function convertF16ToF32CPU(f16Data) {
 export async function loadQ4KFused(shardData, location, name) {
   const device = getDevice();
-  const buffer = shardData instanceof GPUBuffer
+  const ownsBuffer = !isGpuBufferInstance(shardData);
+  const buffer = isGpuBufferInstance(shardData)
     ? shardData
     : acquireAlignedBuffer(location.size, `q4k_${name}`);
-  if (!(shardData instanceof GPUBuffer)) {
-    writeBufferAligned(device, buffer, shardData);
+  try {
+    if (ownsBuffer) {
+      writeBufferAligned(device, buffer, shardData);
+    }
+    return {
+      data: createWeightBuffer(buffer, 'q4k', 'row', location.shape, name),
+      allocatedBuffers: [buffer],
+    };
+  } catch (error) {
+    releaseOwnedGpuBuffer(buffer, ownsBuffer);
+    throw error;
   }
-  return {
-    data: createWeightBuffer(buffer, 'q4k', 'row', location.shape, name),
-    allocatedBuffers: [buffer],
-  };
 }
 export async function loadQ4KDequant(shardData, location, name, config) {
   const device = getDevice();
-  const quantBuffer = shardData instanceof GPUBuffer
+  let ownsQuantBuffer = !isGpuBufferInstance(shardData);
+  const quantBuffer = isGpuBufferInstance(shardData)
     ? shardData
     : acquireAlignedBuffer(location.size, `quant_${name}`);
-  if (!(shardData instanceof GPUBuffer)) {
-    writeBufferAligned(device, quantBuffer, shardData);
-  }
-  const outputDtype = getQ4KOutputDtype(location, config);
-  // Check if this is a 2D matrix with K (columns) not aligned to QK_K (256).
-  // If so, we need row-wise dequant to produce proper row-major output.
-  const is2DMatrix = Array.isArray(location.shape) && location.shape.length === 2;
-  const K = is2DMatrix ? location.shape[1] : 0;
-  const needsRowwise = is2DMatrix && K > 0 && K % QK_K !== 0;
+  let dequantized = null;
+  try {
+    if (ownsQuantBuffer) {
+      writeBufferAligned(device, quantBuffer, shardData);
+    }
-  let dequantizedTensor;
-  if (needsRowwise) {
-    const rows = location.shape[0];
-    debugTrace.loader(
-      `Dequantizing ${name} (row-wise): [${rows},${K}], K not 256-aligned, ` +
-      `outputDtype=${outputDtype}`
-    );
-    dequantizedTensor = await dequantizeRowwise(quantBuffer, rows, K, { outputDtype });
-  } else {
-    const numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
-    debugTrace.loader(
-      `Dequantizing ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
-      `outputDtype=${outputDtype}, expectedOutput=${numBlocks * QK_K * (outputDtype === 'f16' ? 2 : 4)}`
-    );
-    dequantizedTensor = await dequantize(quantBuffer, numBlocks, { outputDtype });
-  }
-  const dequantized = dequantizedTensor.buffer;
+    const outputDtype = getQ4KOutputDtype(location, config);
+    const is2DMatrix = Array.isArray(location.shape) && location.shape.length === 2;
+    const K = is2DMatrix ? location.shape[1] : 0;
+    const needsRowwise = is2DMatrix && K > 0 && K % QK_K !== 0;
+    let dequantizedTensor;
+    if (needsRowwise) {
+      const rows = location.shape[0];
+      debugTrace.loader(
+        `Dequantizing ${name} (row-wise): [${rows},${K}], K not 256-aligned, ` +
+        `outputDtype=${outputDtype}`
+      );
+      dequantizedTensor = await dequantizeRowwise(quantBuffer, rows, K, { outputDtype });
+    } else {
+      const numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
+      debugTrace.loader(
+        `Dequantizing ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
+        `outputDtype=${outputDtype}, expectedOutput=${numBlocks * QK_K * (outputDtype === 'f16' ? 2 : 4)}`
+      );
+      dequantizedTensor = await dequantize(quantBuffer, numBlocks, { outputDtype });
+    }
+    dequantized = dequantizedTensor.buffer;
-  debugTrace.loader(`Dequantized ${name}: resultSize=${dequantized.size}`);
-  releaseBuffer(quantBuffer);
+    debugTrace.loader(`Dequantized ${name}: resultSize=${dequantized.size}`);
+    releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
+    ownsQuantBuffer = false;
-  const layout = getWeightLayout(location, config);
-  const dtype = outputDtype;
+    const layout = getWeightLayout(location, config);
+    const dtype = outputDtype;
-  return {
-    data: createWeightBuffer(dequantized, dtype, layout, location.shape, name),
-    allocatedBuffers: [dequantized],
-  };
+    return {
+      data: createWeightBuffer(dequantized, dtype, layout, location.shape, name),
+      allocatedBuffers: [dequantized],
+    };
+  } catch (error) {
+    if (isReleasableBuffer(dequantized)) {
+      releaseBuffer(dequantized);
+    }
+    throw error;
+  } finally {
+    releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
+  }
 }
@@ -219,97 +248,124 @@ export async function loadQ6K(shardData, location, name) {
   const device = getDevice();
   debugTrace.loader(`Loading Q6_K tensor "${name}", size=${location.size}`);
-  const quantBuffer = shardData instanceof GPUBuffer
+  let ownsQuantBuffer = !isGpuBufferInstance(shardData);
+  const quantBuffer = isGpuBufferInstance(shardData)
     ? shardData
     : acquireAlignedBuffer(location.size, `quant_${name}`);
-  if (!(shardData instanceof GPUBuffer)) {
-    writeBufferAligned(device, quantBuffer, shardData);
-  }
+  let dequantized = null;
+  try {
+    if (ownsQuantBuffer) {
+      writeBufferAligned(device, quantBuffer, shardData);
+    }
-  const numBlocks = Math.floor(location.size / Q6K_BLOCK_BYTES);
-  debugTrace.loader(
-    `Dequantizing Q6_K ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
-    `expectedOutput=${numBlocks * 256 * 2} (f16)`
-  );
+    const numBlocks = Math.floor(location.size / Q6K_BLOCK_BYTES);
+    debugTrace.loader(
+      `Dequantizing Q6_K ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
+      `expectedOutput=${numBlocks * 256 * 2} (f16)`
+    );
-  const dequantizedTensor = await dequantizeQ6K(quantBuffer, numBlocks, { outputDtype: 'f16' });
-  const dequantized = dequantizedTensor.buffer;
+    const dequantizedTensor = await dequantizeQ6K(quantBuffer, numBlocks, { outputDtype: 'f16' });
+    dequantized = dequantizedTensor.buffer;
-  debugTrace.loader(`Dequantized Q6_K ${name}: resultSize=${dequantized.size}`);
-  releaseBuffer(quantBuffer);
+    debugTrace.loader(`Dequantized Q6_K ${name}: resultSize=${dequantized.size}`);
+    releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
+    ownsQuantBuffer = false;
+    const isMatmulWeight = shouldDequantizeToF16(location);
+    if (isMatmulWeight) {
+      return {
+        data: createWeightBuffer(dequantized, 'f16', 'row', location.shape, name),
+        allocatedBuffers: [dequantized],
+      };
+    }
-  const isMatmulWeight = shouldDequantizeToF16(location);
-  if (isMatmulWeight) {
     return {
-      data: createWeightBuffer(dequantized, 'f16', 'row', location.shape, name),
+      data: applyBufferLayout(dequantized, location, 'f16'),
       allocatedBuffers: [dequantized],
     };
+  } catch (error) {
+    if (isReleasableBuffer(dequantized)) {
+      releaseBuffer(dequantized);
+    }
+    throw error;
+  } finally {
+    releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
   }
-  return {
-    data: applyBufferLayout(dequantized, location, 'f16'),
-    allocatedBuffers: [dequantized],
-  };
 }
 export async function loadBF16(shardData, location, name, config) {
   const device = getDevice();
-  const srcBuffer = shardData instanceof GPUBuffer
+  let ownsSrcBuffer = !isGpuBufferInstance(shardData);
+  const srcBuffer = isGpuBufferInstance(shardData)
     ? shardData
     : acquireAlignedBuffer(location.size, `${name}_bf16`);
-  if (!(shardData instanceof GPUBuffer)) {
-    writeBufferAligned(device, srcBuffer, shardData);
-  }
-  const numElements = location.size / 2;
-  const caps = config.gpuCapabilities || getKernelCapabilities();
-  const isMatmulWeight = shouldDequantizeToF16(location);
+  let resultBuffer = null;
+  try {
+    if (ownsSrcBuffer) {
+      writeBufferAligned(device, srcBuffer, shardData);
+    }
-  // For matmul weights with F16 support: BF16 -> F16 directly
-  if (caps?.hasF16 && isMatmulWeight) {
-    const f16Tensor = await runBF16ToF16(srcBuffer, [numElements], name);
-    releaseBuffer(srcBuffer);
-    debugTrace.loader(`BF16->F16 for matmul weight: ${name} (${numElements} elements)`);
+    const numElements = location.size / 2;
+    const caps = config.gpuCapabilities || getKernelCapabilities();
+    const isMatmulWeight = shouldDequantizeToF16(location);
+    const keepF32Weights = config.keepF32Weights === true;
-    const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
-      layout: location.layout ?? null,
-      useColumnWise: false,
-    });
-    return {
-      data: createWeightBuffer(f16Tensor.buffer, 'f16', layout, location.shape, name),
-      allocatedBuffers: [f16Tensor.buffer],
-    };
-  }
-  // Standard path: BF16 -> F32
-  const dstBuffer = await convertBF16ToF32GPU(srcBuffer, numElements, name);
-  releaseBuffer(srcBuffer);
+    if (caps?.hasF16 && isMatmulWeight && !keepF32Weights) {
+      const f16Tensor = await runBF16ToF16(srcBuffer, [numElements], name);
+      resultBuffer = f16Tensor.buffer;
+      releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
+      ownsSrcBuffer = false;
+      debugTrace.loader(`BF16->F16 for matmul weight: ${name} (${numElements} elements)`);
-  if (dstBuffer instanceof GPUBuffer) {
-    if (isMatmulWeight) {
       const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
         layout: location.layout ?? null,
         useColumnWise: false,
       });
       return {
-        data: createWeightBuffer(dstBuffer, 'f32', layout, location.shape, name),
+        data: createWeightBuffer(f16Tensor.buffer, 'f16', layout, location.shape, name),
+        allocatedBuffers: [f16Tensor.buffer],
+      };
+    }
+    if (isMatmulWeight && keepF32Weights) {
+      debugTrace.loader(`Keeping BF16 matmul weight in f32: ${name} (keepF32Weights=true)`);
+    }
+    const dstBuffer = await convertBF16ToF32GPU(srcBuffer, numElements, name);
+    resultBuffer = dstBuffer;
+    releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
+    ownsSrcBuffer = false;
+    if (isGpuBufferInstance(dstBuffer)) {
+      if (isMatmulWeight) {
+        const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
+          layout: location.layout ?? null,
+          useColumnWise: false,
+        });
+        return {
+          data: createWeightBuffer(dstBuffer, 'f32', layout, location.shape, name),
+          allocatedBuffers: [dstBuffer],
+        };
+      }
+      return {
+        data: applyBufferLayout(dstBuffer, location, 'f32'),
         allocatedBuffers: [dstBuffer],
       };
     }
     return {
-      data: applyBufferLayout(dstBuffer, location, 'f32'),
-      allocatedBuffers: [dstBuffer],
+      data: dstBuffer,
+      allocatedBuffers: [],
     };
+  } catch (error) {
+    if (isReleasableBuffer(resultBuffer)) {
+      releaseBuffer(resultBuffer);
+    }
+    throw error;
+  } finally {
+    releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
   }
-  // Float32Array returned (shouldn't happen in GPU path)
-  return {
-    data: dstBuffer,
-    allocatedBuffers: [],
-  };
 }
@@ -318,55 +374,69 @@ export async function loadFloat(shardData, location, name, config) {
     throw new Error('Tensor load config is required.');
   }
   const device = getDevice();
-  const buffer = shardData instanceof GPUBuffer
+  let ownsBuffer = !isGpuBufferInstance(shardData);
+  const buffer = isGpuBufferInstance(shardData)
     ? shardData
     : acquireAlignedBuffer(location.size, name);
-  if (!(shardData instanceof GPUBuffer)) {
-    writeBufferAligned(device, buffer, shardData);
-  }
-  const dtype = selectRuleValue('loader', 'weights', 'floatLocationDtype', {
-    locationDtype: location.dtype,
-  });
-  const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
-    layout: location.layout ?? null,
-    useColumnWise: false,
-  });
-  const isMatmulWeight = shouldDequantizeToF16(location);
+  let resultBuffer = null;
+  try {
+    if (ownsBuffer) {
+      writeBufferAligned(device, buffer, shardData);
+    }
-  // Return WeightBuffer for matmul weights
-  if (isMatmulWeight) {
-    return {
-      data: createWeightBuffer(buffer, dtype, layout, location.shape, name),
-      allocatedBuffers: [buffer],
-    };
-  }
+    const dtype = selectRuleValue('loader', 'weights', 'floatLocationDtype', {
+      locationDtype: location.dtype,
+    });
+    const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
+      layout: location.layout ?? null,
+      useColumnWise: false,
+    });
+    const isMatmulWeight = shouldDequantizeToF16(location);
-  // Non-matmul F16 weights need upcast to F32
-  if (dtype === 'f16') {
-    if (config.allowF32UpcastNonMatmul === false) {
+    if (isMatmulWeight) {
+      ownsBuffer = false;
       return {
-        data: applyBufferLayout(buffer, location, 'f16'),
+        data: createWeightBuffer(buffer, dtype, layout, location.shape, name),
         allocatedBuffers: [buffer],
       };
     }
-    const numElements = location.shape.reduce((a, b) => a * b, 1);
-    logF32UpcastNonMatmul(name, numElements, buffer.size);
-    debugTrace.loader(`F16->F32 upcast for non-matmul: ${name} (${numElements} elements, bufSize=${buffer.size})`);
-    const inputTensor = createTensor(buffer, 'f16', [numElements], `${name}_f16`);
-    const f32Tensor = await castF16ToF32(inputTensor);
-    debugTrace.loader(`F16->F32 complete: ${name} resultSize=${f32Tensor.buffer.size}`);
-    releaseBuffer(buffer);
+    if (dtype === 'f16') {
+      if (config.allowF32UpcastNonMatmul === false) {
+        ownsBuffer = false;
+        return {
+          data: applyBufferLayout(buffer, location, 'f16'),
+          allocatedBuffers: [buffer],
+        };
+      }
+      const numElements = location.shape.reduce((a, b) => a * b, 1);
+      logF32UpcastNonMatmul(name, numElements, buffer.size);
+      debugTrace.loader(`F16->F32 upcast for non-matmul: ${name} (${numElements} elements, bufSize=${buffer.size})`);
+      const inputTensor = createTensor(buffer, 'f16', [numElements], `${name}_f16`);
+      const f32Tensor = await castF16ToF32(inputTensor);
+      resultBuffer = f32Tensor.buffer;
+      debugTrace.loader(`F16->F32 complete: ${name} resultSize=${f32Tensor.buffer.size}`);
+      releaseOwnedGpuBuffer(buffer, ownsBuffer);
+      ownsBuffer = false;
+      return {
+        data: applyBufferLayout(f32Tensor.buffer, location, 'f32'),
+        allocatedBuffers: [f32Tensor.buffer],
+      };
+    }
+    ownsBuffer = false;
     return {
-      data: applyBufferLayout(f32Tensor.buffer, location, 'f32'),
-      allocatedBuffers: [f32Tensor.buffer],
+      data: applyBufferLayout(buffer, location, dtype),
+      allocatedBuffers: [buffer],
     };
+  } catch (error) {
+    if (isReleasableBuffer(resultBuffer)) {
+      releaseBuffer(resultBuffer);
+    }
+    throw error;
+  } finally {
+    releaseOwnedGpuBuffer(buffer, ownsBuffer);
   }
-  return {
-    data: applyBufferLayout(buffer, location, dtype),
-    allocatedBuffers: [buffer],
-  };
 }
 // ============================================================================

package/src/loader/tensors/tensor-reader.js CHANGED Viewed

@@ -2,30 +2,84 @@
 import { trace } from '../../debug/index.js';
+function resolveSpanShardIndex(span, name, spanIndex) {
+  const shardIndex = typeof span?.shardIndex === 'number'
+    ? span.shardIndex
+    : span?.shard;
+  if (!Number.isInteger(shardIndex) || shardIndex < 0) {
+    throw new Error(
+      `[DopplerLoader] Tensor "${name}" span[${spanIndex}] has invalid shard index.`
+    );
+  }
+  return shardIndex;
+}
+function validateSpanField(value, field, name, spanIndex) {
+  if (!Number.isInteger(value) || value < 0) {
+    throw new Error(
+      `[DopplerLoader] Tensor "${name}" span[${spanIndex}] has invalid ${field}.`
+    );
+  }
+  return value;
+}
+function getLocationSpans(location) {
+  if (!Array.isArray(location?.spans) || location.spans.length === 0) {
+    return null;
+  }
+  return location.spans;
+}
+function resolveLocationShardIndex(location, name) {
+  const shardIndex = typeof location?.shardIndex === 'number'
+    ? location.shardIndex
+    : location?.shard;
+  if (!Number.isInteger(shardIndex) || shardIndex < 0) {
+    throw new Error(`[DopplerLoader] Tensor "${name}" has invalid shard index.`);
+  }
+  return shardIndex;
+}
+function validateLocationField(location, field, name) {
+  const value = location?.[field];
+  if (!Number.isInteger(value) || value < 0) {
+    throw new Error(`[DopplerLoader] Tensor "${name}" has invalid ${field}.`);
+  }
+  return value;
+}
 export async function assembleShardData(location, name, loadShard, loadShardRange = null) {
-  if (location.spans) {
-    trace.loader(`Assembling tensor "${name}" from ${location.spans.length} spans`);
+  const spans = getLocationSpans(location);
+  if (spans) {
+    trace.loader(`Assembling tensor "${name}" from ${spans.length} spans`);
-    const chunks = await Promise.all(location.spans.map(async (span) => {
+    const chunks = await Promise.all(spans.map(async (span, spanIndex) => {
+      const shardIndex = resolveSpanShardIndex(span, name, spanIndex);
+      const offset = validateSpanField(span.offset, 'offset', name, spanIndex);
+      const size = validateSpanField(span.size, 'size', name, spanIndex);
       if (loadShardRange) {
-        const data = await loadShardRange(span.shardIndex, span.offset, span.size);
-        if (span.size > data.byteLength) {
+        const data = await loadShardRange(shardIndex, offset, size);
+        if (size > data.byteLength) {
           throw new Error(
-            `[DopplerLoader] Shard ${span.shardIndex} too small for tensor "${name}" span.`
+            `[DopplerLoader] Shard ${shardIndex} too small for tensor "${name}" span.`
           );
         }
-        return new Uint8Array(data, 0, span.size);
+        return new Uint8Array(data, 0, size);
       }
-      const data = await loadShard(span.shardIndex);
-      if (span.offset + span.size > data.byteLength) {
+      const data = await loadShard(shardIndex);
+      if (offset + size > data.byteLength) {
         throw new Error(
-          `[DopplerLoader] Shard ${span.shardIndex} too small for tensor "${name}" span.`
+          `[DopplerLoader] Shard ${shardIndex} too small for tensor "${name}" span.`
         );
       }
-      return new Uint8Array(data, span.offset, span.size);
+      return new Uint8Array(data, offset, size);
     }));
     const totalSize = chunks.reduce((s, c) => s + c.length, 0);
+    if (Number.isInteger(location?.size) && totalSize !== location.size) {
+      throw new Error(
+        `[DopplerLoader] Tensor "${name}" spans total ${totalSize} bytes, expected ${location.size}.`
+      );
+    }
     const combined = new Uint8Array(totalSize);
     let offset = 0;
     for (const chunk of chunks) {
@@ -36,21 +90,24 @@ export async function assembleShardData(location, name, loadShard, loadShardRang
   }
   // Single shard - use view to avoid copying
+  const shardIndex = resolveLocationShardIndex(location, name);
+  const offset = validateLocationField(location, 'offset', name);
+  const size = validateLocationField(location, 'size', name);
   if (loadShardRange) {
-    const slice = await loadShardRange(location.shardIndex, location.offset, location.size);
-    if (location.size > slice.byteLength) {
+    const slice = await loadShardRange(shardIndex, offset, size);
+    if (size > slice.byteLength) {
       throw new Error(
-        `[DopplerLoader] Shard ${location.shardIndex} too small for tensor "${name}" (offset=${location.offset}, size=${location.size}, shard=${slice.byteLength})`
+        `[DopplerLoader] Shard ${shardIndex} too small for tensor "${name}" (offset=${offset}, size=${size}, shard=${slice.byteLength})`
       );
     }
-    return new Uint8Array(slice, 0, location.size);
+    return new Uint8Array(slice, 0, size);
   }
-  const fullShard = await loadShard(location.shardIndex);
-  if (location.offset + location.size > fullShard.byteLength) {
+  const fullShard = await loadShard(shardIndex);
+  if (offset + size > fullShard.byteLength) {
     throw new Error(
-      `[DopplerLoader] Shard ${location.shardIndex} too small for tensor "${name}" (offset=${location.offset}, size=${location.size}, shard=${fullShard.byteLength})`
+      `[DopplerLoader] Shard ${shardIndex} too small for tensor "${name}" (offset=${offset}, size=${size}, shard=${fullShard.byteLength})`
     );
   }
-  return new Uint8Array(fullShard, location.offset, location.size);
+  return new Uint8Array(fullShard, offset, size);
 }

package/src/loader/weight-downcast.js CHANGED Viewed

@@ -47,7 +47,7 @@ export async function maybeDowncastToF16(buf, options) {
   }
   // Handle raw GPUBuffer
-  if (buf instanceof GPUBuffer) {
+  if (typeof GPUBuffer !== 'undefined' && buf instanceof GPUBuffer) {
     return downcastGPUBuffer(buf, options);
   }

package/src/memory/buffer-pool.d.ts CHANGED Viewed

@@ -80,6 +80,12 @@ export declare class BufferPool {
    */
   release(buffer: GPUBuffer): void;
+  /**
+   * Force-dispose an active buffer instead of returning it to the pool.
+   * Use for error paths where the buffer contents or device state may be invalid.
+   */
+  discard(buffer: GPUBuffer): void;
   /**
    * Check if a buffer is currently tracked as active by the pool
    */
@@ -159,7 +165,8 @@ export declare class BufferPool {
 }
 /**
- * Get the global buffer pool
+ * Get the global buffer pool for the current device epoch.
+ * If the active device has changed or was lost, a fresh global pool is created.
  */
 export function getBufferPool(): BufferPool;
@@ -179,6 +186,7 @@ export declare const createUploadBuffer: (size: number) => GPUBuffer;
 export declare const createUniformBuffer: (size: number) => GPUBuffer;
 export declare const acquireBuffer: (size: number, usage?: GPUBufferUsageFlags, label?: string) => GPUBuffer;
 export declare const releaseBuffer: (buffer: GPUBuffer) => void;
+export declare const discardBuffer: (buffer: GPUBuffer) => void;
 export declare const isBufferActive: (buffer: GPUBuffer) => boolean;
 export declare const getBufferRequestedSize: (buffer: GPUBuffer) => number;
 export declare const uploadData: (buffer: GPUBuffer, data: ArrayBuffer | ArrayBufferView, offset?: number) => void;