npm - @simulatte/doppler - Versions diffs - 0.1.7 → 0.1.9 - Mend

@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (172)

package/CHANGELOG.md +32 -0
package/README.md +25 -6
package/package.json +25 -38
package/src/browser/browser-converter.js +5 -0
package/src/client/doppler-api.browser.js +6 -0
package/src/client/doppler-api.d.ts +3 -0
package/src/client/doppler-api.js +11 -2
package/src/client/doppler-registry.js +3 -5
package/src/client/doppler-registry.json +2 -2
package/src/config/kernel-path-loader.d.ts +5 -0
package/src/config/kernel-path-loader.js +13 -0
package/src/config/kernels/kernel-ref-digests.js +23 -21
package/src/config/kernels/moe/mixtral.paths.json +46 -0
package/src/config/kernels/registry.json +74 -0
package/src/config/loader.js +9 -0
package/src/config/merge-contract-check.js +7 -0
package/src/config/platforms/loader.js +3 -1
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
package/src/config/presets/kernel-paths/registry.json +21 -0
package/src/config/presets/models/gemma2.json +2 -1
package/src/config/presets/models/gemma3.json +4 -1
package/src/config/presets/models/gemma4.json +61 -0
package/src/config/presets/models/granite-docling.json +70 -0
package/src/config/presets/models/lfm2.json +6 -1
package/src/config/presets/models/qwen3.json +4 -3
package/src/config/presets/models/qwen3_5.json +16 -0
package/src/config/presets/models/qwen3_vl.json +40 -0
package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
package/src/config/presets/runtime/modes/trace-layers.json +1 -0
package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
package/src/config/runtime.js +3 -0
package/src/config/schema/conversion.schema.d.ts +1 -0
package/src/config/schema/debug.schema.d.ts +40 -0
package/src/config/schema/debug.schema.js +28 -0
package/src/config/schema/index.js +2 -0
package/src/config/schema/inference-defaults.schema.js +1 -1
package/src/config/schema/kernel-path.schema.d.ts +1 -0
package/src/config/schema/manifest.schema.d.ts +1 -1
package/src/config/schema/manifest.schema.js +1 -1
package/src/config/schema/memory-limits.schema.js +2 -2
package/src/config/schema/storage.schema.js +2 -2
package/src/converter/conversion-plan.js +11 -3
package/src/converter/core.js +19 -8
package/src/converter/manifest-inference.js +12 -22
package/src/converter/parsers/transformer.js +4 -0
package/src/converter/quantization-info.js +5 -1
package/src/converter/quantizer.d.ts +5 -0
package/src/converter/quantizer.js +34 -12
package/src/converter/rope-config.js +8 -6
package/src/converter/tokenizer-utils.d.ts +1 -0
package/src/converter/tokenizer-utils.js +4 -1
package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
package/src/distribution/shard-delivery.js +40 -1
package/src/formats/rdrr/classification.js +32 -0
package/src/formats/rdrr/parsing.d.ts +4 -0
package/src/formats/rdrr/parsing.js +14 -1
package/src/gpu/kernel-runtime.js +4 -2
package/src/gpu/kernels/attention.js +2 -1
package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
package/src/gpu/kernels/dequant_shared.wgsl +4 -2
package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
package/src/gpu/kernels/gated-short-conv.js +284 -0
package/src/gpu/kernels/index.d.ts +8 -0
package/src/gpu/kernels/index.js +6 -0
package/src/gpu/kernels/linear-attention-core.js +37 -17
package/src/gpu/kernels/matmul-selection.js +48 -4
package/src/gpu/kernels/matmul.d.ts +5 -0
package/src/gpu/kernels/matmul.js +71 -2
package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
package/src/gpu/kernels/rmsnorm.js +9 -2
package/src/gpu/kernels/sample.js +1 -3
package/src/gpu/kernels/sample.wgsl +39 -9
package/src/gpu/kernels/sample_f16.wgsl +38 -8
package/src/gpu/kernels/shader-cache.js +9 -4
package/src/gpu/kernels/split_qg.d.ts +50 -0
package/src/gpu/kernels/split_qg.js +46 -0
package/src/gpu/kernels/split_qg.wgsl +58 -0
package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
package/src/gpu/weight-buffer.d.ts +1 -1
package/src/gpu/weight-buffer.js +1 -1
package/src/inference/browser-harness.d.ts +2 -0
package/src/inference/browser-harness.js +20 -1
package/src/inference/kv-cache/base.js +3 -10
package/src/inference/pipelines/diffusion/helpers.js +3 -0
package/src/inference/pipelines/diffusion/pipeline.js +2 -1
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
package/src/inference/pipelines/text/attention/output-projection.js +8 -0
package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
package/src/inference/pipelines/text/attention/projections.js +54 -13
package/src/inference/pipelines/text/attention/record.js +16 -6
package/src/inference/pipelines/text/attention/run.js +59 -6
package/src/inference/pipelines/text/config.d.ts +1 -0
package/src/inference/pipelines/text/config.js +46 -4
package/src/inference/pipelines/text/embed.js +26 -7
package/src/inference/pipelines/text/execution-plan.js +5 -4
package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
package/src/inference/pipelines/text/execution-v0.js +12 -1
package/src/inference/pipelines/text/generator-helpers.js +1 -0
package/src/inference/pipelines/text/generator-runtime.js +19 -0
package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
package/src/inference/pipelines/text/generator-steps.js +71 -26
package/src/inference/pipelines/text/generator.d.ts +5 -0
package/src/inference/pipelines/text/generator.js +353 -166
package/src/inference/pipelines/text/init.d.ts +15 -0
package/src/inference/pipelines/text/init.js +35 -10
package/src/inference/pipelines/text/layer.js +38 -8
package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
package/src/inference/pipelines/text/linear-attention.js +33 -3
package/src/inference/pipelines/text/logits/gpu.js +2 -2
package/src/inference/pipelines/text/logits/index.d.ts +6 -1
package/src/inference/pipelines/text/logits/index.js +3 -1
package/src/inference/pipelines/text/model-load.js +3 -0
package/src/inference/pipelines/text/moe-gpu.js +21 -3
package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
package/src/inference/pipelines/text/ops.js +123 -53
package/src/inference/pipelines/text/probes.js +1 -0
package/src/inference/pipelines/text/sampling.js +52 -6
package/src/inference/pipelines/text/state.js +2 -0
package/src/inference/pipelines/text.d.ts +5 -0
package/src/inference/pipelines/text.js +59 -1
package/src/inference/pipelines/vision/encoder.js +386 -0
package/src/inference/pipelines/vision/image-preprocess.js +151 -0
package/src/inference/pipelines/vision/index.js +173 -0
package/src/inference/pipelines/vision/ops.js +78 -0
package/src/inference/pipelines/vision/patch-embed.js +151 -0
package/src/inference/test-harness.js +11 -9
package/src/loader/doppler-loader.d.ts +3 -0
package/src/loader/doppler-loader.js +20 -3
package/src/loader/experts/expert-cache.js +6 -2
package/src/loader/experts/expert-loader.js +6 -2
package/src/loader/final-weights-loader.js +2 -0
package/src/loader/layer-loader.js +42 -3
package/src/loader/manifest-config.js +3 -1
package/src/loader/shard-cache.js +3 -2
package/src/loader/tensors/tensor-loader.d.ts +3 -0
package/src/loader/tensors/tensor-loader.js +130 -4
package/src/rules/inference/dtype.rules.json +5 -0
package/src/rules/inference/kernel-path.rules.json +2 -2
package/src/rules/kernels/moe.rules.mixtral.json +75 -0
package/src/rules/kernels/softmax.rules.json +2 -0
package/src/rules/kernels/split-qg.rules.json +6 -0
package/src/rules/rule-registry.d.ts +1 -0
package/src/rules/rule-registry.js +4 -0
package/src/storage/downloader.js +2 -1
package/src/storage/quickstart-downloader.d.ts +3 -0
package/src/storage/quickstart-downloader.js +27 -30
package/src/storage/shard-manager.js +4 -3
package/src/tooling/conversion-config-materializer.js +3 -5
package/src/tooling/node-converter.js +28 -7
package/src/tooling/node-source-runtime.js +65 -5
package/src/tooling/node-webgpu.js +24 -7
package/src/types/model.d.ts +5 -0
package/src/utils/hf-resolve-url.d.ts +16 -0
package/src/utils/hf-resolve-url.js +17 -0
package/src/version.js +1 -1
package/tools/doppler-cli.js +6 -1
package/src/tooling/node-convert.d.ts +0 -54

package/src/inference/pipelines/vision/ops.js ADDED Viewed

@@ -0,0 +1,78 @@
+import { getDevice } from '../../../gpu/device.js';
+import { acquireBuffer, releaseBuffer } from '../../../memory/buffer-pool.js';
+import { runLayerNorm } from '../../../gpu/kernels/layernorm.js';
+import { dispatchMatmul } from '../../../gpu/kernels/matmul-dispatch.js';
+import { runGelu } from '../../../gpu/kernels/gelu.js';
+import { runResidualAdd } from '../../../gpu/kernels/residual.js';
+/**
+ * Layer norm on GPU.
+ * @param {GPUBuffer} input   [seqLen, hiddenSize]
+ * @param {GPUBuffer} weight  [hiddenSize]
+ * @param {GPUBuffer} bias    [hiddenSize] or null
+ * @param {{ seqLen: number, hiddenSize: number, eps: number }} opts
+ * @returns {Promise<GPUBuffer>}
+ */
+export async function doLayerNorm(input, weight, bias, opts) {
+  const { seqLen, hiddenSize, eps } = opts;
+  const outputSize = seqLen * hiddenSize * 4;
+  const output = acquireBuffer(outputSize, 'vision-layernorm');
+  await runLayerNorm({
+    input,
+    weight,
+    bias: bias || null,
+    output,
+    seqLen,
+    hiddenSize,
+    eps,
+  });
+  return output;
+}
+/**
+ * Matrix multiply on GPU.
+ * @param {GPUBuffer} a  [M, K]
+ * @param {GPUBuffer} b  [K, N]
+ * @param {{ M: number, K: number, N: number, bias?: GPUBuffer }} opts
+ * @returns {Promise<GPUBuffer>}
+ */
+export async function doMatmul(a, b, opts) {
+  const { M, K, N, bias } = opts;
+  const outputSize = M * N * 4;
+  const output = acquireBuffer(outputSize, 'vision-matmul');
+  await dispatchMatmul({
+    a, b, output,
+    M, K, N,
+    bias: bias || null,
+  });
+  return output;
+}
+/**
+ * GELU activation on GPU.
+ * @param {GPUBuffer} input   Flat buffer
+ * @param {{ count: number }} opts  Total element count
+ * @returns {Promise<GPUBuffer>}
+ */
+export async function doGelu(input, opts) {
+  const { count } = opts;
+  const output = acquireBuffer(count * 4, 'vision-gelu');
+  await runGelu({ input, output, count });
+  return output;
+}
+/**
+ * Element-wise residual add on GPU.
+ * @param {GPUBuffer} a
+ * @param {GPUBuffer} b
+ * @param {{ count: number }} opts
+ * @returns {Promise<GPUBuffer>}
+ */
+export async function doResidualAdd(a, b, opts) {
+  const { count } = opts;
+  const output = acquireBuffer(count * 4, 'vision-residual');
+  await runResidualAdd({ a, b, output, count });
+  return output;
+}

package/src/inference/pipelines/vision/patch-embed.js ADDED Viewed

@@ -0,0 +1,151 @@
+import { trace } from '../../../debug/index.js';
+import { getDevice } from '../../../gpu/device.js';
+import { acquireBuffer } from '../../../memory/buffer-pool.js';
+/**
+ * Patch embedding for the vision encoder.
+ *
+ * Qwen3-VL uses a 3D convolution for temporal+spatial patch extraction:
+ *   Conv3D(in_channels=3, out_channels=hiddenSize, kernel=[temporalPatchSize, patchSize, patchSize])
+ *
+ * For single images (T=1), this reduces to a 2D convolution with stride=patchSize.
+ * The output is [numPatches, hiddenSize] where numPatches = (H/patchSize) * (W/patchSize).
+ *
+ * For the initial implementation, this runs on CPU and uploads to GPU.
+ * TODO(perf): GPU kernel for patch embedding (conv2d with large stride).
+ *
+ * @param {object} params
+ * @param {Float32Array} params.imageData    Preprocessed image [C, H, W] normalized
+ * @param {number}       params.height       Image height (patch-aligned)
+ * @param {number}       params.width        Image width (patch-aligned)
+ * @param {number}       params.channels     Number of channels (3)
+ * @param {object}       params.visionConfig Vision config
+ * @param {object}       params.weights      Vision encoder weight buffers
+ * @returns {Promise<{ patchBuffer: GPUBuffer, numPatches: number }>}
+ */
+export async function patchEmbed(params) {
+  const {
+    imageData, height, width, channels,
+    visionConfig, weights,
+  } = params;
+  const {
+    patchSize = 16,
+    hiddenSize = 1024,
+    temporalPatchSize = 2,
+  } = visionConfig;
+  const gridH = Math.floor(height / patchSize);
+  const gridW = Math.floor(width / patchSize);
+  const numPatches = gridH * gridW;
+  trace('vision', `patchEmbed: ${height}x${width} -> ${gridH}x${gridW} = ${numPatches} patches (${hiddenSize}d)`);
+  // Read conv weight from GPU to CPU for the embedding computation.
+  // Weight shape: [hiddenSize, channels * temporalPatchSize * patchSize * patchSize]
+  // For single image: effectively [hiddenSize, channels * patchSize * patchSize]
+  //
+  // Qwen3-VL patch_embed is actually:
+  //   proj = Conv3d(3, embed_dim, kernel_size=(tpp, pp, pp), stride=(tpp, pp, pp))
+  // For T=1 frame, temporal dim collapses: input is [1, C, 1, H, W]
+  // Output: [1, embed_dim, 1, H/pp, W/pp] -> reshape to [numPatches, embed_dim]
+  const device = getDevice();
+  const patchArea = channels * patchSize * patchSize;
+  // Extract patches from image: each patch is [C, patchSize, patchSize] flattened.
+  const patches = new Float32Array(numPatches * patchArea);
+  for (let ph = 0; ph < gridH; ph++) {
+    for (let pw = 0; pw < gridW; pw++) {
+      const patchIdx = ph * gridW + pw;
+      for (let c = 0; c < channels; c++) {
+        for (let py = 0; py < patchSize; py++) {
+          for (let px = 0; px < patchSize; px++) {
+            const imgY = ph * patchSize + py;
+            const imgX = pw * patchSize + px;
+            const srcIdx = c * height * width + imgY * width + imgX;
+            const dstIdx = patchIdx * patchArea + c * patchSize * patchSize + py * patchSize + px;
+            patches[dstIdx] = imageData[srcIdx];
+          }
+        }
+      }
+    }
+  }
+  // Read the projection weight from GPU.
+  // The weight tensor name is visual.patch_embed.proj.weight with shape [hiddenSize, C, tpp, pp, pp].
+  // For temporal_patch_size=2 and a single frame, we need to handle the temporal dimension.
+  // In practice for a single image, we sum over the temporal kernel dimension.
+  const weightKey = 'visual.patch_embed.proj.weight';
+  const biasKey = 'visual.patch_embed.proj.bias';
+  const weightBuffer = weights[weightKey];
+  const biasBuffer = weights[biasKey] || null;
+  // Full conv weight size: hiddenSize * channels * temporalPatchSize * patchSize * patchSize
+  const fullWeightSize = hiddenSize * channels * temporalPatchSize * patchSize * patchSize;
+  const weightData = new Float32Array(fullWeightSize);
+  {
+    const staging = device.createBuffer({
+      size: fullWeightSize * 4,
+      usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+    });
+    const encoder = device.createCommandEncoder();
+    encoder.copyBufferToBuffer(weightBuffer, 0, staging, 0, fullWeightSize * 4);
+    device.queue.submit([encoder.finish()]);
+    await staging.mapAsync(GPUMapMode.READ);
+    weightData.set(new Float32Array(staging.getMappedRange()));
+    staging.unmap();
+    staging.destroy();
+  }
+  // For single frame: sum over temporal kernel dimension to get [hiddenSize, C*pp*pp].
+  const spatialWeight = new Float32Array(hiddenSize * patchArea);
+  const spatialPatchArea = channels * patchSize * patchSize;
+  for (let h = 0; h < hiddenSize; h++) {
+    for (let s = 0; s < spatialPatchArea; s++) {
+      let sum = 0;
+      for (let t = 0; t < temporalPatchSize; t++) {
+        sum += weightData[h * temporalPatchSize * spatialPatchArea + t * spatialPatchArea + s];
+      }
+      spatialWeight[h * spatialPatchArea + s] = sum;
+    }
+  }
+  // Read bias if present.
+  let biasData = null;
+  if (biasBuffer) {
+    biasData = new Float32Array(hiddenSize);
+    const staging = device.createBuffer({
+      size: hiddenSize * 4,
+      usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+    });
+    const encoder = device.createCommandEncoder();
+    encoder.copyBufferToBuffer(biasBuffer, 0, staging, 0, hiddenSize * 4);
+    device.queue.submit([encoder.finish()]);
+    await staging.mapAsync(GPUMapMode.READ);
+    biasData.set(new Float32Array(staging.getMappedRange()));
+    staging.unmap();
+    staging.destroy();
+  }
+  // Compute patch embeddings: patches [numPatches, patchArea] @ spatialWeight^T [patchArea, hiddenSize]
+  const embeddings = new Float32Array(numPatches * hiddenSize);
+  for (let p = 0; p < numPatches; p++) {
+    for (let h = 0; h < hiddenSize; h++) {
+      let val = biasData ? biasData[h] : 0;
+      for (let k = 0; k < patchArea; k++) {
+        val += patches[p * patchArea + k] * spatialWeight[h * patchArea + k];
+      }
+      embeddings[p * hiddenSize + h] = val;
+    }
+  }
+  // Upload to GPU.
+  const patchBuffer = acquireBuffer(numPatches * hiddenSize * 4, 'vision-patch-embed');
+  device.queue.writeBuffer(patchBuffer, 0, embeddings);
+  return { patchBuffer, numPatches };
+}

package/src/inference/test-harness.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { initDevice, getDevice, getKernelCapabilities } from '../gpu/device.js';
-import { parseManifest } from '../formats/rdrr/index.js';
+import { parseManifest, getExpectedShardHash } from '../formats/rdrr/index.js';
 import { createPipeline } from './pipelines/text.js';
 import { log as debugLog } from '../debug/index.js';
 import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
@@ -66,11 +66,12 @@ export function parseRuntimeOverridesFromURL(searchParams) {
   if (runtimeConfigRaw) {
     try {
       const parsed = JSON.parse(runtimeConfigRaw);
-      if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
-        runtime.runtimeConfig =  (parsed);
+      if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
+        throw new Error('runtimeConfig must be a JSON object');
       }
+      runtime.runtimeConfig = parsed;
     } catch (e) {
-      debugLog.warn('TestHarness', `Failed to parse runtimeConfig JSON: ${ (e).message}`);
+      throw new Error(`Failed to parse runtimeConfig URL parameter: ${e?.message}`);
     }
   }
@@ -79,12 +80,13 @@ export function parseRuntimeOverridesFromURL(searchParams) {
   if (configChainRaw) {
     try {
       const parsed = JSON.parse(configChainRaw);
-      if (Array.isArray(parsed)) {
-        runtime.configChain = parsed;
-        debugLog.info('TestHarness', `Config chain: ${parsed.join(' -> ')}`);
+      if (!Array.isArray(parsed) || parsed.some((entry) => typeof entry !== 'string' || !entry.trim())) {
+        throw new Error('configChain must be an array of non-empty strings');
       }
+      runtime.configChain = parsed;
+      debugLog.info('TestHarness', `Config chain: ${parsed.join(' -> ')}`);
     } catch (e) {
-      debugLog.warn('TestHarness', `Failed to parse configChain JSON: ${ (e).message}`);
+      throw new Error(`Failed to parse configChain URL parameter: ${e?.message}`);
     }
   }
@@ -168,7 +170,7 @@ export function createHttpShardLoader(baseUrl, manifest, log) {
           distributionConfig,
           algorithm,
           requiredEncoding,
-          expectedHash: shard.hash ?? null,
+          expectedHash: getExpectedShardHash(shard, algorithm) || null,
           expectedSize: Number.isFinite(shard.size) ? Math.floor(shard.size) : null,
           expectedManifestVersionSet: manifestVersionSet,
           writeToStore: false,

package/src/loader/doppler-loader.d.ts CHANGED Viewed

@@ -31,6 +31,7 @@ import type {
 } from './loader-types.js';
 import type { ShardCache } from './shard-cache.js';
 import type { LoadingConfigSchema } from '../config/schema/loading.schema.js';
+import type { LoaderDebugConfigSchema } from '../config/schema/debug.schema.js';
 // Re-export types for backward compatibility
 export type {
@@ -96,6 +97,8 @@ export declare class DopplerLoader {
   setQ4KConfig(config: Q4KConfig): void;
+  setLoaderDebugConfig(loaderDebug: LoaderDebugConfigSchema | null): void;
   setCustomShardLoader(loadShardFn: CustomShardLoader, options?: CustomShardLoaderOptions): void;
   setTensorsJsonUrl(url: string | null): void;

package/src/loader/doppler-loader.js CHANGED Viewed

@@ -11,7 +11,7 @@ import {
 } from '../storage/shard-manager.js';
 import { clearManifest, parseManifest, setManifest as setCurrentManifest } from '../formats/rdrr/index.js';
 import { initDevice, getDevice, getKernelCapabilities } from '../gpu/device.js';
-import { acquireBuffer, releaseBuffer, forceBufferPoolReclaim } from '../memory/buffer-pool.js';
+import { acquireBuffer, isBufferActive, releaseBuffer, forceBufferPoolReclaim } from '../memory/buffer-pool.js';
 import { getExpertCache } from './experts/expert-cache.js';
 import { formatBytes } from '../storage/quota.js';
 import { log, trace as debugTrace } from '../debug/index.js';
@@ -118,6 +118,7 @@ export class DopplerLoader {
   // Loading configuration
   #loadingConfig;
+  #loaderDebug = null;
   // Fused Q4_K matmul: skip dequantization for matmul weights, use fused kernel
@@ -167,6 +168,10 @@ export class DopplerLoader {
     }
   }
+  setLoaderDebugConfig(loaderDebug) {
+    this.#loaderDebug = loaderDebug ?? null;
+  }
   setQ4KConfig(config) {
     this.useFusedQ4K = config.useFusedQ4K;
@@ -701,6 +706,7 @@ export class DopplerLoader {
         useFusedQ4K: this.useFusedQ4K,
         keepF32Weights: this.keepF32Weights,
         q4kLayout: this.q4kLayout,
+        loaderDebug: this.#loaderDebug,
         gpuCapabilities: this.gpuCapabilities,
         allowF32UpcastNonMatmul,
       };
@@ -924,7 +930,14 @@ export class DopplerLoader {
     return this.layers.get(layerIdx) || null;
   }
+  /**
+   * Load a tensor by name. Public interface for extension loaders (e.g., vision).
+   */
+  async loadTensor(name, toGPU = true, silent = false) {
+    return this.#loadTensor(name, toGPU, silent);
+  }
   getConfig() {
     return  (this.manifest?.config) || {};
   }
@@ -968,7 +981,11 @@ export class DopplerLoader {
         : (isGpuBufferInstance(value) ? value : null);
       if (!gpuBuffer) return;
       try {
-        releaseBuffer(gpuBuffer);
+        if (isBufferActive(gpuBuffer)) {
+          releaseBuffer(gpuBuffer);
+        } else {
+          gpuBuffer.destroy();
+        }
       } catch {
         // Ignore already released/destroyed buffers.
       }

package/src/loader/experts/expert-cache.js CHANGED Viewed

@@ -1,6 +1,6 @@
-import { releaseBuffer } from '../../memory/buffer-pool.js';
+import { isBufferActive, releaseBuffer } from '../../memory/buffer-pool.js';
 import { log, trace } from '../../debug/index.js';
 import { getRuntimeConfig } from '../../config/runtime.js';
 import { isWeightBuffer } from '../../gpu/weight-buffer.js';
@@ -266,7 +266,11 @@ export class ExpertCache {
         : (isGpuBufferInstance(buf) ? buf : null);
       if (!gpuBuffer) continue;
       try {
-        releaseBuffer(gpuBuffer);
+        if (isBufferActive(gpuBuffer)) {
+          releaseBuffer(gpuBuffer);
+        } else {
+          gpuBuffer.destroy();
+        }
       } catch (e) {
         // Buffer may already be released
       }

package/src/loader/experts/expert-loader.js CHANGED Viewed

@@ -9,7 +9,7 @@ import { isWeightBuffer } from '../../gpu/weight-buffer.js';
 import { maybeDowncastToF16 } from '../weight-downcast.js';
 import { log, trace as debugTrace } from '../../debug/index.js';
 import { getRuntimeConfig } from '../../config/runtime.js';
-import { releaseBuffer } from '../../memory/buffer-pool.js';
+import { isBufferActive, releaseBuffer } from '../../memory/buffer-pool.js';
 // ============================================================================
 // Shard Preloading
@@ -283,7 +283,11 @@ function releasePackedLayerWeights(ctx, packed) {
     const gpuBuffer = getGpuBuffer(entry);
     if (!gpuBuffer) continue;
     try {
-      releaseBuffer(gpuBuffer);
+      if (isBufferActive(gpuBuffer)) {
+        releaseBuffer(gpuBuffer);
+      } else {
+        gpuBuffer.destroy();
+      }
       ctx.gpuBuffers?.delete?.(gpuBuffer);
     } catch {
       // Ignore already-released buffers.

package/src/loader/final-weights-loader.js CHANGED Viewed

@@ -36,6 +36,8 @@ function isLikelyFinalNormName(name) {
   return (
     lower === 'norm.weight' ||
     lower.includes('model.norm.weight') ||
+    lower.includes('language_model.norm.weight') ||
+    lower.includes('model.language_model.norm.weight') ||
     lower.includes('embedding_norm.weight') ||
     lower.includes('model.embedding_norm.weight') ||
     lower.includes('final_layernorm.weight') ||

package/src/loader/layer-loader.js CHANGED Viewed

@@ -1,8 +1,11 @@
 import { getKernelCapabilities } from '../gpu/device.js';
-import { isWeightBuffer } from '../gpu/weight-buffer.js';
+import { isWeightBuffer, createWeightBuffer, getWeightDtype } from '../gpu/weight-buffer.js';
+import { dequantize, dequantizeRowwise } from '../gpu/kernel-selector.js';
+import { releaseBuffer } from '../memory/buffer-pool.js';
 import { batchDowncastWeights } from './weight-downcast.js';
+import { QK_K } from './quantization-constants.js';
 import { trace as debugTrace } from '../debug/index.js';
 // ============================================================================
@@ -26,8 +29,8 @@ const ATTN_SUFFIXES = {
   kProj: ['self_attn.k_proj.weight', 'attention.wk.weight', 'attn_k.weight'],
   vProj: ['self_attn.v_proj.weight', 'attention.wv.weight', 'attn_v.weight'],
   oProj: ['self_attn.o_proj.weight', 'self_attn.out_proj.weight', 'attention.wo.weight', 'attn_output.weight'],
-  qNorm: ['self_attn.q_norm.weight', 'attn_q_norm.weight'],
-  kNorm: ['self_attn.k_norm.weight', 'attn_k_norm.weight'],
+  qNorm: ['self_attn.q_norm.weight', 'self_attn.q_layernorm.weight', 'attn_q_norm.weight'],
+  kNorm: ['self_attn.k_norm.weight', 'self_attn.k_layernorm.weight', 'attn_k_norm.weight'],
   postAttentionNorm: ['post_attention_layernorm.weight', 'post_attention_norm.weight', 'ffn_norm.weight'],
   preFeedforwardNorm: ['pre_feedforward_layernorm.weight'],
   postFeedforwardNorm: ['post_feedforward_layernorm.weight', 'post_ffw_norm.weight'],
@@ -415,4 +418,40 @@ async function downcastLayerWeights(ctx, weights, layerIdx) {
     },
     ctx.gpuBuffers
   );
+  await dequantConvQ4KWeights(ctx, weights, layerIdx);
+}
+const CONV_Q4K_DEQUANT_KEYS = ['convInProj', 'convOutProj', 'convKernel'];
+async function dequantConvQ4KWeights(ctx, weights, layerIdx) {
+  for (const key of CONV_Q4K_DEQUANT_KEYS) {
+    const buf = weights[key];
+    if (!buf || !isWeightBuffer(buf)) continue;
+    if (getWeightDtype(buf) !== 'q4k') continue;
+    const shape = buf.shape;
+    if (!Array.isArray(shape) || shape.length < 2) continue;
+    const is2D = shape.length === 2;
+    const totalElements = shape.reduce((a, b) => a * b, 1);
+    let dequantizedTensor;
+    const outputDtype = 'f32';
+    if (is2D && shape[1] % QK_K !== 0) {
+      dequantizedTensor = await dequantizeRowwise(buf.buffer, shape[0], shape[1], { outputDtype });
+    } else {
+      if (totalElements === 0 || totalElements % QK_K !== 0) continue;
+      const numBlocks = totalElements / QK_K;
+      dequantizedTensor = await dequantize(buf.buffer, numBlocks, { outputDtype });
+    }
+    releaseBuffer(buf.buffer);
+    const dequantizedBuffer = dequantizedTensor.buffer;
+    weights[key] = createWeightBuffer(dequantizedBuffer, outputDtype, 'row', shape, buf.label ?? key);
+    ctx.gpuBuffers.add(dequantizedBuffer);
+    debugTrace.loader(`Layer ${layerIdx} dequantized conv ${key} Q4K→${outputDtype.toUpperCase()}: [${shape.join(',')}]`);
+  }
 }

package/src/loader/manifest-config.js CHANGED Viewed

@@ -7,6 +7,8 @@ import { formatBytes } from '../storage/quota.js';
 import { log, trace as debugTrace } from '../debug/index.js';
 import { selectRuleValue } from '../rules/rule-registry.js';
+const STREAMABLE_DTYPES = new Set(['F16', 'F32', 'BF16']);
 // ============================================================================
 // Norm Weight Offset Detection
 // ============================================================================
@@ -102,7 +104,7 @@ export function shouldStreamLargeWeight(name, location, label, gpuCapabilities,
   if (estimate.bytes <= maxBytes) return false;
   // Check if dtype can be streamed (only float types)
-  const canStream = location.dtype === 'F16' || location.dtype === 'F32' || location.dtype === 'BF16';
+  const canStream = STREAMABLE_DTYPES.has(location.dtype);
   if (!canStream) {
     log.warn(
       'Loader',

package/src/loader/shard-cache.js CHANGED Viewed

@@ -5,6 +5,7 @@ import {
   computeHash,
   getStorageBackendType,
 } from '../storage/shard-manager.js';
+import { getExpectedShardHash } from '../formats/rdrr/index.js';
 import { formatBytes } from '../storage/quota.js';
 import { log, trace as debugTrace } from '../debug/index.js';
 import { getRuntimeConfig } from '../config/runtime.js';
@@ -484,11 +485,11 @@ export class ShardCache {
       // Verify hash if enabled
       if (this.#verifyHashes && this.#manifest) {
         const shardInfo = this.#manifest.shards?.[shardIndex];
-        const expectedHash = shardInfo?.hash;
+        const algorithm = shardInfo?.hashAlgorithm ?? this.#manifest.hashAlgorithm;
+        const expectedHash = getExpectedShardHash(shardInfo, algorithm);
         if (!expectedHash) {
           throw new Error(`Shard ${shardIndex} missing hash in manifest.`);
         }
-        const algorithm = shardInfo?.hashAlgorithm ?? this.#manifest.hashAlgorithm;
         if (!algorithm) {
           throw new Error(`Manifest missing hashAlgorithm for shard ${shardIndex}.`);
         }

package/src/loader/tensors/tensor-loader.d.ts CHANGED Viewed

@@ -12,10 +12,13 @@
 import type { WeightBuffer, WeightLayout } from '../../gpu/weight-buffer.js';
 import type { TensorLocation, KernelCapabilities } from '../loader-types.js';
+import type { LoaderDebugConfigSchema } from '../../config/schema/debug.schema.js';
 export interface TensorLoadConfig {
   /** Use fused Q4K matmul kernels */
   useFusedQ4K: boolean;
+  /** Debug controls for Q4K loading/dequantization */
+  loaderDebug?: LoaderDebugConfigSchema | null;
   /** Keep weights as F32 (disable F16 downcasting) */
   keepF32Weights: boolean;
   /** Allow F16->F32 upcast for non-matmul weights */