@simulatte/doppler 0.1.7 → 0.1.9

This diff compares publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (172)
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -117,7 +117,10 @@ function isLikelyEmbeddingGemma(rawConfig, architectureHint) {
 
 export function inferSourceWeightQuantization(tensors) {
   if (!Array.isArray(tensors) || tensors.length === 0) {
-    return 'f16';
+    throw new Error(
+      'Cannot infer source weight quantization: no tensors provided. ' +
+      'Set converterConfig.quantization.weights explicitly.'
+    );
   }
   const weightTensors = [];
   for (const tensor of tensors) {
@@ -128,7 +131,12 @@ export function inferSourceWeightQuantization(tensors) {
     weightTensors.push({ name, dtype });
   }
   const dtypes = new Set(weightTensors.map((tensor) => tensor.dtype));
-  if (dtypes.size === 0) return 'f16';
+  if (dtypes.size === 0) {
+    throw new Error(
+      'Cannot infer source weight quantization: no recognizable weight dtypes found. ' +
+      'Set converterConfig.quantization.weights explicitly.'
+    );
+  }
   if (dtypes.size > 1) {
     const detail = Array.from(dtypes)
       .sort()
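Both silent `'f16'` fallbacks are now hard errors, which turns a mis-detected source dtype into a loud conversion failure instead of a quietly wrong manifest. A minimal sketch of the new contract (the tensor listing is illustrative, not the package's documented API):

```js
// Hypothetical caller input, e.g. built from safetensors headers.
const tensors = [
  { name: 'model.layers.0.self_attn.q_proj.weight', dtype: 'BF16' },
  { name: 'model.layers.0.mlp.gate_proj.weight', dtype: 'BF16' },
];

inferSourceWeightQuantization(tensors); // → a single inferred tag, e.g. 'bf16'

try {
  inferSourceWeightQuantization([]); // used to return 'f16' silently
} catch (err) {
  // New behavior: set converterConfig.quantization.weights explicitly instead.
  console.error(err.message);
}
```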
@@ -465,7 +473,7 @@ export function resolveConversionPlan(options) {
   // role dtypes should not change kernel-path selection when explicit compute precision is targeted.
   const embedDtypeRaw = normalizeWeightDtype(findTensorDtypeByRole(tensors, 'embedding'));
   const lmHeadDtypeRaw = normalizeWeightDtype(findTensorDtypeByRole(tensors, 'lm_head'));
-  const hasVision = hasAnyTensorPattern(tensors, ['vision_', 'vision_tower', 'vision_model', 'image_encoder']);
+  const hasVision = hasAnyTensorPattern(tensors, ['vision_', 'vision_tower', 'vision_model', 'image_encoder', 'visual.']);
   const hasAudio = hasAnyTensorPattern(tensors, ['audio_', 'audio_encoder', 'whisper', 'wav2vec']);
   const hasProjector = hasAnyTensorPattern(tensors, ['multi_modal_projector', 'mm_projector', 'projector']);
   const quantizationInfo = buildQuantizationInfo(
@@ -114,6 +114,15 @@ export function resolveTensorTargetQuant(tensorName, fallbackQuant, quantization
     const headQuant = quantizationInfo.lmHead ?? quantizationInfo.embeddings ?? fallback;
     return normalizeStorageQuant(headQuant) ?? fallback;
   }
+  if (role === 'vision') {
+    return normalizeStorageQuant(quantizationInfo.vision ?? fallback) ?? fallback;
+  }
+  if (role === 'projector') {
+    return normalizeStorageQuant(quantizationInfo.projector ?? fallback) ?? fallback;
+  }
+  if (role === 'audio') {
+    return normalizeStorageQuant(quantizationInfo.audio ?? fallback) ?? fallback;
+  }
   return normalizeStorageQuant(quantizationInfo.weights ?? fallback) ?? fallback;
 }
 
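The new branches let a conversion plan pin multimodal towers to a different storage precision than the transformer weights. A sketch of how the cascade resolves (quant tags and tensor names are illustrative; `role` comes from `classifyTensorRole`):

```js
const quantizationInfo = {
  weights: 'q4_k_m', // default for transformer blocks
  vision: 'f16',     // keep the vision tower unquantized
  projector: 'f16',
  // no 'audio' key: audio tensors fall through to `weights`
};

resolveTensorTargetQuant('visual.blocks.0.attn.qkv.weight', 'f16', quantizationInfo);
// → 'f16' (role 'vision' hits the new branch)

resolveTensorTargetQuant('model.layers.0.mlp.up_proj.weight', 'f16', quantizationInfo);
// → 'q4_k_m' (default weights path, unchanged)
```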
@@ -819,11 +828,11 @@ export function extractArchitecture(config, ggufConfig) {
     vocabSize,
     maxSeqLen,
     ropeTheta,
-    linearNumKeyHeads: linearNumKeyHeads ?? undefined,
-    linearNumValueHeads: linearNumValueHeads ?? undefined,
-    linearKeyHeadDim: linearKeyHeadDim ?? undefined,
-    linearValueHeadDim: linearValueHeadDim ?? undefined,
-    linearConvKernelDim: linearConvKernelDim ?? undefined,
+    linearNumKeyHeads,
+    linearNumValueHeads,
+    linearKeyHeadDim,
+    linearValueHeadDim,
+    linearConvKernelDim,
     linearNormMode,
   };
 }
@@ -983,6 +992,7 @@ export function createManifest(
     isDiffusion ? 'diffusion' : extractArchitecture(model.config, model.ggufConfig)
   );
   const rawConfig = model.config || {};
+  const generationConfig = model.generationConfig ?? null;
   const resolvedArchitecture = isDiffusion
     ? architecture
     : resolveIntermediateSizeFromTensors(architecture, model, tensorLocations, rawConfig, modelId);
@@ -1037,6 +1047,7 @@ export function createManifest(
     ? null
     : resolveEosTokenId({
         config: rawConfig,
+        generationConfig,
        tokenizer: model.tokenizer ?? model.tokenizerConfig ?? null,
        tokenizerJson: model.tokenizerJson ?? null,
      });
@@ -1054,7 +1065,7 @@ export function createManifest(
    modelId,
    modelType: resolvedModelType,
    quantization: resolvedQuantization,
-    quantizationInfo: options.quantizationInfo ?? undefined,
+    quantizationInfo: options.quantizationInfo,
    architecture: resolvedArchitecture,
    moeConfig,
    inference,
@@ -1063,8 +1074,8 @@ export function createManifest(
    totalSize: shards.reduce((sum, s) => sum + s.size, 0),
    hashAlgorithm,
    eos_token_id: eosTokenId,
-    config: isDiffusion ? rawConfig : undefined,
-    conversion: options.conversionInfo ?? undefined,
+    config: isDiffusion ? rawConfig : (rawConfig.vision_config ? { vision_config: rawConfig.vision_config } : undefined),
+    conversion: options.conversionInfo,
    metadata: {
      source,
      convertedAt: resolveConvertedAt(
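Manifests for non-diffusion models previously carried no `config` at all; they now embed just the `vision_config` subtree when the source checkpoint has one, so the browser runtime can size its vision encoder without shipping the full HF config. An illustrative fragment of the resulting manifest (field values are made up for the example):

```js
const manifestFragment = {
  modelType: 'qwen3_vl',
  eos_token_id: 151645, // can now come from generation_config.json
  config: {
    vision_config: {
      hidden_size: 1152,       // illustrative values, not a real checkpoint
      patch_size: 14,
      num_attention_heads: 16,
    },
  },
};
```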
@@ -240,16 +240,6 @@ function detectAttentionOutputGate(presetInference, modelConfig, defaults) {
     return modelConfig.attn_output_gate;
   }
 
-  const modelType = normalizeLayerTypeName(modelConfig?.model_type);
-  const hasLinearAttentionLayers = Array.isArray(modelConfig?.layer_types)
-    && modelConfig.layer_types.some((entry) => normalizeCustomLayerType(entry) === 'linear_attention');
-  if (
-    hasLinearAttentionLayers
-    && (modelType === 'qwen2' || modelType === 'qwen3_5' || modelType === 'qwen3_5_text')
-  ) {
-    return true;
-  }
-
   return defaults.attention.attentionOutputGate;
 }
 
@@ -259,21 +249,18 @@ function resolveQueryPreAttnScalar(preset, modelConfig, headDim) {
     return explicit;
   }
 
-  const modelType = normalizeLayerTypeName(modelConfig?.model_type);
-  const presetId = normalizeLayerTypeName(preset?.id);
-  if (modelType.startsWith('qwen') || presetId === 'qwen3') {
-    return headDim;
+  // Standard attention scaling: attnScale = 1/sqrt(queryPreAttnScalar).
+  // For standard transformers queryPreAttnScalar = headDim, giving 1/sqrt(headDim).
+  // Preset may override for non-standard models.
+  const presetScalar = Number(preset?.inference?.attention?.queryPreAttnScalar);
+  if (Number.isFinite(presetScalar) && presetScalar > 0) {
+    return presetScalar;
   }
 
-  return Math.sqrt(headDim);
+  return headDim;
 }
 
 function detectRmsNormWeightOffset(presetInference, modelConfig, defaults) {
-  const modelType = normalizeLayerTypeName(modelConfig?.model_type);
-  if (modelType === 'qwen3_5' || modelType === 'qwen3_5_text') {
-    return true;
-  }
-
   if (typeof presetInference?.normalization?.rmsNormWeightOffset === 'boolean') {
     return presetInference.normalization.rmsNormWeightOffset;
   }
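The deleted branches hard-coded Qwen model types; the rewrite derives everything from the preset plus one convention. Note that the old non-Qwen fallback returned `Math.sqrt(headDim)`, which the `attnScale = 1/sqrt(queryPreAttnScalar)` convention quoted in the new comment turns into headDim^(-1/4) rather than the standard 1/sqrt(headDim). A worked check of the new default (headDim = 128 is just an example):

```js
const headDim = 128;

// New default: queryPreAttnScalar = headDim
const attnScale = 1 / Math.sqrt(headDim);            // ≈ 0.0884, standard scaling

// Old non-Qwen fallback: queryPreAttnScalar = Math.sqrt(headDim)
const oldScale = 1 / Math.sqrt(Math.sqrt(headDim));  // ≈ 0.297, i.e. headDim^(-1/4)
```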
@@ -385,8 +372,8 @@ export function buildManifestInference(preset, config, headDim = 64, quantizatio
       queryPreAttnScalar: resolveQueryPreAttnScalar(preset, modelConfig, headDim),
       attnLogitSoftcapping: presetInference.attention?.attnLogitSoftcapping ??
         modelConfig.attn_logit_softcapping ?? defaults.attention.attnLogitSoftcapping,
-      slidingWindow: presetInference.attention?.slidingWindow ??
-        modelConfig.sliding_window ?? defaults.attention.slidingWindow,
+      slidingWindow: modelConfig.sliding_window ??
+        presetInference.attention?.slidingWindow ?? defaults.attention.slidingWindow,
       queryKeyNorm: presetInference.attention?.queryKeyNorm ?? defaults.attention.queryKeyNorm,
       attentionOutputGate: detectAttentionOutputGate(presetInference, modelConfig, defaults),
       causal: detectedCausalAttention ?? presetInference.attention?.causal ?? defaults.attention.causal,
@@ -459,6 +446,9 @@ export function buildManifestInference(preset, config, headDim = 64, quantizatio
     );
   }
   globalPattern = null;
+  // Default offset 0 means first global layer at index 0 (most common pattern).
+  // This is the every_n pattern default, distinct from layerPattern.offset=null
+  // which means "not applicable" in the schema.
   offset = (
     detectEveryNOffsetFromLayerTypes(modelConfig.layer_types, period)
     ?? normalizeEveryNOffset(presetPattern.offset, period)
@@ -7,6 +7,9 @@ export async function parseTransformerModel(adapter) {
   } = adapter;
 
   const config = await readJson('config.json', 'config.json');
+  const generationConfig = await fileExists('generation_config.json')
+    ? await readJson('generation_config.json', 'generation_config.json')
+    : null;
   const architectureHint = config.architectures?.[0] ?? config.model_type ?? '';
 
   let tensors = null;
@@ -19,6 +22,7 @@ export async function parseTransformerModel(adapter) {
 
   return {
     config,
+    generationConfig,
     tensors,
     architectureHint,
   };
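`generation_config.json` is where current Hugging Face checkpoints usually declare their stop tokens, so parsing it (when present) exposes EOS values that never appear in `config.json`. A trimmed sketch of what the new field carries (the token IDs are illustrative):

```js
// generationConfig is null when the file is absent.
const { config, generationConfig } = await parseTransformerModel(adapter);
// generationConfig ≈ { eos_token_id: [151645, 151643], pad_token_id: 151643 }
```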
@@ -2,6 +2,10 @@
 import { DEFAULT_QUANTIZATION_DEFAULTS, DEFAULT_Q4K_LAYOUT } from '../config/index.js';
 import { classifyTensorRole } from '../formats/rdrr/index.js';
 
+// Default quantization tag when no explicit dtype is provided.
+// F16 is the canonical unquantized storage format for WebGPU inference.
+const DEFAULT_QUANT_TAG = 'f16';
+
 // Quantization tag aliases mapped to canonical names.
 // Add new aliases here rather than adding if/else branches.
 const QUANT_TAG_ALIASES = {
@@ -47,7 +51,7 @@ const QUANT_TAG_ALIASES = {
 };
 
 export function normalizeQuantTag(value) {
-  if (!value) return 'f16';
+  if (!value) return DEFAULT_QUANT_TAG;
   const lower = value.toLowerCase();
   return QUANT_TAG_ALIASES[lower] ?? lower;
 }
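Routing the default through `DEFAULT_QUANT_TAG` makes the fallback greppable and single-sourced. Expected behavior, sketched (the canonical names depend on the alias table, which is outside this hunk, so treat the third line as an assumption):

```js
normalizeQuantTag(undefined); // 'f16' — DEFAULT_QUANT_TAG
normalizeQuantTag('F16');     // 'f16' — lowercased, no alias needed
normalizeQuantTag('q4_k_m');  // whatever QUANT_TAG_ALIASES canonicalizes it to
normalizeQuantTag('custom8'); // unknown tags pass through lowercased
```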
@@ -73,6 +73,11 @@ export declare function dequantizeQ4KM(
   shape: number[]
 ): Float32Array;
 
+export declare function dequantizeQ4KMRowWise(
+  quantized: Uint8Array,
+  shape: [number, number]
+): Float32Array;
+
 export declare function calculateQuantizationError(
   original: Float32Array,
   reconstructed: Float32Array
@@ -74,9 +74,10 @@ function findMinMax(data, offset, length) {
   return { min, max };
 }
 
-export function quantizeQ4KBlock(data, offset) {
+function quantizeQ4KBlockWithValidLength(data, offset, validLength = QK_K) {
   const block = new Uint8Array(QK4_K_BLOCK_SIZE);
   const blockView = new DataView(block.buffer);
+  const clampedValidLength = Math.max(0, Math.min(QK_K, Math.trunc(validLength)));
 
   const scales = new Float32Array(8);
   const minOffsets = new Float32Array(8);
@@ -84,14 +85,22 @@ export function quantizeQ4KBlock(data, offset) {
 
   for (let sb = 0; sb < 8; sb++) {
     const sbOffset = offset + sb * 32;
-    const { min, max } = findMinMax(data, sbOffset, 32);
+    const subblockStart = sb * 32;
+    const validInSubblock = Math.max(0, Math.min(32, clampedValidLength - subblockStart));
+    if (validInSubblock === 0) {
+      scales[sb] = 0;
+      minOffsets[sb] = 0;
+      continue;
+    }
+
+    const { min, max } = findMinMax(data, sbOffset, validInSubblock);
 
     minOffsets[sb] = -min;
     const range = max - min;
     scales[sb] = range > 0 ? range / 15 : 0;
 
     const invScale = scales[sb] > 0 ? 1 / scales[sb] : 0;
-    for (let i = 0; i < 32; i++) {
+    for (let i = 0; i < validInSubblock; i++) {
       const val = data[sbOffset + i];
       let q = Math.round((val - min) * invScale);
       q = Math.max(0, Math.min(15, q));
@@ -155,6 +164,10 @@ export function quantizeQ4KBlock(data, offset) {
   return block;
 }
 
+export function quantizeQ4KBlock(data, offset) {
+  return quantizeQ4KBlockWithValidLength(data, offset, QK_K);
+}
+
 function dequantizeQ4KBlock(block) {
   const blockView = new DataView(block.buffer, block.byteOffset);
   const result = new Float32Array(256);
@@ -245,22 +258,16 @@ export function quantizeToQ4KMRowWise(data, shape) {
   }
 
   const blocksPerRow = Math.ceil(cols / QK_K);
-  const paddedColsPerRow = blocksPerRow * QK_K;
   const totalBlocks = rows * blocksPerRow;
 
   const quantized = new Uint8Array(totalBlocks * QK4_K_BLOCK_SIZE);
 
   for (let row = 0; row < rows; row++) {
-    // Extract and pad this row
-    const rowData = new Float32Array(paddedColsPerRow);
-    const srcOffset = row * cols;
-    for (let c = 0; c < cols; c++) {
-      rowData[c] = data[srcOffset + c];
-    }
-
     // Quantize each block in this row
     for (let b = 0; b < blocksPerRow; b++) {
-      const block = quantizeQ4KBlock(rowData, b * QK_K);
+      const validLength = Math.max(0, Math.min(QK_K, cols - b * QK_K));
+      const srcOffset = row * cols + b * QK_K;
+      const block = quantizeQ4KBlockWithValidLength(data, srcOffset, validLength);
       const dstOffset = (row * blocksPerRow + b) * QK4_K_BLOCK_SIZE;
       quantized.set(block, dstOffset);
    }
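The old path copied each row into a zero-padded scratch buffer, so a ragged final block computed its sub-block scales over fabricated zeros, dragging `min` toward 0 and wasting quantization range; quantizing in place with `validLength` removes both the copy and the skew. A sketch of where the tail case shows up (dimensions are illustrative):

```js
// QK_K = 256 values per Q4_K block: a 300-column row spans
// ceil(300 / 256) = 2 blocks, and the second holds only 44 real values.
const rows = 4;
const cols = 300;
const data = new Float32Array(rows * cols).map(() => Math.random() * 2 - 1);

const packed = quantizeToQ4KMRowWise(data, [rows, cols]);
// The tail block's sub-block scales now come from the 44 real values
// (fully-empty sub-blocks get scale 0) instead of 212 padded zeros.
```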
@@ -348,6 +355,21 @@ export function dequantizeQ4KM(quantized, numBlocks, shape) {
   return result;
 }
 
+export function dequantizeQ4KMRowWise(quantized, shape) {
+  const [rows, cols] = shape;
+  const blocksPerRow = Math.ceil(cols / QK_K);
+  const result = new Float32Array(rows * cols);
+
+  for (let row = 0; row < rows; row++) {
+    const rowOffset = row * blocksPerRow * QK4_K_BLOCK_SIZE;
+    const rowBytes = quantized.slice(rowOffset, rowOffset + (blocksPerRow * QK4_K_BLOCK_SIZE));
+    const rowDequantized = dequantizeQ4KM(rowBytes, blocksPerRow, [1, cols]);
+    result.set(rowDequantized, row * cols);
+  }
+
+  return result;
+}
+
 export function calculateQuantizationError(original, reconstructed) {
   if (original.length !== reconstructed.length) {
     throw new Error('Length mismatch');
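Paired with `quantizeToQ4KMRowWise`, the new dequantizer makes a round-trip regression check cheap, which is exactly what the tail-handling change above needs. A sketch (the return shape of `calculateQuantizationError` is defined in quantizer.js; the logging here is illustrative):

```js
const shape = [2, 300]; // ragged on purpose: 300 is not a multiple of QK_K
const original = new Float32Array(2 * 300).map(() => Math.random());

const packed = quantizeToQ4KMRowWise(original, shape);
const restored = dequantizeQ4KMRowWise(packed, shape);

// Error should be small and, crucially, no longer inflated in tail blocks.
console.log(calculateQuantizationError(original, restored));
```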
@@ -1,3 +1,5 @@
+import { DEFAULT_MANIFEST_INFERENCE } from '../config/schema/index.js';
+
 function asObject(value) {
   if (value == null || typeof value !== 'object' || Array.isArray(value)) {
     return null;
@@ -50,7 +52,7 @@ function resolveScalingConfig(ropeScalingConfig, options = {}) {
   }
   return {
     ropeScalingType: null,
-    ropeScalingFactor: 1.0,
+    ropeScalingFactor: DEFAULT_MANIFEST_INFERENCE.rope.ropeScalingFactor,
     yarnBetaFast: null,
     yarnBetaSlow: null,
     yarnOriginalMaxPos: null,
@@ -58,7 +60,7 @@ function resolveScalingConfig(ropeScalingConfig, options = {}) {
   }
 
   let ropeScalingType = scalingType;
-  let ropeScalingFactor = 1.0;
+  let ropeScalingFactor = DEFAULT_MANIFEST_INFERENCE.rope.ropeScalingFactor;
   let yarnBetaFast = null;
   let yarnBetaSlow = null;
   let yarnOriginalMaxPos = null;
@@ -110,7 +112,7 @@ function hasScalingDirective(ropeScalingConfig) {
 function hasMeaningfulScalingConfig(resolvedScaling) {
   if (!resolvedScaling) return false;
   return resolvedScaling.ropeScalingType != null
-    || resolvedScaling.ropeScalingFactor !== 1.0
+    || resolvedScaling.ropeScalingFactor !== DEFAULT_MANIFEST_INFERENCE.rope.ropeScalingFactor
     || resolvedScaling.yarnBetaFast != null
     || resolvedScaling.yarnBetaSlow != null
    || resolvedScaling.yarnOriginalMaxPos != null;
@@ -159,7 +161,7 @@ export function buildRoPEConfig(presetInference, config) {
      ?? null,
    ropeScalingFactor: presetRoPE.ropeScalingFactor
      ?? presetAttn?.ropeScalingFactor // Deprecated location
-      ?? 1.0,
+      ?? DEFAULT_MANIFEST_INFERENCE.rope.ropeScalingFactor,
    yarnBetaFast: presetRoPE.yarnBetaFast ?? null,
    yarnBetaSlow: presetRoPE.yarnBetaSlow ?? null,
    yarnOriginalMaxPos: presetRoPE.yarnOriginalMaxPos ?? null,
@@ -223,7 +225,7 @@ export function buildRoPEConfig(presetInference, config) {
    ?? asFiniteNumber(flatRoPEParameters?.rope_theta)
    ?? asFiniteNumber(config.rope_theta)
    ?? presetInference.rope?.ropeTheta
-    ?? 10000;
+    ?? DEFAULT_MANIFEST_INFERENCE.rope.ropeTheta;
 
   // For Gemma 3, local sliding attention theta comes from rope_parameters.sliding_attention.
   const ropeLocalTheta = asFiniteNumber(slidingAttentionRoPE?.rope_theta)
@@ -232,7 +234,7 @@ export function buildRoPEConfig(presetInference, config) {
 
   const mropeInterleaved = asBoolean(flatRoPEParameters?.mrope_interleaved)
     ?? presetInference.rope?.mropeInterleaved
-    ?? false;
+    ?? DEFAULT_MANIFEST_INFERENCE.rope.mropeInterleaved;
   const mropeSection = asNumberArray(flatRoPEParameters?.mrope_section)
     ?? presetInference.rope?.mropeSection
     ?? null;
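Scattered literals (`1.0` three times, `10000`, `false`) now resolve through one schema constant, so converter and runtime defaults cannot drift apart. Judging only from the values this diff replaces, the relevant slice of `DEFAULT_MANIFEST_INFERENCE` presumably looks like the following reconstruction (not the actual schema source):

```js
// ../config/schema/index.js — reconstructed slice, inferred from the
// replaced literals above; fields beyond `rope` are unknown here.
export const DEFAULT_MANIFEST_INFERENCE = {
  rope: {
    ropeScalingFactor: 1.0,
    ropeTheta: 10000,
    mropeInterleaved: false,
  },
};
```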
@@ -1,5 +1,6 @@
 export declare function resolveEosTokenId(options: {
   config?: Record<string, unknown> | null;
+  generationConfig?: Record<string, unknown> | null;
   tokenizer?: {
     eosTokenId?: number;
     eos_token_id?: number;
@@ -1,6 +1,8 @@
-export function resolveEosTokenId({ config, tokenizer, tokenizerJson }) {
+export function resolveEosTokenId({ config, generationConfig, tokenizer, tokenizerJson }) {
   const nestedTextConfig = getNestedTextConfig(config);
   const candidateSources = [
+    generationConfig?.eos_token_id,
+    generationConfig?.eos_token_ids,
     tokenizer?.eosTokenId,
     tokenizer?.eos_token_id,
     tokenizerJson?.specialTokens?.eos,
@@ -19,6 +21,7 @@ export function resolveEosTokenId({ config, tokenizer, tokenizerJson }) {
   }
 
   const eosTokenStringCandidates = [
+    generationConfig?.eos_token,
     tokenizer?.eosToken,
     tokenizer?.eos_token,
     tokenizerJson?.specialTokens?.eos_token,
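`generationConfig` sits at the head of both candidate lists, so a stop token declared only in `generation_config.json` now beats tokenizer metadata. A minimal sketch (token IDs are illustrative):

```js
const eosTokenId = resolveEosTokenId({
  config: {},                                   // config.json without eos_token_id
  generationConfig: { eos_token_id: [106, 1] }, // generation_config.json
  tokenizer: { eos_token_id: 1 },               // would have won before this change
  tokenizerJson: null,
});
// generationConfig?.eos_token_id is the first candidate consulted,
// so the generation-config value takes precedence.
```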
@@ -0,0 +1,268 @@
+#!/usr/bin/env python3
+"""
+Dump intermediate values from Qwen3.5 linear attention (GatedDeltaNet) for comparison with Doppler.
+
+Usage:
+    HF_HOME=/media/x/models/huggingface_cache python3 src/debug/reference/hf_qwen35_linear_attn_debug.py
+"""
+
+import os
+import torch
+import numpy as np
+
+os.environ.setdefault("HF_HOME", "/media/x/models/huggingface_cache")
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL_ID = "Qwen/Qwen3.5-0.8B"
+PROMPT = "Hello"
+
+
+def stats(name, tensor):
+    t = tensor.float().detach().flatten()
+    print(f" {name}: shape={list(tensor.shape)}, "
+          f"min={t.min().item():.6f}, max={t.max().item():.6f}, "
+          f"mean={t.mean().item():.6f}, absMax={t.abs().max().item():.6f}")
+    first8 = t[:8].tolist()
+    print(f" first8: {[f'{v:.6f}' for v in first8]}")
+
+
+def main():
+    print(f"Loading {MODEL_ID}...")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.float32)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    model.eval()
+
+    inputs = tokenizer(PROMPT, return_tensors="pt")
+    input_ids = inputs["input_ids"]
+    print(f"Prompt: '{PROMPT}', Token IDs: {input_ids[0].tolist()}")
+    num_tokens = input_ids.shape[1]
+
+    # Dump key weight values for layer 0
+    layer0 = model.model.layers[0]
+    attn = layer0.linear_attn
+
+    print(f"\n=== Layer 0 weights ===")
+    if hasattr(attn, 'A_log'):
+        a_log = attn.A_log.detach().float()
+        a_neg_exp = -torch.exp(a_log)
+        stats("A_log", a_log)
+        stats("a_neg_exp", a_neg_exp)
+    if hasattr(attn, 'dt_bias'):
+        stats("dt_bias", attn.dt_bias.detach().float())
+    stats("conv1d.weight", attn.conv1d.weight.detach().float())
+    stats("norm.weight", attn.norm.weight.detach().float())
+
+    # Hook into the linear_attn module to capture its input and output
+    captured = {}
+
+    def hook_linear_attn_input(module, args, kwargs):
+        if len(args) > 0:
+            captured['linear_attn_input'] = args[0].detach().clone()
+        return None
+
+    def hook_linear_attn_output(module, args, kwargs, output):
+        if isinstance(output, tuple):
+            captured['linear_attn_output'] = output[0].detach().clone()
+        else:
+            captured['linear_attn_output'] = output.detach().clone()
+        return None
+
+    # Hook into individual projection layers
+    def make_hook(name):
+        def hook(module, input, output):
+            captured[name] = output.detach().clone()
+        return hook
+
+    hooks = []
+    hooks.append(attn.register_forward_pre_hook(hook_linear_attn_input, with_kwargs=True))
+    hooks.append(attn.register_forward_hook(hook_linear_attn_output, with_kwargs=True))
+    hooks.append(attn.in_proj_qkv.register_forward_hook(make_hook('qkv_proj')))
+    hooks.append(attn.in_proj_z.register_forward_hook(make_hook('z_proj')))
+    hooks.append(attn.in_proj_a.register_forward_hook(make_hook('a_proj')))
+    hooks.append(attn.in_proj_b.register_forward_hook(make_hook('b_proj')))
+    hooks.append(attn.out_proj.register_forward_hook(make_hook('out_proj')))
+    hooks.append(attn.conv1d.register_forward_hook(make_hook('conv1d_raw')))
+    hooks.append(attn.norm.register_forward_hook(make_hook('gated_norm')))
+
+    # Also hook input_layernorm
+    hooks.append(layer0.input_layernorm.register_forward_hook(make_hook('input_layernorm')))
+
+    print(f"\n=== Running forward pass ===")
+    with torch.no_grad():
+        outputs = model(input_ids, output_hidden_states=True)
+
+    # Remove hooks
+    for h in hooks:
+        h.remove()
+
+    print(f"\n=== Captured intermediates ===")
+    for name in ['input_layernorm', 'qkv_proj', 'z_proj', 'a_proj', 'b_proj',
+                 'conv1d_raw', 'gated_norm', 'linear_attn_input', 'linear_attn_output', 'out_proj']:
+        if name in captured:
+            stats(name, captured[name])
+        else:
+            print(f" {name}: NOT CAPTURED")
+
+    # Hidden states per layer
+    print(f"\n=== Hidden states per layer (last token) ===")
+    for i in range(min(6, len(outputs.hidden_states) - 1)):
+        hs = outputs.hidden_states[i + 1]
+        t = hs[0, -1]  # last token
+        vals = t[:8].tolist()
+        max_abs = t.abs().max().item()
+        mean_abs = t.abs().mean().item()
+        layer_type = type(model.model.layers[i]).__name__
+        attn_type = "linear" if hasattr(model.model.layers[i], 'linear_attn') else "full"
+        print(f" Layer {i} ({attn_type}): first8={[f'{v:.4f}' for v in vals]}, "
+              f"maxAbs={max_abs:.4f}, meanAbs={mean_abs:.4f}")
+
+    # Logits
+    logits = outputs.logits[0, -1]
+    top5 = torch.topk(logits, 5)
+    print(f"\nTop-5 logits: {[(tokenizer.decode([idx.item()]), f'{val.item():.2f}') for val, idx in zip(top5.values, top5.indices)]}")
+
+    # Also trace through the linear attention manually to compare with Doppler's kernel
+    print(f"\n=== Manual linear attention trace (layer 0) ===")
+    with torch.no_grad():
+        embed = model.model.embed_tokens(input_ids)
+        normed = layer0.input_layernorm(embed)
+        stats("normed_input", normed)
+
+        qkv = attn.in_proj_qkv(normed)
+        stats("qkv", qkv)
+
+        # The HF Qwen3.5 GatedDeltaNet does conv1d on the QKV, then applies SiLU
+        # The conv1d expects [batch, channels, seq_len] format
+        qkv_t = qkv.transpose(1, 2)  # [1, 6144, 1]
+
+        # Use the conv1d module directly (it has padding configured)
+        conv_raw = attn.conv1d(qkv_t)
+        stats("conv_raw (from module)", conv_raw.transpose(1, 2))
+
+        # Truncate to seq_len (causal conv padding)
+        conv_causal = conv_raw[..., :num_tokens]
+        stats("conv_causal (truncated)", conv_causal.transpose(1, 2))
+
+        # Apply SiLU
+        conv_silu = torch.nn.functional.silu(conv_causal)
+        stats("conv_silu", conv_silu.transpose(1, 2))
+
+        # Split Q, K, V
+        conv_out = conv_silu.transpose(1, 2)  # [1, seq_len, 6144]
+        num_k_heads = 16
+        head_k_dim = 128
+        head_v_dim = 128
+        num_v_heads = 16
+        q_size = num_k_heads * head_k_dim  # 2048
+        k_size = q_size
+        v_size = num_v_heads * head_v_dim  # 2048
+
+        q = conv_out[..., :q_size]
+        k = conv_out[..., q_size:q_size + k_size]
+        v = conv_out[..., q_size + k_size:]
+        stats("Q (raw)", q)
+        stats("K (raw)", k)
+        stats("V (raw)", v)
+
+        # Reshape for per-head processing
+        # Q and K: [batch, seq, num_k_heads, head_k_dim]
+        q_heads = q.view(1, num_tokens, num_k_heads, head_k_dim)
+        k_heads = k.view(1, num_tokens, num_k_heads, head_k_dim)
+        v_heads = v.view(1, num_tokens, num_v_heads, head_v_dim)
+
+        # L2 normalize Q and K
+        eps = 1e-6
+        q_norm = torch.nn.functional.normalize(q_heads, p=2, dim=-1, eps=eps)
+        k_norm = torch.nn.functional.normalize(k_heads, p=2, dim=-1, eps=eps)
+
+        # Scale Q by 1/sqrt(head_k_dim)
+        head_scale = 1.0 / (head_k_dim ** 0.5)
+        q_scaled = q_norm * head_scale
+
+        stats("Q_normed_scaled (per-head)", q_scaled.reshape(1, num_tokens, -1))
+        stats("K_normed (per-head)", k_norm.reshape(1, num_tokens, -1))
+
+        # Projections for gating
+        z = attn.in_proj_z(normed)
+        a_out = attn.in_proj_a(normed)
+        b_out = attn.in_proj_b(normed)
+        stats("z", z)
+        stats("a", a_out)
+        stats("b", b_out)
+
+        # Compute gating values
+        a_log = attn.A_log.detach().float()
+        a_neg_exp = -torch.exp(a_log)
+        dt_bias = attn.dt_bias.detach().float()
+
+        softplus_input = a_out.squeeze(0).squeeze(0) + dt_bias
+        softplus_val = torch.nn.functional.softplus(softplus_input)
+        g = a_neg_exp * softplus_val
+        g_exp = torch.exp(g)
+        beta = torch.sigmoid(b_out.squeeze(0).squeeze(0))
+
+        stats("softplus(a + dt_bias)", softplus_val.unsqueeze(0).unsqueeze(0))
+        stats("g (decay)", g.unsqueeze(0).unsqueeze(0))
+        stats("g_exp (decay factor)", g_exp.unsqueeze(0).unsqueeze(0))
+        stats("beta (sigmoid(b))", beta.unsqueeze(0).unsqueeze(0))
+
+        # Recurrent state update (for first token, state is all zeros)
+        # state[head, kd, vd] = state * g_exp + k[kd] * delta[vd]
+        # where delta[vd] = (v[vd] - (state^T @ k)[vd]) * beta
+        # For zero state: delta[vd] = v[vd] * beta, state = k ⊗ delta
+        state = torch.zeros(num_v_heads, head_k_dim, head_v_dim)
+
+        # Apply decay (no-op for zero state)
+        for head in range(num_v_heads):
+            state[head] *= g_exp[head].item()
+
+            k_head = k_norm[0, 0, head % num_k_heads]  # broadcast q_rep
+            v_head = v_heads[0, 0, head]
+
+            # kv_mem = state @ k
+            kv_mem = state[head].t() @ k_head  # [head_v_dim]
+
+            # delta = (v - kv_mem) * beta
+            delta = (v_head - kv_mem) * beta[head].item()
+
+            # state += outer(k, delta)
+            state[head] += torch.outer(k_head, delta)
+
+        # Output: out = state^T @ q
+        output_per_head = torch.zeros(1, num_tokens, num_v_heads, head_v_dim)
+        for head in range(num_v_heads):
+            q_head = q_scaled[0, 0, head % num_k_heads]
+            out_head = state[head].t() @ q_head  # [head_v_dim]
+            output_per_head[0, 0, head] = out_head
+
+        raw_out = output_per_head.reshape(1, num_tokens, num_v_heads * head_v_dim)
+        stats("Recurrent output (raw)", raw_out)
+
+        # RMS norm per head + SiLU gate
+        z_reshaped = z.view(1, num_tokens, num_v_heads, head_v_dim)
+        norm_weight = attn.norm.weight.detach().float()  # [head_v_dim] (shared mode)
+        rms_eps = 1e-6
+
+        for head in range(num_v_heads):
+            head_out = output_per_head[0, 0, head]  # [head_v_dim]
+            mean_sq = (head_out ** 2).mean()
+            inv_rms = 1.0 / torch.sqrt(mean_sq + rms_eps)
+            z_gate = torch.nn.functional.silu(z_reshaped[0, 0, head])
+            output_per_head[0, 0, head] = head_out * inv_rms * norm_weight * z_gate
+
+        gated_out = output_per_head.reshape(1, num_tokens, num_v_heads * head_v_dim)
+        stats("After RMSNorm + SiLU gate", gated_out)
+
+        # Output projection
+        o_result = torch.nn.functional.linear(gated_out, attn.out_proj.weight)
+        stats("After out_proj", o_result)
+
+        # Compare with captured output
+        if 'linear_attn_output' in captured:
+            diff = (o_result - captured['linear_attn_output']).abs()
+            print(f"\n Diff vs captured output: maxDiff={diff.max().item():.6f}")
+
+
+if __name__ == "__main__":
+    main()
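The manual trace above is the reference that Doppler's WebGPU kernel has to match. For readers mapping the PyTorch loop onto the JS/WGSL side, here is the same single-head, single-token gated delta-rule step in plain JavaScript (a sketch of the recurrence only; the function name and memory layout are illustrative, not Doppler's actual `linear-attention-core` API):

```js
// One gated delta-rule step for a single head.
// state: Float32Array(kDim * vDim), row-major [kd][vd].
// k, q: L2-normalized key/query (q pre-scaled by 1/sqrt(kDim)); v: value.
// gExp = exp(-exp(A_log) * softplus(a + dt_bias)); beta = sigmoid(b).
function deltaRuleStep(state, k, v, q, gExp, beta, kDim, vDim) {
  const out = new Float32Array(vDim);
  for (let vd = 0; vd < vDim; vd++) {
    let kvMem = 0;
    for (let kd = 0; kd < kDim; kd++) {
      state[kd * vDim + vd] *= gExp;           // decay old memory
      kvMem += state[kd * vDim + vd] * k[kd];  // (state^T @ k)[vd]
    }
    const delta = (v[vd] - kvMem) * beta;      // delta = (v - state^T k) * beta
    let acc = 0;
    for (let kd = 0; kd < kDim; kd++) {
      state[kd * vDim + vd] += k[kd] * delta;  // state += outer(k, delta)
      acc += state[kd * vDim + vd] * q[kd];    // out = state^T @ q
    }
    out[vd] = acc;
  }
  return out;
}
```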