@simulatte/doppler 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/package.json +21 -36
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-registry.json +1 -17
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +3 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +14 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +2 -0
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/storage.schema.js +1 -1
- package/src/converter/conversion-plan.js +10 -2
- package/src/converter/core.js +2 -0
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.js +19 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +6 -1
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/matmul-selection.js +47 -4
- package/src/gpu/kernels/matmul.d.ts +2 -0
- package/src/gpu/kernels/matmul.js +1 -1
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
- package/src/inference/pipelines/text/attention/projections.js +41 -11
- package/src/inference/pipelines/text/attention/record.js +15 -6
- package/src/inference/pipelines/text/attention/run.js +50 -6
- package/src/inference/pipelines/text/config.js +14 -0
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/generator-runtime.js +5 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +6 -0
- package/src/inference/pipelines/text/generator-steps.js +43 -15
- package/src/inference/pipelines/text/generator.js +50 -17
- package/src/inference/pipelines/text/init.d.ts +13 -0
- package/src/inference/pipelines/text/init.js +16 -5
- package/src/inference/pipelines/text/layer.js +1 -0
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/test-harness.js +2 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.js +6 -1
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.js +2 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +3 -0
- package/src/tooling/node-source-runtime.js +36 -0
- package/src/types/model.d.ts +5 -0
- package/tools/doppler-cli.js +6 -1
```diff
@@ -122,6 +122,20 @@ function resolveTokenText(tokenizer, tokenIds, fallbackText = '?', renderTokenTe
   return fallbackText;
 }
 
+export function shouldRetryWithFinitenessFallback(error) {
+  if (error?.name === 'FinitenessError') {
+    return true;
+  }
+  const message = typeof error?.message === 'string'
+    ? error.message
+    : (typeof error === 'string' ? error : '');
+  if (!message.startsWith('[Sampling]')) {
+    return false;
+  }
+  return message.includes('no finite candidate logits after masking the pad token')
+    || message.includes('Softmax produced no finite candidate probabilities');
+}
+
 export class PipelineGenerator {
 
   #state;
```
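The exported classifier above is the gate for every finiteness retry in the hunks that follow. A minimal sketch of how a caller composes it; `runPrefill` and `retryInF32` are hypothetical callbacks, and the import path simply mirrors this package's source layout:

```js
// Hypothetical caller; only shouldRetryWithFinitenessFallback is real.
import { shouldRetryWithFinitenessFallback } from './src/inference/pipelines/text/generator.js';

async function prefillWithFallback(runPrefill, retryInF32) {
  try {
    return await runPrefill();
  } catch (error) {
    if (!shouldRetryWithFinitenessFallback(error)) {
      throw error; // unrelated failures propagate unchanged
    }
    // Only a FinitenessError or one of the two "[Sampling] ..." messages lands here.
    return retryInF32();
  }
}
```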
```diff
@@ -351,7 +365,7 @@ export class PipelineGenerator {
     try {
       prefillLogits = await this._prefill(inputIds, opts);
     } catch (error) {
-      if (error
+      if (shouldRetryWithFinitenessFallback(error)) {
         log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefill. Retrying with F32 precision.`);
         prefillLogits = await this._retryWithFinitenessFallback(
           opts,
```
```diff
@@ -395,13 +409,34 @@
       log.debug('Pipeline', `After rep penalty top-5: ${topAfterPenalty.map(t => `"${t.text}"(${(t.prob * 100).toFixed(1)}%)`).join(', ')}`);
     }
 
-
-
-
-
-
-
-
+    let firstToken;
+    try {
+      firstToken = sample(prefillLogits, {
+        temperature: opts.temperature,
+        topP: opts.topP,
+        topK: opts.topK,
+        padTokenId,
+        seed: opts.seed,
+      });
+    } catch (error) {
+      if (!shouldRetryWithFinitenessFallback(error)) {
+        throw error;
+      }
+      log.warn('Pipeline', 'FinitenessGuard caught non-finite prefill logits at sampling. Retrying with F32 precision.');
+      prefillLogits = await this._retryWithFinitenessFallback(
+        opts,
+        'prefill-sample',
+        () => this._prefill(inputIds, opts)
+      );
+      applyRepetitionPenalty(prefillLogits, generatedIds, opts.repetitionPenalty);
+      firstToken = sample(prefillLogits, {
+        temperature: opts.temperature,
+        topP: opts.topP,
+        topK: opts.topK,
+        padTokenId,
+        seed: opts.seed,
+      });
+    }
 
     if (opts.debug) {
       const firstTokenText = resolveTokenText(this.#state.tokenizer, [firstToken], `[${firstToken}]`, (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false));
```
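`_retryWithFinitenessFallback` itself is not part of this excerpt; the call sites pass `(opts, stageLabel, rerun)` and expect fresh logits back. A standalone sketch of one plausible shape, assuming a mutable `state.forceF32` flag that downstream kernels consult; both the flag and the helper body are assumptions, not the package's actual code:

```js
// Plausible retry helper (assumed, not shown in this diff).
async function retryWithFinitenessFallback(state, stage, rerun) {
  const previous = state.forceF32;   // state.forceF32 is a hypothetical field
  state.forceF32 = true;             // force f32 so f16 overflow cannot reproduce NaN/Inf
  try {
    return await rerun();            // e.g. () => this._prefill(inputIds, opts)
  } finally {
    state.forceF32 = previous;       // restore the faster precision for later steps
  }
}
```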
```diff
@@ -479,7 +514,7 @@ export class PipelineGenerator {
     try {
       prefillResult = await this._prefillToHidden(inputIds, opts);
     } catch (error) {
-      if (error
+      if (shouldRetryWithFinitenessFallback(error)) {
         log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefillKVOnly. Retrying with F32 precision.`);
         prefillResult = await this._retryWithFinitenessFallback(
           opts,
```
```diff
@@ -544,7 +579,7 @@ export class PipelineGenerator {
     try {
       prefillResult = await this._prefillToHidden(inputIds, opts);
     } catch (error) {
-      if (error
+      if (shouldRetryWithFinitenessFallback(error)) {
         log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefillWithEmbedding. Retrying with F32 precision.`);
         prefillResult = await this._retryWithFinitenessFallback(
           opts,
```
```diff
@@ -833,7 +868,7 @@ export class PipelineGenerator {
     try {
       nextToken = await this._decodeStep(generatedIds, opts);
     } catch (singleTokenError) {
-      if (singleTokenError
+      if (shouldRetryWithFinitenessFallback(singleTokenError)) {
         log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at batch step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
         nextToken = await this._retryDecodeStepWithFinitenessWindow(
           generatedIds,
```
```diff
@@ -858,7 +893,7 @@ export class PipelineGenerator {
     try {
       nextToken = await this._decodeStep(generatedIds, opts);
     } catch (error) {
-      if (error
+      if (shouldRetryWithFinitenessFallback(error)) {
         log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
         nextToken = await this._retryDecodeStepWithFinitenessWindow(
           generatedIds,
```
```diff
@@ -918,11 +953,9 @@
       throw new Error('Embed buffer not found or not a supported buffer type');
     }
     const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
-    const embedDtype =
-      ?
-      :
-        ? embedBufferRaw.dtype
-        : null;
+    const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+      ? embedBufferRaw.dtype
+      : getWeightDtype(embedBufferRaw);
     if (opts.debug) {
       const embedSize = embedBuffer instanceof GPUBuffer ? embedBuffer.size : 'N/A';
       log.debug('Pipeline', `Embed buffer: type=${embedBuffer?.constructor?.name}, size=${embedSize}, dtype=${embedDtype}`);
```
```diff
@@ -190,6 +190,12 @@ export interface WeightLoadResult {
   layerRouterWeights: Map<number, RouterWeights>;
 }
 
+export interface ResolvedQ4KConfig {
+  useFusedQ4K: boolean;
+  q4kLayout: 'row' | 'col' | null;
+  keepF32Weights: boolean;
+}
+
 /** Options for loadWeights */
 export interface LoadWeightsOptions {
   storageContext?: PipelineStorageContext;
```
```diff
@@ -211,6 +217,13 @@ export function loadWeights(
   options?: LoadWeightsOptions
 ): Promise<WeightLoadResult>;
 
+export function resolveQ4KConfig(
+  manifest: Manifest,
+  kernelPath?: KernelPathSchema | null,
+  kernelPathSource?: KernelPathSource,
+  keepF32Weights?: boolean
+): ResolvedQ4KConfig;
+
 /**
  * Apply Gemma chat template to a prompt.
  */
```
```diff
@@ -11,7 +11,7 @@ import { getDopplerLoader } from '../../../loader/doppler-loader.js';
 import { log, setGPUDevice, trace as debugTrace } from '../../../debug/index.js';
 import { getRuntimeConfig } from '../../../config/runtime.js';
 import { PAGED_LAYOUT_SEQ_LEN_THRESHOLD } from '../../../config/schema/index.js';
-import { isKernelPathFusedQ4K } from '../../../config/kernel-path-loader.js';
+import { isKernelPathFusedQ4K, kernelPathRequiresF32MatmulWeights } from '../../../config/kernel-path-loader.js';
 import { createWeightBuffer, getWeightDtype, isWeightBuffer } from '../../../gpu/weight-buffer.js';
 import { selectRuleValue } from '../../../rules/rule-registry.js';
 import {
```
```diff
@@ -128,7 +128,7 @@ function createRemoteStorageContext(baseUrl, manifest) {
 }
 
 
-function resolveQ4KConfig(
+export function resolveQ4KConfig(
   manifest,
   kernelPath,
   kernelPathSource = 'none',
```
```diff
@@ -150,18 +150,23 @@ function resolveQ4KConfig(
     );
   }
   let useFused = kernelPath ? isKernelPathFusedQ4K(kernelPath) : hasSubgroups;
+  const kernelPathKeepsF32Weights = kernelPathRequiresF32MatmulWeights(kernelPath);
   if (q4kLayout === 'col') {
     useFused = false;
   }
+  const resolvedKeepF32Weights = keepF32Weights || kernelPathKeepsF32Weights;
 
   const pathLabel = kernelPath?.id ?? 'auto';
   const layoutLabel = q4kLayout ?? 'none';
-  debugTrace.loader(
+  debugTrace.loader(
+    `Q4K config: fused=${useFused}, kernelPath=${pathLabel}, source=${kernelPathSource}, ` +
+    `layout=${layoutLabel}, keepF32Weights=${resolvedKeepF32Weights}, subgroups=${hasSubgroups}`
+  );
 
   return {
     useFusedQ4K: useFused,
     q4kLayout,
-    keepF32Weights,
+    keepF32Weights: resolvedKeepF32Weights,
   };
 }
 
```
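Combined with the `.d.ts` declaration earlier, the now-exported resolver is called like this; `manifest` stands in for a parsed RDRR manifest and the argument values are placeholders:

```js
import { resolveQ4KConfig } from './src/inference/pipelines/text/init.js';

const manifest = {};     // a parsed RDRR manifest in real use
const kernelPath = null; // null lets the subgroup heuristic pick fused vs. dequant
const { useFusedQ4K, q4kLayout, keepF32Weights } =
  resolveQ4KConfig(manifest, kernelPath, 'none', false);
// keepF32Weights is now the OR of the caller's request and
// kernelPathRequiresF32MatmulWeights(kernelPath).
```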
```diff
@@ -502,6 +507,12 @@ export function createKVCache(modelConfig, useGPU, debug = false, runtimeConfig)
     cacheLayout = 'paged';
     layoutSource = 'threshold';
   }
+  if (forceContiguousKVCache && cacheLayout === 'paged') {
+    throw new Error(
+      'Paged KV cache layout is not supported for models with full-attention layers. ' +
+      'Set runtime.inference.kvcache.layout to "contiguous" instead.'
+    );
+  }
   if (debug && cacheLayout !== runtimeKV.layout) {
     log.debug('Pipeline', `KV cache layout override: ${runtimeKV.layout} -> ${cacheLayout} (${layoutSource})`);
   }
```
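The error text pins down the config key; a runtime config that satisfies the new guard would look like the following, where only `inference.kvcache.layout` is confirmed by the message and the surrounding shape is inferred:

```js
const runtimeConfig = {
  inference: {
    kvcache: {
      layout: 'contiguous', // required once layerTypes mixes in full-attention layers
    },
  },
};
```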
```diff
@@ -599,7 +610,7 @@ export function createKVCache(modelConfig, useGPU, debug = false, runtimeConfig)
 
   if (debug) {
     if (forceContiguousKVCache && modelConfig.layerTypes) {
-      log.debug('Pipeline', 'Layer pattern includes full-attention layers;
+      log.debug('Pipeline', 'Layer pattern includes full-attention layers; paged layout blocked, contiguous enforced.');
     }
     const isSliding = kvCache instanceof SlidingWindowKVCache;
     log.debug('Pipeline', `KV cache: type=${kvCache?.constructor?.name || 'unknown'}, kvDtype=${kvCache.kvDtype}, layout=${kvCache.layout}, maxSeqLen=${kvCache.maxSeqLen}, windowSize=${isSliding ? kvCache.windowSize : null}`);
```
```diff
@@ -276,6 +276,7 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
       : (ropeFreqsSin),
     kvCache: ((kvCache)),
     stats: context.stats,
+    debugProbes: context.debugProbes,
     linearRuntime: context.linearAttentionRuntime ?? null,
   };
 
```
```diff
@@ -84,6 +84,11 @@ export declare function inferLinearNormMode(
   }
 ): LinearNormMode | null;
 
+export declare function applyLinearNormWeightOffset(
+  values: Float32Array,
+  rmsNormWeightOffset: boolean
+): Float32Array;
+
 export declare function resetLinearAttentionRuntime(
   runtime: LinearAttentionRuntime | null | undefined
 ): LinearAttentionRuntime;
```
```diff
@@ -5,6 +5,8 @@ import { log } from '../../../debug/index.js';
 import { decodeReadback } from './debug-utils/index.js';
 import { runLinearAttentionCoreGPU } from '../../../gpu/kernels/linear-attention-core.js';
 import { runProbes } from './probes.js';
+import { QK_K, Q4K_BLOCK_BYTES } from '../../../config/schema/index.js';
+import { dequantizeQ4KM } from '../../../converter/quantizer.js';
 
 const LINEAR_RUNTIME_SCHEMA_VERSION = 1;
 const QK_L2NORM_EPS = 1e-6;
```
```diff
@@ -34,6 +36,15 @@ function bytesFromDtype(dtype) {
   return 4;
 }
 
+export function applyLinearNormWeightOffset(values, rmsNormWeightOffset) {
+  if (!(values instanceof Float32Array)) {
+    throw new Error('applyLinearNormWeightOffset requires Float32Array input.');
+  }
+  // Qwen linear-attention output norm uses direct weights even when surrounding
+  // transformer RMSNorm sites use the Gemma-style (1 + weight) formula.
+  return values;
+}
+
 function cloneLayerRuntimeState(layerState) {
   return {
     layerIdx: layerState.layerIdx,
```
```diff
@@ -283,9 +294,27 @@ async function readWeightAsF32(weight, expectedElements, label) {
   if (!elementCount && isWeightBuffer(weight) && Array.isArray(weight.shape) && weight.shape.length > 0) {
     elementCount = weight.shape.reduce((total, dim) => total * Math.max(1, Math.trunc(Number(dim) || 0)), 1);
   }
+  const isQ4K = sourceDtype === 'q4k' || sourceDtype === 'q4_k_m' || sourceDtype === 'q4_k';
   if (!elementCount) {
-
-
+    if (isQ4K) {
+      elementCount = Math.trunc(sourceBuffer.size / Q4K_BLOCK_BYTES) * QK_K;
+    } else {
+      const inferredBytes = sourceDtype === 'f16' || sourceDtype === 'bf16' ? 2 : 4;
+      elementCount = Math.trunc(sourceBuffer.size / inferredBytes);
+    }
+  }
+
+  if (isQ4K) {
+    const numBlocks = Math.ceil(elementCount / QK_K);
+    const q4kBytes = numBlocks * Q4K_BLOCK_BYTES;
+    const raw = await readBuffer(sourceBuffer, q4kBytes);
+    const decoded = dequantizeQ4KM(new Uint8Array(raw), numBlocks, [elementCount]);
+    if (expectedElements != null && decoded.length !== expectedElements) {
+      throw new Error(
+        `Weight "${label}" Q4K decoded length ${decoded.length}, expected ${expectedElements}.`
+      );
+    }
+    return decoded;
   }
 
   if (!sourceDtype) {
```
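The block arithmetic above follows GGML's Q4_K layout: `QK_K` elements per super-block serialized into `Q4K_BLOCK_BYTES`. A standalone mirror using the conventional values (256 elements, 144 bytes; the package imports its own constants from `config/schema`):

```js
const QK_K = 256;            // elements per Q4_K super-block (conventional GGML value)
const Q4K_BLOCK_BYTES = 144; // serialized size of one super-block

function q4kElementsForBytes(byteLength) {
  return Math.trunc(byteLength / Q4K_BLOCK_BYTES) * QK_K; // whole blocks only
}

function q4kBytesForElements(elementCount) {
  return Math.ceil(elementCount / QK_K) * Q4K_BLOCK_BYTES; // round up to a full block
}

console.log(q4kBytesForElements(2048)); // 8 blocks -> 1152 bytes
console.log(q4kElementsForBytes(1152)); // 8 blocks -> 2048 elements
```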
```diff
@@ -454,6 +483,7 @@ async function createLayerRuntimeState(
     expectedNormElements,
     `L${layerIdx}.linear_attn.norm.weight`
   );
+  const runtimeNorm = applyLinearNormWeightOffset(norm, config.rmsNormWeightOffset === true);
 
   const aNegExp = new Float32Array(aLog.length);
   for (let i = 0; i < aLog.length; i++) {
```
```diff
@@ -490,7 +520,7 @@ async function createLayerRuntimeState(
     convWeight,
     dtBias,
     aNegExp,
-    normWeight:
+    normWeight: runtimeNorm,
     convState,
     recurrentState,
     convWeightGPU: null,
```
```diff
@@ -304,7 +304,7 @@ export async function computeLogitsGPU(
 
   const logitsTensor = await runMatmul(normedTensor, lmHeadBuffer, numTokens, matmulVocabSize, hiddenSize, {
     transposeB: 'auto',
-    role:
+    role: 'lm_head',
     kernelPath: config.kernelPath ?? null,
   });
 
```
```diff
@@ -391,7 +391,7 @@ export async function recordLogitsGPU(
   // Record matmul (no submit)
   const logitsTensor = await recordMatmul(recorder, normedTensor, lmHeadBuffer, numTokens, matmulVocabSize, hiddenSize, {
     transposeB: 'auto',
-    role:
+    role: 'lm_head',
     kernelPath: config.kernelPath ?? null,
   });
 
```
```diff
@@ -25,6 +25,10 @@ export { computeLogitsGPU, recordLogitsGPU, computeChunkedLogitsGPU, resolveCpuW
 // Re-export utilities
 export { extractLastPositionLogits, finalizeLogits } from './utils.js';
 
+export interface ComputeLogitsOptions {
+  lastPositionOnly?: boolean;
+}
+
 /**
  * Compute logits from hidden states.
 *
```
```diff
@@ -53,5 +57,6 @@ export function computeLogits(
   debugFlags?: LogitsDebugFlags,
   getNormWeightBuffer?: (weight: GPUBuffer | Float32Array | ArrayBuffer, label: string) => GPUBuffer,
   debugCheckBuffer?: (buffer: GPUBuffer, label: string, numTokens: number, expectedDim?: number) => Promise<void>,
-  debugProbes?: ProbeConfigSchema[] | null
+  debugProbes?: ProbeConfigSchema[] | null,
+  options?: ComputeLogitsOptions
 ): Promise<Float32Array>;
```
```diff
@@ -253,6 +253,7 @@ export async function computeLogits(
 
   const lastPositionOnly = options?.lastPositionOnly === true && numTokens > 1;
   const matmulRows = lastPositionOnly ? 1 : numTokens;
+  const matmulPhaseOverride = lastPositionOnly ? 'prefill' : null;
   let matmulInputTensor = normedTensor;
   let matmulInputOwned = false;
   if (lastPositionOnly) {
```
```diff
@@ -270,7 +271,8 @@
   // HuggingFace models store lm_head as [vocabSize, hiddenSize], so transposeB=true
   const logitsTensor = await runMatmul(matmulInputTensor, lmHeadBuffer, matmulRows, matmulVocabSize, hiddenSize, {
     transposeB: 'auto',
-    role:
+    role: 'lm_head',
+    phaseOverride: matmulPhaseOverride,
     kernelPath: config.kernelPath ?? null,
   });
   await runProbes('logits', logitsTensor.buffer, {
```
```diff
@@ -234,6 +234,9 @@ function buildManifestDecodeLoopRuntimePatch(manifest) {
 
 export function applyModelBatchingRuntimeDefaults(runtimeConfig, manifest, modelConfig) {
   void modelConfig;
+  if (manifest?.inference?.schema === 'doppler.execution/v0') {
+    return runtimeConfig;
+  }
   const batching = runtimeConfig?.inference?.batching;
   const generation = runtimeConfig?.inference?.generation;
   const runtimeBatchingAtDefaults = isRuntimeBatchingAtGlobalDefaults(batching);
```
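A standalone mirror of the new early return, with the schema tag exactly as it appears in the hunk:

```js
function appliesBatchingDefaults(manifest) {
  // Execution-plan manifests keep their own runtime config untouched.
  return manifest?.inference?.schema !== 'doppler.execution/v0';
}

console.log(appliesBatchingDefaults({ inference: { schema: 'doppler.execution/v0' } })); // false
console.log(appliesBatchingDefaults({}));                                                // true
```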
```diff
@@ -58,6 +58,30 @@ export function softmax(logits) {
   return exps;
 }
 
+function countFiniteCandidates(logits, padTokenId) {
+  let finiteCandidateCount = 0;
+  for (let i = 0; i < logits.length; i++) {
+    if (padTokenId != null && i === padTokenId) {
+      continue;
+    }
+    if (Number.isFinite(logits[i])) {
+      finiteCandidateCount += 1;
+    }
+  }
+  return finiteCandidateCount;
+}
+
+function assertFiniteSamplingCandidates(logits, padTokenId, label) {
+  const finiteCandidateCount = countFiniteCandidates(logits, padTokenId);
+  if (finiteCandidateCount > 0) {
+    return;
+  }
+  throw new Error(
+    `[Sampling] ${label} has no finite candidate logits after masking the pad token. ` +
+    'Upstream decode likely produced NaN/Inf or an all-masked distribution.'
+  );
+}
+
 
 export function sample(logits, opts) {
   const { temperature, topP, topK, decode, debug = false, padTokenId, seed } = opts;
```
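Both helpers are module-private, so this standalone mirror just illustrates the contract: the pad token is excluded before counting, and a distribution whose only finite entry is the pad token still fails the guard.

```js
function hasFiniteCandidate(logits, padTokenId) {
  return logits.some((value, i) => i !== padTokenId && Number.isFinite(value));
}

console.log(hasFiniteCandidate([NaN, -Infinity, 0.5], 2)); // false -> sample() throws
console.log(hasFiniteCandidate([NaN, -Infinity, 0.5], 0)); // true  -> sampling proceeds
```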
```diff
@@ -66,16 +90,28 @@ export function sample(logits, opts) {
     logits[padTokenId] = -Infinity;
   }
 
+  assertFiniteSamplingCandidates(logits, padTokenId, 'Logits');
+
   // Greedy (argmax) when temperature = 0
   if (temperature === 0) {
-    let maxIdx =
-    let maxVal =
-    for (let i =
-
-
+    let maxIdx = -1;
+    let maxVal = -Infinity;
+    for (let i = 0; i < logits.length; i++) {
+      const value = logits[i];
+      if (!Number.isFinite(value)) {
+        continue;
+      }
+      if (value > maxVal) {
+        maxVal = value;
         maxIdx = i;
       }
     }
+    if (maxIdx < 0) {
+      throw new Error(
+        '[Sampling] Greedy sampling could not find a finite candidate logit. ' +
+        'Upstream decode likely produced NaN/Inf.'
+      );
+    }
     if (debug) {
       const text = decode?.([maxIdx]) ?? '?';
       trace.sample(`Greedy: id=${maxIdx} "${text}" logit=${maxVal.toFixed(4)}`);
```
```diff
@@ -96,7 +132,17 @@ export function sample(logits, opts) {
 
   let candidates = [];
   for (let i = 0; i < probs.length; i++) {
-
+    const probability = probs[i];
+    if (!Number.isFinite(probability) || probability <= 0) {
+      continue;
+    }
+    candidates.push({ token: i, prob: probability });
+  }
+  if (candidates.length === 0) {
+    throw new Error(
+      '[Sampling] Softmax produced no finite candidate probabilities. ' +
+      'Upstream decode likely produced NaN/Inf logits.'
+    );
   }
   candidates.sort((a, b) => b.prob - a.prob);
 
```
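Both failure messages begin with `[Sampling]`, which is precisely the prefix `shouldRetryWithFinitenessFallback` (first hunk) matches on. A caller-side sketch, assuming `logits` and `padTokenId` are already in scope:

```js
let token;
try {
  token = sample(logits, { temperature: 0.7, topP: 0.9, topK: 40, padTokenId, seed: 42 });
} catch (error) {
  if (!String(error?.message).startsWith('[Sampling]')) {
    throw error; // not a finiteness failure
  }
  // Non-finite logits: the generator re-runs the step in F32 and samples again.
}
```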
```diff
@@ -1,7 +1,7 @@
 
 
 import { initDevice, getDevice, getKernelCapabilities } from '../gpu/device.js';
-import { parseManifest } from '../formats/rdrr/index.js';
+import { parseManifest, getExpectedShardHash } from '../formats/rdrr/index.js';
 import { createPipeline } from './pipelines/text.js';
 import { log as debugLog } from '../debug/index.js';
 import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
```
```diff
@@ -168,7 +168,7 @@ export function createHttpShardLoader(baseUrl, manifest, log) {
     distributionConfig,
     algorithm,
     requiredEncoding,
-    expectedHash: shard
+    expectedHash: getExpectedShardHash(shard, algorithm) || null,
     expectedSize: Number.isFinite(shard.size) ? Math.floor(shard.size) : null,
     expectedManifestVersionSet: manifestVersionSet,
     writeToStore: false,
```
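`getExpectedShardHash` is defined in `package/src/formats/rdrr/parsing.js` (+14 -1 in the file list) but its body is not part of this excerpt. Judging only from the call sites, which pass a shard entry plus the resolved algorithm, one plausible shape is a per-algorithm lookup with a legacy single-hash fallback; the field names below are assumptions, not the package's schema:

```js
// Assumed shape only; the real implementation lives in formats/rdrr/parsing.js.
function getExpectedShardHash(shardInfo, algorithm) {
  if (!shardInfo) return null;
  if (algorithm && shardInfo.hashes?.[algorithm]) {
    return shardInfo.hashes[algorithm]; // per-algorithm map, if the manifest has one
  }
  return shardInfo.hash ?? null; // single legacy hash field
}
```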
```diff
@@ -36,6 +36,8 @@ function isLikelyFinalNormName(name) {
   return (
     lower === 'norm.weight' ||
     lower.includes('model.norm.weight') ||
+    lower.includes('language_model.norm.weight') ||
+    lower.includes('model.language_model.norm.weight') ||
     lower.includes('embedding_norm.weight') ||
     lower.includes('model.embedding_norm.weight') ||
     lower.includes('final_layernorm.weight') ||
```
```diff
@@ -5,6 +5,7 @@ import {
   computeHash,
   getStorageBackendType,
 } from '../storage/shard-manager.js';
+import { getExpectedShardHash } from '../formats/rdrr/index.js';
 import { formatBytes } from '../storage/quota.js';
 import { log, trace as debugTrace } from '../debug/index.js';
 import { getRuntimeConfig } from '../config/runtime.js';
```
```diff
@@ -484,11 +485,11 @@ export class ShardCache {
     // Verify hash if enabled
     if (this.#verifyHashes && this.#manifest) {
       const shardInfo = this.#manifest.shards?.[shardIndex];
-      const
+      const algorithm = shardInfo?.hashAlgorithm ?? this.#manifest.hashAlgorithm;
+      const expectedHash = getExpectedShardHash(shardInfo, algorithm);
       if (!expectedHash) {
         throw new Error(`Shard ${shardIndex} missing hash in manifest.`);
       }
-      const algorithm = shardInfo?.hashAlgorithm ?? this.#manifest.hashAlgorithm;
       if (!algorithm) {
         throw new Error(`Manifest missing hashAlgorithm for shard ${shardIndex}.`);
       }
```
```diff
@@ -309,8 +309,9 @@ export async function loadBF16(shardData, location, name, config) {
   const numElements = location.size / 2;
   const caps = config.gpuCapabilities || getKernelCapabilities();
   const isMatmulWeight = shouldDequantizeToF16(location);
+  const keepF32Weights = config.keepF32Weights === true;
 
-  if (caps?.hasF16 && isMatmulWeight) {
+  if (caps?.hasF16 && isMatmulWeight && !keepF32Weights) {
     const f16Tensor = await runBF16ToF16(srcBuffer, [numElements], name);
     resultBuffer = f16Tensor.buffer;
     releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
```
```diff
@@ -327,6 +328,10 @@ export async function loadBF16(shardData, location, name, config) {
     };
   }
 
+  if (isMatmulWeight && keepF32Weights) {
+    debugTrace.loader(`Keeping BF16 matmul weight in f32: ${name} (keepF32Weights=true)`);
+  }
+
   const dstBuffer = await convertBF16ToF32GPU(srcBuffer, numElements, name);
   resultBuffer = dstBuffer;
   releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
```
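Condensed, the dtype decision in `loadBF16` after these two hunks: the f16 fast path now also requires that nothing pinned the weights to f32. A standalone mirror:

```js
function bf16TargetDtype({ hasF16, isMatmulWeight, keepF32Weights }) {
  if (hasF16 && isMatmulWeight && !keepF32Weights) return 'f16';
  return 'f32'; // non-matmul weight, no f16 support, or keepF32Weights === true
}

console.log(bf16TargetDtype({ hasF16: true, isMatmulWeight: true, keepF32Weights: true }));  // 'f32'
console.log(bf16TargetDtype({ hasF16: true, isMatmulWeight: true, keepF32Weights: false })); // 'f16'
```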
```diff
@@ -59,6 +59,11 @@
     { "match": { "useF16": true }, "value": "f16" },
     { "match": {}, "value": { "context": "fallback" } }
   ],
+  "attentionProjectionOutputDtype": [
+    { "match": { "forceF32": true }, "value": "f32" },
+    { "match": { "useF16": true }, "value": "f16" },
+    { "match": {}, "value": { "context": "fallback" } }
+  ],
   "bytesPerElement": [
     { "match": { "dtype": "f16" }, "value": 2 },
     { "match": {}, "value": 4 }
```
```diff
@@ -46,7 +46,7 @@
       "hasSubgroups": false,
       "kernelPathRef": "lfm2-q4k-dequant-f32a-online"
     },
-    "value": "
+    "value": "lfm2-q4k-dequant-f32a-nosubgroups"
   },
   {
     "match": {
```
```diff
@@ -77,7 +77,7 @@
   },
   {
     "match": { "kernelPathId": "lfm2-q4k-dequant-f32a-online" },
-    "value": "
+    "value": "lfm2-q4k-dequant-f32a-nosubgroups"
   },
   {
     "match": { "kernelPathId": "gemma2-f16-f16a" },
```
```diff
@@ -50,6 +50,7 @@ const sampleRules = await loadJson('./kernels/sample.rules.json', import.meta.ur
 const scaleRules = await loadJson('./kernels/scale.rules.json', import.meta.url, 'Failed to load rules');
 const siluRules = await loadJson('./kernels/silu.rules.json', import.meta.url, 'Failed to load rules');
 const splitQkvRules = await loadJson('./kernels/split-qkv.rules.json', import.meta.url, 'Failed to load rules');
+const splitQgRules = await loadJson('./kernels/split-qg.rules.json', import.meta.url, 'Failed to load rules');
 const softmaxRules = await loadJson('./kernels/softmax.rules.json', import.meta.url, 'Failed to load rules');
 const upsample2dRules = await loadJson('./kernels/upsample2d.rules.json', import.meta.url, 'Failed to load rules');
 const configRules = await loadJson('./inference/config.rules.json', import.meta.url, 'Failed to load rules');
```
```diff
@@ -124,6 +125,7 @@ const RULE_SETS = {
     scale: scaleRules,
     silu: siluRules,
     splitQkv: splitQkvRules,
+    splitQg: splitQgRules,
     softmax: softmaxRules,
     upsample2d: upsample2dRules,
   },
```
```diff
@@ -2,6 +2,7 @@
 
 import {
   parseManifest,
+  getExpectedShardHash,
   getManifestUrl,
 } from '../formats/rdrr/index.js';
 
```
```diff
@@ -726,7 +727,7 @@ export async function downloadModel(
   if (!algorithm) {
     throw new Error('Manifest missing hashAlgorithm for download verification.');
   }
-  const expectedHash = shardInfo
+  const expectedHash = getExpectedShardHash(shardInfo, algorithm);
   if (!expectedHash) {
     throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
   }
```
```diff
@@ -1,5 +1,6 @@
 import {
   getManifest,
+  getExpectedShardHash,
   getShardInfo,
   getShardCount,
   generateShardFilename,
```
```diff
@@ -280,7 +281,7 @@ export async function writeShard(shardIndex, data, options = { verify: true }) {
   const manifest = getManifest();
   const algorithm = requireManifestHashAlgorithm(manifest, 'shard write');
   const hash = await computeHash(bytes, algorithm);
-  const expectedHash = shardInfo
+  const expectedHash = getExpectedShardHash(shardInfo, algorithm);
   if (!expectedHash) {
     await backend.deleteFile(shardInfo.filename);
     throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
```
```diff
@@ -369,7 +370,7 @@ export async function loadShard(shardIndex, options = { verify: false }) {
   const manifest = getManifest();
   const algorithm = requireManifestHashAlgorithm(manifest, 'shard load');
   const hash = await computeHash(buffer, algorithm);
-  const expectedHash = shardInfo
+  const expectedHash = getExpectedShardHash(shardInfo, algorithm);
   if (!expectedHash) {
     throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
   }
```
```diff
@@ -531,7 +532,7 @@ export async function verifyIntegrity(options = {}) {
       const buffer = await loadShard(i, { verify: false });
       const hash = await computeHash(buffer, algorithm);
       const shardInfo = getShardInfo(i);
-      const expectedHash = shardInfo
+      const expectedHash = getExpectedShardHash(shardInfo, algorithm);
       if (!expectedHash) {
         corruptShards.push(i);
         continue;
```