@simulatte/doppler 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116):
  1. package/CHANGELOG.md +14 -1
  2. package/README.md +25 -6
  3. package/package.json +5 -3
  4. package/src/client/doppler-api.browser.js +6 -0
  5. package/src/client/doppler-api.d.ts +3 -0
  6. package/src/client/doppler-api.js +11 -2
  7. package/src/client/doppler-registry.js +3 -5
  8. package/src/client/doppler-registry.json +16 -0
  9. package/src/config/kernels/kernel-ref-digests.js +23 -21
  10. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  11. package/src/config/loader.js +6 -0
  12. package/src/config/platforms/loader.js +3 -1
  13. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  14. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  15. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  16. package/src/config/presets/kernel-paths/registry.json +7 -0
  17. package/src/config/presets/models/gemma3.json +2 -1
  18. package/src/config/presets/models/gemma4.json +61 -0
  19. package/src/config/presets/models/granite-docling.json +70 -0
  20. package/src/config/presets/models/lfm2.json +6 -1
  21. package/src/config/presets/models/qwen3_vl.json +40 -0
  22. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  23. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  24. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  25. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  26. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  27. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  28. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  29. package/src/config/runtime.js +3 -0
  30. package/src/config/schema/debug.schema.d.ts +40 -0
  31. package/src/config/schema/debug.schema.js +28 -0
  32. package/src/config/schema/index.js +2 -0
  33. package/src/config/schema/inference-defaults.schema.js +1 -1
  34. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  35. package/src/config/schema/memory-limits.schema.js +2 -2
  36. package/src/config/schema/storage.schema.js +1 -1
  37. package/src/converter/conversion-plan.js +1 -1
  38. package/src/converter/core.js +17 -8
  39. package/src/converter/quantizer.d.ts +5 -0
  40. package/src/converter/quantizer.js +15 -0
  41. package/src/distribution/shard-delivery.js +34 -0
  42. package/src/formats/rdrr/classification.js +32 -0
  43. package/src/gpu/kernel-runtime.js +4 -2
  44. package/src/gpu/kernels/attention.js +2 -1
  45. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  46. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  47. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  48. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  49. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  50. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  51. package/src/gpu/kernels/gated-short-conv.js +284 -0
  52. package/src/gpu/kernels/linear-attention-core.js +37 -17
  53. package/src/gpu/kernels/matmul-selection.js +1 -0
  54. package/src/gpu/kernels/matmul.d.ts +3 -0
  55. package/src/gpu/kernels/matmul.js +70 -1
  56. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  57. package/src/gpu/kernels/sample.js +1 -3
  58. package/src/gpu/kernels/sample.wgsl +39 -9
  59. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  60. package/src/gpu/kernels/shader-cache.js +9 -4
  61. package/src/inference/kv-cache/base.js +3 -10
  62. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  63. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
  64. package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
  65. package/src/inference/pipelines/text/attention/projections.js +13 -2
  66. package/src/inference/pipelines/text/attention/record.js +1 -0
  67. package/src/inference/pipelines/text/attention/run.js +9 -0
  68. package/src/inference/pipelines/text/config.d.ts +1 -0
  69. package/src/inference/pipelines/text/config.js +32 -4
  70. package/src/inference/pipelines/text/embed.js +26 -7
  71. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  72. package/src/inference/pipelines/text/execution-v0.js +12 -1
  73. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  74. package/src/inference/pipelines/text/generator-runtime.js +14 -0
  75. package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
  76. package/src/inference/pipelines/text/generator-steps.js +46 -29
  77. package/src/inference/pipelines/text/generator.d.ts +5 -0
  78. package/src/inference/pipelines/text/generator.js +320 -166
  79. package/src/inference/pipelines/text/init.d.ts +2 -0
  80. package/src/inference/pipelines/text/init.js +19 -5
  81. package/src/inference/pipelines/text/layer.js +37 -8
  82. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  83. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  84. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  85. package/src/inference/pipelines/text/ops.js +123 -53
  86. package/src/inference/pipelines/text/probes.js +1 -0
  87. package/src/inference/pipelines/text/state.js +2 -0
  88. package/src/inference/pipelines/text.d.ts +5 -0
  89. package/src/inference/pipelines/text.js +59 -1
  90. package/src/inference/pipelines/vision/encoder.js +386 -0
  91. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  92. package/src/inference/pipelines/vision/index.js +173 -0
  93. package/src/inference/pipelines/vision/ops.js +78 -0
  94. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  95. package/src/inference/test-harness.js +9 -7
  96. package/src/loader/doppler-loader.d.ts +3 -0
  97. package/src/loader/doppler-loader.js +20 -3
  98. package/src/loader/experts/expert-cache.js +6 -2
  99. package/src/loader/experts/expert-loader.js +6 -2
  100. package/src/loader/layer-loader.js +42 -3
  101. package/src/loader/manifest-config.js +3 -1
  102. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  103. package/src/loader/tensors/tensor-loader.js +124 -3
  104. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  105. package/src/rules/kernels/softmax.rules.json +2 -0
  106. package/src/rules/rule-registry.d.ts +1 -0
  107. package/src/rules/rule-registry.js +2 -0
  108. package/src/storage/quickstart-downloader.d.ts +3 -0
  109. package/src/storage/quickstart-downloader.js +27 -30
  110. package/src/tooling/node-converter.js +25 -7
  111. package/src/tooling/node-source-runtime.js +29 -5
  112. package/src/tooling/node-webgpu.js +24 -7
  113. package/src/utils/hf-resolve-url.d.ts +16 -0
  114. package/src/utils/hf-resolve-url.js +17 -0
  115. package/src/version.js +1 -1
  116. package/src/tooling/node-convert.d.ts +0 -54
@@ -34,6 +34,16 @@ fn apply_softcap(x: f32, softcap: f32) -> f32 {
34
34
  return softcap * tanh(x / softcap);
35
35
  }
36
36
 
37
+ fn candidate_beats(candidate_value: f32, candidate_index: u32, best_value: f32, best_index: u32) -> bool {
38
+ if (candidate_value > best_value) {
39
+ return true;
40
+ }
41
+ if (candidate_value < best_value) {
42
+ return false;
43
+ }
44
+ return candidate_index < best_index;
45
+ }
46
+
37
47
  @group(0) @binding(0) var<uniform> u: Uniforms;
38
48
  @group(0) @binding(1) var<storage, read> logits: array<f16>;
39
49
  @group(0) @binding(2) var<storage, read_write> output: array<u32>;
@@ -74,7 +84,7 @@ fn find_topk_phase1(
74
84
  while (idx < vocab_size) {
75
85
  if (idx != pad_id) {
76
86
  let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
77
- if (val > local_max) {
87
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
78
88
  local_max = val;
79
89
  local_max_idx = idx;
80
90
  }
@@ -89,7 +99,12 @@ fn find_topk_phase1(
89
99
  var stride = WORKGROUP_SIZE / 2u;
90
100
  while (stride > 0u) {
91
101
  if (thread_idx < stride) {
92
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
102
+ if (candidate_beats(
103
+ shared_values[thread_idx + stride],
104
+ shared_indices[thread_idx + stride],
105
+ shared_values[thread_idx],
106
+ shared_indices[thread_idx]
107
+ )) {
93
108
  shared_values[thread_idx] = shared_values[thread_idx + stride];
94
109
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
95
110
  }
@@ -130,7 +145,7 @@ fn find_topk_phase2(
130
145
  var max_val = shared_values[k];
131
146
 
132
147
  for (var i: u32 = k + 1u; i < num_candidates; i = i + 1u) {
133
- if (shared_values[i] > max_val) {
148
+ if (candidate_beats(shared_values[i], shared_indices[i], max_val, shared_indices[max_idx])) {
134
149
  max_val = shared_values[i];
135
150
  max_idx = i;
136
151
  }
@@ -218,7 +233,7 @@ fn sample_single_pass(
218
233
  while (idx < vocab_size) {
219
234
  if (idx != pad_id) {
220
235
  let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
221
- if (val > local_max) {
236
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
222
237
  local_max = val;
223
238
  local_max_idx = idx;
224
239
  }
@@ -233,7 +248,12 @@ fn sample_single_pass(
233
248
  var stride = WORKGROUP_SIZE / 2u;
234
249
  while (stride > 0u) {
235
250
  if (thread_idx < stride) {
236
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
251
+ if (candidate_beats(
252
+ shared_values[thread_idx + stride],
253
+ shared_indices[thread_idx + stride],
254
+ shared_values[thread_idx],
255
+ shared_indices[thread_idx]
256
+ )) {
237
257
  shared_values[thread_idx] = shared_values[thread_idx + stride];
238
258
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
239
259
  }
@@ -267,7 +287,7 @@ fn argmax(
267
287
  while (idx < vocab_size) {
268
288
  if (idx != pad_id) {
269
289
  let val = apply_softcap(f32(logits[idx]), softcap);
270
- if (val > local_max) {
290
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
271
291
  local_max = val;
272
292
  local_max_idx = idx;
273
293
  }
@@ -282,7 +302,12 @@ fn argmax(
282
302
  var stride = WORKGROUP_SIZE / 2u;
283
303
  while (stride > 0u) {
284
304
  if (thread_idx < stride) {
285
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
305
+ if (candidate_beats(
306
+ shared_values[thread_idx + stride],
307
+ shared_indices[thread_idx + stride],
308
+ shared_values[thread_idx],
309
+ shared_indices[thread_idx]
310
+ )) {
286
311
  shared_values[thread_idx] = shared_values[thread_idx + stride];
287
312
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
288
313
  }
@@ -316,7 +341,12 @@ fn argmax_reduce(
316
341
  var stride = WORKGROUP_SIZE / 2u;
317
342
  while (stride > 0u) {
318
343
  if (thread_idx < stride) {
319
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
344
+ if (candidate_beats(
345
+ shared_values[thread_idx + stride],
346
+ shared_indices[thread_idx + stride],
347
+ shared_values[thread_idx],
348
+ shared_indices[thread_idx]
349
+ )) {
320
350
  shared_values[thread_idx] = shared_values[thread_idx + stride];
321
351
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
322
352
  }
@@ -133,10 +133,15 @@ export async function compileShader(
133
133
  source,
134
134
  label
135
135
  ) {
136
- const module = device.createShaderModule({
137
- label,
138
- code: source,
139
- });
136
+ let module;
137
+ try {
138
+ module = device.createShaderModule({
139
+ label,
140
+ code: source,
141
+ });
142
+ } catch (err) {
143
+ throw new Error(`createShaderModule failed for "${label}": ${err.message}`);
144
+ }
140
145
 
141
146
  // Check for compilation errors (getCompilationInfo not available in all WebGPU providers)
142
147
  const compilationInfo = typeof module.getCompilationInfo === 'function'
@@ -314,10 +314,7 @@ export class KVCache {
314
314
  layer.seqLen = Math.max(layer.seqLen, startPos + numNewTokens);
315
315
  this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numNewTokens);
316
316
 
317
- // Update global sequence length if this is the last layer
318
- if (layerIdx === this.numLayers - 1) {
319
- this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
320
- }
317
+ this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
321
318
  }
322
319
 
323
320
 
@@ -374,9 +371,7 @@ export class KVCache {
374
371
  layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
375
372
  this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
376
373
 
377
- if (layerIdx === this.numLayers - 1) {
378
- this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
379
- }
374
+ this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
380
375
  }
381
376
 
382
377
 
@@ -433,9 +428,7 @@ export class KVCache {
433
428
  layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
434
429
  this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
435
430
 
436
- if (layerIdx === this.numLayers - 1) {
437
- this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
438
- }
431
+ this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
439
432
  }
440
433
 
441
434
 
@@ -28,6 +28,7 @@ import { runResidualAdd, runScale, recordResidualAdd, recordScale } from '../../
28
28
  import { f16ToF32 } from '../../../loader/dtype-utils.js';
29
29
 
30
30
  const SUPPORTED_DIFFUSION_BACKEND_PIPELINES = new Set(['gpu']);
31
+ const DEFAULT_TIME_EMBED_DIM = 256;
31
32
  const SD3_TEXT_ENCODER_KEYS = ['text_encoder', 'text_encoder_2', 'text_encoder_3'];
32
33
  const SANA_TEXT_ENCODER_KEYS = ['text_encoder'];
33
34
 
@@ -492,7 +493,7 @@ export class DiffusionPipeline {
492
493
  const hiddenSize = (transformerConfig.num_attention_heads ?? 0) * (transformerConfig.attention_head_dim ?? 0);
493
494
  const patchSize = transformerConfig.patch_size ?? 2;
494
495
  const timeEmbedWeight = transformerResolver.get('time_text_embed.timestep_embedder.linear_1.weight');
495
- const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? 256;
496
+ const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? DEFAULT_TIME_EMBED_DIM;
496
497
  if (!Number.isFinite(hiddenSize) || hiddenSize <= 0) {
497
498
  throw new Error('Diffusion transformer config missing num_attention_heads/attention_head_dim.');
498
499
  }
@@ -44,6 +44,7 @@ import { initRoPEFrequencies } from '../text/init.js';
44
44
  import { processLayerGPU } from '../text/layer.js';
45
45
 
46
46
  const QUICK_GELU_ALPHA = 1.702;
47
+ const DEFAULT_TIMESTEP_EMBED_DIM = 256;
47
48
  const SUPPORTED_CLIP_HIDDEN_ACTIVATIONS = new Set(['gelu', 'quick_gelu']);
48
49
  // Standard CLIP hidden activation per OpenAI CLIP specification.
49
50
  const DEFAULT_CLIP_HIDDEN_ACT = 'gelu';
@@ -1105,7 +1106,7 @@ export async function buildTimestepEmbedding(timestep, weightsEntry, modelConfig
1105
1106
  const device = getDevice();
1106
1107
  if (!device) throw new Error('Timestep embedding requires a WebGPU device.');
1107
1108
 
1108
- const dim = options.dim ?? 256;
1109
+ const dim = options.dim ?? DEFAULT_TIMESTEP_EMBED_DIM;
1109
1110
  const half = Math.floor(dim / 2);
1110
1111
  const emb = new Float32Array(dim);
1111
1112
  const maxPeriod = 10000;
@@ -3,6 +3,7 @@ import type { Tensor } from '../../../../gpu/tensor.js';
3
3
  import type { WeightBuffer, CpuWeightBuffer } from '../../../../gpu/weight-buffer.js';
4
4
  import type { LayerWeights } from '../types.js';
5
5
  import type { LoRAAdapter } from '../lora.js';
6
+ import type { MatmulDebugConfigSchema } from '../../../../config/schema/debug.schema.js';
6
7
 
7
8
  export interface AttentionInputInfo {
8
9
  phase: 'prefill' | 'decode';
@@ -76,11 +77,13 @@ export interface ProjectAttentionQKVOptions {
76
77
  getWeightBuffer?: (weight: GPUBuffer | WeightBuffer | Float32Array | ArrayBuffer | CpuWeightBuffer, label: string) => GPUBuffer | WeightBuffer;
77
78
  lora?: LoRAAdapter | null;
78
79
  releaseTemporary: (buffer: GPUBuffer) => void;
80
+ matmulDebug?: MatmulDebugConfigSchema | null;
79
81
  onFusedQKV?: ((info: { qSize: number; kSize: number; vSize: number; totalSize: number }) => void) | null;
80
82
  }
81
83
 
82
84
  export interface ProjectAttentionQKVResult {
83
85
  qTensor: Tensor;
86
+ qGateTensor: Tensor | null;
84
87
  kTensor: Tensor;
85
88
  vTensor: Tensor;
86
89
  usedFusedQKV: boolean;
@@ -71,9 +71,10 @@ async function projectSingleQkvTensor({
71
71
  matmulOutputDtype,
72
72
  getWeightBuffer,
73
73
  lora,
74
+ matmulDebug,
74
75
  releaseTemporary,
75
76
  }) {
76
- const runMatmulForMode = getMatmulRunner(recorder);
77
+ const runMatmulForMode = getMatmulRunner(recorder);
77
78
  const layerWeight = layerWeights?.[weightKey];
78
79
  if (!layerWeight) {
79
80
  throw new Error(`Attention projection requires ${weightKey}.`);
@@ -91,6 +92,7 @@ async function projectSingleQkvTensor({
91
92
  layerIdx,
92
93
  kernelPath,
93
94
  outputDtype: matmulOutputDtype,
95
+ matmulDebug,
94
96
  });
95
97
  } finally {
96
98
  releaseOwnedWeightBuffer(layerWeight, projBuffer, releaseTemporary);
@@ -178,6 +180,7 @@ async function projectQueryWithOptionalGate({
178
180
  matmulOutputDtype,
179
181
  getWeightBuffer,
180
182
  lora,
183
+ matmulDebug,
181
184
  releaseTemporary,
182
185
  attentionOutputGate,
183
186
  }) {
@@ -205,6 +208,7 @@ async function projectQueryWithOptionalGate({
205
208
  matmulOutputDtype,
206
209
  getWeightBuffer,
207
210
  lora,
211
+ matmulDebug,
208
212
  releaseTemporary,
209
213
  });
210
214
  return { qTensor, qGateTensor: null };
@@ -226,6 +230,7 @@ async function projectQueryWithOptionalGate({
226
230
  layerIdx,
227
231
  kernelPath,
228
232
  outputDtype: matmulOutputDtype,
233
+ matmulDebug,
229
234
  });
230
235
 
231
236
  const split = await runSplitQGForMode(fullQGTensor, {
@@ -329,6 +334,7 @@ export async function projectAttentionQKV({
329
334
  matmulOutputDtype,
330
335
  getWeightBuffer,
331
336
  lora,
337
+ matmulDebug,
332
338
  releaseTemporary,
333
339
  onFusedQKV = null,
334
340
  attentionOutputGate = false,
@@ -339,7 +345,8 @@ export async function projectAttentionQKV({
339
345
  const hasLoRA = getLoRAModule(lora, layerIdx, 'q_proj')
340
346
  || getLoRAModule(lora, layerIdx, 'k_proj')
341
347
  || getLoRAModule(lora, layerIdx, 'v_proj');
342
- const useFusedQKV = selectRuleValue('inference', 'attention', 'useFusedQkv', {
348
+ const forceSplitQKV = Boolean(matmulDebug?.enabled) && matmulDebug?.forceSplitQKV === true;
349
+ const useFusedQKV = !forceSplitQKV && selectRuleValue('inference', 'attention', 'useFusedQkv', {
343
350
  hasQkvProj: Boolean(layerWeights.qkvProj),
344
351
  hasQkvSizes: Boolean(layerWeights.qkvSizes),
345
352
  hasLoRA: Boolean(hasLoRA),
@@ -356,6 +363,7 @@ export async function projectAttentionQKV({
356
363
  layerIdx,
357
364
  kernelPath,
358
365
  outputDtype: matmulOutputDtype,
366
+ matmulDebug,
359
367
  });
360
368
  const split = await runSplitForMode(qkvTensor, {
361
369
  numTokens,
@@ -394,6 +402,7 @@ export async function projectAttentionQKV({
394
402
  matmulOutputDtype,
395
403
  getWeightBuffer,
396
404
  lora,
405
+ matmulDebug,
397
406
  releaseTemporary,
398
407
  attentionOutputGate,
399
408
  }));
@@ -414,6 +423,7 @@ export async function projectAttentionQKV({
414
423
  matmulOutputDtype,
415
424
  getWeightBuffer,
416
425
  lora,
426
+ matmulDebug,
417
427
  releaseTemporary,
418
428
  });
419
429
 
@@ -433,6 +443,7 @@ export async function projectAttentionQKV({
433
443
  matmulOutputDtype,
434
444
  getWeightBuffer,
435
445
  lora,
446
+ matmulDebug,
436
447
  releaseTemporary,
437
448
  });
438
449
 
@@ -167,6 +167,7 @@ export async function recordLayerAttentionGPU(
167
167
  matmulOutputDtype,
168
168
  getWeightBuffer,
169
169
  lora,
170
+ matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
170
171
  attentionOutputGate: config.attentionOutputGate === true,
171
172
  releaseTemporary: (buffer) => releaseOrTrack(recorder, buffer),
172
173
  onFusedQKV: layerIdx === 0 && isPrefill
@@ -166,6 +166,14 @@ export async function runLayerAttentionGPU(
166
166
  dtype: normed.dtype,
167
167
  });
168
168
  }
169
+
170
+ await runProbes('post_input_norm', normed.buffer, {
171
+ layerIdx,
172
+ numTokens,
173
+ hiddenSize,
174
+ probes: state.debugProbes,
175
+ dtype: normed.dtype,
176
+ });
169
177
  }
170
178
 
171
179
  // Debug: Check normed input for L0 prefill
@@ -218,6 +226,7 @@ export async function runLayerAttentionGPU(
218
226
  matmulOutputDtype,
219
227
  getWeightBuffer,
220
228
  lora,
229
+ matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
221
230
  attentionOutputGate: config.attentionOutputGate === true,
222
231
  releaseTemporary: (buffer) => releaseBuffer(buffer),
223
232
  onFusedQKV: layerIdx === 0 && isPrefill
@@ -150,6 +150,7 @@ export interface ParsedModelConfig {
150
150
  ropeLocalTheta: number | null;
151
151
  ropeRotaryDim: number;
152
152
  ropeInterleaved: boolean;
153
+ mropeInterleaved: boolean;
153
154
  mropeSection: number[] | null;
154
155
  partialRotaryFactor: number | null;
155
156
  ropeScale: number;
@@ -349,6 +349,24 @@ function normalizeLayerTypeTag(value) {
349
349
  return null;
350
350
  }
351
351
 
352
+ function resolveVisionConfig(rawConfig, manifest) {
353
+ const vc = rawConfig?.vision_config ?? manifest?.config?.vision_config;
354
+ if (!vc || typeof vc !== 'object') return null;
355
+ return {
356
+ depth: vc.depth ?? 24,
357
+ hiddenSize: vc.hidden_size ?? 1024,
358
+ intermediateSize: vc.intermediate_size ?? 4096,
359
+ numHeads: vc.num_heads ?? 16,
360
+ outHiddenSize: vc.out_hidden_size ?? vc.hidden_size ?? 1024,
361
+ patchSize: vc.patch_size ?? 16,
362
+ spatialMergeSize: vc.spatial_merge_size ?? 2,
363
+ temporalPatchSize: vc.temporal_patch_size ?? 2,
364
+ eps: vc.eps ?? 1e-6,
365
+ deepstackVisualIndexes: Array.isArray(vc.deepstack_visual_indexes) ? vc.deepstack_visual_indexes : [],
366
+ imageTokenId: rawConfig?.image_token_id ?? manifest?.image_token_id ?? null,
367
+ };
368
+ }
369
+
352
370
  function parseCustomLayerTypes(layerTypes, numLayers, modelId) {
353
371
  if (!Array.isArray(layerTypes) || layerTypes.length === 0) {
354
372
  throw new Error(
@@ -512,10 +530,18 @@ export function toParsedConfigFromMerged(merged, manifest) {
512
530
  // RoPE scaling - use manifest inference as source of truth (not raw config)
513
531
  const ropeScale = inf.rope.ropeScalingFactor;
514
532
  const ropeScalingType = inf.rope.ropeScalingType;
515
- const ropeLocalScale = inf.rope.ropeLocalScalingFactor ?? ropeScale;
516
- const ropeLocalScalingType = inf.rope.ropeLocalScalingType ?? ropeScalingType;
533
+ const ropeLocalScale = inf.rope.ropeLocalScalingFactor;
534
+ const ropeLocalScalingType = inf.rope.ropeLocalScalingType;
517
535
  const partialRotaryFactor = inf.rope.partialRotaryFactor;
518
- const ropeInterleaved = inf.rope.mropeInterleaved === true;
536
+ const mropeInterleaved = inf.rope.mropeInterleaved === true;
537
+ const ropeInterleaved = false;
538
+
539
+ if (ropeLocalScale == null && (inf.rope.ropeLocalTheta != null || inf.rope.mropeSection != null)) {
540
+ throw new Error(
541
+ `Model "${merged.modelId}" uses hybrid/mRoPE but is missing rope.ropeLocalScalingFactor in manifest. ` +
542
+ `Re-convert the model using the latest converter or update the manifest to include an explicit scale.`
543
+ );
544
+ }
519
545
  const mropeSection = Array.isArray(inf.rope.mropeSection)
520
546
  ? inf.rope.mropeSection.map((entry) => Math.trunc(Number(entry)))
521
547
  : null;
@@ -525,7 +551,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
525
551
  `Manifest "${merged.modelId}" has invalid rope.mropeSection; expected positive integers.`
526
552
  );
527
553
  }
528
- if (ropeInterleaved && mropeSection) {
554
+ if (mropeInterleaved && mropeSection) {
529
555
  const doubledMropeDim = mropeSection.reduce((sum, entry) => sum + entry, 0) * 2;
530
556
  if (doubledMropeDim !== ropeRotaryDim) {
531
557
  throw new Error(
@@ -610,6 +636,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
610
636
  ropeLocalTheta: inf.rope.ropeLocalTheta,
611
637
  ropeRotaryDim,
612
638
  ropeInterleaved,
639
+ mropeInterleaved,
613
640
  mropeSection,
614
641
  partialRotaryFactor,
615
642
  ropeScale,
@@ -650,6 +677,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
650
677
  chatTemplateType,
651
678
  chatTemplateEnabled,
652
679
  kernelPath: inf.defaultKernelPath,
680
+ visionConfig: resolveVisionConfig(config, manifest),
653
681
  };
654
682
  }
655
683
 
@@ -9,6 +9,7 @@ import { decodeReadback } from './debug-utils/index.js';
9
9
  import { createTensor } from '../../../gpu/tensor.js';
10
10
  import { castF32ToF16, recordCastF32ToF16 } from '../../../gpu/kernels/cast.js';
11
11
  import { isCpuWeightBuffer } from '../../../gpu/weight-buffer.js';
12
+ import { f16ToF32 } from '../../../loader/dtype-utils.js';
12
13
  import { selectRuleValue } from '../../../rules/rule-registry.js';
13
14
 
14
15
  const scaleShaderCode = `
@@ -202,11 +203,19 @@ export async function embed(tokenIds, embedBuffer, config) {
202
203
 
203
204
  const dtype = selectRuleValue('inference', 'dtype', 'f16OrF32', { useF16 });
204
205
 
205
- const cpuEmbeddings = isCpuWeightBuffer(embedBuffer)
206
- ? embedBuffer.data
207
- : embedBuffer instanceof Float32Array
208
- ? embedBuffer
209
- : null;
206
+ let cpuEmbeddings = null;
207
+ if (isCpuWeightBuffer(embedBuffer)) {
208
+ const bufDtype = embedBuffer.dtype;
209
+ if (bufDtype !== 'f32' && bufDtype !== 'f16') {
210
+ throw new Error(
211
+ `[Embed] CPU embedding buffer has unsupported dtype '${bufDtype}'; ` +
212
+ `only 'f32' and 'f16' are supported in the CPU gather path.`
213
+ );
214
+ }
215
+ cpuEmbeddings = embedBuffer.data;
216
+ } else if (embedBuffer instanceof Float32Array) {
217
+ cpuEmbeddings = embedBuffer;
218
+ }
210
219
 
211
220
  if (debug) {
212
221
  trace.embed(`tokens=${numTokens}, hidden=${hiddenSize}, vocab=${vocabSize}, scaleEmbeddings=${scaleEmbeddings}, transpose=${transpose}, indexOffset=${indexOffset}, activationDtype=${activationDtype}, useF16=${useF16}`);
@@ -226,18 +235,28 @@ export async function embed(tokenIds, embedBuffer, config) {
226
235
  }
227
236
 
228
237
  const output = new Float32Array(numTokens * hiddenSize);
238
+ // Check actual data type: loader's f16_to_f32 CPU path already decodes F16 into Float32Array,
239
+ // so dtype='f16' does not reliably indicate raw F16 bytes. Only Uint16Array needs per-element decoding.
240
+ const isF16Cpu = cpuEmbeddings instanceof Uint16Array;
229
241
  if (!transpose) {
230
242
  for (let t = 0; t < numTokens; t++) {
231
243
  const tokenId = (tokenIdArray)[t];
232
244
  const srcOffset = tokenId * hiddenSize;
233
- output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
245
+ if (isF16Cpu) {
246
+ for (let h = 0; h < hiddenSize; h++) {
247
+ output[t * hiddenSize + h] = f16ToF32(cpuEmbeddings[srcOffset + h]);
248
+ }
249
+ } else {
250
+ output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
251
+ }
234
252
  }
235
253
  } else {
236
254
  for (let t = 0; t < numTokens; t++) {
237
255
  const tokenId = (tokenIdArray)[t];
238
256
  const dstOffset = t * hiddenSize;
239
257
  for (let h = 0; h < hiddenSize; h++) {
240
- output[dstOffset + h] = cpuEmbeddings[h * vocabSize + tokenId];
258
+ const raw = cpuEmbeddings[h * vocabSize + tokenId];
259
+ output[dstOffset + h] = isF16Cpu ? f16ToF32(raw) : raw;
241
260
  }
242
261
  }
243
262
  }
@@ -1,7 +1,7 @@
1
1
  import { selectRuleValue } from '../../../rules/rule-registry.js';
2
2
  import { cloneJson, isPhaseMatch, normalizeDtype, requireSessionActivationDtype, stepHasLayer } from './execution-v0-contract-helpers.js';
3
3
 
4
- const PIPELINE_COMPATIBLE_OPS = new Set([
4
+ export const PIPELINE_COMPATIBLE_OPS = new Set([
5
5
  'save',
6
6
  'load',
7
7
  'conv',
@@ -191,8 +191,15 @@ export function buildLayerPipelineFromExecution(steps) {
191
191
  if (layerSectionSteps.length === 0) {
192
192
  return null;
193
193
  }
194
- if (layerSectionSteps.some((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))) {
195
- return null;
194
+ const incompatibleOps = [
195
+ ...new Set(
196
+ layerSectionSteps
197
+ .filter((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))
198
+ .map((step) => step.op)
199
+ ),
200
+ ];
201
+ if (incompatibleOps.length > 0) {
202
+ return { incompatibleOps };
196
203
  }
197
204
 
198
205
  const layerSteps = layerSectionSteps
@@ -31,6 +31,7 @@ import {
31
31
  buildModelRuntimeOverrides,
32
32
  buildSessionRuntimePatch,
33
33
  resolveFinitenessFallbackKernelPathId,
34
+ PIPELINE_COMPATIBLE_OPS,
34
35
  } from './execution-v0-runtime-builders.js';
35
36
 
36
37
  export function hasExecutionV0(manifestInference) {
@@ -152,7 +153,17 @@ export function compileExecutionV0(options = {}) {
152
153
  numLayers,
153
154
  finitenessFallbackKernelPathId
154
155
  );
155
- const layerPipeline = buildLayerPipelineFromExecution(resolvedSteps);
156
+ const layerPipelineResult = buildLayerPipelineFromExecution(resolvedSteps);
157
+ if (layerPipelineResult?.incompatibleOps && !kernelPath) {
158
+ throw new Error(
159
+ `[ExecutionV0] manifest.inference.execution.steps contains layer ops that are not ` +
160
+ `compatible with the JS layer pipeline and no inline kernelPath was built to cover execution. ` +
161
+ `Unsupported ops: ${layerPipelineResult.incompatibleOps.join(', ')}. ` +
162
+ `Either add explicit kernel references to each step (for inline-kernel execution) ` +
163
+ `or restrict layer ops to: ${[...PIPELINE_COMPATIBLE_OPS].join(', ')}.`
164
+ );
165
+ }
166
+ const layerPipeline = layerPipelineResult?.incompatibleOps ? null : layerPipelineResult;
156
167
  const sessionPatch = buildSessionRuntimePatch(resolvedSession);
157
168
  const modelOverrides = buildModelRuntimeOverrides(manifestInference);
158
169
  for (const [path, source] of sessionSourceByPath.entries()) {
@@ -111,6 +111,7 @@ export function buildLayerContext(state, recorder, isDecodeMode, debugLayers, de
111
111
  ropeLocalCos: state.ropeLocalCos,
112
112
  ropeLocalSin: state.ropeLocalSin,
113
113
  linearAttentionRuntime: state.linearAttentionRuntime,
114
+ convLayerStates: state.convLayerStates,
114
115
  weightConfig: getWeightBufferConfig(state),
115
116
  debugFlags: state.debugFlags,
116
117
  debugProbes: state.runtimeConfig.shared.debug.probes,
@@ -139,6 +139,12 @@ export function resolveStepOptions(state, options = {}) {
139
139
  const executionPlan = resolveExecutionSessionPlan(state, options);
140
140
 
141
141
  return {
142
+ seed: resolveConfiguredValue(
143
+ options.seed,
144
+ undefined,
145
+ 'options.seed',
146
+ (value) => Number.isFinite(value) && value >= 0
147
+ ),
142
148
  temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
143
149
  topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),
144
150
  topK: resolveConfiguredValue(options.topK, samplingDefaults.topK, 'options.topK'),
@@ -165,6 +171,12 @@ export function resolveGenerateOptions(state, options = {}) {
165
171
  const executionPlan = resolveExecutionSessionPlan(state, options);
166
172
 
167
173
  return {
174
+ seed: resolveConfiguredValue(
175
+ options.seed,
176
+ undefined,
177
+ 'options.seed',
178
+ (value) => Number.isFinite(value) && value >= 0
179
+ ),
168
180
  maxTokens: executionPlan.maxTokens,
169
181
  temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
170
182
  topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),
@@ -191,6 +203,7 @@ export function resolveGenerateOptions(state, options = {}) {
191
203
  batchSize: executionPlan.batchSize,
192
204
  stopCheckMode: executionPlan.stopCheckMode,
193
205
  executionPlan,
206
+ images: options.images ?? null,
194
207
  };
195
208
  }
196
209
 
@@ -205,6 +218,7 @@ export function resolvePrefillOptions(state, options = {}) {
205
218
  disableCommandBatching: executionPlan.disableCommandBatching,
206
219
  disableMultiTokenDecode: executionPlan.disableMultiTokenDecode,
207
220
  executionPlan,
221
+ images: options.images ?? null,
208
222
  };
209
223
  }
210
224
 
@@ -12,6 +12,15 @@ export interface BatchDecodeSelectionConfig {
12
12
 
13
13
  export declare function shouldUseBatchDecode(config: BatchDecodeSelectionConfig): boolean;
14
14
 
15
+ export interface FusedDecodeSamplingConfig {
16
+ recorderEnabled: boolean;
17
+ gpuSamplingEnabled: boolean;
18
+ fusedDecodeDisabled: boolean;
19
+ layerTypes?: string[] | null;
20
+ }
21
+
22
+ export declare function shouldUseFusedDecodeSampling(config: FusedDecodeSamplingConfig): boolean;
23
+
15
24
  export declare function resolveBatchStop(
16
25
  tokens: number[],
17
26
  stopFlags: Uint32Array | null,