npm - @simulatte/doppler - Versions diffs - 0.1.7 → 0.1.9 - Mend

@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (172)

package/CHANGELOG.md +32 -0
package/README.md +25 -6
package/package.json +25 -38
package/src/browser/browser-converter.js +5 -0
package/src/client/doppler-api.browser.js +6 -0
package/src/client/doppler-api.d.ts +3 -0
package/src/client/doppler-api.js +11 -2
package/src/client/doppler-registry.js +3 -5
package/src/client/doppler-registry.json +2 -2
package/src/config/kernel-path-loader.d.ts +5 -0
package/src/config/kernel-path-loader.js +13 -0
package/src/config/kernels/kernel-ref-digests.js +23 -21
package/src/config/kernels/moe/mixtral.paths.json +46 -0
package/src/config/kernels/registry.json +74 -0
package/src/config/loader.js +9 -0
package/src/config/merge-contract-check.js +7 -0
package/src/config/platforms/loader.js +3 -1
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
package/src/config/presets/kernel-paths/registry.json +21 -0
package/src/config/presets/models/gemma2.json +2 -1
package/src/config/presets/models/gemma3.json +4 -1
package/src/config/presets/models/gemma4.json +61 -0
package/src/config/presets/models/granite-docling.json +70 -0
package/src/config/presets/models/lfm2.json +6 -1
package/src/config/presets/models/qwen3.json +4 -3
package/src/config/presets/models/qwen3_5.json +16 -0
package/src/config/presets/models/qwen3_vl.json +40 -0
package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
package/src/config/presets/runtime/modes/trace-layers.json +1 -0
package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
package/src/config/runtime.js +3 -0
package/src/config/schema/conversion.schema.d.ts +1 -0
package/src/config/schema/debug.schema.d.ts +40 -0
package/src/config/schema/debug.schema.js +28 -0
package/src/config/schema/index.js +2 -0
package/src/config/schema/inference-defaults.schema.js +1 -1
package/src/config/schema/kernel-path.schema.d.ts +1 -0
package/src/config/schema/manifest.schema.d.ts +1 -1
package/src/config/schema/manifest.schema.js +1 -1
package/src/config/schema/memory-limits.schema.js +2 -2
package/src/config/schema/storage.schema.js +2 -2
package/src/converter/conversion-plan.js +11 -3
package/src/converter/core.js +19 -8
package/src/converter/manifest-inference.js +12 -22
package/src/converter/parsers/transformer.js +4 -0
package/src/converter/quantization-info.js +5 -1
package/src/converter/quantizer.d.ts +5 -0
package/src/converter/quantizer.js +34 -12
package/src/converter/rope-config.js +8 -6
package/src/converter/tokenizer-utils.d.ts +1 -0
package/src/converter/tokenizer-utils.js +4 -1
package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
package/src/distribution/shard-delivery.js +40 -1
package/src/formats/rdrr/classification.js +32 -0
package/src/formats/rdrr/parsing.d.ts +4 -0
package/src/formats/rdrr/parsing.js +14 -1
package/src/gpu/kernel-runtime.js +4 -2
package/src/gpu/kernels/attention.js +2 -1
package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
package/src/gpu/kernels/dequant_shared.wgsl +4 -2
package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
package/src/gpu/kernels/gated-short-conv.js +284 -0
package/src/gpu/kernels/index.d.ts +8 -0
package/src/gpu/kernels/index.js +6 -0
package/src/gpu/kernels/linear-attention-core.js +37 -17
package/src/gpu/kernels/matmul-selection.js +48 -4
package/src/gpu/kernels/matmul.d.ts +5 -0
package/src/gpu/kernels/matmul.js +71 -2
package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
package/src/gpu/kernels/rmsnorm.js +9 -2
package/src/gpu/kernels/sample.js +1 -3
package/src/gpu/kernels/sample.wgsl +39 -9
package/src/gpu/kernels/sample_f16.wgsl +38 -8
package/src/gpu/kernels/shader-cache.js +9 -4
package/src/gpu/kernels/split_qg.d.ts +50 -0
package/src/gpu/kernels/split_qg.js +46 -0
package/src/gpu/kernels/split_qg.wgsl +58 -0
package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
package/src/gpu/weight-buffer.d.ts +1 -1
package/src/gpu/weight-buffer.js +1 -1
package/src/inference/browser-harness.d.ts +2 -0
package/src/inference/browser-harness.js +20 -1
package/src/inference/kv-cache/base.js +3 -10
package/src/inference/pipelines/diffusion/helpers.js +3 -0
package/src/inference/pipelines/diffusion/pipeline.js +2 -1
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
package/src/inference/pipelines/text/attention/output-projection.js +8 -0
package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
package/src/inference/pipelines/text/attention/projections.js +54 -13
package/src/inference/pipelines/text/attention/record.js +16 -6
package/src/inference/pipelines/text/attention/run.js +59 -6
package/src/inference/pipelines/text/config.d.ts +1 -0
package/src/inference/pipelines/text/config.js +46 -4
package/src/inference/pipelines/text/embed.js +26 -7
package/src/inference/pipelines/text/execution-plan.js +5 -4
package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
package/src/inference/pipelines/text/execution-v0.js +12 -1
package/src/inference/pipelines/text/generator-helpers.js +1 -0
package/src/inference/pipelines/text/generator-runtime.js +19 -0
package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
package/src/inference/pipelines/text/generator-steps.js +71 -26
package/src/inference/pipelines/text/generator.d.ts +5 -0
package/src/inference/pipelines/text/generator.js +353 -166
package/src/inference/pipelines/text/init.d.ts +15 -0
package/src/inference/pipelines/text/init.js +35 -10
package/src/inference/pipelines/text/layer.js +38 -8
package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
package/src/inference/pipelines/text/linear-attention.js +33 -3
package/src/inference/pipelines/text/logits/gpu.js +2 -2
package/src/inference/pipelines/text/logits/index.d.ts +6 -1
package/src/inference/pipelines/text/logits/index.js +3 -1
package/src/inference/pipelines/text/model-load.js +3 -0
package/src/inference/pipelines/text/moe-gpu.js +21 -3
package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
package/src/inference/pipelines/text/ops.js +123 -53
package/src/inference/pipelines/text/probes.js +1 -0
package/src/inference/pipelines/text/sampling.js +52 -6
package/src/inference/pipelines/text/state.js +2 -0
package/src/inference/pipelines/text.d.ts +5 -0
package/src/inference/pipelines/text.js +59 -1
package/src/inference/pipelines/vision/encoder.js +386 -0
package/src/inference/pipelines/vision/image-preprocess.js +151 -0
package/src/inference/pipelines/vision/index.js +173 -0
package/src/inference/pipelines/vision/ops.js +78 -0
package/src/inference/pipelines/vision/patch-embed.js +151 -0
package/src/inference/test-harness.js +11 -9
package/src/loader/doppler-loader.d.ts +3 -0
package/src/loader/doppler-loader.js +20 -3
package/src/loader/experts/expert-cache.js +6 -2
package/src/loader/experts/expert-loader.js +6 -2
package/src/loader/final-weights-loader.js +2 -0
package/src/loader/layer-loader.js +42 -3
package/src/loader/manifest-config.js +3 -1
package/src/loader/shard-cache.js +3 -2
package/src/loader/tensors/tensor-loader.d.ts +3 -0
package/src/loader/tensors/tensor-loader.js +130 -4
package/src/rules/inference/dtype.rules.json +5 -0
package/src/rules/inference/kernel-path.rules.json +2 -2
package/src/rules/kernels/moe.rules.mixtral.json +75 -0
package/src/rules/kernels/softmax.rules.json +2 -0
package/src/rules/kernels/split-qg.rules.json +6 -0
package/src/rules/rule-registry.d.ts +1 -0
package/src/rules/rule-registry.js +4 -0
package/src/storage/downloader.js +2 -1
package/src/storage/quickstart-downloader.d.ts +3 -0
package/src/storage/quickstart-downloader.js +27 -30
package/src/storage/shard-manager.js +4 -3
package/src/tooling/conversion-config-materializer.js +3 -5
package/src/tooling/node-converter.js +28 -7
package/src/tooling/node-source-runtime.js +65 -5
package/src/tooling/node-webgpu.js +24 -7
package/src/types/model.d.ts +5 -0
package/src/utils/hf-resolve-url.d.ts +16 -0
package/src/utils/hf-resolve-url.js +17 -0
package/src/version.js +1 -1
package/tools/doppler-cli.js +6 -1
package/src/tooling/node-convert.d.ts +0 -54

package/src/gpu/kernels/sample.wgsl CHANGED Viewed

@@ -40,6 +40,16 @@ fn apply_softcap(x: f32, softcap: f32) -> f32 {
     return softcap * tanh(x / softcap);
 }
+fn candidate_beats(candidate_value: f32, candidate_index: u32, best_value: f32, best_index: u32) -> bool {
+    if (candidate_value > best_value) {
+        return true;
+    }
+    if (candidate_value < best_value) {
+        return false;
+    }
+    return candidate_index < best_index;
+}
 @group(0) @binding(0) var<uniform> u: Uniforms;
 @group(0) @binding(1) var<storage, read> logits: array<f32>;              // [vocabSize]
 @group(0) @binding(2) var<storage, read_write> output: array<u32>;         // [N] - selected tokens
@@ -87,7 +97,7 @@ fn find_topk_phase1(
         if (idx != pad_id) {
             // Apply softcapping before temperature scaling
             let val = apply_softcap(logits[idx], softcap) / temperature;
-            if (val > local_max) {
+            if (candidate_beats(val, idx, local_max, local_max_idx)) {
                 local_max = val;
                 local_max_idx = idx;
             }
@@ -103,7 +113,12 @@ fn find_topk_phase1(
     var stride = WORKGROUP_SIZE / 2u;
     while (stride > 0u) {
         if (thread_idx < stride) {
-            if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
+            if (candidate_beats(
+                shared_values[thread_idx + stride],
+                shared_indices[thread_idx + stride],
+                shared_values[thread_idx],
+                shared_indices[thread_idx]
+            )) {
                 shared_values[thread_idx] = shared_values[thread_idx + stride];
                 shared_indices[thread_idx] = shared_indices[thread_idx + stride];
             }
@@ -150,7 +165,7 @@ fn find_topk_phase2(
             var max_val = shared_values[k];
             for (var i: u32 = k + 1u; i < num_candidates; i = i + 1u) {
-                if (shared_values[i] > max_val) {
+                if (candidate_beats(shared_values[i], shared_indices[i], max_val, shared_indices[max_idx])) {
                     max_val = shared_values[i];
                     max_idx = i;
                 }
@@ -249,7 +264,7 @@ fn sample_single_pass(
         if (idx != pad_id) {
             // Apply softcapping before temperature scaling
             let val = apply_softcap(logits[idx], softcap) / temperature;
-            if (val > local_max) {
+            if (candidate_beats(val, idx, local_max, local_max_idx)) {
                 local_max = val;
                 local_max_idx = idx;
             }
@@ -265,7 +280,12 @@ fn sample_single_pass(
     var stride = WORKGROUP_SIZE / 2u;
     while (stride > 0u) {
         if (thread_idx < stride) {
-            if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
+            if (candidate_beats(
+                shared_values[thread_idx + stride],
+                shared_indices[thread_idx + stride],
+                shared_values[thread_idx],
+                shared_indices[thread_idx]
+            )) {
                 shared_values[thread_idx] = shared_values[thread_idx + stride];
                 shared_indices[thread_idx] = shared_indices[thread_idx + stride];
             }
@@ -308,7 +328,7 @@ fn argmax(
         if (idx != pad_id) {
             // Apply softcapping (argmax is greedy, no temperature)
             let val = apply_softcap(logits[idx], softcap);
-            if (val > local_max) {
+            if (candidate_beats(val, idx, local_max, local_max_idx)) {
                 local_max = val;
                 local_max_idx = idx;
             }
@@ -324,7 +344,12 @@ fn argmax(
     var stride = WORKGROUP_SIZE / 2u;
     while (stride > 0u) {
         if (thread_idx < stride) {
-            if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
+            if (candidate_beats(
+                shared_values[thread_idx + stride],
+                shared_indices[thread_idx + stride],
+                shared_values[thread_idx],
+                shared_indices[thread_idx]
+            )) {
                 shared_values[thread_idx] = shared_values[thread_idx + stride];
                 shared_indices[thread_idx] = shared_indices[thread_idx + stride];
             }
@@ -362,7 +387,12 @@ fn argmax_reduce(
     var stride = WORKGROUP_SIZE / 2u;
     while (stride > 0u) {
         if (thread_idx < stride) {
-            if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
+            if (candidate_beats(
+                shared_values[thread_idx + stride],
+                shared_indices[thread_idx + stride],
+                shared_values[thread_idx],
+                shared_indices[thread_idx]
+            )) {
                 shared_values[thread_idx] = shared_values[thread_idx + stride];
                 shared_indices[thread_idx] = shared_indices[thread_idx + stride];
             }
@@ -374,4 +404,4 @@ fn argmax_reduce(
     if (thread_idx == 0u) {
         output[u.output_index] = shared_indices[0];
     }
-}
+}

package/src/gpu/kernels/sample_f16.wgsl CHANGED Viewed

@@ -34,6 +34,16 @@ fn apply_softcap(x: f32, softcap: f32) -> f32 {
     return softcap * tanh(x / softcap);
 }
+fn candidate_beats(candidate_value: f32, candidate_index: u32, best_value: f32, best_index: u32) -> bool {
+    if (candidate_value > best_value) {
+        return true;
+    }
+    if (candidate_value < best_value) {
+        return false;
+    }
+    return candidate_index < best_index;
+}
 @group(0) @binding(0) var<uniform> u: Uniforms;
 @group(0) @binding(1) var<storage, read> logits: array<f16>;
 @group(0) @binding(2) var<storage, read_write> output: array<u32>;
@@ -74,7 +84,7 @@ fn find_topk_phase1(
     while (idx < vocab_size) {
         if (idx != pad_id) {
             let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
-            if (val > local_max) {
+            if (candidate_beats(val, idx, local_max, local_max_idx)) {
                 local_max = val;
                 local_max_idx = idx;
             }
@@ -89,7 +99,12 @@ fn find_topk_phase1(
     var stride = WORKGROUP_SIZE / 2u;
     while (stride > 0u) {
         if (thread_idx < stride) {
-            if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
+            if (candidate_beats(
+                shared_values[thread_idx + stride],
+                shared_indices[thread_idx + stride],
+                shared_values[thread_idx],
+                shared_indices[thread_idx]
+            )) {
                 shared_values[thread_idx] = shared_values[thread_idx + stride];
                 shared_indices[thread_idx] = shared_indices[thread_idx + stride];
             }
@@ -130,7 +145,7 @@ fn find_topk_phase2(
             var max_val = shared_values[k];
             for (var i: u32 = k + 1u; i < num_candidates; i = i + 1u) {
-                if (shared_values[i] > max_val) {
+                if (candidate_beats(shared_values[i], shared_indices[i], max_val, shared_indices[max_idx])) {
                     max_val = shared_values[i];
                     max_idx = i;
                 }
@@ -218,7 +233,7 @@ fn sample_single_pass(
     while (idx < vocab_size) {
         if (idx != pad_id) {
             let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
-            if (val > local_max) {
+            if (candidate_beats(val, idx, local_max, local_max_idx)) {
                 local_max = val;
                 local_max_idx = idx;
             }
@@ -233,7 +248,12 @@ fn sample_single_pass(
     var stride = WORKGROUP_SIZE / 2u;
     while (stride > 0u) {
         if (thread_idx < stride) {
-            if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
+            if (candidate_beats(
+                shared_values[thread_idx + stride],
+                shared_indices[thread_idx + stride],
+                shared_values[thread_idx],
+                shared_indices[thread_idx]
+            )) {
                 shared_values[thread_idx] = shared_values[thread_idx + stride];
                 shared_indices[thread_idx] = shared_indices[thread_idx + stride];
             }
@@ -267,7 +287,7 @@ fn argmax(
     while (idx < vocab_size) {
         if (idx != pad_id) {
             let val = apply_softcap(f32(logits[idx]), softcap);
-            if (val > local_max) {
+            if (candidate_beats(val, idx, local_max, local_max_idx)) {
                 local_max = val;
                 local_max_idx = idx;
             }
@@ -282,7 +302,12 @@ fn argmax(
     var stride = WORKGROUP_SIZE / 2u;
     while (stride > 0u) {
         if (thread_idx < stride) {
-            if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
+            if (candidate_beats(
+                shared_values[thread_idx + stride],
+                shared_indices[thread_idx + stride],
+                shared_values[thread_idx],
+                shared_indices[thread_idx]
+            )) {
                 shared_values[thread_idx] = shared_values[thread_idx + stride];
                 shared_indices[thread_idx] = shared_indices[thread_idx + stride];
             }
@@ -316,7 +341,12 @@ fn argmax_reduce(
     var stride = WORKGROUP_SIZE / 2u;
     while (stride > 0u) {
         if (thread_idx < stride) {
-            if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
+            if (candidate_beats(
+                shared_values[thread_idx + stride],
+                shared_indices[thread_idx + stride],
+                shared_values[thread_idx],
+                shared_indices[thread_idx]
+            )) {
                 shared_values[thread_idx] = shared_values[thread_idx + stride];
                 shared_indices[thread_idx] = shared_indices[thread_idx + stride];
             }

package/src/gpu/kernels/shader-cache.js CHANGED Viewed

@@ -133,10 +133,15 @@ export async function compileShader(
   source,
   label
 ) {
-  const module = device.createShaderModule({
-    label,
-    code: source,
-  });
+  let module;
+  try {
+    module = device.createShaderModule({
+      label,
+      code: source,
+    });
+  } catch (err) {
+    throw new Error(`createShaderModule failed for "${label}": ${err.message}`);
+  }
   // Check for compilation errors (getCompilationInfo not available in all WebGPU providers)
   const compilationInfo = typeof module.getCompilationInfo === 'function'

package/src/gpu/kernels/split_qg.d.ts ADDED Viewed

@@ -0,0 +1,50 @@
+/**
+ * Split Q and Gate Kernel
+ *
+ * De-interleaves Q and Gate projections from q_proj output for attentionOutputGate models.
+ * Models like Qwen 3.5 store q_proj weights in per-head interleaved layout:
+ *   rows [h*headDim*2 : h*headDim*2+headDim] = Q for head h
+ *   rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
+ * This kernel separates the full matmul output into contiguous Q and Gate tensors.
+ */
+import type { Tensor } from '../tensor.js';
+import type { CommandRecorder } from '../command-recorder.js';
+/** Split Q and Gate options */
+export interface SplitQGOptions {
+  numTokens: number;
+  numHeads: number;
+  headDim: number;
+  /** Pre-allocated Q output tensor */
+  qTensor?: Tensor | null;
+  /** Pre-allocated Gate output tensor */
+  gTensor?: Tensor | null;
+}
+/** Split Q and Gate result */
+export interface SplitQGResult {
+  Q: Tensor;
+  G: Tensor;
+}
+/**
+ * De-interleave Q and Gate from q_proj output.
+ *
+ * @param qgTensor - Full q_proj output [numTokens, numHeads * headDim * 2] (interleaved)
+ * @param options - Split configuration
+ * @returns Separate Q and Gate tensors, each [numTokens, numHeads * headDim]
+ */
+export declare function runSplitQG(
+  qgTensor: Tensor,
+  options: SplitQGOptions
+): Promise<SplitQGResult>;
+/**
+ * Record split Q and Gate (batched, no submit).
+ */
+export declare function recordSplitQG(
+  recorder: CommandRecorder,
+  qgTensor: Tensor,
+  options: SplitQGOptions
+): Promise<SplitQGResult>;

package/src/gpu/kernels/split_qg.js ADDED Viewed

@@ -0,0 +1,46 @@
+import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
+import { createTensor, dtypeBytes } from '../tensor.js';
+import { WORKGROUP_SIZES } from './constants.js';
+import { unifiedKernelWrapper } from './utils.js';
+import { selectRuleValue } from './rule-registry.js';
+async function _splitQG(target, qgTensor, options) {
+  const { numTokens, numHeads, headDim, qTensor = null, gTensor = null } = options;
+  const ownsQ = qTensor == null;
+  const ownsG = gTensor == null;
+  const outputDtype = qgTensor.dtype;
+  const pipelineVariant = selectRuleValue('splitQg', 'variant', { outputDtype });
+  const bytesPerElement = dtypeBytes(outputDtype);
+  const qSize = numHeads * headDim;
+  const qBuffer = qTensor?.buffer || acquireBuffer(numTokens * qSize * bytesPerElement, undefined, 'Q');
+  const gBuffer = gTensor?.buffer || acquireBuffer(numTokens * qSize * bytesPerElement, undefined, 'Q_gate');
+  try {
+    await unifiedKernelWrapper(
+      'split_qg', target, pipelineVariant,
+      [qgTensor, qBuffer, gBuffer],
+      { num_tokens: numTokens, num_heads: numHeads, head_dim: headDim, _pad: 0 },
+      Math.ceil((numTokens * qSize) / WORKGROUP_SIZES.DEFAULT)
+    );
+    const Q = qTensor || createTensor(qBuffer, outputDtype, [numTokens, qSize], 'Q');
+    const G = gTensor || createTensor(gBuffer, outputDtype, [numTokens, qSize], 'Q_gate');
+    return { Q, G };
+  } catch (error) {
+    if (ownsQ) releaseBuffer(qBuffer);
+    if (ownsG) releaseBuffer(gBuffer);
+    throw error;
+  }
+}
+export async function runSplitQG(qgTensor, options) {
+  return _splitQG(null, qgTensor, options);
+}
+export async function recordSplitQG(recorder, qgTensor, options) {
+  return _splitQG(recorder, qgTensor, options);
+}

package/src/gpu/kernels/split_qg.wgsl ADDED Viewed

@@ -0,0 +1,58 @@
+// split_qg.wgsl
+/**
+ * De-interleave Q and Gate projections from q_proj output for attentionOutputGate models.
+ *
+ * Models like Qwen 3.5 store q_proj weights with interleaved head layout:
+ *   rows [h*headDim*2 : h*headDim*2+headDim]     = Q for head h
+ *   rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
+ *
+ * A single full matmul over all 2*qSize rows produces interleaved output:
+ *   input[token, h*headDim*2 : h*headDim*2+headDim]     = Q head h
+ *   input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
+ *
+ * This kernel separates them into contiguous Q and G outputs:
+ *   Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
+ *   G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
+ *
+ * Input layout  (row-major): [numTokens, numHeads * headDim * 2]
+ * Output Q layout (row-major): [numTokens, numHeads * headDim]
+ * Output G layout (row-major): [numTokens, numHeads * headDim]
+ */
+struct Params {
+    num_tokens: u32,
+    num_heads: u32,
+    head_dim: u32,
+    _pad: u32,
+}
+override WORKGROUP_SIZE: u32 = 256u;
+@group(0) @binding(0) var<uniform> params: Params;
+@group(0) @binding(1) var<storage, read> input: array<f32>;
+@group(0) @binding(2) var<storage, read_write> Q: array<f32>;
+@group(0) @binding(3) var<storage, read_write> G: array<f32>;
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let idx = gid.x;
+    let q_size = params.num_heads * params.head_dim;
+    let total_elements = params.num_tokens * q_size;
+    if (idx >= total_elements) {
+        return;
+    }
+    let token = idx / q_size;
+    let elem = idx % q_size;
+    let head = elem / params.head_dim;
+    let dim = elem % params.head_dim;
+    // Input is interleaved per head: [Q_h (headDim elems), G_h (headDim elems)]
+    let src_q = token * (q_size * 2u) + head * (params.head_dim * 2u) + dim;
+    let src_g = src_q + params.head_dim;
+    Q[idx] = input[src_q];
+    G[idx] = input[src_g];
+}

package/src/gpu/kernels/split_qg_f16.wgsl ADDED Viewed

@@ -0,0 +1,62 @@
+// AUTO-GENERATED from src/gpu/kernels/split_qg.wgsl.
+// Edit the source kernel and tools/configs/wgsl-variants.js, then run `npm run kernels:generate`.
+// split_qg_f16.wgsl
+/**
+ * De-interleave Q and Gate projections from q_proj output for attentionOutputGate models (f16).
+ *
+ * Models like Qwen 3.5 store q_proj weights with interleaved head layout:
+ *   rows [h*headDim*2 : h*headDim*2+headDim]     = Q for head h
+ *   rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
+ *
+ * A single full matmul over all 2*qSize rows produces interleaved output:
+ *   input[token, h*headDim*2 : h*headDim*2+headDim]     = Q head h
+ *   input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
+ *
+ * This kernel separates them into contiguous Q and G outputs:
+ *   Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
+ *   G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
+ *
+ * Input layout  (row-major): [numTokens, numHeads * headDim * 2]
+ * Output Q layout (row-major): [numTokens, numHeads * headDim]
+ * Output G layout (row-major): [numTokens, numHeads * headDim]
+ */
+enable f16;
+struct Params {
+    num_tokens: u32,
+    num_heads: u32,
+    head_dim: u32,
+    _pad: u32,
+}
+override WORKGROUP_SIZE: u32 = 256u;
+@group(0) @binding(0) var<uniform> params: Params;
+@group(0) @binding(1) var<storage, read> input: array<f16>;
+@group(0) @binding(2) var<storage, read_write> Q: array<f16>;
+@group(0) @binding(3) var<storage, read_write> G: array<f16>;
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let idx = gid.x;
+    let q_size = params.num_heads * params.head_dim;
+    let total_elements = params.num_tokens * q_size;
+    if (idx >= total_elements) {
+        return;
+    }
+    let token = idx / q_size;
+    let elem = idx % q_size;
+    let head = elem / params.head_dim;
+    let dim = elem % params.head_dim;
+    // Input is interleaved per head: [Q_h (headDim elems), G_h (headDim elems)]
+    let src_q = token * (q_size * 2u) + head * (params.head_dim * 2u) + dim;
+    let src_g = src_q + params.head_dim;
+    Q[idx] = input[src_q];
+    G[idx] = input[src_g];
+}

package/src/gpu/weight-buffer.d.ts CHANGED Viewed

@@ -110,6 +110,6 @@ export function getBuffer(weight: GPUBuffer | WeightBuffer | TensorLike): GPUBuf
 export function getLayout(weight: GPUBuffer | WeightBuffer | TensorLike): WeightLayout | null;
 /**
- * Get dtype from WeightBuffer, or null for raw GPUBuffer.
+ * Get dtype from WeightBuffer, tagged raw GPUBuffer, or TensorLike.
  */
 export function getWeightDtype(weight: GPUBuffer | WeightBuffer | TensorLike): WeightDtype | TensorLike['dtype'] | null;

package/src/gpu/weight-buffer.js CHANGED Viewed

@@ -114,5 +114,5 @@ export function getLayout(weight) {
 export function getWeightDtype(weight) {
   if (isWeightBuffer(weight)) return weight.dtype;
   if (isTensorLike(weight)) return weight.dtype;
-  return null;
+  return getBufferDtype(weight);
 }

package/src/inference/browser-harness.d.ts CHANGED Viewed

@@ -9,6 +9,7 @@ import type { InferencePipeline } from './pipelines/text.js';
 import type { DiffusionPipeline } from './pipelines/diffusion/pipeline.js';
 import type { EnergyPipeline } from './pipelines/energy/pipeline.js';
 import type { SavedReportInfo, SaveReportOptions } from '../storage/reports.js';
+import type { DebugSnapshot } from '../debug/history.js';
 export interface BrowserHarnessOptions extends InferenceHarnessOptions {
   modelUrl: string;
@@ -143,6 +144,7 @@ export interface BrowserSuiteResult extends SuiteSummary {
   output?: string | DiffusionOutput | null;
   deviceInfo?: Record<string, unknown> | null;
   memoryStats?: ReturnType<InferencePipeline['getMemoryStats']> | null;
+  debugSnapshot?: DebugSnapshot | null;
   pipeline?: InferencePipeline | DiffusionPipeline | EnergyPipeline | null;
   report: Record<string, unknown>;
   reportInfo: SavedReportInfo;

package/src/inference/browser-harness.js CHANGED Viewed

@@ -2,6 +2,7 @@
 import { initializeInference } from './test-harness.js';
 import { saveReport } from '../storage/reports.js';
 import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
+import { clearLogHistory, getDebugSnapshot } from '../debug/history.js';
 import { computeSampleStats } from '../debug/stats.js';
 import {
   setActiveKernelPath,
@@ -846,15 +847,32 @@ async function dispatchBrowserSuite(suite, options) {
   return null;
 }
+function shouldCaptureDebugSnapshot(suite, runtimeConfig) {
+  const debug = runtimeConfig?.shared?.debug ?? {};
+  const logLevel = String(debug.logLevel?.defaultLogLevel ?? '').toLowerCase();
+  return suite === 'debug'
+    || debug.trace?.enabled === true
+    || debug.pipeline?.enabled === true
+    || (Array.isArray(debug.probes) && debug.probes.length > 0)
+    || debug.profiler?.enabled === true
+    || logLevel === 'debug'
+    || logLevel === 'verbose';
+}
 export async function runBrowserSuite(options = {}) {
   return runWithRuntimeIsolationForSuite(async () => {
     const suiteTimestamp = resolveReportTimestamp(options.timestamp, 'runBrowserSuite timestamp');
     const suiteContext = resolveSuiteContext(options);
     const suite = normalizeSuite(options.suite, suiteContext);
+    const captureDebugSnapshot = shouldCaptureDebugSnapshot(suite, getRuntimeConfig());
+    if (captureDebugSnapshot) {
+      clearLogHistory();
+    }
     const suiteResult = await dispatchBrowserSuite(suite, options);
     if (!suiteResult) {
       throw createUnsupportedSuiteError(suite, suiteContext);
     }
+    const debugSnapshot = captureDebugSnapshot ? getDebugSnapshot() : null;
     if (suite === 'bench' && suiteResult?.metrics?.workloadType === 'training') {
       const trainingReport = suiteResult?.metrics?.trainingMetricsReport;
@@ -886,6 +904,7 @@ export async function runBrowserSuite(options = {}) {
       metrics: suiteResult.metrics ?? null,
       output: reportOutput,
       memory: suiteResult.memoryStats ?? null,
+      debugSnapshot,
       ...options.report,
     };
     if (ulArtifacts.length > 0 || distillArtifacts.length > 0 || checkpointResumeTimeline.length > 0) {
@@ -907,7 +926,7 @@ export async function runBrowserSuite(options = {}) {
       report.timestamp = suiteTimestamp;
     }
     const reportInfo = await saveReport(modelId, report, { timestamp: report.timestamp });
-    return { ...suiteResult, report, reportInfo };
+    return { ...suiteResult, debugSnapshot, report, reportInfo };
   });
 }

package/src/inference/kv-cache/base.js CHANGED Viewed

@@ -314,10 +314,7 @@ export class KVCache {
     layer.seqLen = Math.max(layer.seqLen, startPos + numNewTokens);
     this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numNewTokens);
-    // Update global sequence length if this is the last layer
-    if (layerIdx === this.numLayers - 1) {
-      this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
-    }
+    this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
   }
@@ -374,9 +371,7 @@ export class KVCache {
     layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
     this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
-    if (layerIdx === this.numLayers - 1) {
-      this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
-    }
+    this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
   }
@@ -433,9 +428,7 @@ export class KVCache {
     layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
     this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
-    if (layerIdx === this.numLayers - 1) {
-      this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
-    }
+    this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
   }

package/src/inference/pipelines/diffusion/helpers.js CHANGED Viewed

@@ -89,6 +89,9 @@ export function normalizeDiffusionMatmulLocationDtype(dtype) {
   return normalized;
 }
+// Artifact-derived dtype inference: determines actual storage dtype from buffer byte size.
+// This is NOT a config-bypass — it reads physical buffer dimensions (artifact-derived config),
+// which is a valid merge layer per the config merge contract.
 export function inferDiffusionMatmulDtypeFromBuffer(weight, N, K, preferred) {
   const buffer = getBuffer(weight);
   if (!buffer || !Number.isFinite(N) || !Number.isFinite(K)) return preferred;

package/src/inference/pipelines/diffusion/pipeline.js CHANGED Viewed

@@ -28,6 +28,7 @@ import { runResidualAdd, runScale, recordResidualAdd, recordScale } from '../../
 import { f16ToF32 } from '../../../loader/dtype-utils.js';
 const SUPPORTED_DIFFUSION_BACKEND_PIPELINES = new Set(['gpu']);
+const DEFAULT_TIME_EMBED_DIM = 256;
 const SD3_TEXT_ENCODER_KEYS = ['text_encoder', 'text_encoder_2', 'text_encoder_3'];
 const SANA_TEXT_ENCODER_KEYS = ['text_encoder'];
@@ -492,7 +493,7 @@ export class DiffusionPipeline {
     const hiddenSize = (transformerConfig.num_attention_heads ?? 0) * (transformerConfig.attention_head_dim ?? 0);
     const patchSize = transformerConfig.patch_size ?? 2;
     const timeEmbedWeight = transformerResolver.get('time_text_embed.timestep_embedder.linear_1.weight');
-    const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? 256;
+    const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? DEFAULT_TIME_EMBED_DIM;
     if (!Number.isFinite(hiddenSize) || hiddenSize <= 0) {
       throw new Error('Diffusion transformer config missing num_attention_heads/attention_head_dim.');
     }

package/src/inference/pipelines/diffusion/text-encoder-gpu.js CHANGED Viewed

@@ -44,7 +44,10 @@ import { initRoPEFrequencies } from '../text/init.js';
 import { processLayerGPU } from '../text/layer.js';
 const QUICK_GELU_ALPHA = 1.702;
+const DEFAULT_TIMESTEP_EMBED_DIM = 256;
 const SUPPORTED_CLIP_HIDDEN_ACTIVATIONS = new Set(['gelu', 'quick_gelu']);
+// Standard CLIP hidden activation per OpenAI CLIP specification.
+const DEFAULT_CLIP_HIDDEN_ACT = 'gelu';
 function padTokens(tokens, maxLength, padTokenId) {
   if (!Number.isFinite(maxLength) || maxLength <= 0) {
@@ -100,11 +103,15 @@ function createVectorTensor(device, data, dtype, label) {
   return createTensor(buffer, dtype, [1, length], label);
 }
+// Conservative fallback dtype for diffusion bias tensors when no dtype
+// metadata is available. F32 avoids precision loss in bias additions.
+const DEFAULT_BIAS_DTYPE = 'f32';
 function resolveBiasDtype(weight, weightsEntry, key) {
   if (weight && weight.dtype) return weight.dtype;
   const locationDtype = weightsEntry?.dtypes?.get(key);
   const mapped = normalizeDiffusionLocationDtype(locationDtype);
-  return mapped || 'f32';
+  return mapped || DEFAULT_BIAS_DTYPE;
 }
 function createBiasTensorWithDtype(weight, weightsEntry, key, size, label) {
@@ -145,7 +152,7 @@ function createKernelOps(recorder) {
 }
 function resolveClipHiddenActivation(config) {
-  const hiddenAct = config?.hidden_act ?? 'gelu';
+  const hiddenAct = config?.hidden_act ?? DEFAULT_CLIP_HIDDEN_ACT;
   if (!SUPPORTED_CLIP_HIDDEN_ACTIVATIONS.has(hiddenAct)) {
     throw new Error(
       `Unsupported CLIP hidden_act "${hiddenAct}". ` +
@@ -1099,7 +1106,7 @@ export async function buildTimestepEmbedding(timestep, weightsEntry, modelConfig
   const device = getDevice();
   if (!device) throw new Error('Timestep embedding requires a WebGPU device.');
-  const dim = options.dim ?? 256;
+  const dim = options.dim ?? DEFAULT_TIMESTEP_EMBED_DIM;
   const half = Math.floor(dim / 2);
   const emb = new Float32Array(dim);
   const maxPeriod = 10000;

package/src/inference/pipelines/text/attention/output-projection.d.ts ADDED Viewed

@@ -0,0 +1,12 @@
+import type { Tensor } from '../../../../gpu/tensor.js';
+export interface AttentionProjectionInputResult {
+  oProjInput: Tensor;
+  oProjInputTemp: Tensor | null;
+}
+export function prepareAttentionProjectionInput(
+  attnForProjection: Tensor,
+  matmulOutputDtype: string,
+  castToF16: (tensor: Tensor) => Promise<Tensor>
+): Promise<AttentionProjectionInputResult>;