@simulatte/doppler 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/CHANGELOG.md +14 -1
  2. package/README.md +25 -6
  3. package/package.json +5 -3
  4. package/src/client/doppler-api.browser.js +6 -0
  5. package/src/client/doppler-api.d.ts +3 -0
  6. package/src/client/doppler-api.js +11 -2
  7. package/src/client/doppler-registry.js +3 -5
  8. package/src/client/doppler-registry.json +16 -0
  9. package/src/config/kernels/kernel-ref-digests.js +23 -21
  10. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  11. package/src/config/loader.js +6 -0
  12. package/src/config/platforms/loader.js +3 -1
  13. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  14. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  15. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  16. package/src/config/presets/kernel-paths/registry.json +7 -0
  17. package/src/config/presets/models/gemma3.json +2 -1
  18. package/src/config/presets/models/gemma4.json +61 -0
  19. package/src/config/presets/models/granite-docling.json +70 -0
  20. package/src/config/presets/models/lfm2.json +6 -1
  21. package/src/config/presets/models/qwen3_vl.json +40 -0
  22. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  23. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  24. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  25. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  26. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  27. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  28. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  29. package/src/config/runtime.js +3 -0
  30. package/src/config/schema/debug.schema.d.ts +40 -0
  31. package/src/config/schema/debug.schema.js +28 -0
  32. package/src/config/schema/index.js +2 -0
  33. package/src/config/schema/inference-defaults.schema.js +1 -1
  34. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  35. package/src/config/schema/memory-limits.schema.js +2 -2
  36. package/src/config/schema/storage.schema.js +1 -1
  37. package/src/converter/conversion-plan.js +1 -1
  38. package/src/converter/core.js +17 -8
  39. package/src/converter/quantizer.d.ts +5 -0
  40. package/src/converter/quantizer.js +15 -0
  41. package/src/distribution/shard-delivery.js +34 -0
  42. package/src/formats/rdrr/classification.js +32 -0
  43. package/src/gpu/kernel-runtime.js +4 -2
  44. package/src/gpu/kernels/attention.js +2 -1
  45. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  46. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  47. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  48. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  49. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  50. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  51. package/src/gpu/kernels/gated-short-conv.js +284 -0
  52. package/src/gpu/kernels/linear-attention-core.js +37 -17
  53. package/src/gpu/kernels/matmul-selection.js +1 -0
  54. package/src/gpu/kernels/matmul.d.ts +3 -0
  55. package/src/gpu/kernels/matmul.js +70 -1
  56. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  57. package/src/gpu/kernels/sample.js +1 -3
  58. package/src/gpu/kernels/sample.wgsl +39 -9
  59. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  60. package/src/gpu/kernels/shader-cache.js +9 -4
  61. package/src/inference/kv-cache/base.js +3 -10
  62. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  63. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
  64. package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
  65. package/src/inference/pipelines/text/attention/projections.js +13 -2
  66. package/src/inference/pipelines/text/attention/record.js +1 -0
  67. package/src/inference/pipelines/text/attention/run.js +9 -0
  68. package/src/inference/pipelines/text/config.d.ts +1 -0
  69. package/src/inference/pipelines/text/config.js +32 -4
  70. package/src/inference/pipelines/text/embed.js +26 -7
  71. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  72. package/src/inference/pipelines/text/execution-v0.js +12 -1
  73. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  74. package/src/inference/pipelines/text/generator-runtime.js +14 -0
  75. package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
  76. package/src/inference/pipelines/text/generator-steps.js +46 -29
  77. package/src/inference/pipelines/text/generator.d.ts +5 -0
  78. package/src/inference/pipelines/text/generator.js +320 -166
  79. package/src/inference/pipelines/text/init.d.ts +2 -0
  80. package/src/inference/pipelines/text/init.js +19 -5
  81. package/src/inference/pipelines/text/layer.js +37 -8
  82. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  83. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  84. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  85. package/src/inference/pipelines/text/ops.js +123 -53
  86. package/src/inference/pipelines/text/probes.js +1 -0
  87. package/src/inference/pipelines/text/state.js +2 -0
  88. package/src/inference/pipelines/text.d.ts +5 -0
  89. package/src/inference/pipelines/text.js +59 -1
  90. package/src/inference/pipelines/vision/encoder.js +386 -0
  91. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  92. package/src/inference/pipelines/vision/index.js +173 -0
  93. package/src/inference/pipelines/vision/ops.js +78 -0
  94. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  95. package/src/inference/test-harness.js +9 -7
  96. package/src/loader/doppler-loader.d.ts +3 -0
  97. package/src/loader/doppler-loader.js +20 -3
  98. package/src/loader/experts/expert-cache.js +6 -2
  99. package/src/loader/experts/expert-loader.js +6 -2
  100. package/src/loader/layer-loader.js +42 -3
  101. package/src/loader/manifest-config.js +3 -1
  102. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  103. package/src/loader/tensors/tensor-loader.js +124 -3
  104. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  105. package/src/rules/kernels/softmax.rules.json +2 -0
  106. package/src/rules/rule-registry.d.ts +1 -0
  107. package/src/rules/rule-registry.js +2 -0
  108. package/src/storage/quickstart-downloader.d.ts +3 -0
  109. package/src/storage/quickstart-downloader.js +27 -30
  110. package/src/tooling/node-converter.js +25 -7
  111. package/src/tooling/node-source-runtime.js +29 -5
  112. package/src/tooling/node-webgpu.js +24 -7
  113. package/src/utils/hf-resolve-url.d.ts +16 -0
  114. package/src/utils/hf-resolve-url.js +17 -0
  115. package/src/version.js +1 -1
  116. package/src/tooling/node-convert.d.ts +0 -54
@@ -92,6 +92,13 @@
92
92
  "statusReason": "default",
93
93
  "notes": "Gemma 3 Q4K dequant default: subgroup GEMV + online attention + tuned lm_head multicol, F32 activations."
94
94
  },
95
+ {
96
+ "id": "gemma3-q4k-dequant-f32a-small-attn",
97
+ "file": "gemma3-q4k-dequant-f32a-small-attn.json",
98
+ "status": "experimental",
99
+ "statusReason": "diagnostic-probe",
100
+ "notes": "Diagnostic: same as gemma3-q4k-dequant-f32a-online but uses attention_small_f16kv.wgsl for prefill to isolate the streaming attention bug."
101
+ },
95
102
  {
96
103
  "id": "gemma3-q4k-dequant-f32w-f32a-online",
97
104
  "file": "gemma3-q4k-dequant-f32w-f32a-online.json",
@@ -34,7 +34,8 @@
34
34
  },
35
35
  "rope": {
36
36
  "ropeTheta": 1000000,
37
- "ropeLocalTheta": 10000
37
+ "ropeLocalTheta": 10000,
38
+ "ropeLocalScalingFactor": 1.0
38
39
  },
39
40
  "chatTemplate": {
40
41
  "type": "gemma",
@@ -0,0 +1,61 @@
1
+ {
2
+ "id": "gemma4",
3
+ "name": "Gemma 4",
4
+ "extends": "gemma3",
5
+ "modelType": "mixtral",
6
+
7
+ "inference": {
8
+ "attention": {
9
+ "slidingWindow": 1024
10
+ },
11
+ "rope": {
12
+ "ropeTheta": 1000000,
13
+ "ropeLocalTheta": 10000,
14
+ "ropeScalingType": "yarn",
15
+ "ropeScalingFactor": 8.0,
16
+ "yarnBetaFast": 4.0,
17
+ "yarnBetaSlow": 1.0,
18
+ "yarnOriginalMaxPos": 32768
19
+ },
20
+ "moe": {
21
+ "kernelProfileId": "mixtral-moe-v1",
22
+ "numExperts": 8,
23
+ "topK": 2,
24
+ "numSharedExperts": 0,
25
+ "routerDtype": "f32",
26
+ "supportedActivationDtypes": ["f16", "f32"],
27
+ "preferredActivationDtype": "f32",
28
+ "tensorPattern": "mixtral"
29
+ },
30
+ "kernelPaths": {
31
+ "q4k": {
32
+ "default": "gemma3-q4k-dequant-f32a-online",
33
+ "f16": "gemma3-q4k-dequant-f16a-online",
34
+ "f16a": "gemma3-q4k-dequant-f16a-online",
35
+ "f32": "gemma3-q4k-dequant-f32a-online"
36
+ }
37
+ }
38
+ },
39
+
40
+ "tensorPatterns": {
41
+ "ffn": {
42
+ "gate": ["layers.{layer}.block_sparse_moe.experts.{expert}.w1.weight"],
43
+ "up": ["layers.{layer}.block_sparse_moe.experts.{expert}.w3.weight"],
44
+ "down": ["layers.{layer}.block_sparse_moe.experts.{expert}.w2.weight"]
45
+ }
46
+ },
47
+
48
+ "detection": {
49
+ "architecturePatterns": [
50
+ "gemma4",
51
+ "Gemma4ForCausalLM",
52
+ "Gemma4ForConditionalGeneration",
53
+ "gemma-4"
54
+ ],
55
+ "modelTypePatterns": [
56
+ "gemma4",
57
+ "gemma4_text",
58
+ "gemma4_moe"
59
+ ]
60
+ }
61
+ }
@@ -0,0 +1,70 @@
1
+ {
2
+ "id": "granite-docling",
3
+ "name": "Granite-Docling (Document OCR VLM)",
4
+ "extends": "transformer",
5
+ "modelType": "ocr",
6
+
7
+ "_notes": "Stabilized successor to SmolDocling-256M-preview. Requires full multimodal pipeline: SigLIP vision encoder, Idefics3-style image-token merge, pixel-shuffle connector, SmolLM2 decoder, DocTags output parsing. This preset covers the decoder config only — vision encoder and connector are separate pipeline stages not yet implemented in Doppler.",
8
+
9
+ "architecture": {
10
+ "headDim": 64,
11
+ "ropeTheta": 10000,
12
+ "visionEncoder": {
13
+ "type": "siglip_b16",
14
+ "patchSize": 16,
15
+ "imageSize": 512,
16
+ "hiddenSize": 768,
17
+ "numLayers": 12,
18
+ "numHeads": 12,
19
+ "parameterCount": 93000000,
20
+ "_note": "SigLIP base patch-16/512 backbone. Requires dedicated vision encoder pipeline in Doppler."
21
+ },
22
+ "connector": {
23
+ "type": "mlp_pixel_shuffle",
24
+ "downsampleFactor": 2,
25
+ "_note": "Idefics3/SmolVLM-style projection. Maps vision tokens to decoder embedding space."
26
+ }
27
+ },
28
+
29
+ "inference": {
30
+ "attention": {
31
+ "queryKeyNorm": false,
32
+ "causal": true
33
+ },
34
+ "normalization": {
35
+ "rmsNormWeightOffset": false,
36
+ "rmsNormEps": 1e-5
37
+ },
38
+ "ffn": {
39
+ "activation": "silu"
40
+ },
41
+ "output": {
42
+ "scaleEmbeddings": false,
43
+ "tieWordEmbeddings": true
44
+ },
45
+ "chatTemplate": {
46
+ "enabled": false
47
+ },
48
+ "kernelPaths": {
49
+ "q4k": {
50
+ "f16": "granite-docling-q4k-dequant-f32a",
51
+ "f32": "granite-docling-q4k-dequant-f32a"
52
+ },
53
+ "f16": {
54
+ "f16": "granite-docling-f16-f32a",
55
+ "f32": "granite-docling-f16-f32a"
56
+ }
57
+ }
58
+ },
59
+
60
+ "tokenizer": {
61
+ "bosToken": "<|endoftext|>",
62
+ "eosTokens": ["<|endoftext|>", "<|im_end|>"],
63
+ "addBosToken": true
64
+ },
65
+
66
+ "detection": {
67
+ "architecturePatterns": ["granite-docling", "GraniteDocling", "smoldocling", "SmolDocling", "SmolVLM"],
68
+ "modelTypePatterns": ["granite-docling", "smoldocling", "smolvlm"]
69
+ }
70
+ }
@@ -39,11 +39,16 @@
39
39
  "period": null,
40
40
  "offset": null,
41
41
  "layerTypes": null
42
+ },
43
+ "chatTemplate": {
44
+ "type": "chatml",
45
+ "enabled": true
42
46
  }
43
47
  },
44
48
 
45
49
  "tokenizer": {
46
- "addBosToken": false,
50
+ "bosTokenId": 1,
51
+ "addBosToken": true,
47
52
  "addEosToken": false
48
53
  },
49
54
 
@@ -0,0 +1,40 @@
1
+ {
2
+ "id": "qwen3_vl",
3
+ "name": "Qwen 3 VL",
4
+ "extends": "qwen3",
5
+
6
+ "architecture": {
7
+ "ropeTheta": 5000000
8
+ },
9
+
10
+ "inference": {
11
+ "normalization": {
12
+ "rmsNormWeightOffset": false
13
+ },
14
+ "rope": {
15
+ "ropeTheta": 5000000,
16
+ "mropeInterleaved": true,
17
+ "mropeSection": [24, 20, 20],
18
+ "partialRotaryFactor": null
19
+ }
20
+ },
21
+
22
+ "vision": {
23
+ "patchSize": 16,
24
+ "spatialMergeSize": 2,
25
+ "temporalPatchSize": 2,
26
+ "eps": 1e-6,
27
+ "minPixels": 3136,
28
+ "maxPixels": 1003520,
29
+ "projectorType": "spatial_merge",
30
+ "normalization": {
31
+ "mean": [0.48145466, 0.4578275, 0.40821073],
32
+ "std": [0.26862954, 0.26130258, 0.27577711]
33
+ }
34
+ },
35
+
36
+ "detection": {
37
+ "architecturePatterns": ["qwen3_vl", "Qwen3VLForConditionalGeneration"],
38
+ "modelTypePatterns": ["qwen3_vl"]
39
+ }
40
+ }
@@ -1,7 +1,8 @@
1
1
  {
2
2
  "id": "experiments/bench/gemma3-bench-q4k",
3
3
  "name": "gemma3-bench-q4k",
4
- "intent": "investigate",
4
+ "description": "Benchmark run for Gemma 3 1B Q4K — calibration-mode throughput measurement.",
5
+ "intent": "calibrate",
5
6
  "stability": "experimental",
6
7
  "owner": "doppler-core",
7
8
  "createdAtUtc": "2026-02-25T00:00:00Z",
@@ -0,0 +1,46 @@
1
+ {
2
+ "id": "experiments/verify/lfm2-verify",
3
+ "name": "lfm2-verify",
4
+ "intent": "verify",
5
+ "stability": "experimental",
6
+ "owner": "doppler-core",
7
+ "createdAtUtc": "2026-03-16T00:00:00Z",
8
+ "extends": "modes/bench",
9
+ "model": "lfm2-5-1-2b-instruct-q4k-ehf16-af32",
10
+ "runtime": {
11
+ "shared": {
12
+ "tooling": {
13
+ "intent": "verify"
14
+ },
15
+ "debug": {
16
+ "logLevel": {
17
+ "defaultLogLevel": "warn"
18
+ },
19
+ "trace": {
20
+ "enabled": false
21
+ },
22
+ "profiler": {
23
+ "enabled": false
24
+ }
25
+ }
26
+ },
27
+ "inference": {
28
+ "prompt": {
29
+ "messages": [
30
+ {
31
+ "role": "user",
32
+ "content": "What color is the sky on a clear day?"
33
+ }
34
+ ]
35
+ },
36
+ "batching": {
37
+ "maxTokens": 32
38
+ },
39
+ "sampling": {
40
+ "temperature": 0,
41
+ "topK": 1,
42
+ "topP": 1
43
+ }
44
+ }
45
+ }
46
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "id": "experiments/verify/translategemma-verify",
3
+ "name": "translategemma-verify",
4
+ "intent": "verify",
5
+ "stability": "experimental",
6
+ "owner": "doppler-core",
7
+ "createdAtUtc": "2026-03-16T00:00:00Z",
8
+ "extends": "modes/bench",
9
+ "model": "translategemma-4b-it-q4k-ehf16-af32",
10
+ "runtime": {
11
+ "shared": {
12
+ "tooling": {
13
+ "intent": "verify"
14
+ },
15
+ "debug": {
16
+ "logLevel": {
17
+ "defaultLogLevel": "warn"
18
+ },
19
+ "trace": {
20
+ "enabled": false
21
+ },
22
+ "profiler": {
23
+ "enabled": false
24
+ }
25
+ }
26
+ },
27
+ "inference": {
28
+ "prompt": "Hello from Doppler.",
29
+ "batching": {
30
+ "maxTokens": 32
31
+ },
32
+ "sampling": {
33
+ "temperature": 0,
34
+ "topK": 1,
35
+ "topP": 1
36
+ }
37
+ }
38
+ }
39
+ }
@@ -6,6 +6,7 @@
6
6
  "stability": "canonical",
7
7
  "owner": "doppler-core",
8
8
  "createdAtUtc": "2026-02-25T00:00:00Z",
9
+ "extends": "default",
9
10
  "runtime": {
10
11
  "shared": {
11
12
  "tooling": {
@@ -0,0 +1,69 @@
1
+ {
2
+ "id": "tiers/gemma4-16gb",
3
+ "name": "Gemma 4 — 16 GB tier (constrained)",
4
+ "description": "Gemma 4 MoE runtime tier for 16 GB GPU memory. Aggressively constrained: short context, minimal expert cache, hard budget enforcement. Fail-closed if budget is not met.",
5
+ "intent": "investigate",
6
+ "stability": "experimental",
7
+ "owner": "doppler-core",
8
+ "createdAtUtc": "2026-03-17T00:00:00Z",
9
+ "extends": "default",
10
+ "runtime": {
11
+ "shared": {
12
+ "bufferPool": {
13
+ "budget": {
14
+ "maxTotalBytes": 13958643712,
15
+ "highWatermarkRatio": 0.85,
16
+ "emergencyTrimTargetRatio": 0.7,
17
+ "hardFailOnBudgetExceeded": true
18
+ }
19
+ }
20
+ },
21
+ "loading": {
22
+ "expertCache": {
23
+ "defaultSizeBytes": 1073741824,
24
+ "maxBufferPercentage": 0.15,
25
+ "evictionHighWatermark": 0.8,
26
+ "emergencyTrimToRatio": 0.65
27
+ },
28
+ "prefetch": {
29
+ "enabled": true,
30
+ "layersAhead": 1,
31
+ "maxShards": 4
32
+ },
33
+ "memoryManagement": {
34
+ "flushIntervalLayers": 1,
35
+ "flushThresholdBytes": 134217728
36
+ }
37
+ },
38
+ "inference": {
39
+ "kvcache": {
40
+ "layout": "contiguous",
41
+ "maxSeqLen": 2048,
42
+ "kvDtype": "f16",
43
+ "pageSize": 128,
44
+ "tiering": {
45
+ "mode": "off"
46
+ }
47
+ },
48
+ "moe": {
49
+ "routing": {
50
+ "routerDtype": "f32"
51
+ },
52
+ "cache": {
53
+ "dequantCacheMaxEntries": 2
54
+ }
55
+ },
56
+ "compute": {
57
+ "activationDtype": "f32"
58
+ },
59
+ "batching": {
60
+ "maxTokens": 512
61
+ },
62
+ "session": {
63
+ "kvcache": {
64
+ "kvDtype": "f16"
65
+ }
66
+ }
67
+ }
68
+ }
69
+ }
@@ -0,0 +1,66 @@
1
+ {
2
+ "id": "tiers/gemma4-24gb",
3
+ "name": "Gemma 4 — 24 GB tier",
4
+ "description": "Gemma 4 MoE runtime tier for 24 GB GPU memory. Moderate expert cache, contiguous KV, reduced context length.",
5
+ "intent": "investigate",
6
+ "stability": "experimental",
7
+ "owner": "doppler-core",
8
+ "createdAtUtc": "2026-03-17T00:00:00Z",
9
+ "extends": "default",
10
+ "runtime": {
11
+ "shared": {
12
+ "bufferPool": {
13
+ "budget": {
14
+ "maxTotalBytes": 21474836480,
15
+ "highWatermarkRatio": 0.9,
16
+ "emergencyTrimTargetRatio": 0.75,
17
+ "hardFailOnBudgetExceeded": true
18
+ }
19
+ }
20
+ },
21
+ "loading": {
22
+ "expertCache": {
23
+ "defaultSizeBytes": 3221225472,
24
+ "maxBufferPercentage": 0.2,
25
+ "evictionHighWatermark": 0.85,
26
+ "emergencyTrimToRatio": 0.7
27
+ },
28
+ "prefetch": {
29
+ "enabled": true,
30
+ "layersAhead": 1,
31
+ "maxShards": 8
32
+ },
33
+ "memoryManagement": {
34
+ "flushIntervalLayers": 2,
35
+ "flushThresholdBytes": 268435456
36
+ }
37
+ },
38
+ "inference": {
39
+ "kvcache": {
40
+ "layout": "contiguous",
41
+ "maxSeqLen": 4096,
42
+ "kvDtype": "f16",
43
+ "pageSize": 256,
44
+ "tiering": {
45
+ "mode": "off"
46
+ }
47
+ },
48
+ "moe": {
49
+ "routing": {
50
+ "routerDtype": "f32"
51
+ },
52
+ "cache": {
53
+ "dequantCacheMaxEntries": 64
54
+ }
55
+ },
56
+ "compute": {
57
+ "activationDtype": "f32"
58
+ },
59
+ "session": {
60
+ "kvcache": {
61
+ "kvDtype": "f16"
62
+ }
63
+ }
64
+ }
65
+ }
66
+ }
@@ -0,0 +1,66 @@
1
+ {
2
+ "id": "tiers/gemma4-32gb",
3
+ "name": "Gemma 4 — 32 GB tier",
4
+ "description": "Gemma 4 MoE runtime tier for 32 GB GPU memory. Generous expert cache, contiguous KV, full-length context.",
5
+ "intent": "investigate",
6
+ "stability": "experimental",
7
+ "owner": "doppler-core",
8
+ "createdAtUtc": "2026-03-17T00:00:00Z",
9
+ "extends": "default",
10
+ "runtime": {
11
+ "shared": {
12
+ "bufferPool": {
13
+ "budget": {
14
+ "maxTotalBytes": 30064771072,
15
+ "highWatermarkRatio": 0.9,
16
+ "emergencyTrimTargetRatio": 0.75,
17
+ "hardFailOnBudgetExceeded": true
18
+ }
19
+ }
20
+ },
21
+ "loading": {
22
+ "expertCache": {
23
+ "defaultSizeBytes": 6442450944,
24
+ "maxBufferPercentage": 0.25,
25
+ "evictionHighWatermark": 0.9,
26
+ "emergencyTrimToRatio": 0.75
27
+ },
28
+ "prefetch": {
29
+ "enabled": true,
30
+ "layersAhead": 2,
31
+ "maxShards": 16
32
+ },
33
+ "memoryManagement": {
34
+ "flushIntervalLayers": 4,
35
+ "flushThresholdBytes": 536870912
36
+ }
37
+ },
38
+ "inference": {
39
+ "kvcache": {
40
+ "layout": "contiguous",
41
+ "maxSeqLen": 8192,
42
+ "kvDtype": "f16",
43
+ "pageSize": 256,
44
+ "tiering": {
45
+ "mode": "off"
46
+ }
47
+ },
48
+ "moe": {
49
+ "routing": {
50
+ "routerDtype": "f32"
51
+ },
52
+ "cache": {
53
+ "dequantCacheMaxEntries": 128
54
+ }
55
+ },
56
+ "compute": {
57
+ "activationDtype": "f32"
58
+ },
59
+ "session": {
60
+ "kvcache": {
61
+ "kvDtype": "f16"
62
+ }
63
+ }
64
+ }
65
+ }
66
+ }
@@ -58,4 +58,7 @@ function assertNoDeprecatedRuntimeKeys(overrides) {
58
58
  if (inference?.sampling?.maxTokens !== undefined) {
59
59
  throw new Error('sampling.maxTokens is removed; use inference.batching.maxTokens');
60
60
  }
61
+ if (inference?.session?.maxNewTokens !== undefined) {
62
+ throw new Error('inference.session.maxNewTokens is not a supported runtime config key; use inference.batching.maxTokens');
63
+ }
61
64
  }
@@ -131,6 +131,44 @@ export interface PipelineDebugConfigSchema {
131
131
  /** Default pipeline debug configuration */
132
132
  export declare const DEFAULT_PIPELINE_DEBUG_CONFIG: PipelineDebugConfigSchema;
133
133
 
134
+ /** Loader debug configuration (Q4K dequant and related probes). */
135
+ export interface LoaderDebugConfigSchema {
136
+ /** Enable loader debug behavior (default: false) */
137
+ enabled: boolean;
138
+ /** Force GPU dequant for Q4K tensors even when CPU fallback is eligible. */
139
+ forceGpuDequant: boolean;
140
+ /** Prefer CPU dequant for F32 output when eligible (default: false, GPU is preferred). */
141
+ preferCpuDequant: boolean;
142
+ /** Throw when CPU dequant fallback is taken. */
143
+ failOnCpuDequantPath: boolean;
144
+ /** Enable dtype-aware GPU-vs-CPU parity checks during Q4K dequant. */
145
+ runQ4KDequantParity: boolean;
146
+ /** Number of values to read back for parity checks. */
147
+ q4kDequantParitySamples: number;
148
+ }
149
+
150
+ /** Default loader debug configuration. */
151
+ export declare const DEFAULT_LOADER_DEBUG_CONFIG: LoaderDebugConfigSchema;
152
+
153
+ /** Matmul debug configuration (attention split/shape diagnostics). */
154
+ export interface MatmulDebugConfigSchema {
155
+ /** Enable matmul debug behavior (default: false) */
156
+ enabled: boolean;
157
+ /** Force split (non-fused) Q/K/V projection path for diagnostics. */
158
+ forceSplitQKV: boolean;
159
+ /** Validate B tensor layout/buffer bytes for attention projection roles. */
160
+ validateAttentionWeightBuffer: boolean;
161
+ /** Throw if validation fails due to a small B tensor. */
162
+ failOnSmallAttentionWeightBuffer: boolean;
163
+ /** Emit attention B-buffer diagnostics. */
164
+ logAttentionWeightBuffer: boolean;
165
+ /** Log first-8 projection output values for layer 0 decode (diagnostic). */
166
+ logProjectionValues: boolean;
167
+ }
168
+
169
+ /** Default matmul debug configuration. */
170
+ export declare const DEFAULT_MATMUL_DEBUG_CONFIG: MatmulDebugConfigSchema;
171
+
134
172
  /**
135
173
  * Profiler configuration.
136
174
  */
@@ -241,6 +279,8 @@ export interface DebugConfigSchema {
241
279
  logLevel: LogLevelConfigSchema;
242
280
  trace: TraceConfigSchema;
243
281
  pipeline: PipelineDebugConfigSchema;
282
+ loader: LoaderDebugConfigSchema;
283
+ matmul: MatmulDebugConfigSchema;
244
284
  probes: ProbeConfigSchema[];
245
285
  profiler: ProfilerConfigSchema;
246
286
  perfGuards: PerfGuardsConfigSchema;
@@ -38,6 +38,32 @@ export const DEFAULT_TRACE_CONFIG = {
38
38
  file: null,
39
39
  };
40
40
 
41
+ // =============================================================================
42
+ // Loader Debug Config
43
+ // =============================================================================
44
+
45
+ export const DEFAULT_LOADER_DEBUG_CONFIG = {
46
+ enabled: false,
47
+ forceGpuDequant: false,
48
+ preferCpuDequant: false,
49
+ failOnCpuDequantPath: false,
50
+ runQ4KDequantParity: false,
51
+ q4kDequantParitySamples: 256,
52
+ };
53
+
54
+ // =============================================================================
55
+ // Kernel Debug Config
56
+ // =============================================================================
57
+
58
+ export const DEFAULT_MATMUL_DEBUG_CONFIG = {
59
+ enabled: false,
60
+ forceSplitQKV: false,
61
+ validateAttentionWeightBuffer: false,
62
+ failOnSmallAttentionWeightBuffer: false,
63
+ logAttentionWeightBuffer: false,
64
+ logProjectionValues: false,
65
+ };
66
+
41
67
  // =============================================================================
42
68
  // Kernel Trace Config (kernel-trace.js anomaly detection)
43
69
  // =============================================================================
@@ -100,6 +126,8 @@ export const DEFAULT_DEBUG_CONFIG = {
100
126
  logLevel: DEFAULT_LOG_LEVEL_CONFIG,
101
127
  trace: DEFAULT_TRACE_CONFIG,
102
128
  pipeline: DEFAULT_PIPELINE_DEBUG_CONFIG,
129
+ loader: DEFAULT_LOADER_DEBUG_CONFIG,
130
+ matmul: DEFAULT_MATMUL_DEBUG_CONFIG,
103
131
  probes: [],
104
132
  profiler: DEFAULT_PROFILER_CONFIG,
105
133
  perfGuards: DEFAULT_PERF_GUARDS_CONFIG,
@@ -217,6 +217,8 @@ export {
217
217
  DEFAULT_LOG_HISTORY_CONFIG,
218
218
  DEFAULT_LOG_LEVEL_CONFIG,
219
219
  DEFAULT_TRACE_CONFIG,
220
+ DEFAULT_LOADER_DEBUG_CONFIG,
221
+ DEFAULT_MATMUL_DEBUG_CONFIG,
220
222
  DEFAULT_KERNEL_TRACE_CONFIG,
221
223
  DEFAULT_PIPELINE_DEBUG_CONFIG,
222
224
  DEFAULT_PROFILER_CONFIG,
@@ -93,7 +93,7 @@ export const DEFAULT_TOKENIZER_DEFAULTS = {
93
93
  // =============================================================================
94
94
 
95
95
  export const DEFAULT_CHAT_TEMPLATE_CONFIG = {
96
- enabled: false,
96
+ enabled: undefined,
97
97
  };
98
98
 
99
99
  export const DEFAULT_KERNEL_PATH_POLICY = {
@@ -160,6 +160,7 @@ export type BuiltinKernelPathId =
160
160
  | 'gemma3-f16-fused-f32a-online-streamingprefill' // Gemma 3 F16 fused FFN online path with streaming prefill attention
161
161
  | 'gemma3-q4k-dequant-f16a-online' // Gemma 3 Q4K dequant online path (F16 activations)
162
162
  | 'gemma3-q4k-dequant-f32a-online' // Gemma 3 Q4K dequant online path with F32 activations
163
+ | 'gemma3-q4k-dequant-f32w-f32a-online' // Gemma 3 Q4K path with F32 projection weights and F32 activations
163
164
  | 'gemma3-q4k-dequant-f32a-nosubgroups' // Gemma 3 Q4K dequant path with no subgroup requirement
164
165
  | 'gemma3-q4k-dequant-f32a' // Legacy alias for gemma3-q4k-dequant-f32a-nosubgroups
165
166
  | 'lfm2-q4k-dequant-f32a-online' // LFM2 Q4K path with F32 activations and fast prefill
@@ -40,8 +40,8 @@ export const DEFAULT_SEGMENT_ALLOCATION_CONFIG = {
40
40
  // =============================================================================
41
41
 
42
42
  export const DEFAULT_EMULATED_STORAGE_CONFIG = {
43
- vramBudgetBytes: 2 * GB,
44
- ramBudgetBytes: 8 * GB,
43
+ vramBudgetBytes: 4 * GB,
44
+ ramBudgetBytes: 16 * GB,
45
45
  };
46
46
 
47
47
  // =============================================================================