@simulatte/doppler 0.1.7 → 0.1.8

Files changed (88)
  1. package/CHANGELOG.md +19 -0
  2. package/package.json +21 -36
  3. package/src/browser/browser-converter.js +5 -0
  4. package/src/client/doppler-registry.json +1 -17
  5. package/src/config/kernel-path-loader.d.ts +5 -0
  6. package/src/config/kernel-path-loader.js +13 -0
  7. package/src/config/kernels/registry.json +74 -0
  8. package/src/config/loader.js +3 -0
  9. package/src/config/merge-contract-check.js +7 -0
  10. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  11. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  12. package/src/config/presets/kernel-paths/registry.json +14 -0
  13. package/src/config/presets/models/gemma2.json +2 -1
  14. package/src/config/presets/models/gemma3.json +2 -0
  15. package/src/config/presets/models/qwen3.json +4 -3
  16. package/src/config/presets/models/qwen3_5.json +16 -0
  17. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  18. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  19. package/src/config/schema/conversion.schema.d.ts +1 -0
  20. package/src/config/schema/manifest.schema.d.ts +1 -1
  21. package/src/config/schema/manifest.schema.js +1 -1
  22. package/src/config/schema/storage.schema.js +1 -1
  23. package/src/converter/conversion-plan.js +10 -2
  24. package/src/converter/core.js +2 -0
  25. package/src/converter/manifest-inference.js +12 -22
  26. package/src/converter/parsers/transformer.js +4 -0
  27. package/src/converter/quantization-info.js +5 -1
  28. package/src/converter/quantizer.js +19 -12
  29. package/src/converter/rope-config.js +8 -6
  30. package/src/converter/tokenizer-utils.d.ts +1 -0
  31. package/src/converter/tokenizer-utils.js +4 -1
  32. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  33. package/src/distribution/shard-delivery.js +6 -1
  34. package/src/formats/rdrr/parsing.d.ts +4 -0
  35. package/src/formats/rdrr/parsing.js +14 -1
  36. package/src/gpu/kernels/index.d.ts +8 -0
  37. package/src/gpu/kernels/index.js +6 -0
  38. package/src/gpu/kernels/matmul-selection.js +47 -4
  39. package/src/gpu/kernels/matmul.d.ts +2 -0
  40. package/src/gpu/kernels/matmul.js +1 -1
  41. package/src/gpu/kernels/rmsnorm.js +9 -2
  42. package/src/gpu/kernels/split_qg.d.ts +50 -0
  43. package/src/gpu/kernels/split_qg.js +46 -0
  44. package/src/gpu/kernels/split_qg.wgsl +58 -0
  45. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  46. package/src/gpu/weight-buffer.d.ts +1 -1
  47. package/src/gpu/weight-buffer.js +1 -1
  48. package/src/inference/browser-harness.d.ts +2 -0
  49. package/src/inference/browser-harness.js +20 -1
  50. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  51. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
  52. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  53. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  54. package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
  55. package/src/inference/pipelines/text/attention/projections.js +41 -11
  56. package/src/inference/pipelines/text/attention/record.js +15 -6
  57. package/src/inference/pipelines/text/attention/run.js +50 -6
  58. package/src/inference/pipelines/text/config.js +14 -0
  59. package/src/inference/pipelines/text/execution-plan.js +5 -4
  60. package/src/inference/pipelines/text/generator-runtime.js +5 -0
  61. package/src/inference/pipelines/text/generator-steps.d.ts +6 -0
  62. package/src/inference/pipelines/text/generator-steps.js +43 -15
  63. package/src/inference/pipelines/text/generator.js +50 -17
  64. package/src/inference/pipelines/text/init.d.ts +13 -0
  65. package/src/inference/pipelines/text/init.js +16 -5
  66. package/src/inference/pipelines/text/layer.js +1 -0
  67. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  68. package/src/inference/pipelines/text/linear-attention.js +33 -3
  69. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  70. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  71. package/src/inference/pipelines/text/logits/index.js +3 -1
  72. package/src/inference/pipelines/text/model-load.js +3 -0
  73. package/src/inference/pipelines/text/sampling.js +52 -6
  74. package/src/inference/test-harness.js +2 -2
  75. package/src/loader/final-weights-loader.js +2 -0
  76. package/src/loader/shard-cache.js +3 -2
  77. package/src/loader/tensors/tensor-loader.js +6 -1
  78. package/src/rules/inference/dtype.rules.json +5 -0
  79. package/src/rules/inference/kernel-path.rules.json +2 -2
  80. package/src/rules/kernels/split-qg.rules.json +6 -0
  81. package/src/rules/rule-registry.js +2 -0
  82. package/src/storage/downloader.js +2 -1
  83. package/src/storage/shard-manager.js +4 -3
  84. package/src/tooling/conversion-config-materializer.js +3 -5
  85. package/src/tooling/node-converter.js +3 -0
  86. package/src/tooling/node-source-runtime.js +36 -0
  87. package/src/types/model.d.ts +5 -0
  88. package/tools/doppler-cli.js +6 -1
package/CHANGELOG.md CHANGED
@@ -6,6 +6,25 @@ This changelog is package-facing and release-oriented. Entries before `0.1.7`
 were retrofitted from package version history, release commits, and release
 docs so the `0.1.x` line has one conventional npm-visible history surface.
 
+## [0.1.8] - 2026-03-13
+
+### Changed
+
+- Simplified demo to show only verified Q4K models (Gemma 3 270M, Gemma 3 1B).
+  Hidden Translate, Diffusion, and Embedding tabs until models are ready.
+- Trimmed hosted HF registry and quickstart registry to the two verified models.
+- Aligned catalog, HF registry, and quickstart registry to the canonical
+  external support registry as single source of truth for HF revisions.
+
+### Fixed
+
+- Fixed Qwen 3.5 conversion configs using wrong model preset (`qwen3` instead
+  of `qwen3_5`), which caused support matrix check failures.
+- Fixed catalog lifecycle metadata inconsistencies: corrected `local`, `hf`,
+  `curated`, and `demo` fields to match actual artifact availability.
+- Removed failing and unverified models from demo visibility (TranslateGemma 4B,
+  EmbeddingGemma 300M with broken HF manifest, Qwen 3.5 0.8B/2B, F16 variant).
+
 ## [0.1.7] - 2026-03-10
 
 ### Added
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@simulatte/doppler",
-  "version": "0.1.7",
+  "version": "0.1.8",
   "description": "Browser-native WebGPU inference engine for local intent and inference loops",
   "main": "src/index.js",
   "types": "src/index.d.ts",
@@ -29,22 +29,22 @@
     "bench:chart": "node ./benchmarks/vendors/compare-chart.js",
     "bench:chart:readme": "node ./benchmarks/vendors/compare-chart.js --preset readme-evidence",
     "bench:architecture:chart": "node ./benchmarks/vendors/generate-architecture-overview-svg.js",
-    "ci:diffusion:contract": "node tools/ci-diffusion-contract-gates.mjs",
-    "ci:diffusion:contract:list": "node tools/ci-diffusion-contract-gates.mjs --list",
-    "ci:training:contract": "node tools/ci-training-contract-gates.mjs",
-    "ci:training:contract:list": "node tools/ci-training-contract-gates.mjs --list",
-    "training:contract:delta": "node tools/emit-training-contract-delta.mjs",
-    "training:workloads:verify": "node tools/verify-training-workload-packs.mjs --registry tools/configs/training-workloads/registry.json",
-    "training:report-ids:publish": "node tools/publish-training-report-ids.mjs --registry tools/configs/training-workloads/registry.json",
-    "distill:studio:mvp": "node tools/distill-studio-mvp.mjs",
-    "distill:quality-gate": "node tools/distill-studio-quality-gate.mjs",
-    "p2p:observability": "node tools/p2p-delivery-observability.mjs",
-    "p2p:drill": "node tools/p2p-resilience-drill.mjs",
+    "ci:diffusion:contract": "node tools/ci-diffusion-contract-gates.js",
+    "ci:diffusion:contract:list": "node tools/ci-diffusion-contract-gates.js --list",
+    "ci:training:contract": "node tools/ci-training-contract-gates.js",
+    "ci:training:contract:list": "node tools/ci-training-contract-gates.js --list",
+    "training:contract:delta": "node tools/emit-training-contract-delta.js",
+    "training:workloads:verify": "node tools/verify-training-workload-packs.js --registry tools/configs/training-workloads/registry.json",
+    "training:report-ids:publish": "node tools/publish-training-report-ids.js --registry tools/configs/training-workloads/registry.json",
+    "distill:studio:mvp": "node tools/distill-studio-mvp.js",
+    "distill:quality-gate": "node tools/distill-studio-quality-gate.js",
+    "p2p:observability": "node tools/p2p-delivery-observability.js",
+    "p2p:drill": "node tools/p2p-resilience-drill.js",
     "test": "npm run test:unit",
-    "test:unit": "node tools/run-node-tests.mjs --suite unit",
-    "test:gpu": "node tools/run-node-tests.mjs --suite gpu",
-    "test:coverage": "node tools/run-node-coverage.mjs",
-    "test:coverage:report": "node tools/run-node-coverage.mjs --no-threshold",
+    "test:unit": "node tools/run-node-tests.js --suite unit",
+    "test:gpu": "node tools/run-node-tests.js --suite gpu",
+    "test:coverage": "node tools/run-node-coverage.js",
+    "test:coverage:report": "node tools/run-node-coverage.js --no-threshold",
     "test:gpu:browser": "node tools/doppler-cli.js verify --config '{\"request\":{\"suite\":\"kernels\"},\"run\":{\"surface\":\"browser\",\"browser\":{\"opfsCache\":false,\"headless\":true,\"channel\":\"chromium\",\"browserArgs\":[\"--use-angle=swiftshader\",\"--disable-vulkan-surface\"],\"console\":true}}}'",
     "agents:verify": "node tools/verify-agent-parity.js",
     "agents:freshness": "node tools/verify-agent-freshness.js",
@@ -74,9 +74,10 @@
     "ci:catalog:check": "npm run registry:sync:scripts:check && npm run support:matrix:check && npm run registry:hf:check",
     "external:rdrr:index": "node tools/sync-external-rdrr-index.js",
     "external:rdrr:index:check": "node tools/sync-external-rdrr-index.js --check",
-    "verify:embeddinggemma-300m": "node tools/run-registry-verify.js embeddinggemma-300m",
-    "verify:gemma-3-1b-it-f16": "node tools/run-registry-verify.js gemma-3-1b-it-f16",
-    "verify:gemma-3-1b-it-f16-af32": "node tools/run-registry-verify.js gemma-3-1b-it-f16-af32",
+    "external:support:sync": "node tools/sync-external-support-registry.js",
+    "external:support:check": "node tools/sync-external-support-registry.js --check",
+    "catalog:sync:external": "node tools/sync-catalog-from-external-support.js",
+    "catalog:sync:external:check": "node tools/sync-catalog-from-external-support.js --check",
     "verify:gemma-3-1b-it-q4k-ehf16-af32": "node tools/run-registry-verify.js gemma-3-1b-it-q4k-ehf16-af32",
     "verify:gemma-3-1b-it-wq4k-ef16-hf16": "node tools/run-registry-verify.js gemma-3-1b-it-wq4k-ef16-hf16",
     "verify:gemma-3-270m-it-q4k-ehf16-af32": "node tools/run-registry-verify.js gemma-3-270m-it-q4k-ehf16-af32",
@@ -84,25 +85,9 @@
     "verify:gemma-3-270m-it-wq4k-ef16-hf16": "node tools/run-registry-verify.js gemma-3-270m-it-wq4k-ef16-hf16",
     "verify:gemma-3-270m-it-wq4k-ef16-hf16-f32": "node tools/run-registry-verify.js gemma-3-270m-it-wq4k-ef16-hf16-f32",
     "verify:gemma3-1b": "node tools/run-registry-verify.js gemma3-1b",
-    "verify:gemma3-1b-f16": "node tools/run-registry-verify.js gemma3-1b-f16",
     "verify:gemma3-270m": "node tools/run-registry-verify.js gemma3-270m",
-    "verify:google-embeddinggemma-300m": "node tools/run-registry-verify.js google-embeddinggemma-300m",
-    "verify:google-embeddinggemma-300m-q4k-ehf16-af32": "node tools/run-registry-verify.js google-embeddinggemma-300m-q4k-ehf16-af32",
-    "verify:google-embeddinggemma-300m-wq4k-ef16": "node tools/run-registry-verify.js google-embeddinggemma-300m-wq4k-ef16",
     "verify:google-gemma-3-1b-it": "node tools/run-registry-verify.js google-gemma-3-1b-it",
-    "verify:google-gemma-3-270m-it": "node tools/run-registry-verify.js google-gemma-3-270m-it",
-    "verify:google-translategemma-4b-it": "node tools/run-registry-verify.js google-translategemma-4b-it",
-    "verify:qwen-3-5-0-8b": "node tools/run-registry-verify.js qwen-3-5-0-8b",
-    "verify:qwen-3-5-0-8b-wq4k-ef16-hf16-f16": "node tools/run-registry-verify.js qwen-3-5-0-8b-wq4k-ef16-hf16-f16",
-    "verify:qwen-3-5-2b": "node tools/run-registry-verify.js qwen-3-5-2b",
-    "verify:qwen-3-5-2b-wq4k-ef16-hf16-f16": "node tools/run-registry-verify.js qwen-3-5-2b-wq4k-ef16-hf16-f16",
-    "verify:qwen-qwen3.5-0.8b": "node tools/run-registry-verify.js qwen-qwen3.5-0.8b",
-    "verify:qwen-qwen3.5-2b": "node tools/run-registry-verify.js qwen-qwen3.5-2b",
-    "verify:qwen3-0.8b": "node tools/run-registry-verify.js qwen3-0.8b",
-    "verify:qwen3-2b": "node tools/run-registry-verify.js qwen3-2b",
-    "verify:translategemma": "node tools/run-registry-verify.js translategemma",
-    "verify:translategemma-4b": "node tools/run-registry-verify.js translategemma-4b",
-    "verify:translategemma-4b-it-wq4k-ef16-hf16": "node tools/run-registry-verify.js translategemma-4b-it-wq4k-ef16-hf16"
+    "verify:google-gemma-3-270m-it": "node tools/run-registry-verify.js google-gemma-3-270m-it"
   },
   "exports": {
     ".": {
package/src/browser/browser-converter.js CHANGED
@@ -408,6 +408,7 @@ export async function convertModel(files, options = {}) {
   // Parse based on format
   let modelInfo;
   let config = null;
+  let generationConfig = null;
   let tokenizerJson = null;
   let tokenizerConfig = null;
   let tokenizerModel = null;
@@ -455,6 +456,10 @@
     tokenizerConfig = await parseTokenizerConfigJson(auxiliary.tokenizerConfig);
     modelInfo.tokenizerConfig = tokenizerConfig;
   }
+  if (auxiliary.generationConfig) {
+    generationConfig = await parseConfigJson(auxiliary.generationConfig);
+    modelInfo.generationConfig = generationConfig;
+  }
   if (auxiliary.tokenizerModel) {
     const source = normalizeTensorSource(auxiliary.tokenizerModel);
     tokenizerModel = await source.readRange(0, source.size);
package/src/client/doppler-registry.json CHANGED
@@ -16,25 +16,9 @@
       ],
       "hf": {
         "repoId": "Clocksmith/rdrr",
-        "revision": "cd6c12be0e83e92d6dbd92598a0aa94391ec7e94",
+        "revision": "ca6f0dbdf3882d3893a65cf48f2bb6f1520df162",
         "path": "models/gemma-3-270m-it-q4k-ehf16-af32"
       }
-    },
-    {
-      "modelId": "google-embeddinggemma-300m-q4k-ehf16-af32",
-      "aliases": [
-        "embeddinggemma-300m",
-        "google/embeddinggemma-300m",
-        "google-embeddinggemma-300m-wq4k-ef16"
-      ],
-      "modes": [
-        "embedding"
-      ],
-      "hf": {
-        "repoId": "Clocksmith/rdrr",
-        "revision": "b23aca921ea11729d6f34b9484555968a5ab0e42",
-        "path": "models/google-embeddinggemma-300m-q4k-ehf16-af32"
-      }
     }
   ]
 }
package/src/config/kernel-path-loader.d.ts CHANGED
@@ -134,6 +134,11 @@ export function getKernelPathStrict(): boolean;
  */
 export function isKernelPathFusedQ4K(path?: KernelPathSchema | null): boolean;
 
+/**
+ * Check if a kernel path requires matmul weights to stay in F32.
+ */
+export function kernelPathRequiresF32MatmulWeights(path?: KernelPathSchema | null): boolean;
+
 /**
  * Check if the active kernel path uses fused Q4K matmul.
  */
package/src/config/kernel-path-loader.js CHANGED
@@ -503,6 +503,19 @@ export function isKernelPathFusedQ4K(path = undefined) {
   return kernelSteps.some((step) => step.kernel.includes('fused_matmul_q4'));
 }
 
+export function kernelPathRequiresF32MatmulWeights(path = undefined) {
+  const lookupPath = path === undefined ? activeKernelPath : path;
+  if (!lookupPath) return false;
+  const kernelSteps = [
+    ...(lookupPath.decode?.steps ?? []),
+    ...(lookupPath.prefill?.steps ?? []),
+    ...(lookupPath.preLayer ?? []),
+    ...(lookupPath.postLayer ?? []),
+    ...(lookupPath.layerOverrides?.flatMap((override) => override.steps) ?? []),
+  ];
+  return kernelSteps.some((step) => normalizeKernelFile(step.kernel) === 'matmul_f32.wgsl');
+}
+
 export function isActiveKernelPathFusedQ4K() {
   return isKernelPathFusedQ4K(activeKernelPath);
 }
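The new predicate mirrors `isKernelPathFusedQ4K`: it flattens every step list in a kernel path and reports whether any step resolves to `matmul_f32.wgsl`. A minimal sketch of how a loader might branch on it; the caller, the import path, and the dtype strings are hypothetical, not part of this diff:

    // Hypothetical caller: pick the dequant target for Q4K projection weights.
    // Paths that run matmul_f32.wgsl need weights materialized as F32;
    // other paths can keep the F16 weight buffers.
    import { kernelPathRequiresF32MatmulWeights } from './src/config/kernel-path-loader.js';

    function chooseMatmulWeightDtype(kernelPath) {
      return kernelPathRequiresF32MatmulWeights(kernelPath) ? 'f32' : 'f16';
    }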
package/src/config/kernels/registry.json CHANGED
@@ -4322,6 +4322,80 @@
       }
     }
   },
+  "split_qg": {
+    "description": "De-interleave Q and Gate projections from q_proj output for attentionOutputGate models",
+    "baseBindings": [
+      {
+        "index": 0,
+        "name": "uniforms",
+        "type": "uniform"
+      },
+      {
+        "index": 1,
+        "name": "qg_interleaved",
+        "type": "read-only-storage"
+      },
+      {
+        "index": 2,
+        "name": "Q",
+        "type": "storage"
+      },
+      {
+        "index": 3,
+        "name": "G",
+        "type": "storage"
+      }
+    ],
+    "baseUniforms": {
+      "size": 16,
+      "fields": [
+        {
+          "name": "num_tokens",
+          "type": "u32",
+          "offset": 0
+        },
+        {
+          "name": "num_heads",
+          "type": "u32",
+          "offset": 4
+        },
+        {
+          "name": "head_dim",
+          "type": "u32",
+          "offset": 8
+        },
+        {
+          "name": "_pad",
+          "type": "u32",
+          "offset": 12
+        }
+      ]
+    },
+    "variants": {
+      "default": {
+        "wgsl": "split_qg.wgsl",
+        "entryPoint": "main",
+        "workgroup": [
+          256,
+          1,
+          1
+        ],
+        "requires": []
+      },
+      "f16": {
+        "wgsl": "split_qg_f16.wgsl",
+        "entryPoint": "main",
+        "workgroup": [
+          256,
+          1,
+          1
+        ],
+        "requires": [
+          "shader-f16"
+        ]
+      }
+    }
+  },
   "sample": {
     "description": "GPU-side sampling kernels",
     "baseBindings": [
  "baseBindings": [
@@ -23,6 +23,7 @@ const mambaPreset = await loadJson('./presets/models/mamba.json', import.meta.ur
23
23
  const modernbertPreset = await loadJson('./presets/models/modernbert.json', import.meta.url, 'Failed to load preset');
24
24
  const lfm2Preset = await loadJson('./presets/models/lfm2.json', import.meta.url, 'Failed to load preset');
25
25
  const qwen3Preset = await loadJson('./presets/models/qwen3.json', import.meta.url, 'Failed to load preset');
26
+ const qwen35Preset = await loadJson('./presets/models/qwen3_5.json', import.meta.url, 'Failed to load preset');
26
27
  const kimiK2Preset = await loadJson('./presets/models/kimi-k2.json', import.meta.url, 'Failed to load preset');
27
28
  const gptOssPreset = await loadJson('./presets/models/gpt-oss.json', import.meta.url, 'Failed to load preset');
28
29
 
@@ -46,6 +47,7 @@ export const PRESET_REGISTRY = {
46
47
  modernbert: modernbertPreset,
47
48
  lfm2: lfm2Preset,
48
49
  qwen3: qwen3Preset,
50
+ qwen3_5: qwen35Preset,
49
51
  kimi_k2: kimiK2Preset,
50
52
  gpt_oss: gptOssPreset,
51
53
  };
@@ -97,6 +99,7 @@ export const PRESET_DETECTION_ORDER = [
97
99
  'gemma3',
98
100
  'llama3',
99
101
  'lfm2',
102
+ 'qwen3_5',
100
103
  'qwen3',
101
104
  'kimi_k2',
102
105
  'gpt_oss',
package/src/config/merge-contract-check.js CHANGED
@@ -171,6 +171,13 @@ export function buildMergeContractArtifact() {
     `configA=${isolatedConfigA.runtime.inference.compute.activationDtype}, configB=${isolatedConfigB.runtime.inference.compute.activationDtype}`,
     'actual'
   );
+  recordCheck(
+    checks,
+    'runtime.schema.storage.opfs_sync_access_handle_defaults_off',
+    isolatedConfigB.runtime.loading.storage.backend.opfs.useSyncAccessHandle === false,
+    `value=${String(isolatedConfigB.runtime.loading.storage.backend.opfs.useSyncAccessHandle)}`,
+    'actual'
+  );
 
   const calibrateConfig = createDopplerConfig({
     runtime: {
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json ADDED
@@ -0,0 +1,56 @@
+{
+  "id": "gemma3-q4k-dequant-f32w-f32a-online",
+  "name": "Gemma 3 Q4K Dequant (F32 projection weights, F32 activations, online decode)",
+  "description": "Q4K projection weights dequantized to F32 with F32 activations. Tied embeddings and LM head stay on the native F16 path. Decode uses online attention; prefill uses streaming attention.",
+  "activationDtype": "f32",
+  "kvDtype": "f16",
+  "decode": {
+    "steps": [
+      { "op": "input_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+      { "op": "q_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.q_proj" },
+      { "op": "k_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.k_proj" },
+      { "op": "v_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.v_proj" },
+      { "op": "rope_q", "kernel": "rope.wgsl", "entry": "main" },
+      { "op": "rope_k", "kernel": "rope.wgsl", "entry": "main" },
+      { "op": "attention", "kernel": "attention_decode_online_f16kv.wgsl", "entry": "main" },
+      { "op": "o_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.o_proj" },
+      { "op": "attn_residual", "kernel": "residual.wgsl", "entry": "main" },
+      { "op": "post_attn_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+      { "op": "gate_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.mlp.gate_proj" },
+      { "op": "up_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.mlp.up_proj" },
+      { "op": "activation", "kernel": "gelu.wgsl", "entry": "main", "constants": { "HAS_GATE": true } },
+      { "op": "down_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.mlp.down_proj" },
+      { "op": "ffn_residual", "kernel": "residual.wgsl", "entry": "main" }
+    ]
+  },
+  "prefill": {
+    "steps": [
+      { "op": "input_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+      { "op": "q_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.q_proj" },
+      { "op": "k_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.k_proj" },
+      { "op": "v_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.v_proj" },
+      { "op": "rope_q", "kernel": "rope.wgsl", "entry": "main" },
+      { "op": "rope_k", "kernel": "rope.wgsl", "entry": "main" },
+      { "op": "attention", "kernel": "attention_streaming_f16kv.wgsl", "entry": "main" },
+      { "op": "o_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.o_proj" },
+      { "op": "attn_residual", "kernel": "residual.wgsl", "entry": "main" },
+      { "op": "post_attn_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+      { "op": "gate_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.mlp.gate_proj" },
+      { "op": "up_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.mlp.up_proj" },
+      { "op": "activation", "kernel": "gelu.wgsl", "entry": "main", "constants": { "HAS_GATE": true } },
+      { "op": "down_proj", "kernel": "matmul_f32.wgsl", "entry": "main", "weights": "layer.{L}.mlp.down_proj" },
+      { "op": "ffn_residual", "kernel": "residual.wgsl", "entry": "main" }
    ]
+  },
+  "preLayer": [
+    { "op": "embed", "kernel": "gather_f16.wgsl", "entry": "main", "weights": "embed_tokens" }
+  ],
+  "postLayer": [
+    { "op": "final_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+    { "op": "lm_head", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_multicol", "weights": "lm_head", "constants": { "MULTICOL_COLS_PER_WG": 64, "MULTICOL_THREADS_PER_COL": 4 } },
+    { "op": "lm_head_prefill", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "lm_head" }
+  ],
+  "sampling": [
+    { "op": "sample", "kernel": "sample.wgsl", "entry": "sample_single_pass" }
+  ]
+}
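Kernel-path presets reference per-layer weights through a `{L}` placeholder (`layer.{L}.self_attn.q_proj` and so on). A sketch of the substitution a runtime would perform when materializing one layer's steps; the helper name is hypothetical, not from this package:

    // Expand a step list for a concrete layer: every "{L}" in a weights
    // reference becomes the layer index.
    function materializeSteps(steps, layerIndex) {
      return steps.map((step) =>
        step.weights
          ? { ...step, weights: step.weights.replaceAll('{L}', String(layerIndex)) }
          : step
      );
    }

    // materializeSteps(preset.decode.steps, 3)[1].weights → "layer.3.self_attn.q_proj"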
package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json ADDED
@@ -0,0 +1,61 @@
+{
+  "id": "lfm2-q4k-dequant-f32a-nosubgroups",
+  "name": "LFM2 Q4K Dequant (F32 activations, no subgroups)",
+  "description": "Subgroup-free LFM2 Q4K path: F32 activations with tiled prefill matmul and small-kernel prefill attention. Still requires shader-f16 kernels.",
+  "activationDtype": "f32",
+  "kvDtype": "f16",
+
+  "decode": {
+    "steps": [
+      { "op": "input_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+      { "op": "q_proj", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.q_proj" },
+      { "op": "k_proj", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.k_proj" },
+      { "op": "v_proj", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.v_proj" },
+      { "op": "rope_q", "kernel": "rope.wgsl", "entry": "main" },
+      { "op": "rope_k", "kernel": "rope.wgsl", "entry": "main" },
+      { "op": "attention", "kernel": "attention_decode_chunked_f16kv.wgsl", "entry": "main" },
+      { "op": "o_proj", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.o_proj" },
+      { "op": "attn_residual", "kernel": "residual.wgsl", "entry": "main" },
+      { "op": "post_attn_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+      { "op": "gate_proj", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "layer.{L}.mlp.gate_proj" },
+      { "op": "up_proj", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "layer.{L}.mlp.up_proj" },
+      { "op": "activation", "kernel": "gelu.wgsl", "entry": "main", "constants": { "HAS_GATE": true } },
+      { "op": "down_proj", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "layer.{L}.mlp.down_proj" },
+      { "op": "ffn_residual", "kernel": "residual.wgsl", "entry": "main" }
+    ]
+  },
+
+  "prefill": {
+    "steps": [
+      { "op": "input_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+      { "op": "q_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.q_proj" },
+      { "op": "k_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.k_proj" },
+      { "op": "v_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.v_proj" },
+      { "op": "rope_q", "kernel": "rope.wgsl", "entry": "main" },
+      { "op": "rope_k", "kernel": "rope.wgsl", "entry": "main" },
+      { "op": "attention", "kernel": "attention_small_f16kv.wgsl", "entry": "main" },
+      { "op": "o_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.o_proj" },
+      { "op": "attn_residual", "kernel": "residual.wgsl", "entry": "main" },
+      { "op": "post_attn_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+      { "op": "gate_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.gate_proj" },
+      { "op": "up_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.up_proj" },
+      { "op": "activation", "kernel": "gelu.wgsl", "entry": "main", "constants": { "HAS_GATE": true } },
+      { "op": "down_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.down_proj" },
+      { "op": "ffn_residual", "kernel": "residual.wgsl", "entry": "main" }
+    ]
+  },
+
+  "preLayer": [
+    { "op": "embed", "kernel": "gather_f16.wgsl", "entry": "main", "weights": "embed_tokens" }
+  ],
+
+  "postLayer": [
+    { "op": "final_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
+    { "op": "lm_head", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "lm_head" },
+    { "op": "lm_head_prefill", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "lm_head" }
+  ],
+
+  "sampling": [
+    { "op": "sample", "kernel": "sample.wgsl", "entry": "sample_single_pass" }
+  ]
+}
package/src/config/presets/kernel-paths/registry.json CHANGED
@@ -92,6 +92,13 @@
       "statusReason": "default",
       "notes": "Gemma 3 Q4K dequant default: subgroup GEMV + online attention + tuned lm_head multicol, F32 activations."
     },
+    {
+      "id": "gemma3-q4k-dequant-f32w-f32a-online",
+      "file": "gemma3-q4k-dequant-f32w-f32a-online.json",
+      "status": "experimental",
+      "statusReason": "accuracy-probe",
+      "notes": "Gemma 3 Q4K dequant path that keeps matmul weights in F32 and runs F32 matmul kernels for numeric-sensitivity debugging."
+    },
     {
       "id": "lfm2-q4k-dequant-f32a-online",
       "file": "lfm2-q4k-dequant-f32a-online.json",
@@ -99,6 +106,13 @@
       "statusReason": "default",
       "notes": "LFM2 Q4K default: subgroup GEMV decode with tiled fast-prefill path and F32 activations."
     },
+    {
+      "id": "lfm2-q4k-dequant-f32a-nosubgroups",
+      "file": "lfm2-q4k-dequant-f32a-nosubgroups.json",
+      "status": "canonical",
+      "statusReason": "subgroup-free",
+      "notes": "Subgroup-free LFM2 Q4K dequant path with F32 activations and tiled prefill. Still requires shader-f16 kernels."
+    },
     {
       "id": "embeddinggemma-f16-f32a",
       "file": "embeddinggemma-f16-f32a.json",
package/src/config/presets/models/gemma2.json CHANGED
@@ -9,6 +9,7 @@
 
   "inference": {
     "attention": {
+      "queryPreAttnScalar": 256,
      "slidingWindow": 4096,
       "attnLogitSoftcapping": 50.0,
       "queryKeyNorm": false
@@ -40,7 +41,7 @@
       "f32": "gemma2-f16-f32a"
     },
     "q4k": {
-      "f16": "gemma2-q4k-dequant-f32a-nosubgroups",
+      "f16": "gemma2-q4k-dequant-f16a",
       "f32": "gemma2-q4k-dequant-f32a-nosubgroups"
     }
   }
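`queryPreAttnScalar` and `attnLogitSoftcapping` mirror the Gemma-family conventions from the upstream HF configs: queries are scaled by 1/sqrt(query_pre_attn_scalar) rather than 1/sqrt(head_dim), and attention logits pass through a tanh softcap. A sketch of both as conventionally defined; treat it as illustrative of the config fields, not a rendering of Doppler's WGSL kernels:

    // Gemma-style query scaling: 1/sqrt(256) here instead of 1/sqrt(head_dim).
    function scaleQuery(q, queryPreAttnScalar) {
      const scale = 1 / Math.sqrt(queryPreAttnScalar);
      return q.map((x) => x * scale);
    }

    // Tanh softcap on attention logits; null disables it (as in gemma3.json).
    function softcapLogit(score, cap) {
      return cap == null ? score : cap * Math.tanh(score / cap);
    }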
package/src/config/presets/models/gemma3.json CHANGED
@@ -8,7 +8,9 @@
   },
   "inference": {
     "attention": {
+      "queryPreAttnScalar": 256,
       "attnLogitSoftcapping": null,
+      "slidingWindow": 512,
       "queryKeyNorm": true
     },
     "normalization": {
package/src/config/presets/models/qwen3.json CHANGED
@@ -10,7 +10,8 @@
   "inference": {
     "attention": {
       "slidingWindow": null,
-      "queryKeyNorm": true
+      "queryKeyNorm": true,
+      "attentionOutputGate": true
     },
     "output": {
       "scaleEmbeddings": false
@@ -39,8 +40,8 @@
   },
 
   "detection": {
-    "architecturePatterns": ["qwen3", "qwen3_5", "Qwen3ForCausalLM", "Qwen3_5ForCausalLM", "Qwen2ForCausalLM"],
-    "modelTypePatterns": ["qwen3_5", "qwen3_5_text", "qwen2"],
+    "architecturePatterns": ["qwen3", "Qwen3ForCausalLM", "Qwen2ForCausalLM"],
+    "modelTypePatterns": ["qwen3", "qwen2"],
     "configPatterns": {
       "model_type": "qwen2"
     }
package/src/config/presets/models/qwen3_5.json ADDED
@@ -0,0 +1,16 @@
+{
+  "id": "qwen3_5",
+  "name": "Qwen 3.5",
+  "extends": "qwen3",
+
+  "inference": {
+    "normalization": {
+      "rmsNormWeightOffset": true
+    }
+  },
+
+  "detection": {
+    "architecturePatterns": ["qwen3_5", "Qwen3_5ForCausalLM", "Qwen3_5ForConditionalGeneration"],
+    "modelTypePatterns": ["qwen3_5", "qwen3_5_text"]
+  }
+}
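The only inference override Qwen 3.5 adds over its `qwen3` base is `rmsNormWeightOffset`. As the flag name suggests (and as in Gemma-style norms), the learned RMSNorm weight is applied as `(1 + w)` rather than `w`; a reference sketch under that assumption:

    // RMSNorm with an optional +1 weight offset, assuming rmsNormWeightOffset
    // means weights are stored as deltas around 1.0.
    function rmsNorm(x, weight, eps = 1e-6, weightOffset = true) {
      let sumSq = 0;
      for (const v of x) sumSq += v * v;
      const inv = 1 / Math.sqrt(sumSq / x.length + eps);
      return x.map((v, i) => v * inv * (weightOffset ? 1 + weight[i] : weight[i]));
    }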
package/src/config/presets/runtime/model/qwen3-5-layer-probe.json ADDED
@@ -0,0 +1,52 @@
+{
+  "id": "model/qwen3-5-layer-probe",
+  "name": "qwen3-5-layer-probe",
+  "description": "Probe all 24 layer outputs in Qwen 3.5 to isolate where the hidden state distribution collapses.",
+  "intent": "investigate",
+  "stability": "canonical",
+  "owner": "doppler-core",
+  "createdAtUtc": "2026-03-13T00:00:00Z",
+  "extends": "modes/debug",
+  "runtime": {
+    "inference": {
+      "prompt": "What color is the sky on a clear day? Answer in one word.",
+      "batching": {
+        "maxTokens": 1
+      },
+      "sampling": {
+        "temperature": 0
+      }
+    },
+    "shared": {
+      "debug": {
+        "trace": {
+          "enabled": true,
+          "categories": ["attn", "ffn", "logits"],
+          "layers": null,
+          "maxDecodeSteps": 1
+        },
+        "probes": [
+          {
+            "id": "embed",
+            "stage": "embed_out",
+            "tokens": [-1],
+            "dims": [0, 1, 2, 3, 512, 513]
+          },
+          {
+            "id": "layer_out",
+            "stage": "layer_out",
+            "layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
+            "tokens": [-1],
+            "dims": [0, 1, 2, 3]
+          },
+          {
+            "id": "logits",
+            "stage": "logits_final",
+            "tokens": [-1],
+            "dims": [271, 0, 1, 2, 3, 496, 138]
+          }
+        ]
+      }
+    }
+  }
+}