@simulatte/doppler 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/CHANGELOG.md +14 -1
  2. package/README.md +25 -6
  3. package/package.json +5 -3
  4. package/src/client/doppler-api.browser.js +6 -0
  5. package/src/client/doppler-api.d.ts +3 -0
  6. package/src/client/doppler-api.js +11 -2
  7. package/src/client/doppler-registry.js +3 -5
  8. package/src/client/doppler-registry.json +16 -0
  9. package/src/config/kernels/kernel-ref-digests.js +23 -21
  10. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  11. package/src/config/loader.js +6 -0
  12. package/src/config/platforms/loader.js +3 -1
  13. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  14. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  15. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  16. package/src/config/presets/kernel-paths/registry.json +7 -0
  17. package/src/config/presets/models/gemma3.json +2 -1
  18. package/src/config/presets/models/gemma4.json +61 -0
  19. package/src/config/presets/models/granite-docling.json +70 -0
  20. package/src/config/presets/models/lfm2.json +6 -1
  21. package/src/config/presets/models/qwen3_vl.json +40 -0
  22. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  23. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  24. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  25. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  26. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  27. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  28. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  29. package/src/config/runtime.js +3 -0
  30. package/src/config/schema/debug.schema.d.ts +40 -0
  31. package/src/config/schema/debug.schema.js +28 -0
  32. package/src/config/schema/index.js +2 -0
  33. package/src/config/schema/inference-defaults.schema.js +1 -1
  34. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  35. package/src/config/schema/memory-limits.schema.js +2 -2
  36. package/src/config/schema/storage.schema.js +1 -1
  37. package/src/converter/conversion-plan.js +1 -1
  38. package/src/converter/core.js +17 -8
  39. package/src/converter/quantizer.d.ts +5 -0
  40. package/src/converter/quantizer.js +15 -0
  41. package/src/distribution/shard-delivery.js +34 -0
  42. package/src/formats/rdrr/classification.js +32 -0
  43. package/src/gpu/kernel-runtime.js +4 -2
  44. package/src/gpu/kernels/attention.js +2 -1
  45. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  46. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  47. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  48. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  49. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  50. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  51. package/src/gpu/kernels/gated-short-conv.js +284 -0
  52. package/src/gpu/kernels/linear-attention-core.js +37 -17
  53. package/src/gpu/kernels/matmul-selection.js +1 -0
  54. package/src/gpu/kernels/matmul.d.ts +3 -0
  55. package/src/gpu/kernels/matmul.js +70 -1
  56. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  57. package/src/gpu/kernels/sample.js +1 -3
  58. package/src/gpu/kernels/sample.wgsl +39 -9
  59. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  60. package/src/gpu/kernels/shader-cache.js +9 -4
  61. package/src/inference/kv-cache/base.js +3 -10
  62. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  63. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
  64. package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
  65. package/src/inference/pipelines/text/attention/projections.js +13 -2
  66. package/src/inference/pipelines/text/attention/record.js +1 -0
  67. package/src/inference/pipelines/text/attention/run.js +9 -0
  68. package/src/inference/pipelines/text/config.d.ts +1 -0
  69. package/src/inference/pipelines/text/config.js +32 -4
  70. package/src/inference/pipelines/text/embed.js +26 -7
  71. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  72. package/src/inference/pipelines/text/execution-v0.js +12 -1
  73. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  74. package/src/inference/pipelines/text/generator-runtime.js +14 -0
  75. package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
  76. package/src/inference/pipelines/text/generator-steps.js +46 -29
  77. package/src/inference/pipelines/text/generator.d.ts +5 -0
  78. package/src/inference/pipelines/text/generator.js +320 -166
  79. package/src/inference/pipelines/text/init.d.ts +2 -0
  80. package/src/inference/pipelines/text/init.js +19 -5
  81. package/src/inference/pipelines/text/layer.js +37 -8
  82. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  83. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  84. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  85. package/src/inference/pipelines/text/ops.js +123 -53
  86. package/src/inference/pipelines/text/probes.js +1 -0
  87. package/src/inference/pipelines/text/state.js +2 -0
  88. package/src/inference/pipelines/text.d.ts +5 -0
  89. package/src/inference/pipelines/text.js +59 -1
  90. package/src/inference/pipelines/vision/encoder.js +386 -0
  91. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  92. package/src/inference/pipelines/vision/index.js +173 -0
  93. package/src/inference/pipelines/vision/ops.js +78 -0
  94. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  95. package/src/inference/test-harness.js +9 -7
  96. package/src/loader/doppler-loader.d.ts +3 -0
  97. package/src/loader/doppler-loader.js +20 -3
  98. package/src/loader/experts/expert-cache.js +6 -2
  99. package/src/loader/experts/expert-loader.js +6 -2
  100. package/src/loader/layer-loader.js +42 -3
  101. package/src/loader/manifest-config.js +3 -1
  102. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  103. package/src/loader/tensors/tensor-loader.js +124 -3
  104. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  105. package/src/rules/kernels/softmax.rules.json +2 -0
  106. package/src/rules/rule-registry.d.ts +1 -0
  107. package/src/rules/rule-registry.js +2 -0
  108. package/src/storage/quickstart-downloader.d.ts +3 -0
  109. package/src/storage/quickstart-downloader.js +27 -30
  110. package/src/tooling/node-converter.js +25 -7
  111. package/src/tooling/node-source-runtime.js +29 -5
  112. package/src/tooling/node-webgpu.js +24 -7
  113. package/src/utils/hf-resolve-url.d.ts +16 -0
  114. package/src/utils/hf-resolve-url.js +17 -0
  115. package/src/version.js +1 -1
  116. package/src/tooling/node-convert.d.ts +0 -54
@@ -92,6 +92,13 @@
92
92
  "statusReason": "default",
93
93
  "notes": "Gemma 3 Q4K dequant default: subgroup GEMV + online attention + tuned lm_head multicol, F32 activations."
94
94
  },
95
+ {
96
+ "id": "gemma3-q4k-dequant-f32a-small-attn",
97
+ "file": "gemma3-q4k-dequant-f32a-small-attn.json",
98
+ "status": "experimental",
99
+ "statusReason": "diagnostic-probe",
100
+ "notes": "Diagnostic: same as gemma3-q4k-dequant-f32a-online but uses attention_small_f16kv.wgsl for prefill to isolate the streaming attention bug."
101
+ },
95
102
  {
96
103
  "id": "gemma3-q4k-dequant-f32w-f32a-online",
97
104
  "file": "gemma3-q4k-dequant-f32w-f32a-online.json",
@@ -34,7 +34,8 @@
34
34
  },
35
35
  "rope": {
36
36
  "ropeTheta": 1000000,
37
- "ropeLocalTheta": 10000
37
+ "ropeLocalTheta": 10000,
38
+ "ropeLocalScalingFactor": 1.0
38
39
  },
39
40
  "chatTemplate": {
40
41
  "type": "gemma",
@@ -0,0 +1,61 @@
1
+ {
2
+ "id": "gemma4",
3
+ "name": "Gemma 4",
4
+ "extends": "gemma3",
5
+ "modelType": "mixtral",
6
+
7
+ "inference": {
8
+ "attention": {
9
+ "slidingWindow": 1024
10
+ },
11
+ "rope": {
12
+ "ropeTheta": 1000000,
13
+ "ropeLocalTheta": 10000,
14
+ "ropeScalingType": "yarn",
15
+ "ropeScalingFactor": 8.0,
16
+ "yarnBetaFast": 4.0,
17
+ "yarnBetaSlow": 1.0,
18
+ "yarnOriginalMaxPos": 32768
19
+ },
20
+ "moe": {
21
+ "kernelProfileId": "mixtral-moe-v1",
22
+ "numExperts": 8,
23
+ "topK": 2,
24
+ "numSharedExperts": 0,
25
+ "routerDtype": "f32",
26
+ "supportedActivationDtypes": ["f16", "f32"],
27
+ "preferredActivationDtype": "f32",
28
+ "tensorPattern": "mixtral"
29
+ },
30
+ "kernelPaths": {
31
+ "q4k": {
32
+ "default": "gemma3-q4k-dequant-f32a-online",
33
+ "f16": "gemma3-q4k-dequant-f16a-online",
34
+ "f16a": "gemma3-q4k-dequant-f16a-online",
35
+ "f32": "gemma3-q4k-dequant-f32a-online"
36
+ }
37
+ }
38
+ },
39
+
40
+ "tensorPatterns": {
41
+ "ffn": {
42
+ "gate": ["layers.{layer}.block_sparse_moe.experts.{expert}.w1.weight"],
43
+ "up": ["layers.{layer}.block_sparse_moe.experts.{expert}.w3.weight"],
44
+ "down": ["layers.{layer}.block_sparse_moe.experts.{expert}.w2.weight"]
45
+ }
46
+ },
47
+
48
+ "detection": {
49
+ "architecturePatterns": [
50
+ "gemma4",
51
+ "Gemma4ForCausalLM",
52
+ "Gemma4ForConditionalGeneration",
53
+ "gemma-4"
54
+ ],
55
+ "modelTypePatterns": [
56
+ "gemma4",
57
+ "gemma4_text",
58
+ "gemma4_moe"
59
+ ]
60
+ }
61
+ }
@@ -0,0 +1,70 @@
1
+ {
2
+ "id": "granite-docling",
3
+ "name": "Granite-Docling (Document OCR VLM)",
4
+ "extends": "transformer",
5
+ "modelType": "ocr",
6
+
7
+ "_notes": "Stabilized successor to SmolDocling-256M-preview. Requires full multimodal pipeline: SigLIP vision encoder, Idefics3-style image-token merge, pixel-shuffle connector, SmolLM2 decoder, DocTags output parsing. This preset covers the decoder config only — vision encoder and connector are separate pipeline stages not yet implemented in Doppler.",
8
+
9
+ "architecture": {
10
+ "headDim": 64,
11
+ "ropeTheta": 10000,
12
+ "visionEncoder": {
13
+ "type": "siglip_b16",
14
+ "patchSize": 16,
15
+ "imageSize": 512,
16
+ "hiddenSize": 768,
17
+ "numLayers": 12,
18
+ "numHeads": 12,
19
+ "parameterCount": 93000000,
20
+ "_note": "SigLIP base patch-16/512 backbone. Requires dedicated vision encoder pipeline in Doppler."
21
+ },
22
+ "connector": {
23
+ "type": "mlp_pixel_shuffle",
24
+ "downsampleFactor": 2,
25
+ "_note": "Idefics3/SmolVLM-style projection. Maps vision tokens to decoder embedding space."
26
+ }
27
+ },
28
+
29
+ "inference": {
30
+ "attention": {
31
+ "queryKeyNorm": false,
32
+ "causal": true
33
+ },
34
+ "normalization": {
35
+ "rmsNormWeightOffset": false,
36
+ "rmsNormEps": 1e-5
37
+ },
38
+ "ffn": {
39
+ "activation": "silu"
40
+ },
41
+ "output": {
42
+ "scaleEmbeddings": false,
43
+ "tieWordEmbeddings": true
44
+ },
45
+ "chatTemplate": {
46
+ "enabled": false
47
+ },
48
+ "kernelPaths": {
49
+ "q4k": {
50
+ "f16": "granite-docling-q4k-dequant-f32a",
51
+ "f32": "granite-docling-q4k-dequant-f32a"
52
+ },
53
+ "f16": {
54
+ "f16": "granite-docling-f16-f32a",
55
+ "f32": "granite-docling-f16-f32a"
56
+ }
57
+ }
58
+ },
59
+
60
+ "tokenizer": {
61
+ "bosToken": "<|endoftext|>",
62
+ "eosTokens": ["<|endoftext|>", "<|im_end|>"],
63
+ "addBosToken": true
64
+ },
65
+
66
+ "detection": {
67
+ "architecturePatterns": ["granite-docling", "GraniteDocling", "smoldocling", "SmolDocling", "SmolVLM"],
68
+ "modelTypePatterns": ["granite-docling", "smoldocling", "smolvlm"]
69
+ }
70
+ }
@@ -39,11 +39,16 @@
39
39
  "period": null,
40
40
  "offset": null,
41
41
  "layerTypes": null
42
+ },
43
+ "chatTemplate": {
44
+ "type": "chatml",
45
+ "enabled": true
42
46
  }
43
47
  },
44
48
 
45
49
  "tokenizer": {
46
- "addBosToken": false,
50
+ "bosTokenId": 1,
51
+ "addBosToken": true,
47
52
  "addEosToken": false
48
53
  },
49
54
 
@@ -0,0 +1,40 @@
1
+ {
2
+ "id": "qwen3_vl",
3
+ "name": "Qwen 3 VL",
4
+ "extends": "qwen3",
5
+
6
+ "architecture": {
7
+ "ropeTheta": 5000000
8
+ },
9
+
10
+ "inference": {
11
+ "normalization": {
12
+ "rmsNormWeightOffset": false
13
+ },
14
+ "rope": {
15
+ "ropeTheta": 5000000,
16
+ "mropeInterleaved": true,
17
+ "mropeSection": [24, 20, 20],
18
+ "partialRotaryFactor": null
19
+ }
20
+ },
21
+
22
+ "vision": {
23
+ "patchSize": 16,
24
+ "spatialMergeSize": 2,
25
+ "temporalPatchSize": 2,
26
+ "eps": 1e-6,
27
+ "minPixels": 3136,
28
+ "maxPixels": 1003520,
29
+ "projectorType": "spatial_merge",
30
+ "normalization": {
31
+ "mean": [0.48145466, 0.4578275, 0.40821073],
32
+ "std": [0.26862954, 0.26130258, 0.27577711]
33
+ }
34
+ },
35
+
36
+ "detection": {
37
+ "architecturePatterns": ["qwen3_vl", "Qwen3VLForConditionalGeneration"],
38
+ "modelTypePatterns": ["qwen3_vl"]
39
+ }
40
+ }
@@ -1,7 +1,8 @@
1
1
  {
2
2
  "id": "experiments/bench/gemma3-bench-q4k",
3
3
  "name": "gemma3-bench-q4k",
4
- "intent": "investigate",
4
+ "description": "Benchmark run for Gemma 3 1B Q4K — calibration-mode throughput measurement.",
5
+ "intent": "calibrate",
5
6
  "stability": "experimental",
6
7
  "owner": "doppler-core",
7
8
  "createdAtUtc": "2026-02-25T00:00:00Z",
@@ -0,0 +1,46 @@
1
+ {
2
+ "id": "experiments/verify/lfm2-verify",
3
+ "name": "lfm2-verify",
4
+ "intent": "verify",
5
+ "stability": "experimental",
6
+ "owner": "doppler-core",
7
+ "createdAtUtc": "2026-03-16T00:00:00Z",
8
+ "extends": "modes/bench",
9
+ "model": "lfm2-5-1-2b-instruct-q4k-ehf16-af32",
10
+ "runtime": {
11
+ "shared": {
12
+ "tooling": {
13
+ "intent": "verify"
14
+ },
15
+ "debug": {
16
+ "logLevel": {
17
+ "defaultLogLevel": "warn"
18
+ },
19
+ "trace": {
20
+ "enabled": false
21
+ },
22
+ "profiler": {
23
+ "enabled": false
24
+ }
25
+ }
26
+ },
27
+ "inference": {
28
+ "prompt": {
29
+ "messages": [
30
+ {
31
+ "role": "user",
32
+ "content": "What color is the sky on a clear day?"
33
+ }
34
+ ]
35
+ },
36
+ "batching": {
37
+ "maxTokens": 32
38
+ },
39
+ "sampling": {
40
+ "temperature": 0,
41
+ "topK": 1,
42
+ "topP": 1
43
+ }
44
+ }
45
+ }
46
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "id": "experiments/verify/translategemma-verify",
3
+ "name": "translategemma-verify",
4
+ "intent": "verify",
5
+ "stability": "experimental",
6
+ "owner": "doppler-core",
7
+ "createdAtUtc": "2026-03-16T00:00:00Z",
8
+ "extends": "modes/bench",
9
+ "model": "translategemma-4b-it-q4k-ehf16-af32",
10
+ "runtime": {
11
+ "shared": {
12
+ "tooling": {
13
+ "intent": "verify"
14
+ },
15
+ "debug": {
16
+ "logLevel": {
17
+ "defaultLogLevel": "warn"
18
+ },
19
+ "trace": {
20
+ "enabled": false
21
+ },
22
+ "profiler": {
23
+ "enabled": false
24
+ }
25
+ }
26
+ },
27
+ "inference": {
28
+ "prompt": "Hello from Doppler.",
29
+ "batching": {
30
+ "maxTokens": 32
31
+ },
32
+ "sampling": {
33
+ "temperature": 0,
34
+ "topK": 1,
35
+ "topP": 1
36
+ }
37
+ }
38
+ }
39
+ }
@@ -6,6 +6,7 @@
6
6
  "stability": "canonical",
7
7
  "owner": "doppler-core",
8
8
  "createdAtUtc": "2026-02-25T00:00:00Z",
9
+ "extends": "default",
9
10
  "runtime": {
10
11
  "shared": {
11
12
  "tooling": {
@@ -0,0 +1,69 @@
1
+ {
2
+ "id": "tiers/gemma4-16gb",
3
+ "name": "Gemma 4 — 16 GB tier (constrained)",
4
+ "description": "Gemma 4 MoE runtime tier for 16 GB GPU memory. Aggressively constrained: short context, minimal expert cache, hard budget enforcement. Fail-closed if budget is not met.",
5
+ "intent": "investigate",
6
+ "stability": "experimental",
7
+ "owner": "doppler-core",
8
+ "createdAtUtc": "2026-03-17T00:00:00Z",
9
+ "extends": "default",
10
+ "runtime": {
11
+ "shared": {
12
+ "bufferPool": {
13
+ "budget": {
14
+ "maxTotalBytes": 13958643712,
15
+ "highWatermarkRatio": 0.85,
16
+ "emergencyTrimTargetRatio": 0.7,
17
+ "hardFailOnBudgetExceeded": true
18
+ }
19
+ }
20
+ },
21
+ "loading": {
22
+ "expertCache": {
23
+ "defaultSizeBytes": 1073741824,
24
+ "maxBufferPercentage": 0.15,
25
+ "evictionHighWatermark": 0.8,
26
+ "emergencyTrimToRatio": 0.65
27
+ },
28
+ "prefetch": {
29
+ "enabled": true,
30
+ "layersAhead": 1,
31
+ "maxShards": 4
32
+ },
33
+ "memoryManagement": {
34
+ "flushIntervalLayers": 1,
35
+ "flushThresholdBytes": 134217728
36
+ }
37
+ },
38
+ "inference": {
39
+ "kvcache": {
40
+ "layout": "contiguous",
41
+ "maxSeqLen": 2048,
42
+ "kvDtype": "f16",
43
+ "pageSize": 128,
44
+ "tiering": {
45
+ "mode": "off"
46
+ }
47
+ },
48
+ "moe": {
49
+ "routing": {
50
+ "routerDtype": "f32"
51
+ },
52
+ "cache": {
53
+ "dequantCacheMaxEntries": 2
54
+ }
55
+ },
56
+ "compute": {
57
+ "activationDtype": "f32"
58
+ },
59
+ "batching": {
60
+ "maxTokens": 512
61
+ },
62
+ "session": {
63
+ "kvcache": {
64
+ "kvDtype": "f16"
65
+ }
66
+ }
67
+ }
68
+ }
69
+ }
@@ -0,0 +1,66 @@
1
+ {
2
+ "id": "tiers/gemma4-24gb",
3
+ "name": "Gemma 4 — 24 GB tier",
4
+ "description": "Gemma 4 MoE runtime tier for 24 GB GPU memory. Moderate expert cache, contiguous KV, reduced context length.",
5
+ "intent": "investigate",
6
+ "stability": "experimental",
7
+ "owner": "doppler-core",
8
+ "createdAtUtc": "2026-03-17T00:00:00Z",
9
+ "extends": "default",
10
+ "runtime": {
11
+ "shared": {
12
+ "bufferPool": {
13
+ "budget": {
14
+ "maxTotalBytes": 21474836480,
15
+ "highWatermarkRatio": 0.9,
16
+ "emergencyTrimTargetRatio": 0.75,
17
+ "hardFailOnBudgetExceeded": true
18
+ }
19
+ }
20
+ },
21
+ "loading": {
22
+ "expertCache": {
23
+ "defaultSizeBytes": 3221225472,
24
+ "maxBufferPercentage": 0.2,
25
+ "evictionHighWatermark": 0.85,
26
+ "emergencyTrimToRatio": 0.7
27
+ },
28
+ "prefetch": {
29
+ "enabled": true,
30
+ "layersAhead": 1,
31
+ "maxShards": 8
32
+ },
33
+ "memoryManagement": {
34
+ "flushIntervalLayers": 2,
35
+ "flushThresholdBytes": 268435456
36
+ }
37
+ },
38
+ "inference": {
39
+ "kvcache": {
40
+ "layout": "contiguous",
41
+ "maxSeqLen": 4096,
42
+ "kvDtype": "f16",
43
+ "pageSize": 256,
44
+ "tiering": {
45
+ "mode": "off"
46
+ }
47
+ },
48
+ "moe": {
49
+ "routing": {
50
+ "routerDtype": "f32"
51
+ },
52
+ "cache": {
53
+ "dequantCacheMaxEntries": 64
54
+ }
55
+ },
56
+ "compute": {
57
+ "activationDtype": "f32"
58
+ },
59
+ "session": {
60
+ "kvcache": {
61
+ "kvDtype": "f16"
62
+ }
63
+ }
64
+ }
65
+ }
66
+ }
@@ -0,0 +1,66 @@
1
+ {
2
+ "id": "tiers/gemma4-32gb",
3
+ "name": "Gemma 4 — 32 GB tier",
4
+ "description": "Gemma 4 MoE runtime tier for 32 GB GPU memory. Generous expert cache, contiguous KV, full-length context.",
5
+ "intent": "investigate",
6
+ "stability": "experimental",
7
+ "owner": "doppler-core",
8
+ "createdAtUtc": "2026-03-17T00:00:00Z",
9
+ "extends": "default",
10
+ "runtime": {
11
+ "shared": {
12
+ "bufferPool": {
13
+ "budget": {
14
+ "maxTotalBytes": 30064771072,
15
+ "highWatermarkRatio": 0.9,
16
+ "emergencyTrimTargetRatio": 0.75,
17
+ "hardFailOnBudgetExceeded": true
18
+ }
19
+ }
20
+ },
21
+ "loading": {
22
+ "expertCache": {
23
+ "defaultSizeBytes": 6442450944,
24
+ "maxBufferPercentage": 0.25,
25
+ "evictionHighWatermark": 0.9,
26
+ "emergencyTrimToRatio": 0.75
27
+ },
28
+ "prefetch": {
29
+ "enabled": true,
30
+ "layersAhead": 2,
31
+ "maxShards": 16
32
+ },
33
+ "memoryManagement": {
34
+ "flushIntervalLayers": 4,
35
+ "flushThresholdBytes": 536870912
36
+ }
37
+ },
38
+ "inference": {
39
+ "kvcache": {
40
+ "layout": "contiguous",
41
+ "maxSeqLen": 8192,
42
+ "kvDtype": "f16",
43
+ "pageSize": 256,
44
+ "tiering": {
45
+ "mode": "off"
46
+ }
47
+ },
48
+ "moe": {
49
+ "routing": {
50
+ "routerDtype": "f32"
51
+ },
52
+ "cache": {
53
+ "dequantCacheMaxEntries": 128
54
+ }
55
+ },
56
+ "compute": {
57
+ "activationDtype": "f32"
58
+ },
59
+ "session": {
60
+ "kvcache": {
61
+ "kvDtype": "f16"
62
+ }
63
+ }
64
+ }
65
+ }
66
+ }
@@ -58,4 +58,7 @@ function assertNoDeprecatedRuntimeKeys(overrides) {
58
58
  if (inference?.sampling?.maxTokens !== undefined) {
59
59
  throw new Error('sampling.maxTokens is removed; use inference.batching.maxTokens');
60
60
  }
61
+ if (inference?.session?.maxNewTokens !== undefined) {
62
+ throw new Error('inference.session.maxNewTokens is not a supported runtime config key; use inference.batching.maxTokens');
63
+ }
61
64
  }
@@ -131,6 +131,44 @@ export interface PipelineDebugConfigSchema {
131
131
  /** Default pipeline debug configuration */
132
132
  export declare const DEFAULT_PIPELINE_DEBUG_CONFIG: PipelineDebugConfigSchema;
133
133
 
134
+ /** Loader debug configuration (Q4K dequant and related probes). */
135
+ export interface LoaderDebugConfigSchema {
136
+ /** Enable loader debug behavior (default: false) */
137
+ enabled: boolean;
138
+ /** Force GPU dequant for Q4K tensors even when CPU fallback is eligible. */
139
+ forceGpuDequant: boolean;
140
+ /** Prefer CPU dequant for F32 output when eligible (default: false, GPU is preferred). */
141
+ preferCpuDequant: boolean;
142
+ /** Throw when CPU dequant fallback is taken. */
143
+ failOnCpuDequantPath: boolean;
144
+ /** Enable dtype-aware GPU-vs-CPU parity checks during Q4K dequant. */
145
+ runQ4KDequantParity: boolean;
146
+ /** Number of values to read back for parity checks. */
147
+ q4kDequantParitySamples: number;
148
+ }
149
+
150
+ /** Default loader debug configuration. */
151
+ export declare const DEFAULT_LOADER_DEBUG_CONFIG: LoaderDebugConfigSchema;
152
+
153
+ /** Matmul debug configuration (attention split/shape diagnostics). */
154
+ export interface MatmulDebugConfigSchema {
155
+ /** Enable matmul debug behavior (default: false) */
156
+ enabled: boolean;
157
+ /** Force split (non-fused) Q/K/V projection path for diagnostics. */
158
+ forceSplitQKV: boolean;
159
+ /** Validate B tensor layout/buffer bytes for attention projection roles. */
160
+ validateAttentionWeightBuffer: boolean;
161
+ /** Throw if validation fails due to a small B tensor. */
162
+ failOnSmallAttentionWeightBuffer: boolean;
163
+ /** Emit attention B-buffer diagnostics. */
164
+ logAttentionWeightBuffer: boolean;
165
+ /** Log first-8 projection output values for layer 0 decode (diagnostic). */
166
+ logProjectionValues: boolean;
167
+ }
168
+
169
+ /** Default matmul debug configuration. */
170
+ export declare const DEFAULT_MATMUL_DEBUG_CONFIG: MatmulDebugConfigSchema;
171
+
134
172
  /**
135
173
  * Profiler configuration.
136
174
  */
@@ -241,6 +279,8 @@ export interface DebugConfigSchema {
241
279
  logLevel: LogLevelConfigSchema;
242
280
  trace: TraceConfigSchema;
243
281
  pipeline: PipelineDebugConfigSchema;
282
+ loader: LoaderDebugConfigSchema;
283
+ matmul: MatmulDebugConfigSchema;
244
284
  probes: ProbeConfigSchema[];
245
285
  profiler: ProfilerConfigSchema;
246
286
  perfGuards: PerfGuardsConfigSchema;
@@ -38,6 +38,32 @@ export const DEFAULT_TRACE_CONFIG = {
38
38
  file: null,
39
39
  };
40
40
 
41
+ // =============================================================================
42
+ // Loader Debug Config
43
+ // =============================================================================
44
+
45
+ export const DEFAULT_LOADER_DEBUG_CONFIG = {
46
+ enabled: false,
47
+ forceGpuDequant: false,
48
+ preferCpuDequant: false,
49
+ failOnCpuDequantPath: false,
50
+ runQ4KDequantParity: false,
51
+ q4kDequantParitySamples: 256,
52
+ };
53
+
54
+ // =============================================================================
55
+ // Kernel Debug Config
56
+ // =============================================================================
57
+
58
+ export const DEFAULT_MATMUL_DEBUG_CONFIG = {
59
+ enabled: false,
60
+ forceSplitQKV: false,
61
+ validateAttentionWeightBuffer: false,
62
+ failOnSmallAttentionWeightBuffer: false,
63
+ logAttentionWeightBuffer: false,
64
+ logProjectionValues: false,
65
+ };
66
+
41
67
  // =============================================================================
42
68
  // Kernel Trace Config (kernel-trace.js anomaly detection)
43
69
  // =============================================================================
@@ -100,6 +126,8 @@ export const DEFAULT_DEBUG_CONFIG = {
100
126
  logLevel: DEFAULT_LOG_LEVEL_CONFIG,
101
127
  trace: DEFAULT_TRACE_CONFIG,
102
128
  pipeline: DEFAULT_PIPELINE_DEBUG_CONFIG,
129
+ loader: DEFAULT_LOADER_DEBUG_CONFIG,
130
+ matmul: DEFAULT_MATMUL_DEBUG_CONFIG,
103
131
  probes: [],
104
132
  profiler: DEFAULT_PROFILER_CONFIG,
105
133
  perfGuards: DEFAULT_PERF_GUARDS_CONFIG,
@@ -217,6 +217,8 @@ export {
217
217
  DEFAULT_LOG_HISTORY_CONFIG,
218
218
  DEFAULT_LOG_LEVEL_CONFIG,
219
219
  DEFAULT_TRACE_CONFIG,
220
+ DEFAULT_LOADER_DEBUG_CONFIG,
221
+ DEFAULT_MATMUL_DEBUG_CONFIG,
220
222
  DEFAULT_KERNEL_TRACE_CONFIG,
221
223
  DEFAULT_PIPELINE_DEBUG_CONFIG,
222
224
  DEFAULT_PROFILER_CONFIG,
@@ -93,7 +93,7 @@ export const DEFAULT_TOKENIZER_DEFAULTS = {
93
93
  // =============================================================================
94
94
 
95
95
  export const DEFAULT_CHAT_TEMPLATE_CONFIG = {
96
- enabled: false,
96
+ enabled: undefined,
97
97
  };
98
98
 
99
99
  export const DEFAULT_KERNEL_PATH_POLICY = {
@@ -160,6 +160,7 @@ export type BuiltinKernelPathId =
160
160
  | 'gemma3-f16-fused-f32a-online-streamingprefill' // Gemma 3 F16 fused FFN online path with streaming prefill attention
161
161
  | 'gemma3-q4k-dequant-f16a-online' // Gemma 3 Q4K dequant online path (F16 activations)
162
162
  | 'gemma3-q4k-dequant-f32a-online' // Gemma 3 Q4K dequant online path with F32 activations
163
+ | 'gemma3-q4k-dequant-f32w-f32a-online' // Gemma 3 Q4K path with F32 projection weights and F32 activations
163
164
  | 'gemma3-q4k-dequant-f32a-nosubgroups' // Gemma 3 Q4K dequant path with no subgroup requirement
164
165
  | 'gemma3-q4k-dequant-f32a' // Legacy alias for gemma3-q4k-dequant-f32a-nosubgroups
165
166
  | 'lfm2-q4k-dequant-f32a-online' // LFM2 Q4K path with F32 activations and fast prefill
@@ -40,8 +40,8 @@ export const DEFAULT_SEGMENT_ALLOCATION_CONFIG = {
40
40
  // =============================================================================
41
41
 
42
42
  export const DEFAULT_EMULATED_STORAGE_CONFIG = {
43
- vramBudgetBytes: 2 * GB,
44
- ramBudgetBytes: 8 * GB,
43
+ vramBudgetBytes: 4 * GB,
44
+ ramBudgetBytes: 16 * GB,
45
45
  };
46
46
 
47
47
  // =============================================================================