@simulatte/doppler 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -1
- package/README.md +25 -6
- package/package.json +5 -3
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +16 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/loader.js +6 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +7 -0
- package/src/config/presets/models/gemma3.json +2 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +1 -1
- package/src/converter/conversion-plan.js +1 -1
- package/src/converter/core.js +17 -8
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +15 -0
- package/src/distribution/shard-delivery.js +34 -0
- package/src/formats/rdrr/classification.js +32 -0
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +1 -0
- package/src/gpu/kernels/matmul.d.ts +3 -0
- package/src/gpu/kernels/matmul.js +70 -1
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
- package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
- package/src/inference/pipelines/text/attention/projections.js +13 -2
- package/src/inference/pipelines/text/attention/record.js +1 -0
- package/src/inference/pipelines/text/attention/run.js +9 -0
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +32 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +14 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
- package/src/inference/pipelines/text/generator-steps.js +46 -29
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +320 -166
- package/src/inference/pipelines/text/init.d.ts +2 -0
- package/src/inference/pipelines/text/init.js +19 -5
- package/src/inference/pipelines/text/layer.js +37 -8
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +9 -7
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +124 -3
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +2 -0
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/tooling/node-converter.js +25 -7
- package/src/tooling/node-source-runtime.js +29 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/src/tooling/node-convert.d.ts +0 -54
|
@@ -92,6 +92,13 @@
|
|
|
92
92
|
"statusReason": "default",
|
|
93
93
|
"notes": "Gemma 3 Q4K dequant default: subgroup GEMV + online attention + tuned lm_head multicol, F32 activations."
|
|
94
94
|
},
|
|
95
|
+
{
|
|
96
|
+
"id": "gemma3-q4k-dequant-f32a-small-attn",
|
|
97
|
+
"file": "gemma3-q4k-dequant-f32a-small-attn.json",
|
|
98
|
+
"status": "experimental",
|
|
99
|
+
"statusReason": "diagnostic-probe",
|
|
100
|
+
"notes": "Diagnostic: same as gemma3-q4k-dequant-f32a-online but uses attention_small_f16kv.wgsl for prefill to isolate streaming attention bug."
|
|
101
|
+
},
|
|
95
102
|
{
|
|
96
103
|
"id": "gemma3-q4k-dequant-f32w-f32a-online",
|
|
97
104
|
"file": "gemma3-q4k-dequant-f32w-f32a-online.json",
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "gemma4",
|
|
3
|
+
"name": "Gemma 4",
|
|
4
|
+
"extends": "gemma3",
|
|
5
|
+
"modelType": "mixtral",
|
|
6
|
+
|
|
7
|
+
"inference": {
|
|
8
|
+
"attention": {
|
|
9
|
+
"slidingWindow": 1024
|
|
10
|
+
},
|
|
11
|
+
"rope": {
|
|
12
|
+
"ropeTheta": 1000000,
|
|
13
|
+
"ropeLocalTheta": 10000,
|
|
14
|
+
"ropeScalingType": "yarn",
|
|
15
|
+
"ropeScalingFactor": 8.0,
|
|
16
|
+
"yarnBetaFast": 4.0,
|
|
17
|
+
"yarnBetaSlow": 1.0,
|
|
18
|
+
"yarnOriginalMaxPos": 32768
|
|
19
|
+
},
|
|
20
|
+
"moe": {
|
|
21
|
+
"kernelProfileId": "mixtral-moe-v1",
|
|
22
|
+
"numExperts": 8,
|
|
23
|
+
"topK": 2,
|
|
24
|
+
"numSharedExperts": 0,
|
|
25
|
+
"routerDtype": "f32",
|
|
26
|
+
"supportedActivationDtypes": ["f16", "f32"],
|
|
27
|
+
"preferredActivationDtype": "f32",
|
|
28
|
+
"tensorPattern": "mixtral"
|
|
29
|
+
},
|
|
30
|
+
"kernelPaths": {
|
|
31
|
+
"q4k": {
|
|
32
|
+
"default": "gemma3-q4k-dequant-f32a-online",
|
|
33
|
+
"f16": "gemma3-q4k-dequant-f16a-online",
|
|
34
|
+
"f16a": "gemma3-q4k-dequant-f16a-online",
|
|
35
|
+
"f32": "gemma3-q4k-dequant-f32a-online"
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
},
|
|
39
|
+
|
|
40
|
+
"tensorPatterns": {
|
|
41
|
+
"ffn": {
|
|
42
|
+
"gate": ["layers.{layer}.block_sparse_moe.experts.{expert}.w1.weight"],
|
|
43
|
+
"up": ["layers.{layer}.block_sparse_moe.experts.{expert}.w3.weight"],
|
|
44
|
+
"down": ["layers.{layer}.block_sparse_moe.experts.{expert}.w2.weight"]
|
|
45
|
+
}
|
|
46
|
+
},
|
|
47
|
+
|
|
48
|
+
"detection": {
|
|
49
|
+
"architecturePatterns": [
|
|
50
|
+
"gemma4",
|
|
51
|
+
"Gemma4ForCausalLM",
|
|
52
|
+
"Gemma4ForConditionalGeneration",
|
|
53
|
+
"gemma-4"
|
|
54
|
+
],
|
|
55
|
+
"modelTypePatterns": [
|
|
56
|
+
"gemma4",
|
|
57
|
+
"gemma4_text",
|
|
58
|
+
"gemma4_moe"
|
|
59
|
+
]
|
|
60
|
+
}
|
|
61
|
+
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "granite-docling",
|
|
3
|
+
"name": "Granite-Docling (Document OCR VLM)",
|
|
4
|
+
"extends": "transformer",
|
|
5
|
+
"modelType": "ocr",
|
|
6
|
+
|
|
7
|
+
"_notes": "Stabilized successor to SmolDocling-256M-preview. Requires full multimodal pipeline: SigLIP vision encoder, Idefics3-style image-token merge, pixel-shuffle connector, SmolLM2 decoder, DocTags output parsing. This preset covers the decoder config only — vision encoder and connector are separate pipeline stages not yet implemented in Doppler.",
|
|
8
|
+
|
|
9
|
+
"architecture": {
|
|
10
|
+
"headDim": 64,
|
|
11
|
+
"ropeTheta": 10000,
|
|
12
|
+
"visionEncoder": {
|
|
13
|
+
"type": "siglip_b16",
|
|
14
|
+
"patchSize": 16,
|
|
15
|
+
"imageSize": 512,
|
|
16
|
+
"hiddenSize": 768,
|
|
17
|
+
"numLayers": 12,
|
|
18
|
+
"numHeads": 12,
|
|
19
|
+
"parameterCount": 93000000,
|
|
20
|
+
"_note": "SigLIP base patch-16/512 backbone. Requires dedicated vision encoder pipeline in Doppler."
|
|
21
|
+
},
|
|
22
|
+
"connector": {
|
|
23
|
+
"type": "mlp_pixel_shuffle",
|
|
24
|
+
"downsampleFactor": 2,
|
|
25
|
+
"_note": "Idefics3/SmolVLM-style projection. Maps vision tokens to decoder embedding space."
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
|
|
29
|
+
"inference": {
|
|
30
|
+
"attention": {
|
|
31
|
+
"queryKeyNorm": false,
|
|
32
|
+
"causal": true
|
|
33
|
+
},
|
|
34
|
+
"normalization": {
|
|
35
|
+
"rmsNormWeightOffset": false,
|
|
36
|
+
"rmsNormEps": 1e-5
|
|
37
|
+
},
|
|
38
|
+
"ffn": {
|
|
39
|
+
"activation": "silu"
|
|
40
|
+
},
|
|
41
|
+
"output": {
|
|
42
|
+
"scaleEmbeddings": false,
|
|
43
|
+
"tieWordEmbeddings": true
|
|
44
|
+
},
|
|
45
|
+
"chatTemplate": {
|
|
46
|
+
"enabled": false
|
|
47
|
+
},
|
|
48
|
+
"kernelPaths": {
|
|
49
|
+
"q4k": {
|
|
50
|
+
"f16": "granite-docling-q4k-dequant-f32a",
|
|
51
|
+
"f32": "granite-docling-q4k-dequant-f32a"
|
|
52
|
+
},
|
|
53
|
+
"f16": {
|
|
54
|
+
"f16": "granite-docling-f16-f32a",
|
|
55
|
+
"f32": "granite-docling-f16-f32a"
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
},
|
|
59
|
+
|
|
60
|
+
"tokenizer": {
|
|
61
|
+
"bosToken": "<|endoftext|>",
|
|
62
|
+
"eosTokens": ["<|endoftext|>", "<|im_end|>"],
|
|
63
|
+
"addBosToken": true
|
|
64
|
+
},
|
|
65
|
+
|
|
66
|
+
"detection": {
|
|
67
|
+
"architecturePatterns": ["granite-docling", "GraniteDocling", "smoldocling", "SmolDocling", "SmolVLM"],
|
|
68
|
+
"modelTypePatterns": ["granite-docling", "smoldocling", "smolvlm"]
|
|
69
|
+
}
|
|
70
|
+
}
|
|
@@ -39,11 +39,16 @@
|
|
|
39
39
|
"period": null,
|
|
40
40
|
"offset": null,
|
|
41
41
|
"layerTypes": null
|
|
42
|
+
},
|
|
43
|
+
"chatTemplate": {
|
|
44
|
+
"type": "chatml",
|
|
45
|
+
"enabled": true
|
|
42
46
|
}
|
|
43
47
|
},
|
|
44
48
|
|
|
45
49
|
"tokenizer": {
|
|
46
|
-
"
|
|
50
|
+
"bosTokenId": 1,
|
|
51
|
+
"addBosToken": true,
|
|
47
52
|
"addEosToken": false
|
|
48
53
|
},
|
|
49
54
|
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "qwen3_vl",
|
|
3
|
+
"name": "Qwen 3 VL",
|
|
4
|
+
"extends": "qwen3",
|
|
5
|
+
|
|
6
|
+
"architecture": {
|
|
7
|
+
"ropeTheta": 5000000
|
|
8
|
+
},
|
|
9
|
+
|
|
10
|
+
"inference": {
|
|
11
|
+
"normalization": {
|
|
12
|
+
"rmsNormWeightOffset": false
|
|
13
|
+
},
|
|
14
|
+
"rope": {
|
|
15
|
+
"ropeTheta": 5000000,
|
|
16
|
+
"mropeInterleaved": true,
|
|
17
|
+
"mropeSection": [24, 20, 20],
|
|
18
|
+
"partialRotaryFactor": null
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
|
|
22
|
+
"vision": {
|
|
23
|
+
"patchSize": 16,
|
|
24
|
+
"spatialMergeSize": 2,
|
|
25
|
+
"temporalPatchSize": 2,
|
|
26
|
+
"eps": 1e-6,
|
|
27
|
+
"minPixels": 3136,
|
|
28
|
+
"maxPixels": 1003520,
|
|
29
|
+
"projectorType": "spatial_merge",
|
|
30
|
+
"normalization": {
|
|
31
|
+
"mean": [0.48145466, 0.4578275, 0.40821073],
|
|
32
|
+
"std": [0.26862954, 0.26130258, 0.27577711]
|
|
33
|
+
}
|
|
34
|
+
},
|
|
35
|
+
|
|
36
|
+
"detection": {
|
|
37
|
+
"architecturePatterns": ["qwen3_vl", "Qwen3VLForConditionalGeneration"],
|
|
38
|
+
"modelTypePatterns": ["qwen3_vl"]
|
|
39
|
+
}
|
|
40
|
+
}
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"id": "experiments/bench/gemma3-bench-q4k",
|
|
3
3
|
"name": "gemma3-bench-q4k",
|
|
4
|
-
"
|
|
4
|
+
"description": "Benchmark run for Gemma 3 1B Q4K — calibration-mode throughput measurement.",
|
|
5
|
+
"intent": "calibrate",
|
|
5
6
|
"stability": "experimental",
|
|
6
7
|
"owner": "doppler-core",
|
|
7
8
|
"createdAtUtc": "2026-02-25T00:00:00Z",
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "experiments/verify/lfm2-verify",
|
|
3
|
+
"name": "lfm2-verify",
|
|
4
|
+
"intent": "verify",
|
|
5
|
+
"stability": "experimental",
|
|
6
|
+
"owner": "doppler-core",
|
|
7
|
+
"createdAtUtc": "2026-03-16T00:00:00Z",
|
|
8
|
+
"extends": "modes/bench",
|
|
9
|
+
"model": "lfm2-5-1-2b-instruct-q4k-ehf16-af32",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"tooling": {
|
|
13
|
+
"intent": "verify"
|
|
14
|
+
},
|
|
15
|
+
"debug": {
|
|
16
|
+
"logLevel": {
|
|
17
|
+
"defaultLogLevel": "warn"
|
|
18
|
+
},
|
|
19
|
+
"trace": {
|
|
20
|
+
"enabled": false
|
|
21
|
+
},
|
|
22
|
+
"profiler": {
|
|
23
|
+
"enabled": false
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"inference": {
|
|
28
|
+
"prompt": {
|
|
29
|
+
"messages": [
|
|
30
|
+
{
|
|
31
|
+
"role": "user",
|
|
32
|
+
"content": "What color is the sky on a clear day?"
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
},
|
|
36
|
+
"batching": {
|
|
37
|
+
"maxTokens": 32
|
|
38
|
+
},
|
|
39
|
+
"sampling": {
|
|
40
|
+
"temperature": 0,
|
|
41
|
+
"topK": 1,
|
|
42
|
+
"topP": 1
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "experiments/verify/translategemma-verify",
|
|
3
|
+
"name": "translategemma-verify",
|
|
4
|
+
"intent": "verify",
|
|
5
|
+
"stability": "experimental",
|
|
6
|
+
"owner": "doppler-core",
|
|
7
|
+
"createdAtUtc": "2026-03-16T00:00:00Z",
|
|
8
|
+
"extends": "modes/bench",
|
|
9
|
+
"model": "translategemma-4b-it-q4k-ehf16-af32",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"tooling": {
|
|
13
|
+
"intent": "verify"
|
|
14
|
+
},
|
|
15
|
+
"debug": {
|
|
16
|
+
"logLevel": {
|
|
17
|
+
"defaultLogLevel": "warn"
|
|
18
|
+
},
|
|
19
|
+
"trace": {
|
|
20
|
+
"enabled": false
|
|
21
|
+
},
|
|
22
|
+
"profiler": {
|
|
23
|
+
"enabled": false
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"inference": {
|
|
28
|
+
"prompt": "Hello from Doppler.",
|
|
29
|
+
"batching": {
|
|
30
|
+
"maxTokens": 32
|
|
31
|
+
},
|
|
32
|
+
"sampling": {
|
|
33
|
+
"temperature": 0,
|
|
34
|
+
"topK": 1,
|
|
35
|
+
"topP": 1
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "tiers/gemma4-16gb",
|
|
3
|
+
"name": "Gemma 4 — 16 GB tier (constrained)",
|
|
4
|
+
"description": "Gemma 4 MoE runtime tier for 16 GB GPU memory. Aggressively constrained: short context, minimal expert cache, hard budget enforcement. Fail-closed if budget is not met.",
|
|
5
|
+
"intent": "investigate",
|
|
6
|
+
"stability": "experimental",
|
|
7
|
+
"owner": "doppler-core",
|
|
8
|
+
"createdAtUtc": "2026-03-17T00:00:00Z",
|
|
9
|
+
"extends": "default",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"bufferPool": {
|
|
13
|
+
"budget": {
|
|
14
|
+
"maxTotalBytes": 13958643712,
|
|
15
|
+
"highWatermarkRatio": 0.85,
|
|
16
|
+
"emergencyTrimTargetRatio": 0.7,
|
|
17
|
+
"hardFailOnBudgetExceeded": true
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"loading": {
|
|
22
|
+
"expertCache": {
|
|
23
|
+
"defaultSizeBytes": 1073741824,
|
|
24
|
+
"maxBufferPercentage": 0.15,
|
|
25
|
+
"evictionHighWatermark": 0.8,
|
|
26
|
+
"emergencyTrimToRatio": 0.65
|
|
27
|
+
},
|
|
28
|
+
"prefetch": {
|
|
29
|
+
"enabled": true,
|
|
30
|
+
"layersAhead": 1,
|
|
31
|
+
"maxShards": 4
|
|
32
|
+
},
|
|
33
|
+
"memoryManagement": {
|
|
34
|
+
"flushIntervalLayers": 1,
|
|
35
|
+
"flushThresholdBytes": 134217728
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
"inference": {
|
|
39
|
+
"kvcache": {
|
|
40
|
+
"layout": "contiguous",
|
|
41
|
+
"maxSeqLen": 2048,
|
|
42
|
+
"kvDtype": "f16",
|
|
43
|
+
"pageSize": 128,
|
|
44
|
+
"tiering": {
|
|
45
|
+
"mode": "off"
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"moe": {
|
|
49
|
+
"routing": {
|
|
50
|
+
"routerDtype": "f32"
|
|
51
|
+
},
|
|
52
|
+
"cache": {
|
|
53
|
+
"dequantCacheMaxEntries": 2
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
"compute": {
|
|
57
|
+
"activationDtype": "f32"
|
|
58
|
+
},
|
|
59
|
+
"batching": {
|
|
60
|
+
"maxTokens": 512
|
|
61
|
+
},
|
|
62
|
+
"session": {
|
|
63
|
+
"kvcache": {
|
|
64
|
+
"kvDtype": "f16"
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "tiers/gemma4-24gb",
|
|
3
|
+
"name": "Gemma 4 — 24 GB tier",
|
|
4
|
+
"description": "Gemma 4 MoE runtime tier for 24 GB GPU memory. Moderate expert cache, contiguous KV, reduced context length.",
|
|
5
|
+
"intent": "investigate",
|
|
6
|
+
"stability": "experimental",
|
|
7
|
+
"owner": "doppler-core",
|
|
8
|
+
"createdAtUtc": "2026-03-17T00:00:00Z",
|
|
9
|
+
"extends": "default",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"bufferPool": {
|
|
13
|
+
"budget": {
|
|
14
|
+
"maxTotalBytes": 21474836480,
|
|
15
|
+
"highWatermarkRatio": 0.9,
|
|
16
|
+
"emergencyTrimTargetRatio": 0.75,
|
|
17
|
+
"hardFailOnBudgetExceeded": true
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"loading": {
|
|
22
|
+
"expertCache": {
|
|
23
|
+
"defaultSizeBytes": 3221225472,
|
|
24
|
+
"maxBufferPercentage": 0.2,
|
|
25
|
+
"evictionHighWatermark": 0.85,
|
|
26
|
+
"emergencyTrimToRatio": 0.7
|
|
27
|
+
},
|
|
28
|
+
"prefetch": {
|
|
29
|
+
"enabled": true,
|
|
30
|
+
"layersAhead": 1,
|
|
31
|
+
"maxShards": 8
|
|
32
|
+
},
|
|
33
|
+
"memoryManagement": {
|
|
34
|
+
"flushIntervalLayers": 2,
|
|
35
|
+
"flushThresholdBytes": 268435456
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
"inference": {
|
|
39
|
+
"kvcache": {
|
|
40
|
+
"layout": "contiguous",
|
|
41
|
+
"maxSeqLen": 4096,
|
|
42
|
+
"kvDtype": "f16",
|
|
43
|
+
"pageSize": 256,
|
|
44
|
+
"tiering": {
|
|
45
|
+
"mode": "off"
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"moe": {
|
|
49
|
+
"routing": {
|
|
50
|
+
"routerDtype": "f32"
|
|
51
|
+
},
|
|
52
|
+
"cache": {
|
|
53
|
+
"dequantCacheMaxEntries": 64
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
"compute": {
|
|
57
|
+
"activationDtype": "f32"
|
|
58
|
+
},
|
|
59
|
+
"session": {
|
|
60
|
+
"kvcache": {
|
|
61
|
+
"kvDtype": "f16"
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "tiers/gemma4-32gb",
|
|
3
|
+
"name": "Gemma 4 — 32 GB tier",
|
|
4
|
+
"description": "Gemma 4 MoE runtime tier for 32 GB GPU memory. Generous expert cache, contiguous KV, full-length context.",
|
|
5
|
+
"intent": "investigate",
|
|
6
|
+
"stability": "experimental",
|
|
7
|
+
"owner": "doppler-core",
|
|
8
|
+
"createdAtUtc": "2026-03-17T00:00:00Z",
|
|
9
|
+
"extends": "default",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"bufferPool": {
|
|
13
|
+
"budget": {
|
|
14
|
+
"maxTotalBytes": 30064771072,
|
|
15
|
+
"highWatermarkRatio": 0.9,
|
|
16
|
+
"emergencyTrimTargetRatio": 0.75,
|
|
17
|
+
"hardFailOnBudgetExceeded": true
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"loading": {
|
|
22
|
+
"expertCache": {
|
|
23
|
+
"defaultSizeBytes": 6442450944,
|
|
24
|
+
"maxBufferPercentage": 0.25,
|
|
25
|
+
"evictionHighWatermark": 0.9,
|
|
26
|
+
"emergencyTrimToRatio": 0.75
|
|
27
|
+
},
|
|
28
|
+
"prefetch": {
|
|
29
|
+
"enabled": true,
|
|
30
|
+
"layersAhead": 2,
|
|
31
|
+
"maxShards": 16
|
|
32
|
+
},
|
|
33
|
+
"memoryManagement": {
|
|
34
|
+
"flushIntervalLayers": 4,
|
|
35
|
+
"flushThresholdBytes": 536870912
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
"inference": {
|
|
39
|
+
"kvcache": {
|
|
40
|
+
"layout": "contiguous",
|
|
41
|
+
"maxSeqLen": 8192,
|
|
42
|
+
"kvDtype": "f16",
|
|
43
|
+
"pageSize": 256,
|
|
44
|
+
"tiering": {
|
|
45
|
+
"mode": "off"
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"moe": {
|
|
49
|
+
"routing": {
|
|
50
|
+
"routerDtype": "f32"
|
|
51
|
+
},
|
|
52
|
+
"cache": {
|
|
53
|
+
"dequantCacheMaxEntries": 128
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
"compute": {
|
|
57
|
+
"activationDtype": "f32"
|
|
58
|
+
},
|
|
59
|
+
"session": {
|
|
60
|
+
"kvcache": {
|
|
61
|
+
"kvDtype": "f16"
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
package/src/config/runtime.js
CHANGED
|
@@ -58,4 +58,7 @@ function assertNoDeprecatedRuntimeKeys(overrides) {
|
|
|
58
58
|
if (inference?.sampling?.maxTokens !== undefined) {
|
|
59
59
|
throw new Error('sampling.maxTokens is removed; use inference.batching.maxTokens');
|
|
60
60
|
}
|
|
61
|
+
if (inference?.session?.maxNewTokens !== undefined) {
|
|
62
|
+
throw new Error('inference.session.maxNewTokens is not a supported runtime config key; use inference.batching.maxTokens');
|
|
63
|
+
}
|
|
61
64
|
}
|
|
@@ -131,6 +131,44 @@ export interface PipelineDebugConfigSchema {
|
|
|
131
131
|
/** Default pipeline debug configuration */
|
|
132
132
|
export declare const DEFAULT_PIPELINE_DEBUG_CONFIG: PipelineDebugConfigSchema;
|
|
133
133
|
|
|
134
|
+
/** Loader debug configuration (Q4K dequant and related probes). */
|
|
135
|
+
export interface LoaderDebugConfigSchema {
|
|
136
|
+
/** Enable loader debug behavior (default: false) */
|
|
137
|
+
enabled: boolean;
|
|
138
|
+
/** Force GPU dequant for Q4K tensors even when CPU fallback is eligible. */
|
|
139
|
+
forceGpuDequant: boolean;
|
|
140
|
+
/** Prefer CPU dequant for F32 output when eligible (default: false, GPU is preferred). */
|
|
141
|
+
preferCpuDequant: boolean;
|
|
142
|
+
/** Throw when CPU dequant fallback is taken. */
|
|
143
|
+
failOnCpuDequantPath: boolean;
|
|
144
|
+
/** Enable dtype-aware GPU-vs-CPU parity checks during Q4K dequant. */
|
|
145
|
+
runQ4KDequantParity: boolean;
|
|
146
|
+
/** Number of values to read back for parity checks. */
|
|
147
|
+
q4kDequantParitySamples: number;
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
/** Default loader debug configuration. */
|
|
151
|
+
export declare const DEFAULT_LOADER_DEBUG_CONFIG: LoaderDebugConfigSchema;
|
|
152
|
+
|
|
153
|
+
/** Matmul debug configuration (attention split/shape diagnostics). */
|
|
154
|
+
export interface MatmulDebugConfigSchema {
|
|
155
|
+
/** Enable matmul debug behavior (default: false) */
|
|
156
|
+
enabled: boolean;
|
|
157
|
+
/** Force split (non-fused) Q/K/V projection path for diagnostics. */
|
|
158
|
+
forceSplitQKV: boolean;
|
|
159
|
+
/** Validate B tensor layout/buffer bytes for attention projection roles. */
|
|
160
|
+
validateAttentionWeightBuffer: boolean;
|
|
161
|
+
/** Throw if validation fails due to a small B tensor. */
|
|
162
|
+
failOnSmallAttentionWeightBuffer: boolean;
|
|
163
|
+
/** Emit attention B-buffer diagnostics. */
|
|
164
|
+
logAttentionWeightBuffer: boolean;
|
|
165
|
+
/** Log first-8 projection output values for layer 0 decode (diagnostic). */
|
|
166
|
+
logProjectionValues: boolean;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
/** Default matmul debug configuration. */
|
|
170
|
+
export declare const DEFAULT_MATMUL_DEBUG_CONFIG: MatmulDebugConfigSchema;
|
|
171
|
+
|
|
134
172
|
/**
|
|
135
173
|
* Profiler configuration.
|
|
136
174
|
*/
|
|
@@ -241,6 +279,8 @@ export interface DebugConfigSchema {
|
|
|
241
279
|
logLevel: LogLevelConfigSchema;
|
|
242
280
|
trace: TraceConfigSchema;
|
|
243
281
|
pipeline: PipelineDebugConfigSchema;
|
|
282
|
+
loader: LoaderDebugConfigSchema;
|
|
283
|
+
matmul: MatmulDebugConfigSchema;
|
|
244
284
|
probes: ProbeConfigSchema[];
|
|
245
285
|
profiler: ProfilerConfigSchema;
|
|
246
286
|
perfGuards: PerfGuardsConfigSchema;
|
|
@@ -38,6 +38,32 @@ export const DEFAULT_TRACE_CONFIG = {
|
|
|
38
38
|
file: null,
|
|
39
39
|
};
|
|
40
40
|
|
|
41
|
+
// =============================================================================
|
|
42
|
+
// Loader Debug Config
|
|
43
|
+
// =============================================================================
|
|
44
|
+
|
|
45
|
+
export const DEFAULT_LOADER_DEBUG_CONFIG = {
|
|
46
|
+
enabled: false,
|
|
47
|
+
forceGpuDequant: false,
|
|
48
|
+
preferCpuDequant: false,
|
|
49
|
+
failOnCpuDequantPath: false,
|
|
50
|
+
runQ4KDequantParity: false,
|
|
51
|
+
q4kDequantParitySamples: 256,
|
|
52
|
+
};
|
|
53
|
+
|
|
54
|
+
// =============================================================================
|
|
55
|
+
// Kernel Debug Config
|
|
56
|
+
// =============================================================================
|
|
57
|
+
|
|
58
|
+
export const DEFAULT_MATMUL_DEBUG_CONFIG = {
|
|
59
|
+
enabled: false,
|
|
60
|
+
forceSplitQKV: false,
|
|
61
|
+
validateAttentionWeightBuffer: false,
|
|
62
|
+
failOnSmallAttentionWeightBuffer: false,
|
|
63
|
+
logAttentionWeightBuffer: false,
|
|
64
|
+
logProjectionValues: false,
|
|
65
|
+
};
|
|
66
|
+
|
|
41
67
|
// =============================================================================
|
|
42
68
|
// Kernel Trace Config (kernel-trace.js anomaly detection)
|
|
43
69
|
// =============================================================================
|
|
@@ -100,6 +126,8 @@ export const DEFAULT_DEBUG_CONFIG = {
|
|
|
100
126
|
logLevel: DEFAULT_LOG_LEVEL_CONFIG,
|
|
101
127
|
trace: DEFAULT_TRACE_CONFIG,
|
|
102
128
|
pipeline: DEFAULT_PIPELINE_DEBUG_CONFIG,
|
|
129
|
+
loader: DEFAULT_LOADER_DEBUG_CONFIG,
|
|
130
|
+
matmul: DEFAULT_MATMUL_DEBUG_CONFIG,
|
|
103
131
|
probes: [],
|
|
104
132
|
profiler: DEFAULT_PROFILER_CONFIG,
|
|
105
133
|
perfGuards: DEFAULT_PERF_GUARDS_CONFIG,
|
|
@@ -217,6 +217,8 @@ export {
|
|
|
217
217
|
DEFAULT_LOG_HISTORY_CONFIG,
|
|
218
218
|
DEFAULT_LOG_LEVEL_CONFIG,
|
|
219
219
|
DEFAULT_TRACE_CONFIG,
|
|
220
|
+
DEFAULT_LOADER_DEBUG_CONFIG,
|
|
221
|
+
DEFAULT_MATMUL_DEBUG_CONFIG,
|
|
220
222
|
DEFAULT_KERNEL_TRACE_CONFIG,
|
|
221
223
|
DEFAULT_PIPELINE_DEBUG_CONFIG,
|
|
222
224
|
DEFAULT_PROFILER_CONFIG,
|
|
@@ -93,7 +93,7 @@ export const DEFAULT_TOKENIZER_DEFAULTS = {
|
|
|
93
93
|
// =============================================================================
|
|
94
94
|
|
|
95
95
|
export const DEFAULT_CHAT_TEMPLATE_CONFIG = {
|
|
96
|
-
enabled:
|
|
96
|
+
enabled: undefined,
|
|
97
97
|
};
|
|
98
98
|
|
|
99
99
|
export const DEFAULT_KERNEL_PATH_POLICY = {
|
|
@@ -160,6 +160,7 @@ export type BuiltinKernelPathId =
|
|
|
160
160
|
| 'gemma3-f16-fused-f32a-online-streamingprefill' // Gemma 3 F16 fused FFN online path with streaming prefill attention
|
|
161
161
|
| 'gemma3-q4k-dequant-f16a-online' // Gemma 3 Q4K dequant online path (F16 activations)
|
|
162
162
|
| 'gemma3-q4k-dequant-f32a-online' // Gemma 3 Q4K dequant online path with F32 activations
|
|
163
|
+
| 'gemma3-q4k-dequant-f32w-f32a-online' // Gemma 3 Q4K path with F32 projection weights and F32 activations
|
|
163
164
|
| 'gemma3-q4k-dequant-f32a-nosubgroups' // Gemma 3 Q4K dequant path with no subgroup requirement
|
|
164
165
|
| 'gemma3-q4k-dequant-f32a' // Legacy alias for gemma3-q4k-dequant-f32a-nosubgroups
|
|
165
166
|
| 'lfm2-q4k-dequant-f32a-online' // LFM2 Q4K path with F32 activations and fast prefill
|
|
@@ -40,8 +40,8 @@ export const DEFAULT_SEGMENT_ALLOCATION_CONFIG = {
|
|
|
40
40
|
// =============================================================================
|
|
41
41
|
|
|
42
42
|
export const DEFAULT_EMULATED_STORAGE_CONFIG = {
|
|
43
|
-
vramBudgetBytes:
|
|
44
|
-
ramBudgetBytes:
|
|
43
|
+
vramBudgetBytes: 4 * GB,
|
|
44
|
+
ramBudgetBytes: 16 * GB,
|
|
45
45
|
};
|
|
46
46
|
|
|
47
47
|
// =============================================================================
|