@simulatte/doppler 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +25 -6
- package/package.json +25 -38
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +2 -2
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +9 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +21 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +4 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +2 -2
- package/src/converter/conversion-plan.js +11 -3
- package/src/converter/core.js +19 -8
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +34 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +40 -1
- package/src/formats/rdrr/classification.js +32 -0
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +48 -4
- package/src/gpu/kernels/matmul.d.ts +5 -0
- package/src/gpu/kernels/matmul.js +71 -2
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
- package/src/inference/pipelines/text/attention/projections.js +54 -13
- package/src/inference/pipelines/text/attention/record.js +16 -6
- package/src/inference/pipelines/text/attention/run.js +59 -6
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +46 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +19 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
- package/src/inference/pipelines/text/generator-steps.js +71 -26
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +353 -166
- package/src/inference/pipelines/text/init.d.ts +15 -0
- package/src/inference/pipelines/text/init.js +35 -10
- package/src/inference/pipelines/text/layer.js +38 -8
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +11 -9
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +130 -4
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +4 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +28 -7
- package/src/tooling/node-source-runtime.js +65 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/types/model.d.ts +5 -0
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/tools/doppler-cli.js +6 -1
- package/src/tooling/node-convert.d.ts +0 -54
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"id": "experiments/bench/gemma3-bench-q4k",
|
|
3
3
|
"name": "gemma3-bench-q4k",
|
|
4
|
-
"
|
|
4
|
+
"description": "Benchmark run for Gemma 3 1B Q4K — calibration-mode throughput measurement.",
|
|
5
|
+
"intent": "calibrate",
|
|
5
6
|
"stability": "experimental",
|
|
6
7
|
"owner": "doppler-core",
|
|
7
8
|
"createdAtUtc": "2026-02-25T00:00:00Z",
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "experiments/verify/lfm2-verify",
|
|
3
|
+
"name": "lfm2-verify",
|
|
4
|
+
"intent": "verify",
|
|
5
|
+
"stability": "experimental",
|
|
6
|
+
"owner": "doppler-core",
|
|
7
|
+
"createdAtUtc": "2026-03-16T00:00:00Z",
|
|
8
|
+
"extends": "modes/bench",
|
|
9
|
+
"model": "lfm2-5-1-2b-instruct-q4k-ehf16-af32",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"tooling": {
|
|
13
|
+
"intent": "verify"
|
|
14
|
+
},
|
|
15
|
+
"debug": {
|
|
16
|
+
"logLevel": {
|
|
17
|
+
"defaultLogLevel": "warn"
|
|
18
|
+
},
|
|
19
|
+
"trace": {
|
|
20
|
+
"enabled": false
|
|
21
|
+
},
|
|
22
|
+
"profiler": {
|
|
23
|
+
"enabled": false
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"inference": {
|
|
28
|
+
"prompt": {
|
|
29
|
+
"messages": [
|
|
30
|
+
{
|
|
31
|
+
"role": "user",
|
|
32
|
+
"content": "What color is the sky on a clear day?"
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
},
|
|
36
|
+
"batching": {
|
|
37
|
+
"maxTokens": 32
|
|
38
|
+
},
|
|
39
|
+
"sampling": {
|
|
40
|
+
"temperature": 0,
|
|
41
|
+
"topK": 1,
|
|
42
|
+
"topP": 1
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "experiments/verify/translategemma-verify",
|
|
3
|
+
"name": "translategemma-verify",
|
|
4
|
+
"intent": "verify",
|
|
5
|
+
"stability": "experimental",
|
|
6
|
+
"owner": "doppler-core",
|
|
7
|
+
"createdAtUtc": "2026-03-16T00:00:00Z",
|
|
8
|
+
"extends": "modes/bench",
|
|
9
|
+
"model": "translategemma-4b-it-q4k-ehf16-af32",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"tooling": {
|
|
13
|
+
"intent": "verify"
|
|
14
|
+
},
|
|
15
|
+
"debug": {
|
|
16
|
+
"logLevel": {
|
|
17
|
+
"defaultLogLevel": "warn"
|
|
18
|
+
},
|
|
19
|
+
"trace": {
|
|
20
|
+
"enabled": false
|
|
21
|
+
},
|
|
22
|
+
"profiler": {
|
|
23
|
+
"enabled": false
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"inference": {
|
|
28
|
+
"prompt": "Hello from Doppler.",
|
|
29
|
+
"batching": {
|
|
30
|
+
"maxTokens": 32
|
|
31
|
+
},
|
|
32
|
+
"sampling": {
|
|
33
|
+
"temperature": 0,
|
|
34
|
+
"topK": 1,
|
|
35
|
+
"topP": 1
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "model/qwen3-5-layer-probe",
|
|
3
|
+
"name": "qwen3-5-layer-probe",
|
|
4
|
+
"description": "Probe all 24 layer outputs in Qwen 3.5 to isolate where the hidden state distribution collapses.",
|
|
5
|
+
"intent": "investigate",
|
|
6
|
+
"stability": "canonical",
|
|
7
|
+
"owner": "doppler-core",
|
|
8
|
+
"createdAtUtc": "2026-03-13T00:00:00Z",
|
|
9
|
+
"extends": "modes/debug",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"inference": {
|
|
12
|
+
"prompt": "What color is the sky on a clear day? Answer in one word.",
|
|
13
|
+
"batching": {
|
|
14
|
+
"maxTokens": 1
|
|
15
|
+
},
|
|
16
|
+
"sampling": {
|
|
17
|
+
"temperature": 0
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"shared": {
|
|
21
|
+
"debug": {
|
|
22
|
+
"trace": {
|
|
23
|
+
"enabled": true,
|
|
24
|
+
"categories": ["attn", "ffn", "logits"],
|
|
25
|
+
"layers": null,
|
|
26
|
+
"maxDecodeSteps": 1
|
|
27
|
+
},
|
|
28
|
+
"probes": [
|
|
29
|
+
{
|
|
30
|
+
"id": "embed",
|
|
31
|
+
"stage": "embed_out",
|
|
32
|
+
"tokens": [-1],
|
|
33
|
+
"dims": [0, 1, 2, 3, 512, 513]
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
"id": "layer_out",
|
|
37
|
+
"stage": "layer_out",
|
|
38
|
+
"layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
|
|
39
|
+
"tokens": [-1],
|
|
40
|
+
"dims": [0, 1, 2, 3]
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"id": "logits",
|
|
44
|
+
"stage": "logits_final",
|
|
45
|
+
"tokens": [-1],
|
|
46
|
+
"dims": [271, 0, 1, 2, 3, 496, 138]
|
|
47
|
+
}
|
|
48
|
+
]
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "model/qwen3-5-linear-attn-debug",
|
|
3
|
+
"name": "qwen3-5-linear-attn-debug",
|
|
4
|
+
"description": "Probe linear attention intermediates in Qwen 3.5 layer 0 for comparison with HF reference.",
|
|
5
|
+
"intent": "investigate",
|
|
6
|
+
"stability": "canonical",
|
|
7
|
+
"owner": "doppler-core",
|
|
8
|
+
"createdAtUtc": "2026-03-13T00:00:00Z",
|
|
9
|
+
"extends": "modes/debug",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"inference": {
|
|
12
|
+
"prompt": "Hello",
|
|
13
|
+
"batching": {
|
|
14
|
+
"maxTokens": 1
|
|
15
|
+
},
|
|
16
|
+
"sampling": {
|
|
17
|
+
"temperature": 0
|
|
18
|
+
},
|
|
19
|
+
"chatTemplate": {
|
|
20
|
+
"enabled": false
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"shared": {
|
|
24
|
+
"debug": {
|
|
25
|
+
"trace": {
|
|
26
|
+
"enabled": true,
|
|
27
|
+
"categories": ["attn", "logits"],
|
|
28
|
+
"layers": null,
|
|
29
|
+
"maxDecodeSteps": 1
|
|
30
|
+
},
|
|
31
|
+
"probes": [
|
|
32
|
+
{
|
|
33
|
+
"id": "embed",
|
|
34
|
+
"stage": "embed_out",
|
|
35
|
+
"tokens": [-1],
|
|
36
|
+
"dims": [0, 1, 2, 3, 4, 5, 6, 7]
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"id": "qkv",
|
|
40
|
+
"stage": "linear_qkv_proj",
|
|
41
|
+
"layers": [0],
|
|
42
|
+
"tokens": [-1],
|
|
43
|
+
"dims": [0, 1, 2, 3, 4, 5, 6, 7]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"id": "z",
|
|
47
|
+
"stage": "linear_z_proj",
|
|
48
|
+
"layers": [0],
|
|
49
|
+
"tokens": [-1],
|
|
50
|
+
"dims": [0, 1, 2, 3, 4, 5, 6, 7]
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"id": "a",
|
|
54
|
+
"stage": "linear_a_proj",
|
|
55
|
+
"layers": [0],
|
|
56
|
+
"tokens": [-1],
|
|
57
|
+
"dims": [0, 1, 2, 3, 4, 5, 6, 7]
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
"id": "b",
|
|
61
|
+
"stage": "linear_b_proj",
|
|
62
|
+
"layers": [0],
|
|
63
|
+
"tokens": [-1],
|
|
64
|
+
"dims": [0, 1, 2, 3, 4, 5, 6, 7]
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"id": "core",
|
|
68
|
+
"stage": "linear_core_out",
|
|
69
|
+
"layers": [0],
|
|
70
|
+
"tokens": [-1],
|
|
71
|
+
"dims": [0, 1, 2, 3, 4, 5, 6, 7]
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
"id": "layer0",
|
|
75
|
+
"stage": "layer_out",
|
|
76
|
+
"layers": [0],
|
|
77
|
+
"tokens": [-1],
|
|
78
|
+
"dims": [0, 1, 2, 3, 4, 5, 6, 7]
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"id": "logits",
|
|
82
|
+
"stage": "logits_final",
|
|
83
|
+
"tokens": [-1],
|
|
84
|
+
"dims": [0, 1, 2, 3]
|
|
85
|
+
}
|
|
86
|
+
]
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "tiers/gemma4-16gb",
|
|
3
|
+
"name": "Gemma 4 — 16 GB tier (constrained)",
|
|
4
|
+
"description": "Gemma 4 MoE runtime tier for 16 GB GPU memory. Aggressively constrained: short context, minimal expert cache, hard budget enforcement. Fail-closed if budget is not met.",
|
|
5
|
+
"intent": "investigate",
|
|
6
|
+
"stability": "experimental",
|
|
7
|
+
"owner": "doppler-core",
|
|
8
|
+
"createdAtUtc": "2026-03-17T00:00:00Z",
|
|
9
|
+
"extends": "default",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"bufferPool": {
|
|
13
|
+
"budget": {
|
|
14
|
+
"maxTotalBytes": 13958643712,
|
|
15
|
+
"highWatermarkRatio": 0.85,
|
|
16
|
+
"emergencyTrimTargetRatio": 0.7,
|
|
17
|
+
"hardFailOnBudgetExceeded": true
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"loading": {
|
|
22
|
+
"expertCache": {
|
|
23
|
+
"defaultSizeBytes": 1073741824,
|
|
24
|
+
"maxBufferPercentage": 0.15,
|
|
25
|
+
"evictionHighWatermark": 0.8,
|
|
26
|
+
"emergencyTrimToRatio": 0.65
|
|
27
|
+
},
|
|
28
|
+
"prefetch": {
|
|
29
|
+
"enabled": true,
|
|
30
|
+
"layersAhead": 1,
|
|
31
|
+
"maxShards": 4
|
|
32
|
+
},
|
|
33
|
+
"memoryManagement": {
|
|
34
|
+
"flushIntervalLayers": 1,
|
|
35
|
+
"flushThresholdBytes": 134217728
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
"inference": {
|
|
39
|
+
"kvcache": {
|
|
40
|
+
"layout": "contiguous",
|
|
41
|
+
"maxSeqLen": 2048,
|
|
42
|
+
"kvDtype": "f16",
|
|
43
|
+
"pageSize": 128,
|
|
44
|
+
"tiering": {
|
|
45
|
+
"mode": "off"
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"moe": {
|
|
49
|
+
"routing": {
|
|
50
|
+
"routerDtype": "f32"
|
|
51
|
+
},
|
|
52
|
+
"cache": {
|
|
53
|
+
"dequantCacheMaxEntries": 2
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
"compute": {
|
|
57
|
+
"activationDtype": "f32"
|
|
58
|
+
},
|
|
59
|
+
"batching": {
|
|
60
|
+
"maxTokens": 512
|
|
61
|
+
},
|
|
62
|
+
"session": {
|
|
63
|
+
"kvcache": {
|
|
64
|
+
"kvDtype": "f16"
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "tiers/gemma4-24gb",
|
|
3
|
+
"name": "Gemma 4 — 24 GB tier",
|
|
4
|
+
"description": "Gemma 4 MoE runtime tier for 24 GB GPU memory. Moderate expert cache, contiguous KV, reduced context length.",
|
|
5
|
+
"intent": "investigate",
|
|
6
|
+
"stability": "experimental",
|
|
7
|
+
"owner": "doppler-core",
|
|
8
|
+
"createdAtUtc": "2026-03-17T00:00:00Z",
|
|
9
|
+
"extends": "default",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"bufferPool": {
|
|
13
|
+
"budget": {
|
|
14
|
+
"maxTotalBytes": 21474836480,
|
|
15
|
+
"highWatermarkRatio": 0.9,
|
|
16
|
+
"emergencyTrimTargetRatio": 0.75,
|
|
17
|
+
"hardFailOnBudgetExceeded": true
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"loading": {
|
|
22
|
+
"expertCache": {
|
|
23
|
+
"defaultSizeBytes": 3221225472,
|
|
24
|
+
"maxBufferPercentage": 0.2,
|
|
25
|
+
"evictionHighWatermark": 0.85,
|
|
26
|
+
"emergencyTrimToRatio": 0.7
|
|
27
|
+
},
|
|
28
|
+
"prefetch": {
|
|
29
|
+
"enabled": true,
|
|
30
|
+
"layersAhead": 1,
|
|
31
|
+
"maxShards": 8
|
|
32
|
+
},
|
|
33
|
+
"memoryManagement": {
|
|
34
|
+
"flushIntervalLayers": 2,
|
|
35
|
+
"flushThresholdBytes": 268435456
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
"inference": {
|
|
39
|
+
"kvcache": {
|
|
40
|
+
"layout": "contiguous",
|
|
41
|
+
"maxSeqLen": 4096,
|
|
42
|
+
"kvDtype": "f16",
|
|
43
|
+
"pageSize": 256,
|
|
44
|
+
"tiering": {
|
|
45
|
+
"mode": "off"
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"moe": {
|
|
49
|
+
"routing": {
|
|
50
|
+
"routerDtype": "f32"
|
|
51
|
+
},
|
|
52
|
+
"cache": {
|
|
53
|
+
"dequantCacheMaxEntries": 64
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
"compute": {
|
|
57
|
+
"activationDtype": "f32"
|
|
58
|
+
},
|
|
59
|
+
"session": {
|
|
60
|
+
"kvcache": {
|
|
61
|
+
"kvDtype": "f16"
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "tiers/gemma4-32gb",
|
|
3
|
+
"name": "Gemma 4 — 32 GB tier",
|
|
4
|
+
"description": "Gemma 4 MoE runtime tier for 32 GB GPU memory. Generous expert cache, contiguous KV, full-length context.",
|
|
5
|
+
"intent": "investigate",
|
|
6
|
+
"stability": "experimental",
|
|
7
|
+
"owner": "doppler-core",
|
|
8
|
+
"createdAtUtc": "2026-03-17T00:00:00Z",
|
|
9
|
+
"extends": "default",
|
|
10
|
+
"runtime": {
|
|
11
|
+
"shared": {
|
|
12
|
+
"bufferPool": {
|
|
13
|
+
"budget": {
|
|
14
|
+
"maxTotalBytes": 30064771072,
|
|
15
|
+
"highWatermarkRatio": 0.9,
|
|
16
|
+
"emergencyTrimTargetRatio": 0.75,
|
|
17
|
+
"hardFailOnBudgetExceeded": true
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"loading": {
|
|
22
|
+
"expertCache": {
|
|
23
|
+
"defaultSizeBytes": 6442450944,
|
|
24
|
+
"maxBufferPercentage": 0.25,
|
|
25
|
+
"evictionHighWatermark": 0.9,
|
|
26
|
+
"emergencyTrimToRatio": 0.75
|
|
27
|
+
},
|
|
28
|
+
"prefetch": {
|
|
29
|
+
"enabled": true,
|
|
30
|
+
"layersAhead": 2,
|
|
31
|
+
"maxShards": 16
|
|
32
|
+
},
|
|
33
|
+
"memoryManagement": {
|
|
34
|
+
"flushIntervalLayers": 4,
|
|
35
|
+
"flushThresholdBytes": 536870912
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
"inference": {
|
|
39
|
+
"kvcache": {
|
|
40
|
+
"layout": "contiguous",
|
|
41
|
+
"maxSeqLen": 8192,
|
|
42
|
+
"kvDtype": "f16",
|
|
43
|
+
"pageSize": 256,
|
|
44
|
+
"tiering": {
|
|
45
|
+
"mode": "off"
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"moe": {
|
|
49
|
+
"routing": {
|
|
50
|
+
"routerDtype": "f32"
|
|
51
|
+
},
|
|
52
|
+
"cache": {
|
|
53
|
+
"dequantCacheMaxEntries": 128
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
"compute": {
|
|
57
|
+
"activationDtype": "f32"
|
|
58
|
+
},
|
|
59
|
+
"session": {
|
|
60
|
+
"kvcache": {
|
|
61
|
+
"kvDtype": "f16"
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
package/src/config/runtime.js
CHANGED
|
@@ -58,4 +58,7 @@ function assertNoDeprecatedRuntimeKeys(overrides) {
|
|
|
58
58
|
if (inference?.sampling?.maxTokens !== undefined) {
|
|
59
59
|
throw new Error('sampling.maxTokens is removed; use inference.batching.maxTokens');
|
|
60
60
|
}
|
|
61
|
+
if (inference?.session?.maxNewTokens !== undefined) {
|
|
62
|
+
throw new Error('inference.session.maxNewTokens is not a supported runtime config key; use inference.batching.maxTokens');
|
|
63
|
+
}
|
|
61
64
|
}
|
|
@@ -131,6 +131,44 @@ export interface PipelineDebugConfigSchema {
|
|
|
131
131
|
/** Default pipeline debug configuration */
|
|
132
132
|
export declare const DEFAULT_PIPELINE_DEBUG_CONFIG: PipelineDebugConfigSchema;
|
|
133
133
|
|
|
134
|
+
/** Loader debug configuration (Q4K dequant and related probes). */
|
|
135
|
+
export interface LoaderDebugConfigSchema {
|
|
136
|
+
/** Enable loader debug behavior (default: false) */
|
|
137
|
+
enabled: boolean;
|
|
138
|
+
/** Force GPU dequant for Q4K tensors even when CPU fallback is eligible. */
|
|
139
|
+
forceGpuDequant: boolean;
|
|
140
|
+
/** Prefer CPU dequant for F32 output when eligible (default: false, GPU is preferred). */
|
|
141
|
+
preferCpuDequant: boolean;
|
|
142
|
+
/** Throw when CPU dequant fallback is taken. */
|
|
143
|
+
failOnCpuDequantPath: boolean;
|
|
144
|
+
/** Enable dtype-aware GPU-vs-CPU parity checks during Q4K dequant. */
|
|
145
|
+
runQ4KDequantParity: boolean;
|
|
146
|
+
/** Number of values to read back for parity checks. */
|
|
147
|
+
q4kDequantParitySamples: number;
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
/** Default loader debug configuration. */
|
|
151
|
+
export declare const DEFAULT_LOADER_DEBUG_CONFIG: LoaderDebugConfigSchema;
|
|
152
|
+
|
|
153
|
+
/** Matmul debug configuration (attention split/shape diagnostics). */
|
|
154
|
+
export interface MatmulDebugConfigSchema {
|
|
155
|
+
/** Enable matmul debug behavior (default: false) */
|
|
156
|
+
enabled: boolean;
|
|
157
|
+
/** Force split (non-fused) Q/K/V projection path for diagnostics. */
|
|
158
|
+
forceSplitQKV: boolean;
|
|
159
|
+
/** Validate B tensor layout/buffer bytes for attention projection roles. */
|
|
160
|
+
validateAttentionWeightBuffer: boolean;
|
|
161
|
+
/** Throw if validation fails due to a small B tensor. */
|
|
162
|
+
failOnSmallAttentionWeightBuffer: boolean;
|
|
163
|
+
/** Emit attention B-buffer diagnostics. */
|
|
164
|
+
logAttentionWeightBuffer: boolean;
|
|
165
|
+
/** Log first-8 projection output values for layer 0 decode (diagnostic). */
|
|
166
|
+
logProjectionValues: boolean;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
/** Default matmul debug configuration. */
|
|
170
|
+
export declare const DEFAULT_MATMUL_DEBUG_CONFIG: MatmulDebugConfigSchema;
|
|
171
|
+
|
|
134
172
|
/**
|
|
135
173
|
* Profiler configuration.
|
|
136
174
|
*/
|
|
@@ -241,6 +279,8 @@ export interface DebugConfigSchema {
|
|
|
241
279
|
logLevel: LogLevelConfigSchema;
|
|
242
280
|
trace: TraceConfigSchema;
|
|
243
281
|
pipeline: PipelineDebugConfigSchema;
|
|
282
|
+
loader: LoaderDebugConfigSchema;
|
|
283
|
+
matmul: MatmulDebugConfigSchema;
|
|
244
284
|
probes: ProbeConfigSchema[];
|
|
245
285
|
profiler: ProfilerConfigSchema;
|
|
246
286
|
perfGuards: PerfGuardsConfigSchema;
|
|
@@ -38,6 +38,32 @@ export const DEFAULT_TRACE_CONFIG = {
|
|
|
38
38
|
file: null,
|
|
39
39
|
};
|
|
40
40
|
|
|
41
|
+
// =============================================================================
|
|
42
|
+
// Loader Debug Config
|
|
43
|
+
// =============================================================================
|
|
44
|
+
|
|
45
|
+
export const DEFAULT_LOADER_DEBUG_CONFIG = {
|
|
46
|
+
enabled: false,
|
|
47
|
+
forceGpuDequant: false,
|
|
48
|
+
preferCpuDequant: false,
|
|
49
|
+
failOnCpuDequantPath: false,
|
|
50
|
+
runQ4KDequantParity: false,
|
|
51
|
+
q4kDequantParitySamples: 256,
|
|
52
|
+
};
|
|
53
|
+
|
|
54
|
+
// =============================================================================
|
|
55
|
+
// Matmul Debug Config
|
|
56
|
+
// =============================================================================
|
|
57
|
+
|
|
58
|
+
export const DEFAULT_MATMUL_DEBUG_CONFIG = {
|
|
59
|
+
enabled: false,
|
|
60
|
+
forceSplitQKV: false,
|
|
61
|
+
validateAttentionWeightBuffer: false,
|
|
62
|
+
failOnSmallAttentionWeightBuffer: false,
|
|
63
|
+
logAttentionWeightBuffer: false,
|
|
64
|
+
logProjectionValues: false,
|
|
65
|
+
};
|
|
66
|
+
|
|
41
67
|
// =============================================================================
|
|
42
68
|
// Kernel Trace Config (kernel-trace.js anomaly detection)
|
|
43
69
|
// =============================================================================
|
|
@@ -100,6 +126,8 @@ export const DEFAULT_DEBUG_CONFIG = {
|
|
|
100
126
|
logLevel: DEFAULT_LOG_LEVEL_CONFIG,
|
|
101
127
|
trace: DEFAULT_TRACE_CONFIG,
|
|
102
128
|
pipeline: DEFAULT_PIPELINE_DEBUG_CONFIG,
|
|
129
|
+
loader: DEFAULT_LOADER_DEBUG_CONFIG,
|
|
130
|
+
matmul: DEFAULT_MATMUL_DEBUG_CONFIG,
|
|
103
131
|
probes: [],
|
|
104
132
|
profiler: DEFAULT_PROFILER_CONFIG,
|
|
105
133
|
perfGuards: DEFAULT_PERF_GUARDS_CONFIG,
|
|
@@ -217,6 +217,8 @@ export {
|
|
|
217
217
|
DEFAULT_LOG_HISTORY_CONFIG,
|
|
218
218
|
DEFAULT_LOG_LEVEL_CONFIG,
|
|
219
219
|
DEFAULT_TRACE_CONFIG,
|
|
220
|
+
DEFAULT_LOADER_DEBUG_CONFIG,
|
|
221
|
+
DEFAULT_MATMUL_DEBUG_CONFIG,
|
|
220
222
|
DEFAULT_KERNEL_TRACE_CONFIG,
|
|
221
223
|
DEFAULT_PIPELINE_DEBUG_CONFIG,
|
|
222
224
|
DEFAULT_PROFILER_CONFIG,
|
|
@@ -93,7 +93,7 @@ export const DEFAULT_TOKENIZER_DEFAULTS = {
|
|
|
93
93
|
// =============================================================================
|
|
94
94
|
|
|
95
95
|
export const DEFAULT_CHAT_TEMPLATE_CONFIG = {
|
|
96
|
-
enabled:
|
|
96
|
+
enabled: undefined,
|
|
97
97
|
};
|
|
98
98
|
|
|
99
99
|
export const DEFAULT_KERNEL_PATH_POLICY = {
|
|
@@ -160,6 +160,7 @@ export type BuiltinKernelPathId =
|
|
|
160
160
|
| 'gemma3-f16-fused-f32a-online-streamingprefill' // Gemma 3 F16 fused FFN online path with streaming prefill attention
|
|
161
161
|
| 'gemma3-q4k-dequant-f16a-online' // Gemma 3 Q4K dequant online path (F16 activations)
|
|
162
162
|
| 'gemma3-q4k-dequant-f32a-online' // Gemma 3 Q4K dequant online path with F32 activations
|
|
163
|
+
| 'gemma3-q4k-dequant-f32w-f32a-online' // Gemma 3 Q4K path with F32 projection weights and F32 activations
|
|
163
164
|
| 'gemma3-q4k-dequant-f32a-nosubgroups' // Gemma 3 Q4K dequant path with no subgroup requirement
|
|
164
165
|
| 'gemma3-q4k-dequant-f32a' // Legacy alias for gemma3-q4k-dequant-f32a-nosubgroups
|
|
165
166
|
| 'lfm2-q4k-dequant-f32a-online' // LFM2 Q4K path with F32 activations and fast prefill
|
|
@@ -163,7 +163,7 @@ export interface ArchitectureSchema {
|
|
|
163
163
|
* Use `null` to indicate "not applicable" (e.g., no softcapping).
|
|
164
164
|
*/
|
|
165
165
|
export interface ManifestAttentionSchema {
|
|
166
|
-
/** Query pre-attention scalar
|
|
166
|
+
/** Query pre-attention scalar: attnScale = 1/sqrt(scalar). Standard = headDim. */
|
|
167
167
|
queryPreAttnScalar: number;
|
|
168
168
|
/** Attention logit softcapping (Gemma 2: 50, null = disabled) */
|
|
169
169
|
attnLogitSoftcapping: number | null;
|
|
@@ -40,7 +40,7 @@ export const DEFAULT_MANIFEST_INFERENCE = {
|
|
|
40
40
|
schema: null,
|
|
41
41
|
presetId: null,
|
|
42
42
|
attention: {
|
|
43
|
-
queryPreAttnScalar:
|
|
43
|
+
queryPreAttnScalar: 64, // headDim for standard 64-dim heads; attnScale = 1/sqrt(scalar)
|
|
44
44
|
attnLogitSoftcapping: null, // No softcapping (null = disabled)
|
|
45
45
|
slidingWindow: null, // Full attention (null = no sliding window)
|
|
46
46
|
queryKeyNorm: false,
|
|
@@ -40,8 +40,8 @@ export const DEFAULT_SEGMENT_ALLOCATION_CONFIG = {
|
|
|
40
40
|
// =============================================================================
|
|
41
41
|
|
|
42
42
|
export const DEFAULT_EMULATED_STORAGE_CONFIG = {
|
|
43
|
-
vramBudgetBytes:
|
|
44
|
-
ramBudgetBytes:
|
|
43
|
+
vramBudgetBytes: 4 * GB,
|
|
44
|
+
ramBudgetBytes: 16 * GB,
|
|
45
45
|
};
|
|
46
46
|
|
|
47
47
|
// =============================================================================
|
|
@@ -16,7 +16,7 @@ export const DEFAULT_QUOTA_CONFIG = {
|
|
|
16
16
|
|
|
17
17
|
export const DEFAULT_VRAM_ESTIMATION_CONFIG = {
|
|
18
18
|
unifiedMemoryRatio: 0.5, // 50% of system RAM
|
|
19
|
-
fallbackVramBytes:
|
|
19
|
+
fallbackVramBytes: 4 * GB,
|
|
20
20
|
lowVramHeadroomBytes: 500 * MB,
|
|
21
21
|
};
|
|
22
22
|
|
|
@@ -35,7 +35,7 @@ export const DEFAULT_STORAGE_ALIGNMENT_CONFIG = {
|
|
|
35
35
|
export const DEFAULT_STORAGE_BACKEND_CONFIG = {
|
|
36
36
|
backend: 'auto', // auto | opfs | indexeddb | memory
|
|
37
37
|
opfs: {
|
|
38
|
-
useSyncAccessHandle:
|
|
38
|
+
useSyncAccessHandle: false,
|
|
39
39
|
maxConcurrentHandles: 2,
|
|
40
40
|
},
|
|
41
41
|
indexeddb: {
|