@simulatte/doppler 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -1
- package/README.md +25 -6
- package/package.json +5 -3
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +16 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/loader.js +6 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +7 -0
- package/src/config/presets/models/gemma3.json +2 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +1 -1
- package/src/converter/conversion-plan.js +1 -1
- package/src/converter/core.js +17 -8
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +15 -0
- package/src/distribution/shard-delivery.js +34 -0
- package/src/formats/rdrr/classification.js +32 -0
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +1 -0
- package/src/gpu/kernels/matmul.d.ts +3 -0
- package/src/gpu/kernels/matmul.js +70 -1
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
- package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
- package/src/inference/pipelines/text/attention/projections.js +13 -2
- package/src/inference/pipelines/text/attention/record.js +1 -0
- package/src/inference/pipelines/text/attention/run.js +9 -0
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +32 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +14 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
- package/src/inference/pipelines/text/generator-steps.js +46 -29
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +320 -166
- package/src/inference/pipelines/text/init.d.ts +2 -0
- package/src/inference/pipelines/text/init.js +19 -5
- package/src/inference/pipelines/text/layer.js +37 -8
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +9 -7
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +124 -3
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +2 -0
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/tooling/node-converter.js +25 -7
- package/src/tooling/node-source-runtime.js +29 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/src/tooling/node-convert.d.ts +0 -54
package/CHANGELOG.md
CHANGED
|
@@ -6,22 +6,35 @@ This changelog is package-facing and release-oriented. Entries before `0.1.7`
|
|
|
6
6
|
were retrofitted from package version history, release commits, and release
|
|
7
7
|
docs so the `0.1.x` line has one conventional npm-visible history surface.
|
|
8
8
|
|
|
9
|
-
## [0.1.8] - 2026-03-
|
|
9
|
+
## [0.1.8] - 2026-03-18
|
|
10
10
|
|
|
11
11
|
### Changed
|
|
12
12
|
|
|
13
13
|
- Simplified demo to show only verified Q4K models (Gemma 3 270M, Gemma 3 1B).
|
|
14
14
|
Hidden Translate, Diffusion, and Embedding tabs until models are ready.
|
|
15
|
+
- Split demo monolith (6,680 lines) into focused modules: core, generation,
|
|
16
|
+
storage, translate, diagnostics, routing, utils.
|
|
15
17
|
- Trimmed hosted HF registry and quickstart registry to the two verified models.
|
|
16
18
|
- Aligned catalog, HF registry, and quickstart registry to the canonical
|
|
17
19
|
external support registry as single source of truth for HF revisions.
|
|
20
|
+
- Renamed all `.mjs` tool scripts to `.js` to match `"type": "module"` convention.
|
|
21
|
+
- Switched WebGPU optional dependency from `@simulatte/webgpu` to `webgpu ^0.3.8`.
|
|
22
|
+
- Pruned unused `verify:*` npm scripts for models no longer in the active set.
|
|
23
|
+
- Updated release-claim policy with newly verified models (LFM2, Qwen 3.5,
|
|
24
|
+
TranslateGemma variants).
|
|
18
25
|
|
|
19
26
|
### Fixed
|
|
20
27
|
|
|
21
28
|
- Fixed Qwen 3.5 conversion configs using wrong model preset (`qwen3` instead
|
|
22
29
|
of `qwen3_5`), which caused support matrix check failures.
|
|
30
|
+
- Fixed Qwen mRoPE conflation: `ropeInterleaved` was incorrectly set from
|
|
31
|
+
`mropeInterleaved`, forcing adjacent-pair RoPE rotation on Qwen models.
|
|
23
32
|
- Fixed catalog lifecycle metadata inconsistencies: corrected `local`, `hf`,
|
|
24
33
|
`curated`, and `demo` fields to match actual artifact availability.
|
|
34
|
+
- Fixed GPU-dependent unit tests failing in non-GPU environments by adding
|
|
35
|
+
proper GPU readiness probes with clear skip reasons.
|
|
36
|
+
- Fixed kernel-ref digest registry drift (222 vs 224 entries).
|
|
37
|
+
- Fixed stale vendor benchmark fixture hashes after compare-engines config update.
|
|
25
38
|
- Removed failing and unverified models from demo visibility (TranslateGemma 4B,
|
|
26
39
|
EmbeddingGemma 300M with broken HF manifest, Qwen 3.5 0.8B/2B, F16 variant).
|
|
27
40
|
|
package/README.md
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
Inference and training on raw WebGPU. Pure JS + WGSL.
|
|
4
4
|
|
|
5
|
+
**[Try the live demo](https://d4da.com)** | **[npm](https://www.npmjs.com/package/@simulatte/doppler)** | **[docs](https://github.com/clocksmith/doppler/blob/main/docs/INDEX.md)**
|
|
6
|
+
|
|
5
7
|

|
|
6
8
|
|
|
7
9
|
## Quick start
|
|
@@ -28,8 +30,6 @@ Registry IDs resolve to hosted RDRR artifacts from `Clocksmith/rdrr` by default.
|
|
|
28
30
|
npm install @simulatte/doppler
|
|
29
31
|
```
|
|
30
32
|
|
|
31
|
-
**[Live Demo](https://d4da.com)** · **[npm](https://www.npmjs.com/package/@simulatte/doppler)** · **[docs](https://github.com/clocksmith/doppler/blob/main/docs/INDEX.md)** · **[Project site](https://simulatte.world)**
|
|
32
|
-
|
|
33
33
|
## Why Doppler
|
|
34
34
|
|
|
35
35
|
**JS → WGSL → WebGPU.** Direct JavaScript orchestration into native WebGPU kernels, avoiding ONNX runtimes, WASM blobs, and bridge layers.
|
|
@@ -46,6 +46,28 @@ Snapshot artifacts:
|
|
|
46
46
|
- [g3-1b-p064-d064-t0-k1.compare.json](https://github.com/clocksmith/doppler/blob/main/benchmarks/vendors/fixtures/g3-1b-p064-d064-t0-k1.compare.json)
|
|
47
47
|
- [lfm2-5-1-2b-p064-d064-t0-k1.compare.json](https://github.com/clocksmith/doppler/blob/main/benchmarks/vendors/fixtures/lfm2-5-1-2b-p064-d064-t0-k1.compare.json)
|
|
48
48
|
|
|
49
|
+
## Supported models
|
|
50
|
+
|
|
51
|
+
All models below are verified with deterministic greedy decoding on WebGPU hardware.
|
|
52
|
+
Registry IDs resolve to hosted RDRR artifacts automatically.
|
|
53
|
+
|
|
54
|
+
| Model | Registry ID | Quant | Params |
|
|
55
|
+
| --- | --- | --- | --- |
|
|
56
|
+
| Gemma 3 270M IT | `gemma3-270m` | Q4K | 270M |
|
|
57
|
+
| Gemma 3 1B IT | `gemma3-1b` | Q4K | 1B |
|
|
58
|
+
| Gemma 3 1B IT (F16) | `gemma-3-1b-it-f16-af32` | F16 | 1B |
|
|
59
|
+
| TranslateGemma 4B IT | `translategemma-4b-it-q4k-ehf16-af32` | Q4K | 4B |
|
|
60
|
+
| TranslateGemma 4B 1B EN-ES | `translategemma-4b-1b-enes-q4k-ehf16-af32` | Q4K | 1B |
|
|
61
|
+
| EmbeddingGemma 300M | `google-embeddinggemma-300m-q4k-ehf16-af32` | Q4K | 300M |
|
|
62
|
+
| Qwen 3.5 0.8B | `qwen-3-5-0-8b-q4k-ehaf16` | Q4K | 0.8B |
|
|
63
|
+
| Qwen 3.5 2B | `qwen-3-5-2b-q4k-ehaf16` | Q4K | 2B |
|
|
64
|
+
| LFM2.5 1.2B Instruct | `lfm2-5-1-2b-instruct-q4k-ehf16-af32` | Q4K | 1.2B |
|
|
65
|
+
|
|
66
|
+
Additional model families (Llama 3, DeepSeek, Gemma 4 MoE, Mixtral, and others) have conversion
|
|
67
|
+
configs ready but are not yet cataloged. See the full
|
|
68
|
+
[model support matrix](https://github.com/clocksmith/doppler/blob/main/docs/model-support-matrix.md)
|
|
69
|
+
for details.
|
|
70
|
+
|
|
49
71
|
## Under the hood
|
|
50
72
|
|
|
51
73
|
- Sharded weight loading via OPFS moves multi-GB weights into VRAM without blocking the main thread.
|
|
@@ -85,10 +107,7 @@ for await (const token of doppler('Hello', { model: 'gemma3-270m' })) {
|
|
|
85
107
|
- First-run workflow: [docs/getting-started.md](https://github.com/clocksmith/doppler/blob/main/docs/getting-started.md)
|
|
86
108
|
- Runtime config contract: [docs/config.md](https://github.com/clocksmith/doppler/blob/main/docs/config.md)
|
|
87
109
|
- Architecture: [docs/architecture.md](https://github.com/clocksmith/doppler/blob/main/docs/architecture.md)
|
|
88
|
-
-
|
|
89
|
-
|
|
90
|
-
Current model support is generated from the catalog and conversion registry.
|
|
91
|
-
See [docs/model-support-matrix.md](https://github.com/clocksmith/doppler/blob/main/docs/model-support-matrix.md) for the canonical verified, failing, and unverified status table.
|
|
110
|
+
- Model support matrix: [docs/model-support-matrix.md](https://github.com/clocksmith/doppler/blob/main/docs/model-support-matrix.md)
|
|
92
111
|
|
|
93
112
|
## Environment requirements
|
|
94
113
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@simulatte/doppler",
|
|
3
|
-
"version": "0.1.8",
|
|
3
|
+
"version": "0.1.9",
|
|
4
4
|
"description": "Browser-native WebGPU inference engine for local intent and inference loops",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "src/index.d.ts",
|
|
@@ -75,6 +75,7 @@
|
|
|
75
75
|
"external:rdrr:index": "node tools/sync-external-rdrr-index.js",
|
|
76
76
|
"external:rdrr:index:check": "node tools/sync-external-rdrr-index.js --check",
|
|
77
77
|
"external:support:sync": "node tools/sync-external-support-registry.js",
|
|
78
|
+
"external:support:promote": "node tools/sync-external-support-registry.js --source-support-file models/catalog.json",
|
|
78
79
|
"external:support:check": "node tools/sync-external-support-registry.js --check",
|
|
79
80
|
"catalog:sync:external": "node tools/sync-catalog-from-external-support.js",
|
|
80
81
|
"catalog:sync:external:check": "node tools/sync-catalog-from-external-support.js --check",
|
|
@@ -155,12 +156,13 @@
|
|
|
155
156
|
"tools/convert-safetensors-node.js"
|
|
156
157
|
],
|
|
157
158
|
"devDependencies": {
|
|
158
|
-
"@huggingface/transformers": "
|
|
159
|
+
"@huggingface/transformers": "4.0.0-next.8",
|
|
159
160
|
"jest": "^30.2.0",
|
|
160
161
|
"onnxruntime-web": "^1.24.1",
|
|
161
162
|
"playwright": "^1.58.2"
|
|
162
163
|
},
|
|
163
164
|
"optionalDependencies": {
|
|
164
|
-
"@simulatte/webgpu": "0.
|
|
165
|
+
"@simulatte/webgpu": "0.x.x",
|
|
166
|
+
"webgpu": "^0.3.8"
|
|
165
167
|
}
|
|
166
168
|
}
|
|
@@ -165,6 +165,12 @@ function createModelHandle(pipeline, resolved) {
|
|
|
165
165
|
prefillKV(prompt, options = {}) {
|
|
166
166
|
return pipeline.prefillKVOnly(prompt, options);
|
|
167
167
|
},
|
|
168
|
+
prefillWithLogits(prompt, options = {}) {
|
|
169
|
+
return pipeline.prefillWithLogits(prompt, options);
|
|
170
|
+
},
|
|
171
|
+
decodeStepLogits(currentIds, options = {}) {
|
|
172
|
+
return pipeline.decodeStepLogits(currentIds, options);
|
|
173
|
+
},
|
|
168
174
|
generateWithPrefixKV(prefix, prompt, options = {}) {
|
|
169
175
|
return pipeline.generateWithPrefixKV(prefix, prompt, options);
|
|
170
176
|
},
|
|
@@ -2,6 +2,7 @@ import type { RDRRManifest } from '../formats/rdrr/index.js';
|
|
|
2
2
|
import type { GenerateOptions, KVCacheSnapshot } from '../generation/index.js';
|
|
3
3
|
import type { ChatMessage } from '../inference/pipelines/text/chat-format.js';
|
|
4
4
|
import type { LoRAManifest } from '../adapters/lora-loader.js';
|
|
5
|
+
import type { LogitsStepResult, PrefillResult } from '../inference/pipelines/text/types.d.ts';
|
|
5
6
|
|
|
6
7
|
export interface DopplerLoadProgress {
|
|
7
8
|
phase: 'resolve' | 'manifest' | 'load' | 'ready';
|
|
@@ -43,6 +44,8 @@ export interface DopplerModel {
|
|
|
43
44
|
readonly deviceInfo: Record<string, unknown> | null;
|
|
44
45
|
readonly advanced: {
|
|
45
46
|
prefillKV(prompt: string, options?: GenerateOptions): Promise<KVCacheSnapshot>;
|
|
47
|
+
prefillWithLogits(prompt: string | ChatMessage[] | { messages: ChatMessage[] }, options?: GenerateOptions): Promise<PrefillResult>;
|
|
48
|
+
decodeStepLogits(currentIds: number[], options?: GenerateOptions): Promise<LogitsStepResult>;
|
|
46
49
|
generateWithPrefixKV(
|
|
47
50
|
prefix: KVCacheSnapshot,
|
|
48
51
|
prompt: string,
|
|
@@ -199,6 +199,12 @@ function createModelHandle(pipeline, resolved) {
|
|
|
199
199
|
prefillKV(prompt, options = {}) {
|
|
200
200
|
return pipeline.prefillKVOnly(prompt, options);
|
|
201
201
|
},
|
|
202
|
+
prefillWithLogits(prompt, options = {}) {
|
|
203
|
+
return pipeline.prefillWithLogits(prompt, options);
|
|
204
|
+
},
|
|
205
|
+
decodeStepLogits(currentIds, options = {}) {
|
|
206
|
+
return pipeline.decodeStepLogits(currentIds, options);
|
|
207
|
+
},
|
|
202
208
|
generateWithPrefixKV(prefix, prompt, options = {}) {
|
|
203
209
|
return pipeline.generateWithPrefixKV(prefix, prompt, options);
|
|
204
210
|
},
|
|
@@ -272,6 +278,9 @@ export function doppler(prompt, options) {
|
|
|
272
278
|
|
|
273
279
|
doppler.load = load;
|
|
274
280
|
doppler.text = async function text(prompt, options) {
|
|
281
|
+
if (!options || typeof options !== 'object' || options.model == null) {
|
|
282
|
+
throw new Error('doppler.text() requires options.model.');
|
|
283
|
+
}
|
|
275
284
|
assertNoLoadAffectingOptions('doppler.text()', options);
|
|
276
285
|
return collectText(doppler(prompt, options));
|
|
277
286
|
};
|
|
@@ -299,14 +308,14 @@ doppler.evict = async function evict(model) {
|
|
|
299
308
|
if (!cached) {
|
|
300
309
|
return false;
|
|
301
310
|
}
|
|
302
|
-
convenienceModelCache.delete(resolved.modelId);
|
|
303
311
|
await cached.unload();
|
|
312
|
+
convenienceModelCache.delete(resolved.modelId);
|
|
304
313
|
return true;
|
|
305
314
|
};
|
|
306
315
|
doppler.evictAll = async function evictAll() {
|
|
307
316
|
const cached = [...convenienceModelCache.values()];
|
|
308
317
|
convenienceModelCache.clear();
|
|
309
|
-
await Promise.all(cached.map((entry) => entry.unload()));
|
|
318
|
+
await Promise.allSettled(cached.map((entry) => entry.unload()));
|
|
310
319
|
};
|
|
311
320
|
doppler.listModels = async function listModels() {
|
|
312
321
|
const models = await listQuickstartModels();
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { getCdnBasePath } from '../storage/download-types.js';
|
|
2
|
+
import { buildHfResolveBaseUrl } from '../utils/hf-resolve-url.js';
|
|
2
3
|
import { loadJson } from '../utils/load-json.js';
|
|
3
4
|
|
|
4
5
|
let registryPromise = null;
|
|
@@ -80,9 +81,6 @@ export function buildQuickstartModelBaseUrl(entry, options = {}) {
|
|
|
80
81
|
}
|
|
81
82
|
const cdnBasePath = typeof options.cdnBasePath === 'string' && options.cdnBasePath.length > 0
|
|
82
83
|
? options.cdnBasePath
|
|
83
|
-
:
|
|
84
|
-
|
|
85
|
-
const base = cdnBasePath.replace(/\/$/, '');
|
|
86
|
-
const path = entry.hf.path.replace(/^\/+/, '');
|
|
87
|
-
return `${base}/${entry.hf.repoId}/resolve/${revision}/${path}`;
|
|
84
|
+
: getCdnBasePath();
|
|
85
|
+
return buildHfResolveBaseUrl(entry.hf, { cdnBasePath });
|
|
88
86
|
}
|
|
@@ -19,6 +19,22 @@
|
|
|
19
19
|
"revision": "ca6f0dbdf3882d3893a65cf48f2bb6f1520df162",
|
|
20
20
|
"path": "models/gemma-3-270m-it-q4k-ehf16-af32"
|
|
21
21
|
}
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"modelId": "google-embeddinggemma-300m-q4k-ehf16-af32",
|
|
25
|
+
"aliases": [
|
|
26
|
+
"embeddinggemma-300m",
|
|
27
|
+
"google/embeddinggemma-300m",
|
|
28
|
+
"google-embeddinggemma-300m-wq4k-ef16"
|
|
29
|
+
],
|
|
30
|
+
"modes": [
|
|
31
|
+
"embedding"
|
|
32
|
+
],
|
|
33
|
+
"hf": {
|
|
34
|
+
"repoId": "Clocksmith/rdrr",
|
|
35
|
+
"revision": "7e79c466d54455bd370c81685956ea9abae0fd30",
|
|
36
|
+
"path": "models/google-embeddinggemma-300m-q4k-ehf16-af32"
|
|
37
|
+
}
|
|
22
38
|
}
|
|
23
39
|
]
|
|
24
40
|
}
|
|
@@ -59,8 +59,8 @@ export const KERNEL_REF_CONTENT_DIGESTS = Object.freeze({
|
|
|
59
59
|
"cross_entropy_loss.wgsl#main": "5a48087bdec94184432c90ce5b345e1eadbdfcb13b9793ecee8052bc7392239c",
|
|
60
60
|
"depthwise_conv2d_f16.wgsl#main": "f7f093a7e6623ed17a675bac729149e94718aece916416966eaf03c1d6939f2a",
|
|
61
61
|
"depthwise_conv2d.wgsl#main": "cf14cb40d282ad4d4fab160109b97eaeaf12aab62579b73324ac485ac75155b0",
|
|
62
|
-
"dequant_f16_out_vec4.wgsl#main_vec4": "
|
|
63
|
-
"dequant_f16_out.wgsl#main": "
|
|
62
|
+
"dequant_f16_out_vec4.wgsl#main_vec4": "ff729cc220ba5425e17c4c537a9993f25b6541046b6c2553d2a43a8b40ed2ce9",
|
|
63
|
+
"dequant_f16_out.wgsl#main": "caed21e420cbace78d3203548962a5ec3fc36980f153ae775f6a91a31af97d3a",
|
|
64
64
|
"dequant_f16_rowwise.wgsl#main": "f5bf7cef950b52d65cee6121dbaa176244d3221045b3b6386b3be47f23ce17dc",
|
|
65
65
|
"dequant_f32_rowwise.wgsl#main": "e73606e1b47e1191203a210bececa8a597bcab8bcc535146718afa6a021cab0d",
|
|
66
66
|
"dequant_mxfp4_expert_f16.wgsl#main_expert": "96af52551ac40e1b86121a528a3ffaba835c5d0419e06407fed80353d46b17e1",
|
|
@@ -69,10 +69,10 @@ export const KERNEL_REF_CONTENT_DIGESTS = Object.freeze({
|
|
|
69
69
|
"dequant_mxfp4.wgsl#main": "885a5f752b684c6ca0bb10e3a1846a396eef14d2158e8c8ad31bd1dd4c74b9ef",
|
|
70
70
|
"dequant_q6k.wgsl#main": "be0aed027932d8b7dd1e92d0090ced39e4df8be724acf290f52db0004be9a35e",
|
|
71
71
|
"dequant_q8_0.wgsl#main": "ff5f800da963b0502a9ffab723cbcac0bbb5eb9a02898afc2aba2db215a58da7",
|
|
72
|
-
"dequant_shared_vec4.wgsl#main_vec4": "
|
|
73
|
-
"dequant_shared.wgsl#main": "
|
|
74
|
-
"dequant_subgroup.wgsl#main": "
|
|
75
|
-
"dequant_subgroup.wgsl#main_vec4": "
|
|
72
|
+
"dequant_shared_vec4.wgsl#main_vec4": "24820dae36f6669a33f22b428df03791d9c700944c5ae33bd8c88e8cbeffd103",
|
|
73
|
+
"dequant_shared.wgsl#main": "e21284b5b70d4ac88d7c151760e451c2006705f1ea617b3db7f89994af4cc7df",
|
|
74
|
+
"dequant_subgroup.wgsl#main": "cbc2d86a5a2234b4c1691d5df02279263be7a66a1d4a2ad4aec1845a26baa9c9",
|
|
75
|
+
"dequant_subgroup.wgsl#main_vec4": "9e044bd0f44e73872dd8d8aa467e802c5471de86a2044de2cf8efc726e5a1182",
|
|
76
76
|
"energy_eval_f16.wgsl#main": "09223ae193593f3555866a3acfe76ca35442ef4f3967cae376bdcc211f3054b3",
|
|
77
77
|
"energy_eval.wgsl#main": "e10d9572397ebece5275aecd907cba5970f6a5c3744dd8b982677efb8982bdd2",
|
|
78
78
|
"energy_quintel_grad_f16.wgsl#main": "eb87ed8592b46b0a4d866c245b664cadb2bca016f72419e763402a6a721c4951",
|
|
@@ -142,9 +142,9 @@ export const KERNEL_REF_CONTENT_DIGESTS = Object.freeze({
|
|
|
142
142
|
"matmul_gemv_subgroup_f16a.wgsl#main_multicol": "c8e86ecbbefa27a3b7366af676d89a992c2e951329cdf19abb57b9c90144379e",
|
|
143
143
|
"matmul_gemv_subgroup_f16a.wgsl#main_vec4": "f227a403cdf9717dd68224c9ea55708ffe14c618d8146f5d48b42af0f253df29",
|
|
144
144
|
"matmul_gemv_subgroup_f16a.wgsl#main_vec4_cols8": "9e7aba97a6cf199b3f574166e295ea051ebd59e308b5f6f2ce5a4de2d04963ce",
|
|
145
|
-
"matmul_gemv_subgroup.wgsl#main": "
|
|
146
|
-
"matmul_gemv_subgroup.wgsl#main_multicol": "
|
|
147
|
-
"matmul_gemv_subgroup.wgsl#main_vec4": "
|
|
145
|
+
"matmul_gemv_subgroup.wgsl#main": "ac84b6dc88fe077dc885d8547e55526bec2f792074dd8746f907ce4a7c342028",
|
|
146
|
+
"matmul_gemv_subgroup.wgsl#main_multicol": "6631ed8936b6316499e1e1493915dc02a2e137d4f4d2650b62ce63e8805067f1",
|
|
147
|
+
"matmul_gemv_subgroup.wgsl#main_vec4": "de04e5670494401dd975915e77a603e07144aa1c928c47270afe7a806428cbfd",
|
|
148
148
|
"matmul_gemv.wgsl#main": "dc892efc87edc6d5ddaf191b86c1cc41a603352a332023aa0b1fe55d166673d0",
|
|
149
149
|
"modulate_f16.wgsl#main": "44a98cda1cc7a3575788f865173b9890be792c94e852ac8311b6b8ffbdc1438d",
|
|
150
150
|
"modulate.wgsl#main": "dfe88a35b94752573199c16b3d8aecd4e8e7da57dc88d7b342aa61e0122e71ec",
|
|
@@ -182,18 +182,18 @@ export const KERNEL_REF_CONTENT_DIGESTS = Object.freeze({
|
|
|
182
182
|
"rope.wgsl#rope_ntk_scaled": "818f89865a3d1d6f2d49f671ac882d0fde9709702160a1ae8d9a8ef113afb511",
|
|
183
183
|
"rope.wgsl#rope_qk": "3d773c8b8c400142edc8a4111afb04a2bf75bdb109b2d41cbe5afdb72a959772",
|
|
184
184
|
"rope.wgsl#rope_yarn": "cb00e1cf87fac198dcf0fb0d4e2d5f6f99d2fed6dff0a089a96bb459917851d2",
|
|
185
|
-
"sample_f16.wgsl#argmax": "
|
|
186
|
-
"sample_f16.wgsl#argmax_reduce": "
|
|
187
|
-
"sample_f16.wgsl#find_topk_phase1": "
|
|
188
|
-
"sample_f16.wgsl#find_topk_phase2": "
|
|
189
|
-
"sample_f16.wgsl#sample_single_pass": "
|
|
190
|
-
"sample_f16.wgsl#softmax_and_sample": "
|
|
191
|
-
"sample.wgsl#argmax": "
|
|
192
|
-
"sample.wgsl#argmax_reduce": "
|
|
193
|
-
"sample.wgsl#find_topk_phase1": "
|
|
194
|
-
"sample.wgsl#find_topk_phase2": "
|
|
195
|
-
"sample.wgsl#sample_single_pass": "
|
|
196
|
-
"sample.wgsl#softmax_and_sample": "
|
|
185
|
+
"sample_f16.wgsl#argmax": "7d7188081953a79b6d71bdf783d75df97a78401e2fe62e6d356cc44756a42c41",
|
|
186
|
+
"sample_f16.wgsl#argmax_reduce": "c0284bc9a50d25e215b21cb8c70e24dae3cb32b578691c2b6df5f3ede68a67c7",
|
|
187
|
+
"sample_f16.wgsl#find_topk_phase1": "8abd0a978d87adb7cce7337bb1b045a151768724f57802ad060df1dad735cff6",
|
|
188
|
+
"sample_f16.wgsl#find_topk_phase2": "ea7684cf0cc6014d39ac821edf9c89e140552b5009a72e6e91b00f8816678568",
|
|
189
|
+
"sample_f16.wgsl#sample_single_pass": "1cd8f0babc5c824b455080d30028109adfe52ce6f79009fbb986fde0d377fcb5",
|
|
190
|
+
"sample_f16.wgsl#softmax_and_sample": "c1d58cb952b704596d7ab6a2aa32b911a6e869e05b42adac3e4a19d898aa17ae",
|
|
191
|
+
"sample.wgsl#argmax": "f68b9cfdd3265a5cc52b216e549b629f1f8209e5aaa2f788142fa03db4c2d538",
|
|
192
|
+
"sample.wgsl#argmax_reduce": "96f8dd75a13db82e1928914e1f40ff1b9e03563eb5f8e3708b230f453b1fc160",
|
|
193
|
+
"sample.wgsl#find_topk_phase1": "736222d54f805b2791ebb803e9574fca93ab2b25fad0a64245f782499ce2d10c",
|
|
194
|
+
"sample.wgsl#find_topk_phase2": "a590107f0b7603b4b9624140dea1b436362062f63d64ed6d77e1628578796e77",
|
|
195
|
+
"sample.wgsl#sample_single_pass": "91c5c30bbc3e034457c1521c1ad576ce798c0868a1fe16e02be5f92706614096",
|
|
196
|
+
"sample.wgsl#softmax_and_sample": "132d67a1393702c81ee896975447f14f9a6a2589b25125d28401bc8ca47a253d",
|
|
197
197
|
"sana_linear_attention_apply_f16.wgsl#main": "4a7426ce67eccfb70956feeae84275f4d3cc586c50e8442c07eb69993b378ab5",
|
|
198
198
|
"sana_linear_attention_apply.wgsl#main": "5f69e0bc1d9e2df5a61e13bd819313c8f7ff5dfc4b7d78e71d5152dc23b6a86c",
|
|
199
199
|
"sana_linear_attention_summary_f16.wgsl#main": "3abb736ead999485b5dac9c6b534143b464cfd0b5300c5e03c56cec03c8fa48e",
|
|
@@ -213,6 +213,8 @@ export const KERNEL_REF_CONTENT_DIGESTS = Object.freeze({
|
|
|
213
213
|
"softmax.wgsl#main": "45c5876806b442222d7e190e595f55a0079bae82e07d37586996c1a63790bb7a",
|
|
214
214
|
"softmax.wgsl#softmax_online": "6c62601ba2f88f7de9dacf026cc2357168df47d009fd108736655b645217cd0a",
|
|
215
215
|
"softmax.wgsl#softmax_small": "ad75f10e0a3caadd278130504e7d0e4e1b2f0621f8bd390abae5d973e301e47b",
|
|
216
|
+
"split_qg_f16.wgsl#main": "1d19e9d9900a270a3a1bd407347908f891ba98a1cdcee35ae932257a9a1c72cb",
|
|
217
|
+
"split_qg.wgsl#main": "64a8aa855c6246675bebdeab0258dc0e649e3986ef2bba4572d2d7dc1af902f4",
|
|
216
218
|
"split_qkv_f16.wgsl#main": "bd1a92fcd9382bdcdf00bcc59248a12489444904a4f10845a381f177b6ad649f",
|
|
217
219
|
"split_qkv.wgsl#main": "bc7c95a47322edc11fec19105efd3774c2adfed151530c849909d03af7503e4c",
|
|
218
220
|
"topk_f16_weights.wgsl#softmax_topk": "863559c28eb46a2b4dc16f21a19aca2424a5d68fc3430b29461bebdd7ec8f625",
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "mixtral-moe-v1",
|
|
3
|
+
"description": "Deterministic Mixtral-style MoE kernel routing profile for Q4K/F16 expert weights with standard gate/up/down FFN.",
|
|
4
|
+
"router": {
|
|
5
|
+
"topk": [
|
|
6
|
+
{
|
|
7
|
+
"match": { "hasF16": true, "hasSubgroups": true, "routerDtype": "f32" },
|
|
8
|
+
"value": "softmax_topk_f32_subgroup"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"match": { "hasF16": true, "routerDtype": "f32" },
|
|
12
|
+
"value": "softmax_topk_f32"
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"match": {},
|
|
16
|
+
"value": "softmax_topk_f32"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
"dequant": {
|
|
21
|
+
"q4kExpert": [
|
|
22
|
+
{
|
|
23
|
+
"match": { "hasF16": true, "hasSubgroups": true, "outputDtype": "f32" },
|
|
24
|
+
"value": "q4k_expert_dequant_f32_subgroup"
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"match": { "hasF16": true, "outputDtype": "f16" },
|
|
28
|
+
"value": "q4k_expert_dequant_f16"
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"match": {},
|
|
32
|
+
"value": "q4k_expert_dequant_f32"
|
|
33
|
+
}
|
|
34
|
+
],
|
|
35
|
+
"f16Expert": [
|
|
36
|
+
{
|
|
37
|
+
"match": { "hasF16": true, "outputDtype": "f16" },
|
|
38
|
+
"value": "f16_expert_passthrough"
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"match": {},
|
|
42
|
+
"value": "f16_expert_upcast_f32"
|
|
43
|
+
}
|
|
44
|
+
]
|
|
45
|
+
}
|
|
46
|
+
}
|
package/src/config/loader.js
CHANGED
|
@@ -12,6 +12,7 @@ const transformerPreset = await loadJson('./presets/models/transformer.json', im
|
|
|
12
12
|
const diffusionPreset = await loadJson('./presets/models/diffusion.json', import.meta.url, 'Failed to load preset');
|
|
13
13
|
const gemma2Preset = await loadJson('./presets/models/gemma2.json', import.meta.url, 'Failed to load preset');
|
|
14
14
|
const gemma3Preset = await loadJson('./presets/models/gemma3.json', import.meta.url, 'Failed to load preset');
|
|
15
|
+
const gemma4Preset = await loadJson('./presets/models/gemma4.json', import.meta.url, 'Failed to load preset');
|
|
15
16
|
const translateGemmaPreset = await loadJson('./presets/models/translategemma.json', import.meta.url, 'Failed to load preset');
|
|
16
17
|
const embeddingGemmaPreset = await loadJson('./presets/models/embeddinggemma.json', import.meta.url, 'Failed to load preset');
|
|
17
18
|
const functiongemmaPreset = await loadJson('./presets/models/functiongemma.json', import.meta.url, 'Failed to load preset');
|
|
@@ -24,6 +25,7 @@ const modernbertPreset = await loadJson('./presets/models/modernbert.json', impo
|
|
|
24
25
|
const lfm2Preset = await loadJson('./presets/models/lfm2.json', import.meta.url, 'Failed to load preset');
|
|
25
26
|
const qwen3Preset = await loadJson('./presets/models/qwen3.json', import.meta.url, 'Failed to load preset');
|
|
26
27
|
const qwen35Preset = await loadJson('./presets/models/qwen3_5.json', import.meta.url, 'Failed to load preset');
|
|
28
|
+
const qwen3VlPreset = await loadJson('./presets/models/qwen3_vl.json', import.meta.url, 'Failed to load preset');
|
|
27
29
|
const kimiK2Preset = await loadJson('./presets/models/kimi-k2.json', import.meta.url, 'Failed to load preset');
|
|
28
30
|
const gptOssPreset = await loadJson('./presets/models/gpt-oss.json', import.meta.url, 'Failed to load preset');
|
|
29
31
|
|
|
@@ -36,6 +38,7 @@ export const PRESET_REGISTRY = {
|
|
|
36
38
|
transformer: transformerPreset,
|
|
37
39
|
gemma2: gemma2Preset,
|
|
38
40
|
gemma3: gemma3Preset,
|
|
41
|
+
gemma4: gemma4Preset,
|
|
39
42
|
translategemma: translateGemmaPreset,
|
|
40
43
|
embeddinggemma: embeddingGemmaPreset,
|
|
41
44
|
functiongemma: functiongemmaPreset,
|
|
@@ -48,6 +51,7 @@ export const PRESET_REGISTRY = {
|
|
|
48
51
|
lfm2: lfm2Preset,
|
|
49
52
|
qwen3: qwen3Preset,
|
|
50
53
|
qwen3_5: qwen35Preset,
|
|
54
|
+
qwen3_vl: qwen3VlPreset,
|
|
51
55
|
kimi_k2: kimiK2Preset,
|
|
52
56
|
gpt_oss: gptOssPreset,
|
|
53
57
|
};
|
|
@@ -95,10 +99,12 @@ export const PRESET_DETECTION_ORDER = [
|
|
|
95
99
|
'diffusion',
|
|
96
100
|
// Model families (check more specific patterns first)
|
|
97
101
|
'gemma2',
|
|
102
|
+
'gemma4',
|
|
98
103
|
'translategemma',
|
|
99
104
|
'gemma3',
|
|
100
105
|
'llama3',
|
|
101
106
|
'lfm2',
|
|
107
|
+
'qwen3_vl',
|
|
102
108
|
'qwen3_5',
|
|
103
109
|
'qwen3',
|
|
104
110
|
'kimi_k2',
|
|
@@ -9,6 +9,8 @@ const platformCache = new Map();
|
|
|
9
9
|
|
|
10
10
|
let platformsBaseUrl = null;
|
|
11
11
|
|
|
12
|
+
const DEFAULT_PREFER_UNIFIED_MEMORY = false;
|
|
13
|
+
|
|
12
14
|
const PLATFORM_FILES = [
|
|
13
15
|
'apple-m3',
|
|
14
16
|
'apple-m2',
|
|
@@ -131,7 +133,7 @@ export function getMemoryHints() {
|
|
|
131
133
|
}
|
|
132
134
|
|
|
133
135
|
export function prefersUnifiedMemory() {
|
|
134
|
-
return getMemoryHints()?.preferUnifiedMemory ??
|
|
136
|
+
return getMemoryHints()?.preferUnifiedMemory ?? DEFAULT_PREFER_UNIFIED_MEMORY;
|
|
135
137
|
}
|
|
136
138
|
|
|
137
139
|
export function getBufferAlignment() {
|
|
@@ -8,19 +8,19 @@
|
|
|
8
8
|
"decode": {
|
|
9
9
|
"steps": [
|
|
10
10
|
{ "op": "input_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
11
|
-
{ "op": "q_proj", "kernel": "
|
|
12
|
-
{ "op": "k_proj", "kernel": "
|
|
13
|
-
{ "op": "v_proj", "kernel": "
|
|
11
|
+
{ "op": "q_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.q_proj" },
|
|
12
|
+
{ "op": "k_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.k_proj" },
|
|
13
|
+
{ "op": "v_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.v_proj" },
|
|
14
14
|
{ "op": "rope_q", "kernel": "rope.wgsl", "entry": "main" },
|
|
15
15
|
{ "op": "rope_k", "kernel": "rope.wgsl", "entry": "main" },
|
|
16
16
|
{ "op": "attention", "kernel": "attention_decode_chunked_f16kv.wgsl", "entry": "main" },
|
|
17
|
-
{ "op": "o_proj", "kernel": "
|
|
17
|
+
{ "op": "o_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.o_proj" },
|
|
18
18
|
{ "op": "attn_residual","kernel": "residual.wgsl", "entry": "main" },
|
|
19
19
|
{ "op": "post_attn_norm","kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
20
|
-
{ "op": "gate_proj", "kernel": "
|
|
21
|
-
{ "op": "up_proj", "kernel": "
|
|
20
|
+
{ "op": "gate_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.gate_proj" },
|
|
21
|
+
{ "op": "up_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.up_proj" },
|
|
22
22
|
{ "op": "activation", "kernel": "gelu.wgsl", "entry": "main", "constants": { "HAS_GATE": true } },
|
|
23
|
-
{ "op": "down_proj", "kernel": "
|
|
23
|
+
{ "op": "down_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.down_proj" },
|
|
24
24
|
{ "op": "ffn_residual", "kernel": "residual.wgsl", "entry": "main" }
|
|
25
25
|
]
|
|
26
26
|
},
|
|
@@ -28,19 +28,19 @@
|
|
|
28
28
|
"prefill": {
|
|
29
29
|
"steps": [
|
|
30
30
|
{ "op": "input_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
31
|
-
{ "op": "q_proj", "kernel": "
|
|
32
|
-
{ "op": "k_proj", "kernel": "
|
|
33
|
-
{ "op": "v_proj", "kernel": "
|
|
31
|
+
{ "op": "q_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.q_proj" },
|
|
32
|
+
{ "op": "k_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.k_proj" },
|
|
33
|
+
{ "op": "v_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.v_proj" },
|
|
34
34
|
{ "op": "rope_q", "kernel": "rope.wgsl", "entry": "main" },
|
|
35
35
|
{ "op": "rope_k", "kernel": "rope.wgsl", "entry": "main" },
|
|
36
36
|
{ "op": "attention", "kernel": "attention_streaming_f16kv.wgsl", "entry": "main" },
|
|
37
|
-
{ "op": "o_proj", "kernel": "
|
|
37
|
+
{ "op": "o_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.o_proj" },
|
|
38
38
|
{ "op": "attn_residual","kernel": "residual.wgsl", "entry": "main" },
|
|
39
39
|
{ "op": "post_attn_norm","kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
40
|
-
{ "op": "gate_proj", "kernel": "
|
|
41
|
-
{ "op": "up_proj", "kernel": "
|
|
40
|
+
{ "op": "gate_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.gate_proj" },
|
|
41
|
+
{ "op": "up_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.up_proj" },
|
|
42
42
|
{ "op": "activation", "kernel": "gelu.wgsl", "entry": "main", "constants": { "HAS_GATE": true } },
|
|
43
|
-
{ "op": "down_proj", "kernel": "
|
|
43
|
+
{ "op": "down_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.down_proj" },
|
|
44
44
|
{ "op": "ffn_residual", "kernel": "residual.wgsl", "entry": "main" }
|
|
45
45
|
]
|
|
46
46
|
},
|
|
@@ -51,8 +51,8 @@
|
|
|
51
51
|
|
|
52
52
|
"postLayer": [
|
|
53
53
|
{ "op": "final_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
54
|
-
{ "op": "lm_head", "kernel": "matmul_f16w_f32a.wgsl",
|
|
55
|
-
{ "op": "lm_head_prefill", "kernel": "
|
|
54
|
+
{ "op": "lm_head", "kernel": "matmul_f16w_f32a.wgsl", "entry": "main", "weights": "lm_head" },
|
|
55
|
+
{ "op": "lm_head_prefill", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "lm_head" }
|
|
56
56
|
],
|
|
57
57
|
|
|
58
58
|
"sampling": [
|
|
@@ -28,19 +28,19 @@
|
|
|
28
28
|
"prefill": {
|
|
29
29
|
"steps": [
|
|
30
30
|
{ "op": "input_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
31
|
-
{ "op": "q_proj", "kernel": "
|
|
32
|
-
{ "op": "k_proj", "kernel": "
|
|
33
|
-
{ "op": "v_proj", "kernel": "
|
|
31
|
+
{ "op": "q_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.q_proj" },
|
|
32
|
+
{ "op": "k_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.k_proj" },
|
|
33
|
+
{ "op": "v_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.v_proj" },
|
|
34
34
|
{ "op": "rope_q", "kernel": "rope.wgsl", "entry": "main" },
|
|
35
35
|
{ "op": "rope_k", "kernel": "rope.wgsl", "entry": "main" },
|
|
36
36
|
{ "op": "attention", "kernel": "attention_streaming_f16kv.wgsl", "entry": "main" },
|
|
37
|
-
{ "op": "o_proj", "kernel": "
|
|
37
|
+
{ "op": "o_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.o_proj" },
|
|
38
38
|
{ "op": "attn_residual", "kernel": "residual.wgsl", "entry": "main" },
|
|
39
39
|
{ "op": "post_attn_norm","kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
40
|
-
{ "op": "gate_proj", "kernel": "
|
|
41
|
-
{ "op": "up_proj", "kernel": "
|
|
40
|
+
{ "op": "gate_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.gate_proj" },
|
|
41
|
+
{ "op": "up_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.up_proj" },
|
|
42
42
|
{ "op": "activation", "kernel": "gelu.wgsl", "entry": "main", "constants": { "HAS_GATE": true } },
|
|
43
|
-
{ "op": "down_proj", "kernel": "
|
|
43
|
+
{ "op": "down_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.down_proj" },
|
|
44
44
|
{ "op": "ffn_residual", "kernel": "residual.wgsl", "entry": "main" }
|
|
45
45
|
]
|
|
46
46
|
},
|
|
@@ -52,7 +52,7 @@
|
|
|
52
52
|
"postLayer": [
|
|
53
53
|
{ "op": "final_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
54
54
|
{ "op": "lm_head", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_multicol", "weights": "lm_head", "constants": { "MULTICOL_COLS_PER_WG": 64, "MULTICOL_THREADS_PER_COL": 4 } },
|
|
55
|
-
{ "op": "lm_head_prefill", "kernel": "
|
|
55
|
+
{ "op": "lm_head_prefill", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "lm_head" }
|
|
56
56
|
],
|
|
57
57
|
|
|
58
58
|
"sampling": [
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "gemma3-q4k-dequant-f32a-small-attn",
|
|
3
|
+
"name": "Gemma 3 Q4K Dequant (F32 activations, small-attn prefill)",
|
|
4
|
+
"description": "Q4K dequantized to F16 with F32 activations. Same as gemma3-q4k-dequant-f32a-online but uses attention_small_f16kv.wgsl for prefill (diagnostic variant).",
|
|
5
|
+
"activationDtype": "f32",
|
|
6
|
+
"kvDtype": "f16",
|
|
7
|
+
|
|
8
|
+
"decode": {
|
|
9
|
+
"steps": [
|
|
10
|
+
{ "op": "input_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
11
|
+
{ "op": "q_proj", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_vec4", "weights": "layer.{L}.self_attn.q_proj" },
|
|
12
|
+
{ "op": "k_proj", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_vec4", "weights": "layer.{L}.self_attn.k_proj" },
|
|
13
|
+
{ "op": "v_proj", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_vec4", "weights": "layer.{L}.self_attn.v_proj" },
|
|
14
|
+
{ "op": "rope_q", "kernel": "rope.wgsl", "entry": "main" },
|
|
15
|
+
{ "op": "rope_k", "kernel": "rope.wgsl", "entry": "main" },
|
|
16
|
+
{ "op": "attention", "kernel": "attention_decode_online_f16kv.wgsl", "entry": "main" },
|
|
17
|
+
{ "op": "o_proj", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_vec4", "weights": "layer.{L}.self_attn.o_proj" },
|
|
18
|
+
{ "op": "attn_residual", "kernel": "residual.wgsl", "entry": "main" },
|
|
19
|
+
{ "op": "post_attn_norm","kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
20
|
+
{ "op": "gate_proj", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_vec4", "weights": "layer.{L}.mlp.gate_proj" },
|
|
21
|
+
{ "op": "up_proj", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_vec4", "weights": "layer.{L}.mlp.up_proj" },
|
|
22
|
+
{ "op": "activation", "kernel": "gelu.wgsl", "entry": "main", "constants": { "HAS_GATE": true } },
|
|
23
|
+
{ "op": "down_proj", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_vec4", "weights": "layer.{L}.mlp.down_proj" },
|
|
24
|
+
{ "op": "ffn_residual", "kernel": "residual.wgsl", "entry": "main" }
|
|
25
|
+
]
|
|
26
|
+
},
|
|
27
|
+
|
|
28
|
+
"prefill": {
|
|
29
|
+
"steps": [
|
|
30
|
+
{ "op": "input_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
31
|
+
{ "op": "q_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.q_proj" },
|
|
32
|
+
{ "op": "k_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.k_proj" },
|
|
33
|
+
{ "op": "v_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.v_proj" },
|
|
34
|
+
{ "op": "rope_q", "kernel": "rope.wgsl", "entry": "main" },
|
|
35
|
+
{ "op": "rope_k", "kernel": "rope.wgsl", "entry": "main" },
|
|
36
|
+
{ "op": "attention", "kernel": "attention_small_f16kv.wgsl", "entry": "main" },
|
|
37
|
+
{ "op": "o_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.self_attn.o_proj" },
|
|
38
|
+
{ "op": "attn_residual", "kernel": "residual.wgsl", "entry": "main" },
|
|
39
|
+
{ "op": "post_attn_norm","kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
40
|
+
{ "op": "gate_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.gate_proj" },
|
|
41
|
+
{ "op": "up_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.up_proj" },
|
|
42
|
+
{ "op": "activation", "kernel": "gelu.wgsl", "entry": "main", "constants": { "HAS_GATE": true } },
|
|
43
|
+
{ "op": "down_proj", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "layer.{L}.mlp.down_proj" },
|
|
44
|
+
{ "op": "ffn_residual", "kernel": "residual.wgsl", "entry": "main" }
|
|
45
|
+
]
|
|
46
|
+
},
|
|
47
|
+
|
|
48
|
+
"preLayer": [
|
|
49
|
+
{ "op": "embed", "kernel": "gather_f16.wgsl", "entry": "main", "weights": "embed_tokens" }
|
|
50
|
+
],
|
|
51
|
+
|
|
52
|
+
"postLayer": [
|
|
53
|
+
{ "op": "final_norm", "kernel": "rmsnorm.wgsl", "entry": "main" },
|
|
54
|
+
{ "op": "lm_head", "kernel": "matmul_gemv_subgroup.wgsl", "entry": "main_multicol", "weights": "lm_head", "constants": { "MULTICOL_COLS_PER_WG": 64, "MULTICOL_THREADS_PER_COL": 4 } },
|
|
55
|
+
{ "op": "lm_head_prefill", "kernel": "matmul_f16w_f32a_tiled.wgsl", "entry": "main", "weights": "lm_head" }
|
|
56
|
+
],
|
|
57
|
+
|
|
58
|
+
"sampling": [
|
|
59
|
+
{ "op": "sample", "kernel": "sample.wgsl", "entry": "sample_single_pass" }
|
|
60
|
+
]
|
|
61
|
+
}
|