@simulatte/doppler 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +25 -6
- package/package.json +25 -38
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +2 -2
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +9 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +21 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +4 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +2 -2
- package/src/converter/conversion-plan.js +11 -3
- package/src/converter/core.js +19 -8
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +34 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +40 -1
- package/src/formats/rdrr/classification.js +32 -0
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +48 -4
- package/src/gpu/kernels/matmul.d.ts +5 -0
- package/src/gpu/kernels/matmul.js +71 -2
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
- package/src/inference/pipelines/text/attention/projections.js +54 -13
- package/src/inference/pipelines/text/attention/record.js +16 -6
- package/src/inference/pipelines/text/attention/run.js +59 -6
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +46 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +19 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
- package/src/inference/pipelines/text/generator-steps.js +71 -26
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +353 -166
- package/src/inference/pipelines/text/init.d.ts +15 -0
- package/src/inference/pipelines/text/init.js +35 -10
- package/src/inference/pipelines/text/layer.js +38 -8
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +11 -9
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +130 -4
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +4 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +28 -7
- package/src/tooling/node-source-runtime.js +65 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/types/model.d.ts +5 -0
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/tools/doppler-cli.js +6 -1
- package/src/tooling/node-convert.d.ts +0 -54
|
@@ -40,6 +40,16 @@ fn apply_softcap(x: f32, softcap: f32) -> f32 {
|
|
|
40
40
|
return softcap * tanh(x / softcap);
|
|
41
41
|
}
|
|
42
42
|
|
|
43
|
+
// Decides whether a candidate (value, index) pair should replace the current best pair.
// A strictly larger value wins and a strictly smaller value loses; when neither strict
// comparison holds, the pair with the lower index wins, so equal values resolve to the
// lowest index.
fn candidate_beats(candidate_value: f32, candidate_index: u32, best_value: f32, best_index: u32) -> bool {
    let is_larger = candidate_value > best_value;
    let is_smaller = candidate_value < best_value;
    // The index tie-break is consulted only when the values are neither larger nor smaller.
    return is_larger || (!is_smaller && candidate_index < best_index);
}
|
|
52
|
+
|
|
43
53
|
@group(0) @binding(0) var<uniform> u: Uniforms;
|
|
44
54
|
@group(0) @binding(1) var<storage, read> logits: array<f32>; // [vocabSize]
|
|
45
55
|
@group(0) @binding(2) var<storage, read_write> output: array<u32>; // [N] - selected tokens
|
|
@@ -87,7 +97,7 @@ fn find_topk_phase1(
|
|
|
87
97
|
if (idx != pad_id) {
|
|
88
98
|
// Apply softcapping before temperature scaling
|
|
89
99
|
let val = apply_softcap(logits[idx], softcap) / temperature;
|
|
90
|
-
if (val
|
|
100
|
+
if (candidate_beats(val, idx, local_max, local_max_idx)) {
|
|
91
101
|
local_max = val;
|
|
92
102
|
local_max_idx = idx;
|
|
93
103
|
}
|
|
@@ -103,7 +113,12 @@ fn find_topk_phase1(
|
|
|
103
113
|
var stride = WORKGROUP_SIZE / 2u;
|
|
104
114
|
while (stride > 0u) {
|
|
105
115
|
if (thread_idx < stride) {
|
|
106
|
-
if (
|
|
116
|
+
if (candidate_beats(
|
|
117
|
+
shared_values[thread_idx + stride],
|
|
118
|
+
shared_indices[thread_idx + stride],
|
|
119
|
+
shared_values[thread_idx],
|
|
120
|
+
shared_indices[thread_idx]
|
|
121
|
+
)) {
|
|
107
122
|
shared_values[thread_idx] = shared_values[thread_idx + stride];
|
|
108
123
|
shared_indices[thread_idx] = shared_indices[thread_idx + stride];
|
|
109
124
|
}
|
|
@@ -150,7 +165,7 @@ fn find_topk_phase2(
|
|
|
150
165
|
var max_val = shared_values[k];
|
|
151
166
|
|
|
152
167
|
for (var i: u32 = k + 1u; i < num_candidates; i = i + 1u) {
|
|
153
|
-
if (shared_values[i]
|
|
168
|
+
if (candidate_beats(shared_values[i], shared_indices[i], max_val, shared_indices[max_idx])) {
|
|
154
169
|
max_val = shared_values[i];
|
|
155
170
|
max_idx = i;
|
|
156
171
|
}
|
|
@@ -249,7 +264,7 @@ fn sample_single_pass(
|
|
|
249
264
|
if (idx != pad_id) {
|
|
250
265
|
// Apply softcapping before temperature scaling
|
|
251
266
|
let val = apply_softcap(logits[idx], softcap) / temperature;
|
|
252
|
-
if (val
|
|
267
|
+
if (candidate_beats(val, idx, local_max, local_max_idx)) {
|
|
253
268
|
local_max = val;
|
|
254
269
|
local_max_idx = idx;
|
|
255
270
|
}
|
|
@@ -265,7 +280,12 @@ fn sample_single_pass(
|
|
|
265
280
|
var stride = WORKGROUP_SIZE / 2u;
|
|
266
281
|
while (stride > 0u) {
|
|
267
282
|
if (thread_idx < stride) {
|
|
268
|
-
if (
|
|
283
|
+
if (candidate_beats(
|
|
284
|
+
shared_values[thread_idx + stride],
|
|
285
|
+
shared_indices[thread_idx + stride],
|
|
286
|
+
shared_values[thread_idx],
|
|
287
|
+
shared_indices[thread_idx]
|
|
288
|
+
)) {
|
|
269
289
|
shared_values[thread_idx] = shared_values[thread_idx + stride];
|
|
270
290
|
shared_indices[thread_idx] = shared_indices[thread_idx + stride];
|
|
271
291
|
}
|
|
@@ -308,7 +328,7 @@ fn argmax(
|
|
|
308
328
|
if (idx != pad_id) {
|
|
309
329
|
// Apply softcapping (argmax is greedy, no temperature)
|
|
310
330
|
let val = apply_softcap(logits[idx], softcap);
|
|
311
|
-
if (val
|
|
331
|
+
if (candidate_beats(val, idx, local_max, local_max_idx)) {
|
|
312
332
|
local_max = val;
|
|
313
333
|
local_max_idx = idx;
|
|
314
334
|
}
|
|
@@ -324,7 +344,12 @@ fn argmax(
|
|
|
324
344
|
var stride = WORKGROUP_SIZE / 2u;
|
|
325
345
|
while (stride > 0u) {
|
|
326
346
|
if (thread_idx < stride) {
|
|
327
|
-
if (
|
|
347
|
+
if (candidate_beats(
|
|
348
|
+
shared_values[thread_idx + stride],
|
|
349
|
+
shared_indices[thread_idx + stride],
|
|
350
|
+
shared_values[thread_idx],
|
|
351
|
+
shared_indices[thread_idx]
|
|
352
|
+
)) {
|
|
328
353
|
shared_values[thread_idx] = shared_values[thread_idx + stride];
|
|
329
354
|
shared_indices[thread_idx] = shared_indices[thread_idx + stride];
|
|
330
355
|
}
|
|
@@ -362,7 +387,12 @@ fn argmax_reduce(
|
|
|
362
387
|
var stride = WORKGROUP_SIZE / 2u;
|
|
363
388
|
while (stride > 0u) {
|
|
364
389
|
if (thread_idx < stride) {
|
|
365
|
-
if (
|
|
390
|
+
if (candidate_beats(
|
|
391
|
+
shared_values[thread_idx + stride],
|
|
392
|
+
shared_indices[thread_idx + stride],
|
|
393
|
+
shared_values[thread_idx],
|
|
394
|
+
shared_indices[thread_idx]
|
|
395
|
+
)) {
|
|
366
396
|
shared_values[thread_idx] = shared_values[thread_idx + stride];
|
|
367
397
|
shared_indices[thread_idx] = shared_indices[thread_idx + stride];
|
|
368
398
|
}
|
|
@@ -374,4 +404,4 @@ fn argmax_reduce(
|
|
|
374
404
|
if (thread_idx == 0u) {
|
|
375
405
|
output[u.output_index] = shared_indices[0];
|
|
376
406
|
}
|
|
377
|
-
}
|
|
407
|
+
}
|
|
@@ -34,6 +34,16 @@ fn apply_softcap(x: f32, softcap: f32) -> f32 {
|
|
|
34
34
|
return softcap * tanh(x / softcap);
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
+
// Decides whether a candidate (value, index) pair should replace the current best pair.
// A strictly larger value wins and a strictly smaller value loses; when neither strict
// comparison holds, the pair with the lower index wins, so equal values resolve to the
// lowest index.
fn candidate_beats(candidate_value: f32, candidate_index: u32, best_value: f32, best_index: u32) -> bool {
    let is_larger = candidate_value > best_value;
    let is_smaller = candidate_value < best_value;
    // The index tie-break is consulted only when the values are neither larger nor smaller.
    return is_larger || (!is_smaller && candidate_index < best_index);
}
|
|
46
|
+
|
|
37
47
|
@group(0) @binding(0) var<uniform> u: Uniforms;
|
|
38
48
|
@group(0) @binding(1) var<storage, read> logits: array<f16>;
|
|
39
49
|
@group(0) @binding(2) var<storage, read_write> output: array<u32>;
|
|
@@ -74,7 +84,7 @@ fn find_topk_phase1(
|
|
|
74
84
|
while (idx < vocab_size) {
|
|
75
85
|
if (idx != pad_id) {
|
|
76
86
|
let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
|
|
77
|
-
if (val
|
|
87
|
+
if (candidate_beats(val, idx, local_max, local_max_idx)) {
|
|
78
88
|
local_max = val;
|
|
79
89
|
local_max_idx = idx;
|
|
80
90
|
}
|
|
@@ -89,7 +99,12 @@ fn find_topk_phase1(
|
|
|
89
99
|
var stride = WORKGROUP_SIZE / 2u;
|
|
90
100
|
while (stride > 0u) {
|
|
91
101
|
if (thread_idx < stride) {
|
|
92
|
-
if (
|
|
102
|
+
if (candidate_beats(
|
|
103
|
+
shared_values[thread_idx + stride],
|
|
104
|
+
shared_indices[thread_idx + stride],
|
|
105
|
+
shared_values[thread_idx],
|
|
106
|
+
shared_indices[thread_idx]
|
|
107
|
+
)) {
|
|
93
108
|
shared_values[thread_idx] = shared_values[thread_idx + stride];
|
|
94
109
|
shared_indices[thread_idx] = shared_indices[thread_idx + stride];
|
|
95
110
|
}
|
|
@@ -130,7 +145,7 @@ fn find_topk_phase2(
|
|
|
130
145
|
var max_val = shared_values[k];
|
|
131
146
|
|
|
132
147
|
for (var i: u32 = k + 1u; i < num_candidates; i = i + 1u) {
|
|
133
|
-
if (shared_values[i]
|
|
148
|
+
if (candidate_beats(shared_values[i], shared_indices[i], max_val, shared_indices[max_idx])) {
|
|
134
149
|
max_val = shared_values[i];
|
|
135
150
|
max_idx = i;
|
|
136
151
|
}
|
|
@@ -218,7 +233,7 @@ fn sample_single_pass(
|
|
|
218
233
|
while (idx < vocab_size) {
|
|
219
234
|
if (idx != pad_id) {
|
|
220
235
|
let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
|
|
221
|
-
if (val
|
|
236
|
+
if (candidate_beats(val, idx, local_max, local_max_idx)) {
|
|
222
237
|
local_max = val;
|
|
223
238
|
local_max_idx = idx;
|
|
224
239
|
}
|
|
@@ -233,7 +248,12 @@ fn sample_single_pass(
|
|
|
233
248
|
var stride = WORKGROUP_SIZE / 2u;
|
|
234
249
|
while (stride > 0u) {
|
|
235
250
|
if (thread_idx < stride) {
|
|
236
|
-
if (
|
|
251
|
+
if (candidate_beats(
|
|
252
|
+
shared_values[thread_idx + stride],
|
|
253
|
+
shared_indices[thread_idx + stride],
|
|
254
|
+
shared_values[thread_idx],
|
|
255
|
+
shared_indices[thread_idx]
|
|
256
|
+
)) {
|
|
237
257
|
shared_values[thread_idx] = shared_values[thread_idx + stride];
|
|
238
258
|
shared_indices[thread_idx] = shared_indices[thread_idx + stride];
|
|
239
259
|
}
|
|
@@ -267,7 +287,7 @@ fn argmax(
|
|
|
267
287
|
while (idx < vocab_size) {
|
|
268
288
|
if (idx != pad_id) {
|
|
269
289
|
let val = apply_softcap(f32(logits[idx]), softcap);
|
|
270
|
-
if (val
|
|
290
|
+
if (candidate_beats(val, idx, local_max, local_max_idx)) {
|
|
271
291
|
local_max = val;
|
|
272
292
|
local_max_idx = idx;
|
|
273
293
|
}
|
|
@@ -282,7 +302,12 @@ fn argmax(
|
|
|
282
302
|
var stride = WORKGROUP_SIZE / 2u;
|
|
283
303
|
while (stride > 0u) {
|
|
284
304
|
if (thread_idx < stride) {
|
|
285
|
-
if (
|
|
305
|
+
if (candidate_beats(
|
|
306
|
+
shared_values[thread_idx + stride],
|
|
307
|
+
shared_indices[thread_idx + stride],
|
|
308
|
+
shared_values[thread_idx],
|
|
309
|
+
shared_indices[thread_idx]
|
|
310
|
+
)) {
|
|
286
311
|
shared_values[thread_idx] = shared_values[thread_idx + stride];
|
|
287
312
|
shared_indices[thread_idx] = shared_indices[thread_idx + stride];
|
|
288
313
|
}
|
|
@@ -316,7 +341,12 @@ fn argmax_reduce(
|
|
|
316
341
|
var stride = WORKGROUP_SIZE / 2u;
|
|
317
342
|
while (stride > 0u) {
|
|
318
343
|
if (thread_idx < stride) {
|
|
319
|
-
if (
|
|
344
|
+
if (candidate_beats(
|
|
345
|
+
shared_values[thread_idx + stride],
|
|
346
|
+
shared_indices[thread_idx + stride],
|
|
347
|
+
shared_values[thread_idx],
|
|
348
|
+
shared_indices[thread_idx]
|
|
349
|
+
)) {
|
|
320
350
|
shared_values[thread_idx] = shared_values[thread_idx + stride];
|
|
321
351
|
shared_indices[thread_idx] = shared_indices[thread_idx + stride];
|
|
322
352
|
}
|
|
@@ -133,10 +133,15 @@ export async function compileShader(
|
|
|
133
133
|
source,
|
|
134
134
|
label
|
|
135
135
|
) {
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
136
|
+
let module;
|
|
137
|
+
try {
|
|
138
|
+
module = device.createShaderModule({
|
|
139
|
+
label,
|
|
140
|
+
code: source,
|
|
141
|
+
});
|
|
142
|
+
} catch (err) {
|
|
143
|
+
throw new Error(`createShaderModule failed for "${label}": ${err.message}`);
|
|
144
|
+
}
|
|
140
145
|
|
|
141
146
|
// Check for compilation errors (getCompilationInfo not available in all WebGPU providers)
|
|
142
147
|
const compilationInfo = typeof module.getCompilationInfo === 'function'
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Split Q and Gate Kernel
|
|
3
|
+
*
|
|
4
|
+
* De-interleaves Q and Gate projections from q_proj output for attentionOutputGate models.
|
|
5
|
+
* Models like Qwen 3.5 store q_proj weights in per-head interleaved layout:
|
|
6
|
+
* rows [h*headDim*2 : h*headDim*2+headDim] = Q for head h
|
|
7
|
+
* rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
|
|
8
|
+
* This kernel separates the full matmul output into contiguous Q and Gate tensors.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { Tensor } from '../tensor.js';
|
|
12
|
+
import type { CommandRecorder } from '../command-recorder.js';
|
|
13
|
+
|
|
14
|
+
/**
 * Options for splitting an interleaved q_proj output into separate Q and Gate tensors.
 */
export interface SplitQGOptions {
  /** Number of token rows in the interleaved input. */
  numTokens: number;
  /** Number of attention heads. */
  numHeads: number;
  /** Number of elements per head in each of the Q and Gate outputs. */
  headDim: number;
  /** Pre-allocated Q output tensor; when omitted or null, an output buffer is acquired for Q. */
  qTensor?: Tensor | null;
  /** Pre-allocated Gate output tensor; when omitted or null, an output buffer is acquired for Gate. */
  gTensor?: Tensor | null;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Output of the Q/Gate split: two tensors, each shaped [numTokens, numHeads * headDim].
 */
export interface SplitQGResult {
  /** De-interleaved Q values. */
  Q: Tensor;
  /** De-interleaved Gate values. */
  G: Tensor;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Splits an interleaved q_proj output into contiguous Q and Gate tensors.
 *
 * @param qgTensor - Full q_proj output, [numTokens, numHeads * headDim * 2], with each
 *   head's Q values followed by that head's Gate values.
 * @param options - Token/head dimensions and optional pre-allocated outputs.
 * @returns The Q and Gate tensors, each [numTokens, numHeads * headDim].
 */
export declare function runSplitQG(qgTensor: Tensor, options: SplitQGOptions): Promise<SplitQGResult>;
|
|
42
|
+
|
|
43
|
+
/**
 * Records the Q/Gate split on a command recorder (batched, no submit).
 *
 * @param recorder - Command recorder that receives the split dispatch.
 * @param qgTensor - Full q_proj output, [numTokens, numHeads * headDim * 2] (interleaved per head).
 * @param options - Token/head dimensions and optional pre-allocated outputs.
 * @returns The Q and Gate tensors, each [numTokens, numHeads * headDim].
 */
export declare function recordSplitQG(recorder: CommandRecorder, qgTensor: Tensor, options: SplitQGOptions): Promise<SplitQGResult>;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
|
|
2
|
+
import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
|
|
3
|
+
import { createTensor, dtypeBytes } from '../tensor.js';
|
|
4
|
+
import { WORKGROUP_SIZES } from './constants.js';
|
|
5
|
+
import { unifiedKernelWrapper } from './utils.js';
|
|
6
|
+
import { selectRuleValue } from './rule-registry.js';
|
|
7
|
+
|
|
8
|
+
async function _splitQG(target, qgTensor, options) {
|
|
9
|
+
const { numTokens, numHeads, headDim, qTensor = null, gTensor = null } = options;
|
|
10
|
+
const ownsQ = qTensor == null;
|
|
11
|
+
const ownsG = gTensor == null;
|
|
12
|
+
|
|
13
|
+
const outputDtype = qgTensor.dtype;
|
|
14
|
+
const pipelineVariant = selectRuleValue('splitQg', 'variant', { outputDtype });
|
|
15
|
+
const bytesPerElement = dtypeBytes(outputDtype);
|
|
16
|
+
const qSize = numHeads * headDim;
|
|
17
|
+
|
|
18
|
+
const qBuffer = qTensor?.buffer || acquireBuffer(numTokens * qSize * bytesPerElement, undefined, 'Q');
|
|
19
|
+
const gBuffer = gTensor?.buffer || acquireBuffer(numTokens * qSize * bytesPerElement, undefined, 'Q_gate');
|
|
20
|
+
|
|
21
|
+
try {
|
|
22
|
+
await unifiedKernelWrapper(
|
|
23
|
+
'split_qg', target, pipelineVariant,
|
|
24
|
+
[qgTensor, qBuffer, gBuffer],
|
|
25
|
+
{ num_tokens: numTokens, num_heads: numHeads, head_dim: headDim, _pad: 0 },
|
|
26
|
+
Math.ceil((numTokens * qSize) / WORKGROUP_SIZES.DEFAULT)
|
|
27
|
+
);
|
|
28
|
+
|
|
29
|
+
const Q = qTensor || createTensor(qBuffer, outputDtype, [numTokens, qSize], 'Q');
|
|
30
|
+
const G = gTensor || createTensor(gBuffer, outputDtype, [numTokens, qSize], 'Q_gate');
|
|
31
|
+
|
|
32
|
+
return { Q, G };
|
|
33
|
+
} catch (error) {
|
|
34
|
+
if (ownsQ) releaseBuffer(qBuffer);
|
|
35
|
+
if (ownsG) releaseBuffer(gBuffer);
|
|
36
|
+
throw error;
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
export async function runSplitQG(qgTensor, options) {
|
|
41
|
+
return _splitQG(null, qgTensor, options);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
export async function recordSplitQG(recorder, qgTensor, options) {
|
|
45
|
+
return _splitQG(recorder, qgTensor, options);
|
|
46
|
+
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
// split_qg.wgsl
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* De-interleave Q and Gate projections from q_proj output for attentionOutputGate models.
|
|
5
|
+
*
|
|
6
|
+
* Models like Qwen 3.5 store q_proj weights with interleaved head layout:
|
|
7
|
+
* rows [h*headDim*2 : h*headDim*2+headDim] = Q for head h
|
|
8
|
+
* rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
|
|
9
|
+
*
|
|
10
|
+
* A single full matmul over all 2*qSize rows produces interleaved output:
|
|
11
|
+
* input[token, h*headDim*2 : h*headDim*2+headDim] = Q head h
|
|
12
|
+
* input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
|
|
13
|
+
*
|
|
14
|
+
* This kernel separates them into contiguous Q and G outputs:
|
|
15
|
+
* Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
|
|
16
|
+
* G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
|
|
17
|
+
*
|
|
18
|
+
* Input layout (row-major): [numTokens, numHeads * headDim * 2]
|
|
19
|
+
* Output Q layout (row-major): [numTokens, numHeads * headDim]
|
|
20
|
+
* Output G layout (row-major): [numTokens, numHeads * headDim]
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
// Uniform parameters for the split kernel.
struct Params {
    // Number of token rows in the input.
    num_tokens: u32,
    // Number of attention heads.
    num_heads: u32,
    // Elements per head in each of the Q and Gate outputs.
    head_dim: u32,
    // Not read by the kernel; presumably present to keep the struct at four u32s (confirm).
    _pad: u32,
}
|
|
29
|
+
|
|
30
|
+
override WORKGROUP_SIZE: u32 = 256u;
|
|
31
|
+
|
|
32
|
+
@group(0) @binding(0) var<uniform> params: Params;
|
|
33
|
+
@group(0) @binding(1) var<storage, read> input: array<f32>;
|
|
34
|
+
@group(0) @binding(2) var<storage, read_write> Q: array<f32>;
|
|
35
|
+
@group(0) @binding(3) var<storage, read_write> G: array<f32>;
|
|
36
|
+
|
|
37
|
+
// Entry point: each invocation copies one Q element and the matching Gate element
// from the per-head interleaved input into the contiguous Q and G outputs.
@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let out_index = gid.x;
    let head_dim = params.head_dim;
    let row_width = params.num_heads * head_dim;

    // Invocations past the last output element have nothing to copy.
    if (out_index >= params.num_tokens * row_width) {
        return;
    }

    // Decompose the flat output index into (token, head, position within head).
    let token = out_index / row_width;
    let column = out_index % row_width;
    let head = column / head_dim;
    let lane = column % head_dim;

    // Each input row stores, per head, head_dim Q values followed by head_dim Gate values.
    let q_source = token * (row_width * 2u) + head * (head_dim * 2u) + lane;
    let gate_source = q_source + head_dim;

    Q[out_index] = input[q_source];
    G[out_index] = input[gate_source];
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
// AUTO-GENERATED from src/gpu/kernels/split_qg.wgsl.
|
|
2
|
+
// Edit the source kernel and tools/configs/wgsl-variants.js, then run `npm run kernels:generate`.
|
|
3
|
+
// split_qg_f16.wgsl
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* De-interleave Q and Gate projections from q_proj output for attentionOutputGate models (f16).
|
|
7
|
+
*
|
|
8
|
+
* Models like Qwen 3.5 store q_proj weights with interleaved head layout:
|
|
9
|
+
* rows [h*headDim*2 : h*headDim*2+headDim] = Q for head h
|
|
10
|
+
* rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
|
|
11
|
+
*
|
|
12
|
+
* A single full matmul over all 2*qSize rows produces interleaved output:
|
|
13
|
+
* input[token, h*headDim*2 : h*headDim*2+headDim] = Q head h
|
|
14
|
+
* input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
|
|
15
|
+
*
|
|
16
|
+
* This kernel separates them into contiguous Q and G outputs:
|
|
17
|
+
* Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
|
|
18
|
+
* G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
|
|
19
|
+
*
|
|
20
|
+
* Input layout (row-major): [numTokens, numHeads * headDim * 2]
|
|
21
|
+
* Output Q layout (row-major): [numTokens, numHeads * headDim]
|
|
22
|
+
* Output G layout (row-major): [numTokens, numHeads * headDim]
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
enable f16;
|
|
26
|
+
|
|
27
|
+
struct Params {
|
|
28
|
+
num_tokens: u32,
|
|
29
|
+
num_heads: u32,
|
|
30
|
+
head_dim: u32,
|
|
31
|
+
_pad: u32,
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
override WORKGROUP_SIZE: u32 = 256u;
|
|
35
|
+
|
|
36
|
+
@group(0) @binding(0) var<uniform> params: Params;
|
|
37
|
+
@group(0) @binding(1) var<storage, read> input: array<f16>;
|
|
38
|
+
@group(0) @binding(2) var<storage, read_write> Q: array<f16>;
|
|
39
|
+
@group(0) @binding(3) var<storage, read_write> G: array<f16>;
|
|
40
|
+
|
|
41
|
+
@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
|
|
42
|
+
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
|
43
|
+
let idx = gid.x;
|
|
44
|
+
let q_size = params.num_heads * params.head_dim;
|
|
45
|
+
let total_elements = params.num_tokens * q_size;
|
|
46
|
+
|
|
47
|
+
if (idx >= total_elements) {
|
|
48
|
+
return;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
let token = idx / q_size;
|
|
52
|
+
let elem = idx % q_size;
|
|
53
|
+
let head = elem / params.head_dim;
|
|
54
|
+
let dim = elem % params.head_dim;
|
|
55
|
+
|
|
56
|
+
// Input is interleaved per head: [Q_h (headDim elems), G_h (headDim elems)]
|
|
57
|
+
let src_q = token * (q_size * 2u) + head * (params.head_dim * 2u) + dim;
|
|
58
|
+
let src_g = src_q + params.head_dim;
|
|
59
|
+
|
|
60
|
+
Q[idx] = input[src_q];
|
|
61
|
+
G[idx] = input[src_g];
|
|
62
|
+
}
|
|
@@ -110,6 +110,6 @@ export function getBuffer(weight: GPUBuffer | WeightBuffer | TensorLike): GPUBuf
|
|
|
110
110
|
export function getLayout(weight: GPUBuffer | WeightBuffer | TensorLike): WeightLayout | null;
|
|
111
111
|
|
|
112
112
|
/**
|
|
113
|
-
* Get dtype from WeightBuffer,
|
|
113
|
+
* Get dtype from WeightBuffer, tagged raw GPUBuffer, or TensorLike.
|
|
114
114
|
*/
|
|
115
115
|
export function getWeightDtype(weight: GPUBuffer | WeightBuffer | TensorLike): WeightDtype | TensorLike['dtype'] | null;
|
package/src/gpu/weight-buffer.js
CHANGED
|
@@ -9,6 +9,7 @@ import type { InferencePipeline } from './pipelines/text.js';
|
|
|
9
9
|
import type { DiffusionPipeline } from './pipelines/diffusion/pipeline.js';
|
|
10
10
|
import type { EnergyPipeline } from './pipelines/energy/pipeline.js';
|
|
11
11
|
import type { SavedReportInfo, SaveReportOptions } from '../storage/reports.js';
|
|
12
|
+
import type { DebugSnapshot } from '../debug/history.js';
|
|
12
13
|
|
|
13
14
|
export interface BrowserHarnessOptions extends InferenceHarnessOptions {
|
|
14
15
|
modelUrl: string;
|
|
@@ -143,6 +144,7 @@ export interface BrowserSuiteResult extends SuiteSummary {
|
|
|
143
144
|
output?: string | DiffusionOutput | null;
|
|
144
145
|
deviceInfo?: Record<string, unknown> | null;
|
|
145
146
|
memoryStats?: ReturnType<InferencePipeline['getMemoryStats']> | null;
|
|
147
|
+
debugSnapshot?: DebugSnapshot | null;
|
|
146
148
|
pipeline?: InferencePipeline | DiffusionPipeline | EnergyPipeline | null;
|
|
147
149
|
report: Record<string, unknown>;
|
|
148
150
|
reportInfo: SavedReportInfo;
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import { initializeInference } from './test-harness.js';
|
|
3
3
|
import { saveReport } from '../storage/reports.js';
|
|
4
4
|
import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
|
|
5
|
+
import { clearLogHistory, getDebugSnapshot } from '../debug/history.js';
|
|
5
6
|
import { computeSampleStats } from '../debug/stats.js';
|
|
6
7
|
import {
|
|
7
8
|
setActiveKernelPath,
|
|
@@ -846,15 +847,32 @@ async function dispatchBrowserSuite(suite, options) {
|
|
|
846
847
|
return null;
|
|
847
848
|
}
|
|
848
849
|
|
|
850
|
+
function shouldCaptureDebugSnapshot(suite, runtimeConfig) {
|
|
851
|
+
const debug = runtimeConfig?.shared?.debug ?? {};
|
|
852
|
+
const logLevel = String(debug.logLevel?.defaultLogLevel ?? '').toLowerCase();
|
|
853
|
+
return suite === 'debug'
|
|
854
|
+
|| debug.trace?.enabled === true
|
|
855
|
+
|| debug.pipeline?.enabled === true
|
|
856
|
+
|| (Array.isArray(debug.probes) && debug.probes.length > 0)
|
|
857
|
+
|| debug.profiler?.enabled === true
|
|
858
|
+
|| logLevel === 'debug'
|
|
859
|
+
|| logLevel === 'verbose';
|
|
860
|
+
}
|
|
861
|
+
|
|
849
862
|
export async function runBrowserSuite(options = {}) {
|
|
850
863
|
return runWithRuntimeIsolationForSuite(async () => {
|
|
851
864
|
const suiteTimestamp = resolveReportTimestamp(options.timestamp, 'runBrowserSuite timestamp');
|
|
852
865
|
const suiteContext = resolveSuiteContext(options);
|
|
853
866
|
const suite = normalizeSuite(options.suite, suiteContext);
|
|
867
|
+
const captureDebugSnapshot = shouldCaptureDebugSnapshot(suite, getRuntimeConfig());
|
|
868
|
+
if (captureDebugSnapshot) {
|
|
869
|
+
clearLogHistory();
|
|
870
|
+
}
|
|
854
871
|
const suiteResult = await dispatchBrowserSuite(suite, options);
|
|
855
872
|
if (!suiteResult) {
|
|
856
873
|
throw createUnsupportedSuiteError(suite, suiteContext);
|
|
857
874
|
}
|
|
875
|
+
const debugSnapshot = captureDebugSnapshot ? getDebugSnapshot() : null;
|
|
858
876
|
|
|
859
877
|
if (suite === 'bench' && suiteResult?.metrics?.workloadType === 'training') {
|
|
860
878
|
const trainingReport = suiteResult?.metrics?.trainingMetricsReport;
|
|
@@ -886,6 +904,7 @@ export async function runBrowserSuite(options = {}) {
|
|
|
886
904
|
metrics: suiteResult.metrics ?? null,
|
|
887
905
|
output: reportOutput,
|
|
888
906
|
memory: suiteResult.memoryStats ?? null,
|
|
907
|
+
debugSnapshot,
|
|
889
908
|
...options.report,
|
|
890
909
|
};
|
|
891
910
|
if (ulArtifacts.length > 0 || distillArtifacts.length > 0 || checkpointResumeTimeline.length > 0) {
|
|
@@ -907,7 +926,7 @@ export async function runBrowserSuite(options = {}) {
|
|
|
907
926
|
report.timestamp = suiteTimestamp;
|
|
908
927
|
}
|
|
909
928
|
const reportInfo = await saveReport(modelId, report, { timestamp: report.timestamp });
|
|
910
|
-
return { ...suiteResult, report, reportInfo };
|
|
929
|
+
return { ...suiteResult, debugSnapshot, report, reportInfo };
|
|
911
930
|
});
|
|
912
931
|
}
|
|
913
932
|
|
|
@@ -314,10 +314,7 @@ export class KVCache {
|
|
|
314
314
|
layer.seqLen = Math.max(layer.seqLen, startPos + numNewTokens);
|
|
315
315
|
this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numNewTokens);
|
|
316
316
|
|
|
317
|
-
|
|
318
|
-
if (layerIdx === this.numLayers - 1) {
|
|
319
|
-
this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
|
|
320
|
-
}
|
|
317
|
+
this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
|
|
321
318
|
}
|
|
322
319
|
|
|
323
320
|
|
|
@@ -374,9 +371,7 @@ export class KVCache {
|
|
|
374
371
|
layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
|
|
375
372
|
this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
|
|
376
373
|
|
|
377
|
-
|
|
378
|
-
this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
|
|
379
|
-
}
|
|
374
|
+
this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
|
|
380
375
|
}
|
|
381
376
|
|
|
382
377
|
|
|
@@ -433,9 +428,7 @@ export class KVCache {
|
|
|
433
428
|
layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
|
|
434
429
|
this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
|
|
435
430
|
|
|
436
|
-
|
|
437
|
-
this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
|
|
438
|
-
}
|
|
431
|
+
this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
|
|
439
432
|
}
|
|
440
433
|
|
|
441
434
|
|
|
@@ -89,6 +89,9 @@ export function normalizeDiffusionMatmulLocationDtype(dtype) {
|
|
|
89
89
|
return normalized;
|
|
90
90
|
}
|
|
91
91
|
|
|
92
|
+
// Artifact-derived dtype inference: determines actual storage dtype from buffer byte size.
|
|
93
|
+
// This is NOT a config-bypass — it reads physical buffer dimensions (artifact-derived config),
|
|
94
|
+
// which is a valid merge layer per the config merge contract.
|
|
92
95
|
export function inferDiffusionMatmulDtypeFromBuffer(weight, N, K, preferred) {
|
|
93
96
|
const buffer = getBuffer(weight);
|
|
94
97
|
if (!buffer || !Number.isFinite(N) || !Number.isFinite(K)) return preferred;
|
|
@@ -28,6 +28,7 @@ import { runResidualAdd, runScale, recordResidualAdd, recordScale } from '../../
|
|
|
28
28
|
import { f16ToF32 } from '../../../loader/dtype-utils.js';
|
|
29
29
|
|
|
30
30
|
const SUPPORTED_DIFFUSION_BACKEND_PIPELINES = new Set(['gpu']);
|
|
31
|
+
const DEFAULT_TIME_EMBED_DIM = 256;
|
|
31
32
|
const SD3_TEXT_ENCODER_KEYS = ['text_encoder', 'text_encoder_2', 'text_encoder_3'];
|
|
32
33
|
const SANA_TEXT_ENCODER_KEYS = ['text_encoder'];
|
|
33
34
|
|
|
@@ -492,7 +493,7 @@ export class DiffusionPipeline {
|
|
|
492
493
|
const hiddenSize = (transformerConfig.num_attention_heads ?? 0) * (transformerConfig.attention_head_dim ?? 0);
|
|
493
494
|
const patchSize = transformerConfig.patch_size ?? 2;
|
|
494
495
|
const timeEmbedWeight = transformerResolver.get('time_text_embed.timestep_embedder.linear_1.weight');
|
|
495
|
-
const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ??
|
|
496
|
+
const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? DEFAULT_TIME_EMBED_DIM;
|
|
496
497
|
if (!Number.isFinite(hiddenSize) || hiddenSize <= 0) {
|
|
497
498
|
throw new Error('Diffusion transformer config missing num_attention_heads/attention_head_dim.');
|
|
498
499
|
}
|
|
@@ -44,7 +44,10 @@ import { initRoPEFrequencies } from '../text/init.js';
|
|
|
44
44
|
import { processLayerGPU } from '../text/layer.js';
|
|
45
45
|
|
|
46
46
|
const QUICK_GELU_ALPHA = 1.702;
|
|
47
|
+
const DEFAULT_TIMESTEP_EMBED_DIM = 256;
|
|
47
48
|
const SUPPORTED_CLIP_HIDDEN_ACTIVATIONS = new Set(['gelu', 'quick_gelu']);
|
|
49
|
+
// Standard CLIP hidden activation per OpenAI CLIP specification.
|
|
50
|
+
const DEFAULT_CLIP_HIDDEN_ACT = 'gelu';
|
|
48
51
|
|
|
49
52
|
function padTokens(tokens, maxLength, padTokenId) {
|
|
50
53
|
if (!Number.isFinite(maxLength) || maxLength <= 0) {
|
|
@@ -100,11 +103,15 @@ function createVectorTensor(device, data, dtype, label) {
|
|
|
100
103
|
return createTensor(buffer, dtype, [1, length], label);
|
|
101
104
|
}
|
|
102
105
|
|
|
106
|
+
// Conservative fallback dtype for diffusion bias tensors when no dtype
|
|
107
|
+
// metadata is available. F32 avoids precision loss in bias additions.
|
|
108
|
+
const DEFAULT_BIAS_DTYPE = 'f32';
|
|
109
|
+
|
|
103
110
|
/**
 * Picks the dtype to use for a bias tensor.
 *
 * Resolution order:
 *   1. the `dtype` carried by `weight`, when truthy;
 *   2. the dtype recorded for `key` in `weightsEntry.dtypes`, passed through
 *      `normalizeDiffusionLocationDtype`, when that yields a truthy value;
 *   3. `DEFAULT_BIAS_DTYPE`.
 *
 * @param weight Weight object that may carry its own `dtype`; may be null/undefined.
 * @param weightsEntry Entry whose optional `dtypes` collection is queried with `get(key)`.
 * @param key Lookup key for the per-location dtype.
 * @returns The resolved dtype.
 */
function resolveBiasDtype(weight, weightsEntry, key) {
  const declaredDtype = weight ? weight.dtype : undefined;
  if (declaredDtype) {
    return declaredDtype;
  }

  const normalizedLocationDtype = normalizeDiffusionLocationDtype(weightsEntry?.dtypes?.get(key));
  if (normalizedLocationDtype) {
    return normalizedLocationDtype;
  }
  return DEFAULT_BIAS_DTYPE;
}
|
|
109
116
|
|
|
110
117
|
function createBiasTensorWithDtype(weight, weightsEntry, key, size, label) {
|
|
@@ -145,7 +152,7 @@ function createKernelOps(recorder) {
|
|
|
145
152
|
}
|
|
146
153
|
|
|
147
154
|
function resolveClipHiddenActivation(config) {
|
|
148
|
-
const hiddenAct = config?.hidden_act ??
|
|
155
|
+
const hiddenAct = config?.hidden_act ?? DEFAULT_CLIP_HIDDEN_ACT;
|
|
149
156
|
if (!SUPPORTED_CLIP_HIDDEN_ACTIVATIONS.has(hiddenAct)) {
|
|
150
157
|
throw new Error(
|
|
151
158
|
`Unsupported CLIP hidden_act "${hiddenAct}". ` +
|
|
@@ -1099,7 +1106,7 @@ export async function buildTimestepEmbedding(timestep, weightsEntry, modelConfig
|
|
|
1099
1106
|
const device = getDevice();
|
|
1100
1107
|
if (!device) throw new Error('Timestep embedding requires a WebGPU device.');
|
|
1101
1108
|
|
|
1102
|
-
const dim = options.dim ??
|
|
1109
|
+
const dim = options.dim ?? DEFAULT_TIMESTEP_EMBED_DIM;
|
|
1103
1110
|
const half = Math.floor(dim / 2);
|
|
1104
1111
|
const emb = new Float32Array(dim);
|
|
1105
1112
|
const maxPeriod = 10000;
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { Tensor } from '../../../../gpu/tensor.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Value resolved by `prepareAttentionProjectionInput`.
 */
export interface AttentionProjectionInputResult {
  /** Tensor to use as the projection input. */
  oProjInput: Tensor;
  /**
   * Optional second tensor, or `null` when there is none.
   * NOTE(review): presumably a temporary created by a dtype cast that the
   * caller must release; confirm against the implementation.
   */
  oProjInputTemp: Tensor | null;
}
|
|
7
|
+
|
|
8
|
+
/**
 * Prepares the tensor that is fed into the attention projection.
 *
 * NOTE(review): the declaration alone does not show when `castToF16` is
 * invoked; presumably it is used when `matmulOutputDtype` calls for f16.
 * Confirm against the implementation.
 *
 * @param attnForProjection Source tensor for the projection.
 * @param matmulOutputDtype Dtype name associated with the matmul output.
 * @param castToF16 Async callback that receives a tensor and resolves to a tensor.
 * @returns A promise resolving to the projection input and an optional extra tensor.
 */
export function prepareAttentionProjectionInput(
  attnForProjection: Tensor,
  matmulOutputDtype: string,
  castToF16: (tensor: Tensor) => Promise<Tensor>
): Promise<AttentionProjectionInputResult>;
|