@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -40,6 +40,16 @@ fn apply_softcap(x: f32, softcap: f32) -> f32 {
40
40
  return softcap * tanh(x / softcap);
41
41
  }
42
42
 
43
+ fn candidate_beats(candidate_value: f32, candidate_index: u32, best_value: f32, best_index: u32) -> bool {
44
+ if (candidate_value > best_value) {
45
+ return true;
46
+ }
47
+ if (candidate_value < best_value) {
48
+ return false;
49
+ }
50
+ return candidate_index < best_index;
51
+ }
52
+
43
53
  @group(0) @binding(0) var<uniform> u: Uniforms;
44
54
  @group(0) @binding(1) var<storage, read> logits: array<f32>; // [vocabSize]
45
55
  @group(0) @binding(2) var<storage, read_write> output: array<u32>; // [N] - selected tokens
@@ -87,7 +97,7 @@ fn find_topk_phase1(
87
97
  if (idx != pad_id) {
88
98
  // Apply softcapping before temperature scaling
89
99
  let val = apply_softcap(logits[idx], softcap) / temperature;
90
- if (val > local_max) {
100
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
91
101
  local_max = val;
92
102
  local_max_idx = idx;
93
103
  }
@@ -103,7 +113,12 @@ fn find_topk_phase1(
103
113
  var stride = WORKGROUP_SIZE / 2u;
104
114
  while (stride > 0u) {
105
115
  if (thread_idx < stride) {
106
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
116
+ if (candidate_beats(
117
+ shared_values[thread_idx + stride],
118
+ shared_indices[thread_idx + stride],
119
+ shared_values[thread_idx],
120
+ shared_indices[thread_idx]
121
+ )) {
107
122
  shared_values[thread_idx] = shared_values[thread_idx + stride];
108
123
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
109
124
  }
@@ -150,7 +165,7 @@ fn find_topk_phase2(
150
165
  var max_val = shared_values[k];
151
166
 
152
167
  for (var i: u32 = k + 1u; i < num_candidates; i = i + 1u) {
153
- if (shared_values[i] > max_val) {
168
+ if (candidate_beats(shared_values[i], shared_indices[i], max_val, shared_indices[max_idx])) {
154
169
  max_val = shared_values[i];
155
170
  max_idx = i;
156
171
  }
@@ -249,7 +264,7 @@ fn sample_single_pass(
249
264
  if (idx != pad_id) {
250
265
  // Apply softcapping before temperature scaling
251
266
  let val = apply_softcap(logits[idx], softcap) / temperature;
252
- if (val > local_max) {
267
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
253
268
  local_max = val;
254
269
  local_max_idx = idx;
255
270
  }
@@ -265,7 +280,12 @@ fn sample_single_pass(
265
280
  var stride = WORKGROUP_SIZE / 2u;
266
281
  while (stride > 0u) {
267
282
  if (thread_idx < stride) {
268
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
283
+ if (candidate_beats(
284
+ shared_values[thread_idx + stride],
285
+ shared_indices[thread_idx + stride],
286
+ shared_values[thread_idx],
287
+ shared_indices[thread_idx]
288
+ )) {
269
289
  shared_values[thread_idx] = shared_values[thread_idx + stride];
270
290
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
271
291
  }
@@ -308,7 +328,7 @@ fn argmax(
308
328
  if (idx != pad_id) {
309
329
  // Apply softcapping (argmax is greedy, no temperature)
310
330
  let val = apply_softcap(logits[idx], softcap);
311
- if (val > local_max) {
331
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
312
332
  local_max = val;
313
333
  local_max_idx = idx;
314
334
  }
@@ -324,7 +344,12 @@ fn argmax(
324
344
  var stride = WORKGROUP_SIZE / 2u;
325
345
  while (stride > 0u) {
326
346
  if (thread_idx < stride) {
327
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
347
+ if (candidate_beats(
348
+ shared_values[thread_idx + stride],
349
+ shared_indices[thread_idx + stride],
350
+ shared_values[thread_idx],
351
+ shared_indices[thread_idx]
352
+ )) {
328
353
  shared_values[thread_idx] = shared_values[thread_idx + stride];
329
354
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
330
355
  }
@@ -362,7 +387,12 @@ fn argmax_reduce(
362
387
  var stride = WORKGROUP_SIZE / 2u;
363
388
  while (stride > 0u) {
364
389
  if (thread_idx < stride) {
365
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
390
+ if (candidate_beats(
391
+ shared_values[thread_idx + stride],
392
+ shared_indices[thread_idx + stride],
393
+ shared_values[thread_idx],
394
+ shared_indices[thread_idx]
395
+ )) {
366
396
  shared_values[thread_idx] = shared_values[thread_idx + stride];
367
397
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
368
398
  }
@@ -374,4 +404,4 @@ fn argmax_reduce(
374
404
  if (thread_idx == 0u) {
375
405
  output[u.output_index] = shared_indices[0];
376
406
  }
377
- }
407
+ }
@@ -34,6 +34,16 @@ fn apply_softcap(x: f32, softcap: f32) -> f32 {
34
34
  return softcap * tanh(x / softcap);
35
35
  }
36
36
 
37
+ fn candidate_beats(candidate_value: f32, candidate_index: u32, best_value: f32, best_index: u32) -> bool {
38
+ if (candidate_value > best_value) {
39
+ return true;
40
+ }
41
+ if (candidate_value < best_value) {
42
+ return false;
43
+ }
44
+ return candidate_index < best_index;
45
+ }
46
+
37
47
  @group(0) @binding(0) var<uniform> u: Uniforms;
38
48
  @group(0) @binding(1) var<storage, read> logits: array<f16>;
39
49
  @group(0) @binding(2) var<storage, read_write> output: array<u32>;
@@ -74,7 +84,7 @@ fn find_topk_phase1(
74
84
  while (idx < vocab_size) {
75
85
  if (idx != pad_id) {
76
86
  let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
77
- if (val > local_max) {
87
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
78
88
  local_max = val;
79
89
  local_max_idx = idx;
80
90
  }
@@ -89,7 +99,12 @@ fn find_topk_phase1(
89
99
  var stride = WORKGROUP_SIZE / 2u;
90
100
  while (stride > 0u) {
91
101
  if (thread_idx < stride) {
92
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
102
+ if (candidate_beats(
103
+ shared_values[thread_idx + stride],
104
+ shared_indices[thread_idx + stride],
105
+ shared_values[thread_idx],
106
+ shared_indices[thread_idx]
107
+ )) {
93
108
  shared_values[thread_idx] = shared_values[thread_idx + stride];
94
109
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
95
110
  }
@@ -130,7 +145,7 @@ fn find_topk_phase2(
130
145
  var max_val = shared_values[k];
131
146
 
132
147
  for (var i: u32 = k + 1u; i < num_candidates; i = i + 1u) {
133
- if (shared_values[i] > max_val) {
148
+ if (candidate_beats(shared_values[i], shared_indices[i], max_val, shared_indices[max_idx])) {
134
149
  max_val = shared_values[i];
135
150
  max_idx = i;
136
151
  }
@@ -218,7 +233,7 @@ fn sample_single_pass(
218
233
  while (idx < vocab_size) {
219
234
  if (idx != pad_id) {
220
235
  let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
221
- if (val > local_max) {
236
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
222
237
  local_max = val;
223
238
  local_max_idx = idx;
224
239
  }
@@ -233,7 +248,12 @@ fn sample_single_pass(
233
248
  var stride = WORKGROUP_SIZE / 2u;
234
249
  while (stride > 0u) {
235
250
  if (thread_idx < stride) {
236
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
251
+ if (candidate_beats(
252
+ shared_values[thread_idx + stride],
253
+ shared_indices[thread_idx + stride],
254
+ shared_values[thread_idx],
255
+ shared_indices[thread_idx]
256
+ )) {
237
257
  shared_values[thread_idx] = shared_values[thread_idx + stride];
238
258
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
239
259
  }
@@ -267,7 +287,7 @@ fn argmax(
267
287
  while (idx < vocab_size) {
268
288
  if (idx != pad_id) {
269
289
  let val = apply_softcap(f32(logits[idx]), softcap);
270
- if (val > local_max) {
290
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
271
291
  local_max = val;
272
292
  local_max_idx = idx;
273
293
  }
@@ -282,7 +302,12 @@ fn argmax(
282
302
  var stride = WORKGROUP_SIZE / 2u;
283
303
  while (stride > 0u) {
284
304
  if (thread_idx < stride) {
285
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
305
+ if (candidate_beats(
306
+ shared_values[thread_idx + stride],
307
+ shared_indices[thread_idx + stride],
308
+ shared_values[thread_idx],
309
+ shared_indices[thread_idx]
310
+ )) {
286
311
  shared_values[thread_idx] = shared_values[thread_idx + stride];
287
312
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
288
313
  }
@@ -316,7 +341,12 @@ fn argmax_reduce(
316
341
  var stride = WORKGROUP_SIZE / 2u;
317
342
  while (stride > 0u) {
318
343
  if (thread_idx < stride) {
319
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
344
+ if (candidate_beats(
345
+ shared_values[thread_idx + stride],
346
+ shared_indices[thread_idx + stride],
347
+ shared_values[thread_idx],
348
+ shared_indices[thread_idx]
349
+ )) {
320
350
  shared_values[thread_idx] = shared_values[thread_idx + stride];
321
351
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
322
352
  }
@@ -133,10 +133,15 @@ export async function compileShader(
133
133
  source,
134
134
  label
135
135
  ) {
136
- const module = device.createShaderModule({
137
- label,
138
- code: source,
139
- });
136
+ let module;
137
+ try {
138
+ module = device.createShaderModule({
139
+ label,
140
+ code: source,
141
+ });
142
+ } catch (err) {
143
+ throw new Error(`createShaderModule failed for "${label}": ${err.message}`);
144
+ }
140
145
 
141
146
  // Check for compilation errors (getCompilationInfo not available in all WebGPU providers)
142
147
  const compilationInfo = typeof module.getCompilationInfo === 'function'
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Split Q and Gate Kernel
3
+ *
4
+ * De-interleaves Q and Gate projections from q_proj output for attentionOutputGate models.
5
+ * Models like Qwen 3.5 store q_proj weights in per-head interleaved layout:
6
+ * rows [h*headDim*2 : h*headDim*2+headDim] = Q for head h
7
+ * rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
8
+ * This kernel separates the full matmul output into contiguous Q and Gate tensors.
9
+ */
10
+
11
+ import type { Tensor } from '../tensor.js';
12
+ import type { CommandRecorder } from '../command-recorder.js';
13
+
14
+ /** Split Q and Gate options */
15
+ export interface SplitQGOptions {
16
+ numTokens: number;
17
+ numHeads: number;
18
+ headDim: number;
19
+ /** Pre-allocated Q output tensor */
20
+ qTensor?: Tensor | null;
21
+ /** Pre-allocated Gate output tensor */
22
+ gTensor?: Tensor | null;
23
+ }
24
+
25
+ /** Split Q and Gate result */
26
+ export interface SplitQGResult {
27
+ Q: Tensor;
28
+ G: Tensor;
29
+ }
30
+
31
+ /**
32
+ * De-interleave Q and Gate from q_proj output.
33
+ *
34
+ * @param qgTensor - Full q_proj output [numTokens, numHeads * headDim * 2] (interleaved)
35
+ * @param options - Split configuration
36
+ * @returns Separate Q and Gate tensors, each [numTokens, numHeads * headDim]
37
+ */
38
+ export declare function runSplitQG(
39
+ qgTensor: Tensor,
40
+ options: SplitQGOptions
41
+ ): Promise<SplitQGResult>;
42
+
43
+ /**
44
+ * Record split Q and Gate (batched, no submit).
45
+ */
46
+ export declare function recordSplitQG(
47
+ recorder: CommandRecorder,
48
+ qgTensor: Tensor,
49
+ options: SplitQGOptions
50
+ ): Promise<SplitQGResult>;
@@ -0,0 +1,46 @@
1
+
2
+ import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
3
+ import { createTensor, dtypeBytes } from '../tensor.js';
4
+ import { WORKGROUP_SIZES } from './constants.js';
5
+ import { unifiedKernelWrapper } from './utils.js';
6
+ import { selectRuleValue } from './rule-registry.js';
7
+
8
+ async function _splitQG(target, qgTensor, options) {
9
+ const { numTokens, numHeads, headDim, qTensor = null, gTensor = null } = options;
10
+ const ownsQ = qTensor == null;
11
+ const ownsG = gTensor == null;
12
+
13
+ const outputDtype = qgTensor.dtype;
14
+ const pipelineVariant = selectRuleValue('splitQg', 'variant', { outputDtype });
15
+ const bytesPerElement = dtypeBytes(outputDtype);
16
+ const qSize = numHeads * headDim;
17
+
18
+ const qBuffer = qTensor?.buffer || acquireBuffer(numTokens * qSize * bytesPerElement, undefined, 'Q');
19
+ const gBuffer = gTensor?.buffer || acquireBuffer(numTokens * qSize * bytesPerElement, undefined, 'Q_gate');
20
+
21
+ try {
22
+ await unifiedKernelWrapper(
23
+ 'split_qg', target, pipelineVariant,
24
+ [qgTensor, qBuffer, gBuffer],
25
+ { num_tokens: numTokens, num_heads: numHeads, head_dim: headDim, _pad: 0 },
26
+ Math.ceil((numTokens * qSize) / WORKGROUP_SIZES.DEFAULT)
27
+ );
28
+
29
+ const Q = qTensor || createTensor(qBuffer, outputDtype, [numTokens, qSize], 'Q');
30
+ const G = gTensor || createTensor(gBuffer, outputDtype, [numTokens, qSize], 'Q_gate');
31
+
32
+ return { Q, G };
33
+ } catch (error) {
34
+ if (ownsQ) releaseBuffer(qBuffer);
35
+ if (ownsG) releaseBuffer(gBuffer);
36
+ throw error;
37
+ }
38
+ }
39
+
40
+ export async function runSplitQG(qgTensor, options) {
41
+ return _splitQG(null, qgTensor, options);
42
+ }
43
+
44
+ export async function recordSplitQG(recorder, qgTensor, options) {
45
+ return _splitQG(recorder, qgTensor, options);
46
+ }
@@ -0,0 +1,58 @@
1
+ // split_qg.wgsl
2
+
3
+ /**
4
+ * De-interleave Q and Gate projections from q_proj output for attentionOutputGate models.
5
+ *
6
+ * Models like Qwen 3.5 store q_proj weights with interleaved head layout:
7
+ * rows [h*headDim*2 : h*headDim*2+headDim] = Q for head h
8
+ * rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
9
+ *
10
+ * A single full matmul over all 2*qSize rows produces interleaved output:
11
+ * input[token, h*headDim*2 : h*headDim*2+headDim] = Q head h
12
+ * input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
13
+ *
14
+ * This kernel separates them into contiguous Q and G outputs:
15
+ * Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
16
+ * G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
17
+ *
18
+ * Input layout (row-major): [numTokens, numHeads * headDim * 2]
19
+ * Output Q layout (row-major): [numTokens, numHeads * headDim]
20
+ * Output G layout (row-major): [numTokens, numHeads * headDim]
21
+ */
22
+
23
+ struct Params {
24
+ num_tokens: u32,
25
+ num_heads: u32,
26
+ head_dim: u32,
27
+ _pad: u32,
28
+ }
29
+
30
+ override WORKGROUP_SIZE: u32 = 256u;
31
+
32
+ @group(0) @binding(0) var<uniform> params: Params;
33
+ @group(0) @binding(1) var<storage, read> input: array<f32>;
34
+ @group(0) @binding(2) var<storage, read_write> Q: array<f32>;
35
+ @group(0) @binding(3) var<storage, read_write> G: array<f32>;
36
+
37
+ @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
38
+ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
39
+ let idx = gid.x;
40
+ let q_size = params.num_heads * params.head_dim;
41
+ let total_elements = params.num_tokens * q_size;
42
+
43
+ if (idx >= total_elements) {
44
+ return;
45
+ }
46
+
47
+ let token = idx / q_size;
48
+ let elem = idx % q_size;
49
+ let head = elem / params.head_dim;
50
+ let dim = elem % params.head_dim;
51
+
52
+ // Input is interleaved per head: [Q_h (headDim elems), G_h (headDim elems)]
53
+ let src_q = token * (q_size * 2u) + head * (params.head_dim * 2u) + dim;
54
+ let src_g = src_q + params.head_dim;
55
+
56
+ Q[idx] = input[src_q];
57
+ G[idx] = input[src_g];
58
+ }
@@ -0,0 +1,62 @@
1
+ // AUTO-GENERATED from src/gpu/kernels/split_qg.wgsl.
2
+ // Edit the source kernel and tools/configs/wgsl-variants.js, then run `npm run kernels:generate`.
3
+ // split_qg_f16.wgsl
4
+
5
+ /**
6
+ * De-interleave Q and Gate projections from q_proj output for attentionOutputGate models (f16).
7
+ *
8
+ * Models like Qwen 3.5 store q_proj weights with interleaved head layout:
9
+ * rows [h*headDim*2 : h*headDim*2+headDim] = Q for head h
10
+ * rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
11
+ *
12
+ * A single full matmul over all 2*qSize rows produces interleaved output:
13
+ * input[token, h*headDim*2 : h*headDim*2+headDim] = Q head h
14
+ * input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
15
+ *
16
+ * This kernel separates them into contiguous Q and G outputs:
17
+ * Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
18
+ * G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
19
+ *
20
+ * Input layout (row-major): [numTokens, numHeads * headDim * 2]
21
+ * Output Q layout (row-major): [numTokens, numHeads * headDim]
22
+ * Output G layout (row-major): [numTokens, numHeads * headDim]
23
+ */
24
+
25
+ enable f16;
26
+
27
+ struct Params {
28
+ num_tokens: u32,
29
+ num_heads: u32,
30
+ head_dim: u32,
31
+ _pad: u32,
32
+ }
33
+
34
+ override WORKGROUP_SIZE: u32 = 256u;
35
+
36
+ @group(0) @binding(0) var<uniform> params: Params;
37
+ @group(0) @binding(1) var<storage, read> input: array<f16>;
38
+ @group(0) @binding(2) var<storage, read_write> Q: array<f16>;
39
+ @group(0) @binding(3) var<storage, read_write> G: array<f16>;
40
+
41
+ @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
42
+ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
43
+ let idx = gid.x;
44
+ let q_size = params.num_heads * params.head_dim;
45
+ let total_elements = params.num_tokens * q_size;
46
+
47
+ if (idx >= total_elements) {
48
+ return;
49
+ }
50
+
51
+ let token = idx / q_size;
52
+ let elem = idx % q_size;
53
+ let head = elem / params.head_dim;
54
+ let dim = elem % params.head_dim;
55
+
56
+ // Input is interleaved per head: [Q_h (headDim elems), G_h (headDim elems)]
57
+ let src_q = token * (q_size * 2u) + head * (params.head_dim * 2u) + dim;
58
+ let src_g = src_q + params.head_dim;
59
+
60
+ Q[idx] = input[src_q];
61
+ G[idx] = input[src_g];
62
+ }
@@ -110,6 +110,6 @@ export function getBuffer(weight: GPUBuffer | WeightBuffer | TensorLike): GPUBuf
110
110
  export function getLayout(weight: GPUBuffer | WeightBuffer | TensorLike): WeightLayout | null;
111
111
 
112
112
  /**
113
- * Get dtype from WeightBuffer, or null for raw GPUBuffer.
113
+ * Get dtype from WeightBuffer, tagged raw GPUBuffer, or TensorLike.
114
114
  */
115
115
  export function getWeightDtype(weight: GPUBuffer | WeightBuffer | TensorLike): WeightDtype | TensorLike['dtype'] | null;
@@ -114,5 +114,5 @@ export function getLayout(weight) {
114
114
  export function getWeightDtype(weight) {
115
115
  if (isWeightBuffer(weight)) return weight.dtype;
116
116
  if (isTensorLike(weight)) return weight.dtype;
117
- return null;
117
+ return getBufferDtype(weight);
118
118
  }
@@ -9,6 +9,7 @@ import type { InferencePipeline } from './pipelines/text.js';
9
9
  import type { DiffusionPipeline } from './pipelines/diffusion/pipeline.js';
10
10
  import type { EnergyPipeline } from './pipelines/energy/pipeline.js';
11
11
  import type { SavedReportInfo, SaveReportOptions } from '../storage/reports.js';
12
+ import type { DebugSnapshot } from '../debug/history.js';
12
13
 
13
14
  export interface BrowserHarnessOptions extends InferenceHarnessOptions {
14
15
  modelUrl: string;
@@ -143,6 +144,7 @@ export interface BrowserSuiteResult extends SuiteSummary {
143
144
  output?: string | DiffusionOutput | null;
144
145
  deviceInfo?: Record<string, unknown> | null;
145
146
  memoryStats?: ReturnType<InferencePipeline['getMemoryStats']> | null;
147
+ debugSnapshot?: DebugSnapshot | null;
146
148
  pipeline?: InferencePipeline | DiffusionPipeline | EnergyPipeline | null;
147
149
  report: Record<string, unknown>;
148
150
  reportInfo: SavedReportInfo;
@@ -2,6 +2,7 @@
2
2
  import { initializeInference } from './test-harness.js';
3
3
  import { saveReport } from '../storage/reports.js';
4
4
  import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
5
+ import { clearLogHistory, getDebugSnapshot } from '../debug/history.js';
5
6
  import { computeSampleStats } from '../debug/stats.js';
6
7
  import {
7
8
  setActiveKernelPath,
@@ -846,15 +847,32 @@ async function dispatchBrowserSuite(suite, options) {
846
847
  return null;
847
848
  }
848
849
 
850
+ function shouldCaptureDebugSnapshot(suite, runtimeConfig) {
851
+ const debug = runtimeConfig?.shared?.debug ?? {};
852
+ const logLevel = String(debug.logLevel?.defaultLogLevel ?? '').toLowerCase();
853
+ return suite === 'debug'
854
+ || debug.trace?.enabled === true
855
+ || debug.pipeline?.enabled === true
856
+ || (Array.isArray(debug.probes) && debug.probes.length > 0)
857
+ || debug.profiler?.enabled === true
858
+ || logLevel === 'debug'
859
+ || logLevel === 'verbose';
860
+ }
861
+
849
862
  export async function runBrowserSuite(options = {}) {
850
863
  return runWithRuntimeIsolationForSuite(async () => {
851
864
  const suiteTimestamp = resolveReportTimestamp(options.timestamp, 'runBrowserSuite timestamp');
852
865
  const suiteContext = resolveSuiteContext(options);
853
866
  const suite = normalizeSuite(options.suite, suiteContext);
867
+ const captureDebugSnapshot = shouldCaptureDebugSnapshot(suite, getRuntimeConfig());
868
+ if (captureDebugSnapshot) {
869
+ clearLogHistory();
870
+ }
854
871
  const suiteResult = await dispatchBrowserSuite(suite, options);
855
872
  if (!suiteResult) {
856
873
  throw createUnsupportedSuiteError(suite, suiteContext);
857
874
  }
875
+ const debugSnapshot = captureDebugSnapshot ? getDebugSnapshot() : null;
858
876
 
859
877
  if (suite === 'bench' && suiteResult?.metrics?.workloadType === 'training') {
860
878
  const trainingReport = suiteResult?.metrics?.trainingMetricsReport;
@@ -886,6 +904,7 @@ export async function runBrowserSuite(options = {}) {
886
904
  metrics: suiteResult.metrics ?? null,
887
905
  output: reportOutput,
888
906
  memory: suiteResult.memoryStats ?? null,
907
+ debugSnapshot,
889
908
  ...options.report,
890
909
  };
891
910
  if (ulArtifacts.length > 0 || distillArtifacts.length > 0 || checkpointResumeTimeline.length > 0) {
@@ -907,7 +926,7 @@ export async function runBrowserSuite(options = {}) {
907
926
  report.timestamp = suiteTimestamp;
908
927
  }
909
928
  const reportInfo = await saveReport(modelId, report, { timestamp: report.timestamp });
910
- return { ...suiteResult, report, reportInfo };
929
+ return { ...suiteResult, debugSnapshot, report, reportInfo };
911
930
  });
912
931
  }
913
932
 
@@ -314,10 +314,7 @@ export class KVCache {
314
314
  layer.seqLen = Math.max(layer.seqLen, startPos + numNewTokens);
315
315
  this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numNewTokens);
316
316
 
317
- // Update global sequence length if this is the last layer
318
- if (layerIdx === this.numLayers - 1) {
319
- this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
320
- }
317
+ this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
321
318
  }
322
319
 
323
320
 
@@ -374,9 +371,7 @@ export class KVCache {
374
371
  layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
375
372
  this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
376
373
 
377
- if (layerIdx === this.numLayers - 1) {
378
- this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
379
- }
374
+ this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
380
375
  }
381
376
 
382
377
 
@@ -433,9 +428,7 @@ export class KVCache {
433
428
  layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
434
429
  this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
435
430
 
436
- if (layerIdx === this.numLayers - 1) {
437
- this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
438
- }
431
+ this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
439
432
  }
440
433
 
441
434
 
@@ -89,6 +89,9 @@ export function normalizeDiffusionMatmulLocationDtype(dtype) {
89
89
  return normalized;
90
90
  }
91
91
 
92
+ // Artifact-derived dtype inference: determines actual storage dtype from buffer byte size.
93
+ // This is NOT a config-bypass — it reads physical buffer dimensions (artifact-derived config),
94
+ // which is a valid merge layer per the config merge contract.
92
95
  export function inferDiffusionMatmulDtypeFromBuffer(weight, N, K, preferred) {
93
96
  const buffer = getBuffer(weight);
94
97
  if (!buffer || !Number.isFinite(N) || !Number.isFinite(K)) return preferred;
@@ -28,6 +28,7 @@ import { runResidualAdd, runScale, recordResidualAdd, recordScale } from '../../
28
28
  import { f16ToF32 } from '../../../loader/dtype-utils.js';
29
29
 
30
30
  const SUPPORTED_DIFFUSION_BACKEND_PIPELINES = new Set(['gpu']);
31
+ const DEFAULT_TIME_EMBED_DIM = 256;
31
32
  const SD3_TEXT_ENCODER_KEYS = ['text_encoder', 'text_encoder_2', 'text_encoder_3'];
32
33
  const SANA_TEXT_ENCODER_KEYS = ['text_encoder'];
33
34
 
@@ -492,7 +493,7 @@ export class DiffusionPipeline {
492
493
  const hiddenSize = (transformerConfig.num_attention_heads ?? 0) * (transformerConfig.attention_head_dim ?? 0);
493
494
  const patchSize = transformerConfig.patch_size ?? 2;
494
495
  const timeEmbedWeight = transformerResolver.get('time_text_embed.timestep_embedder.linear_1.weight');
495
- const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? 256;
496
+ const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? DEFAULT_TIME_EMBED_DIM;
496
497
  if (!Number.isFinite(hiddenSize) || hiddenSize <= 0) {
497
498
  throw new Error('Diffusion transformer config missing num_attention_heads/attention_head_dim.');
498
499
  }
@@ -44,7 +44,10 @@ import { initRoPEFrequencies } from '../text/init.js';
44
44
  import { processLayerGPU } from '../text/layer.js';
45
45
 
46
46
  const QUICK_GELU_ALPHA = 1.702;
47
+ const DEFAULT_TIMESTEP_EMBED_DIM = 256;
47
48
  const SUPPORTED_CLIP_HIDDEN_ACTIVATIONS = new Set(['gelu', 'quick_gelu']);
49
+ // Standard CLIP hidden activation per OpenAI CLIP specification.
50
+ const DEFAULT_CLIP_HIDDEN_ACT = 'gelu';
48
51
 
49
52
  function padTokens(tokens, maxLength, padTokenId) {
50
53
  if (!Number.isFinite(maxLength) || maxLength <= 0) {
@@ -100,11 +103,15 @@ function createVectorTensor(device, data, dtype, label) {
100
103
  return createTensor(buffer, dtype, [1, length], label);
101
104
  }
102
105
 
106
+ // Conservative fallback dtype for diffusion bias tensors when no dtype
107
+ // metadata is available. F32 avoids precision loss in bias additions.
108
+ const DEFAULT_BIAS_DTYPE = 'f32';
109
+
103
110
  function resolveBiasDtype(weight, weightsEntry, key) {
104
111
  if (weight && weight.dtype) return weight.dtype;
105
112
  const locationDtype = weightsEntry?.dtypes?.get(key);
106
113
  const mapped = normalizeDiffusionLocationDtype(locationDtype);
107
- return mapped || 'f32';
114
+ return mapped || DEFAULT_BIAS_DTYPE;
108
115
  }
109
116
 
110
117
  function createBiasTensorWithDtype(weight, weightsEntry, key, size, label) {
@@ -145,7 +152,7 @@ function createKernelOps(recorder) {
145
152
  }
146
153
 
147
154
  function resolveClipHiddenActivation(config) {
148
- const hiddenAct = config?.hidden_act ?? 'gelu';
155
+ const hiddenAct = config?.hidden_act ?? DEFAULT_CLIP_HIDDEN_ACT;
149
156
  if (!SUPPORTED_CLIP_HIDDEN_ACTIVATIONS.has(hiddenAct)) {
150
157
  throw new Error(
151
158
  `Unsupported CLIP hidden_act "${hiddenAct}". ` +
@@ -1099,7 +1106,7 @@ export async function buildTimestepEmbedding(timestep, weightsEntry, modelConfig
1099
1106
  const device = getDevice();
1100
1107
  if (!device) throw new Error('Timestep embedding requires a WebGPU device.');
1101
1108
 
1102
- const dim = options.dim ?? 256;
1109
+ const dim = options.dim ?? DEFAULT_TIMESTEP_EMBED_DIM;
1103
1110
  const half = Math.floor(dim / 2);
1104
1111
  const emb = new Float32Array(dim);
1105
1112
  const maxPeriod = 10000;
@@ -0,0 +1,12 @@
1
+ import type { Tensor } from '../../../../gpu/tensor.js';
2
+
3
+ export interface AttentionProjectionInputResult {
4
+ oProjInput: Tensor;
5
+ oProjInputTemp: Tensor | null;
6
+ }
7
+
8
+ export function prepareAttentionProjectionInput(
9
+ attnForProjection: Tensor,
10
+ matmulOutputDtype: string,
11
+ castToF16: (tensor: Tensor) => Promise<Tensor>
12
+ ): Promise<AttentionProjectionInputResult>;