@simulatte/doppler 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116):
  1. package/CHANGELOG.md +14 -1
  2. package/README.md +25 -6
  3. package/package.json +5 -3
  4. package/src/client/doppler-api.browser.js +6 -0
  5. package/src/client/doppler-api.d.ts +3 -0
  6. package/src/client/doppler-api.js +11 -2
  7. package/src/client/doppler-registry.js +3 -5
  8. package/src/client/doppler-registry.json +16 -0
  9. package/src/config/kernels/kernel-ref-digests.js +23 -21
  10. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  11. package/src/config/loader.js +6 -0
  12. package/src/config/platforms/loader.js +3 -1
  13. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  14. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  15. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  16. package/src/config/presets/kernel-paths/registry.json +7 -0
  17. package/src/config/presets/models/gemma3.json +2 -1
  18. package/src/config/presets/models/gemma4.json +61 -0
  19. package/src/config/presets/models/granite-docling.json +70 -0
  20. package/src/config/presets/models/lfm2.json +6 -1
  21. package/src/config/presets/models/qwen3_vl.json +40 -0
  22. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  23. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  24. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  25. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  26. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  27. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  28. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  29. package/src/config/runtime.js +3 -0
  30. package/src/config/schema/debug.schema.d.ts +40 -0
  31. package/src/config/schema/debug.schema.js +28 -0
  32. package/src/config/schema/index.js +2 -0
  33. package/src/config/schema/inference-defaults.schema.js +1 -1
  34. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  35. package/src/config/schema/memory-limits.schema.js +2 -2
  36. package/src/config/schema/storage.schema.js +1 -1
  37. package/src/converter/conversion-plan.js +1 -1
  38. package/src/converter/core.js +17 -8
  39. package/src/converter/quantizer.d.ts +5 -0
  40. package/src/converter/quantizer.js +15 -0
  41. package/src/distribution/shard-delivery.js +34 -0
  42. package/src/formats/rdrr/classification.js +32 -0
  43. package/src/gpu/kernel-runtime.js +4 -2
  44. package/src/gpu/kernels/attention.js +2 -1
  45. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  46. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  47. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  48. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  49. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  50. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  51. package/src/gpu/kernels/gated-short-conv.js +284 -0
  52. package/src/gpu/kernels/linear-attention-core.js +37 -17
  53. package/src/gpu/kernels/matmul-selection.js +1 -0
  54. package/src/gpu/kernels/matmul.d.ts +3 -0
  55. package/src/gpu/kernels/matmul.js +70 -1
  56. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  57. package/src/gpu/kernels/sample.js +1 -3
  58. package/src/gpu/kernels/sample.wgsl +39 -9
  59. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  60. package/src/gpu/kernels/shader-cache.js +9 -4
  61. package/src/inference/kv-cache/base.js +3 -10
  62. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  63. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
  64. package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
  65. package/src/inference/pipelines/text/attention/projections.js +13 -2
  66. package/src/inference/pipelines/text/attention/record.js +1 -0
  67. package/src/inference/pipelines/text/attention/run.js +9 -0
  68. package/src/inference/pipelines/text/config.d.ts +1 -0
  69. package/src/inference/pipelines/text/config.js +32 -4
  70. package/src/inference/pipelines/text/embed.js +26 -7
  71. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  72. package/src/inference/pipelines/text/execution-v0.js +12 -1
  73. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  74. package/src/inference/pipelines/text/generator-runtime.js +14 -0
  75. package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
  76. package/src/inference/pipelines/text/generator-steps.js +46 -29
  77. package/src/inference/pipelines/text/generator.d.ts +5 -0
  78. package/src/inference/pipelines/text/generator.js +320 -166
  79. package/src/inference/pipelines/text/init.d.ts +2 -0
  80. package/src/inference/pipelines/text/init.js +19 -5
  81. package/src/inference/pipelines/text/layer.js +37 -8
  82. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  83. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  84. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  85. package/src/inference/pipelines/text/ops.js +123 -53
  86. package/src/inference/pipelines/text/probes.js +1 -0
  87. package/src/inference/pipelines/text/state.js +2 -0
  88. package/src/inference/pipelines/text.d.ts +5 -0
  89. package/src/inference/pipelines/text.js +59 -1
  90. package/src/inference/pipelines/vision/encoder.js +386 -0
  91. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  92. package/src/inference/pipelines/vision/index.js +173 -0
  93. package/src/inference/pipelines/vision/ops.js +78 -0
  94. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  95. package/src/inference/test-harness.js +9 -7
  96. package/src/loader/doppler-loader.d.ts +3 -0
  97. package/src/loader/doppler-loader.js +20 -3
  98. package/src/loader/experts/expert-cache.js +6 -2
  99. package/src/loader/experts/expert-loader.js +6 -2
  100. package/src/loader/layer-loader.js +42 -3
  101. package/src/loader/manifest-config.js +3 -1
  102. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  103. package/src/loader/tensors/tensor-loader.js +124 -3
  104. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  105. package/src/rules/kernels/softmax.rules.json +2 -0
  106. package/src/rules/rule-registry.d.ts +1 -0
  107. package/src/rules/rule-registry.js +2 -0
  108. package/src/storage/quickstart-downloader.d.ts +3 -0
  109. package/src/storage/quickstart-downloader.js +27 -30
  110. package/src/tooling/node-converter.js +25 -7
  111. package/src/tooling/node-source-runtime.js +29 -5
  112. package/src/tooling/node-webgpu.js +24 -7
  113. package/src/utils/hf-resolve-url.d.ts +16 -0
  114. package/src/utils/hf-resolve-url.js +17 -0
  115. package/src/version.js +1 -1
  116. package/src/tooling/node-convert.d.ts +0 -54
@@ -34,6 +34,16 @@ fn apply_softcap(x: f32, softcap: f32) -> f32 {
34
34
  return softcap * tanh(x / softcap);
35
35
  }
36
36
 
37
+ fn candidate_beats(candidate_value: f32, candidate_index: u32, best_value: f32, best_index: u32) -> bool {
38
+ if (candidate_value > best_value) {
39
+ return true;
40
+ }
41
+ if (candidate_value < best_value) {
42
+ return false;
43
+ }
44
+ return candidate_index < best_index;
45
+ }
46
+
37
47
  @group(0) @binding(0) var<uniform> u: Uniforms;
38
48
  @group(0) @binding(1) var<storage, read> logits: array<f16>;
39
49
  @group(0) @binding(2) var<storage, read_write> output: array<u32>;
@@ -74,7 +84,7 @@ fn find_topk_phase1(
74
84
  while (idx < vocab_size) {
75
85
  if (idx != pad_id) {
76
86
  let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
77
- if (val > local_max) {
87
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
78
88
  local_max = val;
79
89
  local_max_idx = idx;
80
90
  }
@@ -89,7 +99,12 @@ fn find_topk_phase1(
89
99
  var stride = WORKGROUP_SIZE / 2u;
90
100
  while (stride > 0u) {
91
101
  if (thread_idx < stride) {
92
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
102
+ if (candidate_beats(
103
+ shared_values[thread_idx + stride],
104
+ shared_indices[thread_idx + stride],
105
+ shared_values[thread_idx],
106
+ shared_indices[thread_idx]
107
+ )) {
93
108
  shared_values[thread_idx] = shared_values[thread_idx + stride];
94
109
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
95
110
  }
@@ -130,7 +145,7 @@ fn find_topk_phase2(
130
145
  var max_val = shared_values[k];
131
146
 
132
147
  for (var i: u32 = k + 1u; i < num_candidates; i = i + 1u) {
133
- if (shared_values[i] > max_val) {
148
+ if (candidate_beats(shared_values[i], shared_indices[i], max_val, shared_indices[max_idx])) {
134
149
  max_val = shared_values[i];
135
150
  max_idx = i;
136
151
  }
@@ -218,7 +233,7 @@ fn sample_single_pass(
218
233
  while (idx < vocab_size) {
219
234
  if (idx != pad_id) {
220
235
  let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
221
- if (val > local_max) {
236
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
222
237
  local_max = val;
223
238
  local_max_idx = idx;
224
239
  }
@@ -233,7 +248,12 @@ fn sample_single_pass(
233
248
  var stride = WORKGROUP_SIZE / 2u;
234
249
  while (stride > 0u) {
235
250
  if (thread_idx < stride) {
236
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
251
+ if (candidate_beats(
252
+ shared_values[thread_idx + stride],
253
+ shared_indices[thread_idx + stride],
254
+ shared_values[thread_idx],
255
+ shared_indices[thread_idx]
256
+ )) {
237
257
  shared_values[thread_idx] = shared_values[thread_idx + stride];
238
258
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
239
259
  }
@@ -267,7 +287,7 @@ fn argmax(
267
287
  while (idx < vocab_size) {
268
288
  if (idx != pad_id) {
269
289
  let val = apply_softcap(f32(logits[idx]), softcap);
270
- if (val > local_max) {
290
+ if (candidate_beats(val, idx, local_max, local_max_idx)) {
271
291
  local_max = val;
272
292
  local_max_idx = idx;
273
293
  }
@@ -282,7 +302,12 @@ fn argmax(
282
302
  var stride = WORKGROUP_SIZE / 2u;
283
303
  while (stride > 0u) {
284
304
  if (thread_idx < stride) {
285
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
305
+ if (candidate_beats(
306
+ shared_values[thread_idx + stride],
307
+ shared_indices[thread_idx + stride],
308
+ shared_values[thread_idx],
309
+ shared_indices[thread_idx]
310
+ )) {
286
311
  shared_values[thread_idx] = shared_values[thread_idx + stride];
287
312
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
288
313
  }
@@ -316,7 +341,12 @@ fn argmax_reduce(
316
341
  var stride = WORKGROUP_SIZE / 2u;
317
342
  while (stride > 0u) {
318
343
  if (thread_idx < stride) {
319
- if (shared_values[thread_idx + stride] > shared_values[thread_idx]) {
344
+ if (candidate_beats(
345
+ shared_values[thread_idx + stride],
346
+ shared_indices[thread_idx + stride],
347
+ shared_values[thread_idx],
348
+ shared_indices[thread_idx]
349
+ )) {
320
350
  shared_values[thread_idx] = shared_values[thread_idx + stride];
321
351
  shared_indices[thread_idx] = shared_indices[thread_idx + stride];
322
352
  }
@@ -133,10 +133,15 @@ export async function compileShader(
133
133
  source,
134
134
  label
135
135
  ) {
136
- const module = device.createShaderModule({
137
- label,
138
- code: source,
139
- });
136
+ let module;
137
+ try {
138
+ module = device.createShaderModule({
139
+ label,
140
+ code: source,
141
+ });
142
+ } catch (err) {
143
+ throw new Error(`createShaderModule failed for "${label}": ${err.message}`);
144
+ }
140
145
 
141
146
  // Check for compilation errors (getCompilationInfo not available in all WebGPU providers)
142
147
  const compilationInfo = typeof module.getCompilationInfo === 'function'
@@ -314,10 +314,7 @@ export class KVCache {
314
314
  layer.seqLen = Math.max(layer.seqLen, startPos + numNewTokens);
315
315
  this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numNewTokens);
316
316
 
317
- // Update global sequence length if this is the last layer
318
- if (layerIdx === this.numLayers - 1) {
319
- this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
320
- }
317
+ this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
321
318
  }
322
319
 
323
320
 
@@ -374,9 +371,7 @@ export class KVCache {
374
371
  layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
375
372
  this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
376
373
 
377
- if (layerIdx === this.numLayers - 1) {
378
- this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
379
- }
374
+ this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
380
375
  }
381
376
 
382
377
 
@@ -433,9 +428,7 @@ export class KVCache {
433
428
  layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
434
429
  this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
435
430
 
436
- if (layerIdx === this.numLayers - 1) {
437
- this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
438
- }
431
+ this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
439
432
  }
440
433
 
441
434
 
@@ -28,6 +28,7 @@ import { runResidualAdd, runScale, recordResidualAdd, recordScale } from '../../
28
28
  import { f16ToF32 } from '../../../loader/dtype-utils.js';
29
29
 
30
30
  const SUPPORTED_DIFFUSION_BACKEND_PIPELINES = new Set(['gpu']);
31
+ const DEFAULT_TIME_EMBED_DIM = 256;
31
32
  const SD3_TEXT_ENCODER_KEYS = ['text_encoder', 'text_encoder_2', 'text_encoder_3'];
32
33
  const SANA_TEXT_ENCODER_KEYS = ['text_encoder'];
33
34
 
@@ -492,7 +493,7 @@ export class DiffusionPipeline {
492
493
  const hiddenSize = (transformerConfig.num_attention_heads ?? 0) * (transformerConfig.attention_head_dim ?? 0);
493
494
  const patchSize = transformerConfig.patch_size ?? 2;
494
495
  const timeEmbedWeight = transformerResolver.get('time_text_embed.timestep_embedder.linear_1.weight');
495
- const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? 256;
496
+ const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? DEFAULT_TIME_EMBED_DIM;
496
497
  if (!Number.isFinite(hiddenSize) || hiddenSize <= 0) {
497
498
  throw new Error('Diffusion transformer config missing num_attention_heads/attention_head_dim.');
498
499
  }
@@ -44,6 +44,7 @@ import { initRoPEFrequencies } from '../text/init.js';
44
44
  import { processLayerGPU } from '../text/layer.js';
45
45
 
46
46
  const QUICK_GELU_ALPHA = 1.702;
47
+ const DEFAULT_TIMESTEP_EMBED_DIM = 256;
47
48
  const SUPPORTED_CLIP_HIDDEN_ACTIVATIONS = new Set(['gelu', 'quick_gelu']);
48
49
  // Standard CLIP hidden activation per OpenAI CLIP specification.
49
50
  const DEFAULT_CLIP_HIDDEN_ACT = 'gelu';
@@ -1105,7 +1106,7 @@ export async function buildTimestepEmbedding(timestep, weightsEntry, modelConfig
1105
1106
  const device = getDevice();
1106
1107
  if (!device) throw new Error('Timestep embedding requires a WebGPU device.');
1107
1108
 
1108
- const dim = options.dim ?? 256;
1109
+ const dim = options.dim ?? DEFAULT_TIMESTEP_EMBED_DIM;
1109
1110
  const half = Math.floor(dim / 2);
1110
1111
  const emb = new Float32Array(dim);
1111
1112
  const maxPeriod = 10000;
@@ -3,6 +3,7 @@ import type { Tensor } from '../../../../gpu/tensor.js';
3
3
  import type { WeightBuffer, CpuWeightBuffer } from '../../../../gpu/weight-buffer.js';
4
4
  import type { LayerWeights } from '../types.js';
5
5
  import type { LoRAAdapter } from '../lora.js';
6
+ import type { MatmulDebugConfigSchema } from '../../../../config/schema/debug.schema.js';
6
7
 
7
8
  export interface AttentionInputInfo {
8
9
  phase: 'prefill' | 'decode';
@@ -76,11 +77,13 @@ export interface ProjectAttentionQKVOptions {
76
77
  getWeightBuffer?: (weight: GPUBuffer | WeightBuffer | Float32Array | ArrayBuffer | CpuWeightBuffer, label: string) => GPUBuffer | WeightBuffer;
77
78
  lora?: LoRAAdapter | null;
78
79
  releaseTemporary: (buffer: GPUBuffer) => void;
80
+ matmulDebug?: MatmulDebugConfigSchema | null;
79
81
  onFusedQKV?: ((info: { qSize: number; kSize: number; vSize: number; totalSize: number }) => void) | null;
80
82
  }
81
83
 
82
84
  export interface ProjectAttentionQKVResult {
83
85
  qTensor: Tensor;
86
+ qGateTensor: Tensor | null;
84
87
  kTensor: Tensor;
85
88
  vTensor: Tensor;
86
89
  usedFusedQKV: boolean;
@@ -71,9 +71,10 @@ async function projectSingleQkvTensor({
71
71
  matmulOutputDtype,
72
72
  getWeightBuffer,
73
73
  lora,
74
+ matmulDebug,
74
75
  releaseTemporary,
75
76
  }) {
76
- const runMatmulForMode = getMatmulRunner(recorder);
77
+ const runMatmulForMode = getMatmulRunner(recorder);
77
78
  const layerWeight = layerWeights?.[weightKey];
78
79
  if (!layerWeight) {
79
80
  throw new Error(`Attention projection requires ${weightKey}.`);
@@ -91,6 +92,7 @@ async function projectSingleQkvTensor({
91
92
  layerIdx,
92
93
  kernelPath,
93
94
  outputDtype: matmulOutputDtype,
95
+ matmulDebug,
94
96
  });
95
97
  } finally {
96
98
  releaseOwnedWeightBuffer(layerWeight, projBuffer, releaseTemporary);
@@ -178,6 +180,7 @@ async function projectQueryWithOptionalGate({
178
180
  matmulOutputDtype,
179
181
  getWeightBuffer,
180
182
  lora,
183
+ matmulDebug,
181
184
  releaseTemporary,
182
185
  attentionOutputGate,
183
186
  }) {
@@ -205,6 +208,7 @@ async function projectQueryWithOptionalGate({
205
208
  matmulOutputDtype,
206
209
  getWeightBuffer,
207
210
  lora,
211
+ matmulDebug,
208
212
  releaseTemporary,
209
213
  });
210
214
  return { qTensor, qGateTensor: null };
@@ -226,6 +230,7 @@ async function projectQueryWithOptionalGate({
226
230
  layerIdx,
227
231
  kernelPath,
228
232
  outputDtype: matmulOutputDtype,
233
+ matmulDebug,
229
234
  });
230
235
 
231
236
  const split = await runSplitQGForMode(fullQGTensor, {
@@ -329,6 +334,7 @@ export async function projectAttentionQKV({
329
334
  matmulOutputDtype,
330
335
  getWeightBuffer,
331
336
  lora,
337
+ matmulDebug,
332
338
  releaseTemporary,
333
339
  onFusedQKV = null,
334
340
  attentionOutputGate = false,
@@ -339,7 +345,8 @@ export async function projectAttentionQKV({
339
345
  const hasLoRA = getLoRAModule(lora, layerIdx, 'q_proj')
340
346
  || getLoRAModule(lora, layerIdx, 'k_proj')
341
347
  || getLoRAModule(lora, layerIdx, 'v_proj');
342
- const useFusedQKV = selectRuleValue('inference', 'attention', 'useFusedQkv', {
348
+ const forceSplitQKV = Boolean(matmulDebug?.enabled) && matmulDebug?.forceSplitQKV === true;
349
+ const useFusedQKV = !forceSplitQKV && selectRuleValue('inference', 'attention', 'useFusedQkv', {
343
350
  hasQkvProj: Boolean(layerWeights.qkvProj),
344
351
  hasQkvSizes: Boolean(layerWeights.qkvSizes),
345
352
  hasLoRA: Boolean(hasLoRA),
@@ -356,6 +363,7 @@ export async function projectAttentionQKV({
356
363
  layerIdx,
357
364
  kernelPath,
358
365
  outputDtype: matmulOutputDtype,
366
+ matmulDebug,
359
367
  });
360
368
  const split = await runSplitForMode(qkvTensor, {
361
369
  numTokens,
@@ -394,6 +402,7 @@ export async function projectAttentionQKV({
394
402
  matmulOutputDtype,
395
403
  getWeightBuffer,
396
404
  lora,
405
+ matmulDebug,
397
406
  releaseTemporary,
398
407
  attentionOutputGate,
399
408
  }));
@@ -414,6 +423,7 @@ export async function projectAttentionQKV({
414
423
  matmulOutputDtype,
415
424
  getWeightBuffer,
416
425
  lora,
426
+ matmulDebug,
417
427
  releaseTemporary,
418
428
  });
419
429
 
@@ -433,6 +443,7 @@ export async function projectAttentionQKV({
433
443
  matmulOutputDtype,
434
444
  getWeightBuffer,
435
445
  lora,
446
+ matmulDebug,
436
447
  releaseTemporary,
437
448
  });
438
449
 
@@ -167,6 +167,7 @@ export async function recordLayerAttentionGPU(
167
167
  matmulOutputDtype,
168
168
  getWeightBuffer,
169
169
  lora,
170
+ matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
170
171
  attentionOutputGate: config.attentionOutputGate === true,
171
172
  releaseTemporary: (buffer) => releaseOrTrack(recorder, buffer),
172
173
  onFusedQKV: layerIdx === 0 && isPrefill
@@ -166,6 +166,14 @@ export async function runLayerAttentionGPU(
166
166
  dtype: normed.dtype,
167
167
  });
168
168
  }
169
+
170
+ await runProbes('post_input_norm', normed.buffer, {
171
+ layerIdx,
172
+ numTokens,
173
+ hiddenSize,
174
+ probes: state.debugProbes,
175
+ dtype: normed.dtype,
176
+ });
169
177
  }
170
178
 
171
179
  // Debug: Check normed input for L0 prefill
@@ -218,6 +226,7 @@ export async function runLayerAttentionGPU(
218
226
  matmulOutputDtype,
219
227
  getWeightBuffer,
220
228
  lora,
229
+ matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
221
230
  attentionOutputGate: config.attentionOutputGate === true,
222
231
  releaseTemporary: (buffer) => releaseBuffer(buffer),
223
232
  onFusedQKV: layerIdx === 0 && isPrefill
@@ -150,6 +150,7 @@ export interface ParsedModelConfig {
150
150
  ropeLocalTheta: number | null;
151
151
  ropeRotaryDim: number;
152
152
  ropeInterleaved: boolean;
153
+ mropeInterleaved: boolean;
153
154
  mropeSection: number[] | null;
154
155
  partialRotaryFactor: number | null;
155
156
  ropeScale: number;
@@ -349,6 +349,24 @@ function normalizeLayerTypeTag(value) {
349
349
  return null;
350
350
  }
351
351
 
352
+ function resolveVisionConfig(rawConfig, manifest) {
353
+ const vc = rawConfig?.vision_config ?? manifest?.config?.vision_config;
354
+ if (!vc || typeof vc !== 'object') return null;
355
+ return {
356
+ depth: vc.depth ?? 24,
357
+ hiddenSize: vc.hidden_size ?? 1024,
358
+ intermediateSize: vc.intermediate_size ?? 4096,
359
+ numHeads: vc.num_heads ?? 16,
360
+ outHiddenSize: vc.out_hidden_size ?? vc.hidden_size ?? 1024,
361
+ patchSize: vc.patch_size ?? 16,
362
+ spatialMergeSize: vc.spatial_merge_size ?? 2,
363
+ temporalPatchSize: vc.temporal_patch_size ?? 2,
364
+ eps: vc.eps ?? 1e-6,
365
+ deepstackVisualIndexes: Array.isArray(vc.deepstack_visual_indexes) ? vc.deepstack_visual_indexes : [],
366
+ imageTokenId: rawConfig?.image_token_id ?? manifest?.image_token_id ?? null,
367
+ };
368
+ }
369
+
352
370
  function parseCustomLayerTypes(layerTypes, numLayers, modelId) {
353
371
  if (!Array.isArray(layerTypes) || layerTypes.length === 0) {
354
372
  throw new Error(
@@ -512,10 +530,18 @@ export function toParsedConfigFromMerged(merged, manifest) {
512
530
  // RoPE scaling - use manifest inference as source of truth (not raw config)
513
531
  const ropeScale = inf.rope.ropeScalingFactor;
514
532
  const ropeScalingType = inf.rope.ropeScalingType;
515
- const ropeLocalScale = inf.rope.ropeLocalScalingFactor ?? ropeScale;
516
- const ropeLocalScalingType = inf.rope.ropeLocalScalingType ?? ropeScalingType;
533
+ const ropeLocalScale = inf.rope.ropeLocalScalingFactor;
534
+ const ropeLocalScalingType = inf.rope.ropeLocalScalingType;
517
535
  const partialRotaryFactor = inf.rope.partialRotaryFactor;
518
- const ropeInterleaved = inf.rope.mropeInterleaved === true;
536
+ const mropeInterleaved = inf.rope.mropeInterleaved === true;
537
+ const ropeInterleaved = false;
538
+
539
+ if (ropeLocalScale == null && (inf.rope.ropeLocalTheta != null || inf.rope.mropeSection != null)) {
540
+ throw new Error(
541
+ `Model "${merged.modelId}" uses hybrid/mRoPE but is missing rope.ropeLocalScalingFactor in manifest. ` +
542
+ `Re-convert the model using the latest converter or update the manifest to include an explicit scale.`
543
+ );
544
+ }
519
545
  const mropeSection = Array.isArray(inf.rope.mropeSection)
520
546
  ? inf.rope.mropeSection.map((entry) => Math.trunc(Number(entry)))
521
547
  : null;
@@ -525,7 +551,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
525
551
  `Manifest "${merged.modelId}" has invalid rope.mropeSection; expected positive integers.`
526
552
  );
527
553
  }
528
- if (ropeInterleaved && mropeSection) {
554
+ if (mropeInterleaved && mropeSection) {
529
555
  const doubledMropeDim = mropeSection.reduce((sum, entry) => sum + entry, 0) * 2;
530
556
  if (doubledMropeDim !== ropeRotaryDim) {
531
557
  throw new Error(
@@ -610,6 +636,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
610
636
  ropeLocalTheta: inf.rope.ropeLocalTheta,
611
637
  ropeRotaryDim,
612
638
  ropeInterleaved,
639
+ mropeInterleaved,
613
640
  mropeSection,
614
641
  partialRotaryFactor,
615
642
  ropeScale,
@@ -650,6 +677,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
650
677
  chatTemplateType,
651
678
  chatTemplateEnabled,
652
679
  kernelPath: inf.defaultKernelPath,
680
+ visionConfig: resolveVisionConfig(config, manifest),
653
681
  };
654
682
  }
655
683
 
@@ -9,6 +9,7 @@ import { decodeReadback } from './debug-utils/index.js';
9
9
  import { createTensor } from '../../../gpu/tensor.js';
10
10
  import { castF32ToF16, recordCastF32ToF16 } from '../../../gpu/kernels/cast.js';
11
11
  import { isCpuWeightBuffer } from '../../../gpu/weight-buffer.js';
12
+ import { f16ToF32 } from '../../../loader/dtype-utils.js';
12
13
  import { selectRuleValue } from '../../../rules/rule-registry.js';
13
14
 
14
15
  const scaleShaderCode = `
@@ -202,11 +203,19 @@ export async function embed(tokenIds, embedBuffer, config) {
202
203
 
203
204
  const dtype = selectRuleValue('inference', 'dtype', 'f16OrF32', { useF16 });
204
205
 
205
- const cpuEmbeddings = isCpuWeightBuffer(embedBuffer)
206
- ? embedBuffer.data
207
- : embedBuffer instanceof Float32Array
208
- ? embedBuffer
209
- : null;
206
+ let cpuEmbeddings = null;
207
+ if (isCpuWeightBuffer(embedBuffer)) {
208
+ const bufDtype = embedBuffer.dtype;
209
+ if (bufDtype !== 'f32' && bufDtype !== 'f16') {
210
+ throw new Error(
211
+ `[Embed] CPU embedding buffer has unsupported dtype '${bufDtype}'; ` +
212
+ `only 'f32' and 'f16' are supported in the CPU gather path.`
213
+ );
214
+ }
215
+ cpuEmbeddings = embedBuffer.data;
216
+ } else if (embedBuffer instanceof Float32Array) {
217
+ cpuEmbeddings = embedBuffer;
218
+ }
210
219
 
211
220
  if (debug) {
212
221
  trace.embed(`tokens=${numTokens}, hidden=${hiddenSize}, vocab=${vocabSize}, scaleEmbeddings=${scaleEmbeddings}, transpose=${transpose}, indexOffset=${indexOffset}, activationDtype=${activationDtype}, useF16=${useF16}`);
@@ -226,18 +235,28 @@ export async function embed(tokenIds, embedBuffer, config) {
226
235
  }
227
236
 
228
237
  const output = new Float32Array(numTokens * hiddenSize);
238
+ // Check actual data type: loader's f16_to_f32 CPU path already decodes F16 into Float32Array,
239
+ // so dtype='f16' does not reliably indicate raw F16 bytes. Only Uint16Array needs per-element decoding.
240
+ const isF16Cpu = cpuEmbeddings instanceof Uint16Array;
229
241
  if (!transpose) {
230
242
  for (let t = 0; t < numTokens; t++) {
231
243
  const tokenId = (tokenIdArray)[t];
232
244
  const srcOffset = tokenId * hiddenSize;
233
- output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
245
+ if (isF16Cpu) {
246
+ for (let h = 0; h < hiddenSize; h++) {
247
+ output[t * hiddenSize + h] = f16ToF32(cpuEmbeddings[srcOffset + h]);
248
+ }
249
+ } else {
250
+ output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
251
+ }
234
252
  }
235
253
  } else {
236
254
  for (let t = 0; t < numTokens; t++) {
237
255
  const tokenId = (tokenIdArray)[t];
238
256
  const dstOffset = t * hiddenSize;
239
257
  for (let h = 0; h < hiddenSize; h++) {
240
- output[dstOffset + h] = cpuEmbeddings[h * vocabSize + tokenId];
258
+ const raw = cpuEmbeddings[h * vocabSize + tokenId];
259
+ output[dstOffset + h] = isF16Cpu ? f16ToF32(raw) : raw;
241
260
  }
242
261
  }
243
262
  }
@@ -1,7 +1,7 @@
1
1
  import { selectRuleValue } from '../../../rules/rule-registry.js';
2
2
  import { cloneJson, isPhaseMatch, normalizeDtype, requireSessionActivationDtype, stepHasLayer } from './execution-v0-contract-helpers.js';
3
3
 
4
- const PIPELINE_COMPATIBLE_OPS = new Set([
4
+ export const PIPELINE_COMPATIBLE_OPS = new Set([
5
5
  'save',
6
6
  'load',
7
7
  'conv',
@@ -191,8 +191,15 @@ export function buildLayerPipelineFromExecution(steps) {
191
191
  if (layerSectionSteps.length === 0) {
192
192
  return null;
193
193
  }
194
- if (layerSectionSteps.some((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))) {
195
- return null;
194
+ const incompatibleOps = [
195
+ ...new Set(
196
+ layerSectionSteps
197
+ .filter((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))
198
+ .map((step) => step.op)
199
+ ),
200
+ ];
201
+ if (incompatibleOps.length > 0) {
202
+ return { incompatibleOps };
196
203
  }
197
204
 
198
205
  const layerSteps = layerSectionSteps
@@ -31,6 +31,7 @@ import {
31
31
  buildModelRuntimeOverrides,
32
32
  buildSessionRuntimePatch,
33
33
  resolveFinitenessFallbackKernelPathId,
34
+ PIPELINE_COMPATIBLE_OPS,
34
35
  } from './execution-v0-runtime-builders.js';
35
36
 
36
37
  export function hasExecutionV0(manifestInference) {
@@ -152,7 +153,17 @@ export function compileExecutionV0(options = {}) {
152
153
  numLayers,
153
154
  finitenessFallbackKernelPathId
154
155
  );
155
- const layerPipeline = buildLayerPipelineFromExecution(resolvedSteps);
156
+ const layerPipelineResult = buildLayerPipelineFromExecution(resolvedSteps);
157
+ if (layerPipelineResult?.incompatibleOps && !kernelPath) {
158
+ throw new Error(
159
+ `[ExecutionV0] manifest.inference.execution.steps contains layer ops that are not ` +
160
+ `compatible with the JS layer pipeline and no inline kernelPath was built to cover execution. ` +
161
+ `Unsupported ops: ${layerPipelineResult.incompatibleOps.join(', ')}. ` +
162
+ `Either add explicit kernel references to each step (for inline-kernel execution) ` +
163
+ `or restrict layer ops to: ${[...PIPELINE_COMPATIBLE_OPS].join(', ')}.`
164
+ );
165
+ }
166
+ const layerPipeline = layerPipelineResult?.incompatibleOps ? null : layerPipelineResult;
156
167
  const sessionPatch = buildSessionRuntimePatch(resolvedSession);
157
168
  const modelOverrides = buildModelRuntimeOverrides(manifestInference);
158
169
  for (const [path, source] of sessionSourceByPath.entries()) {
@@ -111,6 +111,7 @@ export function buildLayerContext(state, recorder, isDecodeMode, debugLayers, de
111
111
  ropeLocalCos: state.ropeLocalCos,
112
112
  ropeLocalSin: state.ropeLocalSin,
113
113
  linearAttentionRuntime: state.linearAttentionRuntime,
114
+ convLayerStates: state.convLayerStates,
114
115
  weightConfig: getWeightBufferConfig(state),
115
116
  debugFlags: state.debugFlags,
116
117
  debugProbes: state.runtimeConfig.shared.debug.probes,
@@ -139,6 +139,12 @@ export function resolveStepOptions(state, options = {}) {
139
139
  const executionPlan = resolveExecutionSessionPlan(state, options);
140
140
 
141
141
  return {
142
+ seed: resolveConfiguredValue(
143
+ options.seed,
144
+ undefined,
145
+ 'options.seed',
146
+ (value) => Number.isFinite(value) && value >= 0
147
+ ),
142
148
  temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
143
149
  topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),
144
150
  topK: resolveConfiguredValue(options.topK, samplingDefaults.topK, 'options.topK'),
@@ -165,6 +171,12 @@ export function resolveGenerateOptions(state, options = {}) {
165
171
  const executionPlan = resolveExecutionSessionPlan(state, options);
166
172
 
167
173
  return {
174
+ seed: resolveConfiguredValue(
175
+ options.seed,
176
+ undefined,
177
+ 'options.seed',
178
+ (value) => Number.isFinite(value) && value >= 0
179
+ ),
168
180
  maxTokens: executionPlan.maxTokens,
169
181
  temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
170
182
  topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),
@@ -191,6 +203,7 @@ export function resolveGenerateOptions(state, options = {}) {
191
203
  batchSize: executionPlan.batchSize,
192
204
  stopCheckMode: executionPlan.stopCheckMode,
193
205
  executionPlan,
206
+ images: options.images ?? null,
194
207
  };
195
208
  }
196
209
 
@@ -205,6 +218,7 @@ export function resolvePrefillOptions(state, options = {}) {
205
218
  disableCommandBatching: executionPlan.disableCommandBatching,
206
219
  disableMultiTokenDecode: executionPlan.disableMultiTokenDecode,
207
220
  executionPlan,
221
+ images: options.images ?? null,
208
222
  };
209
223
  }
210
224
 
@@ -12,6 +12,15 @@ export interface BatchDecodeSelectionConfig {
12
12
 
13
13
  export declare function shouldUseBatchDecode(config: BatchDecodeSelectionConfig): boolean;
14
14
 
15
+ export interface FusedDecodeSamplingConfig {
16
+ recorderEnabled: boolean;
17
+ gpuSamplingEnabled: boolean;
18
+ fusedDecodeDisabled: boolean;
19
+ layerTypes?: string[] | null;
20
+ }
21
+
22
+ export declare function shouldUseFusedDecodeSampling(config: FusedDecodeSamplingConfig): boolean;
23
+
15
24
  export declare function resolveBatchStop(
16
25
  tokens: number[],
17
26
  stopFlags: Uint32Array | null,