@simulatte/doppler 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. package/CHANGELOG.md +14 -1
  2. package/README.md +25 -6
  3. package/package.json +5 -3
  4. package/src/client/doppler-api.browser.js +6 -0
  5. package/src/client/doppler-api.d.ts +3 -0
  6. package/src/client/doppler-api.js +11 -2
  7. package/src/client/doppler-registry.js +3 -5
  8. package/src/client/doppler-registry.json +16 -0
  9. package/src/config/kernels/kernel-ref-digests.js +23 -21
  10. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  11. package/src/config/loader.js +6 -0
  12. package/src/config/platforms/loader.js +3 -1
  13. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  14. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  15. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  16. package/src/config/presets/kernel-paths/registry.json +7 -0
  17. package/src/config/presets/models/gemma3.json +2 -1
  18. package/src/config/presets/models/gemma4.json +61 -0
  19. package/src/config/presets/models/granite-docling.json +70 -0
  20. package/src/config/presets/models/lfm2.json +6 -1
  21. package/src/config/presets/models/qwen3_vl.json +40 -0
  22. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  23. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  24. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  25. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  26. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  27. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  28. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  29. package/src/config/runtime.js +3 -0
  30. package/src/config/schema/debug.schema.d.ts +40 -0
  31. package/src/config/schema/debug.schema.js +28 -0
  32. package/src/config/schema/index.js +2 -0
  33. package/src/config/schema/inference-defaults.schema.js +1 -1
  34. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  35. package/src/config/schema/memory-limits.schema.js +2 -2
  36. package/src/config/schema/storage.schema.js +1 -1
  37. package/src/converter/conversion-plan.js +1 -1
  38. package/src/converter/core.js +17 -8
  39. package/src/converter/quantizer.d.ts +5 -0
  40. package/src/converter/quantizer.js +15 -0
  41. package/src/distribution/shard-delivery.js +34 -0
  42. package/src/formats/rdrr/classification.js +32 -0
  43. package/src/gpu/kernel-runtime.js +4 -2
  44. package/src/gpu/kernels/attention.js +2 -1
  45. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  46. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  47. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  48. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  49. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  50. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  51. package/src/gpu/kernels/gated-short-conv.js +284 -0
  52. package/src/gpu/kernels/linear-attention-core.js +37 -17
  53. package/src/gpu/kernels/matmul-selection.js +1 -0
  54. package/src/gpu/kernels/matmul.d.ts +3 -0
  55. package/src/gpu/kernels/matmul.js +70 -1
  56. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  57. package/src/gpu/kernels/sample.js +1 -3
  58. package/src/gpu/kernels/sample.wgsl +39 -9
  59. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  60. package/src/gpu/kernels/shader-cache.js +9 -4
  61. package/src/inference/kv-cache/base.js +3 -10
  62. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  63. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
  64. package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
  65. package/src/inference/pipelines/text/attention/projections.js +13 -2
  66. package/src/inference/pipelines/text/attention/record.js +1 -0
  67. package/src/inference/pipelines/text/attention/run.js +9 -0
  68. package/src/inference/pipelines/text/config.d.ts +1 -0
  69. package/src/inference/pipelines/text/config.js +32 -4
  70. package/src/inference/pipelines/text/embed.js +26 -7
  71. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  72. package/src/inference/pipelines/text/execution-v0.js +12 -1
  73. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  74. package/src/inference/pipelines/text/generator-runtime.js +14 -0
  75. package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
  76. package/src/inference/pipelines/text/generator-steps.js +46 -29
  77. package/src/inference/pipelines/text/generator.d.ts +5 -0
  78. package/src/inference/pipelines/text/generator.js +320 -166
  79. package/src/inference/pipelines/text/init.d.ts +2 -0
  80. package/src/inference/pipelines/text/init.js +19 -5
  81. package/src/inference/pipelines/text/layer.js +37 -8
  82. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  83. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  84. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  85. package/src/inference/pipelines/text/ops.js +123 -53
  86. package/src/inference/pipelines/text/probes.js +1 -0
  87. package/src/inference/pipelines/text/state.js +2 -0
  88. package/src/inference/pipelines/text.d.ts +5 -0
  89. package/src/inference/pipelines/text.js +59 -1
  90. package/src/inference/pipelines/vision/encoder.js +386 -0
  91. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  92. package/src/inference/pipelines/vision/index.js +173 -0
  93. package/src/inference/pipelines/vision/ops.js +78 -0
  94. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  95. package/src/inference/test-harness.js +9 -7
  96. package/src/loader/doppler-loader.d.ts +3 -0
  97. package/src/loader/doppler-loader.js +20 -3
  98. package/src/loader/experts/expert-cache.js +6 -2
  99. package/src/loader/experts/expert-loader.js +6 -2
  100. package/src/loader/layer-loader.js +42 -3
  101. package/src/loader/manifest-config.js +3 -1
  102. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  103. package/src/loader/tensors/tensor-loader.js +124 -3
  104. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  105. package/src/rules/kernels/softmax.rules.json +2 -0
  106. package/src/rules/rule-registry.d.ts +1 -0
  107. package/src/rules/rule-registry.js +2 -0
  108. package/src/storage/quickstart-downloader.d.ts +3 -0
  109. package/src/storage/quickstart-downloader.js +27 -30
  110. package/src/tooling/node-converter.js +25 -7
  111. package/src/tooling/node-source-runtime.js +29 -5
  112. package/src/tooling/node-webgpu.js +24 -7
  113. package/src/utils/hf-resolve-url.d.ts +16 -0
  114. package/src/utils/hf-resolve-url.js +17 -0
  115. package/src/version.js +1 -1
  116. package/src/tooling/node-convert.d.ts +0 -54
package/src/inference/pipelines/vision/encoder.js (new file)
@@ -0,0 +1,386 @@
+
+
+ import { trace } from '../../../debug/index.js';
+ import { getDevice, getKernelCapabilities } from '../../../gpu/device.js';
+ import { acquireBuffer, releaseBuffer } from '../../../memory/buffer-pool.js';
+ import {
+   doLayerNorm, doMatmul, doGelu, doResidualAdd, doCast,
+ } from './ops.js';
+
+ /**
+  * Run the Qwen3-VL vision encoder on preprocessed image patches.
+  *
+  * Architecture:
+  *   patch_embed (conv2d 3->hidden, stride=patchSize) -> [numPatches, hiddenSize]
+  *   for each ViT block:
+  *     x = layerNorm(x)
+  *     x = x + selfAttention(x)   (no KV cache — full prefill attention)
+  *     x = layerNorm(x)
+  *     x = x + FFN(x)             (gelu activation)
+  *   spatialMerge(x) -> [numMergedPatches, outHiddenSize]
+  *
+  * @param {object} params
+  * @param {GPUBuffer} params.patchBuffer Preprocessed patches [numPatches, hiddenSize] on GPU
+  * @param {number} params.numPatches Total number of patches
+  * @param {object} params.visionConfig Vision config from manifest
+  * @param {object} params.weights Vision encoder weight buffers keyed by tensor name
+  * @param {object} params.pipelineState Shared pipeline state for buffer tracking
+  * @returns {Promise<{ features: GPUBuffer, numTokens: number }>}
+  */
+ export async function runVisionEncoder(params) {
+   const {
+     patchBuffer,
+     numPatches,
+     visionConfig,
+     weights,
+     pipelineState,
+   } = params;
+
+   const {
+     depth,
+     hiddenSize,
+     intermediateSize,
+     numHeads,
+     outHiddenSize,
+     spatialMergeSize,
+     eps = 1e-6,
+   } = visionConfig;
+
+   const headDim = Math.floor(hiddenSize / numHeads);
+   const device = getDevice();
+
+   trace('vision', `encoder: depth=${depth} hidden=${hiddenSize} heads=${numHeads} patches=${numPatches}`);
+
+   let hidden = patchBuffer;
+
+   // Run ViT transformer blocks.
+   for (let i = 0; i < depth; i++) {
+     const prefix = `visual.blocks.${i}`;
+
+     // Pre-attention layer norm.
+     const normed1 = await doLayerNorm(hidden, weights[`${prefix}.norm1.weight`], weights[`${prefix}.norm1.bias`], {
+       seqLen: numPatches, hiddenSize, eps,
+     });
+
+     // Self-attention (full, no KV cache).
+     const attnOut = await visionSelfAttention({
+       input: normed1,
+       seqLen: numPatches,
+       hiddenSize,
+       numHeads,
+       headDim,
+       qkvWeight: weights[`${prefix}.attn.qkv.weight`],
+       qkvBias: weights[`${prefix}.attn.qkv.bias`],
+       projWeight: weights[`${prefix}.attn.proj.weight`],
+       projBias: weights[`${prefix}.attn.proj.bias`],
+     });
+
+     releaseBuffer(normed1);
+
+     // Residual add.
+     const residual1 = await doResidualAdd(hidden, attnOut, { count: numPatches * hiddenSize });
+     releaseBuffer(hidden);
+     releaseBuffer(attnOut);
+
+     // Pre-FFN layer norm.
+     const normed2 = await doLayerNorm(residual1, weights[`${prefix}.norm2.weight`], weights[`${prefix}.norm2.bias`], {
+       seqLen: numPatches, hiddenSize, eps,
+     });
+
+     // FFN: linear -> gelu -> linear.
+     const ffnOut = await visionFFN({
+       input: normed2,
+       seqLen: numPatches,
+       hiddenSize,
+       intermediateSize,
+       fc1Weight: weights[`${prefix}.mlp.fc1.weight`],
+       fc1Bias: weights[`${prefix}.mlp.fc1.bias`],
+       fc2Weight: weights[`${prefix}.mlp.fc2.weight`],
+       fc2Bias: weights[`${prefix}.mlp.fc2.bias`],
+     });
+
+     releaseBuffer(normed2);
+
+     // Residual add.
+     hidden = await doResidualAdd(residual1, ffnOut, { count: numPatches * hiddenSize });
+     releaseBuffer(residual1);
+     releaseBuffer(ffnOut);
+
+     trace('vision', ` block ${i}/${depth} done`);
+   }
+
+   // Spatial merge projector: merge 2x2 patches -> outHiddenSize.
+   const mergedTokens = Math.floor(numPatches / (spatialMergeSize * spatialMergeSize));
+   const merged = await spatialMergeProject({
+     input: hidden,
+     numPatches,
+     hiddenSize,
+     outHiddenSize,
+     spatialMergeSize,
+     weights,
+   });
+
+   releaseBuffer(hidden);
+
+   trace('vision', `encoder done: ${numPatches} patches -> ${mergedTokens} tokens (${outHiddenSize}d)`);
+
+   return { features: merged, numTokens: mergedTokens };
+ }
+
+ /**
+  * Vision self-attention (full prefill, no KV cache).
+  * QKV are fused into one weight matrix [3*hiddenSize, hiddenSize].
+  */
+ async function visionSelfAttention(params) {
+   const {
+     input, seqLen, hiddenSize, numHeads, headDim,
+     qkvWeight, qkvBias, projWeight, projBias,
+   } = params;
+
+   // QKV projection: [seqLen, hiddenSize] @ [hiddenSize, 3*hiddenSize] -> [seqLen, 3*hiddenSize]
+   const qkv = await doMatmul(input, qkvWeight, {
+     M: seqLen, K: hiddenSize, N: 3 * hiddenSize, bias: qkvBias,
+   });
+
+   // Split Q, K, V and compute scaled dot-product attention on GPU.
+   // This uses the existing attention kernel infrastructure in prefill mode.
+   const attnResult = await computeVisionAttention({
+     qkv, seqLen, numHeads, headDim, hiddenSize,
+   });
+
+   releaseBuffer(qkv);
+
+   // Output projection: [seqLen, hiddenSize] @ [hiddenSize, hiddenSize] -> [seqLen, hiddenSize]
+   const output = await doMatmul(attnResult, projWeight, {
+     M: seqLen, K: hiddenSize, N: hiddenSize, bias: projBias,
+   });
+
+   releaseBuffer(attnResult);
+
+   return output;
+ }
+
+ /**
+  * Compute scaled dot-product attention for vision encoder.
+  * No KV cache, no causal mask — full bidirectional attention.
+  *
+  * Input: fused QKV buffer [seqLen, 3*hiddenSize]
+  * Output: attention output [seqLen, hiddenSize]
+  */
+ async function computeVisionAttention(params) {
+   const { qkv, seqLen, numHeads, headDim, hiddenSize } = params;
+   const device = getDevice();
+   const scale = 1.0 / Math.sqrt(headDim);
+
+   // For the initial implementation, read QKV back to CPU, compute attention,
+   // and upload the result. This will be replaced with a GPU kernel.
+   //
+   // TODO(perf): Replace with GPU-native vision attention kernel.
+   // The text decoder attention kernels assume causal masking and KV cache,
+   // which don't apply to the vision encoder's bidirectional full attention.
+   const qkvSize = seqLen * 3 * hiddenSize;
+   const qkvData = new Float32Array(qkvSize);
+   {
+     const staging = device.createBuffer({
+       size: qkvSize * 4,
+       usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+     });
+     const encoder = device.createCommandEncoder();
+     encoder.copyBufferToBuffer(qkv, 0, staging, 0, qkvSize * 4);
+     device.queue.submit([encoder.finish()]);
+     await staging.mapAsync(GPUMapMode.READ);
+     qkvData.set(new Float32Array(staging.getMappedRange()));
+     staging.unmap();
+     staging.destroy();
+   }
+
+   // Split into Q, K, V: each [numHeads, seqLen, headDim]
+   const Q = new Float32Array(numHeads * seqLen * headDim);
+   const K = new Float32Array(numHeads * seqLen * headDim);
+   const V = new Float32Array(numHeads * seqLen * headDim);
+
+   for (let s = 0; s < seqLen; s++) {
+     for (let h = 0; h < numHeads; h++) {
+       for (let d = 0; d < headDim; d++) {
+         const srcBase = s * 3 * hiddenSize;
+         const headOffset = h * headDim + d;
+         Q[(h * seqLen + s) * headDim + d] = qkvData[srcBase + headOffset];
+         K[(h * seqLen + s) * headDim + d] = qkvData[srcBase + hiddenSize + headOffset];
+         V[(h * seqLen + s) * headDim + d] = qkvData[srcBase + 2 * hiddenSize + headOffset];
+       }
+     }
+   }
+
+   // Compute attention: softmax(Q @ K^T / sqrt(d)) @ V per head.
+   const output = new Float32Array(seqLen * hiddenSize);
+
+   for (let h = 0; h < numHeads; h++) {
+     // Scores: [seqLen, seqLen]
+     const scores = new Float32Array(seqLen * seqLen);
+     for (let i = 0; i < seqLen; i++) {
+       for (let j = 0; j < seqLen; j++) {
+         let dot = 0;
+         for (let d = 0; d < headDim; d++) {
+           dot += Q[(h * seqLen + i) * headDim + d] * K[(h * seqLen + j) * headDim + d];
+         }
+         scores[i * seqLen + j] = dot * scale;
+       }
+     }
+
+     // Softmax per row.
+     for (let i = 0; i < seqLen; i++) {
+       let maxVal = -Infinity;
+       for (let j = 0; j < seqLen; j++) {
+         if (scores[i * seqLen + j] > maxVal) maxVal = scores[i * seqLen + j];
+       }
+       let sumExp = 0;
+       for (let j = 0; j < seqLen; j++) {
+         scores[i * seqLen + j] = Math.exp(scores[i * seqLen + j] - maxVal);
+         sumExp += scores[i * seqLen + j];
+       }
+       for (let j = 0; j < seqLen; j++) {
+         scores[i * seqLen + j] /= sumExp;
+       }
+     }
+
+     // Weighted sum: [seqLen, headDim]
+     for (let i = 0; i < seqLen; i++) {
+       for (let d = 0; d < headDim; d++) {
+         let val = 0;
+         for (let j = 0; j < seqLen; j++) {
+           val += scores[i * seqLen + j] * V[(h * seqLen + j) * headDim + d];
+         }
+         output[i * hiddenSize + h * headDim + d] = val;
+       }
+     }
+   }
+
+   // Upload result to GPU.
+   const outBuffer = acquireBuffer(seqLen * hiddenSize * 4, 'vision-attn-output');
+   device.queue.writeBuffer(outBuffer, 0, output);
+
+   return outBuffer;
+ }
+
+ /**
+  * Vision FFN: fc1 -> gelu -> fc2.
+  */
+ async function visionFFN(params) {
+   const {
+     input, seqLen, hiddenSize, intermediateSize,
+     fc1Weight, fc1Bias, fc2Weight, fc2Bias,
+   } = params;
+
+   // fc1: [seqLen, hiddenSize] -> [seqLen, intermediateSize]
+   const fc1Out = await doMatmul(input, fc1Weight, {
+     M: seqLen, K: hiddenSize, N: intermediateSize, bias: fc1Bias,
+   });
+
+   // GELU activation.
+   const activated = await doGelu(fc1Out, { count: seqLen * intermediateSize });
+   releaseBuffer(fc1Out);
+
+   // fc2: [seqLen, intermediateSize] -> [seqLen, hiddenSize]
+   const fc2Out = await doMatmul(activated, fc2Weight, {
+     M: seqLen, K: intermediateSize, N: hiddenSize, bias: fc2Bias,
+   });
+   releaseBuffer(activated);
+
+   return fc2Out;
+ }
+
+ /**
+  * Spatial merge projector.
+  *
+  * Takes [numPatches, hiddenSize] vision features and merges spatialMergeSize x spatialMergeSize
+  * adjacent patches into single tokens via concatenation + linear projection.
+  *
+  * Input: [numPatches, hiddenSize] where numPatches = gridH * gridW
+  * Output: [mergedPatches, outHiddenSize] where mergedPatches = (gridH/m) * (gridW/m), m = spatialMergeSize
+  */
+ async function spatialMergeProject(params) {
+   const {
+     input, numPatches, hiddenSize, outHiddenSize, spatialMergeSize, weights,
+   } = params;
+
+   const device = getDevice();
+   const m = spatialMergeSize;
+   const concatDim = m * m * hiddenSize;
+
+   // Read vision features back for spatial rearrangement.
+   // TODO(perf): GPU kernel for spatial merge gather.
+   const inputSize = numPatches * hiddenSize;
+   const inputData = new Float32Array(inputSize);
+   {
+     const staging = device.createBuffer({
+       size: inputSize * 4,
+       usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+     });
+     const encoder = device.createCommandEncoder();
+     encoder.copyBufferToBuffer(input, 0, staging, 0, inputSize * 4);
+     device.queue.submit([encoder.finish()]);
+     await staging.mapAsync(GPUMapMode.READ);
+     inputData.set(new Float32Array(staging.getMappedRange()));
+     staging.unmap();
+     staging.destroy();
+   }
+
+   // Assume patches are laid out as [gridH, gridW, hiddenSize].
+   // We need gridH and gridW — derive from numPatches assuming square-ish grid.
+   // The actual grid dimensions should be passed in; for now infer from sqrt.
+   const gridSide = Math.round(Math.sqrt(numPatches));
+   const gridH = gridSide;
+   const gridW = Math.floor(numPatches / gridH);
+
+   const mergedH = Math.floor(gridH / m);
+   const mergedW = Math.floor(gridW / m);
+   const mergedCount = mergedH * mergedW;
+
+   // Concatenate m x m patches into single vectors of dimension concatDim.
+   const concatenated = new Float32Array(mergedCount * concatDim);
+   for (let mh = 0; mh < mergedH; mh++) {
+     for (let mw = 0; mw < mergedW; mw++) {
+       const outIdx = mh * mergedW + mw;
+       let offset = 0;
+       for (let dh = 0; dh < m; dh++) {
+         for (let dw = 0; dw < m; dw++) {
+           const srcH = mh * m + dh;
+           const srcW = mw * m + dw;
+           const srcIdx = srcH * gridW + srcW;
+           for (let d = 0; d < hiddenSize; d++) {
+             concatenated[outIdx * concatDim + offset] = inputData[srcIdx * hiddenSize + d];
+             offset++;
+           }
+         }
+       }
+     }
+   }
+
+   // Upload concatenated data.
+   const concatBuffer = acquireBuffer(mergedCount * concatDim * 4, 'vision-merge-concat');
+   device.queue.writeBuffer(concatBuffer, 0, concatenated);
+
+   // Linear projection: [mergedCount, concatDim] @ [concatDim, outHiddenSize] -> [mergedCount, outHiddenSize]
+   const projected = await doMatmul(concatBuffer, weights['visual.merger.mlp.0.weight'], {
+     M: mergedCount,
+     K: concatDim,
+     N: outHiddenSize,
+     bias: weights['visual.merger.mlp.0.bias'],
+   });
+
+   releaseBuffer(concatBuffer);
+
+   // GELU + second linear layer.
+   const activated = await doGelu(projected, { count: mergedCount * outHiddenSize });
+   releaseBuffer(projected);
+
+   const output = await doMatmul(activated, weights['visual.merger.mlp.2.weight'], {
+     M: mergedCount,
+     K: outHiddenSize,
+     N: outHiddenSize,
+     bias: weights['visual.merger.mlp.2.bias'],
+   });
+   releaseBuffer(activated);
+
+   return output;
+ }
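
For orientation, a minimal usage sketch of the new encoder entry point. The config values are illustrative Qwen3-VL-style numbers rather than values from a shipped preset, and patchBuffer, weights, and pipelineState are assumed to come from the loader and patch-embed stages:

    import { runVisionEncoder } from './src/inference/pipelines/vision/encoder.js';

    // Illustrative config; real values come from the model manifest presets.
    const visionConfig = {
      depth: 24,              // number of ViT blocks
      hiddenSize: 1024,       // per-patch embedding width
      intermediateSize: 4096, // FFN inner width
      numHeads: 16,
      outHiddenSize: 2048,    // width after the spatial-merge projector
      spatialMergeSize: 2,    // 2x2 patches merge into one output token
    };

    // patchBuffer: GPUBuffer [numPatches, hiddenSize] produced by patchEmbed().
    // weights: map of tensor name -> GPUBuffer, e.g. 'visual.blocks.0.norm1.weight'.
    const { features, numTokens } = await runVisionEncoder({
      patchBuffer,
      numPatches: 216, // e.g. a 288x192 image at patchSize=16
      visionConfig,
      weights,
      pipelineState,
    });
    // numTokens === 216 / (2 * 2) === 54 merged tokens of width outHiddenSize.
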
package/src/inference/pipelines/vision/image-preprocess.js (new file)
@@ -0,0 +1,151 @@
+
+
+ import { trace } from '../../../debug/index.js';
+
+ /**
+  * Preprocess an image for Qwen3-VL vision encoder.
+  *
+  * Accepts raw pixel data (Uint8Array RGBA or RGB, or Float32Array normalized)
+  * and returns a GPU-ready Float32Array of shape [C, H, W] after:
+  *   1. Resize to fit min/max pixel constraints
+  *   2. Pad to patch-aligned dimensions
+  *   3. Normalize with mean/std
+  *   4. Extract temporal patches (for video; single frame for images)
+  *
+  * @param {Uint8Array|Float32Array} pixels Raw pixel data (RGBA or RGB)
+  * @param {number} width Source image width
+  * @param {number} height Source image height
+  * @param {object} config Vision config from manifest/preset
+  * @returns {{ data: Float32Array, width: number, height: number, channels: number, gridThw: [number, number, number], patchedHeight: number, patchedWidth: number }}
+  */
+ export function preprocessImage(pixels, width, height, config) {
+   const {
+     patchSize = 16,
+     spatialMergeSize = 2,
+     temporalPatchSize = 2,
+     minPixels = 3136,
+     maxPixels = 1003520,
+     normalization = {},
+   } = config;
+
+   const mean = normalization.mean || [0.48145466, 0.4578275, 0.40821073];
+   const std = normalization.std || [0.26862954, 0.26130258, 0.27577711];
+
+   // Step 1: Compute target dimensions respecting pixel constraints and patch alignment.
+   const mergedPatch = patchSize * spatialMergeSize;
+   const { targetWidth, targetHeight } = computeTargetDimensions(
+     width, height, minPixels, maxPixels, mergedPatch,
+   );
+
+   trace('vision', `preprocess: ${width}x${height} -> ${targetWidth}x${targetHeight} (patch=${patchSize}, merge=${spatialMergeSize})`);
+
+   // Step 2: Resize to target dimensions (bilinear interpolation on CPU).
+   const channels = 3;
+   const resized = resizeBilinear(pixels, width, height, targetWidth, targetHeight, channels);
+
+   // Step 3: Normalize to [0,1] then apply mean/std normalization.
+   const normalized = new Float32Array(channels * targetHeight * targetWidth);
+   for (let c = 0; c < channels; c++) {
+     const m = mean[c];
+     const s = std[c];
+     for (let y = 0; y < targetHeight; y++) {
+       for (let x = 0; x < targetWidth; x++) {
+         const srcIdx = (y * targetWidth + x) * channels + c;
+         const dstIdx = c * targetHeight * targetWidth + y * targetWidth + x;
+         normalized[dstIdx] = (resized[srcIdx] / 255.0 - m) / s;
+       }
+     }
+   }
+
+   // Step 4: Compute grid dimensions for the LLM.
+   // gridT = 1 for single image (temporalPatchSize frames per temporal patch)
+   // gridH = targetHeight / patchSize
+   // gridW = targetWidth / patchSize
+   const gridT = 1;
+   const gridH = Math.floor(targetHeight / patchSize);
+   const gridW = Math.floor(targetWidth / patchSize);
+
+   return {
+     data: normalized,
+     width: targetWidth,
+     height: targetHeight,
+     channels,
+     gridThw: [gridT, gridH, gridW],
+     patchedHeight: targetHeight,
+     patchedWidth: targetWidth,
+   };
+ }
+
+ /**
+  * Compute target dimensions that satisfy:
+  *   - Total pixels >= minPixels and <= maxPixels
+  *   - Both dimensions are multiples of mergedPatch
+  *   - Aspect ratio is preserved as closely as possible
+  */
+ function computeTargetDimensions(width, height, minPixels, maxPixels, mergedPatch) {
+   const aspectRatio = width / height;
+
+   // Start from the geometric mean of min/max pixel counts.
+   let targetPixels = Math.sqrt(minPixels * maxPixels);
+   targetPixels = Math.max(minPixels, Math.min(maxPixels, targetPixels));
+
+   // Compute dimensions preserving aspect ratio.
+   let h = Math.sqrt(targetPixels / aspectRatio);
+   let w = h * aspectRatio;
+
+   // Round to nearest mergedPatch multiple.
+   h = Math.max(mergedPatch, Math.round(h / mergedPatch) * mergedPatch);
+   w = Math.max(mergedPatch, Math.round(w / mergedPatch) * mergedPatch);
+
+   // Clamp total pixels.
+   if (h * w > maxPixels) {
+     const scale = Math.sqrt(maxPixels / (h * w));
+     h = Math.max(mergedPatch, Math.round((h * scale) / mergedPatch) * mergedPatch);
+     w = Math.max(mergedPatch, Math.round((w * scale) / mergedPatch) * mergedPatch);
+   }
+   if (h * w < minPixels) {
+     const scale = Math.sqrt(minPixels / (h * w));
+     h = Math.max(mergedPatch, Math.round((h * scale) / mergedPatch) * mergedPatch);
+     w = Math.max(mergedPatch, Math.round((w * scale) / mergedPatch) * mergedPatch);
+   }
+
+   return { targetWidth: w, targetHeight: h };
+ }
+
+ /**
+  * Bilinear resize of interleaved RGB(A) pixel data.
+  * Input: Uint8Array or Float32Array in [H, W, C] layout (C >= 3, only first 3 used).
+  * Output: Float32Array in [H, W, 3] layout with values in [0, 255].
+  */
+ function resizeBilinear(src, srcW, srcH, dstW, dstH, channels) {
+   const srcChannels = src.length / (srcW * srcH);
+   const out = new Float32Array(dstH * dstW * channels);
+   const scaleX = srcW / dstW;
+   const scaleY = srcH / dstH;
+
+   for (let y = 0; y < dstH; y++) {
+     const srcY = y * scaleY;
+     const y0 = Math.min(Math.floor(srcY), srcH - 1);
+     const y1 = Math.min(y0 + 1, srcH - 1);
+     const fy = srcY - y0;
+
+     for (let x = 0; x < dstW; x++) {
+       const srcX = x * scaleX;
+       const x0 = Math.min(Math.floor(srcX), srcW - 1);
+       const x1 = Math.min(x0 + 1, srcW - 1);
+       const fx = srcX - x0;
+
+       for (let c = 0; c < channels; c++) {
+         const v00 = src[(y0 * srcW + x0) * srcChannels + c];
+         const v01 = src[(y0 * srcW + x1) * srcChannels + c];
+         const v10 = src[(y1 * srcW + x0) * srcChannels + c];
+         const v11 = src[(y1 * srcW + x1) * srcChannels + c];
+         const top = v00 + (v01 - v00) * fx;
+         const bot = v10 + (v11 - v10) * fx;
+         out[(y * dstW + x) * channels + c] = top + (bot - top) * fy;
+       }
+     }
+   }
+
+   return out;
+ }
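
To make the sizing arithmetic in computeTargetDimensions concrete, a worked example under the default constraints (the 1024x768 RGBA input is hypothetical):

    // Defaults: patchSize=16, spatialMergeSize=2 => mergedPatch = 32,
    // minPixels=3136, maxPixels=1003520.
    //
    // targetPixels = sqrt(3136 * 1003520) ~ 56098
    // h = sqrt(56098 / (1024/768)) ~ 205 -> nearest multiple of 32 -> 192
    // w = h * (1024/768) ~ 273           -> nearest multiple of 32 -> 288
    // 192 * 288 = 55296 pixels, already inside [3136, 1003520], so no re-clamp.
    const { data, gridThw } = preprocessImage(rgbaPixels, 1024, 768, {});
    // gridThw === [1, 12, 18]: 192/16 patch rows by 288/16 patch columns,
    // i.e. 216 patches, which the 2x2 spatial merge reduces to 54 visual tokens.
    // data holds 3 * 192 * 288 mean/std-normalized floats in [C, H, W] layout.
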
package/src/inference/pipelines/vision/index.js (new file)
@@ -0,0 +1,173 @@
+
+
+ import { trace } from '../../../debug/index.js';
+ import { getDevice } from '../../../gpu/device.js';
+ import { acquireBuffer, releaseBuffer } from '../../../memory/buffer-pool.js';
+ import { preprocessImage } from './image-preprocess.js';
+ import { patchEmbed } from './patch-embed.js';
+ import { runVisionEncoder } from './encoder.js';
+
+ /**
+  * Encode an image through the Qwen3-VL vision pipeline.
+  *
+  * Full flow:
+  *   raw pixels -> preprocess -> patch embed -> ViT blocks -> spatial merge -> visual tokens
+  *
+  * @param {object} params
+  * @param {Uint8Array|Float32Array} params.pixels Raw image pixel data (RGBA or RGB)
+  * @param {number} params.width Image width
+  * @param {number} params.height Image height
+  * @param {object} params.visionConfig Vision config from manifest
+  * @param {object} params.weights Vision encoder weight buffers
+  * @returns {Promise<VisionEncodeResult>}
+  */
+ export async function encodeImage(params) {
+   const { pixels, width, height, visionConfig, weights } = params;
+
+   trace('vision', `encodeImage: ${width}x${height} input`);
+
+   // Step 1: Preprocess — resize, normalize, compute grid.
+   const preprocessed = preprocessImage(pixels, width, height, visionConfig);
+
+   // Step 2: Patch embedding — conv2d patches -> [numPatches, hiddenSize].
+   const { patchBuffer, numPatches } = await patchEmbed({
+     imageData: preprocessed.data,
+     height: preprocessed.height,
+     width: preprocessed.width,
+     channels: preprocessed.channels,
+     visionConfig,
+     weights,
+   });
+
+   // Step 3: Vision encoder — ViT blocks + spatial merge.
+   const { features, numTokens } = await runVisionEncoder({
+     patchBuffer,
+     numPatches,
+     visionConfig,
+     weights,
+   });
+
+   return {
+     features,
+     numTokens,
+     gridThw: preprocessed.gridThw,
+     imageWidth: preprocessed.width,
+     imageHeight: preprocessed.height,
+   };
+ }
+
+ /**
+  * Inject visual tokens into text token embeddings.
+  *
+  * Replaces positions in the embedding sequence where image_token_id appears
+  * with the encoded visual features from the vision encoder.
+  *
+  * For Qwen3-VL with DeepStack, visual tokens are injected at specific decoder
+  * layers (deepstackVisualIndexes), not at the input embedding level.
+  * This function handles the simpler input-level injection case.
+  * DeepStack injection is handled in the decoder layer loop.
+  *
+  * @param {object} params
+  * @param {Float32Array} params.textEmbeddings [seqLen, hiddenSize]
+  * @param {Int32Array} params.tokenIds [seqLen]
+  * @param {Float32Array} params.visualFeatures [numVisualTokens, outHiddenSize], read back to CPU
+  * @param {number} params.numVisualTokens Number of visual tokens
+  * @param {number} params.imageTokenId Token ID marking image positions
+  * @param {number} params.hiddenSize Text model hidden size
+  * @returns {{ mergedEmbeddings: Float32Array, mergedLength: number }}
+  */
+ export function mergeVisualTokens(params) {
+   const {
+     textEmbeddings, tokenIds, visualFeatures,
+     numVisualTokens, imageTokenId, hiddenSize,
+   } = params;
+
+   // Count image token positions.
+   const imagePositions = [];
+   for (let i = 0; i < tokenIds.length; i++) {
+     if (tokenIds[i] === imageTokenId) {
+       imagePositions.push(i);
+     }
+   }
+
+   if (imagePositions.length === 0) {
+     trace('vision', 'mergeVisualTokens: no image tokens found, returning text-only');
+     return { mergedEmbeddings: textEmbeddings, mergedLength: tokenIds.length };
+   }
+
+   trace('vision', `mergeVisualTokens: replacing ${imagePositions.length} image tokens with ${numVisualTokens} visual tokens`);
+
+   // The merged sequence replaces contiguous image_token_id runs with visual features.
+   // For Qwen3-VL: image tokens appear as a block between vision_start and vision_end tokens.
+   // The visual features replace the entire image token block.
+
+   // Find contiguous image token ranges.
+   const ranges = [];
+   let rangeStart = imagePositions[0];
+   let rangeEnd = imagePositions[0];
+   for (let i = 1; i < imagePositions.length; i++) {
+     if (imagePositions[i] === rangeEnd + 1) {
+       rangeEnd = imagePositions[i];
+     } else {
+       ranges.push([rangeStart, rangeEnd]);
+       rangeStart = imagePositions[i];
+       rangeEnd = imagePositions[i];
+     }
+   }
+   ranges.push([rangeStart, rangeEnd]);
+
+   // Build merged sequence: text tokens (non-image) + visual tokens replacing each range.
+   const textLen = tokenIds.length;
+   const replacedCount = imagePositions.length;
+   const mergedLength = textLen - replacedCount + numVisualTokens;
+   const merged = new Float32Array(mergedLength * hiddenSize);
+
+   let srcPos = 0;
+   let dstPos = 0;
+   let visualOffset = 0;
+
+   for (const [start, end] of ranges) {
+     // Copy text tokens before this range.
+     const textBefore = start - srcPos;
+     if (textBefore > 0) {
+       merged.set(
+         textEmbeddings.subarray(srcPos * hiddenSize, start * hiddenSize),
+         dstPos * hiddenSize,
+       );
+       dstPos += textBefore;
+     }
+
+     // Insert visual tokens replacing this range.
+     const rangeLen = end - start + 1;
+     const tokensToInsert = Math.min(numVisualTokens - visualOffset, rangeLen);
+     // Copy from visual features buffer (CPU side).
+     for (let i = 0; i < tokensToInsert; i++) {
+       for (let d = 0; d < hiddenSize; d++) {
+         merged[(dstPos + i) * hiddenSize + d] = visualFeatures[(visualOffset + i) * hiddenSize + d];
+       }
+     }
+     dstPos += tokensToInsert;
+     visualOffset += tokensToInsert;
+     srcPos = end + 1;
+   }
+
+   // Copy remaining text tokens after last range.
+   if (srcPos < textLen) {
+     merged.set(
+       textEmbeddings.subarray(srcPos * hiddenSize, textLen * hiddenSize),
+       dstPos * hiddenSize,
+     );
+     dstPos += textLen - srcPos;
+   }
+
+   return { mergedEmbeddings: merged, mergedLength: dstPos };
+ }
+
+ /**
+  * @typedef {object} VisionEncodeResult
+  * @property {GPUBuffer} features Encoded visual tokens [numTokens, outHiddenSize]
+  * @property {number} numTokens Number of visual tokens after spatial merge
+  * @property {number[]} gridThw [temporal, height, width] grid dimensions
+  * @property {number} imageWidth Processed image width
+  * @property {number} imageHeight Processed image height
+  */
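
And a small sketch of the mergeVisualTokens contract (toy sizes; the token IDs are placeholders, not real vocabulary entries). The sketch assumes the prompt carries exactly one image placeholder token per visual token, so the placeholder run length matches numVisualTokens:

    const hiddenSize = 4;                                     // toy width
    const tokenIds = new Int32Array([10, 11, 99, 99, 99, 12, 13]);
    const textEmbeddings = new Float32Array(7 * hiddenSize);  // from the embed step
    const visualFeatures = new Float32Array(3 * hiddenSize);  // encoder output, read back to CPU

    const { mergedEmbeddings, mergedLength } = mergeVisualTokens({
      textEmbeddings,
      tokenIds,
      visualFeatures,
      numVisualTokens: 3,
      imageTokenId: 99, // placeholder id
      hiddenSize,
    });
    // mergedLength === 7: rows 0-1 and 5-6 come from textEmbeddings, and the
    // three contiguous placeholder rows are replaced by the visual feature rows.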