@simulatte/doppler 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -1
- package/README.md +25 -6
- package/package.json +5 -3
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +16 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/loader.js +6 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +7 -0
- package/src/config/presets/models/gemma3.json +2 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +1 -1
- package/src/converter/conversion-plan.js +1 -1
- package/src/converter/core.js +17 -8
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +15 -0
- package/src/distribution/shard-delivery.js +34 -0
- package/src/formats/rdrr/classification.js +32 -0
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +1 -0
- package/src/gpu/kernels/matmul.d.ts +3 -0
- package/src/gpu/kernels/matmul.js +70 -1
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
- package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
- package/src/inference/pipelines/text/attention/projections.js +13 -2
- package/src/inference/pipelines/text/attention/record.js +1 -0
- package/src/inference/pipelines/text/attention/run.js +9 -0
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +32 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +14 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
- package/src/inference/pipelines/text/generator-steps.js +46 -29
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +320 -166
- package/src/inference/pipelines/text/init.d.ts +2 -0
- package/src/inference/pipelines/text/init.js +19 -5
- package/src/inference/pipelines/text/layer.js +37 -8
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +9 -7
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +124 -3
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +2 -0
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/tooling/node-converter.js +25 -7
- package/src/tooling/node-source-runtime.js +29 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/src/tooling/node-convert.d.ts +0 -54
package/src/gpu/kernels/sample_f16.wgsl

```diff
@@ -34,6 +34,16 @@ fn apply_softcap(x: f32, softcap: f32) -> f32 {
   return softcap * tanh(x / softcap);
 }
 
+fn candidate_beats(candidate_value: f32, candidate_index: u32, best_value: f32, best_index: u32) -> bool {
+  if (candidate_value > best_value) {
+    return true;
+  }
+  if (candidate_value < best_value) {
+    return false;
+  }
+  return candidate_index < best_index;
+}
+
 @group(0) @binding(0) var<uniform> u: Uniforms;
 @group(0) @binding(1) var<storage, read> logits: array<f16>;
 @group(0) @binding(2) var<storage, read_write> output: array<u32>;

@@ -74,7 +84,7 @@ fn find_topk_phase1(
   while (idx < vocab_size) {
     if (idx != pad_id) {
       let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
-      if (val
+      if (candidate_beats(val, idx, local_max, local_max_idx)) {
         local_max = val;
         local_max_idx = idx;
       }

@@ -89,7 +99,12 @@ fn find_topk_phase1(
   var stride = WORKGROUP_SIZE / 2u;
   while (stride > 0u) {
     if (thread_idx < stride) {
-      if (
+      if (candidate_beats(
+        shared_values[thread_idx + stride],
+        shared_indices[thread_idx + stride],
+        shared_values[thread_idx],
+        shared_indices[thread_idx]
+      )) {
         shared_values[thread_idx] = shared_values[thread_idx + stride];
         shared_indices[thread_idx] = shared_indices[thread_idx + stride];
       }

@@ -130,7 +145,7 @@ fn find_topk_phase2(
   var max_val = shared_values[k];
 
   for (var i: u32 = k + 1u; i < num_candidates; i = i + 1u) {
-    if (shared_values[i]
+    if (candidate_beats(shared_values[i], shared_indices[i], max_val, shared_indices[max_idx])) {
       max_val = shared_values[i];
       max_idx = i;
     }

@@ -218,7 +233,7 @@ fn sample_single_pass(
   while (idx < vocab_size) {
     if (idx != pad_id) {
       let val = apply_softcap(f32(logits[idx]), softcap) / temperature;
-      if (val
+      if (candidate_beats(val, idx, local_max, local_max_idx)) {
        local_max = val;
         local_max_idx = idx;
       }

@@ -233,7 +248,12 @@ fn sample_single_pass(
   var stride = WORKGROUP_SIZE / 2u;
   while (stride > 0u) {
     if (thread_idx < stride) {
-      if (
+      if (candidate_beats(
+        shared_values[thread_idx + stride],
+        shared_indices[thread_idx + stride],
+        shared_values[thread_idx],
+        shared_indices[thread_idx]
+      )) {
         shared_values[thread_idx] = shared_values[thread_idx + stride];
         shared_indices[thread_idx] = shared_indices[thread_idx + stride];
       }

@@ -267,7 +287,7 @@ fn argmax(
   while (idx < vocab_size) {
     if (idx != pad_id) {
       let val = apply_softcap(f32(logits[idx]), softcap);
-      if (val
+      if (candidate_beats(val, idx, local_max, local_max_idx)) {
         local_max = val;
         local_max_idx = idx;
       }

@@ -282,7 +302,12 @@ fn argmax(
   var stride = WORKGROUP_SIZE / 2u;
   while (stride > 0u) {
     if (thread_idx < stride) {
-      if (
+      if (candidate_beats(
+        shared_values[thread_idx + stride],
+        shared_indices[thread_idx + stride],
+        shared_values[thread_idx],
+        shared_indices[thread_idx]
+      )) {
         shared_values[thread_idx] = shared_values[thread_idx + stride];
         shared_indices[thread_idx] = shared_indices[thread_idx + stride];
       }

@@ -316,7 +341,12 @@ fn argmax_reduce(
   var stride = WORKGROUP_SIZE / 2u;
   while (stride > 0u) {
     if (thread_idx < stride) {
-      if (
+      if (candidate_beats(
+        shared_values[thread_idx + stride],
+        shared_indices[thread_idx + stride],
+        shared_values[thread_idx],
+        shared_indices[thread_idx]
+      )) {
         shared_values[thread_idx] = shared_values[thread_idx + stride];
         shared_indices[thread_idx] = shared_indices[thread_idx + stride];
       }
```
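These hunks replace the plain value comparisons (truncated in this rendering) with a single `candidate_beats` comparator: a strictly greater value wins, and ties resolve to the lower token index. Because the tree reduction combines workgroup slots in an arbitrary order, tied logits could previously produce different token ids from run to run; a total order on (value, index) makes argmax and top-k deterministic. A minimal JS mirror of the comparator (illustrative only, not part of the package):

```js
// Mirror of the WGSL candidate_beats: higher value wins, ties go to the
// lower index, giving a total order that is independent of reduction order.
function candidateBeats(candidateValue, candidateIndex, bestValue, bestIndex) {
  if (candidateValue > bestValue) return true;
  if (candidateValue < bestValue) return false;
  return candidateIndex < bestIndex;
}

// Tied logits now always resolve to the same winner.
const logits = [0.5, 0.9, 0.9, 0.1];
let best = 0;
for (let i = 1; i < logits.length; i++) {
  if (candidateBeats(logits[i], i, logits[best], best)) best = i;
}
console.log(best); // 1 -- the lower of the two tied indices
```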
package/src/gpu/kernels/shader-cache.js

```diff
@@ -133,10 +133,15 @@ export async function compileShader(
   source,
   label
 ) {
-  const module = device.createShaderModule({
-    label,
-    code: source,
-  });
+  let module;
+  try {
+    module = device.createShaderModule({
+      label,
+      code: source,
+    });
+  } catch (err) {
+    throw new Error(`createShaderModule failed for "${label}": ${err.message}`);
+  }
 
   // Check for compilation errors (getCompilationInfo not available in all WebGPU providers)
   const compilationInfo = typeof module.getCompilationInfo === 'function'
```
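`compileShader` now fails fast with the shader's label when `createShaderModule` itself throws, which matters on WebGPU providers where `getCompilationInfo` is unavailable and the raw exception carries no context. The guarded pattern in isolation (a sketch; `device` is any `GPUDevice`):

```js
// Rethrow with the shader label attached so the failing kernel is identifiable
// even when the provider exposes no compilation info.
function createLabeledShaderModule(device, source, label) {
  try {
    return device.createShaderModule({ label, code: source });
  } catch (err) {
    throw new Error(`createShaderModule failed for "${label}": ${err.message}`);
  }
}
```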
package/src/inference/kv-cache/base.js

```diff
@@ -314,10 +314,7 @@ export class KVCache {
     layer.seqLen = Math.max(layer.seqLen, startPos + numNewTokens);
     this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numNewTokens);
 
-
-    if (layerIdx === this.numLayers - 1) {
-      this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
-    }
+    this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numNewTokens);
   }
 
 

@@ -374,9 +371,7 @@ export class KVCache {
     layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
     this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
 
-    if (layerIdx === this.numLayers - 1) {
-      this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
-    }
+    this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
   }
 
 

@@ -433,9 +428,7 @@ export class KVCache {
     layer.seqLen = Math.max(layer.seqLen, startPos + numTokens);
     this.totalTokensSeen = Math.max(this.totalTokensSeen, startPos + numTokens);
 
-    if (layerIdx === this.numLayers - 1) {
-      this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
-    }
+    this.currentSeqLen = Math.max(this.currentSeqLen, startPos + numTokens);
   }
 
 
```
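All three write paths now advance `currentSeqLen` unconditionally instead of only when `layerIdx === this.numLayers - 1`. Since the update is a `Math.max`, it is idempotent and order-independent, so the cache position stays correct even if layers are processed out of order or the final layer's write is skipped. A standalone sketch of the invariant (hypothetical simplified shape):

```js
// currentSeqLen must reflect the furthest position written by ANY layer;
// taking max on every write keeps that true regardless of layer order.
const cache = { currentSeqLen: 0 };
function noteLayerWrite(cache, startPos, numTokens) {
  cache.currentSeqLen = Math.max(cache.currentSeqLen, startPos + numTokens);
}
noteLayerWrite(cache, 0, 8); // some layer prefills 8 tokens
noteLayerWrite(cache, 8, 1); // another layer decodes 1 token
console.log(cache.currentSeqLen); // 9
```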
package/src/inference/pipelines/diffusion/pipeline.js

```diff
@@ -28,6 +28,7 @@ import { runResidualAdd, runScale, recordResidualAdd, recordScale } from '../../
 import { f16ToF32 } from '../../../loader/dtype-utils.js';
 
 const SUPPORTED_DIFFUSION_BACKEND_PIPELINES = new Set(['gpu']);
+const DEFAULT_TIME_EMBED_DIM = 256;
 const SD3_TEXT_ENCODER_KEYS = ['text_encoder', 'text_encoder_2', 'text_encoder_3'];
 const SANA_TEXT_ENCODER_KEYS = ['text_encoder'];
 

@@ -492,7 +493,7 @@ export class DiffusionPipeline {
     const hiddenSize = (transformerConfig.num_attention_heads ?? 0) * (transformerConfig.attention_head_dim ?? 0);
     const patchSize = transformerConfig.patch_size ?? 2;
     const timeEmbedWeight = transformerResolver.get('time_text_embed.timestep_embedder.linear_1.weight');
-    const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? 256;
+    const timeEmbedDim = timeEmbedWeight?.shape?.[1] ?? transformerConfig.time_embed_dim ?? DEFAULT_TIME_EMBED_DIM;
     if (!Number.isFinite(hiddenSize) || hiddenSize <= 0) {
       throw new Error('Diffusion transformer config missing num_attention_heads/attention_head_dim.');
     }
```
package/src/inference/pipelines/diffusion/text-encoder-gpu.js

```diff
@@ -44,6 +44,7 @@ import { initRoPEFrequencies } from '../text/init.js';
 import { processLayerGPU } from '../text/layer.js';
 
 const QUICK_GELU_ALPHA = 1.702;
+const DEFAULT_TIMESTEP_EMBED_DIM = 256;
 const SUPPORTED_CLIP_HIDDEN_ACTIVATIONS = new Set(['gelu', 'quick_gelu']);
 // Standard CLIP hidden activation per OpenAI CLIP specification.
 const DEFAULT_CLIP_HIDDEN_ACT = 'gelu';

@@ -1105,7 +1106,7 @@ export async function buildTimestepEmbedding(timestep, weightsEntry, modelConfig
   const device = getDevice();
   if (!device) throw new Error('Timestep embedding requires a WebGPU device.');
 
-  const dim = options.dim ?? 256;
+  const dim = options.dim ?? DEFAULT_TIMESTEP_EMBED_DIM;
   const half = Math.floor(dim / 2);
   const emb = new Float32Array(dim);
   const maxPeriod = 10000;
```
package/src/inference/pipelines/text/attention/projections.d.ts

```diff
@@ -3,6 +3,7 @@ import type { Tensor } from '../../../../gpu/tensor.js';
 import type { WeightBuffer, CpuWeightBuffer } from '../../../../gpu/weight-buffer.js';
 import type { LayerWeights } from '../types.js';
 import type { LoRAAdapter } from '../lora.js';
+import type { MatmulDebugConfigSchema } from '../../../../config/schema/debug.schema.js';
 
 export interface AttentionInputInfo {
   phase: 'prefill' | 'decode';

@@ -76,11 +77,13 @@ export interface ProjectAttentionQKVOptions {
   getWeightBuffer?: (weight: GPUBuffer | WeightBuffer | Float32Array | ArrayBuffer | CpuWeightBuffer, label: string) => GPUBuffer | WeightBuffer;
   lora?: LoRAAdapter | null;
   releaseTemporary: (buffer: GPUBuffer) => void;
+  matmulDebug?: MatmulDebugConfigSchema | null;
   onFusedQKV?: ((info: { qSize: number; kSize: number; vSize: number; totalSize: number }) => void) | null;
 }
 
 export interface ProjectAttentionQKVResult {
   qTensor: Tensor;
+  qGateTensor: Tensor | null;
   kTensor: Tensor;
   vTensor: Tensor;
   usedFusedQKV: boolean;
```
package/src/inference/pipelines/text/attention/projections.js

```diff
@@ -71,9 +71,10 @@ async function projectSingleQkvTensor({
   matmulOutputDtype,
   getWeightBuffer,
   lora,
+  matmulDebug,
   releaseTemporary,
 }) {
-
+  const runMatmulForMode = getMatmulRunner(recorder);
   const layerWeight = layerWeights?.[weightKey];
   if (!layerWeight) {
     throw new Error(`Attention projection requires ${weightKey}.`);

@@ -91,6 +92,7 @@ async function projectSingleQkvTensor({
       layerIdx,
       kernelPath,
       outputDtype: matmulOutputDtype,
+      matmulDebug,
     });
   } finally {
     releaseOwnedWeightBuffer(layerWeight, projBuffer, releaseTemporary);

@@ -178,6 +180,7 @@ async function projectQueryWithOptionalGate({
   matmulOutputDtype,
   getWeightBuffer,
   lora,
+  matmulDebug,
   releaseTemporary,
   attentionOutputGate,
 }) {

@@ -205,6 +208,7 @@ async function projectQueryWithOptionalGate({
       matmulOutputDtype,
       getWeightBuffer,
       lora,
+      matmulDebug,
       releaseTemporary,
     });
     return { qTensor, qGateTensor: null };

@@ -226,6 +230,7 @@ async function projectQueryWithOptionalGate({
     layerIdx,
     kernelPath,
     outputDtype: matmulOutputDtype,
+    matmulDebug,
   });
 
   const split = await runSplitQGForMode(fullQGTensor, {

@@ -329,6 +334,7 @@ export async function projectAttentionQKV({
   matmulOutputDtype,
   getWeightBuffer,
   lora,
+  matmulDebug,
   releaseTemporary,
   onFusedQKV = null,
   attentionOutputGate = false,

@@ -339,7 +345,8 @@ export async function projectAttentionQKV({
   const hasLoRA = getLoRAModule(lora, layerIdx, 'q_proj')
     || getLoRAModule(lora, layerIdx, 'k_proj')
     || getLoRAModule(lora, layerIdx, 'v_proj');
-  const
+  const forceSplitQKV = Boolean(matmulDebug?.enabled) && matmulDebug?.forceSplitQKV === true;
+  const useFusedQKV = !forceSplitQKV && selectRuleValue('inference', 'attention', 'useFusedQkv', {
     hasQkvProj: Boolean(layerWeights.qkvProj),
     hasQkvSizes: Boolean(layerWeights.qkvSizes),
     hasLoRA: Boolean(hasLoRA),

@@ -356,6 +363,7 @@ export async function projectAttentionQKV({
       layerIdx,
       kernelPath,
       outputDtype: matmulOutputDtype,
+      matmulDebug,
     });
     const split = await runSplitForMode(qkvTensor, {
       numTokens,

@@ -394,6 +402,7 @@ export async function projectAttentionQKV({
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug,
     releaseTemporary,
     attentionOutputGate,
   }));

@@ -414,6 +423,7 @@ export async function projectAttentionQKV({
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug,
     releaseTemporary,
   });
 

@@ -433,6 +443,7 @@ export async function projectAttentionQKV({
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug,
     releaseTemporary,
   });
 
```
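These hunks thread an optional `matmulDebug` config from `ProjectAttentionQKVOptions` down through every projection call, and use it to force the split (non-fused) QKV path on demand. Based only on the fields visible here (`enabled`, `forceSplitQKV`; the full shape lives in `config/schema/debug.schema.js`), the gating works like this:

```js
// Assumed config shape, reconstructed from the fields this diff reads.
const matmulDebug = { enabled: true, forceSplitQKV: true };

// From projectAttentionQKV: debug must be enabled AND the flag set
// before the fused QKV projection is bypassed.
const forceSplitQKV = Boolean(matmulDebug?.enabled) && matmulDebug?.forceSplitQKV === true;
console.log(forceSplitQKV); // true -> run three separate Q/K/V matmuls
```

The `record.js` and `run.js` hunks below show where the value comes from at runtime: `state.runtimeConfig?.shared?.debug?.matmul ?? null`.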
package/src/inference/pipelines/text/attention/record.js

```diff
@@ -167,6 +167,7 @@ export async function recordLayerAttentionGPU(
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
     attentionOutputGate: config.attentionOutputGate === true,
     releaseTemporary: (buffer) => releaseOrTrack(recorder, buffer),
     onFusedQKV: layerIdx === 0 && isPrefill
```
package/src/inference/pipelines/text/attention/run.js

```diff
@@ -166,6 +166,14 @@ export async function runLayerAttentionGPU(
         dtype: normed.dtype,
       });
     }
+
+    await runProbes('post_input_norm', normed.buffer, {
+      layerIdx,
+      numTokens,
+      hiddenSize,
+      probes: state.debugProbes,
+      dtype: normed.dtype,
+    });
   }
 
   // Debug: Check normed input for L0 prefill

@@ -218,6 +226,7 @@ export async function runLayerAttentionGPU(
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
     attentionOutputGate: config.attentionOutputGate === true,
     releaseTemporary: (buffer) => releaseBuffer(buffer),
     onFusedQKV: layerIdx === 0 && isPrefill
```
package/src/inference/pipelines/text/config.d.ts

```diff
@@ -150,6 +150,7 @@ export interface ParsedModelConfig {
   ropeLocalTheta: number | null;
   ropeRotaryDim: number;
   ropeInterleaved: boolean;
+  mropeInterleaved: boolean;
   mropeSection: number[] | null;
   partialRotaryFactor: number | null;
   ropeScale: number;
```
package/src/inference/pipelines/text/config.js

```diff
@@ -349,6 +349,24 @@ function normalizeLayerTypeTag(value) {
   return null;
 }
 
+function resolveVisionConfig(rawConfig, manifest) {
+  const vc = rawConfig?.vision_config ?? manifest?.config?.vision_config;
+  if (!vc || typeof vc !== 'object') return null;
+  return {
+    depth: vc.depth ?? 24,
+    hiddenSize: vc.hidden_size ?? 1024,
+    intermediateSize: vc.intermediate_size ?? 4096,
+    numHeads: vc.num_heads ?? 16,
+    outHiddenSize: vc.out_hidden_size ?? vc.hidden_size ?? 1024,
+    patchSize: vc.patch_size ?? 16,
+    spatialMergeSize: vc.spatial_merge_size ?? 2,
+    temporalPatchSize: vc.temporal_patch_size ?? 2,
+    eps: vc.eps ?? 1e-6,
+    deepstackVisualIndexes: Array.isArray(vc.deepstack_visual_indexes) ? vc.deepstack_visual_indexes : [],
+    imageTokenId: rawConfig?.image_token_id ?? manifest?.image_token_id ?? null,
+  };
+}
+
 function parseCustomLayerTypes(layerTypes, numLayers, modelId) {
   if (!Array.isArray(layerTypes) || layerTypes.length === 0) {
     throw new Error(

@@ -512,10 +530,18 @@ export function toParsedConfigFromMerged(merged, manifest) {
   // RoPE scaling - use manifest inference as source of truth (not raw config)
   const ropeScale = inf.rope.ropeScalingFactor;
   const ropeScalingType = inf.rope.ropeScalingType;
-  const ropeLocalScale = inf.rope.ropeLocalScalingFactor
-  const ropeLocalScalingType = inf.rope.ropeLocalScalingType
+  const ropeLocalScale = inf.rope.ropeLocalScalingFactor;
+  const ropeLocalScalingType = inf.rope.ropeLocalScalingType;
   const partialRotaryFactor = inf.rope.partialRotaryFactor;
-  const
+  const mropeInterleaved = inf.rope.mropeInterleaved === true;
+  const ropeInterleaved = false;
+
+  if (ropeLocalScale == null && (inf.rope.ropeLocalTheta != null || inf.rope.mropeSection != null)) {
+    throw new Error(
+      `Model "${merged.modelId}" uses hybrid/mRoPE but is missing rope.ropeLocalScalingFactor in manifest. ` +
+      `Re-convert the model using the latest converter or update the manifest to include an explicit scale.`
+    );
+  }
   const mropeSection = Array.isArray(inf.rope.mropeSection)
     ? inf.rope.mropeSection.map((entry) => Math.trunc(Number(entry)))
     : null;

@@ -525,7 +551,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
       `Manifest "${merged.modelId}" has invalid rope.mropeSection; expected positive integers.`
     );
   }
-  if (
+  if (mropeInterleaved && mropeSection) {
     const doubledMropeDim = mropeSection.reduce((sum, entry) => sum + entry, 0) * 2;
     if (doubledMropeDim !== ropeRotaryDim) {
       throw new Error(

@@ -610,6 +636,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
     ropeLocalTheta: inf.rope.ropeLocalTheta,
     ropeRotaryDim,
     ropeInterleaved,
+    mropeInterleaved,
     mropeSection,
     partialRotaryFactor,
     ropeScale,

@@ -650,6 +677,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
     chatTemplateType,
     chatTemplateEnabled,
     kernelPath: inf.defaultKernelPath,
+    visionConfig: resolveVisionConfig(config, manifest),
   };
 }
 
```
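The new `resolveVisionConfig` normalizes a Hugging Face-style `vision_config` block (snake_case, optional fields) into camelCase runtime fields, applying the defaults visible in the hunk. A worked example with hypothetical input values:

```js
// Hypothetical raw model config in the shape this parser accepts.
const rawConfig = {
  image_token_id: 151655,
  vision_config: {
    depth: 27,
    hidden_size: 1152,
    num_heads: 16,
    deepstack_visual_indexes: [8, 16, 24],
  },
};
// resolveVisionConfig(rawConfig, manifest) would yield approximately:
// {
//   depth: 27,
//   hiddenSize: 1152,
//   intermediateSize: 4096,   // default
//   numHeads: 16,
//   outHiddenSize: 1152,      // falls back to hidden_size
//   patchSize: 16,            // default
//   spatialMergeSize: 2,      // default
//   temporalPatchSize: 2,     // default
//   eps: 1e-6,                // default
//   deepstackVisualIndexes: [8, 16, 24],
//   imageTokenId: 151655,
// }
```

The second hunk also hardens mRoPE handling: hybrid/mRoPE models converted without an explicit `rope.ropeLocalScalingFactor` now fail at parse time instead of silently misconfiguring RoPE.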
package/src/inference/pipelines/text/embed.js

```diff
@@ -9,6 +9,7 @@ import { decodeReadback } from './debug-utils/index.js';
 import { createTensor } from '../../../gpu/tensor.js';
 import { castF32ToF16, recordCastF32ToF16 } from '../../../gpu/kernels/cast.js';
 import { isCpuWeightBuffer } from '../../../gpu/weight-buffer.js';
+import { f16ToF32 } from '../../../loader/dtype-utils.js';
 import { selectRuleValue } from '../../../rules/rule-registry.js';
 
 const scaleShaderCode = `

@@ -202,11 +203,19 @@ export async function embed(tokenIds, embedBuffer, config) {
 
   const dtype = selectRuleValue('inference', 'dtype', 'f16OrF32', { useF16 });
 
-
-
-
-
-
+  let cpuEmbeddings = null;
+  if (isCpuWeightBuffer(embedBuffer)) {
+    const bufDtype = embedBuffer.dtype;
+    if (bufDtype !== 'f32' && bufDtype !== 'f16') {
+      throw new Error(
+        `[Embed] CPU embedding buffer has unsupported dtype '${bufDtype}'; ` +
+        `only 'f32' and 'f16' are supported in the CPU gather path.`
+      );
+    }
+    cpuEmbeddings = embedBuffer.data;
+  } else if (embedBuffer instanceof Float32Array) {
+    cpuEmbeddings = embedBuffer;
+  }
 
   if (debug) {
     trace.embed(`tokens=${numTokens}, hidden=${hiddenSize}, vocab=${vocabSize}, scaleEmbeddings=${scaleEmbeddings}, transpose=${transpose}, indexOffset=${indexOffset}, activationDtype=${activationDtype}, useF16=${useF16}`);

@@ -226,18 +235,28 @@ export async function embed(tokenIds, embedBuffer, config) {
   }
 
   const output = new Float32Array(numTokens * hiddenSize);
+  // Check actual data type: loader's f16_to_f32 CPU path already decodes F16 into Float32Array,
+  // so dtype='f16' does not reliably indicate raw F16 bytes. Only Uint16Array needs per-element decoding.
+  const isF16Cpu = cpuEmbeddings instanceof Uint16Array;
   if (!transpose) {
     for (let t = 0; t < numTokens; t++) {
       const tokenId = (tokenIdArray)[t];
       const srcOffset = tokenId * hiddenSize;
-      output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
+      if (isF16Cpu) {
+        for (let h = 0; h < hiddenSize; h++) {
+          output[t * hiddenSize + h] = f16ToF32(cpuEmbeddings[srcOffset + h]);
+        }
+      } else {
+        output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
+      }
     }
   } else {
     for (let t = 0; t < numTokens; t++) {
       const tokenId = (tokenIdArray)[t];
       const dstOffset = t * hiddenSize;
       for (let h = 0; h < hiddenSize; h++) {
-        output[dstOffset + h] = cpuEmbeddings[h * vocabSize + tokenId];
+        const raw = cpuEmbeddings[h * vocabSize + tokenId];
+        output[dstOffset + h] = isF16Cpu ? f16ToF32(raw) : raw;
       }
     }
   }
```
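The CPU gather path now decodes raw F16 only when the buffer is actually a `Uint16Array`; as the added comment notes, the loader may already have expanded F16 into a `Float32Array`, so the declared dtype alone is not trustworthy. The package uses `f16ToF32` from `loader/dtype-utils.js`; for reference, a generic IEEE 754 half-to-float decode (not the package's implementation) looks like:

```js
// Generic binary16 -> binary32 decode for a single u16 bit pattern.
function f16ToF32Ref(bits) {
  const sign = (bits & 0x8000) ? -1 : 1;
  const exp = (bits >> 10) & 0x1f;
  const frac = bits & 0x3ff;
  if (exp === 0) return sign * frac * 2 ** -24;          // subnormal
  if (exp === 0x1f) return frac ? NaN : sign * Infinity; // inf / NaN
  return sign * (1 + frac / 1024) * 2 ** (exp - 15);
}

console.log(f16ToF32Ref(0x3c00)); // 1
console.log(f16ToF32Ref(0xc000)); // -2
```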
package/src/inference/pipelines/text/execution-v0-runtime-builders.js

```diff
@@ -1,7 +1,7 @@
 import { selectRuleValue } from '../../../rules/rule-registry.js';
 import { cloneJson, isPhaseMatch, normalizeDtype, requireSessionActivationDtype, stepHasLayer } from './execution-v0-contract-helpers.js';
 
-const PIPELINE_COMPATIBLE_OPS = new Set([
+export const PIPELINE_COMPATIBLE_OPS = new Set([
   'save',
   'load',
   'conv',

@@ -191,8 +191,15 @@ export function buildLayerPipelineFromExecution(steps) {
   if (layerSectionSteps.length === 0) {
     return null;
   }
-
-
+  const incompatibleOps = [
+    ...new Set(
+      layerSectionSteps
+        .filter((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))
+        .map((step) => step.op)
+    ),
+  ];
+  if (incompatibleOps.length > 0) {
+    return { incompatibleOps };
   }
 
   const layerSteps = layerSectionSteps
```
package/src/inference/pipelines/text/execution-v0.js

```diff
@@ -31,6 +31,7 @@ import {
   buildModelRuntimeOverrides,
   buildSessionRuntimePatch,
   resolveFinitenessFallbackKernelPathId,
+  PIPELINE_COMPATIBLE_OPS,
 } from './execution-v0-runtime-builders.js';
 
 export function hasExecutionV0(manifestInference) {

@@ -152,7 +153,17 @@ export function compileExecutionV0(options = {}) {
     numLayers,
     finitenessFallbackKernelPathId
   );
-  const
+  const layerPipelineResult = buildLayerPipelineFromExecution(resolvedSteps);
+  if (layerPipelineResult?.incompatibleOps && !kernelPath) {
+    throw new Error(
+      `[ExecutionV0] manifest.inference.execution.steps contains layer ops that are not ` +
+      `compatible with the JS layer pipeline and no inline kernelPath was built to cover execution. ` +
+      `Unsupported ops: ${layerPipelineResult.incompatibleOps.join(', ')}. ` +
+      `Either add explicit kernel references to each step (for inline-kernel execution) ` +
+      `or restrict layer ops to: ${[...PIPELINE_COMPATIBLE_OPS].join(', ')}.`
+    );
+  }
+  const layerPipeline = layerPipelineResult?.incompatibleOps ? null : layerPipelineResult;
   const sessionPatch = buildSessionRuntimePatch(resolvedSession);
   const modelOverrides = buildModelRuntimeOverrides(manifestInference);
   for (const [path, source] of sessionSourceByPath.entries()) {
```
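Exporting `PIPELINE_COMPATIBLE_OPS` lets `compileExecutionV0` distinguish "no layer steps" from "layer steps the JS pipeline cannot run": the builder now reports the offending ops, and the compiler throws unless an inline `kernelPath` covers execution. A sketch of the two outcomes (step shape assumed from the hunks):

```js
// Assumed step shape: { op: string, ... } from manifest.inference.execution.steps.
const steps = [
  { op: 'save' },
  { op: 'custom_scan' }, // hypothetical op outside PIPELINE_COMPATIBLE_OPS
];

// buildLayerPipelineFromExecution(steps) -> { incompatibleOps: ['custom_scan'] }
// - with no inline kernelPath: compileExecutionV0 throws the [ExecutionV0] error above
// - with an inline kernelPath: layerPipeline is set to null and the
//   inline-kernel execution path handles the unsupported ops instead
```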
package/src/inference/pipelines/text/generator-helpers.js

```diff
@@ -111,6 +111,7 @@ export function buildLayerContext(state, recorder, isDecodeMode, debugLayers, de
     ropeLocalCos: state.ropeLocalCos,
     ropeLocalSin: state.ropeLocalSin,
     linearAttentionRuntime: state.linearAttentionRuntime,
+    convLayerStates: state.convLayerStates,
     weightConfig: getWeightBufferConfig(state),
     debugFlags: state.debugFlags,
     debugProbes: state.runtimeConfig.shared.debug.probes,
```
package/src/inference/pipelines/text/generator-runtime.js

```diff
@@ -139,6 +139,12 @@ export function resolveStepOptions(state, options = {}) {
   const executionPlan = resolveExecutionSessionPlan(state, options);
 
   return {
+    seed: resolveConfiguredValue(
+      options.seed,
+      undefined,
+      'options.seed',
+      (value) => Number.isFinite(value) && value >= 0
+    ),
     temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
     topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),
     topK: resolveConfiguredValue(options.topK, samplingDefaults.topK, 'options.topK'),

@@ -165,6 +171,12 @@ export function resolveGenerateOptions(state, options = {}) {
   const executionPlan = resolveExecutionSessionPlan(state, options);
 
   return {
+    seed: resolveConfiguredValue(
+      options.seed,
+      undefined,
+      'options.seed',
+      (value) => Number.isFinite(value) && value >= 0
+    ),
     maxTokens: executionPlan.maxTokens,
     temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
     topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),

@@ -191,6 +203,7 @@ export function resolveGenerateOptions(state, options = {}) {
     batchSize: executionPlan.batchSize,
     stopCheckMode: executionPlan.stopCheckMode,
     executionPlan,
+    images: options.images ?? null,
   };
 }
 

@@ -205,6 +218,7 @@ export function resolvePrefillOptions(state, options = {}) {
     disableCommandBatching: executionPlan.disableCommandBatching,
     disableMultiTokenDecode: executionPlan.disableMultiTokenDecode,
     executionPlan,
+    images: options.images ?? null,
   };
 }
 
```
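Both resolvers now accept a `seed`, validated as a finite non-negative number with no default, so omitting it leaves sampling unseeded; `options.images` likewise passes through for the new vision pipeline. The seed validator in isolation (named `isValidSeed` here for illustration):

```js
// Validator applied to options.seed in resolveStepOptions/resolveGenerateOptions
// (how resolveConfiguredValue reacts to an invalid value is defined elsewhere).
const isValidSeed = (value) => Number.isFinite(value) && value >= 0;
console.log(isValidSeed(42));  // true
console.log(isValidSeed(-1));  // false
console.log(isValidSeed(NaN)); // false
```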
package/src/inference/pipelines/text/generator-steps.d.ts

```diff
@@ -12,6 +12,15 @@ export interface BatchDecodeSelectionConfig {
 
 export declare function shouldUseBatchDecode(config: BatchDecodeSelectionConfig): boolean;
 
+export interface FusedDecodeSamplingConfig {
+  recorderEnabled: boolean;
+  gpuSamplingEnabled: boolean;
+  fusedDecodeDisabled: boolean;
+  layerTypes?: string[] | null;
+}
+
+export declare function shouldUseFusedDecodeSampling(config: FusedDecodeSamplingConfig): boolean;
+
 export declare function resolveBatchStop(
   tokens: number[],
   stopFlags: Uint32Array | null,
```