@simulatte/doppler 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +25 -6
- package/package.json +25 -38
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +2 -2
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +9 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +21 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +4 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +2 -2
- package/src/converter/conversion-plan.js +11 -3
- package/src/converter/core.js +19 -8
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +34 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +40 -1
- package/src/formats/rdrr/classification.js +32 -0
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +48 -4
- package/src/gpu/kernels/matmul.d.ts +5 -0
- package/src/gpu/kernels/matmul.js +71 -2
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
- package/src/inference/pipelines/text/attention/projections.js +54 -13
- package/src/inference/pipelines/text/attention/record.js +16 -6
- package/src/inference/pipelines/text/attention/run.js +59 -6
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +46 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +19 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
- package/src/inference/pipelines/text/generator-steps.js +71 -26
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +353 -166
- package/src/inference/pipelines/text/init.d.ts +15 -0
- package/src/inference/pipelines/text/init.js +35 -10
- package/src/inference/pipelines/text/layer.js +38 -8
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +11 -9
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +130 -4
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +4 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +28 -7
- package/src/tooling/node-source-runtime.js +65 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/types/model.d.ts +5 -0
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/tools/doppler-cli.js +6 -1
- package/src/tooling/node-convert.d.ts +0 -54
@@ -0,0 +1,8 @@
+export async function prepareAttentionProjectionInput(attnForProjection, matmulOutputDtype, castToF16) {
+  if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
+    const casted = await castToF16(attnForProjection);
+    return { oProjInput: casted, oProjInputTemp: casted };
+  }
+
+  return { oProjInput: attnForProjection, oProjInputTemp: null };
+}
@@ -3,6 +3,7 @@ import type { Tensor } from '../../../../gpu/tensor.js';
 import type { WeightBuffer, CpuWeightBuffer } from '../../../../gpu/weight-buffer.js';
 import type { LayerWeights } from '../types.js';
 import type { LoRAAdapter } from '../lora.js';
+import type { MatmulDebugConfigSchema } from '../../../../config/schema/debug.schema.js';
 
 export interface AttentionInputInfo {
   phase: 'prefill' | 'decode';
@@ -46,7 +47,16 @@ export function recordAttentionInputs(
   info: AttentionInputInfo | null | undefined
 ): void;
 
-export function
+export function shouldForceF32AttentionProjectionForRoPE(options: {
+  attentionInputDtype: string;
+  headDim: number;
+  rotaryDim?: number;
+  interleaved?: boolean;
+}): boolean;
+export function resolveAttentionProjectionOutputDtype(
+  attentionInputDtype: string,
+  options?: { forceF32?: boolean }
+): 'f16' | 'f32' | string;
 export function resolveProjectionSliceOffsetBytes(
   weightBuffer: WeightBuffer | Tensor | GPUBuffer | null | undefined,
   outputRows: number,
@@ -67,11 +77,13 @@ export interface ProjectAttentionQKVOptions {
   getWeightBuffer?: (weight: GPUBuffer | WeightBuffer | Float32Array | ArrayBuffer | CpuWeightBuffer, label: string) => GPUBuffer | WeightBuffer;
   lora?: LoRAAdapter | null;
   releaseTemporary: (buffer: GPUBuffer) => void;
+  matmulDebug?: MatmulDebugConfigSchema | null;
   onFusedQKV?: ((info: { qSize: number; kSize: number; vSize: number; totalSize: number }) => void) | null;
 }
 
 export interface ProjectAttentionQKVResult {
   qTensor: Tensor;
+  qGateTensor: Tensor | null;
   kTensor: Tensor;
   vTensor: Tensor;
   usedFusedQKV: boolean;
@@ -5,6 +5,8 @@ import {
   recordMatmul,
   runSplitQKV,
   recordSplitQKV,
+  runSplitQG,
+  recordSplitQG,
   runRMSNorm,
   recordRMSNorm,
 } from '../../../../gpu/kernel-selector.js';
@@ -28,6 +30,13 @@ function getSplitRunner(recorder) {
   return (qkvTensor, options) => recordSplitQKV(recorder, qkvTensor, options);
 }
 
+function getSplitQGRunner(recorder) {
+  if (!recorder) {
+    return (qgTensor, options) => runSplitQG(qgTensor, options);
+  }
+  return (qgTensor, options) => recordSplitQG(recorder, qgTensor, options);
+}
+
 function getRmsNormRunner(recorder) {
   if (!recorder) {
     return (input, weight, eps, options) => runRMSNorm(input, weight, eps, options);
@@ -62,9 +71,10 @@ async function projectSingleQkvTensor({
   matmulOutputDtype,
   getWeightBuffer,
   lora,
+  matmulDebug,
   releaseTemporary,
 }) {
-
+  const runMatmulForMode = getMatmulRunner(recorder);
   const layerWeight = layerWeights?.[weightKey];
   if (!layerWeight) {
     throw new Error(`Attention projection requires ${weightKey}.`);
@@ -82,6 +92,7 @@ async function projectSingleQkvTensor({
       layerIdx,
       kernelPath,
       outputDtype: matmulOutputDtype,
+      matmulDebug,
     });
   } finally {
     releaseOwnedWeightBuffer(layerWeight, projBuffer, releaseTemporary);
@@ -169,6 +180,7 @@ async function projectQueryWithOptionalGate({
   matmulOutputDtype,
   getWeightBuffer,
   lora,
+  matmulDebug,
   releaseTemporary,
   attentionOutputGate,
 }) {
@@ -196,34 +208,44 @@ async function projectQueryWithOptionalGate({
       matmulOutputDtype,
       getWeightBuffer,
       lora,
+      matmulDebug,
       releaseTemporary,
     });
     return { qTensor, qGateTensor: null };
   }
 
+  // q_proj weights are stored with interleaved head layout: for head h,
+  // rows [h*headDim*2 : h*headDim*2+headDim] = Q, rows [h*headDim*2+headDim : (h+1)*headDim*2] = gate.
+  // Compute the full 2*qSize matmul, then de-interleave into separate Q and gate tensors.
   const runMatmulForMode = getMatmulRunner(recorder);
+  const runSplitQGForMode = getSplitQGRunner(recorder);
   const qWeightBuffer = getWeightBuffer(qWeight, 'q_proj');
-
+  let fullQGTensor = null;
   let qTensor = null;
   let qGateTensor = null;
   try {
-
+    fullQGTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize * 2, hiddenSize, {
       transposeB: 'auto',
       role: 'q_proj',
       layerIdx,
      kernelPath,
       outputDtype: matmulOutputDtype,
+      matmulDebug,
     });
 
-
-
-
-
-      kernelPath,
-      bOffset: gateOffset,
-      outputDtype: matmulOutputDtype,
+    const split = await runSplitQGForMode(fullQGTensor, {
+      numTokens,
+      numHeads,
+      headDim,
     });
+    releaseTemporary(fullQGTensor.buffer);
+    fullQGTensor = null;
+    qTensor = split.Q;
+    qGateTensor = split.G;
   } catch (error) {
+    if (fullQGTensor) {
+      releaseTemporary(fullQGTensor.buffer);
+    }
     if (qTensor) {
       releaseTemporary(qTensor.buffer);
     }
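
The interleaved q/gate layout described in the comment above is what the new split_qg kernel has to undo. As an illustration only (this CPU reference is not the shipped WGSL kernel, and the function name is hypothetical), de-interleaving the 2*qSize matmul output per token looks like:

```js
// CPU sketch of the de-interleave that split_qg performs on GPU (illustration, not the package kernel).
// For each token, the 2*qSize output row is laid out per head as [Q(headDim) | gate(headDim)].
function deinterleaveQG(fullQG, numTokens, numHeads, headDim) {
  const qSize = numHeads * headDim;
  const Q = new Float32Array(numTokens * qSize);
  const G = new Float32Array(numTokens * qSize);
  for (let t = 0; t < numTokens; t++) {
    for (let h = 0; h < numHeads; h++) {
      const src = t * qSize * 2 + h * headDim * 2; // start of this head's [Q | gate] pair
      const dst = t * qSize + h * headDim;
      Q.set(fullQG.subarray(src, src + headDim), dst);
      G.set(fullQG.subarray(src + headDim, src + 2 * headDim), dst);
    }
  }
  return { Q, G };
}
```
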
@@ -277,9 +299,22 @@ export function recordAttentionInputs(state, info) {
   state.stats.attentionInputs.push(info);
 }
 
-export function
+export function shouldForceF32AttentionProjectionForRoPE({
+  attentionInputDtype,
+  headDim,
+  rotaryDim = headDim,
+  interleaved = false,
+}) {
+  return attentionInputDtype === 'f16'
+    && Number.isFinite(headDim)
+    && Number.isFinite(rotaryDim)
+    && (rotaryDim !== headDim || interleaved === true);
+}
+
+export function resolveAttentionProjectionOutputDtype(attentionInputDtype, options = {}) {
   const useF16Activations = attentionInputDtype === 'f16';
-  return selectRuleValue('
+  return selectRuleValue('inference', 'dtype', 'attentionProjectionOutputDtype', {
+    forceF32: options.forceF32 === true,
     useF16: useF16Activations,
     fallback: attentionInputDtype,
   });
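
A quick illustration of the predicate above (values are hypothetical, not taken from a shipped model preset): f16 activations only force an f32 projection when the RoPE geometry is partial or interleaved.

```js
// Partial rotary dim (rotaryDim < headDim) with f16 activations -> force f32 projections.
shouldForceF32AttentionProjectionForRoPE({
  attentionInputDtype: 'f16',
  headDim: 128,
  rotaryDim: 64,
}); // => true

// Full rotary dim, non-interleaved -> keep the f16 fast path.
shouldForceF32AttentionProjectionForRoPE({
  attentionInputDtype: 'f16',
  headDim: 128,
}); // => false
```
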
@@ -299,6 +334,7 @@ export async function projectAttentionQKV({
   matmulOutputDtype,
   getWeightBuffer,
   lora,
+  matmulDebug,
   releaseTemporary,
   onFusedQKV = null,
   attentionOutputGate = false,
@@ -309,7 +345,8 @@ export async function projectAttentionQKV({
   const hasLoRA = getLoRAModule(lora, layerIdx, 'q_proj')
     || getLoRAModule(lora, layerIdx, 'k_proj')
     || getLoRAModule(lora, layerIdx, 'v_proj');
-  const
+  const forceSplitQKV = Boolean(matmulDebug?.enabled) && matmulDebug?.forceSplitQKV === true;
+  const useFusedQKV = !forceSplitQKV && selectRuleValue('inference', 'attention', 'useFusedQkv', {
     hasQkvProj: Boolean(layerWeights.qkvProj),
     hasQkvSizes: Boolean(layerWeights.qkvSizes),
     hasLoRA: Boolean(hasLoRA),
@@ -326,6 +363,7 @@ export async function projectAttentionQKV({
       layerIdx,
       kernelPath,
       outputDtype: matmulOutputDtype,
+      matmulDebug,
     });
     const split = await runSplitForMode(qkvTensor, {
       numTokens,
@@ -364,6 +402,7 @@ export async function projectAttentionQKV({
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug,
     releaseTemporary,
     attentionOutputGate,
   }));
@@ -384,6 +423,7 @@ export async function projectAttentionQKV({
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug,
     releaseTemporary,
   });
 
@@ -403,6 +443,7 @@ export async function projectAttentionQKV({
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug,
     releaseTemporary,
   });
 
@@ -24,10 +24,12 @@ import { selectRuleValue } from '../../../../rules/rule-registry.js';
 import { SlidingWindowKVCache } from '../../../kv-cache.js';
 import {
   recordAttentionInputs,
+  shouldForceF32AttentionProjectionForRoPE,
   resolveAttentionProjectionOutputDtype,
   projectAttentionQKV,
   applyAttentionQKNorm,
 } from './projections.js';
+import { prepareAttentionProjectionInput } from './output-projection.js';
 
 import { releaseOrTrack, shouldDebugLayer } from './types.js';
 
@@ -142,7 +144,14 @@ export async function recordLayerAttentionGPU(
   }
 
   // 2. Q/K/V projections
-  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype
+  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+    forceF32: shouldForceF32AttentionProjectionForRoPE({
+      attentionInputDtype: desiredOutputDtype,
+      headDim,
+      rotaryDim: config.ropeRotaryDim,
+      interleaved: config.ropeInterleaved,
+    }),
+  });
   let usedFusedQKV = false;
   ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
     recorder,
@@ -158,6 +167,7 @@ export async function recordLayerAttentionGPU(
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
     attentionOutputGate: config.attentionOutputGate === true,
     releaseTemporary: (buffer) => releaseOrTrack(recorder, buffer),
     onFusedQKV: layerIdx === 0 && isPrefill
@@ -535,14 +545,14 @@ export async function recordLayerAttentionGPU(
   let oProjInput = attnForProjection;
   oProjInputTemp = null;
   if (layerWeights.oProj && getWeightBuffer) {
+    ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+      attnForProjection,
+      matmulOutputDtype,
+      (tensor) => recordCastF32ToF16(recorder, tensor)
+    ));
     const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
     const loraO = getLoRAModule(lora, layerIdx, 'o_proj');
 
-    if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
-      oProjInput = await recordCastF32ToF16(recorder, attnForProjection);
-      oProjInputTemp = oProjInput;
-    }
-
     // Use fused o_proj + residual for decode when possible
     // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
     const oProjDtype = getWeightDtype(oProjBuf);
@@ -28,10 +28,12 @@ import { runProbes } from '../probes.js';
 import { SlidingWindowKVCache } from '../../../kv-cache.js';
 import {
   recordAttentionInputs,
+  shouldForceF32AttentionProjectionForRoPE,
   resolveAttentionProjectionOutputDtype,
   projectAttentionQKV,
   applyAttentionQKNorm,
 } from './projections.js';
+import { prepareAttentionProjectionInput } from './output-projection.js';
 
 import {
   shouldDebugLayer,
@@ -164,6 +166,14 @@ export async function runLayerAttentionGPU(
       dtype: normed.dtype,
     });
   }
+
+  await runProbes('post_input_norm', normed.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize,
+    probes: state.debugProbes,
+    dtype: normed.dtype,
+  });
 }
 
 // Debug: Check normed input for L0 prefill
@@ -193,7 +203,14 @@ export async function runLayerAttentionGPU(
   }
 
   // 2. Q/K/V projections
-  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype
+  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+    forceF32: shouldForceF32AttentionProjectionForRoPE({
+      attentionInputDtype: desiredOutputDtype,
+      headDim,
+      rotaryDim: config.ropeRotaryDim,
+      interleaved: config.ropeInterleaved,
+    }),
+  });
   let usedFusedQKV = false;
   ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
     recorder: null,
@@ -209,6 +226,7 @@ export async function runLayerAttentionGPU(
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
     attentionOutputGate: config.attentionOutputGate === true,
     releaseTemporary: (buffer) => releaseBuffer(buffer),
     onFusedQKV: layerIdx === 0 && isPrefill
@@ -224,6 +242,27 @@ export async function runLayerAttentionGPU(
     await traceStep('matmul', `L${layerIdx}.k_proj`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
     await traceStep('matmul', `L${layerIdx}.v_proj`, layerIdx, vTensor.buffer, [numTokens, numKVHeads * headDim]);
   }
+  await runProbes('q_proj', qTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numHeads * headDim,
+    probes: state.debugProbes,
+    dtype: qTensor.dtype,
+  });
+  await runProbes('k_proj', kTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numKVHeads * headDim,
+    probes: state.debugProbes,
+    dtype: kTensor.dtype,
+  });
+  await runProbes('v_proj', vTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numKVHeads * headDim,
+    probes: state.debugProbes,
+    dtype: vTensor.dtype,
+  });
 
   // Kernel step debug: Q/K/V projections
   if (isKernelDebugEnabled(layerIdx)) {
@@ -331,6 +370,20 @@ export async function runLayerAttentionGPU(
       await traceStep('rope', `L${layerIdx}.k_rope`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
     }
   }
+  await runProbes('q_rope', qTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numHeads * headDim,
+    probes: state.debugProbes,
+    dtype: qTensor.dtype,
+  });
+  await runProbes('k_rope', kTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numKVHeads * headDim,
+    probes: state.debugProbes,
+    dtype: kTensor.dtype,
+  });
   if (isKernelDebugEnabled(layerIdx)) {
     logKernelStep('rope', { layerIdx, label: `startPos=${currentSeqLen}` });
     await dumpTokenVector(qTensor.buffer, 'Q_rope', {
@@ -723,14 +776,14 @@ export async function runLayerAttentionGPU(
   let oProjInput = attnForProjection;
   oProjInputTemp = null;
   if (layerWeights.oProj && getWeightBuffer) {
+    ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+      attnForProjection,
+      matmulOutputDtype,
+      castF32ToF16
+    ));
     const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
     const loraO = getLoRAModule(lora, layerIdx, 'o_proj');
 
-    if (matmulOutputDtype === 'f16' && attnOutput.dtype !== 'f16') {
-      oProjInput = await castF32ToF16(attnOutput);
-      oProjInputTemp = oProjInput;
-    }
-
     // Use fused o_proj + residual for decode when possible
     // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
     const oProjDtype = getWeightDtype(oProjBuf);
@@ -150,6 +150,7 @@ export interface ParsedModelConfig {
   ropeLocalTheta: number | null;
   ropeRotaryDim: number;
   ropeInterleaved: boolean;
+  mropeInterleaved: boolean;
   mropeSection: number[] | null;
   partialRotaryFactor: number | null;
   ropeScale: number;
@@ -349,6 +349,24 @@ function normalizeLayerTypeTag(value) {
   return null;
 }
 
+function resolveVisionConfig(rawConfig, manifest) {
+  const vc = rawConfig?.vision_config ?? manifest?.config?.vision_config;
+  if (!vc || typeof vc !== 'object') return null;
+  return {
+    depth: vc.depth ?? 24,
+    hiddenSize: vc.hidden_size ?? 1024,
+    intermediateSize: vc.intermediate_size ?? 4096,
+    numHeads: vc.num_heads ?? 16,
+    outHiddenSize: vc.out_hidden_size ?? vc.hidden_size ?? 1024,
+    patchSize: vc.patch_size ?? 16,
+    spatialMergeSize: vc.spatial_merge_size ?? 2,
+    temporalPatchSize: vc.temporal_patch_size ?? 2,
+    eps: vc.eps ?? 1e-6,
+    deepstackVisualIndexes: Array.isArray(vc.deepstack_visual_indexes) ? vc.deepstack_visual_indexes : [],
+    imageTokenId: rawConfig?.image_token_id ?? manifest?.image_token_id ?? null,
+  };
+}
+
 function parseCustomLayerTypes(layerTypes, numLayers, modelId) {
   if (!Array.isArray(layerTypes) || layerTypes.length === 0) {
     throw new Error(
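
For a sense of how the defaults above compose, here is a hypothetical vision_config (the field values are illustrative and not taken from a real manifest):

```js
// Hypothetical input; fields that are absent fall back to the defaults shown in the hunk above.
const visionConfig = resolveVisionConfig(
  {
    vision_config: { depth: 27, hidden_size: 1152, patch_size: 16, deepstack_visual_indexes: [8, 16, 24] },
    image_token_id: 151655,
  },
  null
);
// visionConfig.depth === 27, visionConfig.numHeads === 16 (default),
// visionConfig.outHiddenSize === 1152 (falls back to hidden_size),
// visionConfig.imageTokenId === 151655
```
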
@@ -482,6 +500,20 @@ export function toParsedConfigFromMerged(merged, manifest) {
   const queryPreAttnScalar = inf.attention.queryPreAttnScalar;
   const causalAttention = inf.attention.causal;
 
+  // Cross-field sanity: queryPreAttnScalar should typically equal headDim.
+  // A value of sqrt(headDim) indicates a known converter bug that produces
+  // attnScale = 1/sqrt(sqrt(headDim)) instead of the correct 1/sqrt(headDim).
+  if (queryPreAttnScalar != null && headDim != null
+    && queryPreAttnScalar !== headDim
+    && Math.abs(queryPreAttnScalar - Math.sqrt(headDim)) < 0.01) {
+    throw new Error(
+      `Model "${merged.modelId}": queryPreAttnScalar (${queryPreAttnScalar}) ` +
+      `equals sqrt(headDim) instead of headDim (${headDim}). ` +
+      `This is a known converter bug — the manifest must be regenerated ` +
+      `with the corrected converter.`
+    );
+  }
+
   // Get stop token IDs (cast to Manifest for compatibility)
   const stopTokenIds = getStopTokenIds(manifest);
 
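
The guard above is easiest to read with numbers (headDim = 256 is an assumed example, not a value from this package):

```js
// Worked example of the queryPreAttnScalar sanity check (headDim = 256 is illustrative).
const headDim = 256;
const correct = headDim;            // attnScale = 1 / sqrt(256) = 0.0625
const buggy = Math.sqrt(headDim);   // 16 -> attnScale = 1 / sqrt(16) = 0.25, i.e. 4x too large
Math.abs(buggy - Math.sqrt(headDim)) < 0.01; // true, so the guard throws for such a manifest
```
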
@@ -498,10 +530,18 @@ export function toParsedConfigFromMerged(merged, manifest) {
   // RoPE scaling - use manifest inference as source of truth (not raw config)
   const ropeScale = inf.rope.ropeScalingFactor;
   const ropeScalingType = inf.rope.ropeScalingType;
-  const ropeLocalScale = inf.rope.ropeLocalScalingFactor
-  const ropeLocalScalingType = inf.rope.ropeLocalScalingType
+  const ropeLocalScale = inf.rope.ropeLocalScalingFactor;
+  const ropeLocalScalingType = inf.rope.ropeLocalScalingType;
   const partialRotaryFactor = inf.rope.partialRotaryFactor;
-  const
+  const mropeInterleaved = inf.rope.mropeInterleaved === true;
+  const ropeInterleaved = false;
+
+  if (ropeLocalScale == null && (inf.rope.ropeLocalTheta != null || inf.rope.mropeSection != null)) {
+    throw new Error(
+      `Model "${merged.modelId}" uses hybrid/mRoPE but is missing rope.ropeLocalScalingFactor in manifest. ` +
+      `Re-convert the model using the latest converter or update the manifest to include an explicit scale.`
+    );
+  }
   const mropeSection = Array.isArray(inf.rope.mropeSection)
     ? inf.rope.mropeSection.map((entry) => Math.trunc(Number(entry)))
     : null;
@@ -511,7 +551,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
       `Manifest "${merged.modelId}" has invalid rope.mropeSection; expected positive integers.`
     );
   }
-  if (
+  if (mropeInterleaved && mropeSection) {
     const doubledMropeDim = mropeSection.reduce((sum, entry) => sum + entry, 0) * 2;
     if (doubledMropeDim !== ropeRotaryDim) {
       throw new Error(
@@ -596,6 +636,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
     ropeLocalTheta: inf.rope.ropeLocalTheta,
     ropeRotaryDim,
     ropeInterleaved,
+    mropeInterleaved,
     mropeSection,
     partialRotaryFactor,
     ropeScale,
@@ -636,6 +677,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
     chatTemplateType,
     chatTemplateEnabled,
     kernelPath: inf.defaultKernelPath,
+    visionConfig: resolveVisionConfig(config, manifest),
   };
 }
 
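
The mropeSection consistency check above can be read with a concrete example (the section values are assumptions for illustration, not taken from a shipped preset):

```js
// Assumed values: a three-way mRoPE section split.
const mropeSection = [16, 24, 24];
const doubledMropeDim = mropeSection.reduce((sum, entry) => sum + entry, 0) * 2; // 64 * 2 = 128
// When mropeInterleaved is set, the parser only accepts this if ropeRotaryDim === 128;
// any other rotary dim makes the check throw.
```
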
@@ -9,6 +9,7 @@ import { decodeReadback } from './debug-utils/index.js';
 import { createTensor } from '../../../gpu/tensor.js';
 import { castF32ToF16, recordCastF32ToF16 } from '../../../gpu/kernels/cast.js';
 import { isCpuWeightBuffer } from '../../../gpu/weight-buffer.js';
+import { f16ToF32 } from '../../../loader/dtype-utils.js';
 import { selectRuleValue } from '../../../rules/rule-registry.js';
 
 const scaleShaderCode = `
@@ -202,11 +203,19 @@ export async function embed(tokenIds, embedBuffer, config) {
 
   const dtype = selectRuleValue('inference', 'dtype', 'f16OrF32', { useF16 });
 
-
-
-
-
-
+  let cpuEmbeddings = null;
+  if (isCpuWeightBuffer(embedBuffer)) {
+    const bufDtype = embedBuffer.dtype;
+    if (bufDtype !== 'f32' && bufDtype !== 'f16') {
+      throw new Error(
+        `[Embed] CPU embedding buffer has unsupported dtype '${bufDtype}'; ` +
+        `only 'f32' and 'f16' are supported in the CPU gather path.`
+      );
+    }
+    cpuEmbeddings = embedBuffer.data;
+  } else if (embedBuffer instanceof Float32Array) {
+    cpuEmbeddings = embedBuffer;
+  }
 
   if (debug) {
     trace.embed(`tokens=${numTokens}, hidden=${hiddenSize}, vocab=${vocabSize}, scaleEmbeddings=${scaleEmbeddings}, transpose=${transpose}, indexOffset=${indexOffset}, activationDtype=${activationDtype}, useF16=${useF16}`);
@@ -226,18 +235,28 @@ export async function embed(tokenIds, embedBuffer, config) {
   }
 
   const output = new Float32Array(numTokens * hiddenSize);
+  // Check actual data type: loader's f16_to_f32 CPU path already decodes F16 into Float32Array,
+  // so dtype='f16' does not reliably indicate raw F16 bytes. Only Uint16Array needs per-element decoding.
+  const isF16Cpu = cpuEmbeddings instanceof Uint16Array;
   if (!transpose) {
     for (let t = 0; t < numTokens; t++) {
       const tokenId = (tokenIdArray)[t];
       const srcOffset = tokenId * hiddenSize;
-
+      if (isF16Cpu) {
+        for (let h = 0; h < hiddenSize; h++) {
+          output[t * hiddenSize + h] = f16ToF32(cpuEmbeddings[srcOffset + h]);
+        }
+      } else {
+        output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
+      }
     }
   } else {
     for (let t = 0; t < numTokens; t++) {
       const tokenId = (tokenIdArray)[t];
       const dstOffset = t * hiddenSize;
       for (let h = 0; h < hiddenSize; h++) {
-
+        const raw = cpuEmbeddings[h * vocabSize + tokenId];
+        output[dstOffset + h] = isF16Cpu ? f16ToF32(raw) : raw;
       }
     }
   }
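
The gather path above only decodes raw F16 when the CPU buffer is a Uint16Array. For reference, a typical IEEE 754 half-to-single decode, a sketch of what a helper such as f16ToF32 usually does and not the shipped implementation in loader/dtype-utils.js, looks like:

```js
// Reference f16 -> f32 decode (illustration only; the package ships its own helper).
function f16ToF32Ref(bits) {
  const sign = (bits & 0x8000) ? -1 : 1;
  const exp = (bits & 0x7c00) >> 10;
  const frac = bits & 0x03ff;
  if (exp === 0) return sign * frac * 2 ** -24;          // subnormal
  if (exp === 0x1f) return frac ? NaN : sign * Infinity; // inf / NaN
  return sign * (1 + frac / 1024) * 2 ** (exp - 15);     // normal
}
```
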
@@ -58,10 +58,11 @@ function resolveFallbackActivationDtype(primaryActivationDtype) {
 function resolveFallbackKernelPath(primaryKernelPath) {
   const primaryKernelPathId = primaryKernelPath?.id ?? null;
   if (!primaryKernelPathId) {
-
-
-
-
+    return {
+      kernelPath: null,
+      kernelPathId: null,
+      kernelPathSource: 'none',
+    };
   }
 
   const explicitFallbackKernelPathId = typeof primaryKernelPath?.finitenessFallbackKernelPathId === 'string'
@@ -1,7 +1,7 @@
 import { selectRuleValue } from '../../../rules/rule-registry.js';
 import { cloneJson, isPhaseMatch, normalizeDtype, requireSessionActivationDtype, stepHasLayer } from './execution-v0-contract-helpers.js';
 
-const PIPELINE_COMPATIBLE_OPS = new Set([
+export const PIPELINE_COMPATIBLE_OPS = new Set([
   'save',
   'load',
   'conv',
@@ -191,8 +191,15 @@ export function buildLayerPipelineFromExecution(steps) {
   if (layerSectionSteps.length === 0) {
     return null;
   }
-
-
+  const incompatibleOps = [
+    ...new Set(
+      layerSectionSteps
+        .filter((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))
+        .map((step) => step.op)
+    ),
+  ];
+  if (incompatibleOps.length > 0) {
+    return { incompatibleOps };
   }
 
   const layerSteps = layerSectionSteps
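
Applying the dedup-and-filter logic above to a hypothetical step list (the op name 'scan_state' is an assumption; only 'save', 'load', and 'conv' are confirmed members of PIPELINE_COMPATIBLE_OPS in this diff):

```js
// Hypothetical layer steps; any op outside PIPELINE_COMPATIBLE_OPS is reported once.
const layerSectionSteps = [{ op: 'load' }, { op: 'scan_state' }, { op: 'scan_state' }, { op: 'conv' }];
const incompatibleOps = [
  ...new Set(layerSectionSteps.filter((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op)).map((step) => step.op)),
];
// -> ['scan_state'] (assuming it is not in the set); buildLayerPipelineFromExecution then returns
//    { incompatibleOps }, and compileExecutionV0 throws unless an inline kernelPath covers execution.
```
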
@@ -31,6 +31,7 @@ import {
   buildModelRuntimeOverrides,
   buildSessionRuntimePatch,
   resolveFinitenessFallbackKernelPathId,
+  PIPELINE_COMPATIBLE_OPS,
 } from './execution-v0-runtime-builders.js';
 
 export function hasExecutionV0(manifestInference) {
@@ -152,7 +153,17 @@ export function compileExecutionV0(options = {}) {
     numLayers,
     finitenessFallbackKernelPathId
   );
-  const
+  const layerPipelineResult = buildLayerPipelineFromExecution(resolvedSteps);
+  if (layerPipelineResult?.incompatibleOps && !kernelPath) {
+    throw new Error(
+      `[ExecutionV0] manifest.inference.execution.steps contains layer ops that are not ` +
+      `compatible with the JS layer pipeline and no inline kernelPath was built to cover execution. ` +
+      `Unsupported ops: ${layerPipelineResult.incompatibleOps.join(', ')}. ` +
+      `Either add explicit kernel references to each step (for inline-kernel execution) ` +
+      `or restrict layer ops to: ${[...PIPELINE_COMPATIBLE_OPS].join(', ')}.`
+    );
+  }
+  const layerPipeline = layerPipelineResult?.incompatibleOps ? null : layerPipelineResult;
   const sessionPatch = buildSessionRuntimePatch(resolvedSession);
   const modelOverrides = buildModelRuntimeOverrides(manifestInference);
   for (const [path, source] of sessionSourceByPath.entries()) {
@@ -111,6 +111,7 @@ export function buildLayerContext(state, recorder, isDecodeMode, debugLayers, de
   ropeLocalCos: state.ropeLocalCos,
   ropeLocalSin: state.ropeLocalSin,
   linearAttentionRuntime: state.linearAttentionRuntime,
+  convLayerStates: state.convLayerStates,
   weightConfig: getWeightBufferConfig(state),
   debugFlags: state.debugFlags,
   debugProbes: state.runtimeConfig.shared.debug.probes,
|