@simulatte/doppler 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/package.json +21 -36
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-registry.json +1 -17
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +3 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +14 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +2 -0
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/storage.schema.js +1 -1
- package/src/converter/conversion-plan.js +10 -2
- package/src/converter/core.js +2 -0
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.js +19 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +6 -1
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/matmul-selection.js +47 -4
- package/src/gpu/kernels/matmul.d.ts +2 -0
- package/src/gpu/kernels/matmul.js +1 -1
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
- package/src/inference/pipelines/text/attention/projections.js +41 -11
- package/src/inference/pipelines/text/attention/record.js +15 -6
- package/src/inference/pipelines/text/attention/run.js +50 -6
- package/src/inference/pipelines/text/config.js +14 -0
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/generator-runtime.js +5 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +6 -0
- package/src/inference/pipelines/text/generator-steps.js +43 -15
- package/src/inference/pipelines/text/generator.js +50 -17
- package/src/inference/pipelines/text/init.d.ts +13 -0
- package/src/inference/pipelines/text/init.js +16 -5
- package/src/inference/pipelines/text/layer.js +1 -0
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/test-harness.js +2 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.js +6 -1
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.js +2 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +3 -0
- package/src/tooling/node-source-runtime.js +36 -0
- package/src/types/model.d.ts +5 -0
- package/tools/doppler-cli.js +6 -1
package/src/gpu/kernels/split_qg_f16.wgsl
ADDED
@@ -0,0 +1,62 @@
+// AUTO-GENERATED from src/gpu/kernels/split_qg.wgsl.
+// Edit the source kernel and tools/configs/wgsl-variants.js, then run `npm run kernels:generate`.
+// split_qg_f16.wgsl
+
+/**
+ * De-interleave Q and Gate projections from q_proj output for attentionOutputGate models (f16).
+ *
+ * Models like Qwen 3.5 store q_proj weights with interleaved head layout:
+ *   rows [h*headDim*2 : h*headDim*2+headDim] = Q for head h
+ *   rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
+ *
+ * A single full matmul over all 2*qSize rows produces interleaved output:
+ *   input[token, h*headDim*2 : h*headDim*2+headDim] = Q head h
+ *   input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
+ *
+ * This kernel separates them into contiguous Q and G outputs:
+ *   Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
+ *   G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
+ *
+ * Input layout (row-major): [numTokens, numHeads * headDim * 2]
+ * Output Q layout (row-major): [numTokens, numHeads * headDim]
+ * Output G layout (row-major): [numTokens, numHeads * headDim]
+ */
+
+enable f16;
+
+struct Params {
+  num_tokens: u32,
+  num_heads: u32,
+  head_dim: u32,
+  _pad: u32,
+}
+
+override WORKGROUP_SIZE: u32 = 256u;
+
+@group(0) @binding(0) var<uniform> params: Params;
+@group(0) @binding(1) var<storage, read> input: array<f16>;
+@group(0) @binding(2) var<storage, read_write> Q: array<f16>;
+@group(0) @binding(3) var<storage, read_write> G: array<f16>;
+
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+  let idx = gid.x;
+  let q_size = params.num_heads * params.head_dim;
+  let total_elements = params.num_tokens * q_size;
+
+  if (idx >= total_elements) {
+    return;
+  }
+
+  let token = idx / q_size;
+  let elem = idx % q_size;
+  let head = elem / params.head_dim;
+  let dim = elem % params.head_dim;
+
+  // Input is interleaved per head: [Q_h (headDim elems), G_h (headDim elems)]
+  let src_q = token * (q_size * 2u) + head * (params.head_dim * 2u) + dim;
+  let src_g = src_q + params.head_dim;
+
+  Q[idx] = input[src_q];
+  G[idx] = input[src_g];
+}
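Note: the sketch below is an illustrative CPU reference of the de-interleave this kernel performs, useful for checking the index arithmetic on small shapes. It is not part of the package; the function name and typed-array choice are assumptions.

```js
// De-interleave a [numTokens, numHeads*headDim*2] matmul output into contiguous
// Q and G, mirroring split_qg_f16.wgsl's src_q / src_g arithmetic on the CPU.
function splitQGReference(input, numTokens, numHeads, headDim) {
  const qSize = numHeads * headDim;
  const Q = new Float32Array(numTokens * qSize);
  const G = new Float32Array(numTokens * qSize);
  for (let token = 0; token < numTokens; token++) {
    for (let head = 0; head < numHeads; head++) {
      for (let dim = 0; dim < headDim; dim++) {
        const dst = token * qSize + head * headDim + dim;
        const srcQ = token * qSize * 2 + head * headDim * 2 + dim;
        Q[dst] = input[srcQ];            // Q half of the head's 2*headDim block
        G[dst] = input[srcQ + headDim];  // gate half immediately after it
      }
    }
  }
  return { Q, G };
}
```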
package/src/gpu/weight-buffer.d.ts
CHANGED
@@ -110,6 +110,6 @@ export function getBuffer(weight: GPUBuffer | WeightBuffer | TensorLike): GPUBuf
 export function getLayout(weight: GPUBuffer | WeightBuffer | TensorLike): WeightLayout | null;

 /**
- * Get dtype from WeightBuffer,
+ * Get dtype from WeightBuffer, tagged raw GPUBuffer, or TensorLike.
  */
 export function getWeightDtype(weight: GPUBuffer | WeightBuffer | TensorLike): WeightDtype | TensorLike['dtype'] | null;
package/src/gpu/weight-buffer.js
CHANGED
package/src/inference/browser-harness.d.ts
CHANGED
@@ -9,6 +9,7 @@ import type { InferencePipeline } from './pipelines/text.js';
 import type { DiffusionPipeline } from './pipelines/diffusion/pipeline.js';
 import type { EnergyPipeline } from './pipelines/energy/pipeline.js';
 import type { SavedReportInfo, SaveReportOptions } from '../storage/reports.js';
+import type { DebugSnapshot } from '../debug/history.js';

 export interface BrowserHarnessOptions extends InferenceHarnessOptions {
   modelUrl: string;
@@ -143,6 +144,7 @@ export interface BrowserSuiteResult extends SuiteSummary {
   output?: string | DiffusionOutput | null;
   deviceInfo?: Record<string, unknown> | null;
   memoryStats?: ReturnType<InferencePipeline['getMemoryStats']> | null;
+  debugSnapshot?: DebugSnapshot | null;
   pipeline?: InferencePipeline | DiffusionPipeline | EnergyPipeline | null;
   report: Record<string, unknown>;
   reportInfo: SavedReportInfo;
package/src/inference/browser-harness.js
CHANGED
@@ -2,6 +2,7 @@
 import { initializeInference } from './test-harness.js';
 import { saveReport } from '../storage/reports.js';
 import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
+import { clearLogHistory, getDebugSnapshot } from '../debug/history.js';
 import { computeSampleStats } from '../debug/stats.js';
 import {
   setActiveKernelPath,
@@ -846,15 +847,32 @@ async function dispatchBrowserSuite(suite, options) {
   return null;
 }

+function shouldCaptureDebugSnapshot(suite, runtimeConfig) {
+  const debug = runtimeConfig?.shared?.debug ?? {};
+  const logLevel = String(debug.logLevel?.defaultLogLevel ?? '').toLowerCase();
+  return suite === 'debug'
+    || debug.trace?.enabled === true
+    || debug.pipeline?.enabled === true
+    || (Array.isArray(debug.probes) && debug.probes.length > 0)
+    || debug.profiler?.enabled === true
+    || logLevel === 'debug'
+    || logLevel === 'verbose';
+}
+
 export async function runBrowserSuite(options = {}) {
   return runWithRuntimeIsolationForSuite(async () => {
     const suiteTimestamp = resolveReportTimestamp(options.timestamp, 'runBrowserSuite timestamp');
     const suiteContext = resolveSuiteContext(options);
     const suite = normalizeSuite(options.suite, suiteContext);
+    const captureDebugSnapshot = shouldCaptureDebugSnapshot(suite, getRuntimeConfig());
+    if (captureDebugSnapshot) {
+      clearLogHistory();
+    }
     const suiteResult = await dispatchBrowserSuite(suite, options);
     if (!suiteResult) {
       throw createUnsupportedSuiteError(suite, suiteContext);
     }
+    const debugSnapshot = captureDebugSnapshot ? getDebugSnapshot() : null;

     if (suite === 'bench' && suiteResult?.metrics?.workloadType === 'training') {
       const trainingReport = suiteResult?.metrics?.trainingMetricsReport;
@@ -886,6 +904,7 @@ export async function runBrowserSuite(options = {}) {
       metrics: suiteResult.metrics ?? null,
       output: reportOutput,
       memory: suiteResult.memoryStats ?? null,
+      debugSnapshot,
       ...options.report,
     };
     if (ulArtifacts.length > 0 || distillArtifacts.length > 0 || checkpointResumeTimeline.length > 0) {
@@ -907,7 +926,7 @@ export async function runBrowserSuite(options = {}) {
       report.timestamp = suiteTimestamp;
     }
     const reportInfo = await saveReport(modelId, report, { timestamp: report.timestamp });
-    return { ...suiteResult, report, reportInfo };
+    return { ...suiteResult, debugSnapshot, report, reportInfo };
   });
 }
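Note: a minimal usage sketch of the new snapshot plumbing, assuming only what the diff shows (the `suite: 'debug'` trigger and the `debugSnapshot` field on the suite result); the logging call is illustrative.

```js
// Running the debug suite now clears log history up front and returns the
// captured snapshot alongside the saved report.
const result = await runBrowserSuite({ suite: 'debug' });
if (result.debugSnapshot) {
  console.log('captured debug snapshot for report', result.reportInfo);
}
```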
package/src/inference/pipelines/diffusion/helpers.js
CHANGED
@@ -89,6 +89,9 @@ export function normalizeDiffusionMatmulLocationDtype(dtype) {
   return normalized;
 }

+// Artifact-derived dtype inference: determines actual storage dtype from buffer byte size.
+// This is NOT a config-bypass — it reads physical buffer dimensions (artifact-derived config),
+// which is a valid merge layer per the config merge contract.
 export function inferDiffusionMatmulDtypeFromBuffer(weight, N, K, preferred) {
   const buffer = getBuffer(weight);
   if (!buffer || !Number.isFinite(N) || !Number.isFinite(K)) return preferred;
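Note: a sketch of the byte-size reasoning the new comment describes, under the assumption that an N x K weight stored as f16 occupies N*K*2 bytes and as f32 N*K*4 bytes; the helper name is hypothetical, not the package's API.

```js
// Infer storage dtype from physical buffer size: the artifact itself, not the
// configured preference, determines how the bytes must be interpreted.
function guessDtypeFromByteSize(byteSize, N, K, preferred) {
  if (byteSize === N * K * 2) return 'f16';
  if (byteSize === N * K * 4) return 'f32';
  return preferred; // quantized or padded layouts fall through to the preference
}
```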
package/src/inference/pipelines/diffusion/text-encoder-gpu.js
CHANGED
@@ -45,6 +45,8 @@ import { processLayerGPU } from '../text/layer.js';

 const QUICK_GELU_ALPHA = 1.702;
 const SUPPORTED_CLIP_HIDDEN_ACTIVATIONS = new Set(['gelu', 'quick_gelu']);
+// Standard CLIP hidden activation per OpenAI CLIP specification.
+const DEFAULT_CLIP_HIDDEN_ACT = 'gelu';

 function padTokens(tokens, maxLength, padTokenId) {
   if (!Number.isFinite(maxLength) || maxLength <= 0) {
@@ -100,11 +102,15 @@ function createVectorTensor(device, data, dtype, label) {
   return createTensor(buffer, dtype, [1, length], label);
 }

+// Conservative fallback dtype for diffusion bias tensors when no dtype
+// metadata is available. F32 avoids precision loss in bias additions.
+const DEFAULT_BIAS_DTYPE = 'f32';
+
 function resolveBiasDtype(weight, weightsEntry, key) {
   if (weight && weight.dtype) return weight.dtype;
   const locationDtype = weightsEntry?.dtypes?.get(key);
   const mapped = normalizeDiffusionLocationDtype(locationDtype);
-  return mapped ||
+  return mapped || DEFAULT_BIAS_DTYPE;
 }

 function createBiasTensorWithDtype(weight, weightsEntry, key, size, label) {
@@ -145,7 +151,7 @@ function createKernelOps(recorder) {
 }

 function resolveClipHiddenActivation(config) {
-  const hiddenAct = config?.hidden_act ??
+  const hiddenAct = config?.hidden_act ?? DEFAULT_CLIP_HIDDEN_ACT;
   if (!SUPPORTED_CLIP_HIDDEN_ACTIVATIONS.has(hiddenAct)) {
     throw new Error(
       `Unsupported CLIP hidden_act "${hiddenAct}". ` +
package/src/inference/pipelines/text/attention/output-projection.d.ts
ADDED
@@ -0,0 +1,12 @@
+import type { Tensor } from '../../../../gpu/tensor.js';
+
+export interface AttentionProjectionInputResult {
+  oProjInput: Tensor;
+  oProjInputTemp: Tensor | null;
+}
+
+export function prepareAttentionProjectionInput(
+  attnForProjection: Tensor,
+  matmulOutputDtype: string,
+  castToF16: (tensor: Tensor) => Promise<Tensor>
+): Promise<AttentionProjectionInputResult>;
package/src/inference/pipelines/text/attention/output-projection.js
ADDED
@@ -0,0 +1,8 @@
+export async function prepareAttentionProjectionInput(attnForProjection, matmulOutputDtype, castToF16) {
+  if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
+    const casted = await castToF16(attnForProjection);
+    return { oProjInput: casted, oProjInputTemp: casted };
+  }
+
+  return { oProjInput: attnForProjection, oProjInputTemp: null };
+}
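Note: an illustrative call pattern for the extracted helper, matching the record.js/run.js call sites later in this diff; `releaseTemporary` stands in for whatever cleanup path the caller uses and is an assumption here.

```js
// Either passes the tensor through unchanged or returns a casted f16 temp
// that the caller owns and must release after the o_proj matmul.
const { oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
  attnForProjection,
  matmulOutputDtype,
  castF32ToF16
);
// ... run the o_proj matmul on oProjInput ...
if (oProjInputTemp) releaseTemporary(oProjInputTemp.buffer);
```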
package/src/inference/pipelines/text/attention/projections.d.ts
CHANGED
@@ -46,7 +46,16 @@ export function recordAttentionInputs(
   info: AttentionInputInfo | null | undefined
 ): void;

-export function
+export function shouldForceF32AttentionProjectionForRoPE(options: {
+  attentionInputDtype: string;
+  headDim: number;
+  rotaryDim?: number;
+  interleaved?: boolean;
+}): boolean;
+export function resolveAttentionProjectionOutputDtype(
+  attentionInputDtype: string,
+  options?: { forceF32?: boolean }
+): 'f16' | 'f32' | string;
 export function resolveProjectionSliceOffsetBytes(
   weightBuffer: WeightBuffer | Tensor | GPUBuffer | null | undefined,
   outputRows: number,
package/src/inference/pipelines/text/attention/projections.js
CHANGED
@@ -5,6 +5,8 @@ import {
   recordMatmul,
   runSplitQKV,
   recordSplitQKV,
+  runSplitQG,
+  recordSplitQG,
   runRMSNorm,
   recordRMSNorm,
 } from '../../../../gpu/kernel-selector.js';
@@ -28,6 +30,13 @@ function getSplitRunner(recorder) {
   return (qkvTensor, options) => recordSplitQKV(recorder, qkvTensor, options);
 }

+function getSplitQGRunner(recorder) {
+  if (!recorder) {
+    return (qgTensor, options) => runSplitQG(qgTensor, options);
+  }
+  return (qgTensor, options) => recordSplitQG(recorder, qgTensor, options);
+}
+
 function getRmsNormRunner(recorder) {
   if (!recorder) {
     return (input, weight, eps, options) => runRMSNorm(input, weight, eps, options);
@@ -201,13 +210,17 @@ async function projectQueryWithOptionalGate({
     return { qTensor, qGateTensor: null };
   }

+  // q_proj weights are stored with interleaved head layout: for head h,
+  // rows [h*headDim*2 : h*headDim*2+headDim] = Q, rows [h*headDim*2+headDim : (h+1)*headDim*2] = gate.
+  // Compute the full 2*qSize matmul, then de-interleave into separate Q and gate tensors.
   const runMatmulForMode = getMatmulRunner(recorder);
+  const runSplitQGForMode = getSplitQGRunner(recorder);
   const qWeightBuffer = getWeightBuffer(qWeight, 'q_proj');
-
+  let fullQGTensor = null;
   let qTensor = null;
   let qGateTensor = null;
   try {
-
+    fullQGTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize * 2, hiddenSize, {
       transposeB: 'auto',
       role: 'q_proj',
       layerIdx,
@@ -215,15 +228,19 @@ async function projectQueryWithOptionalGate({
       outputDtype: matmulOutputDtype,
     });

-
-
-
-
-      kernelPath,
-      bOffset: gateOffset,
-      outputDtype: matmulOutputDtype,
+    const split = await runSplitQGForMode(fullQGTensor, {
+      numTokens,
+      numHeads,
+      headDim,
     });
+    releaseTemporary(fullQGTensor.buffer);
+    fullQGTensor = null;
+    qTensor = split.Q;
+    qGateTensor = split.G;
   } catch (error) {
+    if (fullQGTensor) {
+      releaseTemporary(fullQGTensor.buffer);
+    }
     if (qTensor) {
       releaseTemporary(qTensor.buffer);
     }
@@ -277,9 +294,22 @@ export function recordAttentionInputs(state, info) {
   state.stats.attentionInputs.push(info);
 }

-export function
+export function shouldForceF32AttentionProjectionForRoPE({
+  attentionInputDtype,
+  headDim,
+  rotaryDim = headDim,
+  interleaved = false,
+}) {
+  return attentionInputDtype === 'f16'
+    && Number.isFinite(headDim)
+    && Number.isFinite(rotaryDim)
+    && (rotaryDim !== headDim || interleaved === true);
+}
+
+export function resolveAttentionProjectionOutputDtype(attentionInputDtype, options = {}) {
   const useF16Activations = attentionInputDtype === 'f16';
-  return selectRuleValue('
+  return selectRuleValue('inference', 'dtype', 'attentionProjectionOutputDtype', {
+    forceF32: options.forceF32 === true,
     useF16: useF16Activations,
     fallback: attentionInputDtype,
   });
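Note: a toy worked example of the interleaved layout the new comments describe (numHeads = 2, headDim = 3, values invented for illustration).

```js
// One q_proj output row of width 2*qSize = 12 holds
//   [Q0 Q0 Q0 | G0 G0 G0 | Q1 Q1 Q1 | G1 G1 G1]
// so the split gathers Q from columns [0,1,2,6,7,8] and G from [3,4,5,9,10,11].
const headDim = 3;
const head = 1, dim = 2;                // element Q[head=1][dim=2]
const srcQ = head * headDim * 2 + dim;  // = 8, the third Q1 column above
const srcG = srcQ + headDim;            // = 11, its paired gate column
```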
package/src/inference/pipelines/text/attention/record.js
CHANGED
@@ -24,10 +24,12 @@ import { selectRuleValue } from '../../../../rules/rule-registry.js';
 import { SlidingWindowKVCache } from '../../../kv-cache.js';
 import {
   recordAttentionInputs,
+  shouldForceF32AttentionProjectionForRoPE,
   resolveAttentionProjectionOutputDtype,
   projectAttentionQKV,
   applyAttentionQKNorm,
 } from './projections.js';
+import { prepareAttentionProjectionInput } from './output-projection.js';

 import { releaseOrTrack, shouldDebugLayer } from './types.js';

@@ -142,7 +144,14 @@ export async function recordLayerAttentionGPU(
   }

   // 2. Q/K/V projections
-  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype
+  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+    forceF32: shouldForceF32AttentionProjectionForRoPE({
+      attentionInputDtype: desiredOutputDtype,
+      headDim,
+      rotaryDim: config.ropeRotaryDim,
+      interleaved: config.ropeInterleaved,
+    }),
+  });
   let usedFusedQKV = false;
   ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
     recorder,
@@ -535,14 +544,14 @@ export async function recordLayerAttentionGPU(
   let oProjInput = attnForProjection;
   oProjInputTemp = null;
   if (layerWeights.oProj && getWeightBuffer) {
+    ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+      attnForProjection,
+      matmulOutputDtype,
+      (tensor) => recordCastF32ToF16(recorder, tensor)
+    ));
     const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
     const loraO = getLoRAModule(lora, layerIdx, 'o_proj');

-    if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
-      oProjInput = await recordCastF32ToF16(recorder, attnForProjection);
-      oProjInputTemp = oProjInput;
-    }
-
     // Use fused o_proj + residual for decode when possible
     // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
     const oProjDtype = getWeightDtype(oProjBuf);
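Note: an illustrative decision table for the new RoPE guard, derived directly from the predicate in projections.js above; the numbers are examples, not package defaults.

```js
// f16 projections are kept only for full-dimension, non-interleaved RoPE;
// partial rotary dims or interleaved rotation force f32 projections.
shouldForceF32AttentionProjectionForRoPE({
  attentionInputDtype: 'f16', headDim: 128, rotaryDim: 128,
}); // => false (full-dim, non-interleaved)
shouldForceF32AttentionProjectionForRoPE({
  attentionInputDtype: 'f16', headDim: 128, rotaryDim: 64,
}); // => true (partial rotary dim)
shouldForceF32AttentionProjectionForRoPE({
  attentionInputDtype: 'f32', headDim: 128, rotaryDim: 64,
}); // => false (already f32)
```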
package/src/inference/pipelines/text/attention/run.js
CHANGED
@@ -28,10 +28,12 @@ import { runProbes } from '../probes.js';
 import { SlidingWindowKVCache } from '../../../kv-cache.js';
 import {
   recordAttentionInputs,
+  shouldForceF32AttentionProjectionForRoPE,
   resolveAttentionProjectionOutputDtype,
   projectAttentionQKV,
   applyAttentionQKNorm,
 } from './projections.js';
+import { prepareAttentionProjectionInput } from './output-projection.js';

 import {
   shouldDebugLayer,
@@ -193,7 +195,14 @@ export async function runLayerAttentionGPU(
   }

   // 2. Q/K/V projections
-  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype
+  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+    forceF32: shouldForceF32AttentionProjectionForRoPE({
+      attentionInputDtype: desiredOutputDtype,
+      headDim,
+      rotaryDim: config.ropeRotaryDim,
+      interleaved: config.ropeInterleaved,
+    }),
+  });
   let usedFusedQKV = false;
   ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
     recorder: null,
@@ -224,6 +233,27 @@ export async function runLayerAttentionGPU(
     await traceStep('matmul', `L${layerIdx}.k_proj`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
     await traceStep('matmul', `L${layerIdx}.v_proj`, layerIdx, vTensor.buffer, [numTokens, numKVHeads * headDim]);
   }
+  await runProbes('q_proj', qTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numHeads * headDim,
+    probes: state.debugProbes,
+    dtype: qTensor.dtype,
+  });
+  await runProbes('k_proj', kTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numKVHeads * headDim,
+    probes: state.debugProbes,
+    dtype: kTensor.dtype,
+  });
+  await runProbes('v_proj', vTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numKVHeads * headDim,
+    probes: state.debugProbes,
+    dtype: vTensor.dtype,
+  });

   // Kernel step debug: Q/K/V projections
   if (isKernelDebugEnabled(layerIdx)) {
@@ -331,6 +361,20 @@ export async function runLayerAttentionGPU(
       await traceStep('rope', `L${layerIdx}.k_rope`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
     }
   }
+  await runProbes('q_rope', qTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numHeads * headDim,
+    probes: state.debugProbes,
+    dtype: qTensor.dtype,
+  });
+  await runProbes('k_rope', kTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numKVHeads * headDim,
+    probes: state.debugProbes,
+    dtype: kTensor.dtype,
+  });
   if (isKernelDebugEnabled(layerIdx)) {
     logKernelStep('rope', { layerIdx, label: `startPos=${currentSeqLen}` });
     await dumpTokenVector(qTensor.buffer, 'Q_rope', {
@@ -723,14 +767,14 @@ export async function runLayerAttentionGPU(
   let oProjInput = attnForProjection;
   oProjInputTemp = null;
   if (layerWeights.oProj && getWeightBuffer) {
+    ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+      attnForProjection,
+      matmulOutputDtype,
+      castF32ToF16
+    ));
     const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
     const loraO = getLoRAModule(lora, layerIdx, 'o_proj');

-    if (matmulOutputDtype === 'f16' && attnOutput.dtype !== 'f16') {
-      oProjInput = await castF32ToF16(attnOutput);
-      oProjInputTemp = oProjInput;
-    }
-
     // Use fused o_proj + residual for decode when possible
     // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
     const oProjDtype = getWeightDtype(oProjBuf);
package/src/inference/pipelines/text/config.js
CHANGED
@@ -482,6 +482,20 @@ export function toParsedConfigFromMerged(merged, manifest) {
   const queryPreAttnScalar = inf.attention.queryPreAttnScalar;
   const causalAttention = inf.attention.causal;

+  // Cross-field sanity: queryPreAttnScalar should typically equal headDim.
+  // A value of sqrt(headDim) indicates a known converter bug that produces
+  // attnScale = 1/sqrt(sqrt(headDim)) instead of the correct 1/sqrt(headDim).
+  if (queryPreAttnScalar != null && headDim != null
+      && queryPreAttnScalar !== headDim
+      && Math.abs(queryPreAttnScalar - Math.sqrt(headDim)) < 0.01) {
+    throw new Error(
+      `Model "${merged.modelId}": queryPreAttnScalar (${queryPreAttnScalar}) ` +
+      `equals sqrt(headDim) instead of headDim (${headDim}). ` +
+      `This is a known converter bug — the manifest must be regenerated ` +
+      `with the corrected converter.`
+    );
+  }
+
   // Get stop token IDs (cast to Manifest for compatibility)
   const stopTokenIds = getStopTokenIds(manifest);
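Note: the arithmetic behind the new sanity check, with headDim = 256 as an illustrative value.

```js
// Correct: queryPreAttnScalar = headDim, so attnScale = 1/sqrt(256) = 0.0625.
// Buggy converter: queryPreAttnScalar = sqrt(256) = 16, giving attnScale =
// 1/sqrt(16) = 0.25, a 4x over-scaling of attention logits.
const headDim = 256;
const correctScale = 1 / Math.sqrt(headDim);          // 0.0625
const buggyScale = 1 / Math.sqrt(Math.sqrt(headDim)); // 0.25
```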
package/src/inference/pipelines/text/execution-plan.js
CHANGED
@@ -58,10 +58,11 @@ function resolveFallbackActivationDtype(primaryActivationDtype) {
 function resolveFallbackKernelPath(primaryKernelPath) {
   const primaryKernelPathId = primaryKernelPath?.id ?? null;
   if (!primaryKernelPathId) {
-
-
-
-
+    return {
+      kernelPath: null,
+      kernelPathId: null,
+      kernelPathSource: 'none',
+    };
   }

   const explicitFallbackKernelPathId = typeof primaryKernelPath?.finitenessFallbackKernelPathId === 'string'
package/src/inference/pipelines/text/generator-runtime.js
CHANGED
@@ -213,6 +213,10 @@ export function resolvePrefillEmbeddingOptions(state, options = {}) {
     ? state.manifest.modelType.toLowerCase()
     : '';
   const generationDefaults = state.runtimeConfig.inference.generation;
+  // Embedding models default to 'mean' pooling — this is a model-category behavior,
+  // not a model-family identity check. Ideally embedding model presets would set
+  // generation.embeddingMode='mean' in their runtime config; the modelType fallback
+  // provides this default for manifests that predate runtime-preset embedding mode.
   const defaultEmbeddingMode = modelType === 'embedding'
     ? 'mean'
     : generationDefaults.embeddingMode;
@@ -226,6 +230,7 @@ export function resolveAdvanceEmbeddingMode(state, options = {}) {
   const modelType = typeof state.manifest?.modelType === 'string'
     ? state.manifest.modelType.toLowerCase()
     : '';
+  // See resolvePrefillEmbeddingOptions for embedding-model pooling rationale.
   const configuredMode = state.runtimeConfig.inference.generation.embeddingMode;
   return resolveConfiguredValue(
     options.embeddingMode,
package/src/inference/pipelines/text/generator-steps.d.ts
CHANGED
@@ -19,6 +19,12 @@ export declare function resolveBatchStop(
   eosTokenId: number | undefined | null
 ): number;

+export declare function findInvalidGeneratedToken(
+  tokens: number[],
+  vocabSize: number,
+  padTokenId?: number | null
+): { index: number; tokenId: number } | null;
+
 export interface SampledTokenStagingBuffer {
   mapAsync(mode: number): Promise<void>;
   getMappedRange(): ArrayBufferLike;
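Note: a usage sketch for the new validity scan, with made-up token values; behavior follows the implementation below (token 0 counts as invalid when no padTokenId is configured).

```js
const bad = findInvalidGeneratedToken([15, 42, -1, 7], 32000, null);
// => { index: 2, tokenId: -1 }
const ok = findInvalidGeneratedToken([15, 42, 7], 32000, null);
// => null
```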
package/src/inference/pipelines/text/generator-steps.js
CHANGED
@@ -113,6 +113,20 @@ export function resolveBatchStop(tokens, stopFlags, stopTokenIds, eosTokenId) {
   return actualCount;
 }

+export function findInvalidGeneratedToken(tokens, vocabSize, padTokenId = null) {
+  for (let i = 0; i < tokens.length; i++) {
+    const tokenId = tokens[i];
+    const isInvalid = !Number.isFinite(tokenId)
+      || tokenId < 0
+      || tokenId >= vocabSize
+      || (padTokenId != null ? tokenId === padTokenId : tokenId === 0);
+    if (isInvalid) {
+      return { index: i, tokenId };
+    }
+  }
+  return null;
+}
+
 export async function readSampledTokenFromStagingBuffer(stagingBuffer, options = {}) {
   const ownsStagingBuffer = options.ownsStagingBuffer === true;
   const hasFinitenessBuffer = options.hasFinitenessBuffer === true;
@@ -240,11 +254,9 @@ async function runDecodeLayers(state, tokenId, opts, helpers) {
     throw new Error('Embed buffer not found or not a supported buffer type');
   }
   const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
-  const embedDtype =
-    ?
-    :
-      ? embedBufferRaw.dtype
-      : null;
+  const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+    ? embedBufferRaw.dtype
+    : getWeightDtype(embedBufferRaw);
   const activationDtype = getEffectiveActivationDtype(state, opts);

   const embedTensor = await embed([tokenId], embedBuffer, {
@@ -326,11 +338,9 @@ export async function decodeStep(state, currentIds, opts, helpers) {
     throw new Error('Embed buffer not found or not a supported buffer type');
   }
   const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
-  const embedDtype =
-    ?
-    :
-      ? embedBufferRaw.dtype
-      : null;
+  const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+    ? embedBufferRaw.dtype
+    : getWeightDtype(embedBufferRaw);
   const activationDtype = getEffectiveActivationDtype(state, opts);
   const activationBytes = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: activationDtype });

@@ -636,11 +646,21 @@ export async function decodeStep(state, currentIds, opts, helpers) {
       });

       releaseBuffer(logitsBuffer);
-
-
+      const invalidGpuToken = nextToken >= config.vocabSize
+        || (padTokenId != null && nextToken === padTokenId)
+        || (padTokenId == null && nextToken === 0);
+      if (!invalidGpuToken) {
+        if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
+          releaseBuffer(hiddenStates);
+        }
+        state.currentSeqLen++;
+        return nextToken;
       }
-      state.
-
+      state.disableFusedDecode = true;
+      log.warn(
+        'Decode',
+        `GPU sampling produced invalid token ${nextToken} (vocabSize=${config.vocabSize}, step=${state.decodeStepCount}); falling back to CPU sampling.`
+      );
     }
   }

@@ -981,7 +1001,7 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
     throw new Error('Embed buffer not found or not a GPUBuffer/WeightBuffer');
   }
   const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
-  const embedDtype =
+  const embedDtype = getWeightDtype(embedBufferRaw);
   const activationDtype = getEffectiveActivationDtype(state, opts);

   for (let i = 0; i < N; i++) {
@@ -1125,10 +1145,18 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
   const actualCount = resolveBatchStop(tokens, stopFlags, stopTokenIds, eosToken);
   const generatedTokens = tokens.slice(0, actualCount);
+  const invalidToken = findInvalidGeneratedToken(generatedTokens, config.vocabSize, padTokenId);

   if (isInfinite) {
     throw new FinitenessError(`F16 bounds exceeded during batch generation${metadata}`);
   }
+  if (invalidToken) {
+    state.disableFusedDecode = true;
+    throw new Error(
+      `[Pipeline] Batch decode produced invalid token ${invalidToken.tokenId} ` +
+      `at batch index ${invalidToken.index} (vocabSize=${config.vocabSize}, padTokenId=${padTokenId ?? 'none'}).`
+    );
+  }

   if (opts.profile && recorder.isProfilingEnabled()) {
     const timings = await recorder.resolveProfileTimings();