@simulatte/doppler 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +25 -6
- package/package.json +25 -38
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +2 -2
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +9 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +21 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +4 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +2 -2
- package/src/converter/conversion-plan.js +11 -3
- package/src/converter/core.js +19 -8
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +34 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +40 -1
- package/src/formats/rdrr/classification.js +32 -0
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +48 -4
- package/src/gpu/kernels/matmul.d.ts +5 -0
- package/src/gpu/kernels/matmul.js +71 -2
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
- package/src/inference/pipelines/text/attention/projections.js +54 -13
- package/src/inference/pipelines/text/attention/record.js +16 -6
- package/src/inference/pipelines/text/attention/run.js +59 -6
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +46 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +19 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
- package/src/inference/pipelines/text/generator-steps.js +71 -26
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +353 -166
- package/src/inference/pipelines/text/init.d.ts +15 -0
- package/src/inference/pipelines/text/init.js +35 -10
- package/src/inference/pipelines/text/layer.js +38 -8
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +11 -9
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +130 -4
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +4 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +28 -7
- package/src/tooling/node-source-runtime.js +65 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/types/model.d.ts +5 -0
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/tools/doppler-cli.js +6 -1
- package/src/tooling/node-convert.d.ts +0 -54
|
@@ -14,13 +14,14 @@ import {
|
|
|
14
14
|
recordCastF32ToF16,
|
|
15
15
|
} from '../../../gpu/kernels/cast.js';
|
|
16
16
|
import { createTensor } from '../../../gpu/tensor.js';
|
|
17
|
-
import { releaseBuffer } from '../../../memory/buffer-pool.js';
|
|
17
|
+
import { releaseBuffer, readBuffer, acquireBuffer, uploadData } from '../../../memory/buffer-pool.js';
|
|
18
18
|
import { kernelTrace, traceStep } from './kernel-trace.js';
|
|
19
19
|
import {
|
|
20
20
|
runLayerAttentionGPU,
|
|
21
21
|
recordLayerAttentionGPU,
|
|
22
22
|
} from './attention/index.js';
|
|
23
23
|
import { runLinearAttentionLayer } from './linear-attention.js';
|
|
24
|
+
import { runGatedShortConvGPU } from '../../../gpu/kernels/gated-short-conv.js';
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
export function isDecodeBuffer(decodeBuffers, buffer) {
|
|
@@ -174,17 +175,22 @@ export async function doConv(
|
|
|
174
175
|
throw new Error('doConv requires hiddenSize > 0.');
|
|
175
176
|
}
|
|
176
177
|
|
|
177
|
-
//
|
|
178
|
+
// LFM2 gated short convolution (GPU-native):
|
|
179
|
+
// in_proj → 3×hidden → GPU kernel: split(B,C,x) + B*x + causal conv1d + C*conv_out → out_proj
|
|
178
180
|
let inProj = null;
|
|
179
|
-
let
|
|
180
|
-
let convInput = null;
|
|
181
|
+
let convOut = null;
|
|
181
182
|
let outProj = null;
|
|
182
183
|
try {
|
|
184
|
+
const convState = options.convState;
|
|
185
|
+
const hasConvState = Boolean(convState?.convWeightGPU && convState?.convStateGPU);
|
|
186
|
+
const projN = hasConvState ? hiddenSize * 3 : hiddenSize * 2;
|
|
187
|
+
|
|
188
|
+
// Project input
|
|
183
189
|
inProj = await doMatmul(
|
|
184
190
|
inputTensor,
|
|
185
191
|
convInProj,
|
|
186
192
|
numTokens,
|
|
187
|
-
|
|
193
|
+
projN,
|
|
188
194
|
hiddenSize,
|
|
189
195
|
{
|
|
190
196
|
transposeB: 'auto',
|
|
@@ -195,50 +201,32 @@ export async function doConv(
|
|
|
195
201
|
},
|
|
196
202
|
recorder
|
|
197
203
|
);
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
204
|
+
|
|
205
|
+
if (hasConvState) {
|
|
206
|
+
// GPU gated short conv kernel: B*x → conv1d → C*conv_out (all on GPU)
|
|
207
|
+
convOut = await runGatedShortConvGPU(inProj, convState, {
|
|
208
|
+
numTokens,
|
|
209
|
+
layerIdx,
|
|
210
|
+
recorder,
|
|
211
|
+
});
|
|
212
|
+
} else {
|
|
213
|
+
// SwiGLU gated activation fallback: silu(first_half) * second_half
|
|
214
|
+
convOut = await doSiLURowSplit(inProj, {
|
|
215
|
+
numTokens,
|
|
216
|
+
dim: hiddenSize,
|
|
217
|
+
activation: 'silu',
|
|
218
|
+
swigluLimit: options.swigluLimit ?? null,
|
|
219
|
+
label: `${label}.activation`,
|
|
220
|
+
layerIdx,
|
|
221
|
+
}, recorder);
|
|
222
|
+
}
|
|
206
223
|
|
|
207
224
|
releaseOrTrack(recorder, inProj.buffer);
|
|
208
225
|
inProj = null;
|
|
209
226
|
|
|
210
|
-
|
|
211
|
-
if (convKernel && options.conv2d && options.conv2d.enabled === true) {
|
|
212
|
-
const convTensorInput = createTensor(activated.buffer, activated.dtype, [
|
|
213
|
-
options.conv2d.inChannels,
|
|
214
|
-
options.conv2d.height,
|
|
215
|
-
options.conv2d.width,
|
|
216
|
-
], `${label}.conv_input`);
|
|
217
|
-
const convOptions = {
|
|
218
|
-
inChannels: options.conv2d.inChannels,
|
|
219
|
-
outChannels: options.conv2d.outChannels,
|
|
220
|
-
height: options.conv2d.height,
|
|
221
|
-
width: options.conv2d.width,
|
|
222
|
-
kernelH: options.conv2d.kernelH,
|
|
223
|
-
kernelW: options.conv2d.kernelW,
|
|
224
|
-
stride: options.conv2d.stride ?? 1,
|
|
225
|
-
pad: options.conv2d.pad ?? 0,
|
|
226
|
-
};
|
|
227
|
-
const convResult = recorder
|
|
228
|
-
? await recordConv2D(recorder, convTensorInput, convKernel, null, convOptions)
|
|
229
|
-
: await runConv2D(convTensorInput, convKernel, null, convOptions);
|
|
230
|
-
convInput = createTensor(
|
|
231
|
-
convResult.buffer,
|
|
232
|
-
convResult.dtype,
|
|
233
|
-
[numTokens, hiddenSize],
|
|
234
|
-
`${label}.conv_output`
|
|
235
|
-
);
|
|
236
|
-
releaseOrTrack(recorder, activated.buffer);
|
|
237
|
-
activated = null;
|
|
238
|
-
}
|
|
239
|
-
|
|
227
|
+
// Output projection
|
|
240
228
|
outProj = await doMatmul(
|
|
241
|
-
|
|
229
|
+
convOut,
|
|
242
230
|
convOutProj,
|
|
243
231
|
numTokens,
|
|
244
232
|
hiddenSize,
|
|
@@ -253,13 +241,8 @@ export async function doConv(
|
|
|
253
241
|
recorder
|
|
254
242
|
);
|
|
255
243
|
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
convInput = null;
|
|
259
|
-
} else if (activated) {
|
|
260
|
-
releaseOrTrack(recorder, activated.buffer);
|
|
261
|
-
activated = null;
|
|
262
|
-
}
|
|
244
|
+
releaseOrTrack(recorder, convOut.buffer);
|
|
245
|
+
convOut = null;
|
|
263
246
|
|
|
264
247
|
if (kernelTrace.enabled && !recorder) {
|
|
265
248
|
await traceStep('conv', label, layerIdx, outProj.buffer, [numTokens, hiddenSize]);
|
|
@@ -267,13 +250,100 @@ export async function doConv(
|
|
|
267
250
|
return outProj;
|
|
268
251
|
} catch (error) {
|
|
269
252
|
if (outProj) releaseOrTrack(recorder, outProj.buffer);
|
|
270
|
-
if (
|
|
271
|
-
if (activated) releaseOrTrack(recorder, activated.buffer);
|
|
253
|
+
if (convOut) releaseOrTrack(recorder, convOut.buffer);
|
|
272
254
|
if (inProj) releaseOrTrack(recorder, inProj.buffer);
|
|
273
255
|
throw error;
|
|
274
256
|
}
|
|
275
257
|
}
|
|
276
258
|
|
|
259
|
+
export async function initConvLayerState(convState, convKernel, convInProj, hiddenSize, label, layerIdx) {
|
|
260
|
+
const { isWeightBuffer } = await import('../../../gpu/weight-buffer.js');
|
|
261
|
+
const isWB = typeof isWeightBuffer === 'function' && isWeightBuffer(convKernel);
|
|
262
|
+
const kernelBuf = isWB ? convKernel.buffer : (convKernel instanceof GPUBuffer ? convKernel : convKernel.buffer ?? convKernel);
|
|
263
|
+
const kernelDtype = isWB ? String(convKernel.dtype ?? '').toLowerCase() : null;
|
|
264
|
+
|
|
265
|
+
// Determine kernel size from weight shape
|
|
266
|
+
let kernelSize = 3;
|
|
267
|
+
if (isWB && Array.isArray(convKernel.shape)) {
|
|
268
|
+
kernelSize = Number(convKernel.shape[convKernel.shape.length - 1]) || 3;
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
// Dequantize conv kernel weights to F32
|
|
272
|
+
const totalElements = hiddenSize * kernelSize;
|
|
273
|
+
const { QK_K, Q4K_BLOCK_BYTES } = await import('../../../config/schema/index.js');
|
|
274
|
+
const { dequantizeQ4KM } = await import('../../../converter/quantizer.js');
|
|
275
|
+
const { getDevice } = await import('../../../gpu/device.js');
|
|
276
|
+
const device = getDevice();
|
|
277
|
+
|
|
278
|
+
const isQ4K = kernelDtype === 'q4k' || kernelDtype === 'q4_k_m' || kernelDtype === 'q4_k';
|
|
279
|
+
let weightF32;
|
|
280
|
+
|
|
281
|
+
if (isQ4K) {
|
|
282
|
+
const numBlocks = Math.ceil(totalElements / QK_K);
|
|
283
|
+
const q4kBytes = numBlocks * Q4K_BLOCK_BYTES;
|
|
284
|
+
// GPU readBuffer returns zeros for some Q4K weight buffers, so prefer
|
|
285
|
+
// CPU-side rawBytes from the WeightBuffer when available.
|
|
286
|
+
const hasRawBytes = isWB && convKernel.rawBytes;
|
|
287
|
+
if (hasRawBytes) {
|
|
288
|
+
weightF32 = dequantizeQ4KM(new Uint8Array(convKernel.rawBytes), numBlocks, [totalElements]);
|
|
289
|
+
} else {
|
|
290
|
+
if (device) await device.queue.onSubmittedWorkDone();
|
|
291
|
+
const raw = await readBuffer(kernelBuf, q4kBytes);
|
|
292
|
+
weightF32 = dequantizeQ4KM(new Uint8Array(raw), numBlocks, [totalElements]);
|
|
293
|
+
}
|
|
294
|
+
} else if (kernelDtype === 'f16' || kernelDtype === 'bf16') {
|
|
295
|
+
if (device) await device.queue.onSubmittedWorkDone();
|
|
296
|
+
const raw = await readBuffer(kernelBuf, totalElements * 2);
|
|
297
|
+
const { decodeReadback } = await import('./debug-utils/index.js');
|
|
298
|
+
weightF32 = decodeReadback(raw, 'f16');
|
|
299
|
+
} else {
|
|
300
|
+
if (device) await device.queue.onSubmittedWorkDone();
|
|
301
|
+
const raw = await readBuffer(kernelBuf, totalElements * 4);
|
|
302
|
+
weightF32 = new Float32Array(raw);
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
// Validate dequantized weights are non-degenerate
|
|
306
|
+
let maxAbs = 0;
|
|
307
|
+
for (let i = 0; i < weightF32.length; i++) {
|
|
308
|
+
const abs = Math.abs(weightF32[i]);
|
|
309
|
+
if (abs > maxAbs) maxAbs = abs;
|
|
310
|
+
}
|
|
311
|
+
if (maxAbs === 0) {
|
|
312
|
+
const { log } = await import('../../../debug/index.js');
|
|
313
|
+
log.error('Pipeline', `${label} conv kernel weights are all zeros after dequantization (dtype=${kernelDtype}, elements=${totalElements}). Conv layers will produce degenerate output.`);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// Upload dequantized weights to GPU
|
|
317
|
+
const weightGPU = acquireBuffer(weightF32.byteLength, undefined, `${label}.conv_weight_f32`);
|
|
318
|
+
uploadData(weightGPU, weightF32);
|
|
319
|
+
|
|
320
|
+
// Create zeroed conv state buffer
|
|
321
|
+
const stateSize = hiddenSize * (kernelSize - 1) * Float32Array.BYTES_PER_ELEMENT;
|
|
322
|
+
const stateGPU = acquireBuffer(stateSize, undefined, `${label}.conv_state`);
|
|
323
|
+
uploadData(stateGPU, new Float32Array(hiddenSize * (kernelSize - 1)));
|
|
324
|
+
|
|
325
|
+
convState.convWeightGPU = weightGPU;
|
|
326
|
+
convState.convStateGPU = stateGPU;
|
|
327
|
+
convState.hiddenSize = hiddenSize;
|
|
328
|
+
convState.kernelSize = kernelSize;
|
|
329
|
+
|
|
330
|
+
// Pre-dequantize in_proj weight to F32 via CPU dequantization of the raw Q4K buffer.
|
|
331
|
+
// GPU readBuffer returns zeros for some Q4K weight buffers, so we dequantize from the
|
|
332
|
+
// WeightBuffer's raw bytes instead.
|
|
333
|
+
if (isWB && isWeightBuffer(convInProj)) {
|
|
334
|
+
const inProjDtype = String(convInProj.dtype ?? '').toLowerCase();
|
|
335
|
+
const isInProjQ4K = inProjDtype === 'q4k' || inProjDtype === 'q4_k_m' || inProjDtype === 'q4_k';
|
|
336
|
+
if (isInProjQ4K && convInProj.rawBytes) {
|
|
337
|
+
const inProjElements = hiddenSize * 3 * hiddenSize;
|
|
338
|
+
const inProjBlocks = Math.ceil(inProjElements / QK_K);
|
|
339
|
+
const inProjF32 = dequantizeQ4KM(new Uint8Array(convInProj.rawBytes), inProjBlocks, [inProjElements]);
|
|
340
|
+
const inProjGPU = acquireBuffer(inProjF32.byteLength, undefined, `${label}.in_proj_f32`);
|
|
341
|
+
uploadData(inProjGPU, inProjF32);
|
|
342
|
+
convState.inProjF32GPU = inProjGPU;
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
|
|
277
347
|
export async function doCast(input, toDtype, recorder) {
|
|
278
348
|
if (toDtype !== 'f16' && toDtype !== 'f32') {
|
|
279
349
|
throw new Error(`Unsupported cast target dtype "${toDtype}"`);
|
|
@@ -58,6 +58,30 @@ export function softmax(logits) {
|
|
|
58
58
|
return exps;
|
|
59
59
|
}
|
|
60
60
|
|
|
61
|
+
function countFiniteCandidates(logits, padTokenId) {
|
|
62
|
+
let finiteCandidateCount = 0;
|
|
63
|
+
for (let i = 0; i < logits.length; i++) {
|
|
64
|
+
if (padTokenId != null && i === padTokenId) {
|
|
65
|
+
continue;
|
|
66
|
+
}
|
|
67
|
+
if (Number.isFinite(logits[i])) {
|
|
68
|
+
finiteCandidateCount += 1;
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
return finiteCandidateCount;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
function assertFiniteSamplingCandidates(logits, padTokenId, label) {
|
|
75
|
+
const finiteCandidateCount = countFiniteCandidates(logits, padTokenId);
|
|
76
|
+
if (finiteCandidateCount > 0) {
|
|
77
|
+
return;
|
|
78
|
+
}
|
|
79
|
+
throw new Error(
|
|
80
|
+
`[Sampling] ${label} has no finite candidate logits after masking the pad token. ` +
|
|
81
|
+
'Upstream decode likely produced NaN/Inf or an all-masked distribution.'
|
|
82
|
+
);
|
|
83
|
+
}
|
|
84
|
+
|
|
61
85
|
|
|
62
86
|
export function sample(logits, opts) {
|
|
63
87
|
const { temperature, topP, topK, decode, debug = false, padTokenId, seed } = opts;
|
|
@@ -66,16 +90,28 @@ export function sample(logits, opts) {
|
|
|
66
90
|
logits[padTokenId] = -Infinity;
|
|
67
91
|
}
|
|
68
92
|
|
|
93
|
+
assertFiniteSamplingCandidates(logits, padTokenId, 'Logits');
|
|
94
|
+
|
|
69
95
|
// Greedy (argmax) when temperature = 0
|
|
70
96
|
if (temperature === 0) {
|
|
71
|
-
let maxIdx =
|
|
72
|
-
let maxVal =
|
|
73
|
-
for (let i =
|
|
74
|
-
|
|
75
|
-
|
|
97
|
+
let maxIdx = -1;
|
|
98
|
+
let maxVal = -Infinity;
|
|
99
|
+
for (let i = 0; i < logits.length; i++) {
|
|
100
|
+
const value = logits[i];
|
|
101
|
+
if (!Number.isFinite(value)) {
|
|
102
|
+
continue;
|
|
103
|
+
}
|
|
104
|
+
if (value > maxVal) {
|
|
105
|
+
maxVal = value;
|
|
76
106
|
maxIdx = i;
|
|
77
107
|
}
|
|
78
108
|
}
|
|
109
|
+
if (maxIdx < 0) {
|
|
110
|
+
throw new Error(
|
|
111
|
+
'[Sampling] Greedy sampling could not find a finite candidate logit. ' +
|
|
112
|
+
'Upstream decode likely produced NaN/Inf.'
|
|
113
|
+
);
|
|
114
|
+
}
|
|
79
115
|
if (debug) {
|
|
80
116
|
const text = decode?.([maxIdx]) ?? '?';
|
|
81
117
|
trace.sample(`Greedy: id=${maxIdx} "${text}" logit=${maxVal.toFixed(4)}`);
|
|
@@ -96,7 +132,17 @@ export function sample(logits, opts) {
|
|
|
96
132
|
|
|
97
133
|
let candidates = [];
|
|
98
134
|
for (let i = 0; i < probs.length; i++) {
|
|
99
|
-
|
|
135
|
+
const probability = probs[i];
|
|
136
|
+
if (!Number.isFinite(probability) || probability <= 0) {
|
|
137
|
+
continue;
|
|
138
|
+
}
|
|
139
|
+
candidates.push({ token: i, prob: probability });
|
|
140
|
+
}
|
|
141
|
+
if (candidates.length === 0) {
|
|
142
|
+
throw new Error(
|
|
143
|
+
'[Sampling] Softmax produced no finite candidate probabilities. ' +
|
|
144
|
+
'Upstream decode likely produced NaN/Inf logits.'
|
|
145
|
+
);
|
|
100
146
|
}
|
|
101
147
|
candidates.sort((a, b) => b.prob - a.prob);
|
|
102
148
|
|
|
@@ -69,6 +69,11 @@ export declare class InferencePipeline extends PipelineState {
|
|
|
69
69
|
// ==========================================================================
|
|
70
70
|
|
|
71
71
|
generate(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<string, void, void>;
|
|
72
|
+
generateTokens(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<number, void, void>;
|
|
73
|
+
generateTokenIds(
|
|
74
|
+
prompt: PromptInput,
|
|
75
|
+
options?: GenerateOptions
|
|
76
|
+
): Promise<{ tokenIds: number[]; stats: PipelineStats }>;
|
|
72
77
|
|
|
73
78
|
decodeStepLogits(currentIds: number[], options?: GenerateOptions): Promise<LogitsStepResult>;
|
|
74
79
|
|
|
@@ -43,6 +43,7 @@ import {
|
|
|
43
43
|
import { getDopplerLoader } from '../../loader/doppler-loader.js';
|
|
44
44
|
import { registerPipeline, getPipelineFactory } from './registry.js';
|
|
45
45
|
import { selectRuleValue } from '../../rules/rule-registry.js';
|
|
46
|
+
import { initConvLayerState } from './text/ops.js';
|
|
46
47
|
|
|
47
48
|
function destroyMoERouter(router) {
|
|
48
49
|
if (router && typeof router.destroy === 'function') {
|
|
@@ -221,6 +222,9 @@ export class InferencePipeline extends PipelineState {
|
|
|
221
222
|
// Initialize RoPE frequencies
|
|
222
223
|
await this._initRoPE();
|
|
223
224
|
|
|
225
|
+
// Initialize conv layer states for gated short conv layers (LFM2)
|
|
226
|
+
await this._initConvLayerStates();
|
|
227
|
+
|
|
224
228
|
this.isLoaded = true;
|
|
225
229
|
log.info('Pipeline', 'Model loaded successfully');
|
|
226
230
|
}
|
|
@@ -237,6 +241,7 @@ export class InferencePipeline extends PipelineState {
|
|
|
237
241
|
resolvedKernelPath: this.resolvedKernelPath,
|
|
238
242
|
kernelPathSource: this.kernelPathSource,
|
|
239
243
|
keepF32Weights: this.runtimeConfig.inference.compute.keepF32Weights === true,
|
|
244
|
+
loaderDebug: this.runtimeConfig?.shared?.debug?.loader ?? null,
|
|
240
245
|
onProgress: (info) => {
|
|
241
246
|
if (info.stage !== 'layers' && info.stage !== 'shards') {
|
|
242
247
|
log.verbose('Loader', `${info.stage}: ${Math.round(info.progress * 100)}%${info.message ? ` - ${info.message}` : ''}`);
|
|
@@ -310,7 +315,7 @@ export class InferencePipeline extends PipelineState {
|
|
|
310
315
|
maxSeqLen,
|
|
311
316
|
ropeTheta: config.ropeTheta,
|
|
312
317
|
ropeLocalTheta: config.ropeLocalTheta,
|
|
313
|
-
mropeInterleaved: config.
|
|
318
|
+
mropeInterleaved: config.mropeInterleaved,
|
|
314
319
|
mropeSection: config.mropeSection,
|
|
315
320
|
partialRotaryFactor: config.partialRotaryFactor,
|
|
316
321
|
ropeScale: config.ropeScale,
|
|
@@ -327,6 +332,51 @@ export class InferencePipeline extends PipelineState {
|
|
|
327
332
|
}
|
|
328
333
|
|
|
329
334
|
|
|
335
|
+
async _initConvLayerStates() {
|
|
336
|
+
const config = this.modelConfig;
|
|
337
|
+
if (!config?.layerTypes) return;
|
|
338
|
+
const { getDevice } = await import('../../gpu/device.js');
|
|
339
|
+
const device = getDevice();
|
|
340
|
+
if (!device) return;
|
|
341
|
+
|
|
342
|
+
const hiddenSize = config.hiddenSize;
|
|
343
|
+
const convStates = new Map();
|
|
344
|
+
|
|
345
|
+
for (let i = 0; i < config.layerTypes.length; i++) {
|
|
346
|
+
const lt = String(config.layerTypes[i] ?? '').toLowerCase();
|
|
347
|
+
if (lt !== 'conv' && lt !== 'convolution') continue;
|
|
348
|
+
|
|
349
|
+
const layerWeights = this.weights.get(`layer_${i}`);
|
|
350
|
+
if (!layerWeights) continue;
|
|
351
|
+
const convKernel = layerWeights?.convKernel;
|
|
352
|
+
if (!convKernel) continue;
|
|
353
|
+
|
|
354
|
+
const convState = {};
|
|
355
|
+
try {
|
|
356
|
+
await initConvLayerState(
|
|
357
|
+
convState,
|
|
358
|
+
convKernel,
|
|
359
|
+
layerWeights.convInProj ?? null,
|
|
360
|
+
hiddenSize,
|
|
361
|
+
`L${i}.conv`,
|
|
362
|
+
i
|
|
363
|
+
);
|
|
364
|
+
if (!convState.convWeightGPU || !convState.convStateGPU) {
|
|
365
|
+
continue;
|
|
366
|
+
}
|
|
367
|
+
convStates.set(i, convState);
|
|
368
|
+
} catch (e) {
|
|
369
|
+
log.warn('Pipeline', `Conv layer ${i} state init failed: ${e.message}`);
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
if (convStates.size > 0) {
|
|
374
|
+
this.convLayerStates = convStates;
|
|
375
|
+
log.info('Pipeline', `Initialized ${convStates.size} conv layer states (kernelSize=${convStates.values().next().value?.kernelSize})`);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
|
|
330
380
|
_resolveLayerPipeline() {
|
|
331
381
|
if (!this.modelConfig) return;
|
|
332
382
|
const runtimePlan = this.runtimeConfig.inference.pipeline ?? null;
|
|
@@ -349,6 +399,14 @@ export class InferencePipeline extends PipelineState {
|
|
|
349
399
|
return this.generator.generate(prompt, options);
|
|
350
400
|
}
|
|
351
401
|
|
|
402
|
+
generateTokens(prompt, options = {}) {
|
|
403
|
+
return this.generator.generateTokens(prompt, options);
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
generateTokenIds(prompt, options = {}) {
|
|
407
|
+
return this.generator.generateTokenIds(prompt, options);
|
|
408
|
+
}
|
|
409
|
+
|
|
352
410
|
decodeStepLogits(currentIds, options = {}) {
|
|
353
411
|
return this.generator.decodeStepLogits(currentIds, options);
|
|
354
412
|
}
|