@simulatte/doppler 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +25 -6
- package/package.json +25 -38
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +2 -2
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +9 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +21 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +4 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +2 -2
- package/src/converter/conversion-plan.js +11 -3
- package/src/converter/core.js +19 -8
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +34 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +40 -1
- package/src/formats/rdrr/classification.js +32 -0
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +48 -4
- package/src/gpu/kernels/matmul.d.ts +5 -0
- package/src/gpu/kernels/matmul.js +71 -2
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
- package/src/inference/pipelines/text/attention/projections.js +54 -13
- package/src/inference/pipelines/text/attention/record.js +16 -6
- package/src/inference/pipelines/text/attention/run.js +59 -6
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +46 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +19 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
- package/src/inference/pipelines/text/generator-steps.js +71 -26
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +353 -166
- package/src/inference/pipelines/text/init.d.ts +15 -0
- package/src/inference/pipelines/text/init.js +35 -10
- package/src/inference/pipelines/text/layer.js +38 -8
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +11 -9
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +130 -4
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +4 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +28 -7
- package/src/tooling/node-source-runtime.js +65 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/types/model.d.ts +5 -0
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/tools/doppler-cli.js +6 -1
- package/src/tooling/node-convert.d.ts +0 -54
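The weight of this release falls on the text-generation pipeline and a new vision pipeline: `generator.js` is reworked (+353/-166) around shared prefill/decode helpers and a centralized finiteness-retry guard; five new `src/inference/pipelines/vision/` modules add image preprocessing, patch embedding, and an encoder; a `gated-short-conv` GPU kernel and a `split_qg` kernel (f32 and f16 WGSL variants) land; and model presets are added for gemma4, qwen3_5, qwen3_vl, and granite-docling. The hunks below are from `package/src/inference/pipelines/text/generator.js`, the largest change in this release; `…` marks lines that were truncated in the published diff.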
```diff
@@ -1,7 +1,7 @@
 …
 import { getDevice, setTrackSubmits } from '../../../gpu/device.js';
-import { releaseBuffer, readBuffer, readBufferSlice } from '../../../memory/buffer-pool.js';
+import { releaseBuffer, readBuffer, readBufferSlice, uploadData } from '../../../memory/buffer-pool.js';
 import { isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
 import { markWarmed as markKernelCacheWarmed } from '../../../gpu/kernel-selection-cache.js';
 import { resetSubmitStats, logSubmitStats } from '../../../gpu/submit-tracker.js';
@@ -122,6 +122,20 @@ function resolveTokenText(tokenizer, tokenIds, fallbackText = '?', renderTokenTe…
   return fallbackText;
 }
 
+export function shouldRetryWithFinitenessFallback(error) {
+  if (error?.name === 'FinitenessError') {
+    return true;
+  }
+  const message = typeof error?.message === 'string'
+    ? error.message
+    : (typeof error === 'string' ? error : '');
+  if (!message.startsWith('[Sampling]')) {
+    return false;
+  }
+  return message.includes('no finite candidate logits after masking the pad token')
+    || message.includes('Softmax produced no finite candidate probabilities');
+}
+
 export class PipelineGenerator {
 
   #state;
```
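The new exported guard replaces the scattered `error?.name === 'FinitenessError'` checks seen later in this diff: it retries on that error name or on exactly two `[Sampling]`-prefixed finiteness messages, and on nothing else. A quick classification sketch (the error values are hypothetical and the import path is illustrative):

```js
import { shouldRetryWithFinitenessFallback } from './generator.js';

// Matched by error name, whatever the message says.
const gpuNaN = Object.assign(new Error('NaN in layer output'), { name: 'FinitenessError' });
shouldRetryWithFinitenessFallback(gpuNaN); // true

// Matched by a recognized '[Sampling]' finiteness message.
shouldRetryWithFinitenessFallback(
  new Error('[Sampling] no finite candidate logits after masking the pad token')
); // true

// '[Sampling]' errors that are not finiteness failures are not retried.
shouldRetryWithFinitenessFallback(new Error('[Sampling] invalid temperature')); // false

// Plain strings are inspected too, per the typeof fallback in the guard.
shouldRetryWithFinitenessFallback('[Sampling] Softmax produced no finite candidate probabilities'); // true
```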
```diff
@@ -196,6 +210,14 @@ export class PipelineGenerator {
     return resolveStepOptions(this.#state, options);
   }
 
+  _resetDecodeRuntimeState() {
+    this.#state.decodeStepCount = 0;
+    this.#state.disableRecordedLogits = false;
+    this.#state.disableFusedDecode = false;
+    resetActiveExecutionPlan(this.#state);
+    this.#state.decodeRing?.reset();
+  }
+
   _getDecodeHelpers(debugCheckBuffer) {
     return {
       buildLayerContext: (recorder, isDecodeMode, debugLayers, executionPlan) =>
@@ -221,6 +243,209 @@ export class PipelineGenerator {
     );
   }
 
+  _resolvePromptTokenIds(prompt, useChatTemplate, contextLabel) {
+    const processedPrompt = resolvePromptInput(this.#state, prompt, useChatTemplate, contextLabel);
+    const inputIds = this.#state.tokenizer.encode(processedPrompt);
+    this._assertTokenIdsInRange(inputIds, `${contextLabel}.encode`);
+    return inputIds;
+  }
+
+  _sampleNextTokenFromLogits(logits, generatedIds, opts) {
+    const sampledLogits = Float32Array.from(logits);
+    applyRepetitionPenalty(sampledLogits, generatedIds, opts.repetitionPenalty);
+    const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
+    return sample(sampledLogits, {
+      temperature: opts.temperature,
+      topP: opts.topP,
+      topK: opts.topK,
+      padTokenId,
+      seed: opts.seed,
+    });
+  }
+
+  async _prefillPromptToLogits(prompt, opts, contextLabel) {
+    const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, contextLabel);
+    if (opts.debug) {
+      log.debug('Pipeline', `${contextLabel}: ${inputIds.length} tokens`);
+    }
+
+    let logits;
+    try {
+      logits = await this._prefill(inputIds, opts);
+    } catch (error) {
+      if (!shouldRetryWithFinitenessFallback(error)) {
+        throw error;
+      }
+      log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during ${contextLabel}. Retrying with F32 precision.`);
+      logits = await this._retryWithFinitenessFallback(
+        opts,
+        contextLabel,
+        () => this._prefill(inputIds, opts)
+      );
+    }
+
+    return { inputIds, logits };
+  }
+
+  async _decodeStepToLogits(currentIds, opts) {
+    const debugCheckBuffer = this.#state.debug
+      ? (buffer, label, numTokens, expectedDim) =>
+          debugCheckBufferHelper(this.#state, buffer, label, numTokens, expectedDim)
+      : undefined;
+    return decodeStepLogits(this.#state, currentIds, opts, this._getDecodeHelpers(debugCheckBuffer));
+  }
+
+  async _decodeNextTokenViaLogits(currentIds, opts) {
+    const stepResult = await this._decodeStepToLogits(currentIds, opts);
+    return this._sampleNextTokenFromLogits(stepResult.logits, currentIds, opts);
+  }
+
+  async *_generateTokensInternal(prompt, options = {}, mode = 'text') {
+    if (!this.#state.isLoaded) throw new Error('Model not loaded');
+    if (this.#state.isGenerating) throw new Error('Generation already in progress');
+
+    validateCallTimeOptions(options);
+
+    this.#state.isGenerating = true;
+    this._resetDecodeRuntimeState();
+    this.#state.stats.gpuTimePrefillMs = undefined;
+    this.#state.stats.gpuTimeDecodeMs = undefined;
+    this.#state.stats.decodeRecordMs = 0;
+    this.#state.stats.decodeSubmitWaitMs = 0;
+    this.#state.stats.decodeReadbackWaitMs = 0;
+    this.#state.stats.ttftMs = 0;
+    const startTime = performance.now();
+
+    const opts = resolveGenerateOptions(this.#state, options);
+
+    if (opts.debug) {
+      log.debug('Pipeline', `ChatTemplate: options=${options.useChatTemplate}, final=${opts.useChatTemplate}`);
+    }
+
+    const emitToken = async function* (generator, tokenId, textDecoder) {
+      if (mode === 'token') {
+        yield tokenId;
+        if (options.onToken) options.onToken(tokenId, '');
+        return;
+      }
+      const tokenText = textDecoder(tokenId);
+      yield tokenText;
+      if (options.onToken) options.onToken(tokenId, tokenText);
+    };
+
+    try {
+      const prefillStart = performance.now();
+      const { inputIds, logits: initialPrefillLogits } = await this._prefillPromptToLogits(prompt, opts, 'generate');
+      let prefillLogits = initialPrefillLogits;
+      this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
+      this._assertTokenIdsInRange(inputIds, 'generate.prefillTokens');
+      const generatedIds = [...inputIds];
+      this.#state.stats.prefillTokens = inputIds.length;
+
+      if (opts.debug) {
+        log.debug('Pipeline', `Input: ${inputIds.length} tokens`);
+      }
+
+      const intentBundleConfig = this.#state.runtimeConfig.shared.intentBundle;
+      const intentBundle = intentBundleConfig?.bundle;
+      const expectedTopK = intentBundle?.payload?.expectedTopK
+        ?? intentBundle?.payload?.expected_top_k;
+      const maxDriftThreshold = intentBundle?.constraints?.maxDriftThreshold
+        ?? intentBundle?.constraints?.max_drift_threshold;
+
+      if (intentBundleConfig?.enabled && Array.isArray(expectedTopK) && expectedTopK.length > 0) {
+        const actualTopK = getTopK(
+          prefillLogits,
+          expectedTopK.length,
+          (tokens) => resolveTokenText(this.#state.tokenizer, tokens),
+        ).map((token) => token.token);
+        const driftResult = enforceLogitDrift(expectedTopK, actualTopK, maxDriftThreshold);
+        if (!driftResult.ok) {
+          throw new Error(`Intent bundle drift check failed: ${driftResult.reason}`);
+        }
+      }
+
+      if (opts.debug) {
+        const topAfterPenalty = getTopK(
+          Float32Array.from(prefillLogits),
+          5,
+          (tokens) => resolveTokenText(this.#state.tokenizer, tokens)
+        );
+        log.debug('Pipeline', `After rep penalty top-5: ${topAfterPenalty.map(t => `"${t.text}"(${(t.prob * 100).toFixed(1)}%)`).join(', ')}`);
+      }
+
+      let firstToken;
+      try {
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
+      } catch (error) {
+        if (!shouldRetryWithFinitenessFallback(error)) {
+          throw error;
+        }
+        log.warn('Pipeline', 'FinitenessGuard caught non-finite prefill logits at sampling. Retrying with F32 precision.');
+        prefillLogits = await this._retryWithFinitenessFallback(
+          opts,
+          'prefill-sample',
+          () => this._prefill(inputIds, opts)
+        );
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
+      }
+
+      if (opts.debug) {
+        const firstTokenText = resolveTokenText(this.#state.tokenizer, [firstToken], `[${firstToken}]`, (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false));
+        log.debug('Pipeline', `First token sampled: id=${firstToken} text="${firstTokenText}"`);
+      }
+
+      generatedIds.push(firstToken);
+      this.#state.stats.ttftMs = performance.now() - startTime;
+
+      const decodeToken = (tokenId) => resolveTokenText(
+        this.#state.tokenizer,
+        [tokenId],
+        `[${tokenId}]`,
+        (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false),
+        (tokens) => this.#state.tokenizer?.decode?.(tokens, false, false)
+      );
+
+      yield* emitToken(this, firstToken, decodeToken);
+
+      yield* this._runDecodeLoop(generatedIds, opts, options, {
+        stopTokenIds: this.#state.modelConfig.stopTokenIds,
+        eosToken: this.#state.tokenizer.getSpecialTokens?.()?.eos,
+        stopSequenceStart: inputIds.length,
+        decodeToken,
+        logBatchPath: opts.debug,
+        emitMode: mode,
+      });
+      const tokensGenerated = this.#state.stats.decodeTokens;
+      this.#state.stats.totalTimeMs = performance.now() - startTime;
+
+      if (opts.debug) {
+        log.debug('Pipeline', `Generated ${tokensGenerated} tokens in ${this.#state.stats.totalTimeMs.toFixed(0)}ms`);
+      }
+
+      const ttft = this.#state.stats.ttftMs ?? this.#state.stats.prefillTimeMs;
+      const decodeTokens = Math.max(0, tokensGenerated - 1);
+      const decodeSpeed = decodeTokens > 0 ? (decodeTokens / this.#state.stats.decodeTimeMs * 1000) : 0;
+      if (opts.benchmark) {
+        log.info('Benchmark', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
+      } else {
+        log.info('Perf', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
+      }
+      trace.perf('Decode summary', {
+        ttftMs: ttft,
+        prefillMs: this.#state.stats.prefillTimeMs,
+        decodeMs: this.#state.stats.decodeTimeMs,
+        decodeTokens,
+        decodeSpeed,
+        totalMs: this.#state.stats.totalTimeMs,
+      });
+    } finally {
+      this._closeFinitenessFallbackWindow(opts);
+      resetActiveExecutionPlan(this.#state);
+      this.#state.isGenerating = false;
+    }
+  }
+
   _beginFinitenessFallback(opts, reasonLabel) {
     const originalPlan = resolveActiveExecutionPlan(this.#state);
     const original = {
```
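The perf summary at the end of `_generateTokensInternal` attributes the first sampled token to prefill/TTFT rather than to decode, which is why `decodeTokens` subtracts one before computing throughput. A worked example with illustrative numbers:

```js
// Mirrors the formula in the hunk above; the numbers are made up.
const tokensGenerated = 50;   // this.#state.stats.decodeTokens after the loop
const decodeTimeMs = 2450;    // this.#state.stats.decodeTimeMs

const decodeTokens = Math.max(0, tokensGenerated - 1); // 49: first token is counted under TTFT
const decodeSpeed = decodeTokens > 0
  ? (decodeTokens / decodeTimeMs * 1000)                // 49 / 2450 * 1000 = 20 tok/s
  : 0;
```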
```diff
@@ -306,17 +531,21 @@ export class PipelineGenerator {
 
 
   async *generate(prompt, options = {}) {
+    yield* this._generateTokensInternal(prompt, options, 'text');
+  }
+
+  async *generateTokens(prompt, options = {}) {
+    yield* this._generateTokensInternal(prompt, options, 'token');
+  }
+
+  async generateTokenIds(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
     if (this.#state.isGenerating) throw new Error('Generation already in progress');
 
     validateCallTimeOptions(options);
 
     this.#state.isGenerating = true;
-    this.#state.decodeStepCount = 0;
-    this.#state.disableRecordedLogits = false;
-    this.#state.disableFusedDecode = false;
-    resetActiveExecutionPlan(this.#state);
-    this.#state.decodeRing?.reset();
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     this.#state.stats.gpuTimeDecodeMs = undefined;
     this.#state.stats.decodeRecordMs = 0;
```
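`generate` and `generateTokens` are now thin wrappers over the same internal generator and differ only in emit mode: `'text'` yields decoded strings, `'token'` yields raw ids. A minimal consumption sketch, assuming `pipeline` is an already-loaded `PipelineGenerator`:

```js
// 'text' mode: yields strings; onToken receives (tokenId, tokenText).
for await (const text of pipeline.generate('Hello', { maxTokens: 32 })) {
  process.stdout.write(text);
}

// 'token' mode: yields token ids; onToken receives (tokenId, '').
for await (const tokenId of pipeline.generateTokens('Hello', { maxTokens: 32 })) {
  console.log(tokenId);
}
```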
```diff
@@ -324,135 +553,79 @@ export class PipelineGenerator {
     this.#state.stats.decodeReadbackWaitMs = 0;
     this.#state.stats.ttftMs = 0;
     const startTime = performance.now();
-
     const opts = resolveGenerateOptions(this.#state, options);
 
-    if (opts.debug) {
-      log.debug('Pipeline', `ChatTemplate: options=${options.useChatTemplate}, final=${opts.useChatTemplate}`);
-    }
-
     try {
-      const …
-      …
-      const inputIds = this.#state.tokenizer.encode(processedPrompt);
-      this._assertTokenIdsInRange(inputIds, 'generate.encode');
+      const prefillStart = performance.now();
+      const { inputIds, logits: initialPrefillLogits } = await this._prefillPromptToLogits(prompt, opts, 'generateTokenIds');
+      let prefillLogits = initialPrefillLogits;
+      this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
+      this._assertTokenIdsInRange(inputIds, 'generateTokenIds.prefillTokens');
       const generatedIds = [...inputIds];
       this.#state.stats.prefillTokens = inputIds.length;
 
-      if (opts.debug) {
-        log.debug('Pipeline', `Input: ${inputIds.length} tokens`);
-      }
-
-      const prefillStart = performance.now();
-      let prefillLogits;
+      let firstToken;
       try {
-        prefillLogits = await this._prefill(inputIds, opts);
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
       } catch (error) {
-        if (error?.name === 'FinitenessError') {
-          log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefill. Retrying with F32 precision.`);
-          prefillLogits = await this._retryWithFinitenessFallback(
-            opts,
-            'prefill',
-            () => this._prefill(inputIds, opts)
-          );
-        } else {
+        if (!shouldRetryWithFinitenessFallback(error)) {
           throw error;
         }
-      …
-      const intentBundle = intentBundleConfig?.bundle;
-      const expectedTopK = intentBundle?.payload?.expectedTopK
-        ?? intentBundle?.payload?.expected_top_k;
-      const maxDriftThreshold = intentBundle?.constraints?.maxDriftThreshold
-        ?? intentBundle?.constraints?.max_drift_threshold;
-
-      if (intentBundleConfig?.enabled && Array.isArray(expectedTopK) && expectedTopK.length > 0) {
-        const actualTopK = getTopK(
-          prefillLogits,
-          expectedTopK.length,
-          (tokens) => resolveTokenText(this.#state.tokenizer, tokens),
-        ).map((token) => token.token);
-        const driftResult = enforceLogitDrift(expectedTopK, actualTopK, maxDriftThreshold);
-        if (!driftResult.ok) {
-          throw new Error(`Intent bundle drift check failed: ${driftResult.reason}`);
-        }
-      }
-
-      applyRepetitionPenalty(prefillLogits, generatedIds, opts.repetitionPenalty);
-      const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
-
-      if (opts.debug) {
-        const topAfterPenalty = getTopK(
-          prefillLogits,
-          5,
-          (tokens) => resolveTokenText(this.#state.tokenizer, tokens)
+        prefillLogits = await this._retryWithFinitenessFallback(
+          opts,
+          'prefill-sample',
+          () => this._prefill(inputIds, opts)
         );
-
-      }
-
-      const firstToken = sample(prefillLogits, {
-        temperature: opts.temperature,
-        topP: opts.topP,
-        topK: opts.topK,
-        padTokenId,
-        seed: opts.seed,
-      });
-
-      if (opts.debug) {
-        const firstTokenText = resolveTokenText(this.#state.tokenizer, [firstToken], `[${firstToken}]`, (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false));
-        log.debug('Pipeline', `First token sampled: id=${firstToken} text="${firstTokenText}"`);
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
       }
 
       generatedIds.push(firstToken);
+      const tokenIds = [firstToken];
       this.#state.stats.ttftMs = performance.now() - startTime;
 
-      const decodeToken = (tokenId) => resolveTokenText(
-        this.#state.tokenizer,
-        [tokenId],
-        `[${tokenId}]`,
-        (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false),
-        (tokens) => this.#state.tokenizer?.decode?.(tokens, false, false)
-      );
+      const stopTokenIds = this.#state.modelConfig.stopTokenIds;
+      const eosToken = this.#state.tokenizer.getSpecialTokens?.()?.eos;
+      const stopSequenceStart = inputIds.length;
+      markKernelCacheWarmed();
+      const decodeStart = performance.now();
 
-      …
+      while (tokenIds.length < opts.maxTokens) {
+        if (options.signal?.aborted) break;
+        let nextToken;
+        try {
+          nextToken = await this._decodeNextTokenViaLogits(generatedIds, opts);
+        } catch (error) {
+          if (shouldRetryWithFinitenessFallback(error)) {
+            nextToken = await this._retryDecodeStepWithFinitenessWindow(
+              generatedIds,
+              opts,
+              `decode-step-${tokenIds.length}`
+            );
+          } else {
+            throw error;
+          }
+        }
+        generatedIds.push(nextToken);
+        tokenIds.push(nextToken);
+        this._consumeFinitenessFallbackToken(opts);
+        if (isStopToken(nextToken, stopTokenIds, eosToken)) {
+          break;
+        }
+        if (opts.stopSequences.length > 0) {
+          const fullText = this.#state.tokenizer.decode(generatedIds.slice(stopSequenceStart), false);
+          if (opts.stopSequences.some((seq) => fullText.endsWith(seq))) break;
+        }
+      }
 
-      yield* this._runDecodeLoop(generatedIds, opts, options, {
-        stopTokenIds: this.#state.modelConfig.stopTokenIds,
-        eosToken: this.#state.tokenizer.getSpecialTokens?.()?.eos,
-        stopSequenceStart: inputIds.length,
-        decodeToken,
-        logBatchPath: opts.debug,
-      });
-      const tokensGenerated = this.#state.stats.decodeTokens;
+      this.#state.stats.decodeTimeMs = performance.now() - decodeStart;
+      this.#state.stats.tokensGenerated = tokenIds.length;
+      this.#state.stats.decodeTokens = tokenIds.length;
       this.#state.stats.totalTimeMs = performance.now() - startTime;
 
-      …
-      const ttft = this.#state.stats.ttftMs ?? this.#state.stats.prefillTimeMs;
-      const decodeTokens = Math.max(0, tokensGenerated - 1);
-      const decodeSpeed = decodeTokens > 0 ? (decodeTokens / this.#state.stats.decodeTimeMs * 1000) : 0;
-      if (opts.benchmark) {
-        log.info('Benchmark', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
-      } else {
-        log.info('Perf', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
-      }
-      trace.perf('Decode summary', {
-        ttftMs: ttft,
-        prefillMs: this.#state.stats.prefillTimeMs,
-        decodeMs: this.#state.stats.decodeTimeMs,
-        decodeTokens,
-        decodeSpeed,
-        totalMs: this.#state.stats.totalTimeMs,
-      });
+      return {
+        tokenIds,
+        stats: this.#state.stats,
+      };
     } finally {
       this._closeFinitenessFallbackWindow(opts);
       resetActiveExecutionPlan(this.#state);
```
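`generateTokenIds`, by contrast, is non-streaming: it runs its own decode loop and resolves once with the sampled ids plus the shared stats object. A usage sketch under the same assumption of a loaded `pipeline`:

```js
const controller = new AbortController();
const { tokenIds, stats } = await pipeline.generateTokenIds('Hello', {
  maxTokens: 16,
  signal: controller.signal, // the loop checks options.signal?.aborted each step
});
console.log(tokenIds.length, stats.decodeTokens, stats.ttftMs);
```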
```diff
@@ -463,14 +636,13 @@ export class PipelineGenerator {
 
   async prefillKVOnly(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    if (this.#state.isGenerating) throw new Error('Generation already in progress');
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     const opts = resolvePrefillOptions(this.#state, options);
-
-    const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillKVOnly');
-
-    const inputIds = this.#state.tokenizer.encode(processedPrompt);
-    this._assertTokenIdsInRange(inputIds, 'prefillKVOnly.encode');
+    const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, 'prefillKVOnly');
     if (opts.debug) {
       log.debug('Pipeline', `PrefillKVOnly: ${inputIds.length} tokens`);
     }
@@ -479,7 +651,7 @@ export class PipelineGenerator {
     try {
       prefillResult = await this._prefillToHidden(inputIds, opts);
     } catch (error) {
-      if (error?.name === 'FinitenessError') {
+      if (shouldRetryWithFinitenessFallback(error)) {
        log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefillKVOnly. Retrying with F32 precision.`);
        prefillResult = await this._retryWithFinitenessFallback(
          opts,
@@ -528,14 +700,13 @@ export class PipelineGenerator {
 
   async prefillWithEmbedding(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    if (this.#state.isGenerating) throw new Error('Generation already in progress');
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     const opts = resolvePrefillEmbeddingOptions(this.#state, options);
-
-    const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillWithEmbedding');
-
-    const inputIds = this.#state.tokenizer.encode(processedPrompt);
-    this._assertTokenIdsInRange(inputIds, 'prefillWithEmbedding.encode');
+    const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, 'prefillWithEmbedding');
     if (opts.debug) {
       log.debug('Pipeline', `PrefillWithEmbedding: ${inputIds.length} tokens (mode=${opts.embeddingMode})`);
     }
@@ -544,7 +715,7 @@ export class PipelineGenerator {
     try {
       prefillResult = await this._prefillToHidden(inputIds, opts);
     } catch (error) {
-      if (error?.name === 'FinitenessError') {
+      if (shouldRetryWithFinitenessFallback(error)) {
        log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefillWithEmbedding. Retrying with F32 precision.`);
        prefillResult = await this._retryWithFinitenessFallback(
          opts,
@@ -623,19 +794,13 @@ export class PipelineGenerator {
 
   async prefillWithLogits(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    if (this.#state.isGenerating) throw new Error('Generation already in progress');
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     const opts = resolvePrefillOptions(this.#state, options);
-
-    const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillWithLogits');
-
-    const inputIds = this.#state.tokenizer.encode(processedPrompt);
-    this._assertTokenIdsInRange(inputIds, 'prefillWithLogits.encode');
-    if (opts.debug) {
-      log.debug('Pipeline', `PrefillWithLogits: ${inputIds.length} tokens`);
-    }
-
-    const logits = await this._prefill(inputIds, opts);
+    const { inputIds, logits } = await this._prefillPromptToLogits(prompt, opts, 'prefillWithLogits');
 
     const snapshot = this.#state.kvCache?.clone();
     if (!snapshot) {
```
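The prefill entry points above, like `decodeStepLogits` in the final hunk below, now share a concurrency guard that an internal flag can relax, so code already inside a generation can reuse them. A sketch of the two call shapes (again assuming a loaded `pipeline`; whether any shipped code path actually passes the flag is not shown in this diff):

```js
// External caller: throws while a generation is in flight.
await pipeline.prefillWithLogits('Hello');

// Internal caller: the flag checked in the guard bypasses the rejection.
await pipeline.prefillWithLogits('Hello', { __internalGenerate: true });
```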
```diff
@@ -757,6 +922,7 @@ export class PipelineGenerator {
       stopSequenceStart,
       decodeToken,
       logBatchPath = false,
+      emitMode = 'text',
     } = runtime;
 
     let tokensGenerated = 1;
@@ -786,6 +952,9 @@ export class PipelineGenerator {
     }
     const readbackInterval = executionPlan.readbackInterval;
     const intervalBatches = readbackInterval == null ? 1 : readbackInterval;
+    const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
+
+    const decodeSingleTokenViaLogits = async () => this._decodeNextTokenViaLogits(generatedIds, opts);
 
     if (logBatchPath && useBatchPath) {
       log.debug(
@@ -811,10 +980,16 @@ export class PipelineGenerator {
         for (const tokenId of batchResult.tokens) {
           generatedIds.push(tokenId);
           tokensGenerated++;
-          …
+          if (emitMode === 'token') {
+            yield tokenId;
+            if (options.onToken) options.onToken(tokenId, '');
+            batchTokens.push({ id: tokenId, text: '' });
+          } else {
+            const tokenText = decodeToken(tokenId);
+            yield tokenText;
+            if (options.onToken) options.onToken(tokenId, tokenText);
+            batchTokens.push({ id: tokenId, text: tokenText });
+          }
           if (batchTokens.length === executionPlan.batchSize) {
             if (options.onBatch) options.onBatch(batchTokens);
             batchTokens = [];
@@ -831,9 +1006,9 @@ export class PipelineGenerator {
         useBatchPath = false;
         let nextToken;
         try {
-          nextToken = await …
+          nextToken = await decodeSingleTokenViaLogits();
         } catch (singleTokenError) {
-          if (singleTokenError?.name === 'FinitenessError') {
+          if (shouldRetryWithFinitenessFallback(singleTokenError)) {
            log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at batch step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
            nextToken = await this._retryDecodeStepWithFinitenessWindow(
              generatedIds,
@@ -846,9 +1021,14 @@ export class PipelineGenerator {
        }
        generatedIds.push(nextToken);
        tokensGenerated++;
-        …
+        if (emitMode === 'token') {
+          yield nextToken;
+          if (options.onToken) options.onToken(nextToken, '');
+        } else {
+          const tokenText = decodeToken(nextToken);
+          yield tokenText;
+          if (options.onToken) options.onToken(nextToken, tokenText);
+        }
        this._consumeFinitenessFallbackToken(opts);
        if (isStopToken(nextToken, stopTokenIds, eosToken)) break;
      }
@@ -856,9 +1036,9 @@ export class PipelineGenerator {
       const tokenStart = performance.now();
       let nextToken;
       try {
-        nextToken = await …
+        nextToken = await decodeSingleTokenViaLogits();
       } catch (error) {
-        if (error?.name === 'FinitenessError') {
+        if (shouldRetryWithFinitenessFallback(error)) {
          log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
          nextToken = await this._retryDecodeStepWithFinitenessWindow(
            generatedIds,
@@ -872,9 +1052,14 @@ export class PipelineGenerator {
       const tokenTime = performance.now() - tokenStart;
       generatedIds.push(nextToken);
       tokensGenerated++;
-      const tokenText = decodeToken(nextToken);
-      …
+      const tokenText = emitMode === 'token' ? '' : decodeToken(nextToken);
+      if (emitMode === 'token') {
+        yield nextToken;
+        if (options.onToken) options.onToken(nextToken, '');
+      } else {
+        yield tokenText;
+        if (options.onToken) options.onToken(nextToken, tokenText);
+      }
       this._consumeFinitenessFallbackToken(opts);
 
       if (opts.debug || opts.benchmark) {
@@ -912,17 +1097,22 @@ export class PipelineGenerator {
     if (startPos === 0 && hasLinearAttentionLayers(config.layerTypes)) {
       this.#state.linearAttentionRuntime = resetLinearAttentionRuntime(this.#state.linearAttentionRuntime);
     }
+    if (startPos === 0) {
+      for (const [, convState] of this.#state.convLayerStates) {
+        if (convState.convStateGPU && convState.hiddenSize && convState.kernelSize) {
+          uploadData(convState.convStateGPU, new Float32Array(convState.hiddenSize * (convState.kernelSize - 1)));
+        }
+      }
+    }
 
     const embedBufferRaw = this.#state.weights.get('embed');
     if (!(embedBufferRaw instanceof GPUBuffer) && !isWeightBuffer(embedBufferRaw) && !isCpuWeightBuffer(embedBufferRaw) && !(embedBufferRaw instanceof Float32Array)) {
       throw new Error('Embed buffer not found or not a supported buffer type');
     }
     const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
-    const embedDtype = …
-      ? …
-      : …
-        ? embedBufferRaw.dtype
-        : null;
+    const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+      ? embedBufferRaw.dtype
+      : getWeightDtype(embedBufferRaw);
     if (opts.debug) {
       const embedSize = embedBuffer instanceof GPUBuffer ? embedBuffer.size : 'N/A';
       log.debug('Pipeline', `Embed buffer: type=${embedBuffer?.constructor?.name}, size=${embedSize}, dtype=${embedDtype}`);
```
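The new `startPos === 0` branch zero-fills each gated short-conv layer's GPU state so a fresh prefill never sees carry-over from a previous generation. The buffer length follows from how a short convolution keeps history: a kernel of width k carries the last k - 1 input columns as state. Illustrative numbers:

```js
// Hypothetical sizes for one conv layer; the zero-fill length matches the hunk above.
const hiddenSize = 2048;
const kernelSize = 4;                        // short-conv kernel width
const carryColumns = kernelSize - 1;         // 3 columns of history per channel
const zeros = new Float32Array(hiddenSize * carryColumns); // 6144 zeroed floats
// uploadData(convState.convStateGPU, zeros), as done per layer above
```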
```diff
@@ -1263,18 +1453,15 @@ export class PipelineGenerator {
 
   async decodeStepLogits(currentIds, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    if (this.#state.isGenerating) throw new Error('Generation already in progress');
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
     resetActiveExecutionPlan(this.#state);
 
     validateCallTimeOptions(options);
 
     const opts = this._resolveStepOptions(options);
-    const debugCheckBuffer = this.#state.debug
-      ? (buffer, label, numTokens, expectedDim) =>
-          debugCheckBufferHelper(this.#state, buffer, label, numTokens, expectedDim)
-      : undefined;
-
-    return decodeStepLogits(this.#state, currentIds, opts, this._getDecodeHelpers(debugCheckBuffer));
+    return this._decodeStepToLogits(currentIds, opts);
   }
 
   async advanceWithToken(tokenId, options = {}) {
```