@simulatte/doppler 0.1.7 → 0.1.9
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +25 -6
- package/package.json +25 -38
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +2 -2
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +9 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +21 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +4 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +2 -2
- package/src/converter/conversion-plan.js +11 -3
- package/src/converter/core.js +19 -8
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +34 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +40 -1
- package/src/formats/rdrr/classification.js +32 -0
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +48 -4
- package/src/gpu/kernels/matmul.d.ts +5 -0
- package/src/gpu/kernels/matmul.js +71 -2
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
- package/src/inference/pipelines/text/attention/projections.js +54 -13
- package/src/inference/pipelines/text/attention/record.js +16 -6
- package/src/inference/pipelines/text/attention/run.js +59 -6
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +46 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +19 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
- package/src/inference/pipelines/text/generator-steps.js +71 -26
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +353 -166
- package/src/inference/pipelines/text/init.d.ts +15 -0
- package/src/inference/pipelines/text/init.js +35 -10
- package/src/inference/pipelines/text/layer.js +38 -8
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +11 -9
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +130 -4
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +4 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +28 -7
- package/src/tooling/node-source-runtime.js +65 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/types/model.d.ts +5 -0
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/tools/doppler-cli.js +6 -1
- package/src/tooling/node-convert.d.ts +0 -54
@@ -139,6 +139,12 @@ export function resolveStepOptions(state, options = {}) {
   const executionPlan = resolveExecutionSessionPlan(state, options);
 
   return {
+    seed: resolveConfiguredValue(
+      options.seed,
+      undefined,
+      'options.seed',
+      (value) => Number.isFinite(value) && value >= 0
+    ),
     temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
     topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),
     topK: resolveConfiguredValue(options.topK, samplingDefaults.topK, 'options.topK'),
@@ -165,6 +171,12 @@ export function resolveGenerateOptions(state, options = {}) {
   const executionPlan = resolveExecutionSessionPlan(state, options);
 
   return {
+    seed: resolveConfiguredValue(
+      options.seed,
+      undefined,
+      'options.seed',
+      (value) => Number.isFinite(value) && value >= 0
+    ),
     maxTokens: executionPlan.maxTokens,
     temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
     topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),
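
Both resolvers validate `seed` the same way: it is taken only when finite and non-negative, and it has no runtime-config default. A minimal caller-side sketch, assuming an already-initialized `PipelineGenerator` instance (see the `generator.d.ts` hunk near the end of this diff); option names other than `seed` follow the sampling defaults shown above:

```js
// Sketch only: `generator` stands in for an initialized PipelineGenerator.
// Per the validator above, seed is used only when it is a finite number >= 0;
// anything else falls back to undefined (non-deterministic sampling).
async function generateDeterministic(generator, prompt) {
  let text = '';
  for await (const chunk of generator.generate(prompt, {
    seed: 42, // accepted: finite and >= 0
    temperature: 0.7,
    topP: 0.9,
  })) {
    text += chunk;
  }
  return text;
}
```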
@@ -191,6 +203,7 @@ export function resolveGenerateOptions(state, options = {}) {
     batchSize: executionPlan.batchSize,
     stopCheckMode: executionPlan.stopCheckMode,
     executionPlan,
+    images: options.images ?? null,
   };
 }
 
@@ -205,6 +218,7 @@ export function resolvePrefillOptions(state, options = {}) {
     disableCommandBatching: executionPlan.disableCommandBatching,
     disableMultiTokenDecode: executionPlan.disableMultiTokenDecode,
     executionPlan,
+    images: options.images ?? null,
   };
 }
 
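
`images` now rides along on both the generate and prefill option paths, defaulting to `null`. A sketch of passing an image payload, assuming `prefillWithEmbedding` as the entry point; the accepted payload type is not shown in this diff:

```js
// Sketch only: `generator` is an initialized PipelineGenerator and `image` is
// whatever decoded image payload the vision pipeline accepts (the payload type
// is not shown in this diff). Omitting the option forwards `null`.
async function prefillWithImage(generator, prompt, image) {
  return generator.prefillWithEmbedding(prompt, { images: [image] });
}
```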
@@ -213,6 +227,10 @@ export function resolvePrefillEmbeddingOptions(state, options = {}) {
     ? state.manifest.modelType.toLowerCase()
     : '';
   const generationDefaults = state.runtimeConfig.inference.generation;
+  // Embedding models default to 'mean' pooling — this is a model-category behavior,
+  // not a model-family identity check. Ideally embedding model presets would set
+  // generation.embeddingMode='mean' in their runtime config; the modelType fallback
+  // provides this default for manifests that predate runtime-preset embedding mode.
   const defaultEmbeddingMode = modelType === 'embedding'
     ? 'mean'
     : generationDefaults.embeddingMode;
@@ -226,6 +244,7 @@ export function resolveAdvanceEmbeddingMode(state, options = {}) {
   const modelType = typeof state.manifest?.modelType === 'string'
     ? state.manifest.modelType.toLowerCase()
     : '';
+  // See resolvePrefillEmbeddingOptions for embedding-model pooling rationale.
   const configuredMode = state.runtimeConfig.inference.generation.embeddingMode;
   return resolveConfiguredValue(
     options.embeddingMode,
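
For illustration, the precedence these embedding-mode resolvers implement can be summarized as a hypothetical standalone function (not part of the package):

```js
// Hypothetical summary of the precedence above: an explicit option wins,
// then embedding-type manifests force 'mean', then the runtime-config default.
function resolveEmbeddingModeSketch(optionMode, configuredMode, modelType) {
  if (optionMode !== undefined) return optionMode;
  if (modelType === 'embedding') return 'mean';
  return configuredMode;
}
```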
@@ -12,6 +12,15 @@ export interface BatchDecodeSelectionConfig {
 
 export declare function shouldUseBatchDecode(config: BatchDecodeSelectionConfig): boolean;
 
+export interface FusedDecodeSamplingConfig {
+  recorderEnabled: boolean;
+  gpuSamplingEnabled: boolean;
+  fusedDecodeDisabled: boolean;
+  layerTypes?: string[] | null;
+}
+
+export declare function shouldUseFusedDecodeSampling(config: FusedDecodeSamplingConfig): boolean;
+
 export declare function resolveBatchStop(
   tokens: number[],
   stopFlags: Uint32Array | null,
@@ -19,6 +28,12 @@ export declare function resolveBatchStop(
   eosTokenId: number | undefined | null
 ): number;
 
+export declare function findInvalidGeneratedToken(
+  tokens: number[],
+  vocabSize: number,
+  padTokenId?: number | null
+): { index: number; tokenId: number } | null;
+
 export interface SampledTokenStagingBuffer {
   mapAsync(mode: number): Promise<void>;
   getMappedRange(): ArrayBufferLike;
@@ -1,6 +1,6 @@
 import { getDevice, setTrackSubmits } from '../../../gpu/device.js';
 import { releaseBuffer, readBuffer } from '../../../memory/buffer-pool.js';
-import {
+import { recordArgmax, recordGPUSample, isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
 import { recordCheckStop } from '../../../gpu/kernels/check-stop.js';
 import { resetSubmitStats, logSubmitStats } from '../../../gpu/submit-tracker.js';
 import { createCommandRecorder, createProfilingRecorder, CommandRecorder } from '../../../gpu/command-recorder.js';
@@ -20,6 +20,7 @@ import { decodeReadback } from './debug-utils/index.js';
 import { getFinalNormWeights, extractEmbeddingFromHidden } from './generator-runtime.js';
 import { parseFinitenessStatusWords } from './finiteness-guard-status.js';
 import { hasLinearAttentionLayers } from './linear-attention.js';
+import { hasConvLayers } from './layer.js';
 
 const UNKNOWN_TOKEN_TEXT = '<unknown>';
 
@@ -91,6 +92,13 @@ export function shouldUseBatchDecode(config) {
   return isBatchDecodeEnabled(config);
 }
 
+export function shouldUseFusedDecodeSampling(config) {
+  return config.recorderEnabled === true
+    && config.gpuSamplingEnabled === true
+    && config.fusedDecodeDisabled !== true
+    && !hasConvLayers(config.layerTypes ?? []);
+}
+
 export function resolveBatchStop(tokens, stopFlags, stopTokenIds, eosTokenId) {
   let actualCount = tokens.length;
   if (stopFlags) {
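
A worked example of the predicate, assuming `hasConvLayers` flags a `'conv'` entry in `layerTypes` (its exact matching rule lives in `layer.js` and is not shown here):

```js
// All three flags must line up and conv layers must be absent:
shouldUseFusedDecodeSampling({
  recorderEnabled: true,
  gpuSamplingEnabled: true,
  fusedDecodeDisabled: false,
  layerTypes: ['attention', 'conv'],
}); // => false: conv layers force the unfused decode path

shouldUseFusedDecodeSampling({
  recorderEnabled: true,
  gpuSamplingEnabled: true,
  fusedDecodeDisabled: false,
  layerTypes: ['attention'],
}); // => true
```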
@@ -113,6 +121,20 @@ export function resolveBatchStop(tokens, stopFlags, stopTokenIds, eosTokenId) {
   return actualCount;
 }
 
+export function findInvalidGeneratedToken(tokens, vocabSize, padTokenId = null) {
+  for (let i = 0; i < tokens.length; i++) {
+    const tokenId = tokens[i];
+    const isInvalid = !Number.isFinite(tokenId)
+      || tokenId < 0
+      || tokenId >= vocabSize
+      || (padTokenId != null ? tokenId === padTokenId : tokenId === 0);
+    if (isInvalid) {
+      return { index: i, tokenId };
+    }
+  }
+  return null;
+}
+
 export async function readSampledTokenFromStagingBuffer(stagingBuffer, options = {}) {
   const ownsStagingBuffer = options.ownsStagingBuffer === true;
   const hasFinitenessBuffer = options.hasFinitenessBuffer === true;
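
Illustrative calls against the validator above (vocab size and pad id are example values, not taken from any model):

```js
findInvalidGeneratedToken([5, 12, 7], 32000, 0);    // => null (all tokens valid)
findInvalidGeneratedToken([5, 32000, 7], 32000, 0); // => { index: 1, tokenId: 32000 } (out of vocab)
findInvalidGeneratedToken([5, 0, 7], 32000, null);  // => { index: 1, tokenId: 0 }
// With no known padTokenId, token 0 is treated as a suspect pad token.
```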
@@ -240,11 +262,9 @@ async function runDecodeLayers(state, tokenId, opts, helpers) {
     throw new Error('Embed buffer not found or not a supported buffer type');
   }
   const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
-  const embedDtype =
-    ?
-    :
-      ? embedBufferRaw.dtype
-      : null;
+  const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+    ? embedBufferRaw.dtype
+    : getWeightDtype(embedBufferRaw);
   const activationDtype = getEffectiveActivationDtype(state, opts);
 
   const embedTensor = await embed([tokenId], embedBuffer, {
@@ -326,11 +346,9 @@ export async function decodeStep(state, currentIds, opts, helpers) {
     throw new Error('Embed buffer not found or not a supported buffer type');
   }
   const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
-  const embedDtype =
-    ?
-    :
-      ? embedBufferRaw.dtype
-      : null;
+  const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+    ? embedBufferRaw.dtype
+    : getWeightDtype(embedBufferRaw);
   const activationDtype = getEffectiveActivationDtype(state, opts);
   const activationBytes = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: activationDtype });
 
@@ -393,7 +411,12 @@ export async function decodeStep(state, currentIds, opts, helpers) {
   const padTokenId = state.tokenizer?.getSpecialTokens?.()?.pad ?? null;
   const lmHeadIsCpu = isCpuWeightBuffer(state.weights.get('lm_head'));
   const useGPUSampling = state.useGPU && isGPUSamplingAvailable() && !lmHeadIsCpu;
-  const useFusedDecode =
+  const useFusedDecode = shouldUseFusedDecodeSampling({
+    recorderEnabled: Boolean(recorder),
+    gpuSamplingEnabled: useGPUSampling,
+    fusedDecodeDisabled: state.disableFusedDecode,
+    layerTypes: config.layerTypes,
+  });
 
   if (useFusedDecode) {
     const ring = state.decodeRing;
@@ -621,21 +644,30 @@ export async function decodeStep(state, currentIds, opts, helpers) {
   );
   if (logitsResult) {
     const { logitsBuffer, vocabSize, logitsDtype } = logitsResult;
+    const logitsBytes = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: logitsDtype });
+    const logitsData = await readBuffer(logitsBuffer, numTokens * vocabSize * logitsBytes);
+    releaseBuffer(logitsBuffer);
 
-    const
-
-
-
-
-
-
-
-
-
-
+    const rawLogits = decodeReadback(logitsData, logitsDtype);
+    const finalizedLogits = await finalizeLogits(
+      rawLogits,
+      numTokens,
+      vocabSize,
+      config.vocabSize,
+      config,
+      state.runtimeConfig.shared.debug.probes
+    );
+    const sampledLogits = extractLastPositionLogits(finalizedLogits, numTokens, config.vocabSize);
+
+    applyRepetitionPenalty(sampledLogits, currentIds, opts.repetitionPenalty);
+    const nextToken = sample(sampledLogits, {
+      temperature: opts.temperature,
+      topP: opts.topP,
+      topK: opts.topK,
+      padTokenId,
+      seed: opts.seed,
+    });
 
-    releaseBuffer(logitsBuffer);
     if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
       releaseBuffer(hiddenStates);
     }
@@ -867,6 +899,11 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
       '[Pipeline] Batch decode path is disabled for linear_attention models; use single-token decode.'
     );
   }
+  if (hasConvLayers(config.layerTypes)) {
+    throw new Error(
+      '[Pipeline] Batch decode path is disabled for conv models; use single-token decode.'
+    );
+  }
   const samplingDefaults = state.runtimeConfig.inference.sampling;
   const executionPlan = opts.executionPlan;
   const batchSize = executionPlan?.batchSize ?? opts.batchSize ?? state.runtimeConfig.inference.batching.batchSize;
@@ -981,7 +1018,7 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
     throw new Error('Embed buffer not found or not a GPUBuffer/WeightBuffer');
   }
   const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
-  const embedDtype =
+  const embedDtype = getWeightDtype(embedBufferRaw);
   const activationDtype = getEffectiveActivationDtype(state, opts);
 
   for (let i = 0; i < N; i++) {
@@ -1125,10 +1162,18 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
 
   const actualCount = resolveBatchStop(tokens, stopFlags, stopTokenIds, eosToken);
   const generatedTokens = tokens.slice(0, actualCount);
+  const invalidToken = findInvalidGeneratedToken(generatedTokens, config.vocabSize, padTokenId);
 
   if (isInfinite) {
     throw new FinitenessError(`F16 bounds exceeded during batch generation${metadata}`);
   }
+  if (invalidToken) {
+    state.disableFusedDecode = true;
+    throw new Error(
+      `[Pipeline] Batch decode produced invalid token ${invalidToken.tokenId} ` +
+      `at batch index ${invalidToken.index} (vocabSize=${config.vocabSize}, padTokenId=${padTokenId ?? 'none'}).`
+    );
+  }
 
   if (opts.profile && recorder.isProfilingEnabled()) {
     const timings = await recorder.resolveProfileTimings();
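
Note that the guard flips `state.disableFusedDecode` before throwing, which suggests a retried generation can succeed on the unfused path. A hypothetical caller-side wrapper, not part of the package API:

```js
// Hypothetical retry wrapper (assumption, not package code). After the throw
// above, state.disableFusedDecode is already set, so the second attempt
// takes the unfused decode path.
async function generateWithFusedFallback(run) {
  try {
    return await run();
  } catch (err) {
    if (String(err?.message).includes('Batch decode produced invalid token')) {
      return run();
    }
    throw err;
  }
}
```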
@@ -27,6 +27,11 @@ export declare class PipelineGenerator {
    * Batching and readback cadence are controlled by runtime.inference.batching.
    */
   generate(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<string, void, void>;
+  generateTokens(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<number, void, void>;
+  generateTokenIds(
+    prompt: PromptInput,
+    options?: GenerateOptions
+  ): Promise<{ tokenIds: number[]; stats: import('./types.js').PipelineStats }>;
   prefillKVOnly(prompt: PromptInput, options?: GenerateOptions): Promise<KVCacheSnapshot>;
   prefillWithEmbedding(prompt: PromptInput, options?: GenerateOptions): Promise<PrefillEmbeddingResult>;
   prefillWithLogits(prompt: PromptInput, options?: GenerateOptions): Promise<PrefillResult>;