@simulatte/doppler 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -1
- package/README.md +25 -6
- package/package.json +5 -3
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +16 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/loader.js +6 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +7 -0
- package/src/config/presets/models/gemma3.json +2 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +1 -1
- package/src/converter/conversion-plan.js +1 -1
- package/src/converter/core.js +17 -8
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +15 -0
- package/src/distribution/shard-delivery.js +34 -0
- package/src/formats/rdrr/classification.js +32 -0
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +1 -0
- package/src/gpu/kernels/matmul.d.ts +3 -0
- package/src/gpu/kernels/matmul.js +70 -1
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
- package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
- package/src/inference/pipelines/text/attention/projections.js +13 -2
- package/src/inference/pipelines/text/attention/record.js +1 -0
- package/src/inference/pipelines/text/attention/run.js +9 -0
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +32 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +14 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
- package/src/inference/pipelines/text/generator-steps.js +46 -29
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +320 -166
- package/src/inference/pipelines/text/init.d.ts +2 -0
- package/src/inference/pipelines/text/init.js +19 -5
- package/src/inference/pipelines/text/layer.js +37 -8
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +9 -7
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +124 -3
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +2 -0
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/tooling/node-converter.js +25 -7
- package/src/tooling/node-source-runtime.js +29 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/src/tooling/node-convert.d.ts +0 -54
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { getDevice, setTrackSubmits } from '../../../gpu/device.js';
|
|
2
2
|
import { releaseBuffer, readBuffer } from '../../../memory/buffer-pool.js';
|
|
3
|
-
import {
|
|
3
|
+
import { recordArgmax, recordGPUSample, isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
|
|
4
4
|
import { recordCheckStop } from '../../../gpu/kernels/check-stop.js';
|
|
5
5
|
import { resetSubmitStats, logSubmitStats } from '../../../gpu/submit-tracker.js';
|
|
6
6
|
import { createCommandRecorder, createProfilingRecorder, CommandRecorder } from '../../../gpu/command-recorder.js';
|
|
@@ -20,6 +20,7 @@ import { decodeReadback } from './debug-utils/index.js';
|
|
|
20
20
|
import { getFinalNormWeights, extractEmbeddingFromHidden } from './generator-runtime.js';
|
|
21
21
|
import { parseFinitenessStatusWords } from './finiteness-guard-status.js';
|
|
22
22
|
import { hasLinearAttentionLayers } from './linear-attention.js';
|
|
23
|
+
import { hasConvLayers } from './layer.js';
|
|
23
24
|
|
|
24
25
|
const UNKNOWN_TOKEN_TEXT = '<unknown>';
|
|
25
26
|
|
|
@@ -91,6 +92,13 @@ export function shouldUseBatchDecode(config) {
|
|
|
91
92
|
return isBatchDecodeEnabled(config);
|
|
92
93
|
}
|
|
93
94
|
|
|
95
|
+
export function shouldUseFusedDecodeSampling(config) {
|
|
96
|
+
return config.recorderEnabled === true
|
|
97
|
+
&& config.gpuSamplingEnabled === true
|
|
98
|
+
&& config.fusedDecodeDisabled !== true
|
|
99
|
+
&& !hasConvLayers(config.layerTypes ?? []);
|
|
100
|
+
}
|
|
101
|
+
|
|
94
102
|
export function resolveBatchStop(tokens, stopFlags, stopTokenIds, eosTokenId) {
|
|
95
103
|
let actualCount = tokens.length;
|
|
96
104
|
if (stopFlags) {
|
|
@@ -403,7 +411,12 @@ export async function decodeStep(state, currentIds, opts, helpers) {
|
|
|
403
411
|
const padTokenId = state.tokenizer?.getSpecialTokens?.()?.pad ?? null;
|
|
404
412
|
const lmHeadIsCpu = isCpuWeightBuffer(state.weights.get('lm_head'));
|
|
405
413
|
const useGPUSampling = state.useGPU && isGPUSamplingAvailable() && !lmHeadIsCpu;
|
|
406
|
-
const useFusedDecode =
|
|
414
|
+
const useFusedDecode = shouldUseFusedDecodeSampling({
|
|
415
|
+
recorderEnabled: Boolean(recorder),
|
|
416
|
+
gpuSamplingEnabled: useGPUSampling,
|
|
417
|
+
fusedDecodeDisabled: state.disableFusedDecode,
|
|
418
|
+
layerTypes: config.layerTypes,
|
|
419
|
+
});
|
|
407
420
|
|
|
408
421
|
if (useFusedDecode) {
|
|
409
422
|
const ring = state.decodeRing;
|
|
@@ -631,36 +644,35 @@ export async function decodeStep(state, currentIds, opts, helpers) {
|
|
|
631
644
|
);
|
|
632
645
|
if (logitsResult) {
|
|
633
646
|
const { logitsBuffer, vocabSize, logitsDtype } = logitsResult;
|
|
647
|
+
const logitsBytes = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: logitsDtype });
|
|
648
|
+
const logitsData = await readBuffer(logitsBuffer, numTokens * vocabSize * logitsBytes);
|
|
649
|
+
releaseBuffer(logitsBuffer);
|
|
634
650
|
|
|
635
|
-
const
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
randomSeed: opts.seed,
|
|
646
|
-
});
|
|
651
|
+
const rawLogits = decodeReadback(logitsData, logitsDtype);
|
|
652
|
+
const finalizedLogits = await finalizeLogits(
|
|
653
|
+
rawLogits,
|
|
654
|
+
numTokens,
|
|
655
|
+
vocabSize,
|
|
656
|
+
config.vocabSize,
|
|
657
|
+
config,
|
|
658
|
+
state.runtimeConfig.shared.debug.probes
|
|
659
|
+
);
|
|
660
|
+
const sampledLogits = extractLastPositionLogits(finalizedLogits, numTokens, config.vocabSize);
|
|
647
661
|
|
|
648
|
-
|
|
649
|
-
const
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
662
|
+
applyRepetitionPenalty(sampledLogits, currentIds, opts.repetitionPenalty);
|
|
663
|
+
const nextToken = sample(sampledLogits, {
|
|
664
|
+
temperature: opts.temperature,
|
|
665
|
+
topP: opts.topP,
|
|
666
|
+
topK: opts.topK,
|
|
667
|
+
padTokenId,
|
|
668
|
+
seed: opts.seed,
|
|
669
|
+
});
|
|
670
|
+
|
|
671
|
+
if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
|
|
672
|
+
releaseBuffer(hiddenStates);
|
|
658
673
|
}
|
|
659
|
-
state.
|
|
660
|
-
|
|
661
|
-
'Decode',
|
|
662
|
-
`GPU sampling produced invalid token ${nextToken} (vocabSize=${config.vocabSize}, step=${state.decodeStepCount}); falling back to CPU sampling.`
|
|
663
|
-
);
|
|
674
|
+
state.currentSeqLen++;
|
|
675
|
+
return nextToken;
|
|
664
676
|
}
|
|
665
677
|
}
|
|
666
678
|
|
|
@@ -887,6 +899,11 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
|
|
|
887
899
|
'[Pipeline] Batch decode path is disabled for linear_attention models; use single-token decode.'
|
|
888
900
|
);
|
|
889
901
|
}
|
|
902
|
+
if (hasConvLayers(config.layerTypes)) {
|
|
903
|
+
throw new Error(
|
|
904
|
+
'[Pipeline] Batch decode path is disabled for conv models; use single-token decode.'
|
|
905
|
+
);
|
|
906
|
+
}
|
|
890
907
|
const samplingDefaults = state.runtimeConfig.inference.sampling;
|
|
891
908
|
const executionPlan = opts.executionPlan;
|
|
892
909
|
const batchSize = executionPlan?.batchSize ?? opts.batchSize ?? state.runtimeConfig.inference.batching.batchSize;
|
|
@@ -27,6 +27,11 @@ export declare class PipelineGenerator {
|
|
|
27
27
|
* Batching and readback cadence are controlled by runtime.inference.batching.
|
|
28
28
|
*/
|
|
29
29
|
generate(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<string, void, void>;
|
|
30
|
+
generateTokens(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<number, void, void>;
|
|
31
|
+
generateTokenIds(
|
|
32
|
+
prompt: PromptInput,
|
|
33
|
+
options?: GenerateOptions
|
|
34
|
+
): Promise<{ tokenIds: number[]; stats: import('./types.js').PipelineStats }>;
|
|
30
35
|
prefillKVOnly(prompt: PromptInput, options?: GenerateOptions): Promise<KVCacheSnapshot>;
|
|
31
36
|
prefillWithEmbedding(prompt: PromptInput, options?: GenerateOptions): Promise<PrefillEmbeddingResult>;
|
|
32
37
|
prefillWithLogits(prompt: PromptInput, options?: GenerateOptions): Promise<PrefillResult>;
|