@simulatte/doppler 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +25 -6
- package/package.json +25 -38
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +2 -2
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +9 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +21 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +4 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +2 -2
- package/src/converter/conversion-plan.js +11 -3
- package/src/converter/core.js +19 -8
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +34 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +40 -1
- package/src/formats/rdrr/classification.js +32 -0
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +48 -4
- package/src/gpu/kernels/matmul.d.ts +5 -0
- package/src/gpu/kernels/matmul.js +71 -2
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
- package/src/inference/pipelines/text/attention/projections.js +54 -13
- package/src/inference/pipelines/text/attention/record.js +16 -6
- package/src/inference/pipelines/text/attention/run.js +59 -6
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +46 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +19 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
- package/src/inference/pipelines/text/generator-steps.js +71 -26
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +353 -166
- package/src/inference/pipelines/text/init.d.ts +15 -0
- package/src/inference/pipelines/text/init.js +35 -10
- package/src/inference/pipelines/text/layer.js +38 -8
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +11 -9
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +130 -4
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +4 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +28 -7
- package/src/tooling/node-source-runtime.js +65 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/types/model.d.ts +5 -0
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/tools/doppler-cli.js +6 -1
- package/src/tooling/node-convert.d.ts +0 -54
|
@@ -28,6 +28,7 @@ import type {
|
|
|
28
28
|
SpeculativeConfigSchema,
|
|
29
29
|
KernelPathSchema,
|
|
30
30
|
} from '../../../config/schema/index.js';
|
|
31
|
+
import type { LoaderDebugConfigSchema } from '../../../config/schema/debug.schema.js';
|
|
31
32
|
import type { KernelPathSource } from '../../../config/kernel-path-loader.js';
|
|
32
33
|
|
|
33
34
|
export interface PipelineStorageContext {
|
|
@@ -190,6 +191,12 @@ export interface WeightLoadResult {
|
|
|
190
191
|
layerRouterWeights: Map<number, RouterWeights>;
|
|
191
192
|
}
|
|
192
193
|
|
|
194
|
+
export interface ResolvedQ4KConfig {
|
|
195
|
+
useFusedQ4K: boolean;
|
|
196
|
+
q4kLayout: 'row' | 'col' | null;
|
|
197
|
+
keepF32Weights: boolean;
|
|
198
|
+
}
|
|
199
|
+
|
|
193
200
|
/** Options for loadWeights */
|
|
194
201
|
export interface LoadWeightsOptions {
|
|
195
202
|
storageContext?: PipelineStorageContext;
|
|
@@ -200,6 +207,7 @@ export interface LoadWeightsOptions {
|
|
|
200
207
|
resolvedKernelPath?: KernelPathSchema | null;
|
|
201
208
|
kernelPathSource?: KernelPathSource;
|
|
202
209
|
keepF32Weights?: boolean;
|
|
210
|
+
loaderDebug?: LoaderDebugConfigSchema | null;
|
|
203
211
|
}
|
|
204
212
|
|
|
205
213
|
/**
|
|
@@ -211,6 +219,13 @@ export function loadWeights(
|
|
|
211
219
|
options?: LoadWeightsOptions
|
|
212
220
|
): Promise<WeightLoadResult>;
|
|
213
221
|
|
|
222
|
+
export function resolveQ4KConfig(
|
|
223
|
+
manifest: Manifest,
|
|
224
|
+
kernelPath?: KernelPathSchema | null,
|
|
225
|
+
kernelPathSource?: KernelPathSource,
|
|
226
|
+
keepF32Weights?: boolean
|
|
227
|
+
): ResolvedQ4KConfig;
|
|
228
|
+
|
|
214
229
|
/**
|
|
215
230
|
* Apply Gemma chat template to a prompt.
|
|
216
231
|
*/
|
|
@@ -11,7 +11,7 @@ import { getDopplerLoader } from '../../../loader/doppler-loader.js';
|
|
|
11
11
|
import { log, setGPUDevice, trace as debugTrace } from '../../../debug/index.js';
|
|
12
12
|
import { getRuntimeConfig } from '../../../config/runtime.js';
|
|
13
13
|
import { PAGED_LAYOUT_SEQ_LEN_THRESHOLD } from '../../../config/schema/index.js';
|
|
14
|
-
import { isKernelPathFusedQ4K } from '../../../config/kernel-path-loader.js';
|
|
14
|
+
import { isKernelPathFusedQ4K, kernelPathRequiresF32MatmulWeights } from '../../../config/kernel-path-loader.js';
|
|
15
15
|
import { createWeightBuffer, getWeightDtype, isWeightBuffer } from '../../../gpu/weight-buffer.js';
|
|
16
16
|
import { selectRuleValue } from '../../../rules/rule-registry.js';
|
|
17
17
|
import {
|
|
@@ -128,7 +128,7 @@ function createRemoteStorageContext(baseUrl, manifest) {
|
|
|
128
128
|
}
|
|
129
129
|
|
|
130
130
|
|
|
131
|
-
function resolveQ4KConfig(
|
|
131
|
+
export function resolveQ4KConfig(
|
|
132
132
|
manifest,
|
|
133
133
|
kernelPath,
|
|
134
134
|
kernelPathSource = 'none',
|
|
@@ -150,18 +150,23 @@ function resolveQ4KConfig(
|
|
|
150
150
|
);
|
|
151
151
|
}
|
|
152
152
|
let useFused = kernelPath ? isKernelPathFusedQ4K(kernelPath) : hasSubgroups;
|
|
153
|
+
const kernelPathKeepsF32Weights = kernelPathRequiresF32MatmulWeights(kernelPath);
|
|
153
154
|
if (q4kLayout === 'col') {
|
|
154
155
|
useFused = false;
|
|
155
156
|
}
|
|
157
|
+
const resolvedKeepF32Weights = keepF32Weights || kernelPathKeepsF32Weights;
|
|
156
158
|
|
|
157
159
|
const pathLabel = kernelPath?.id ?? 'auto';
|
|
158
160
|
const layoutLabel = q4kLayout ?? 'none';
|
|
159
|
-
debugTrace.loader(
|
|
161
|
+
debugTrace.loader(
|
|
162
|
+
`Q4K config: fused=${useFused}, kernelPath=${pathLabel}, source=${kernelPathSource}, ` +
|
|
163
|
+
`layout=${layoutLabel}, keepF32Weights=${resolvedKeepF32Weights}, subgroups=${hasSubgroups}`
|
|
164
|
+
);
|
|
160
165
|
|
|
161
166
|
return {
|
|
162
167
|
useFusedQ4K: useFused,
|
|
163
168
|
q4kLayout,
|
|
164
|
-
keepF32Weights,
|
|
169
|
+
keepF32Weights: resolvedKeepF32Weights,
|
|
165
170
|
};
|
|
166
171
|
}
|
|
167
172
|
|
|
@@ -304,13 +309,21 @@ export async function initRoPEFrequencies(config, useGPU) {
|
|
|
304
309
|
if (!Number.isFinite(ropeScale) || ropeScale <= 0) {
|
|
305
310
|
throw new Error(`RoPE scale must be a positive number; got "${ropeScale}".`);
|
|
306
311
|
}
|
|
307
|
-
const resolvedLocalScale = ropeLocalScale
|
|
308
|
-
if (!Number.isFinite(resolvedLocalScale) || resolvedLocalScale <= 0) {
|
|
312
|
+
const resolvedLocalScale = ropeLocalScale;
|
|
313
|
+
if (resolvedLocalScale != null && (!Number.isFinite(resolvedLocalScale) || resolvedLocalScale <= 0)) {
|
|
309
314
|
throw new Error(`Local RoPE scale must be a positive number; got "${resolvedLocalScale}".`);
|
|
310
315
|
}
|
|
311
316
|
const resolvedLocalTheta = ropeLocalTheta ?? ropeTheta;
|
|
312
|
-
const resolvedLocalScalingType =
|
|
313
|
-
|
|
317
|
+
const resolvedLocalScalingType = (
|
|
318
|
+
ropeLocalScalingType === undefined
|
|
319
|
+
? ropeScalingType
|
|
320
|
+
: ropeLocalScalingType
|
|
321
|
+
);
|
|
322
|
+
const resolvedLocalScaling = (
|
|
323
|
+
ropeLocalScalingType === undefined
|
|
324
|
+
? ropeScaling
|
|
325
|
+
: ropeLocalScaling
|
|
326
|
+
);
|
|
314
327
|
const resolvedRotaryDim = resolveRotaryDim(headDim, rotaryDim, partialRotaryFactor);
|
|
315
328
|
const halfDim = resolvedRotaryDim / 2;
|
|
316
329
|
if (mropeInterleaved === true && Array.isArray(mropeSection)) {
|
|
@@ -502,6 +515,12 @@ export function createKVCache(modelConfig, useGPU, debug = false, runtimeConfig)
|
|
|
502
515
|
cacheLayout = 'paged';
|
|
503
516
|
layoutSource = 'threshold';
|
|
504
517
|
}
|
|
518
|
+
if (forceContiguousKVCache && cacheLayout === 'paged') {
|
|
519
|
+
throw new Error(
|
|
520
|
+
'Paged KV cache layout is not supported for models with full-attention layers. ' +
|
|
521
|
+
'Set runtime.inference.kvcache.layout to "contiguous" instead.'
|
|
522
|
+
);
|
|
523
|
+
}
|
|
505
524
|
if (debug && cacheLayout !== runtimeKV.layout) {
|
|
506
525
|
log.debug('Pipeline', `KV cache layout override: ${runtimeKV.layout} -> ${cacheLayout} (${layoutSource})`);
|
|
507
526
|
}
|
|
@@ -599,7 +618,7 @@ export function createKVCache(modelConfig, useGPU, debug = false, runtimeConfig)
|
|
|
599
618
|
|
|
600
619
|
if (debug) {
|
|
601
620
|
if (forceContiguousKVCache && modelConfig.layerTypes) {
|
|
602
|
-
log.debug('Pipeline', 'Layer pattern includes full-attention layers;
|
|
621
|
+
log.debug('Pipeline', 'Layer pattern includes full-attention layers; paged layout blocked, contiguous enforced.');
|
|
603
622
|
}
|
|
604
623
|
const isSliding = kvCache instanceof SlidingWindowKVCache;
|
|
605
624
|
log.debug('Pipeline', `KV cache: type=${kvCache?.constructor?.name || 'unknown'}, kvDtype=${kvCache.kvDtype}, layout=${kvCache.layout}, maxSeqLen=${kvCache.maxSeqLen}, windowSize=${isSliding ? kvCache.windowSize : null}`);
|
|
@@ -635,7 +654,12 @@ export async function initTokenizer(manifest, options = {}) {
|
|
|
635
654
|
|
|
636
655
|
|
|
637
656
|
export async function loadWeights(manifest, modelConfig, options = {}) {
|
|
638
|
-
const {
|
|
657
|
+
const {
|
|
658
|
+
onProgress,
|
|
659
|
+
loadingConfig,
|
|
660
|
+
baseUrl,
|
|
661
|
+
loaderDebug,
|
|
662
|
+
} = options;
|
|
639
663
|
const runtimeStorageContext = options.storageContext
|
|
640
664
|
?? createRemoteStorageContext(baseUrl, manifest);
|
|
641
665
|
const verifyHashes = (
|
|
@@ -657,6 +681,7 @@ export async function loadWeights(manifest, modelConfig, options = {}) {
|
|
|
657
681
|
keepF32Weights
|
|
658
682
|
)
|
|
659
683
|
);
|
|
684
|
+
dopplerLoader.setLoaderDebugConfig(loaderDebug ?? null);
|
|
660
685
|
|
|
661
686
|
const tensorsFile = isRDRRManifest(manifest) ? manifest.tensorsFile : null;
|
|
662
687
|
if (baseUrl && tensorsFile) {
|
|
@@ -43,19 +43,16 @@ export function detectSandwichNorm(config) {
|
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
|
|
46
|
-
export function isMoELayer(layerIdx, config
|
|
46
|
+
export function isMoELayer(layerIdx, config) {
|
|
47
47
|
if (!config.useMoE) return false;
|
|
48
48
|
|
|
49
|
-
//
|
|
50
|
-
if (layerWeights?.routerWeight) return true;
|
|
51
|
-
|
|
52
|
-
// Fall back to layer_types array if available
|
|
49
|
+
// Manifest-first: check layerTypes from config (derived from manifest.inference.layerPattern)
|
|
53
50
|
const layerTypes = config.layerTypes;
|
|
54
51
|
if (Array.isArray(layerTypes) && layerIdx < layerTypes.length) {
|
|
55
52
|
return layerTypes[layerIdx] === 'moe';
|
|
56
53
|
}
|
|
57
54
|
|
|
58
|
-
//
|
|
55
|
+
// No layerTypes available: assume all layers are MoE
|
|
59
56
|
return true;
|
|
60
57
|
}
|
|
61
58
|
|
|
@@ -87,6 +84,11 @@ function assertSupportedLayerRuntime(layerIdx, config) {
|
|
|
87
84
|
}
|
|
88
85
|
}
|
|
89
86
|
|
|
87
|
+
function getConvLayerState(convLayerStates, layerIdx) {
|
|
88
|
+
if (!convLayerStates) return {};
|
|
89
|
+
return convLayerStates.get(layerIdx) ?? {};
|
|
90
|
+
}
|
|
91
|
+
|
|
90
92
|
function isSlidingLayerType(layerType) {
|
|
91
93
|
const normalized = normalizeLayerType(layerType);
|
|
92
94
|
return normalized === 'sliding_attention'
|
|
@@ -103,6 +105,14 @@ function isConvLayerType(layerType) {
|
|
|
103
105
|
|| normalized === 'liv_convolution';
|
|
104
106
|
}
|
|
105
107
|
|
|
108
|
+
export function hasConvLayers(layerTypes) {
|
|
109
|
+
if (!Array.isArray(layerTypes)) return false;
|
|
110
|
+
for (let i = 0; i < layerTypes.length; i++) {
|
|
111
|
+
if (isConvLayerType(layerTypes[i])) return true;
|
|
112
|
+
}
|
|
113
|
+
return false;
|
|
114
|
+
}
|
|
115
|
+
|
|
106
116
|
function isLinearLayerType(layerType) {
|
|
107
117
|
const normalized = normalizeLayerType(layerType);
|
|
108
118
|
return normalized === 'linear_attention'
|
|
@@ -201,8 +211,22 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
|
|
|
201
211
|
);
|
|
202
212
|
}
|
|
203
213
|
const convKernel = layerWeights?.convKernel ?? null;
|
|
214
|
+
// Apply input norm (operator_norm) before conv mixer — matches HF Lfm2 forward pass
|
|
215
|
+
let normedTensor = inputTensor;
|
|
216
|
+
const inputNormWeight = layerWeights?.inputNorm ?? null;
|
|
217
|
+
if (inputNormWeight) {
|
|
218
|
+
const normWeightBuf = getNormWeightBuffer(inputNormWeight, `L${layerIdx}.conv_input_norm`);
|
|
219
|
+
normedTensor = await doRMSNorm(inputTensor, normWeightBuf, rmsNormEps, {
|
|
220
|
+
batchSize: numTokens,
|
|
221
|
+
hiddenSize,
|
|
222
|
+
rmsNormWeightOffset: config.rmsNormWeightOffset,
|
|
223
|
+
label: `L${layerIdx}.conv_input_norm`,
|
|
224
|
+
layerIdx,
|
|
225
|
+
}, recorder);
|
|
226
|
+
if (!(inputNormWeight instanceof GPUBuffer)) releaseOrTrack(recorder, normWeightBuf);
|
|
227
|
+
}
|
|
204
228
|
attnOutput = await doConv(
|
|
205
|
-
|
|
229
|
+
normedTensor,
|
|
206
230
|
getWeightBuffer(convInProj, `L${layerIdx}.conv_in_proj`),
|
|
207
231
|
convKernel ? getWeightBuffer(convKernel, `L${layerIdx}.conv_kernel`) : null,
|
|
208
232
|
getWeightBuffer(convOutProj, `L${layerIdx}.conv_out_proj`),
|
|
@@ -213,9 +237,13 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
|
|
|
213
237
|
label: `L${layerIdx}.conv`,
|
|
214
238
|
swigluLimit: config.swigluLimit,
|
|
215
239
|
kernelPath: context.kernelPath ?? null,
|
|
240
|
+
convState: getConvLayerState(context.convLayerStates, layerIdx),
|
|
216
241
|
},
|
|
217
242
|
recorder
|
|
218
243
|
);
|
|
244
|
+
if (normedTensor !== inputTensor) {
|
|
245
|
+
releaseOrTrack(recorder, normedTensor.buffer);
|
|
246
|
+
}
|
|
219
247
|
} else if (isLinearLayer) {
|
|
220
248
|
attnOutput = await runLinearAttentionLayer(inputTensor, layerWeights ?? null, {
|
|
221
249
|
layerIdx,
|
|
@@ -276,6 +304,7 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
|
|
|
276
304
|
: (ropeFreqsSin),
|
|
277
305
|
kvCache: ((kvCache)),
|
|
278
306
|
stats: context.stats,
|
|
307
|
+
debugProbes: context.debugProbes,
|
|
279
308
|
linearRuntime: context.linearAttentionRuntime ?? null,
|
|
280
309
|
};
|
|
281
310
|
|
|
@@ -720,6 +749,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
|
|
|
720
749
|
label: `L${layerIdx}.plan_conv`,
|
|
721
750
|
swigluLimit: config.swigluLimit,
|
|
722
751
|
kernelPath: context.kernelPath ?? null,
|
|
752
|
+
convState: getConvLayerState(context.convLayerStates, layerIdx),
|
|
723
753
|
},
|
|
724
754
|
recorder
|
|
725
755
|
);
|
|
@@ -781,7 +811,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
|
|
|
781
811
|
let outputTensor;
|
|
782
812
|
const { runMoEFFNGPU, runDenseFFNGPU } = await import('./ffn/index.js');
|
|
783
813
|
|
|
784
|
-
const canAutoMoe = config.useMoE && isMoELayer(layerIdx, config
|
|
814
|
+
const canAutoMoe = config.useMoE && isMoELayer(layerIdx, config);
|
|
785
815
|
const useMoe = selectRuleValue(
|
|
786
816
|
'inference',
|
|
787
817
|
'layer',
|
|
@@ -84,6 +84,11 @@ export declare function inferLinearNormMode(
|
|
|
84
84
|
}
|
|
85
85
|
): LinearNormMode | null;
|
|
86
86
|
|
|
87
|
+
export declare function applyLinearNormWeightOffset(
|
|
88
|
+
values: Float32Array,
|
|
89
|
+
rmsNormWeightOffset: boolean
|
|
90
|
+
): Float32Array;
|
|
91
|
+
|
|
87
92
|
export declare function resetLinearAttentionRuntime(
|
|
88
93
|
runtime: LinearAttentionRuntime | null | undefined
|
|
89
94
|
): LinearAttentionRuntime;
|
|
@@ -5,6 +5,8 @@ import { log } from '../../../debug/index.js';
|
|
|
5
5
|
import { decodeReadback } from './debug-utils/index.js';
|
|
6
6
|
import { runLinearAttentionCoreGPU } from '../../../gpu/kernels/linear-attention-core.js';
|
|
7
7
|
import { runProbes } from './probes.js';
|
|
8
|
+
import { QK_K, Q4K_BLOCK_BYTES } from '../../../config/schema/index.js';
|
|
9
|
+
import { dequantizeQ4KM } from '../../../converter/quantizer.js';
|
|
8
10
|
|
|
9
11
|
const LINEAR_RUNTIME_SCHEMA_VERSION = 1;
|
|
10
12
|
const QK_L2NORM_EPS = 1e-6;
|
|
@@ -34,6 +36,15 @@ function bytesFromDtype(dtype) {
|
|
|
34
36
|
return 4;
|
|
35
37
|
}
|
|
36
38
|
|
|
39
|
+
export function applyLinearNormWeightOffset(values, rmsNormWeightOffset) {
|
|
40
|
+
if (!(values instanceof Float32Array)) {
|
|
41
|
+
throw new Error('applyLinearNormWeightOffset requires Float32Array input.');
|
|
42
|
+
}
|
|
43
|
+
// Qwen linear-attention output norm uses direct weights even when surrounding
|
|
44
|
+
// transformer RMSNorm sites use the Gemma-style (1 + weight) formula.
|
|
45
|
+
return values;
|
|
46
|
+
}
|
|
47
|
+
|
|
37
48
|
function cloneLayerRuntimeState(layerState) {
|
|
38
49
|
return {
|
|
39
50
|
layerIdx: layerState.layerIdx,
|
|
@@ -283,9 +294,27 @@ async function readWeightAsF32(weight, expectedElements, label) {
|
|
|
283
294
|
if (!elementCount && isWeightBuffer(weight) && Array.isArray(weight.shape) && weight.shape.length > 0) {
|
|
284
295
|
elementCount = weight.shape.reduce((total, dim) => total * Math.max(1, Math.trunc(Number(dim) || 0)), 1);
|
|
285
296
|
}
|
|
297
|
+
const isQ4K = sourceDtype === 'q4k' || sourceDtype === 'q4_k_m' || sourceDtype === 'q4_k';
|
|
286
298
|
if (!elementCount) {
|
|
287
|
-
|
|
288
|
-
|
|
299
|
+
if (isQ4K) {
|
|
300
|
+
elementCount = Math.trunc(sourceBuffer.size / Q4K_BLOCK_BYTES) * QK_K;
|
|
301
|
+
} else {
|
|
302
|
+
const inferredBytes = sourceDtype === 'f16' || sourceDtype === 'bf16' ? 2 : 4;
|
|
303
|
+
elementCount = Math.trunc(sourceBuffer.size / inferredBytes);
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
if (isQ4K) {
|
|
308
|
+
const numBlocks = Math.ceil(elementCount / QK_K);
|
|
309
|
+
const q4kBytes = numBlocks * Q4K_BLOCK_BYTES;
|
|
310
|
+
const raw = await readBuffer(sourceBuffer, q4kBytes);
|
|
311
|
+
const decoded = dequantizeQ4KM(new Uint8Array(raw), numBlocks, [elementCount]);
|
|
312
|
+
if (expectedElements != null && decoded.length !== expectedElements) {
|
|
313
|
+
throw new Error(
|
|
314
|
+
`Weight "${label}" Q4K decoded length ${decoded.length}, expected ${expectedElements}.`
|
|
315
|
+
);
|
|
316
|
+
}
|
|
317
|
+
return decoded;
|
|
289
318
|
}
|
|
290
319
|
|
|
291
320
|
if (!sourceDtype) {
|
|
@@ -454,6 +483,7 @@ async function createLayerRuntimeState(
|
|
|
454
483
|
expectedNormElements,
|
|
455
484
|
`L${layerIdx}.linear_attn.norm.weight`
|
|
456
485
|
);
|
|
486
|
+
const runtimeNorm = applyLinearNormWeightOffset(norm, config.rmsNormWeightOffset === true);
|
|
457
487
|
|
|
458
488
|
const aNegExp = new Float32Array(aLog.length);
|
|
459
489
|
for (let i = 0; i < aLog.length; i++) {
|
|
@@ -490,7 +520,7 @@ async function createLayerRuntimeState(
|
|
|
490
520
|
convWeight,
|
|
491
521
|
dtBias,
|
|
492
522
|
aNegExp,
|
|
493
|
-
normWeight:
|
|
523
|
+
normWeight: runtimeNorm,
|
|
494
524
|
convState,
|
|
495
525
|
recurrentState,
|
|
496
526
|
convWeightGPU: null,
|
|
@@ -304,7 +304,7 @@ export async function computeLogitsGPU(
|
|
|
304
304
|
|
|
305
305
|
const logitsTensor = await runMatmul(normedTensor, lmHeadBuffer, numTokens, matmulVocabSize, hiddenSize, {
|
|
306
306
|
transposeB: 'auto',
|
|
307
|
-
role:
|
|
307
|
+
role: 'lm_head',
|
|
308
308
|
kernelPath: config.kernelPath ?? null,
|
|
309
309
|
});
|
|
310
310
|
|
|
@@ -391,7 +391,7 @@ export async function recordLogitsGPU(
|
|
|
391
391
|
// Record matmul (no submit)
|
|
392
392
|
const logitsTensor = await recordMatmul(recorder, normedTensor, lmHeadBuffer, numTokens, matmulVocabSize, hiddenSize, {
|
|
393
393
|
transposeB: 'auto',
|
|
394
|
-
role:
|
|
394
|
+
role: 'lm_head',
|
|
395
395
|
kernelPath: config.kernelPath ?? null,
|
|
396
396
|
});
|
|
397
397
|
|
|
@@ -25,6 +25,10 @@ export { computeLogitsGPU, recordLogitsGPU, computeChunkedLogitsGPU, resolveCpuW
|
|
|
25
25
|
// Re-export utilities
|
|
26
26
|
export { extractLastPositionLogits, finalizeLogits } from './utils.js';
|
|
27
27
|
|
|
28
|
+
export interface ComputeLogitsOptions {
|
|
29
|
+
lastPositionOnly?: boolean;
|
|
30
|
+
}
|
|
31
|
+
|
|
28
32
|
/**
|
|
29
33
|
* Compute logits from hidden states.
|
|
30
34
|
*
|
|
@@ -53,5 +57,6 @@ export function computeLogits(
|
|
|
53
57
|
debugFlags?: LogitsDebugFlags,
|
|
54
58
|
getNormWeightBuffer?: (weight: GPUBuffer | Float32Array | ArrayBuffer, label: string) => GPUBuffer,
|
|
55
59
|
debugCheckBuffer?: (buffer: GPUBuffer, label: string, numTokens: number, expectedDim?: number) => Promise<void>,
|
|
56
|
-
debugProbes?: ProbeConfigSchema[] | null
|
|
60
|
+
debugProbes?: ProbeConfigSchema[] | null,
|
|
61
|
+
options?: ComputeLogitsOptions
|
|
57
62
|
): Promise<Float32Array>;
|
|
@@ -253,6 +253,7 @@ export async function computeLogits(
|
|
|
253
253
|
|
|
254
254
|
const lastPositionOnly = options?.lastPositionOnly === true && numTokens > 1;
|
|
255
255
|
const matmulRows = lastPositionOnly ? 1 : numTokens;
|
|
256
|
+
const matmulPhaseOverride = lastPositionOnly ? 'prefill' : null;
|
|
256
257
|
let matmulInputTensor = normedTensor;
|
|
257
258
|
let matmulInputOwned = false;
|
|
258
259
|
if (lastPositionOnly) {
|
|
@@ -270,7 +271,8 @@ export async function computeLogits(
|
|
|
270
271
|
// HuggingFace models store lm_head as [vocabSize, hiddenSize], so transposeB=true
|
|
271
272
|
const logitsTensor = await runMatmul(matmulInputTensor, lmHeadBuffer, matmulRows, matmulVocabSize, hiddenSize, {
|
|
272
273
|
transposeB: 'auto',
|
|
273
|
-
role:
|
|
274
|
+
role: 'lm_head',
|
|
275
|
+
phaseOverride: matmulPhaseOverride,
|
|
274
276
|
kernelPath: config.kernelPath ?? null,
|
|
275
277
|
});
|
|
276
278
|
await runProbes('logits', logitsTensor.buffer, {
|
|
@@ -234,6 +234,9 @@ function buildManifestDecodeLoopRuntimePatch(manifest) {
|
|
|
234
234
|
|
|
235
235
|
export function applyModelBatchingRuntimeDefaults(runtimeConfig, manifest, modelConfig) {
|
|
236
236
|
void modelConfig;
|
|
237
|
+
if (manifest?.inference?.schema === 'doppler.execution/v0') {
|
|
238
|
+
return runtimeConfig;
|
|
239
|
+
}
|
|
237
240
|
const batching = runtimeConfig?.inference?.batching;
|
|
238
241
|
const generation = runtimeConfig?.inference?.generation;
|
|
239
242
|
const runtimeBatchingAtDefaults = isRuntimeBatchingAtGlobalDefaults(batching);
|
|
@@ -23,6 +23,7 @@ import {
|
|
|
23
23
|
validateMoeShape,
|
|
24
24
|
resolveMoeVendorProfile,
|
|
25
25
|
resolveGptOssKernelPathProfile,
|
|
26
|
+
resolveMixtralKernelPathProfile,
|
|
26
27
|
} from './moe-shape-validator.js';
|
|
27
28
|
|
|
28
29
|
export async function moeFeedForwardGPU(
|
|
@@ -52,7 +53,10 @@ export async function moeFeedForwardGPU(
|
|
|
52
53
|
if (topK == null) {
|
|
53
54
|
throw new Error('MoE topK is required in config.');
|
|
54
55
|
}
|
|
55
|
-
|
|
56
|
+
if (config.modelType == null) {
|
|
57
|
+
throw new Error('MoE config.modelType is required; got null/undefined.');
|
|
58
|
+
}
|
|
59
|
+
const modelType = config.modelType;
|
|
56
60
|
validateMoeShape(
|
|
57
61
|
{ hiddenSize, intermediateSize, moeTopK: topK, numExperts, expertFormat },
|
|
58
62
|
{ modelType }
|
|
@@ -130,7 +134,13 @@ export async function moeFeedForwardGPU(
|
|
|
130
134
|
trace.buffers(`MoE L${layerIdx} router_logits`, { min, max, nanCount, dtype: logitsDtype });
|
|
131
135
|
}
|
|
132
136
|
|
|
137
|
+
// Profile resolution: routerTopK/dequantExpert are resolved for tracing and
|
|
138
|
+
// forward validation. Actual kernel dispatch uses the generic softmax.rules.json
|
|
139
|
+
// topkVariant rules (keyed by modelType) and format-specific dequant paths.
|
|
140
|
+
// GPT-OSS: dequantTileShape actively steers MXFP4 dequant; routerTopK is trace-only.
|
|
141
|
+
// Mixtral: expert weights are pre-loaded (no runtime dequant); both fields are trace-only.
|
|
133
142
|
let gptOssKernelPathProfile = null;
|
|
143
|
+
let mixtralKernelPathProfile = null;
|
|
134
144
|
if (modelType === 'gpt-oss') {
|
|
135
145
|
gptOssKernelPathProfile = await resolveGptOssKernelPathProfile({
|
|
136
146
|
hasF16: caps.hasF16,
|
|
@@ -141,6 +151,14 @@ export async function moeFeedForwardGPU(
|
|
|
141
151
|
groupSize: 32,
|
|
142
152
|
tileShape: vendorProfile.dequantTileShape,
|
|
143
153
|
});
|
|
154
|
+
} else if (modelType === 'mixtral') {
|
|
155
|
+
mixtralKernelPathProfile = await resolveMixtralKernelPathProfile({
|
|
156
|
+
hasF16: caps.hasF16,
|
|
157
|
+
hasSubgroups: caps.hasSubgroups,
|
|
158
|
+
routerDtype: logitsDtype,
|
|
159
|
+
weightsDtype: activationDtype,
|
|
160
|
+
outputDtype: activationDtype,
|
|
161
|
+
});
|
|
144
162
|
}
|
|
145
163
|
|
|
146
164
|
stepStart = perfMark();
|
|
@@ -159,7 +177,7 @@ export async function moeFeedForwardGPU(
|
|
|
159
177
|
perfLog(`MoE L${layerIdx} topk`, stepStart, {
|
|
160
178
|
topK,
|
|
161
179
|
modelType,
|
|
162
|
-
routerTopKKernel: gptOssKernelPathProfile?.routerTopK ?? null,
|
|
180
|
+
routerTopKKernel: gptOssKernelPathProfile?.routerTopK ?? mixtralKernelPathProfile?.routerTopK ?? null,
|
|
163
181
|
});
|
|
164
182
|
|
|
165
183
|
if (isTraceEnabled('buffers')) {
|
|
@@ -211,7 +229,7 @@ export async function moeFeedForwardGPU(
|
|
|
211
229
|
const bytesPerElement = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: activationDtype });
|
|
212
230
|
const bytesPerToken = hiddenSize * bytesPerElement;
|
|
213
231
|
let maxTokensPerExpert = resolveMaxTokensPerExpert(numTokens, numExperts, topK, hiddenSize, activationDtype);
|
|
214
|
-
if (
|
|
232
|
+
if (vendorProfile.maxTokensPerExpertScale !== 1.0) {
|
|
215
233
|
maxTokensPerExpert = Math.max(
|
|
216
234
|
1,
|
|
217
235
|
Math.round(maxTokensPerExpert * vendorProfile.maxTokensPerExpertScale)
|
|
@@ -29,3 +29,12 @@ export interface GptOssKernelPathProfile {
|
|
|
29
29
|
/**
 * Type declaration for the GPT-OSS kernel-path profile resolver.
 *
 * @param context Loosely typed key/value bag (`Record<string, unknown>`);
 *   the keys it is expected to carry are not visible in this declaration.
 * @returns A promise that resolves to a `GptOssKernelPathProfile`.
 */
export declare function resolveGptOssKernelPathProfile(
|
|
30
30
|
context: Record<string, unknown>
|
|
31
31
|
): Promise<GptOssKernelPathProfile>;
|
|
32
|
+
|
|
33
|
+
/**
 * Shape of the object produced by `resolveMixtralKernelPathProfile`.
 * Both members are strings; in the implementation shown in this diff they
 * are filled from the `moeMixtral` rule set via `selectRuleValue`.
 */
export interface MixtralKernelPathProfile {
|
|
34
|
+
// Value selected for the 'routerTopKVariant' rule.
routerTopK: string;
|
|
35
|
+
// Value selected for the 'dequantVariant' rule.
dequantExpert: string;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/**
 * Type declaration for the Mixtral kernel-path profile resolver.
 *
 * @param context Loosely typed key/value bag (`Record<string, unknown>`).
 *   The implementation in this diff reads `hasF16`, `hasSubgroups`,
 *   `routerDtype`, `weightsDtype` and `outputDtype` from it.
 * @returns A promise that resolves to a `MixtralKernelPathProfile`.
 */
export declare function resolveMixtralKernelPathProfile(
|
|
39
|
+
context: Record<string, unknown>
|
|
40
|
+
): Promise<MixtralKernelPathProfile>;
|
|
@@ -7,17 +7,15 @@ function asVendorString(caps) {
|
|
|
7
7
|
}
|
|
8
8
|
|
|
9
9
|
/**
 * Resolves the vendor quirk profile for a MoE model type.
 *
 * Added (`+`) lines: the vendor string is derived from the kernel
 * capabilities, then the 'vendorQuirkProfile' rule is selected from the
 * 'moeGptoss' rule set for 'gpt-oss' or from 'moeMixtral' for 'mixtral'.
 * Any other model type throws.
 *
 * Removed (`-`) lines: non-'gpt-oss' model types used to short-circuit with
 * a hard-coded default profile instead of throwing.
 *
 * @param {string} modelType Model family identifier; only 'gpt-oss' and
 *   'mixtral' are accepted by the added code.
 * @returns {*} Whatever `selectRuleValue` yields for the matching rule set.
 *   NOTE(review): presumably an object with the same fields as the removed
 *   default (preferVec4Dequant, dequantTileShape, routerWorkgroupSize,
 *   maxTokensPerExpertScale) — confirm against the rule files.
 * @throws {Error} When `modelType` is neither 'gpt-oss' nor 'mixtral'.
 */
export function resolveMoeVendorProfile(modelType) {
|
|
10
|
-
// Removed: early return of a fixed default profile for every non-'gpt-oss' model.
if (modelType !== 'gpt-oss') {
|
|
11
|
-
return {
|
|
12
|
-
preferVec4Dequant: false,
|
|
13
|
-
dequantTileShape: 'scalar',
|
|
14
|
-
routerWorkgroupSize: 128,
|
|
15
|
-
maxTokensPerExpertScale: 1.0,
|
|
16
|
-
};
|
|
17
|
-
}
|
|
18
10
|
// Capabilities are now read before the model type is inspected.
const caps = getKernelCapabilities();
|
|
19
11
|
const vendor = asVendorString(caps);
|
|
20
|
-
|
|
12
|
+
if (modelType === 'gpt-oss') {
|
|
13
|
+
return selectRuleValue('kernels', 'moeGptoss', 'vendorQuirkProfile', { vendor });
|
|
14
|
+
}
|
|
15
|
+
if (modelType === 'mixtral') {
|
|
16
|
+
return selectRuleValue('kernels', 'moeMixtral', 'vendorQuirkProfile', { vendor });
|
|
17
|
+
}
|
|
18
|
+
// Unknown model types are rejected explicitly; there is no default profile any more.
throw new Error(`[MoE] Unknown modelType "${modelType}" for vendor profile resolution.`);
|
|
21
19
|
}
|
|
22
20
|
|
|
23
21
|
function resolveGptOssRuleContext(context) {
|
|
@@ -41,6 +39,25 @@ export async function resolveGptOssKernelPathProfile(context) {
|
|
|
41
39
|
};
|
|
42
40
|
}
|
|
43
41
|
|
|
42
|
+
/**
 * Builds the rule-lookup context used for Mixtral kernel-path selection.
 *
 * Every field is read through optional chaining, so a null or undefined
 * `context` does not throw; the missing fields simply come back undefined
 * (apart from the defaults noted below).
 *
 * @param {object} [context] Caller-supplied capability and dtype hints.
 * @returns {object} Object with `modelType` fixed to 'mixtral' plus
 *   `hasF16`, `hasSubgroups`, `routerDtype`, `weightsDtype`, `outputDtype`.
 */
function resolveMixtralRuleContext(context) {
|
|
43
|
+
return {
|
|
44
|
+
modelType: 'mixtral',
|
|
45
|
+
hasF16: context?.hasF16,
|
|
46
|
+
hasSubgroups: context?.hasSubgroups,
|
|
47
|
+
// Falls back to 'f32' only when routerDtype is null or undefined.
routerDtype: context?.routerDtype ?? 'f32',
|
|
48
|
+
weightsDtype: context?.weightsDtype,
|
|
49
|
+
// Output dtype defaults to the weights dtype when not given explicitly.
outputDtype: context?.outputDtype ?? context?.weightsDtype,
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/**
 * Resolves the Mixtral kernel-path profile from the 'moeMixtral' rule set.
 *
 * The function is declared `async` but contains no `await`, so callers get
 * a promise that settles with the result of the synchronous lookups.
 *
 * @param {object} [context] Capability and dtype hints, normalised by
 *   `resolveMixtralRuleContext` before the rule lookups.
 * @returns {Promise<{routerTopK: *, dequantExpert: *}>} Values selected for
 *   the 'routerTopKVariant' and 'dequantVariant' rules.
 *   NOTE(review): the .d.ts in this diff types both as string — confirm the
 *   rule files only yield strings.
 */
export async function resolveMixtralKernelPathProfile(context) {
|
|
54
|
+
const ruleContext = resolveMixtralRuleContext(context);
|
|
55
|
+
return {
|
|
56
|
+
routerTopK: selectRuleValue('kernels', 'moeMixtral', 'routerTopKVariant', ruleContext),
|
|
57
|
+
dequantExpert: selectRuleValue('kernels', 'moeMixtral', 'dequantVariant', ruleContext),
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
|
|
44
61
|
export function validateMoeShape(config, options = {}) {
|
|
45
62
|
const {
|
|
46
63
|
hiddenSize,
|
|
@@ -66,8 +83,11 @@ export function validateMoeShape(config, options = {}) {
|
|
|
66
83
|
|
|
67
84
|
if (modelType === 'gpt-oss') {
|
|
68
85
|
const policy = selectRuleValue('kernels', 'moeGptoss', 'shapePolicy', { modelType });
|
|
69
|
-
|
|
70
|
-
|
|
86
|
+
if (policy.hiddenSizeDivisor == null || policy.intermediateSizeDivisor == null) {
|
|
87
|
+
throw new Error('[MoE] GPT-OSS shapePolicy is missing hiddenSizeDivisor or intermediateSizeDivisor.');
|
|
88
|
+
}
|
|
89
|
+
const hiddenDivisor = policy.hiddenSizeDivisor;
|
|
90
|
+
const intermediateDivisor = policy.intermediateSizeDivisor;
|
|
71
91
|
if (hiddenSize % hiddenDivisor !== 0 || intermediateSize % intermediateDivisor !== 0) {
|
|
72
92
|
throw new Error(
|
|
73
93
|
`[MoE] GPT-OSS shape policy violation: hiddenSize (${hiddenSize}) % ${hiddenDivisor} = ${hiddenSize % hiddenDivisor}, ` +
|