@simulatte/doppler 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -1
- package/README.md +25 -6
- package/package.json +5 -3
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +16 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/loader.js +6 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +7 -0
- package/src/config/presets/models/gemma3.json +2 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +1 -1
- package/src/converter/conversion-plan.js +1 -1
- package/src/converter/core.js +17 -8
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +15 -0
- package/src/distribution/shard-delivery.js +34 -0
- package/src/formats/rdrr/classification.js +32 -0
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +1 -0
- package/src/gpu/kernels/matmul.d.ts +3 -0
- package/src/gpu/kernels/matmul.js +70 -1
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
- package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
- package/src/inference/pipelines/text/attention/projections.js +13 -2
- package/src/inference/pipelines/text/attention/record.js +1 -0
- package/src/inference/pipelines/text/attention/run.js +9 -0
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +32 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +14 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
- package/src/inference/pipelines/text/generator-steps.js +46 -29
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +320 -166
- package/src/inference/pipelines/text/init.d.ts +2 -0
- package/src/inference/pipelines/text/init.js +19 -5
- package/src/inference/pipelines/text/layer.js +37 -8
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +9 -7
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +124 -3
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +2 -0
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/tooling/node-converter.js +25 -7
- package/src/tooling/node-source-runtime.js +29 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/src/tooling/node-convert.d.ts +0 -54
|
@@ -28,6 +28,7 @@ import type {
|
|
|
28
28
|
SpeculativeConfigSchema,
|
|
29
29
|
KernelPathSchema,
|
|
30
30
|
} from '../../../config/schema/index.js';
|
|
31
|
+
import type { LoaderDebugConfigSchema } from '../../../config/schema/debug.schema.js';
|
|
31
32
|
import type { KernelPathSource } from '../../../config/kernel-path-loader.js';
|
|
32
33
|
|
|
33
34
|
export interface PipelineStorageContext {
|
|
@@ -206,6 +207,7 @@ export interface LoadWeightsOptions {
|
|
|
206
207
|
resolvedKernelPath?: KernelPathSchema | null;
|
|
207
208
|
kernelPathSource?: KernelPathSource;
|
|
208
209
|
keepF32Weights?: boolean;
|
|
210
|
+
loaderDebug?: LoaderDebugConfigSchema | null;
|
|
209
211
|
}
|
|
210
212
|
|
|
211
213
|
/**
|
|
@@ -309,13 +309,21 @@ export async function initRoPEFrequencies(config, useGPU) {
|
|
|
309
309
|
if (!Number.isFinite(ropeScale) || ropeScale <= 0) {
|
|
310
310
|
throw new Error(`RoPE scale must be a positive number; got "${ropeScale}".`);
|
|
311
311
|
}
|
|
312
|
-
const resolvedLocalScale = ropeLocalScale
|
|
313
|
-
if (!Number.isFinite(resolvedLocalScale) || resolvedLocalScale <= 0) {
|
|
312
|
+
const resolvedLocalScale = ropeLocalScale;
|
|
313
|
+
if (resolvedLocalScale != null && (!Number.isFinite(resolvedLocalScale) || resolvedLocalScale <= 0)) {
|
|
314
314
|
throw new Error(`Local RoPE scale must be a positive number; got "${resolvedLocalScale}".`);
|
|
315
315
|
}
|
|
316
316
|
const resolvedLocalTheta = ropeLocalTheta ?? ropeTheta;
|
|
317
|
-
const resolvedLocalScalingType =
|
|
318
|
-
|
|
317
|
+
const resolvedLocalScalingType = (
|
|
318
|
+
ropeLocalScalingType === undefined
|
|
319
|
+
? ropeScalingType
|
|
320
|
+
: ropeLocalScalingType
|
|
321
|
+
);
|
|
322
|
+
const resolvedLocalScaling = (
|
|
323
|
+
ropeLocalScalingType === undefined
|
|
324
|
+
? ropeScaling
|
|
325
|
+
: ropeLocalScaling
|
|
326
|
+
);
|
|
319
327
|
const resolvedRotaryDim = resolveRotaryDim(headDim, rotaryDim, partialRotaryFactor);
|
|
320
328
|
const halfDim = resolvedRotaryDim / 2;
|
|
321
329
|
if (mropeInterleaved === true && Array.isArray(mropeSection)) {
|
|
@@ -646,7 +654,12 @@ export async function initTokenizer(manifest, options = {}) {
|
|
|
646
654
|
|
|
647
655
|
|
|
648
656
|
export async function loadWeights(manifest, modelConfig, options = {}) {
|
|
649
|
-
const {
|
|
657
|
+
const {
|
|
658
|
+
onProgress,
|
|
659
|
+
loadingConfig,
|
|
660
|
+
baseUrl,
|
|
661
|
+
loaderDebug,
|
|
662
|
+
} = options;
|
|
650
663
|
const runtimeStorageContext = options.storageContext
|
|
651
664
|
?? createRemoteStorageContext(baseUrl, manifest);
|
|
652
665
|
const verifyHashes = (
|
|
@@ -668,6 +681,7 @@ export async function loadWeights(manifest, modelConfig, options = {}) {
|
|
|
668
681
|
keepF32Weights
|
|
669
682
|
)
|
|
670
683
|
);
|
|
684
|
+
dopplerLoader.setLoaderDebugConfig(loaderDebug ?? null);
|
|
671
685
|
|
|
672
686
|
const tensorsFile = isRDRRManifest(manifest) ? manifest.tensorsFile : null;
|
|
673
687
|
if (baseUrl && tensorsFile) {
|
|
@@ -43,19 +43,16 @@ export function detectSandwichNorm(config) {
|
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
|
|
46
|
-
export function isMoELayer(layerIdx, config
|
|
46
|
+
/**
 * Decide whether the layer at `layerIdx` is a mixture-of-experts layer.
 *
 * Resolution order:
 *   1. `config.useMoE` falsy            -> never MoE.
 *   2. `config.layerTypes` has an entry -> MoE only when that entry is exactly 'moe'.
 *      (Per the upstream comment, layerTypes is derived from
 *      manifest.inference.layerPattern.)
 *   3. Otherwise                        -> assume MoE.
 *
 * Only two parameters are read. Any extra argument a caller still passes
 * (e.g. per-layer weights) is ignored; the decision comes from `config` alone.
 *
 * @param {number} layerIdx - Zero-based layer index.
 * @param {{ useMoE?: boolean, layerTypes?: string[] }} config - Model config.
 * @returns {boolean} True when the layer should be run as MoE.
 */
export function isMoELayer(layerIdx, config) {
  if (!config.useMoE) return false;

  const layerTypes = config.layerTypes;
  if (Array.isArray(layerTypes) && layerIdx < layerTypes.length) {
    return layerTypes[layerIdx] === 'moe';
  }

  // layerTypes is missing, or has no entry for this index: assume the layer is MoE.
  return true;
}
|
|
61
58
|
|
|
@@ -87,6 +84,11 @@ function assertSupportedLayerRuntime(layerIdx, config) {
|
|
|
87
84
|
}
|
|
88
85
|
}
|
|
89
86
|
|
|
87
|
+
/**
 * Look up the per-layer conv state object for `layerIdx`.
 *
 * @param {{ get(key: number): object | undefined } | null | undefined} convLayerStates
 *   Map-like container keyed by layer index; may be absent.
 * @param {number} layerIdx - Layer index used as the lookup key.
 * @returns {object} The stored state, or a fresh empty object when the
 *   container is falsy or holds no (or a nullish) entry for this layer.
 */
function getConvLayerState(convLayerStates, layerIdx) {
  if (!convLayerStates) return {};
  return convLayerStates.get(layerIdx) ?? {};
}
|
|
91
|
+
|
|
90
92
|
function isSlidingLayerType(layerType) {
|
|
91
93
|
const normalized = normalizeLayerType(layerType);
|
|
92
94
|
return normalized === 'sliding_attention'
|
|
@@ -103,6 +105,14 @@ function isConvLayerType(layerType) {
|
|
|
103
105
|
|| normalized === 'liv_convolution';
|
|
104
106
|
}
|
|
105
107
|
|
|
108
|
+
/**
 * Report whether any entry of `layerTypes` is a convolution layer type,
 * as classified by `isConvLayerType`.
 *
 * @param {unknown} layerTypes - Expected to be an array of layer-type labels.
 * @returns {boolean} False for non-arrays and empty arrays; otherwise true as
 *   soon as one entry is classified as a conv layer.
 */
export function hasConvLayers(layerTypes) {
  if (!Array.isArray(layerTypes)) return false;
  for (const layerType of layerTypes) {
    if (isConvLayerType(layerType)) return true;
  }
  return false;
}
|
|
115
|
+
|
|
106
116
|
function isLinearLayerType(layerType) {
|
|
107
117
|
const normalized = normalizeLayerType(layerType);
|
|
108
118
|
return normalized === 'linear_attention'
|
|
@@ -201,8 +211,22 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
|
|
|
201
211
|
);
|
|
202
212
|
}
|
|
203
213
|
const convKernel = layerWeights?.convKernel ?? null;
|
|
214
|
+
// Apply input norm (operator_norm) before conv mixer — matches HF Lfm2 forward pass
|
|
215
|
+
let normedTensor = inputTensor;
|
|
216
|
+
const inputNormWeight = layerWeights?.inputNorm ?? null;
|
|
217
|
+
if (inputNormWeight) {
|
|
218
|
+
const normWeightBuf = getNormWeightBuffer(inputNormWeight, `L${layerIdx}.conv_input_norm`);
|
|
219
|
+
normedTensor = await doRMSNorm(inputTensor, normWeightBuf, rmsNormEps, {
|
|
220
|
+
batchSize: numTokens,
|
|
221
|
+
hiddenSize,
|
|
222
|
+
rmsNormWeightOffset: config.rmsNormWeightOffset,
|
|
223
|
+
label: `L${layerIdx}.conv_input_norm`,
|
|
224
|
+
layerIdx,
|
|
225
|
+
}, recorder);
|
|
226
|
+
if (!(inputNormWeight instanceof GPUBuffer)) releaseOrTrack(recorder, normWeightBuf);
|
|
227
|
+
}
|
|
204
228
|
attnOutput = await doConv(
|
|
205
|
-
|
|
229
|
+
normedTensor,
|
|
206
230
|
getWeightBuffer(convInProj, `L${layerIdx}.conv_in_proj`),
|
|
207
231
|
convKernel ? getWeightBuffer(convKernel, `L${layerIdx}.conv_kernel`) : null,
|
|
208
232
|
getWeightBuffer(convOutProj, `L${layerIdx}.conv_out_proj`),
|
|
@@ -213,9 +237,13 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
|
|
|
213
237
|
label: `L${layerIdx}.conv`,
|
|
214
238
|
swigluLimit: config.swigluLimit,
|
|
215
239
|
kernelPath: context.kernelPath ?? null,
|
|
240
|
+
convState: getConvLayerState(context.convLayerStates, layerIdx),
|
|
216
241
|
},
|
|
217
242
|
recorder
|
|
218
243
|
);
|
|
244
|
+
if (normedTensor !== inputTensor) {
|
|
245
|
+
releaseOrTrack(recorder, normedTensor.buffer);
|
|
246
|
+
}
|
|
219
247
|
} else if (isLinearLayer) {
|
|
220
248
|
attnOutput = await runLinearAttentionLayer(inputTensor, layerWeights ?? null, {
|
|
221
249
|
layerIdx,
|
|
@@ -721,6 +749,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
|
|
|
721
749
|
label: `L${layerIdx}.plan_conv`,
|
|
722
750
|
swigluLimit: config.swigluLimit,
|
|
723
751
|
kernelPath: context.kernelPath ?? null,
|
|
752
|
+
convState: getConvLayerState(context.convLayerStates, layerIdx),
|
|
724
753
|
},
|
|
725
754
|
recorder
|
|
726
755
|
);
|
|
@@ -782,7 +811,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
|
|
|
782
811
|
let outputTensor;
|
|
783
812
|
const { runMoEFFNGPU, runDenseFFNGPU } = await import('./ffn/index.js');
|
|
784
813
|
|
|
785
|
-
const canAutoMoe = config.useMoE && isMoELayer(layerIdx, config
|
|
814
|
+
const canAutoMoe = config.useMoE && isMoELayer(layerIdx, config);
|
|
786
815
|
const useMoe = selectRuleValue(
|
|
787
816
|
'inference',
|
|
788
817
|
'layer',
|
|
@@ -23,6 +23,7 @@ import {
|
|
|
23
23
|
validateMoeShape,
|
|
24
24
|
resolveMoeVendorProfile,
|
|
25
25
|
resolveGptOssKernelPathProfile,
|
|
26
|
+
resolveMixtralKernelPathProfile,
|
|
26
27
|
} from './moe-shape-validator.js';
|
|
27
28
|
|
|
28
29
|
export async function moeFeedForwardGPU(
|
|
@@ -52,7 +53,10 @@ export async function moeFeedForwardGPU(
|
|
|
52
53
|
if (topK == null) {
|
|
53
54
|
throw new Error('MoE topK is required in config.');
|
|
54
55
|
}
|
|
55
|
-
|
|
56
|
+
if (config.modelType == null) {
|
|
57
|
+
throw new Error('MoE config.modelType is required; got null/undefined.');
|
|
58
|
+
}
|
|
59
|
+
const modelType = config.modelType;
|
|
56
60
|
validateMoeShape(
|
|
57
61
|
{ hiddenSize, intermediateSize, moeTopK: topK, numExperts, expertFormat },
|
|
58
62
|
{ modelType }
|
|
@@ -130,7 +134,13 @@ export async function moeFeedForwardGPU(
|
|
|
130
134
|
trace.buffers(`MoE L${layerIdx} router_logits`, { min, max, nanCount, dtype: logitsDtype });
|
|
131
135
|
}
|
|
132
136
|
|
|
137
|
+
// Profile resolution: routerTopK/dequantExpert are resolved for tracing and
|
|
138
|
+
// forward validation. Actual kernel dispatch uses the generic softmax.rules.json
|
|
139
|
+
// topkVariant rules (keyed by modelType) and format-specific dequant paths.
|
|
140
|
+
// GPT-OSS: dequantTileShape actively steers MXFP4 dequant; routerTopK is trace-only.
|
|
141
|
+
// Mixtral: expert weights are pre-loaded (no runtime dequant); both fields are trace-only.
|
|
133
142
|
let gptOssKernelPathProfile = null;
|
|
143
|
+
let mixtralKernelPathProfile = null;
|
|
134
144
|
if (modelType === 'gpt-oss') {
|
|
135
145
|
gptOssKernelPathProfile = await resolveGptOssKernelPathProfile({
|
|
136
146
|
hasF16: caps.hasF16,
|
|
@@ -141,6 +151,14 @@ export async function moeFeedForwardGPU(
|
|
|
141
151
|
groupSize: 32,
|
|
142
152
|
tileShape: vendorProfile.dequantTileShape,
|
|
143
153
|
});
|
|
154
|
+
} else if (modelType === 'mixtral') {
|
|
155
|
+
mixtralKernelPathProfile = await resolveMixtralKernelPathProfile({
|
|
156
|
+
hasF16: caps.hasF16,
|
|
157
|
+
hasSubgroups: caps.hasSubgroups,
|
|
158
|
+
routerDtype: logitsDtype,
|
|
159
|
+
weightsDtype: activationDtype,
|
|
160
|
+
outputDtype: activationDtype,
|
|
161
|
+
});
|
|
144
162
|
}
|
|
145
163
|
|
|
146
164
|
stepStart = perfMark();
|
|
@@ -159,7 +177,7 @@ export async function moeFeedForwardGPU(
|
|
|
159
177
|
perfLog(`MoE L${layerIdx} topk`, stepStart, {
|
|
160
178
|
topK,
|
|
161
179
|
modelType,
|
|
162
|
-
routerTopKKernel: gptOssKernelPathProfile?.routerTopK ?? null,
|
|
180
|
+
routerTopKKernel: gptOssKernelPathProfile?.routerTopK ?? mixtralKernelPathProfile?.routerTopK ?? null,
|
|
163
181
|
});
|
|
164
182
|
|
|
165
183
|
if (isTraceEnabled('buffers')) {
|
|
@@ -211,7 +229,7 @@ export async function moeFeedForwardGPU(
|
|
|
211
229
|
const bytesPerElement = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: activationDtype });
|
|
212
230
|
const bytesPerToken = hiddenSize * bytesPerElement;
|
|
213
231
|
let maxTokensPerExpert = resolveMaxTokensPerExpert(numTokens, numExperts, topK, hiddenSize, activationDtype);
|
|
214
|
-
if (
|
|
232
|
+
if (vendorProfile.maxTokensPerExpertScale !== 1.0) {
|
|
215
233
|
maxTokensPerExpert = Math.max(
|
|
216
234
|
1,
|
|
217
235
|
Math.round(maxTokensPerExpert * vendorProfile.maxTokensPerExpertScale)
|
|
@@ -29,3 +29,12 @@ export interface GptOssKernelPathProfile {
|
|
|
29
29
|
export declare function resolveGptOssKernelPathProfile(
|
|
30
30
|
context: Record<string, unknown>
|
|
31
31
|
): Promise<GptOssKernelPathProfile>;
|
|
32
|
+
|
|
33
|
+
/** Kernel-variant selections resolved for a Mixtral MoE layer. */
export interface MixtralKernelPathProfile {
  /** Value selected by the `moeMixtral` `routerTopKVariant` rule. */
  routerTopK: string;
  /** Value selected by the `moeMixtral` `dequantVariant` rule. */
  dequantExpert: string;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Resolve the Mixtral kernel-path profile from a capability/dtype context.
 *
 * @param context - Keys read by the implementation: `hasF16`, `hasSubgroups`,
 *   `routerDtype`, `weightsDtype`, `outputDtype`.
 * @returns A promise for the selected router top-k and expert dequant variants.
 */
export declare function resolveMixtralKernelPathProfile(
  context: Record<string, unknown>
): Promise<MixtralKernelPathProfile>;
|
|
@@ -7,17 +7,15 @@ function asVendorString(caps) {
|
|
|
7
7
|
}
|
|
8
8
|
|
|
9
9
|
/**
 * Resolve the vendor quirk profile for a MoE model family.
 *
 * The model type is validated before kernel capabilities are queried, so an
 * unsupported type fails fast with the dedicated error instead of depending
 * on the capability probe succeeding first.
 *
 * @param {string} modelType - 'gpt-oss' or 'mixtral'.
 * @returns {*} The `vendorQuirkProfile` rule value for the detected vendor.
 * @throws {Error} When `modelType` is not a supported MoE family.
 */
export function resolveMoeVendorProfile(modelType) {
  let ruleGroup;
  switch (modelType) {
    case 'gpt-oss':
      ruleGroup = 'moeGptoss';
      break;
    case 'mixtral':
      ruleGroup = 'moeMixtral';
      break;
    default:
      throw new Error(`[MoE] Unknown modelType "${modelType}" for vendor profile resolution.`);
  }

  const caps = getKernelCapabilities();
  const vendor = asVendorString(caps);
  return selectRuleValue('kernels', ruleGroup, 'vendorQuirkProfile', { vendor });
}
|
|
22
20
|
|
|
23
21
|
function resolveGptOssRuleContext(context) {
|
|
@@ -41,6 +39,25 @@ export async function resolveGptOssKernelPathProfile(context) {
|
|
|
41
39
|
};
|
|
42
40
|
}
|
|
43
41
|
|
|
42
|
+
/**
 * Build the rule-selection context for Mixtral kernel-path rules.
 *
 * @param {object | null | undefined} context - Caller-supplied capabilities
 *   and dtypes; a nullish context yields only the defaults below.
 * @returns {{
 *   modelType: 'mixtral',
 *   hasF16: *,
 *   hasSubgroups: *,
 *   routerDtype: *,
 *   weightsDtype: *,
 *   outputDtype: *
 * }} Context with `modelType` pinned to 'mixtral', `routerDtype` defaulting
 *   to 'f32', and `outputDtype` defaulting to `weightsDtype`.
 */
function resolveMixtralRuleContext(context) {
  const weightsDtype = context?.weightsDtype;
  return {
    modelType: 'mixtral',
    hasF16: context?.hasF16,
    hasSubgroups: context?.hasSubgroups,
    routerDtype: context?.routerDtype ?? 'f32',
    weightsDtype,
    outputDtype: context?.outputDtype ?? weightsDtype,
  };
}
|
|
52
|
+
|
|
53
|
+
/**
 * Resolve which router top-k and expert dequant variants the Mixtral rules
 * select for the given capability/dtype context.
 *
 * Declared `async` so callers can `await` it the same way as the GPT-OSS
 * counterpart; the body itself performs no asynchronous work.
 *
 * @param {object | null | undefined} context - See resolveMixtralRuleContext.
 * @returns {Promise<{ routerTopK: *, dequantExpert: * }>} Selected variants.
 */
export async function resolveMixtralKernelPathProfile(context) {
  const ruleContext = resolveMixtralRuleContext(context);
  const routerTopK = selectRuleValue('kernels', 'moeMixtral', 'routerTopKVariant', ruleContext);
  const dequantExpert = selectRuleValue('kernels', 'moeMixtral', 'dequantVariant', ruleContext);
  return { routerTopK, dequantExpert };
}
|
|
60
|
+
|
|
44
61
|
export function validateMoeShape(config, options = {}) {
|
|
45
62
|
const {
|
|
46
63
|
hiddenSize,
|
|
@@ -66,8 +83,11 @@ export function validateMoeShape(config, options = {}) {
|
|
|
66
83
|
|
|
67
84
|
if (modelType === 'gpt-oss') {
|
|
68
85
|
const policy = selectRuleValue('kernels', 'moeGptoss', 'shapePolicy', { modelType });
|
|
69
|
-
|
|
70
|
-
|
|
86
|
+
if (policy.hiddenSizeDivisor == null || policy.intermediateSizeDivisor == null) {
|
|
87
|
+
throw new Error('[MoE] GPT-OSS shapePolicy is missing hiddenSizeDivisor or intermediateSizeDivisor.');
|
|
88
|
+
}
|
|
89
|
+
const hiddenDivisor = policy.hiddenSizeDivisor;
|
|
90
|
+
const intermediateDivisor = policy.intermediateSizeDivisor;
|
|
71
91
|
if (hiddenSize % hiddenDivisor !== 0 || intermediateSize % intermediateDivisor !== 0) {
|
|
72
92
|
throw new Error(
|
|
73
93
|
`[MoE] GPT-OSS shape policy violation: hiddenSize (${hiddenSize}) % ${hiddenDivisor} = ${hiddenSize % hiddenDivisor}, ` +
|
|
@@ -14,13 +14,14 @@ import {
|
|
|
14
14
|
recordCastF32ToF16,
|
|
15
15
|
} from '../../../gpu/kernels/cast.js';
|
|
16
16
|
import { createTensor } from '../../../gpu/tensor.js';
|
|
17
|
-
import { releaseBuffer } from '../../../memory/buffer-pool.js';
|
|
17
|
+
import { releaseBuffer, readBuffer, acquireBuffer, uploadData } from '../../../memory/buffer-pool.js';
|
|
18
18
|
import { kernelTrace, traceStep } from './kernel-trace.js';
|
|
19
19
|
import {
|
|
20
20
|
runLayerAttentionGPU,
|
|
21
21
|
recordLayerAttentionGPU,
|
|
22
22
|
} from './attention/index.js';
|
|
23
23
|
import { runLinearAttentionLayer } from './linear-attention.js';
|
|
24
|
+
import { runGatedShortConvGPU } from '../../../gpu/kernels/gated-short-conv.js';
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
export function isDecodeBuffer(decodeBuffers, buffer) {
|
|
@@ -174,17 +175,22 @@ export async function doConv(
|
|
|
174
175
|
throw new Error('doConv requires hiddenSize > 0.');
|
|
175
176
|
}
|
|
176
177
|
|
|
177
|
-
//
|
|
178
|
+
// LFM2 gated short convolution (GPU-native):
|
|
179
|
+
// in_proj → 3×hidden → GPU kernel: split(B,C,x) + B*x + causal conv1d + C*conv_out → out_proj
|
|
178
180
|
let inProj = null;
|
|
179
|
-
let
|
|
180
|
-
let convInput = null;
|
|
181
|
+
let convOut = null;
|
|
181
182
|
let outProj = null;
|
|
182
183
|
try {
|
|
184
|
+
const convState = options.convState;
|
|
185
|
+
const hasConvState = Boolean(convState?.convWeightGPU && convState?.convStateGPU);
|
|
186
|
+
const projN = hasConvState ? hiddenSize * 3 : hiddenSize * 2;
|
|
187
|
+
|
|
188
|
+
// Project input
|
|
183
189
|
inProj = await doMatmul(
|
|
184
190
|
inputTensor,
|
|
185
191
|
convInProj,
|
|
186
192
|
numTokens,
|
|
187
|
-
|
|
193
|
+
projN,
|
|
188
194
|
hiddenSize,
|
|
189
195
|
{
|
|
190
196
|
transposeB: 'auto',
|
|
@@ -195,50 +201,32 @@ export async function doConv(
|
|
|
195
201
|
},
|
|
196
202
|
recorder
|
|
197
203
|
);
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
204
|
+
|
|
205
|
+
if (hasConvState) {
|
|
206
|
+
// GPU gated short conv kernel: B*x → conv1d → C*conv_out (all on GPU)
|
|
207
|
+
convOut = await runGatedShortConvGPU(inProj, convState, {
|
|
208
|
+
numTokens,
|
|
209
|
+
layerIdx,
|
|
210
|
+
recorder,
|
|
211
|
+
});
|
|
212
|
+
} else {
|
|
213
|
+
// SwiGLU gated activation fallback: silu(first_half) * second_half
|
|
214
|
+
convOut = await doSiLURowSplit(inProj, {
|
|
215
|
+
numTokens,
|
|
216
|
+
dim: hiddenSize,
|
|
217
|
+
activation: 'silu',
|
|
218
|
+
swigluLimit: options.swigluLimit ?? null,
|
|
219
|
+
label: `${label}.activation`,
|
|
220
|
+
layerIdx,
|
|
221
|
+
}, recorder);
|
|
222
|
+
}
|
|
206
223
|
|
|
207
224
|
releaseOrTrack(recorder, inProj.buffer);
|
|
208
225
|
inProj = null;
|
|
209
226
|
|
|
210
|
-
|
|
211
|
-
if (convKernel && options.conv2d && options.conv2d.enabled === true) {
|
|
212
|
-
const convTensorInput = createTensor(activated.buffer, activated.dtype, [
|
|
213
|
-
options.conv2d.inChannels,
|
|
214
|
-
options.conv2d.height,
|
|
215
|
-
options.conv2d.width,
|
|
216
|
-
], `${label}.conv_input`);
|
|
217
|
-
const convOptions = {
|
|
218
|
-
inChannels: options.conv2d.inChannels,
|
|
219
|
-
outChannels: options.conv2d.outChannels,
|
|
220
|
-
height: options.conv2d.height,
|
|
221
|
-
width: options.conv2d.width,
|
|
222
|
-
kernelH: options.conv2d.kernelH,
|
|
223
|
-
kernelW: options.conv2d.kernelW,
|
|
224
|
-
stride: options.conv2d.stride ?? 1,
|
|
225
|
-
pad: options.conv2d.pad ?? 0,
|
|
226
|
-
};
|
|
227
|
-
const convResult = recorder
|
|
228
|
-
? await recordConv2D(recorder, convTensorInput, convKernel, null, convOptions)
|
|
229
|
-
: await runConv2D(convTensorInput, convKernel, null, convOptions);
|
|
230
|
-
convInput = createTensor(
|
|
231
|
-
convResult.buffer,
|
|
232
|
-
convResult.dtype,
|
|
233
|
-
[numTokens, hiddenSize],
|
|
234
|
-
`${label}.conv_output`
|
|
235
|
-
);
|
|
236
|
-
releaseOrTrack(recorder, activated.buffer);
|
|
237
|
-
activated = null;
|
|
238
|
-
}
|
|
239
|
-
|
|
227
|
+
// Output projection
|
|
240
228
|
outProj = await doMatmul(
|
|
241
|
-
|
|
229
|
+
convOut,
|
|
242
230
|
convOutProj,
|
|
243
231
|
numTokens,
|
|
244
232
|
hiddenSize,
|
|
@@ -253,13 +241,8 @@ export async function doConv(
|
|
|
253
241
|
recorder
|
|
254
242
|
);
|
|
255
243
|
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
convInput = null;
|
|
259
|
-
} else if (activated) {
|
|
260
|
-
releaseOrTrack(recorder, activated.buffer);
|
|
261
|
-
activated = null;
|
|
262
|
-
}
|
|
244
|
+
releaseOrTrack(recorder, convOut.buffer);
|
|
245
|
+
convOut = null;
|
|
263
246
|
|
|
264
247
|
if (kernelTrace.enabled && !recorder) {
|
|
265
248
|
await traceStep('conv', label, layerIdx, outProj.buffer, [numTokens, hiddenSize]);
|
|
@@ -267,13 +250,100 @@ export async function doConv(
|
|
|
267
250
|
return outProj;
|
|
268
251
|
} catch (error) {
|
|
269
252
|
if (outProj) releaseOrTrack(recorder, outProj.buffer);
|
|
270
|
-
if (
|
|
271
|
-
if (activated) releaseOrTrack(recorder, activated.buffer);
|
|
253
|
+
if (convOut) releaseOrTrack(recorder, convOut.buffer);
|
|
272
254
|
if (inProj) releaseOrTrack(recorder, inProj.buffer);
|
|
273
255
|
throw error;
|
|
274
256
|
}
|
|
275
257
|
}
|
|
276
258
|
|
|
259
|
+
/** True when `dtype` is one of the lowercase Q4_K labels handled here. */
function isQ4KDtypeLabel(dtype) {
  return dtype === 'q4k' || dtype === 'q4_k_m' || dtype === 'q4_k';
}

/**
 * Decode `count` little-endian bfloat16 values into a Float32Array.
 * A bf16 value is the upper 16 bits of the corresponding float32, so each
 * value is widened by shifting its bits into the high half of a 32-bit word.
 *
 * @param {ArrayBuffer | ArrayBufferView} raw - Readback bytes.
 * @param {number} count - Number of bf16 elements to decode.
 * @returns {Float32Array}
 */
function decodeBf16Readback(raw, count) {
  const view = ArrayBuffer.isView(raw)
    ? new DataView(raw.buffer, raw.byteOffset, raw.byteLength)
    : new DataView(raw);
  const out = new Float32Array(count);
  const outBits = new Uint32Array(out.buffer);
  for (let i = 0; i < count; i++) {
    outBits[i] = (view.getUint16(i * 2, true) << 16) >>> 0;
  }
  return out;
}

/**
 * Prepare the GPU-side state for one gated short-conv layer and store it on
 * `convState`:
 *   - `convWeightGPU`: the conv kernel converted to F32 and uploaded,
 *   - `convStateGPU`:  a zero-filled F32 buffer of hiddenSize * (kernelSize - 1),
 *   - `hiddenSize`, `kernelSize`,
 *   - `inProjF32GPU`:  (only for a Q4K in_proj WeightBuffer carrying rawBytes)
 *                      the in_proj weight dequantized to F32 and uploaded.
 *
 * `convState` is mutated only after every buffer was created successfully;
 * on failure all buffers acquired by this call are released and the error is
 * rethrown.
 *
 * @param {object} convState - Per-layer state object to populate.
 * @param {object} convKernel - Conv kernel weight: a WeightBuffer, a GPUBuffer,
 *   or an object exposing `.buffer`.
 * @param {object} convInProj - in_proj weight (WeightBuffer or other).
 * @param {number} hiddenSize - Channel count of the conv layer.
 * @param {string} label - Prefix for buffer labels and log messages.
 * @param {number} layerIdx - Layer index (currently unused by this function).
 * @returns {Promise<void>}
 * @throws {TypeError} When `convKernel` is null or undefined.
 */
export async function initConvLayerState(convState, convKernel, convInProj, hiddenSize, label, layerIdx) {
  if (convKernel == null) {
    throw new TypeError(`${label} initConvLayerState requires a conv kernel weight; got ${convKernel}.`);
  }

  const [
    { isWeightBuffer },
    { QK_K, Q4K_BLOCK_BYTES },
    { dequantizeQ4KM },
    { getDevice },
  ] = await Promise.all([
    import('../../../gpu/weight-buffer.js'),
    import('../../../config/schema/index.js'),
    import('../../../converter/quantizer.js'),
    import('../../../gpu/device.js'),
  ]);

  const isWB = typeof isWeightBuffer === 'function' && isWeightBuffer(convKernel);
  let kernelBuf;
  if (isWB) {
    kernelBuf = convKernel.buffer;
  } else if (convKernel instanceof GPUBuffer) {
    kernelBuf = convKernel;
  } else {
    kernelBuf = convKernel.buffer ?? convKernel;
  }
  // Dtype is only known for WeightBuffers; anything else is read back as F32.
  const kernelDtype = isWB ? String(convKernel.dtype ?? '').toLowerCase() : null;

  // Kernel size is the last dimension of the weight shape; defaults to 3 when
  // the shape is unavailable or its last entry is not a usable number.
  let kernelSize = 3;
  if (isWB && Array.isArray(convKernel.shape)) {
    kernelSize = Number(convKernel.shape[convKernel.shape.length - 1]) || 3;
  }

  const totalElements = hiddenSize * kernelSize;
  const device = getDevice();

  // Wait for submitted GPU work before reading the kernel buffer back.
  const readKernelBytes = async (byteLength) => {
    if (device) await device.queue.onSubmittedWorkDone();
    return readBuffer(kernelBuf, byteLength);
  };

  // Convert the conv kernel weights to F32 on the CPU.
  let weightF32;
  if (isQ4KDtypeLabel(kernelDtype)) {
    const numBlocks = Math.ceil(totalElements / QK_K);
    // GPU readBuffer returns zeros for some Q4K weight buffers, so prefer the
    // CPU-side rawBytes from the WeightBuffer when available.
    const q4kSource = convKernel.rawBytes
      ? convKernel.rawBytes
      : await readKernelBytes(numBlocks * Q4K_BLOCK_BYTES);
    weightF32 = dequantizeQ4KM(new Uint8Array(q4kSource), numBlocks, [totalElements]);
  } else if (kernelDtype === 'f16') {
    const raw = await readKernelBytes(totalElements * 2);
    const { decodeReadback } = await import('./debug-utils/index.js');
    weightF32 = decodeReadback(raw, 'f16');
  } else if (kernelDtype === 'bf16') {
    // bf16 and IEEE f16 have different bit layouts, so bf16 must not be
    // decoded as f16.
    // NOTE(review): assumes a buffer labelled 'bf16' really holds bf16 bits — confirm against the loader.
    const raw = await readKernelBytes(totalElements * 2);
    weightF32 = decodeBf16Readback(raw, totalElements);
  } else {
    const raw = await readKernelBytes(totalElements * 4);
    weightF32 = new Float32Array(raw);
  }

  // Warn (but do not fail) when the converted weights are degenerate.
  let maxAbs = 0;
  for (let i = 0; i < weightF32.length; i++) {
    const abs = Math.abs(weightF32[i]);
    if (abs > maxAbs) maxAbs = abs;
  }
  if (maxAbs === 0) {
    const { log } = await import('../../../debug/index.js');
    log.error('Pipeline', `${label} conv kernel weights are all zeros after dequantization (dtype=${kernelDtype}, elements=${totalElements}). Conv layers will produce degenerate output.`);
  }

  // Track every buffer acquired here so a later failure cannot leak them.
  const acquired = [];
  const uploadF32 = (data, bufferLabel) => {
    const gpuBuffer = acquireBuffer(data.byteLength, undefined, bufferLabel);
    acquired.push(gpuBuffer);
    uploadData(gpuBuffer, data);
    return gpuBuffer;
  };

  try {
    const weightGPU = uploadF32(weightF32, `${label}.conv_weight_f32`);
    const stateGPU = uploadF32(
      new Float32Array(hiddenSize * (kernelSize - 1)),
      `${label}.conv_state`
    );

    // Pre-dequantize a Q4K in_proj weight to F32 from the WeightBuffer's raw
    // bytes (GPU readback of some Q4K buffers returns zeros).
    // NOTE(review): this step also requires the conv kernel itself to be a
    // WeightBuffer (isWB); kept as in the original — confirm that is intended.
    let inProjGPU = null;
    if (isWB && isWeightBuffer(convInProj)) {
      const inProjDtype = String(convInProj.dtype ?? '').toLowerCase();
      if (isQ4KDtypeLabel(inProjDtype) && convInProj.rawBytes) {
        const inProjElements = hiddenSize * 3 * hiddenSize;
        const inProjBlocks = Math.ceil(inProjElements / QK_K);
        const inProjF32 = dequantizeQ4KM(new Uint8Array(convInProj.rawBytes), inProjBlocks, [inProjElements]);
        inProjGPU = uploadF32(inProjF32, `${label}.in_proj_f32`);
      }
    }

    convState.convWeightGPU = weightGPU;
    convState.convStateGPU = stateGPU;
    convState.hiddenSize = hiddenSize;
    convState.kernelSize = kernelSize;
    if (inProjGPU) {
      convState.inProjF32GPU = inProjGPU;
    }
  } catch (error) {
    for (const gpuBuffer of acquired) {
      releaseBuffer(gpuBuffer);
    }
    throw error;
  }
}
|
|
346
|
+
|
|
277
347
|
export async function doCast(input, toDtype, recorder) {
|
|
278
348
|
if (toDtype !== 'f16' && toDtype !== 'f32') {
|
|
279
349
|
throw new Error(`Unsupported cast target dtype "${toDtype}"`);
|
|
@@ -69,6 +69,11 @@ export declare class InferencePipeline extends PipelineState {
|
|
|
69
69
|
// ==========================================================================
|
|
70
70
|
|
|
71
71
|
/** Returns an async generator that yields strings for the given prompt. */
generate(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<string, void, void>;
|
|
72
|
+
/** Returns an async generator that yields numbers (presumably token ids; confirm against the generator). */
generateTokens(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<number, void, void>;
|
|
73
|
+
/** Resolves with an object holding a `tokenIds` number array and the pipeline `stats`. */
generateTokenIds(prompt: PromptInput, options?: GenerateOptions): Promise<{ tokenIds: number[]; stats: PipelineStats }>;
|
|
72
77
|
|
|
73
78
|
/** Takes the current id sequence and resolves with a `LogitsStepResult`. */
decodeStepLogits(currentIds: number[], options?: GenerateOptions): Promise<LogitsStepResult>;
|
|
74
79
|
|
|
@@ -43,6 +43,7 @@ import {
|
|
|
43
43
|
import { getDopplerLoader } from '../../loader/doppler-loader.js';
|
|
44
44
|
import { registerPipeline, getPipelineFactory } from './registry.js';
|
|
45
45
|
import { selectRuleValue } from '../../rules/rule-registry.js';
|
|
46
|
+
import { initConvLayerState } from './text/ops.js';
|
|
46
47
|
|
|
47
48
|
function destroyMoERouter(router) {
|
|
48
49
|
if (router && typeof router.destroy === 'function') {
|
|
@@ -221,6 +222,9 @@ export class InferencePipeline extends PipelineState {
|
|
|
221
222
|
// Initialize RoPE frequencies
|
|
222
223
|
await this._initRoPE();
|
|
223
224
|
|
|
225
|
+
// Initialize conv layer states for gated short conv layers (LFM2)
|
|
226
|
+
await this._initConvLayerStates();
|
|
227
|
+
|
|
224
228
|
this.isLoaded = true;
|
|
225
229
|
log.info('Pipeline', 'Model loaded successfully');
|
|
226
230
|
}
|
|
@@ -237,6 +241,7 @@ export class InferencePipeline extends PipelineState {
|
|
|
237
241
|
resolvedKernelPath: this.resolvedKernelPath,
|
|
238
242
|
kernelPathSource: this.kernelPathSource,
|
|
239
243
|
keepF32Weights: this.runtimeConfig.inference.compute.keepF32Weights === true,
|
|
244
|
+
loaderDebug: this.runtimeConfig?.shared?.debug?.loader ?? null,
|
|
240
245
|
onProgress: (info) => {
|
|
241
246
|
if (info.stage !== 'layers' && info.stage !== 'shards') {
|
|
242
247
|
log.verbose('Loader', `${info.stage}: ${Math.round(info.progress * 100)}%${info.message ? ` - ${info.message}` : ''}`);
|
|
@@ -310,7 +315,7 @@ export class InferencePipeline extends PipelineState {
|
|
|
310
315
|
maxSeqLen,
|
|
311
316
|
ropeTheta: config.ropeTheta,
|
|
312
317
|
ropeLocalTheta: config.ropeLocalTheta,
|
|
313
|
-
mropeInterleaved: config.
|
|
318
|
+
mropeInterleaved: config.mropeInterleaved,
|
|
314
319
|
mropeSection: config.mropeSection,
|
|
315
320
|
partialRotaryFactor: config.partialRotaryFactor,
|
|
316
321
|
ropeScale: config.ropeScale,
|
|
@@ -327,6 +332,51 @@ export class InferencePipeline extends PipelineState {
|
|
|
327
332
|
}
|
|
328
333
|
|
|
329
334
|
|
|
335
|
+
async _initConvLayerStates() {
|
|
336
|
+
const config = this.modelConfig;
|
|
337
|
+
if (!config?.layerTypes) return;
|
|
338
|
+
const { getDevice } = await import('../../gpu/device.js');
|
|
339
|
+
const device = getDevice();
|
|
340
|
+
if (!device) return;
|
|
341
|
+
|
|
342
|
+
const hiddenSize = config.hiddenSize;
|
|
343
|
+
const convStates = new Map();
|
|
344
|
+
|
|
345
|
+
for (let i = 0; i < config.layerTypes.length; i++) {
|
|
346
|
+
const lt = String(config.layerTypes[i] ?? '').toLowerCase();
|
|
347
|
+
if (lt !== 'conv' && lt !== 'convolution') continue;
|
|
348
|
+
|
|
349
|
+
const layerWeights = this.weights.get(`layer_${i}`);
|
|
350
|
+
if (!layerWeights) continue;
|
|
351
|
+
const convKernel = layerWeights?.convKernel;
|
|
352
|
+
if (!convKernel) continue;
|
|
353
|
+
|
|
354
|
+
const convState = {};
|
|
355
|
+
try {
|
|
356
|
+
await initConvLayerState(
|
|
357
|
+
convState,
|
|
358
|
+
convKernel,
|
|
359
|
+
layerWeights.convInProj ?? null,
|
|
360
|
+
hiddenSize,
|
|
361
|
+
`L${i}.conv`,
|
|
362
|
+
i
|
|
363
|
+
);
|
|
364
|
+
if (!convState.convWeightGPU || !convState.convStateGPU) {
|
|
365
|
+
continue;
|
|
366
|
+
}
|
|
367
|
+
convStates.set(i, convState);
|
|
368
|
+
} catch (e) {
|
|
369
|
+
log.warn('Pipeline', `Conv layer ${i} state init failed: ${e.message}`);
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
if (convStates.size > 0) {
|
|
374
|
+
this.convLayerStates = convStates;
|
|
375
|
+
log.info('Pipeline', `Initialized ${convStates.size} conv layer states (kernelSize=${convStates.values().next().value?.kernelSize})`);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
|
|
330
380
|
_resolveLayerPipeline() {
|
|
331
381
|
if (!this.modelConfig) return;
|
|
332
382
|
const runtimePlan = this.runtimeConfig.inference.pipeline ?? null;
|
|
@@ -349,6 +399,14 @@ export class InferencePipeline extends PipelineState {
|
|
|
349
399
|
return this.generator.generate(prompt, options);
|
|
350
400
|
}
|
|
351
401
|
|
|
402
|
+
generateTokens(prompt, options = {}) {
|
|
403
|
+
return this.generator.generateTokens(prompt, options);
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
generateTokenIds(prompt, options = {}) {
|
|
407
|
+
return this.generator.generateTokenIds(prompt, options);
|
|
408
|
+
}
|
|
409
|
+
|
|
352
410
|
decodeStepLogits(currentIds, options = {}) {
|
|
353
411
|
return this.generator.decodeStepLogits(currentIds, options);
|
|
354
412
|
}
|