@simulatte/doppler 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +25 -6
- package/package.json +25 -38
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +2 -2
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +9 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +21 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +4 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +2 -2
- package/src/converter/conversion-plan.js +11 -3
- package/src/converter/core.js +19 -8
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +34 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +40 -1
- package/src/formats/rdrr/classification.js +32 -0
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +48 -4
- package/src/gpu/kernels/matmul.d.ts +5 -0
- package/src/gpu/kernels/matmul.js +71 -2
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
- package/src/inference/pipelines/text/attention/projections.js +54 -13
- package/src/inference/pipelines/text/attention/record.js +16 -6
- package/src/inference/pipelines/text/attention/run.js +59 -6
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +46 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +19 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
- package/src/inference/pipelines/text/generator-steps.js +71 -26
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +353 -166
- package/src/inference/pipelines/text/init.d.ts +15 -0
- package/src/inference/pipelines/text/init.js +35 -10
- package/src/inference/pipelines/text/layer.js +38 -8
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +11 -9
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +130 -4
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +4 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +28 -7
- package/src/tooling/node-source-runtime.js +65 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/types/model.d.ts +5 -0
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/tools/doppler-cli.js +6 -1
- package/src/tooling/node-convert.d.ts +0 -54
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
import { getDevice } from '../../../gpu/device.js';
|
|
4
|
+
import { acquireBuffer, releaseBuffer } from '../../../memory/buffer-pool.js';
|
|
5
|
+
import { runLayerNorm } from '../../../gpu/kernels/layernorm.js';
|
|
6
|
+
import { dispatchMatmul } from '../../../gpu/kernels/matmul-dispatch.js';
|
|
7
|
+
import { runGelu } from '../../../gpu/kernels/gelu.js';
|
|
8
|
+
import { runResidualAdd } from '../../../gpu/kernels/residual.js';
|
|
9
|
+
|
|
10
|
+
/**
 * Layer norm on GPU.
 *
 * Acquires an output buffer from the buffer pool (sized for
 * seqLen * hiddenSize 4-byte elements), runs the layer-norm kernel into it
 * and returns it. The buffer is not released here on success.
 *
 * @param {GPUBuffer} input [seqLen, hiddenSize]
 * @param {GPUBuffer} weight [hiddenSize]
 * @param {GPUBuffer|null} bias [hiddenSize] or null
 * @param {{ seqLen: number, hiddenSize: number, eps: number }} opts
 * @returns {Promise<GPUBuffer>}
 * @throws Rethrows any kernel error after handing the output buffer back to the pool.
 */
export async function doLayerNorm(input, weight, bias, opts) {
  const { seqLen, hiddenSize, eps } = opts;
  const outputSize = seqLen * hiddenSize * 4;
  const output = acquireBuffer(outputSize, 'vision-layernorm');
  try {
    await runLayerNorm({
      input,
      weight,
      bias: bias || null,
      output,
      seqLen,
      hiddenSize,
      eps,
    });
  } catch (err) {
    // Nobody else holds a reference to `output` yet; without this the pooled
    // buffer would be leaked whenever the kernel fails.
    releaseBuffer(output);
    throw err;
  }
  return output;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Matrix multiply on GPU.
 *
 * Acquires an output buffer from the buffer pool (sized for M * N 4-byte
 * elements), dispatches the matmul into it and returns it. The buffer is not
 * released here on success.
 *
 * @param {GPUBuffer} a [M, K]
 * @param {GPUBuffer} b [K, N]
 * @param {{ M: number, K: number, N: number, bias?: GPUBuffer }} opts
 * @returns {Promise<GPUBuffer>}
 * @throws Rethrows any dispatch error after handing the output buffer back to the pool.
 */
export async function doMatmul(a, b, opts) {
  const { M, K, N, bias } = opts;
  const outputSize = M * N * 4;
  const output = acquireBuffer(outputSize, 'vision-matmul');
  try {
    await dispatchMatmul({
      a, b, output,
      M, K, N,
      bias: bias || null,
    });
  } catch (err) {
    // The output was never handed to a caller, so return it to the pool
    // instead of leaking it when the dispatch fails.
    releaseBuffer(output);
    throw err;
  }
  return output;
}
|
|
52
|
+
|
|
53
|
+
/**
 * GELU activation on GPU.
 *
 * Acquires an output buffer from the buffer pool (sized for `count` 4-byte
 * elements), runs the GELU kernel into it and returns it. The buffer is not
 * released here on success.
 *
 * @param {GPUBuffer} input Flat buffer
 * @param {{ count: number }} opts Total element count
 * @returns {Promise<GPUBuffer>}
 * @throws Rethrows any kernel error after handing the output buffer back to the pool.
 */
export async function doGelu(input, opts) {
  const { count } = opts;
  const output = acquireBuffer(count * 4, 'vision-gelu');
  try {
    await runGelu({ input, output, count });
  } catch (err) {
    // Avoid leaking the pooled buffer when the kernel fails.
    releaseBuffer(output);
    throw err;
  }
  return output;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Element-wise residual add on GPU.
 *
 * Acquires an output buffer from the buffer pool (sized for `count` 4-byte
 * elements), runs the residual-add kernel into it and returns it. The buffer
 * is not released here on success.
 *
 * @param {GPUBuffer} a
 * @param {GPUBuffer} b
 * @param {{ count: number }} opts
 * @returns {Promise<GPUBuffer>}
 * @throws Rethrows any kernel error after handing the output buffer back to the pool.
 */
export async function doResidualAdd(a, b, opts) {
  const { count } = opts;
  const output = acquireBuffer(count * 4, 'vision-residual');
  try {
    await runResidualAdd({ a, b, output, count });
  } catch (err) {
    // Avoid leaking the pooled buffer when the kernel fails.
    releaseBuffer(output);
    throw err;
  }
  return output;
}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
import { trace } from '../../../debug/index.js';
|
|
4
|
+
import { getDevice } from '../../../gpu/device.js';
|
|
5
|
+
import { acquireBuffer } from '../../../memory/buffer-pool.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Copy `count` 32-bit floats from a GPU buffer into a new Float32Array.
 *
 * Uses a temporary MAP_READ staging buffer, which is destroyed even when the
 * copy or the map fails.
 *
 * @param {GPUDevice} device
 * @param {GPUBuffer} source Buffer to read from (offset 0)
 * @param {number} count Number of 4-byte elements to read
 * @returns {Promise<Float32Array>}
 */
async function readFloat32FromGpu(device, source, count) {
  const byteLength = count * 4;
  const staging = device.createBuffer({
    size: byteLength,
    usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
  });
  try {
    const encoder = device.createCommandEncoder();
    encoder.copyBufferToBuffer(source, 0, staging, 0, byteLength);
    device.queue.submit([encoder.finish()]);
    await staging.mapAsync(GPUMapMode.READ);
    const data = new Float32Array(count);
    data.set(new Float32Array(staging.getMappedRange()));
    staging.unmap();
    return data;
  } finally {
    staging.destroy();
  }
}

/**
 * Patch embedding for the vision encoder.
 *
 * Qwen3-VL uses a 3D convolution for temporal+spatial patch extraction:
 *   Conv3D(in_channels=3, out_channels=hiddenSize,
 *          kernel=[temporalPatchSize, patchSize, patchSize], stride = kernel)
 *
 * For a single image the temporal kernel taps are summed into one spatial
 * kernel, so the op reduces to a 2D convolution with stride=patchSize.
 * The output is [numPatches, hiddenSize] where
 * numPatches = floor(H/patchSize) * floor(W/patchSize).
 *
 * For the initial implementation, this runs on CPU and uploads to GPU.
 * TODO(perf): GPU kernel for patch embedding (conv2d with large stride).
 *
 * NOTE(review): the weight and bias buffers are read back as 32-bit floats;
 * confirm the loader never stores them as F16.
 *
 * @param {object} params
 * @param {Float32Array} params.imageData  Preprocessed image [C, H, W] normalized
 * @param {number} params.height  Image height (patch-aligned)
 * @param {number} params.width  Image width (patch-aligned)
 * @param {number} params.channels  Number of channels (3)
 * @param {object} params.visionConfig  Vision config (patchSize, hiddenSize, temporalPatchSize)
 * @param {object} params.weights  Vision encoder weight buffers keyed by tensor name
 * @returns {Promise<{ patchBuffer: GPUBuffer, numPatches: number }>}
 * @throws {Error} If the projection weight is missing from `params.weights`.
 */
export async function patchEmbed(params) {
  const {
    imageData, height, width, channels,
    visionConfig, weights,
  } = params;

  const {
    patchSize = 16,
    hiddenSize = 1024,
    temporalPatchSize = 2,
  } = visionConfig;

  const gridH = Math.floor(height / patchSize);
  const gridW = Math.floor(width / patchSize);
  const numPatches = gridH * gridW;

  trace('vision', `patchEmbed: ${height}x${width} -> ${gridH}x${gridW} = ${numPatches} patches (${hiddenSize}d)`);

  const weightKey = 'visual.patch_embed.proj.weight';
  const biasKey = 'visual.patch_embed.proj.bias';

  const weightBuffer = weights[weightKey];
  if (!weightBuffer) {
    // Fail with a readable message instead of an opaque WebGPU validation error.
    throw new Error(`patchEmbed: missing weight tensor "${weightKey}"`);
  }
  const biasBuffer = weights[biasKey] || null;

  const device = getDevice();
  const pixelsPerPatch = patchSize * patchSize;
  const patchArea = channels * pixelsPerPatch;

  // Extract patches from image: each patch is [C, patchSize, patchSize] flattened.
  const patches = new Float32Array(numPatches * patchArea);
  for (let ph = 0; ph < gridH; ph++) {
    for (let pw = 0; pw < gridW; pw++) {
      const patchIdx = ph * gridW + pw;
      for (let c = 0; c < channels; c++) {
        for (let py = 0; py < patchSize; py++) {
          for (let px = 0; px < patchSize; px++) {
            const imgY = ph * patchSize + py;
            const imgX = pw * patchSize + px;
            const srcIdx = c * height * width + imgY * width + imgX;
            const dstIdx = patchIdx * patchArea + c * pixelsPerPatch + py * patchSize + px;
            patches[dstIdx] = imageData[srcIdx];
          }
        }
      }
    }
  }

  // Read the full projection weight from GPU.
  // Shape: [hiddenSize, C, temporalPatchSize, patchSize, patchSize].
  const fullWeightSize = hiddenSize * channels * temporalPatchSize * pixelsPerPatch;
  const weightData = await readFloat32FromGpu(device, weightBuffer, fullWeightSize);

  // Single frame: sum (not average) over the temporal kernel dimension to get
  // [hiddenSize, C*pp*pp]. The temporal axis sits BETWEEN the channel axis and
  // the spatial axes, so each (h, c) pair owns `temporalPatchSize` consecutive
  // spatial planes.
  // NOTE(review): summing matches feeding the same frame to every temporal
  // tap; confirm against the reference image processor.
  const spatialWeight = new Float32Array(hiddenSize * patchArea);
  for (let h = 0; h < hiddenSize; h++) {
    for (let c = 0; c < channels; c++) {
      const srcBase = (h * channels + c) * temporalPatchSize * pixelsPerPatch;
      const dstBase = (h * channels + c) * pixelsPerPatch;
      for (let r = 0; r < pixelsPerPatch; r++) {
        let sum = 0;
        for (let t = 0; t < temporalPatchSize; t++) {
          sum += weightData[srcBase + t * pixelsPerPatch + r];
        }
        spatialWeight[dstBase + r] = sum;
      }
    }
  }

  // Read bias if present.
  const biasData = biasBuffer
    ? await readFloat32FromGpu(device, biasBuffer, hiddenSize)
    : null;

  // Compute patch embeddings: patches [numPatches, patchArea] @ spatialWeight^T [patchArea, hiddenSize]
  const embeddings = new Float32Array(numPatches * hiddenSize);
  for (let p = 0; p < numPatches; p++) {
    for (let h = 0; h < hiddenSize; h++) {
      let val = biasData ? biasData[h] : 0;
      for (let k = 0; k < patchArea; k++) {
        val += patches[p * patchArea + k] * spatialWeight[h * patchArea + k];
      }
      embeddings[p * hiddenSize + h] = val;
    }
  }

  // Upload to GPU.
  const patchBuffer = acquireBuffer(numPatches * hiddenSize * 4, 'vision-patch-embed');
  device.queue.writeBuffer(patchBuffer, 0, embeddings);

  return { patchBuffer, numPatches };
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
|
|
2
2
|
|
|
3
3
|
import { initDevice, getDevice, getKernelCapabilities } from '../gpu/device.js';
|
|
4
|
-
import { parseManifest } from '../formats/rdrr/index.js';
|
|
4
|
+
import { parseManifest, getExpectedShardHash } from '../formats/rdrr/index.js';
|
|
5
5
|
import { createPipeline } from './pipelines/text.js';
|
|
6
6
|
import { log as debugLog } from '../debug/index.js';
|
|
7
7
|
import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
|
|
@@ -66,11 +66,12 @@ export function parseRuntimeOverridesFromURL(searchParams) {
|
|
|
66
66
|
if (runtimeConfigRaw) {
|
|
67
67
|
try {
|
|
68
68
|
const parsed = JSON.parse(runtimeConfigRaw);
|
|
69
|
-
if (parsed
|
|
70
|
-
|
|
69
|
+
if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
|
|
70
|
+
throw new Error('runtimeConfig must be a JSON object');
|
|
71
71
|
}
|
|
72
|
+
runtime.runtimeConfig = parsed;
|
|
72
73
|
} catch (e) {
|
|
73
|
-
|
|
74
|
+
throw new Error(`Failed to parse runtimeConfig URL parameter: ${e?.message}`);
|
|
74
75
|
}
|
|
75
76
|
}
|
|
76
77
|
|
|
@@ -79,12 +80,13 @@ export function parseRuntimeOverridesFromURL(searchParams) {
|
|
|
79
80
|
if (configChainRaw) {
|
|
80
81
|
try {
|
|
81
82
|
const parsed = JSON.parse(configChainRaw);
|
|
82
|
-
if (Array.isArray(parsed)) {
|
|
83
|
-
|
|
84
|
-
debugLog.info('TestHarness', `Config chain: ${parsed.join(' -> ')}`);
|
|
83
|
+
if (!Array.isArray(parsed) || parsed.some((entry) => typeof entry !== 'string' || !entry.trim())) {
|
|
84
|
+
throw new Error('configChain must be an array of non-empty strings');
|
|
85
85
|
}
|
|
86
|
+
runtime.configChain = parsed;
|
|
87
|
+
debugLog.info('TestHarness', `Config chain: ${parsed.join(' -> ')}`);
|
|
86
88
|
} catch (e) {
|
|
87
|
-
|
|
89
|
+
throw new Error(`Failed to parse configChain URL parameter: ${e?.message}`);
|
|
88
90
|
}
|
|
89
91
|
}
|
|
90
92
|
|
|
@@ -168,7 +170,7 @@ export function createHttpShardLoader(baseUrl, manifest, log) {
|
|
|
168
170
|
distributionConfig,
|
|
169
171
|
algorithm,
|
|
170
172
|
requiredEncoding,
|
|
171
|
-
expectedHash: shard
|
|
173
|
+
expectedHash: getExpectedShardHash(shard, algorithm) || null,
|
|
172
174
|
expectedSize: Number.isFinite(shard.size) ? Math.floor(shard.size) : null,
|
|
173
175
|
expectedManifestVersionSet: manifestVersionSet,
|
|
174
176
|
writeToStore: false,
|
|
@@ -31,6 +31,7 @@ import type {
|
|
|
31
31
|
} from './loader-types.js';
|
|
32
32
|
import type { ShardCache } from './shard-cache.js';
|
|
33
33
|
import type { LoadingConfigSchema } from '../config/schema/loading.schema.js';
|
|
34
|
+
import type { LoaderDebugConfigSchema } from '../config/schema/debug.schema.js';
|
|
34
35
|
|
|
35
36
|
// Re-export types for backward compatibility
|
|
36
37
|
export type {
|
|
@@ -96,6 +97,8 @@ export declare class DopplerLoader {
|
|
|
96
97
|
|
|
97
98
|
setQ4KConfig(config: Q4KConfig): void;
|
|
98
99
|
|
|
100
|
+
setLoaderDebugConfig(loaderDebug: LoaderDebugConfigSchema | null): void;
|
|
101
|
+
|
|
99
102
|
setCustomShardLoader(loadShardFn: CustomShardLoader, options?: CustomShardLoaderOptions): void;
|
|
100
103
|
|
|
101
104
|
setTensorsJsonUrl(url: string | null): void;
|
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
} from '../storage/shard-manager.js';
|
|
12
12
|
import { clearManifest, parseManifest, setManifest as setCurrentManifest } from '../formats/rdrr/index.js';
|
|
13
13
|
import { initDevice, getDevice, getKernelCapabilities } from '../gpu/device.js';
|
|
14
|
-
import { acquireBuffer, releaseBuffer, forceBufferPoolReclaim } from '../memory/buffer-pool.js';
|
|
14
|
+
import { acquireBuffer, isBufferActive, releaseBuffer, forceBufferPoolReclaim } from '../memory/buffer-pool.js';
|
|
15
15
|
import { getExpertCache } from './experts/expert-cache.js';
|
|
16
16
|
import { formatBytes } from '../storage/quota.js';
|
|
17
17
|
import { log, trace as debugTrace } from '../debug/index.js';
|
|
@@ -118,6 +118,7 @@ export class DopplerLoader {
|
|
|
118
118
|
// Loading configuration
|
|
119
119
|
|
|
120
120
|
#loadingConfig;
|
|
121
|
+
#loaderDebug = null;
|
|
121
122
|
|
|
122
123
|
// Fused Q4_K matmul: skip dequantization for matmul weights, use fused kernel
|
|
123
124
|
|
|
@@ -167,6 +168,10 @@ export class DopplerLoader {
|
|
|
167
168
|
}
|
|
168
169
|
}
|
|
169
170
|
|
|
171
|
+
setLoaderDebugConfig(loaderDebug) {
|
|
172
|
+
this.#loaderDebug = loaderDebug ?? null;
|
|
173
|
+
}
|
|
174
|
+
|
|
170
175
|
|
|
171
176
|
setQ4KConfig(config) {
|
|
172
177
|
this.useFusedQ4K = config.useFusedQ4K;
|
|
@@ -701,6 +706,7 @@ export class DopplerLoader {
|
|
|
701
706
|
useFusedQ4K: this.useFusedQ4K,
|
|
702
707
|
keepF32Weights: this.keepF32Weights,
|
|
703
708
|
q4kLayout: this.q4kLayout,
|
|
709
|
+
loaderDebug: this.#loaderDebug,
|
|
704
710
|
gpuCapabilities: this.gpuCapabilities,
|
|
705
711
|
allowF32UpcastNonMatmul,
|
|
706
712
|
};
|
|
@@ -924,7 +930,14 @@ export class DopplerLoader {
|
|
|
924
930
|
return this.layers.get(layerIdx) || null;
|
|
925
931
|
}
|
|
926
932
|
|
|
927
|
-
|
|
933
|
+
/**
|
|
934
|
+
* Load a tensor by name. Public interface for extension loaders (e.g., vision).
|
|
935
|
+
*/
|
|
936
|
+
async loadTensor(name, toGPU = true, silent = false) {
|
|
937
|
+
return this.#loadTensor(name, toGPU, silent);
|
|
938
|
+
}
|
|
939
|
+
|
|
940
|
+
|
|
928
941
|
getConfig() {
|
|
929
942
|
return (this.manifest?.config) || {};
|
|
930
943
|
}
|
|
@@ -968,7 +981,11 @@ export class DopplerLoader {
|
|
|
968
981
|
: (isGpuBufferInstance(value) ? value : null);
|
|
969
982
|
if (!gpuBuffer) return;
|
|
970
983
|
try {
|
|
971
|
-
|
|
984
|
+
if (isBufferActive(gpuBuffer)) {
|
|
985
|
+
releaseBuffer(gpuBuffer);
|
|
986
|
+
} else {
|
|
987
|
+
gpuBuffer.destroy();
|
|
988
|
+
}
|
|
972
989
|
} catch {
|
|
973
990
|
// Ignore already released/destroyed buffers.
|
|
974
991
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
|
|
2
2
|
|
|
3
|
-
import { releaseBuffer } from '../../memory/buffer-pool.js';
|
|
3
|
+
import { isBufferActive, releaseBuffer } from '../../memory/buffer-pool.js';
|
|
4
4
|
import { log, trace } from '../../debug/index.js';
|
|
5
5
|
import { getRuntimeConfig } from '../../config/runtime.js';
|
|
6
6
|
import { isWeightBuffer } from '../../gpu/weight-buffer.js';
|
|
@@ -266,7 +266,11 @@ export class ExpertCache {
|
|
|
266
266
|
: (isGpuBufferInstance(buf) ? buf : null);
|
|
267
267
|
if (!gpuBuffer) continue;
|
|
268
268
|
try {
|
|
269
|
-
|
|
269
|
+
if (isBufferActive(gpuBuffer)) {
|
|
270
|
+
releaseBuffer(gpuBuffer);
|
|
271
|
+
} else {
|
|
272
|
+
gpuBuffer.destroy();
|
|
273
|
+
}
|
|
270
274
|
} catch (e) {
|
|
271
275
|
// Buffer may already be released
|
|
272
276
|
}
|
|
@@ -9,7 +9,7 @@ import { isWeightBuffer } from '../../gpu/weight-buffer.js';
|
|
|
9
9
|
import { maybeDowncastToF16 } from '../weight-downcast.js';
|
|
10
10
|
import { log, trace as debugTrace } from '../../debug/index.js';
|
|
11
11
|
import { getRuntimeConfig } from '../../config/runtime.js';
|
|
12
|
-
import { releaseBuffer } from '../../memory/buffer-pool.js';
|
|
12
|
+
import { isBufferActive, releaseBuffer } from '../../memory/buffer-pool.js';
|
|
13
13
|
|
|
14
14
|
// ============================================================================
|
|
15
15
|
// Shard Preloading
|
|
@@ -283,7 +283,11 @@ function releasePackedLayerWeights(ctx, packed) {
|
|
|
283
283
|
const gpuBuffer = getGpuBuffer(entry);
|
|
284
284
|
if (!gpuBuffer) continue;
|
|
285
285
|
try {
|
|
286
|
-
|
|
286
|
+
if (isBufferActive(gpuBuffer)) {
|
|
287
|
+
releaseBuffer(gpuBuffer);
|
|
288
|
+
} else {
|
|
289
|
+
gpuBuffer.destroy();
|
|
290
|
+
}
|
|
287
291
|
ctx.gpuBuffers?.delete?.(gpuBuffer);
|
|
288
292
|
} catch {
|
|
289
293
|
// Ignore already-released buffers.
|
|
@@ -36,6 +36,8 @@ function isLikelyFinalNormName(name) {
|
|
|
36
36
|
return (
|
|
37
37
|
lower === 'norm.weight' ||
|
|
38
38
|
lower.includes('model.norm.weight') ||
|
|
39
|
+
lower.includes('language_model.norm.weight') ||
|
|
40
|
+
lower.includes('model.language_model.norm.weight') ||
|
|
39
41
|
lower.includes('embedding_norm.weight') ||
|
|
40
42
|
lower.includes('model.embedding_norm.weight') ||
|
|
41
43
|
lower.includes('final_layernorm.weight') ||
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
|
|
2
2
|
|
|
3
3
|
import { getKernelCapabilities } from '../gpu/device.js';
|
|
4
|
-
import { isWeightBuffer } from '../gpu/weight-buffer.js';
|
|
4
|
+
import { isWeightBuffer, createWeightBuffer, getWeightDtype } from '../gpu/weight-buffer.js';
|
|
5
|
+
import { dequantize, dequantizeRowwise } from '../gpu/kernel-selector.js';
|
|
6
|
+
import { releaseBuffer } from '../memory/buffer-pool.js';
|
|
5
7
|
import { batchDowncastWeights } from './weight-downcast.js';
|
|
8
|
+
import { QK_K } from './quantization-constants.js';
|
|
6
9
|
import { trace as debugTrace } from '../debug/index.js';
|
|
7
10
|
|
|
8
11
|
// ============================================================================
|
|
@@ -26,8 +29,8 @@ const ATTN_SUFFIXES = {
|
|
|
26
29
|
kProj: ['self_attn.k_proj.weight', 'attention.wk.weight', 'attn_k.weight'],
|
|
27
30
|
vProj: ['self_attn.v_proj.weight', 'attention.wv.weight', 'attn_v.weight'],
|
|
28
31
|
oProj: ['self_attn.o_proj.weight', 'self_attn.out_proj.weight', 'attention.wo.weight', 'attn_output.weight'],
|
|
29
|
-
qNorm: ['self_attn.q_norm.weight', 'attn_q_norm.weight'],
|
|
30
|
-
kNorm: ['self_attn.k_norm.weight', 'attn_k_norm.weight'],
|
|
32
|
+
qNorm: ['self_attn.q_norm.weight', 'self_attn.q_layernorm.weight', 'attn_q_norm.weight'],
|
|
33
|
+
kNorm: ['self_attn.k_norm.weight', 'self_attn.k_layernorm.weight', 'attn_k_norm.weight'],
|
|
31
34
|
postAttentionNorm: ['post_attention_layernorm.weight', 'post_attention_norm.weight', 'ffn_norm.weight'],
|
|
32
35
|
preFeedforwardNorm: ['pre_feedforward_layernorm.weight'],
|
|
33
36
|
postFeedforwardNorm: ['post_feedforward_layernorm.weight', 'post_ffw_norm.weight'],
|
|
@@ -415,4 +418,40 @@ async function downcastLayerWeights(ctx, weights, layerIdx) {
|
|
|
415
418
|
},
|
|
416
419
|
ctx.gpuBuffers
|
|
417
420
|
);
|
|
421
|
+
|
|
422
|
+
await dequantConvQ4KWeights(ctx, weights, layerIdx);
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
// Layer weight slots that are dequantized to F32 when they were loaded as Q4K.
const CONV_Q4K_DEQUANT_KEYS = ['convInProj', 'convOutProj', 'convKernel'];

/**
 * Replace Q4K conv weights of a layer with F32 dequantized copies.
 *
 * For every key in CONV_Q4K_DEQUANT_KEYS whose entry is a weight buffer with
 * dtype 'q4k' and a shape of rank >= 2:
 *   - 2D weights whose row length is not a multiple of QK_K use the row-wise
 *     dequantizer;
 *   - everything else is dequantized block-wise, and is skipped (left as Q4K)
 *     when the element count is zero or not a multiple of QK_K.
 * The quantized buffer is released and dropped from `ctx.gpuBuffers`, and the
 * new F32 buffer is registered there instead. `weights` is mutated in place.
 *
 * @param {{ gpuBuffers: Set<GPUBuffer> }} ctx Loader context tracking owned GPU buffers
 * @param {object} weights Layer weights keyed by slot name
 * @param {number} layerIdx Layer index, used for trace output only
 * @returns {Promise<void>}
 */
async function dequantConvQ4KWeights(ctx, weights, layerIdx) {
  for (const key of CONV_Q4K_DEQUANT_KEYS) {
    const buf = weights[key];
    if (!buf || !isWeightBuffer(buf)) continue;
    if (getWeightDtype(buf) !== 'q4k') continue;

    const shape = buf.shape;
    if (!Array.isArray(shape) || shape.length < 2) continue;

    const is2D = shape.length === 2;
    const totalElements = shape.reduce((a, b) => a * b, 1);

    let dequantizedTensor;
    const outputDtype = 'f32';
    if (is2D && shape[1] % QK_K !== 0) {
      dequantizedTensor = await dequantizeRowwise(buf.buffer, shape[0], shape[1], { outputDtype });
    } else {
      if (totalElements === 0 || totalElements % QK_K !== 0) continue;
      const numBlocks = totalElements / QK_K;
      dequantizedTensor = await dequantize(buf.buffer, numBlocks, { outputDtype });
    }

    const quantizedBuffer = buf.buffer;
    releaseBuffer(quantizedBuffer);
    // Stop tracking the released buffer: leaving it in the set would make a
    // later cleanup pass release/destroy a buffer this loader no longer owns.
    ctx.gpuBuffers.delete?.(quantizedBuffer);

    const dequantizedBuffer = dequantizedTensor.buffer;
    weights[key] = createWeightBuffer(dequantizedBuffer, outputDtype, 'row', shape, buf.label ?? key);
    ctx.gpuBuffers.add(dequantizedBuffer);

    debugTrace.loader(`Layer ${layerIdx} dequantized conv ${key} Q4K→${outputDtype.toUpperCase()}: [${shape.join(',')}]`);
  }
}
|
|
@@ -7,6 +7,8 @@ import { formatBytes } from '../storage/quota.js';
|
|
|
7
7
|
import { log, trace as debugTrace } from '../debug/index.js';
|
|
8
8
|
import { selectRuleValue } from '../rules/rule-registry.js';
|
|
9
9
|
|
|
10
|
+
const STREAMABLE_DTYPES = new Set(['F16', 'F32', 'BF16']);
|
|
11
|
+
|
|
10
12
|
// ============================================================================
|
|
11
13
|
// Norm Weight Offset Detection
|
|
12
14
|
// ============================================================================
|
|
@@ -102,7 +104,7 @@ export function shouldStreamLargeWeight(name, location, label, gpuCapabilities,
|
|
|
102
104
|
if (estimate.bytes <= maxBytes) return false;
|
|
103
105
|
|
|
104
106
|
// Check if dtype can be streamed (only float types)
|
|
105
|
-
const canStream =
|
|
107
|
+
const canStream = STREAMABLE_DTYPES.has(location.dtype);
|
|
106
108
|
if (!canStream) {
|
|
107
109
|
log.warn(
|
|
108
110
|
'Loader',
|
|
@@ -5,6 +5,7 @@ import {
|
|
|
5
5
|
computeHash,
|
|
6
6
|
getStorageBackendType,
|
|
7
7
|
} from '../storage/shard-manager.js';
|
|
8
|
+
import { getExpectedShardHash } from '../formats/rdrr/index.js';
|
|
8
9
|
import { formatBytes } from '../storage/quota.js';
|
|
9
10
|
import { log, trace as debugTrace } from '../debug/index.js';
|
|
10
11
|
import { getRuntimeConfig } from '../config/runtime.js';
|
|
@@ -484,11 +485,11 @@ export class ShardCache {
|
|
|
484
485
|
// Verify hash if enabled
|
|
485
486
|
if (this.#verifyHashes && this.#manifest) {
|
|
486
487
|
const shardInfo = this.#manifest.shards?.[shardIndex];
|
|
487
|
-
const
|
|
488
|
+
const algorithm = shardInfo?.hashAlgorithm ?? this.#manifest.hashAlgorithm;
|
|
489
|
+
const expectedHash = getExpectedShardHash(shardInfo, algorithm);
|
|
488
490
|
if (!expectedHash) {
|
|
489
491
|
throw new Error(`Shard ${shardIndex} missing hash in manifest.`);
|
|
490
492
|
}
|
|
491
|
-
const algorithm = shardInfo?.hashAlgorithm ?? this.#manifest.hashAlgorithm;
|
|
492
493
|
if (!algorithm) {
|
|
493
494
|
throw new Error(`Manifest missing hashAlgorithm for shard ${shardIndex}.`);
|
|
494
495
|
}
|
|
@@ -12,10 +12,13 @@
|
|
|
12
12
|
|
|
13
13
|
import type { WeightBuffer, WeightLayout } from '../../gpu/weight-buffer.js';
|
|
14
14
|
import type { TensorLocation, KernelCapabilities } from '../loader-types.js';
|
|
15
|
+
import type { LoaderDebugConfigSchema } from '../../config/schema/debug.schema.js';
|
|
15
16
|
|
|
16
17
|
export interface TensorLoadConfig {
|
|
17
18
|
/** Use fused Q4K matmul kernels */
|
|
18
19
|
useFusedQ4K: boolean;
|
|
20
|
+
/** Debug controls for Q4K loading/dequantization */
|
|
21
|
+
loaderDebug?: LoaderDebugConfigSchema | null;
|
|
19
22
|
/** Keep weights as F32 (disable F16 downcasting) */
|
|
20
23
|
keepF32Weights: boolean;
|
|
21
24
|
/** Allow F16->F32 upcast for non-matmul weights */
|