@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -0,0 +1,78 @@
+
+
+ import { getDevice } from '../../../gpu/device.js';
+ import { acquireBuffer, releaseBuffer } from '../../../memory/buffer-pool.js';
+ import { runLayerNorm } from '../../../gpu/kernels/layernorm.js';
+ import { dispatchMatmul } from '../../../gpu/kernels/matmul-dispatch.js';
+ import { runGelu } from '../../../gpu/kernels/gelu.js';
+ import { runResidualAdd } from '../../../gpu/kernels/residual.js';
+
+ /**
+  * Layer norm on GPU.
+  * @param {GPUBuffer} input [seqLen, hiddenSize]
+  * @param {GPUBuffer} weight [hiddenSize]
+  * @param {GPUBuffer} bias [hiddenSize] or null
+  * @param {{ seqLen: number, hiddenSize: number, eps: number }} opts
+  * @returns {Promise<GPUBuffer>}
+  */
+ export async function doLayerNorm(input, weight, bias, opts) {
+   const { seqLen, hiddenSize, eps } = opts;
+   const outputSize = seqLen * hiddenSize * 4;
+   const output = acquireBuffer(outputSize, 'vision-layernorm');
+   await runLayerNorm({
+     input,
+     weight,
+     bias: bias || null,
+     output,
+     seqLen,
+     hiddenSize,
+     eps,
+   });
+   return output;
+ }
+
+ /**
+  * Matrix multiply on GPU.
+  * @param {GPUBuffer} a [M, K]
+  * @param {GPUBuffer} b [K, N]
+  * @param {{ M: number, K: number, N: number, bias?: GPUBuffer }} opts
+  * @returns {Promise<GPUBuffer>}
+  */
+ export async function doMatmul(a, b, opts) {
+   const { M, K, N, bias } = opts;
+   const outputSize = M * N * 4;
+   const output = acquireBuffer(outputSize, 'vision-matmul');
+   await dispatchMatmul({
+     a, b, output,
+     M, K, N,
+     bias: bias || null,
+   });
+   return output;
+ }
+
+ /**
+  * GELU activation on GPU.
+  * @param {GPUBuffer} input Flat buffer
+  * @param {{ count: number }} opts Total element count
+  * @returns {Promise<GPUBuffer>}
+  */
+ export async function doGelu(input, opts) {
+   const { count } = opts;
+   const output = acquireBuffer(count * 4, 'vision-gelu');
+   await runGelu({ input, output, count });
+   return output;
+ }
+
+ /**
+  * Element-wise residual add on GPU.
+  * @param {GPUBuffer} a
+  * @param {GPUBuffer} b
+  * @param {{ count: number }} opts
+  * @returns {Promise<GPUBuffer>}
+  */
+ export async function doResidualAdd(a, b, opts) {
+   const { count } = opts;
+   const output = acquireBuffer(count * 4, 'vision-residual');
+   await runResidualAdd({ a, b, output, count });
+   return output;
+ }
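
Note: ops.js only wraps the shared GPU kernels behind pooled output buffers; callers still own intermediate lifetimes. Below is a minimal sketch of how these helpers might compose into one vision MLP block; the weight names (normWeight, normBias, fc1, fc2), the ffnSize parameter, and the eps value are illustrative assumptions, not identifiers from this package.

    // Illustrative only: chains the exported helpers into an MLP block.
    import { doLayerNorm, doMatmul, doGelu, doResidualAdd } from './ops.js';
    import { releaseBuffer } from '../../../memory/buffer-pool.js';

    async function mlpBlock(hidden, weights, { seqLen, hiddenSize, ffnSize }) {
      const normed = await doLayerNorm(hidden, weights.normWeight, weights.normBias, {
        seqLen, hiddenSize, eps: 1e-6,
      });
      const up = await doMatmul(normed, weights.fc1, { M: seqLen, K: hiddenSize, N: ffnSize });
      const act = await doGelu(up, { count: seqLen * ffnSize });
      const down = await doMatmul(act, weights.fc2, { M: seqLen, K: ffnSize, N: hiddenSize });
      const out = await doResidualAdd(hidden, down, { count: seqLen * hiddenSize });
      // Intermediates came from the buffer pool; hand them back once consumed.
      for (const buf of [normed, up, act, down]) releaseBuffer(buf);
      return out;
    }
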
@@ -0,0 +1,151 @@
+
+
+ import { trace } from '../../../debug/index.js';
+ import { getDevice } from '../../../gpu/device.js';
+ import { acquireBuffer } from '../../../memory/buffer-pool.js';
+
+ /**
+  * Patch embedding for the vision encoder.
+  *
+  * Qwen3-VL uses a 3D convolution for temporal+spatial patch extraction:
+  *   Conv3D(in_channels=3, out_channels=hiddenSize, kernel=[temporalPatchSize, patchSize, patchSize])
+  *
+  * For single images (T=1), this reduces to a 2D convolution with stride=patchSize.
+  * The output is [numPatches, hiddenSize] where numPatches = (H/patchSize) * (W/patchSize).
+  *
+  * For the initial implementation, this runs on CPU and uploads to GPU.
+  * TODO(perf): GPU kernel for patch embedding (conv2d with large stride).
+  *
+  * @param {object} params
+  * @param {Float32Array} params.imageData Preprocessed image [C, H, W] normalized
+  * @param {number} params.height Image height (patch-aligned)
+  * @param {number} params.width Image width (patch-aligned)
+  * @param {number} params.channels Number of channels (3)
+  * @param {object} params.visionConfig Vision config
+  * @param {object} params.weights Vision encoder weight buffers
+  * @returns {Promise<{ patchBuffer: GPUBuffer, numPatches: number }>}
+  */
+ export async function patchEmbed(params) {
+   const {
+     imageData, height, width, channels,
+     visionConfig, weights,
+   } = params;
+
+   const {
+     patchSize = 16,
+     hiddenSize = 1024,
+     temporalPatchSize = 2,
+   } = visionConfig;
+
+   const gridH = Math.floor(height / patchSize);
+   const gridW = Math.floor(width / patchSize);
+   const numPatches = gridH * gridW;
+
+   trace('vision', `patchEmbed: ${height}x${width} -> ${gridH}x${gridW} = ${numPatches} patches (${hiddenSize}d)`);
+
+   // Read conv weight from GPU to CPU for the embedding computation.
+   // Weight shape: [hiddenSize, channels * temporalPatchSize * patchSize * patchSize]
+   // For single image: effectively [hiddenSize, channels * patchSize * patchSize]
+   //
+   // Qwen3-VL patch_embed is actually:
+   //   proj = Conv3d(3, embed_dim, kernel_size=(tpp, pp, pp), stride=(tpp, pp, pp))
+   // For T=1 frame, temporal dim collapses: input is [1, C, 1, H, W]
+   // Output: [1, embed_dim, 1, H/pp, W/pp] -> reshape to [numPatches, embed_dim]
+
+   const device = getDevice();
+   const patchArea = channels * patchSize * patchSize;
+
+   // Extract patches from image: each patch is [C, patchSize, patchSize] flattened.
+   const patches = new Float32Array(numPatches * patchArea);
+   for (let ph = 0; ph < gridH; ph++) {
+     for (let pw = 0; pw < gridW; pw++) {
+       const patchIdx = ph * gridW + pw;
+       for (let c = 0; c < channels; c++) {
+         for (let py = 0; py < patchSize; py++) {
+           for (let px = 0; px < patchSize; px++) {
+             const imgY = ph * patchSize + py;
+             const imgX = pw * patchSize + px;
+             const srcIdx = c * height * width + imgY * width + imgX;
+             const dstIdx = patchIdx * patchArea + c * patchSize * patchSize + py * patchSize + px;
+             patches[dstIdx] = imageData[srcIdx];
+           }
+         }
+       }
+     }
+   }
+
+   // Read the projection weight from GPU.
+   // The weight tensor name is visual.patch_embed.proj.weight with shape [hiddenSize, C, tpp, pp, pp].
+   // For temporal_patch_size=2 and a single frame, we need to handle the temporal dimension.
+   // In practice for a single image, we sum over the temporal kernel dimension.
+   const weightKey = 'visual.patch_embed.proj.weight';
+   const biasKey = 'visual.patch_embed.proj.bias';
+
+   const weightBuffer = weights[weightKey];
+   const biasBuffer = weights[biasKey] || null;
+
+   // Full conv weight size: hiddenSize * channels * temporalPatchSize * patchSize * patchSize
+   const fullWeightSize = hiddenSize * channels * temporalPatchSize * patchSize * patchSize;
+   const weightData = new Float32Array(fullWeightSize);
+   {
+     const staging = device.createBuffer({
+       size: fullWeightSize * 4,
+       usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+     });
+     const encoder = device.createCommandEncoder();
+     encoder.copyBufferToBuffer(weightBuffer, 0, staging, 0, fullWeightSize * 4);
+     device.queue.submit([encoder.finish()]);
+     await staging.mapAsync(GPUMapMode.READ);
+     weightData.set(new Float32Array(staging.getMappedRange()));
+     staging.unmap();
+     staging.destroy();
+   }
+
+   // For single frame: average over temporal kernel dimension to get [hiddenSize, C*pp*pp].
+   const spatialWeight = new Float32Array(hiddenSize * patchArea);
+   const spatialPatchArea = channels * patchSize * patchSize;
+   for (let h = 0; h < hiddenSize; h++) {
+     for (let s = 0; s < spatialPatchArea; s++) {
+       let sum = 0;
+       for (let t = 0; t < temporalPatchSize; t++) {
+         sum += weightData[h * temporalPatchSize * spatialPatchArea + t * spatialPatchArea + s];
+       }
+       spatialWeight[h * spatialPatchArea + s] = sum;
+     }
+   }
+
+   // Read bias if present.
+   let biasData = null;
+   if (biasBuffer) {
+     biasData = new Float32Array(hiddenSize);
+     const staging = device.createBuffer({
+       size: hiddenSize * 4,
+       usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+     });
+     const encoder = device.createCommandEncoder();
+     encoder.copyBufferToBuffer(biasBuffer, 0, staging, 0, hiddenSize * 4);
+     device.queue.submit([encoder.finish()]);
+     await staging.mapAsync(GPUMapMode.READ);
+     biasData.set(new Float32Array(staging.getMappedRange()));
+     staging.unmap();
+     staging.destroy();
+   }
+
+   // Compute patch embeddings: patches [numPatches, patchArea] @ spatialWeight^T [patchArea, hiddenSize]
+   const embeddings = new Float32Array(numPatches * hiddenSize);
+   for (let p = 0; p < numPatches; p++) {
+     for (let h = 0; h < hiddenSize; h++) {
+       let val = biasData ? biasData[h] : 0;
+       for (let k = 0; k < patchArea; k++) {
+         val += patches[p * patchArea + k] * spatialWeight[h * patchArea + k];
+       }
+       embeddings[p * hiddenSize + h] = val;
+     }
+   }
+
+   // Upload to GPU.
+   const patchBuffer = acquireBuffer(numPatches * hiddenSize * 4, 'vision-patch-embed');
+   device.queue.writeBuffer(patchBuffer, 0, embeddings);
+
+   return { patchBuffer, numPatches };
+ }
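
Note: the patch grid arithmetic above is easy to sanity-check. A worked example using the visionConfig defaults (patchSize 16, hiddenSize 1024) and an assumed 224x224 RGB input:

    // Illustrative arithmetic only; 224x224 is an assumed input size.
    const patchSize = 16, hiddenSize = 1024, channels = 3;
    const height = 224, width = 224;
    const gridH = Math.floor(height / patchSize);        // 14
    const gridW = Math.floor(width / patchSize);         // 14
    const numPatches = gridH * gridW;                    // 196
    const patchArea = channels * patchSize * patchSize;  // 768 floats per patch
    // patches:    Float32Array(numPatches * patchArea)   -> 196 * 768
    // embeddings: Float32Array(numPatches * hiddenSize)  -> 196 * 1024 (~784 KiB uploaded to the GPU)
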
@@ -1,7 +1,7 @@
 
 
  import { initDevice, getDevice, getKernelCapabilities } from '../gpu/device.js';
- import { parseManifest } from '../formats/rdrr/index.js';
+ import { parseManifest, getExpectedShardHash } from '../formats/rdrr/index.js';
  import { createPipeline } from './pipelines/text.js';
  import { log as debugLog } from '../debug/index.js';
  import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
@@ -66,11 +66,12 @@ export function parseRuntimeOverridesFromURL(searchParams) {
   if (runtimeConfigRaw) {
     try {
       const parsed = JSON.parse(runtimeConfigRaw);
-      if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
-        runtime.runtimeConfig = (parsed);
+      if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
+        throw new Error('runtimeConfig must be a JSON object');
       }
+      runtime.runtimeConfig = parsed;
     } catch (e) {
-      debugLog.warn('TestHarness', `Failed to parse runtimeConfig JSON: ${ (e).message}`);
+      throw new Error(`Failed to parse runtimeConfig URL parameter: ${e?.message}`);
     }
   }
 
@@ -79,12 +80,13 @@
   if (configChainRaw) {
     try {
       const parsed = JSON.parse(configChainRaw);
-      if (Array.isArray(parsed)) {
-        runtime.configChain = parsed;
-        debugLog.info('TestHarness', `Config chain: ${parsed.join(' -> ')}`);
+      if (!Array.isArray(parsed) || parsed.some((entry) => typeof entry !== 'string' || !entry.trim())) {
+        throw new Error('configChain must be an array of non-empty strings');
       }
+      runtime.configChain = parsed;
+      debugLog.info('TestHarness', `Config chain: ${parsed.join(' -> ')}`);
     } catch (e) {
-      debugLog.warn('TestHarness', `Failed to parse configChain JSON: ${ (e).message}`);
+      throw new Error(`Failed to parse configChain URL parameter: ${e?.message}`);
     }
   }
 
@@ -168,7 +170,7 @@ export function createHttpShardLoader(baseUrl, manifest, log) {
       distributionConfig,
       algorithm,
       requiredEncoding,
-      expectedHash: shard.hash ?? null,
+      expectedHash: getExpectedShardHash(shard, algorithm) || null,
       expectedSize: Number.isFinite(shard.size) ? Math.floor(shard.size) : null,
       expectedManifestVersionSet: manifestVersionSet,
       writeToStore: false,
@@ -31,6 +31,7 @@ import type {
  } from './loader-types.js';
  import type { ShardCache } from './shard-cache.js';
  import type { LoadingConfigSchema } from '../config/schema/loading.schema.js';
+ import type { LoaderDebugConfigSchema } from '../config/schema/debug.schema.js';
 
  // Re-export types for backward compatibility
  export type {
@@ -96,6 +97,8 @@ export declare class DopplerLoader {
 
   setQ4KConfig(config: Q4KConfig): void;
 
+  setLoaderDebugConfig(loaderDebug: LoaderDebugConfigSchema | null): void;
+
   setCustomShardLoader(loadShardFn: CustomShardLoader, options?: CustomShardLoaderOptions): void;
 
   setTensorsJsonUrl(url: string | null): void;
@@ -11,7 +11,7 @@ import {
  } from '../storage/shard-manager.js';
  import { clearManifest, parseManifest, setManifest as setCurrentManifest } from '../formats/rdrr/index.js';
  import { initDevice, getDevice, getKernelCapabilities } from '../gpu/device.js';
- import { acquireBuffer, releaseBuffer, forceBufferPoolReclaim } from '../memory/buffer-pool.js';
+ import { acquireBuffer, isBufferActive, releaseBuffer, forceBufferPoolReclaim } from '../memory/buffer-pool.js';
  import { getExpertCache } from './experts/expert-cache.js';
  import { formatBytes } from '../storage/quota.js';
  import { log, trace as debugTrace } from '../debug/index.js';
@@ -118,6 +118,7 @@ export class DopplerLoader {
   // Loading configuration
 
   #loadingConfig;
+  #loaderDebug = null;
 
   // Fused Q4_K matmul: skip dequantization for matmul weights, use fused kernel
 
@@ -167,6 +168,10 @@ export class DopplerLoader {
     }
   }
 
+  setLoaderDebugConfig(loaderDebug) {
+    this.#loaderDebug = loaderDebug ?? null;
+  }
+
 
   setQ4KConfig(config) {
     this.useFusedQ4K = config.useFusedQ4K;
@@ -701,6 +706,7 @@ export class DopplerLoader {
       useFusedQ4K: this.useFusedQ4K,
       keepF32Weights: this.keepF32Weights,
       q4kLayout: this.q4kLayout,
+      loaderDebug: this.#loaderDebug,
       gpuCapabilities: this.gpuCapabilities,
       allowF32UpcastNonMatmul,
     };
@@ -924,7 +930,14 @@ export class DopplerLoader {
     return this.layers.get(layerIdx) || null;
   }
 
-
+  /**
+   * Load a tensor by name. Public interface for extension loaders (e.g., vision).
+   */
+  async loadTensor(name, toGPU = true, silent = false) {
+    return this.#loadTensor(name, toGPU, silent);
+  }
+
+
   getConfig() {
     return (this.manifest?.config) || {};
   }
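
Note: loadTensor(name, toGPU, silent) is now the public by-name entry point that extension loaders such as the new vision pipeline can call; it simply forwards to the private #loadTensor. A minimal usage sketch follows; the shape of the awaited result is not shown in this diff and is assumed to expose the loaded GPU buffer, and the error handling is an assumption.

    // Illustrative only: a vision extension pulling its projection weights by name.
    const projWeight = await loader.loadTensor('visual.patch_embed.proj.weight');
    const projBias = await loader.loadTensor('visual.patch_embed.proj.bias', true, /* silent */ true);
    // The loaded buffers are then handed to the vision encoder's weight map.
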
@@ -968,7 +981,11 @@
       : (isGpuBufferInstance(value) ? value : null);
     if (!gpuBuffer) return;
     try {
-      releaseBuffer(gpuBuffer);
+      if (isBufferActive(gpuBuffer)) {
+        releaseBuffer(gpuBuffer);
+      } else {
+        gpuBuffer.destroy();
+      }
     } catch {
       // Ignore already released/destroyed buffers.
     }
@@ -1,6 +1,6 @@
 
 
- import { releaseBuffer } from '../../memory/buffer-pool.js';
+ import { isBufferActive, releaseBuffer } from '../../memory/buffer-pool.js';
  import { log, trace } from '../../debug/index.js';
  import { getRuntimeConfig } from '../../config/runtime.js';
  import { isWeightBuffer } from '../../gpu/weight-buffer.js';
@@ -266,7 +266,11 @@ export class ExpertCache {
       : (isGpuBufferInstance(buf) ? buf : null);
     if (!gpuBuffer) continue;
     try {
-      releaseBuffer(gpuBuffer);
+      if (isBufferActive(gpuBuffer)) {
+        releaseBuffer(gpuBuffer);
+      } else {
+        gpuBuffer.destroy();
+      }
     } catch (e) {
       // Buffer may already be released
     }
@@ -9,7 +9,7 @@ import { isWeightBuffer } from '../../gpu/weight-buffer.js';
  import { maybeDowncastToF16 } from '../weight-downcast.js';
  import { log, trace as debugTrace } from '../../debug/index.js';
  import { getRuntimeConfig } from '../../config/runtime.js';
- import { releaseBuffer } from '../../memory/buffer-pool.js';
+ import { isBufferActive, releaseBuffer } from '../../memory/buffer-pool.js';
 
  // ============================================================================
  // Shard Preloading
@@ -283,7 +283,11 @@ function releasePackedLayerWeights(ctx, packed) {
     const gpuBuffer = getGpuBuffer(entry);
     if (!gpuBuffer) continue;
     try {
-      releaseBuffer(gpuBuffer);
+      if (isBufferActive(gpuBuffer)) {
+        releaseBuffer(gpuBuffer);
+      } else {
+        gpuBuffer.destroy();
+      }
       ctx.gpuBuffers?.delete?.(gpuBuffer);
     } catch {
       // Ignore already-released buffers.
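
Note: the same release-or-destroy guard now appears in three places (DopplerLoader teardown, ExpertCache eviction, and releasePackedLayerWeights): buffers the pool still tracks are returned through releaseBuffer, while anything the pool has already forgotten is destroyed directly. A sketch of that shared pattern as a standalone helper; the helper itself is illustrative, since the package keeps the logic inline at each call site:

    // Illustrative only: the inline pattern from the hunks above, factored into one function.
    import { isBufferActive, releaseBuffer } from '../memory/buffer-pool.js';

    function releaseOrDestroy(gpuBuffer) {
      try {
        if (isBufferActive(gpuBuffer)) {
          releaseBuffer(gpuBuffer);   // still pool-managed: return it for reuse
        } else {
          gpuBuffer.destroy();        // pool no longer tracks it: free the GPU memory now
        }
      } catch {
        // Ignore buffers that were already released or destroyed.
      }
    }
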
@@ -36,6 +36,8 @@ function isLikelyFinalNormName(name) {
   return (
     lower === 'norm.weight' ||
     lower.includes('model.norm.weight') ||
+    lower.includes('language_model.norm.weight') ||
+    lower.includes('model.language_model.norm.weight') ||
     lower.includes('embedding_norm.weight') ||
     lower.includes('model.embedding_norm.weight') ||
     lower.includes('final_layernorm.weight') ||
@@ -1,8 +1,11 @@
 
 
  import { getKernelCapabilities } from '../gpu/device.js';
- import { isWeightBuffer } from '../gpu/weight-buffer.js';
+ import { isWeightBuffer, createWeightBuffer, getWeightDtype } from '../gpu/weight-buffer.js';
+ import { dequantize, dequantizeRowwise } from '../gpu/kernel-selector.js';
+ import { releaseBuffer } from '../memory/buffer-pool.js';
  import { batchDowncastWeights } from './weight-downcast.js';
+ import { QK_K } from './quantization-constants.js';
  import { trace as debugTrace } from '../debug/index.js';
 
  // ============================================================================
@@ -26,8 +29,8 @@ const ATTN_SUFFIXES = {
   kProj: ['self_attn.k_proj.weight', 'attention.wk.weight', 'attn_k.weight'],
   vProj: ['self_attn.v_proj.weight', 'attention.wv.weight', 'attn_v.weight'],
   oProj: ['self_attn.o_proj.weight', 'self_attn.out_proj.weight', 'attention.wo.weight', 'attn_output.weight'],
-  qNorm: ['self_attn.q_norm.weight', 'attn_q_norm.weight'],
-  kNorm: ['self_attn.k_norm.weight', 'attn_k_norm.weight'],
+  qNorm: ['self_attn.q_norm.weight', 'self_attn.q_layernorm.weight', 'attn_q_norm.weight'],
+  kNorm: ['self_attn.k_norm.weight', 'self_attn.k_layernorm.weight', 'attn_k_norm.weight'],
   postAttentionNorm: ['post_attention_layernorm.weight', 'post_attention_norm.weight', 'ffn_norm.weight'],
   preFeedforwardNorm: ['pre_feedforward_layernorm.weight'],
   postFeedforwardNorm: ['post_feedforward_layernorm.weight', 'post_ffw_norm.weight'],
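
Note: ATTN_SUFFIXES maps each attention-weight role to the tensor-name suffixes it may carry across checkpoint layouts; this release adds the q_layernorm/k_layernorm spellings used by some exports. A sketch of how such a table can resolve a raw tensor name to a role; resolveAttnRole is a hypothetical helper for illustration, not the package's actual matcher:

    // Illustrative only.
    function resolveAttnRole(tensorName, table = ATTN_SUFFIXES) {
      for (const [role, suffixes] of Object.entries(table)) {
        if (suffixes.some((suffix) => tensorName.endsWith(suffix))) return role;
      }
      return null;
    }
    // resolveAttnRole('model.layers.0.self_attn.q_layernorm.weight') -> 'qNorm'
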
@@ -415,4 +418,40 @@ async function downcastLayerWeights(ctx, weights, layerIdx) {
     },
     ctx.gpuBuffers
   );
+
+  await dequantConvQ4KWeights(ctx, weights, layerIdx);
+}
+
+
+ const CONV_Q4K_DEQUANT_KEYS = ['convInProj', 'convOutProj', 'convKernel'];
+
+ async function dequantConvQ4KWeights(ctx, weights, layerIdx) {
+   for (const key of CONV_Q4K_DEQUANT_KEYS) {
+     const buf = weights[key];
+     if (!buf || !isWeightBuffer(buf)) continue;
+     if (getWeightDtype(buf) !== 'q4k') continue;
+
+     const shape = buf.shape;
+     if (!Array.isArray(shape) || shape.length < 2) continue;
+
+     const is2D = shape.length === 2;
+     const totalElements = shape.reduce((a, b) => a * b, 1);
+
+     let dequantizedTensor;
+     const outputDtype = 'f32';
+     if (is2D && shape[1] % QK_K !== 0) {
+       dequantizedTensor = await dequantizeRowwise(buf.buffer, shape[0], shape[1], { outputDtype });
+     } else {
+       if (totalElements === 0 || totalElements % QK_K !== 0) continue;
+       const numBlocks = totalElements / QK_K;
+       dequantizedTensor = await dequantize(buf.buffer, numBlocks, { outputDtype });
+     }
+
+     releaseBuffer(buf.buffer);
+     const dequantizedBuffer = dequantizedTensor.buffer;
+     weights[key] = createWeightBuffer(dequantizedBuffer, outputDtype, 'row', shape, buf.label ?? key);
+     ctx.gpuBuffers.add(dequantizedBuffer);
+
+     debugTrace.loader(`Layer ${layerIdx} dequantized conv ${key} Q4K→${outputDtype.toUpperCase()}: [${shape.join(',')}]`);
+   }
 }
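
Note: QK_K is the Q4_K super-block size (256 elements in the GGML/GGUF K-quant layout), so the whole-tensor branch above dispatches one dequant block per 256 weights, while conv tensors whose row length is not a multiple of 256 fall back to the row-wise kernel. A worked example of the block arithmetic, using illustrative shapes rather than tensors from a specific model:

    const QK_K = 256;
    const shape = [1536, 3072];               // [rows, cols] of a hypothetical conv projection
    const totalElements = 1536 * 3072;        // 4,718,592 weights
    const numBlocks = totalElements / QK_K;   // 18,432 Q4_K super-blocks
    // cols % QK_K === 0 here, so dequantize(buf, numBlocks, { outputDtype: 'f32' }) applies;
    // a [1536, 96] kernel (96 % 256 !== 0) would take the dequantizeRowwise path instead.
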
@@ -7,6 +7,8 @@ import { formatBytes } from '../storage/quota.js';
  import { log, trace as debugTrace } from '../debug/index.js';
  import { selectRuleValue } from '../rules/rule-registry.js';
 
+ const STREAMABLE_DTYPES = new Set(['F16', 'F32', 'BF16']);
+
  // ============================================================================
  // Norm Weight Offset Detection
  // ============================================================================
@@ -102,7 +104,7 @@ export function shouldStreamLargeWeight(name, location, label, gpuCapabilities,
   if (estimate.bytes <= maxBytes) return false;
 
   // Check if dtype can be streamed (only float types)
-  const canStream = location.dtype === 'F16' || location.dtype === 'F32' || location.dtype === 'BF16';
+  const canStream = STREAMABLE_DTYPES.has(location.dtype);
   if (!canStream) {
     log.warn(
       'Loader',
@@ -5,6 +5,7 @@ import {
   computeHash,
   getStorageBackendType,
  } from '../storage/shard-manager.js';
+ import { getExpectedShardHash } from '../formats/rdrr/index.js';
  import { formatBytes } from '../storage/quota.js';
  import { log, trace as debugTrace } from '../debug/index.js';
  import { getRuntimeConfig } from '../config/runtime.js';
@@ -484,11 +485,11 @@ export class ShardCache {
     // Verify hash if enabled
     if (this.#verifyHashes && this.#manifest) {
       const shardInfo = this.#manifest.shards?.[shardIndex];
-      const expectedHash = shardInfo?.hash;
+      const algorithm = shardInfo?.hashAlgorithm ?? this.#manifest.hashAlgorithm;
+      const expectedHash = getExpectedShardHash(shardInfo, algorithm);
       if (!expectedHash) {
         throw new Error(`Shard ${shardIndex} missing hash in manifest.`);
       }
-      const algorithm = shardInfo?.hashAlgorithm ?? this.#manifest.hashAlgorithm;
       if (!algorithm) {
         throw new Error(`Manifest missing hashAlgorithm for shard ${shardIndex}.`);
       }
@@ -12,10 +12,13 @@
 
  import type { WeightBuffer, WeightLayout } from '../../gpu/weight-buffer.js';
  import type { TensorLocation, KernelCapabilities } from '../loader-types.js';
+ import type { LoaderDebugConfigSchema } from '../../config/schema/debug.schema.js';
 
  export interface TensorLoadConfig {
   /** Use fused Q4K matmul kernels */
   useFusedQ4K: boolean;
+  /** Debug controls for Q4K loading/dequantization */
+  loaderDebug?: LoaderDebugConfigSchema | null;
   /** Keep weights as F32 (disable F16 downcasting) */
   keepF32Weights: boolean;
   /** Allow F16->F32 upcast for non-matmul weights */