@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -1,7 +1,7 @@
1
1
 
2
2
 
3
3
  import { getDevice, getKernelCapabilities } from '../../gpu/device.js';
4
- import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
4
+ import { acquireBuffer, releaseBuffer, readBuffer } from '../../memory/buffer-pool.js';
5
5
  import { dequantize, dequantizeRowwise, dequantizeQ6K, castF16ToF32, runBF16ToF16 } from '../../gpu/kernel-selector.js';
6
6
  import { createTensor } from '../../gpu/tensor.js';
7
7
  import { createWeightBuffer } from '../../gpu/weight-buffer.js';
@@ -9,6 +9,7 @@ import { f16ToF32, convertBF16ToF32GPU, shouldDequantizeToF16, applyBufferLayout
9
9
  import { QK_K, Q4K_BLOCK_BYTES, Q6K_BLOCK_BYTES } from '../quantization-constants.js';
10
10
  import { log, trace as debugTrace } from '../../debug/index.js';
11
11
  import { selectRuleValue } from '../../rules/rule-registry.js';
12
+ import { dequantizeQ4KM, dequantizeQ4KMRowWise } from '../../converter/quantizer.js';
12
13
 
13
14
  // ============================================================================
14
15
  // Q4K Detection
@@ -31,6 +32,24 @@ function releaseOwnedGpuBuffer(buffer, owned) {
31
32
  releaseBuffer(buffer);
32
33
  }
33
34
 
35
+ function normalizeLoaderDebugConfig(config) {
36
+ const debug = config?.loaderDebug;
37
+ if (!debug || typeof debug !== 'object') {
38
+ return null;
39
+ }
40
+
41
+ return {
42
+ enabled: debug.enabled === true,
43
+ forceGpuDequant: debug.forceGpuDequant === true,
44
+ preferCpuDequant: debug.preferCpuDequant === true,
45
+ failOnCpuDequantPath: debug.failOnCpuDequantPath === true,
46
+ runQ4KDequantParity: debug.runQ4KDequantParity === true,
47
+ q4kDequantParitySamples: Number.isFinite(debug.q4kDequantParitySamples)
48
+ ? Math.min(4096, Math.max(1, Math.trunc(debug.q4kDequantParitySamples)))
49
+ : 256,
50
+ };
51
+ }
52
+
34
53
  function logF32UpcastNonMatmul(name, numElements, bufferSize) {
35
54
  if (loggedF32UpcastNonMatmul) {
36
55
  return;
@@ -199,11 +218,52 @@ export async function loadQ4KDequant(shardData, location, name, config) {
199
218
  }
200
219
 
201
220
  const outputDtype = getQ4KOutputDtype(location, config);
221
+ const loaderDebug = normalizeLoaderDebugConfig(config);
222
+ const debugEnabled = loaderDebug?.enabled === true;
223
+ const forceGpuDequant = loaderDebug?.forceGpuDequant === true;
224
+ const failOnCpuDequantPath = loaderDebug?.failOnCpuDequantPath === true;
225
+ const runQ4KDequantParity = loaderDebug?.runQ4KDequantParity === true;
226
+ const paritySamples = loaderDebug?.q4kDequantParitySamples ?? 256;
202
227
 
203
228
  const is2DMatrix = Array.isArray(location.shape) && location.shape.length === 2;
204
229
  const K = is2DMatrix ? location.shape[1] : 0;
205
230
  const needsRowwise = is2DMatrix && K > 0 && K % QK_K !== 0;
231
+ const layout = getWeightLayout(location, config);
232
+ const preferCpuDequant = loaderDebug?.preferCpuDequant === true;
233
+ const canUseCpuReference = !forceGpuDequant && preferCpuDequant && (
234
+ outputDtype === 'f32'
235
+ && !isGpuBufferInstance(shardData)
236
+ && (!needsRowwise || layout === 'row')
237
+ );
238
+
239
+ if (canUseCpuReference && failOnCpuDequantPath) {
240
+ throw new Error(
241
+ `[LoaderDebug] CPU dequant path taken for ${name}; this run is configured fail-closed. ` +
242
+ 'Set runtime.shared.debug.loader.forceGpuDequant=true to isolate GPU dequant.'
243
+ );
244
+ }
245
+
246
+ if (canUseCpuReference) {
247
+ const quantizedBytes = toUint8View(shardData);
248
+ const numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
249
+ debugTrace.loader(
250
+ `Dequantizing ${name} with CPU reference path: ` +
251
+ `shape=[${location.shape.join(',')}], layout=${layout}, needsRowwise=${needsRowwise}`
252
+ );
253
+ const f32Weights = needsRowwise
254
+ ? dequantizeQ4KMRowWise(quantizedBytes, location.shape)
255
+ : dequantizeQ4KM(quantizedBytes, numBlocks, location.shape);
256
+ const outputBuffer = acquireAlignedBuffer(f32Weights.byteLength, `dequant_cpu_${name}`);
257
+ writeBufferAligned(device, outputBuffer, new Uint8Array(f32Weights.buffer));
258
+ releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
259
+ ownsQuantBuffer = false;
260
+ return {
261
+ data: createWeightBuffer(outputBuffer, 'f32', layout, location.shape, name),
262
+ allocatedBuffers: [outputBuffer],
263
+ };
264
+ }
206
265
 
266
+ let numBlocks = null;
207
267
  let dequantizedTensor;
208
268
  if (needsRowwise) {
209
269
  const rows = location.shape[0];
@@ -213,7 +273,7 @@ export async function loadQ4KDequant(shardData, location, name, config) {
213
273
  );
214
274
  dequantizedTensor = await dequantizeRowwise(quantBuffer, rows, K, { outputDtype });
215
275
  } else {
216
- const numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
276
+ numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
217
277
  debugTrace.loader(
218
278
  `Dequantizing ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
219
279
  `outputDtype=${outputDtype}, expectedOutput=${numBlocks * QK_K * (outputDtype === 'f16' ? 2 : 4)}`
@@ -223,10 +283,71 @@ export async function loadQ4KDequant(shardData, location, name, config) {
223
283
  dequantized = dequantizedTensor.buffer;
224
284
 
225
285
  debugTrace.loader(`Dequantized ${name}: resultSize=${dequantized.size}`);
286
+
287
+ if (runQ4KDequantParity && !isGpuBufferInstance(shardData) && dequantized && numBlocks !== null) {
288
+ const isProbeTarget = debugEnabled &&
289
+ (name.includes('.self_attn.q_proj.weight') || name.includes('.self_attn.k_proj.weight') ||
290
+ name.includes('.self_attn.v_proj.weight') || name.includes('.self_attn.qkv_proj.weight'));
291
+
292
+ if (isProbeTarget) {
293
+ try {
294
+ const bytesPerElem = outputDtype === 'f16' ? 2 : 4;
295
+ const requestedOutputBytes = numBlocks * QK_K * bytesPerElem;
296
+ const sampleCount = paritySamples;
297
+ const readSize = Math.min(sampleCount * bytesPerElem, dequantized.size);
298
+ const gpuRaw = await readBuffer(dequantized, readSize);
299
+ const gpuBytes = gpuRaw instanceof ArrayBuffer
300
+ ? new Uint8Array(gpuRaw)
301
+ : new Uint8Array(gpuRaw.buffer, gpuRaw.byteOffset, gpuRaw.byteLength);
302
+
303
+ let gpuVals;
304
+ if (outputDtype === 'f16') {
305
+ const u16 = new Uint16Array(gpuBytes.buffer, gpuBytes.byteOffset,
306
+ Math.min(sampleCount, Math.floor(gpuBytes.byteLength / 2)));
307
+ gpuVals = Array.from(u16, (half) => f16ToF32(half));
308
+ } else {
309
+ const f32 = new Float32Array(gpuBytes.buffer, gpuBytes.byteOffset,
310
+ Math.min(sampleCount, Math.floor(gpuBytes.byteLength / 4)));
311
+ gpuVals = Array.from(f32);
312
+ }
313
+
314
+ const quantizedBytes = toUint8View(shardData);
315
+ const cpuRef = Array.from(
316
+ needsRowwise
317
+ ? dequantizeQ4KMRowWise(quantizedBytes, location.shape)
318
+ : dequantizeQ4KM(quantizedBytes, numBlocks, location.shape)
319
+ ).slice(0, gpuVals.length);
320
+
321
+ let maxDiff = 0;
322
+ let diffIdx = -1;
323
+ for (let i = 0; i < gpuVals.length && i < cpuRef.length; i++) {
324
+ const d = Math.abs(gpuVals[i] - cpuRef[i]);
325
+ if (d > maxDiff) {
326
+ maxDiff = d;
327
+ diffIdx = i;
328
+ }
329
+ }
330
+
331
+ log.warn('DequantProbe',
332
+ `tensor="${name}" shape=[${location.shape}] ` +
333
+ `location.size=${location.size} numBlocks=${numBlocks} outputDtype=${outputDtype} ` +
334
+ `bytesPerElem=${bytesPerElem} requestedOutputBytes=${requestedOutputBytes} bufSize=${dequantized.size} ` +
335
+ `runParity=true sampleCount=${sampleCount}`
336
+ );
337
+ log.warn('DequantProbe',
338
+ `parity: maxDiff=${maxDiff.toFixed(8)} at idx=${diffIdx} ` +
339
+ `gpu[0..3]=[${gpuVals.slice(0, 4).map((v) => v.toFixed(6))}] ` +
340
+ `cpu[0..3]=[${cpuRef.slice(0, 4).map((v) => v.toFixed(6))}]`
341
+ );
342
+ } catch (e) {
343
+ log.warn('DequantProbe', `Readback failed: ${e.message}`);
344
+ }
345
+ }
346
+ }
347
+
226
348
  releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
227
349
  ownsQuantBuffer = false;
228
350
 
229
- const layout = getWeightLayout(location, config);
230
351
  const dtype = outputDtype;
231
352
 
232
353
  return {
@@ -309,8 +430,9 @@ export async function loadBF16(shardData, location, name, config) {
309
430
  const numElements = location.size / 2;
310
431
  const caps = config.gpuCapabilities || getKernelCapabilities();
311
432
  const isMatmulWeight = shouldDequantizeToF16(location);
433
+ const keepF32Weights = config.keepF32Weights === true;
312
434
 
313
- if (caps?.hasF16 && isMatmulWeight) {
435
+ if (caps?.hasF16 && isMatmulWeight && !keepF32Weights) {
314
436
  const f16Tensor = await runBF16ToF16(srcBuffer, [numElements], name);
315
437
  resultBuffer = f16Tensor.buffer;
316
438
  releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
@@ -327,6 +449,10 @@ export async function loadBF16(shardData, location, name, config) {
327
449
  };
328
450
  }
329
451
 
452
+ if (isMatmulWeight && keepF32Weights) {
453
+ debugTrace.loader(`Keeping BF16 matmul weight in f32: ${name} (keepF32Weights=true)`);
454
+ }
455
+
330
456
  const dstBuffer = await convertBF16ToF32GPU(srcBuffer, numElements, name);
331
457
  resultBuffer = dstBuffer;
332
458
  releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
@@ -59,6 +59,11 @@
59
59
  { "match": { "useF16": true }, "value": "f16" },
60
60
  { "match": {}, "value": { "context": "fallback" } }
61
61
  ],
62
+ "attentionProjectionOutputDtype": [
63
+ { "match": { "forceF32": true }, "value": "f32" },
64
+ { "match": { "useF16": true }, "value": "f16" },
65
+ { "match": {}, "value": { "context": "fallback" } }
66
+ ],
62
67
  "bytesPerElement": [
63
68
  { "match": { "dtype": "f16" }, "value": 2 },
64
69
  { "match": {}, "value": 4 }
@@ -46,7 +46,7 @@
46
46
  "hasSubgroups": false,
47
47
  "kernelPathRef": "lfm2-q4k-dequant-f32a-online"
48
48
  },
49
- "value": "gemma3-q4k-dequant-f32a-nosubgroups"
49
+ "value": "lfm2-q4k-dequant-f32a-nosubgroups"
50
50
  },
51
51
  {
52
52
  "match": {
@@ -77,7 +77,7 @@
77
77
  },
78
78
  {
79
79
  "match": { "kernelPathId": "lfm2-q4k-dequant-f32a-online" },
80
- "value": "gemma3-q4k-dequant-f32a-nosubgroups"
80
+ "value": "lfm2-q4k-dequant-f32a-nosubgroups"
81
81
  },
82
82
  {
83
83
  "match": { "kernelPathId": "gemma2-f16-f16a" },
@@ -0,0 +1,75 @@
1
+ {
2
+ "vendorQuirkProfile": [
3
+ {
4
+ "match": {
5
+ "vendor": {
6
+ "contains": ["intel", "amd"]
7
+ }
8
+ },
9
+ "value": {
10
+ "preferVec4Dequant": false,
11
+ "dequantTileShape": "scalar",
12
+ "routerWorkgroupSize": 128,
13
+ "maxTokensPerExpertScale": 0.85
14
+ }
15
+ },
16
+ {
17
+ "match": {
18
+ "vendor": {
19
+ "contains": ["nvidia", "apple", "qualcomm"]
20
+ }
21
+ },
22
+ "value": {
23
+ "preferVec4Dequant": false,
24
+ "dequantTileShape": "scalar",
25
+ "routerWorkgroupSize": 256,
26
+ "maxTokensPerExpertScale": 1.0
27
+ }
28
+ },
29
+ {
30
+ "match": {},
31
+ "value": {
32
+ "preferVec4Dequant": false,
33
+ "dequantTileShape": "scalar",
34
+ "routerWorkgroupSize": 128,
35
+ "maxTokensPerExpertScale": 1.0
36
+ }
37
+ }
38
+ ],
39
+ "routerTopKVariant": [
40
+ {
41
+ "match": { "modelType": "mixtral", "hasF16": true, "hasSubgroups": true, "routerDtype": "f32" },
42
+ "value": "softmax_topk_f32_subgroup"
43
+ },
44
+ {
45
+ "match": { "modelType": "mixtral", "routerDtype": "f32" },
46
+ "value": "softmax_topk_f32"
47
+ },
48
+ {
49
+ "match": { "modelType": "mixtral" },
50
+ "value": "softmax_topk_f32"
51
+ }
52
+ ],
53
+ "dequantVariant": [
54
+ {
55
+ "match": { "modelType": "mixtral", "weightsDtype": "q4k", "hasF16": true, "hasSubgroups": true, "outputDtype": "f32" },
56
+ "value": "q4k_expert_dequant_f32_subgroup"
57
+ },
58
+ {
59
+ "match": { "modelType": "mixtral", "weightsDtype": "q4k", "outputDtype": "f16", "hasF16": true },
60
+ "value": "q4k_expert_dequant_f16"
61
+ },
62
+ {
63
+ "match": { "modelType": "mixtral", "weightsDtype": "q4k" },
64
+ "value": "q4k_expert_dequant_f32"
65
+ },
66
+ {
67
+ "match": { "modelType": "mixtral", "weightsDtype": "f16", "outputDtype": "f16", "hasF16": true },
68
+ "value": "f16_expert_passthrough"
69
+ },
70
+ {
71
+ "match": { "modelType": "mixtral" },
72
+ "value": "f16_expert_upcast_f32"
73
+ }
74
+ ]
75
+ }
@@ -16,6 +16,8 @@
16
16
  },
17
17
  "value": "gptoss_router_topk"
18
18
  },
19
+ { "match": { "modelType": "mixtral", "inputDtype": "f16", "weightsDtype": "f16" }, "value": "fused_f16_w16" },
20
+ { "match": { "modelType": "mixtral" }, "value": "fused" },
19
21
  { "match": { "inputDtype": "f16", "weightsDtype": "f16" }, "value": "fused_f16_w16" },
20
22
  { "match": { "inputDtype": "f16" }, "value": "fused_f16" },
21
23
  { "match": {}, "value": "fused" }
@@ -0,0 +1,6 @@
1
+ {
2
+ "variant": [
3
+ { "match": { "outputDtype": "f16" }, "value": "f16" },
4
+ { "match": {}, "value": "default" }
5
+ ]
6
+ }
@@ -20,6 +20,7 @@ type KernelRuleGroup =
20
20
  | 'matmul'
21
21
  | 'moe'
22
22
  | 'moeGptoss'
23
+ | 'moeMixtral'
23
24
  | 'residual'
24
25
  | 'rmsnorm'
25
26
  | 'rope'
@@ -38,6 +38,7 @@ const layernormRules = await loadJson('./kernels/layernorm.rules.json', import.m
38
38
  const matmulRules = await loadJson('./kernels/matmul.rules.json', import.meta.url, 'Failed to load rules');
39
39
  const kernelMoeRules = await loadJson('./kernels/moe.rules.json', import.meta.url, 'Failed to load rules');
40
40
  const kernelMoeGptOssRules = await loadJson('./kernels/moe.rules.gptoss.json', import.meta.url, 'Failed to load rules');
41
+ const kernelMoeMixtralRules = await loadJson('./kernels/moe.rules.mixtral.json', import.meta.url, 'Failed to load rules');
41
42
  const modulateRules = await loadJson('./kernels/modulate.rules.json', import.meta.url, 'Failed to load rules');
42
43
  const pixelShuffleRules = await loadJson('./kernels/pixel_shuffle.rules.json', import.meta.url, 'Failed to load rules');
43
44
  const repeatChannelsRules = await loadJson('./kernels/repeat-channels.rules.json', import.meta.url, 'Failed to load rules');
@@ -50,6 +51,7 @@ const sampleRules = await loadJson('./kernels/sample.rules.json', import.meta.ur
50
51
  const scaleRules = await loadJson('./kernels/scale.rules.json', import.meta.url, 'Failed to load rules');
51
52
  const siluRules = await loadJson('./kernels/silu.rules.json', import.meta.url, 'Failed to load rules');
52
53
  const splitQkvRules = await loadJson('./kernels/split-qkv.rules.json', import.meta.url, 'Failed to load rules');
54
+ const splitQgRules = await loadJson('./kernels/split-qg.rules.json', import.meta.url, 'Failed to load rules');
53
55
  const softmaxRules = await loadJson('./kernels/softmax.rules.json', import.meta.url, 'Failed to load rules');
54
56
  const upsample2dRules = await loadJson('./kernels/upsample2d.rules.json', import.meta.url, 'Failed to load rules');
55
57
  const configRules = await loadJson('./inference/config.rules.json', import.meta.url, 'Failed to load rules');
@@ -112,6 +114,7 @@ const RULE_SETS = {
112
114
  matmul: matmulRules,
113
115
  moe: kernelMoeRules,
114
116
  moeGptoss: kernelMoeGptOssRules,
117
+ moeMixtral: kernelMoeMixtralRules,
115
118
  modulate: modulateRules,
116
119
  pixel_shuffle: pixelShuffleRules,
117
120
  repeatChannels: repeatChannelsRules,
@@ -124,6 +127,7 @@ const RULE_SETS = {
124
127
  scale: scaleRules,
125
128
  silu: siluRules,
126
129
  splitQkv: splitQkvRules,
130
+ splitQg: splitQgRules,
127
131
  softmax: softmaxRules,
128
132
  upsample2d: upsample2dRules,
129
133
  },
@@ -2,6 +2,7 @@
2
2
 
3
3
  import {
4
4
  parseManifest,
5
+ getExpectedShardHash,
5
6
  getManifestUrl,
6
7
  } from '../formats/rdrr/index.js';
7
8
 
@@ -726,7 +727,7 @@ export async function downloadModel(
726
727
  if (!algorithm) {
727
728
  throw new Error('Manifest missing hashAlgorithm for download verification.');
728
729
  }
729
- const expectedHash = shardInfo.hash;
730
+ const expectedHash = getExpectedShardHash(shardInfo, algorithm);
730
731
  if (!expectedHash) {
731
732
  throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
732
733
  }
@@ -13,6 +13,7 @@
13
13
 
14
14
  import type { DownloadProgress } from './downloader.js';
15
15
  import type { PreflightResult, ModelRequirements } from './preflight.js';
16
+ import type { HfResolveConfig } from '../utils/hf-resolve-url.js';
16
17
 
17
18
  /**
18
19
  * Remote model configuration
@@ -24,6 +25,8 @@ export interface RemoteModelConfig {
24
25
  displayName: string;
25
26
  /** Base URL for shards (any static CDN) */
26
27
  baseUrl?: string | null;
28
+ /** Hosted Hugging Face source used when baseUrl is omitted */
29
+ hf?: HfResolveConfig | null;
27
30
  /** Model requirements for pre-flight checks */
28
31
  requirements: ModelRequirements;
29
32
  }
@@ -7,6 +7,7 @@ import {
7
7
  } from './preflight.js';
8
8
  import { formatBytes } from './quota.js';
9
9
  import { getCdnBasePath } from './download-types.js';
10
+ import { buildHfResolveBaseUrl, DEFAULT_HF_CDN_BASE_URL } from '../utils/hf-resolve-url.js';
10
11
 
11
12
  // ============================================================================
12
13
  // Model Registry
@@ -15,40 +16,14 @@ import { getCdnBasePath } from './download-types.js';
15
16
 
16
17
  let cdnBaseOverride = null;
17
18
 
18
-
19
- function getEffectiveCDNBaseUrl() {
20
- const runtimeBase = getCdnBasePath();
21
- const base = cdnBaseOverride ?? runtimeBase ?? '';
22
- if (base) return base;
23
-
24
- // Auto-detect: use same origin for Firebase Hosting or local dev
25
- if (typeof globalThis.location !== 'undefined') {
26
- const path = globalThis.location.pathname || '';
27
- if (
28
- path === '/d' ||
29
- path.startsWith('/d/') ||
30
- path === '/doppler' ||
31
- path.startsWith('/doppler/') ||
32
- path === '/dr' ||
33
- path.startsWith('/dr/') ||
34
- globalThis.location.host.includes('replo')
35
- ) {
36
- return `${globalThis.location.origin}/doppler/models`;
37
- }
38
- return `${globalThis.location.origin}/models`;
39
- }
40
- // Fallback for non-browser-global contexts
41
- return '/models';
42
- }
43
-
44
-
45
19
  export function setCDNBaseUrl(url) {
46
- cdnBaseOverride = url.replace(/\/$/, ''); // Remove trailing slash
20
+ const normalized = typeof url === 'string' ? url.trim().replace(/\/$/, '') : '';
21
+ cdnBaseOverride = normalized || null;
47
22
  }
48
23
 
49
24
 
50
25
  export function getCDNBaseUrl() {
51
- return getEffectiveCDNBaseUrl();
26
+ return cdnBaseOverride ?? getCdnBasePath() ?? DEFAULT_HF_CDN_BASE_URL;
52
27
  }
53
28
 
54
29
 
@@ -57,12 +32,22 @@ export const QUICKSTART_MODELS = {
57
32
  modelId: 'gemma-3-270m-it-q4k-ehf16-af32',
58
33
  displayName: 'Gemma 3 270M IT (Q4K)',
59
34
  baseUrl: null,
35
+ hf: {
36
+ repoId: 'Clocksmith/rdrr',
37
+ revision: 'ca6f0dbdf3882d3893a65cf48f2bb6f1520df162',
38
+ path: 'models/gemma-3-270m-it-q4k-ehf16-af32',
39
+ },
60
40
  requirements: MODEL_REQUIREMENTS['gemma-3-270m-it-q4k-ehf16-af32'],
61
41
  },
62
42
  'google-embeddinggemma-300m-q4k-ehf16-af32': {
63
43
  modelId: 'google-embeddinggemma-300m-q4k-ehf16-af32',
64
44
  displayName: 'EmbeddingGemma 300M (Q4K)',
65
45
  baseUrl: null,
46
+ hf: {
47
+ repoId: 'Clocksmith/rdrr',
48
+ revision: '7e79c466d54455bd370c81685956ea9abae0fd30',
49
+ path: 'models/google-embeddinggemma-300m-q4k-ehf16-af32',
50
+ },
66
51
  requirements: MODEL_REQUIREMENTS['google-embeddinggemma-300m-q4k-ehf16-af32'],
67
52
  },
68
53
  };
@@ -82,6 +67,18 @@ export function registerQuickStartModel(config) {
82
67
  QUICKSTART_MODELS[config.modelId] = config;
83
68
  }
84
69
 
70
+ function resolveQuickStartModelBaseUrl(config) {
71
+ if (typeof config?.baseUrl === 'string' && config.baseUrl.trim().length > 0) {
72
+ return config.baseUrl.trim().replace(/\/$/, '');
73
+ }
74
+ if (config?.hf) {
75
+ return buildHfResolveBaseUrl(config.hf, { cdnBasePath: getCDNBaseUrl() });
76
+ }
77
+ throw new Error(
78
+ `Quickstart model "${config?.modelId ?? 'unknown'}" is missing an explicit baseUrl or hosted Hugging Face source.`
79
+ );
80
+ }
81
+
85
82
  // ============================================================================
86
83
  // Download Functions
87
84
  // ============================================================================
@@ -190,7 +187,7 @@ export async function downloadQuickStartModel(
190
187
  signal,
191
188
  };
192
189
 
193
- const baseUrl = config.baseUrl ?? `${getEffectiveCDNBaseUrl()}/${config.modelId}`;
190
+ const baseUrl = resolveQuickStartModelBaseUrl(config);
194
191
  const success = await downloadModel(
195
192
  baseUrl,
196
193
  onProgress,
@@ -1,5 +1,6 @@
1
1
  import {
2
2
  getManifest,
3
+ getExpectedShardHash,
3
4
  getShardInfo,
4
5
  getShardCount,
5
6
  generateShardFilename,
@@ -280,7 +281,7 @@ export async function writeShard(shardIndex, data, options = { verify: true }) {
280
281
  const manifest = getManifest();
281
282
  const algorithm = requireManifestHashAlgorithm(manifest, 'shard write');
282
283
  const hash = await computeHash(bytes, algorithm);
283
- const expectedHash = shardInfo.hash;
284
+ const expectedHash = getExpectedShardHash(shardInfo, algorithm);
284
285
  if (!expectedHash) {
285
286
  await backend.deleteFile(shardInfo.filename);
286
287
  throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
@@ -369,7 +370,7 @@ export async function loadShard(shardIndex, options = { verify: false }) {
369
370
  const manifest = getManifest();
370
371
  const algorithm = requireManifestHashAlgorithm(manifest, 'shard load');
371
372
  const hash = await computeHash(buffer, algorithm);
372
- const expectedHash = shardInfo.hash;
373
+ const expectedHash = getExpectedShardHash(shardInfo, algorithm);
373
374
  if (!expectedHash) {
374
375
  throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
375
376
  }
@@ -531,7 +532,7 @@ export async function verifyIntegrity(options = {}) {
531
532
  const buffer = await loadShard(i, { verify: false });
532
533
  const hash = await computeHash(buffer, algorithm);
533
534
  const shardInfo = getShardInfo(i);
534
- const expectedHash = shardInfo?.hash;
535
+ const expectedHash = getExpectedShardHash(shardInfo, algorithm);
535
536
  if (!expectedHash) {
536
537
  corruptShards.push(i);
537
538
  continue;
@@ -2,6 +2,7 @@ import path from 'node:path';
2
2
 
3
3
  import { createConverterConfig } from '../config/schema/index.js';
4
4
  import { resolveConversionPlan } from '../converter/conversion-plan.js';
5
+ import { normalizeQuantTag } from '../converter/quantization-info.js';
5
6
 
6
7
  function toSafeString(value) {
7
8
  if (typeof value !== 'string') return '';
@@ -10,10 +11,7 @@ function toSafeString(value) {
10
11
  }
11
12
 
12
13
  function normalizeQuantizationTag(value) {
13
- const raw = toSafeString(value).toUpperCase();
14
- if (!raw) return 'f16';
15
- if (raw === 'Q4_K_M' || raw === 'Q4_K') return 'q4k';
16
- return raw.toLowerCase();
14
+ return normalizeQuantTag(toSafeString(value));
17
15
  }
18
16
 
19
17
  function resolveArchitectureHint(architecture) {
@@ -37,7 +35,7 @@ function extractSourceQuantization(manifest) {
37
35
  if (explicitWeights) return explicitWeights;
38
36
  const explicitQuant = toSafeString(manifest?.quantization);
39
37
  if (explicitQuant) return explicitQuant;
40
- return 'f16';
38
+ return normalizeQuantTag(null);
41
39
  }
42
40
 
43
41
  function buildRefreshRawConfig(manifest) {
@@ -541,18 +541,24 @@ async function listRelativeFiles(rootDir, relDir = '', out = []) {
541
541
  return out;
542
542
  }
543
543
 
544
- async function clearExistingShardFiles(outputDir) {
544
+ async function clearExistingConversionOutputs(outputDir) {
545
545
  let entries;
546
546
  try {
547
547
  entries = await fs.readdir(outputDir, { withFileTypes: true });
548
548
  } catch {
549
549
  return;
550
550
  }
551
- const shardFiles = entries
552
- .filter((entry) => entry.isFile() && /^shard_\d{5}\.bin$/i.test(entry.name))
551
+ const artifactFiles = entries
552
+ .filter((entry) => (
553
+ entry.isFile()
554
+ && (
555
+ /^shard_\d{5}\.bin$/i.test(entry.name)
556
+ || entry.name === 'manifest.json'
557
+ )
558
+ ))
553
559
  .map((entry) => path.join(outputDir, entry.name));
554
- if (shardFiles.length === 0) return;
555
- await Promise.all(shardFiles.map((filePath) => fs.unlink(filePath)));
560
+ if (artifactFiles.length === 0) return;
561
+ await Promise.all(artifactFiles.map((filePath) => fs.unlink(filePath)));
556
562
  }
557
563
 
558
564
  function createNodeConvertIO(outputDir, options) {
@@ -875,6 +881,7 @@ export async function convertSafetensorsDirectory(options) {
875
881
  let sourceQuantization = null;
876
882
  let tokenizerJson = null;
877
883
  let tokenizerConfig = null;
884
+ let generationConfig = null;
878
885
  let hasTokenizerModel = false;
879
886
  let tokenizerModelPath = null;
880
887
  let diffusionAuxFiles = [];
@@ -1101,6 +1108,7 @@ export async function convertSafetensorsDirectory(options) {
1101
1108
  },
1102
1109
  });
1103
1110
  config = parsedTransformer.config;
1111
+ generationConfig = parsedTransformer.generationConfig ?? null;
1104
1112
  tensors = parsedTransformer.tensors;
1105
1113
  architectureHint = parsedTransformer.architectureHint;
1106
1114
  architecture = extractArchitecture(config, null);
@@ -1151,7 +1159,7 @@ export async function convertSafetensorsDirectory(options) {
1151
1159
  const outputDir = resolveOutputDir(outputDirOverride, converterConfig, modelId);
1152
1160
 
1153
1161
  await fs.mkdir(outputDir, { recursive: true });
1154
- await clearExistingShardFiles(outputDir);
1162
+ await clearExistingConversionOutputs(outputDir);
1155
1163
 
1156
1164
  const model = {
1157
1165
  name: path.basename(inputDir),
@@ -1169,6 +1177,7 @@ export async function convertSafetensorsDirectory(options) {
1169
1177
  quantization: targetQuantization,
1170
1178
  tokenizerJson,
1171
1179
  tokenizerConfig,
1180
+ generationConfig,
1172
1181
  tokenizerModel: hasTokenizerModel ? 'tokenizer.model' : null,
1173
1182
  };
1174
1183
 
@@ -1177,6 +1186,15 @@ export async function convertSafetensorsDirectory(options) {
1177
1186
  computeHash,
1178
1187
  readRange: fileRangeReader.readRange,
1179
1188
  });
1189
+ const deferredManifestState = {
1190
+ manifest: null,
1191
+ };
1192
+ const convertIo = {
1193
+ ...io,
1194
+ async writeManifest(manifest) {
1195
+ deferredManifestState.manifest = manifest;
1196
+ },
1197
+ };
1180
1198
  const manifestArchitecture = modelKind === 'diffusion' ? 'diffusion' : architecture;
1181
1199
  let workerPool = null;
1182
1200
  let workerTensorTransformer = null;
@@ -1241,7 +1259,7 @@ export async function convertSafetensorsDirectory(options) {
1241
1259
  }));
1242
1260
 
1243
1261
  const convertTimer = createStageTimer('Convert tensors');
1244
- result = await convertModel(model, io, {
1262
+ result = await convertModel(model, convertIo, {
1245
1263
  modelId,
1246
1264
  modelType: resolvedModelType,
1247
1265
  quantization: targetQuantization,
@@ -1279,6 +1297,9 @@ export async function convertSafetensorsDirectory(options) {
1279
1297
  }
1280
1298
 
1281
1299
  normalizeTokenizerManifest(result.manifest);
1300
+ if (!deferredManifestState.manifest) {
1301
+ throw new Error('node convert: convert core did not produce a manifest.');
1302
+ }
1282
1303
  await io.writeManifest(result.manifest);
1283
1304
 
1284
1305
  const report = buildConvertReport(result, {