npm - @simulatte/doppler - Versions diffs - 0.1.7 → 0.1.9 - Mend

@simulatte/doppler 0.1.7 → 0.1.9

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (172)

package/CHANGELOG.md +32 -0
package/README.md +25 -6
package/package.json +25 -38
package/src/browser/browser-converter.js +5 -0
package/src/client/doppler-api.browser.js +6 -0
package/src/client/doppler-api.d.ts +3 -0
package/src/client/doppler-api.js +11 -2
package/src/client/doppler-registry.js +3 -5
package/src/client/doppler-registry.json +2 -2
package/src/config/kernel-path-loader.d.ts +5 -0
package/src/config/kernel-path-loader.js +13 -0
package/src/config/kernels/kernel-ref-digests.js +23 -21
package/src/config/kernels/moe/mixtral.paths.json +46 -0
package/src/config/kernels/registry.json +74 -0
package/src/config/loader.js +9 -0
package/src/config/merge-contract-check.js +7 -0
package/src/config/platforms/loader.js +3 -1
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
package/src/config/presets/kernel-paths/registry.json +21 -0
package/src/config/presets/models/gemma2.json +2 -1
package/src/config/presets/models/gemma3.json +4 -1
package/src/config/presets/models/gemma4.json +61 -0
package/src/config/presets/models/granite-docling.json +70 -0
package/src/config/presets/models/lfm2.json +6 -1
package/src/config/presets/models/qwen3.json +4 -3
package/src/config/presets/models/qwen3_5.json +16 -0
package/src/config/presets/models/qwen3_vl.json +40 -0
package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
package/src/config/presets/runtime/modes/trace-layers.json +1 -0
package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
package/src/config/runtime.js +3 -0
package/src/config/schema/conversion.schema.d.ts +1 -0
package/src/config/schema/debug.schema.d.ts +40 -0
package/src/config/schema/debug.schema.js +28 -0
package/src/config/schema/index.js +2 -0
package/src/config/schema/inference-defaults.schema.js +1 -1
package/src/config/schema/kernel-path.schema.d.ts +1 -0
package/src/config/schema/manifest.schema.d.ts +1 -1
package/src/config/schema/manifest.schema.js +1 -1
package/src/config/schema/memory-limits.schema.js +2 -2
package/src/config/schema/storage.schema.js +2 -2
package/src/converter/conversion-plan.js +11 -3
package/src/converter/core.js +19 -8
package/src/converter/manifest-inference.js +12 -22
package/src/converter/parsers/transformer.js +4 -0
package/src/converter/quantization-info.js +5 -1
package/src/converter/quantizer.d.ts +5 -0
package/src/converter/quantizer.js +34 -12
package/src/converter/rope-config.js +8 -6
package/src/converter/tokenizer-utils.d.ts +1 -0
package/src/converter/tokenizer-utils.js +4 -1
package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
package/src/distribution/shard-delivery.js +40 -1
package/src/formats/rdrr/classification.js +32 -0
package/src/formats/rdrr/parsing.d.ts +4 -0
package/src/formats/rdrr/parsing.js +14 -1
package/src/gpu/kernel-runtime.js +4 -2
package/src/gpu/kernels/attention.js +2 -1
package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
package/src/gpu/kernels/dequant_shared.wgsl +4 -2
package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
package/src/gpu/kernels/gated-short-conv.js +284 -0
package/src/gpu/kernels/index.d.ts +8 -0
package/src/gpu/kernels/index.js +6 -0
package/src/gpu/kernels/linear-attention-core.js +37 -17
package/src/gpu/kernels/matmul-selection.js +48 -4
package/src/gpu/kernels/matmul.d.ts +5 -0
package/src/gpu/kernels/matmul.js +71 -2
package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
package/src/gpu/kernels/rmsnorm.js +9 -2
package/src/gpu/kernels/sample.js +1 -3
package/src/gpu/kernels/sample.wgsl +39 -9
package/src/gpu/kernels/sample_f16.wgsl +38 -8
package/src/gpu/kernels/shader-cache.js +9 -4
package/src/gpu/kernels/split_qg.d.ts +50 -0
package/src/gpu/kernels/split_qg.js +46 -0
package/src/gpu/kernels/split_qg.wgsl +58 -0
package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
package/src/gpu/weight-buffer.d.ts +1 -1
package/src/gpu/weight-buffer.js +1 -1
package/src/inference/browser-harness.d.ts +2 -0
package/src/inference/browser-harness.js +20 -1
package/src/inference/kv-cache/base.js +3 -10
package/src/inference/pipelines/diffusion/helpers.js +3 -0
package/src/inference/pipelines/diffusion/pipeline.js +2 -1
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
package/src/inference/pipelines/text/attention/output-projection.js +8 -0
package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
package/src/inference/pipelines/text/attention/projections.js +54 -13
package/src/inference/pipelines/text/attention/record.js +16 -6
package/src/inference/pipelines/text/attention/run.js +59 -6
package/src/inference/pipelines/text/config.d.ts +1 -0
package/src/inference/pipelines/text/config.js +46 -4
package/src/inference/pipelines/text/embed.js +26 -7
package/src/inference/pipelines/text/execution-plan.js +5 -4
package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
package/src/inference/pipelines/text/execution-v0.js +12 -1
package/src/inference/pipelines/text/generator-helpers.js +1 -0
package/src/inference/pipelines/text/generator-runtime.js +19 -0
package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
package/src/inference/pipelines/text/generator-steps.js +71 -26
package/src/inference/pipelines/text/generator.d.ts +5 -0
package/src/inference/pipelines/text/generator.js +353 -166
package/src/inference/pipelines/text/init.d.ts +15 -0
package/src/inference/pipelines/text/init.js +35 -10
package/src/inference/pipelines/text/layer.js +38 -8
package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
package/src/inference/pipelines/text/linear-attention.js +33 -3
package/src/inference/pipelines/text/logits/gpu.js +2 -2
package/src/inference/pipelines/text/logits/index.d.ts +6 -1
package/src/inference/pipelines/text/logits/index.js +3 -1
package/src/inference/pipelines/text/model-load.js +3 -0
package/src/inference/pipelines/text/moe-gpu.js +21 -3
package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
package/src/inference/pipelines/text/ops.js +123 -53
package/src/inference/pipelines/text/probes.js +1 -0
package/src/inference/pipelines/text/sampling.js +52 -6
package/src/inference/pipelines/text/state.js +2 -0
package/src/inference/pipelines/text.d.ts +5 -0
package/src/inference/pipelines/text.js +59 -1
package/src/inference/pipelines/vision/encoder.js +386 -0
package/src/inference/pipelines/vision/image-preprocess.js +151 -0
package/src/inference/pipelines/vision/index.js +173 -0
package/src/inference/pipelines/vision/ops.js +78 -0
package/src/inference/pipelines/vision/patch-embed.js +151 -0
package/src/inference/test-harness.js +11 -9
package/src/loader/doppler-loader.d.ts +3 -0
package/src/loader/doppler-loader.js +20 -3
package/src/loader/experts/expert-cache.js +6 -2
package/src/loader/experts/expert-loader.js +6 -2
package/src/loader/final-weights-loader.js +2 -0
package/src/loader/layer-loader.js +42 -3
package/src/loader/manifest-config.js +3 -1
package/src/loader/shard-cache.js +3 -2
package/src/loader/tensors/tensor-loader.d.ts +3 -0
package/src/loader/tensors/tensor-loader.js +130 -4
package/src/rules/inference/dtype.rules.json +5 -0
package/src/rules/inference/kernel-path.rules.json +2 -2
package/src/rules/kernels/moe.rules.mixtral.json +75 -0
package/src/rules/kernels/softmax.rules.json +2 -0
package/src/rules/kernels/split-qg.rules.json +6 -0
package/src/rules/rule-registry.d.ts +1 -0
package/src/rules/rule-registry.js +4 -0
package/src/storage/downloader.js +2 -1
package/src/storage/quickstart-downloader.d.ts +3 -0
package/src/storage/quickstart-downloader.js +27 -30
package/src/storage/shard-manager.js +4 -3
package/src/tooling/conversion-config-materializer.js +3 -5
package/src/tooling/node-converter.js +28 -7
package/src/tooling/node-source-runtime.js +65 -5
package/src/tooling/node-webgpu.js +24 -7
package/src/types/model.d.ts +5 -0
package/src/utils/hf-resolve-url.d.ts +16 -0
package/src/utils/hf-resolve-url.js +17 -0
package/src/version.js +1 -1
package/tools/doppler-cli.js +6 -1
package/src/tooling/node-convert.d.ts +0 -54

package/src/loader/tensors/tensor-loader.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { getDevice, getKernelCapabilities } from '../../gpu/device.js';
-import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
+import { acquireBuffer, releaseBuffer, readBuffer } from '../../memory/buffer-pool.js';
 import { dequantize, dequantizeRowwise, dequantizeQ6K, castF16ToF32, runBF16ToF16 } from '../../gpu/kernel-selector.js';
 import { createTensor } from '../../gpu/tensor.js';
 import { createWeightBuffer } from '../../gpu/weight-buffer.js';
@@ -9,6 +9,7 @@ import { f16ToF32, convertBF16ToF32GPU, shouldDequantizeToF16, applyBufferLayout
 import { QK_K, Q4K_BLOCK_BYTES, Q6K_BLOCK_BYTES } from '../quantization-constants.js';
 import { log, trace as debugTrace } from '../../debug/index.js';
 import { selectRuleValue } from '../../rules/rule-registry.js';
+import { dequantizeQ4KM, dequantizeQ4KMRowWise } from '../../converter/quantizer.js';
 // ============================================================================
 // Q4K Detection
@@ -31,6 +32,24 @@ function releaseOwnedGpuBuffer(buffer, owned) {
   releaseBuffer(buffer);
 }
+function normalizeLoaderDebugConfig(config) {
+  const debug = config?.loaderDebug;
+  if (!debug || typeof debug !== 'object') {
+    return null;
+  }
+  return {
+    enabled: debug.enabled === true,
+    forceGpuDequant: debug.forceGpuDequant === true,
+    preferCpuDequant: debug.preferCpuDequant === true,
+    failOnCpuDequantPath: debug.failOnCpuDequantPath === true,
+    runQ4KDequantParity: debug.runQ4KDequantParity === true,
+    q4kDequantParitySamples: Number.isFinite(debug.q4kDequantParitySamples)
+      ? Math.min(4096, Math.max(1, Math.trunc(debug.q4kDequantParitySamples)))
+      : 256,
+  };
+}
 function logF32UpcastNonMatmul(name, numElements, bufferSize) {
   if (loggedF32UpcastNonMatmul) {
     return;
@@ -199,11 +218,52 @@ export async function loadQ4KDequant(shardData, location, name, config) {
     }
     const outputDtype = getQ4KOutputDtype(location, config);
+    const loaderDebug = normalizeLoaderDebugConfig(config);
+    const debugEnabled = loaderDebug?.enabled === true;
+    const forceGpuDequant = loaderDebug?.forceGpuDequant === true;
+    const failOnCpuDequantPath = loaderDebug?.failOnCpuDequantPath === true;
+    const runQ4KDequantParity = loaderDebug?.runQ4KDequantParity === true;
+    const paritySamples = loaderDebug?.q4kDequantParitySamples ?? 256;
     const is2DMatrix = Array.isArray(location.shape) && location.shape.length === 2;
     const K = is2DMatrix ? location.shape[1] : 0;
     const needsRowwise = is2DMatrix && K > 0 && K % QK_K !== 0;
+    const layout = getWeightLayout(location, config);
+    const preferCpuDequant = loaderDebug?.preferCpuDequant === true;
+    const canUseCpuReference = !forceGpuDequant && preferCpuDequant && (
+      outputDtype === 'f32'
+      && !isGpuBufferInstance(shardData)
+      && (!needsRowwise || layout === 'row')
+    );
+    if (canUseCpuReference && failOnCpuDequantPath) {
+      throw new Error(
+        `[LoaderDebug] CPU dequant path taken for ${name}; this run is configured fail-closed. ` +
+        'Set runtime.shared.debug.loader.forceGpuDequant=true to isolate GPU dequant.'
+      );
+    }
+    if (canUseCpuReference) {
+      const quantizedBytes = toUint8View(shardData);
+      const numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
+      debugTrace.loader(
+        `Dequantizing ${name} with CPU reference path: ` +
+        `shape=[${location.shape.join(',')}], layout=${layout}, needsRowwise=${needsRowwise}`
+      );
+      const f32Weights = needsRowwise
+        ? dequantizeQ4KMRowWise(quantizedBytes, location.shape)
+        : dequantizeQ4KM(quantizedBytes, numBlocks, location.shape);
+      const outputBuffer = acquireAlignedBuffer(f32Weights.byteLength, `dequant_cpu_${name}`);
+      writeBufferAligned(device, outputBuffer, new Uint8Array(f32Weights.buffer));
+      releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
+      ownsQuantBuffer = false;
+      return {
+        data: createWeightBuffer(outputBuffer, 'f32', layout, location.shape, name),
+        allocatedBuffers: [outputBuffer],
+      };
+    }
+    let numBlocks = null;
     let dequantizedTensor;
     if (needsRowwise) {
       const rows = location.shape[0];
@@ -213,7 +273,7 @@ export async function loadQ4KDequant(shardData, location, name, config) {
       );
       dequantizedTensor = await dequantizeRowwise(quantBuffer, rows, K, { outputDtype });
     } else {
-      const numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
+      numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
       debugTrace.loader(
         `Dequantizing ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
         `outputDtype=${outputDtype}, expectedOutput=${numBlocks * QK_K * (outputDtype === 'f16' ? 2 : 4)}`
@@ -223,10 +283,71 @@ export async function loadQ4KDequant(shardData, location, name, config) {
     dequantized = dequantizedTensor.buffer;
     debugTrace.loader(`Dequantized ${name}: resultSize=${dequantized.size}`);
+    if (runQ4KDequantParity && !isGpuBufferInstance(shardData) && dequantized && numBlocks !== null) {
+      const isProbeTarget = debugEnabled &&
+        (name.includes('.self_attn.q_proj.weight') || name.includes('.self_attn.k_proj.weight') ||
+          name.includes('.self_attn.v_proj.weight') || name.includes('.self_attn.qkv_proj.weight'));
+      if (isProbeTarget) {
+        try {
+          const bytesPerElem = outputDtype === 'f16' ? 2 : 4;
+          const requestedOutputBytes = numBlocks * QK_K * bytesPerElem;
+          const sampleCount = paritySamples;
+          const readSize = Math.min(sampleCount * bytesPerElem, dequantized.size);
+          const gpuRaw = await readBuffer(dequantized, readSize);
+          const gpuBytes = gpuRaw instanceof ArrayBuffer
+            ? new Uint8Array(gpuRaw)
+            : new Uint8Array(gpuRaw.buffer, gpuRaw.byteOffset, gpuRaw.byteLength);
+          let gpuVals;
+          if (outputDtype === 'f16') {
+            const u16 = new Uint16Array(gpuBytes.buffer, gpuBytes.byteOffset,
+              Math.min(sampleCount, Math.floor(gpuBytes.byteLength / 2)));
+            gpuVals = Array.from(u16, (half) => f16ToF32(half));
+          } else {
+            const f32 = new Float32Array(gpuBytes.buffer, gpuBytes.byteOffset,
+              Math.min(sampleCount, Math.floor(gpuBytes.byteLength / 4)));
+            gpuVals = Array.from(f32);
+          }
+          const quantizedBytes = toUint8View(shardData);
+          const cpuRef = Array.from(
+            needsRowwise
+              ? dequantizeQ4KMRowWise(quantizedBytes, location.shape)
+              : dequantizeQ4KM(quantizedBytes, numBlocks, location.shape)
+          ).slice(0, gpuVals.length);
+          let maxDiff = 0;
+          let diffIdx = -1;
+          for (let i = 0; i < gpuVals.length && i < cpuRef.length; i++) {
+            const d = Math.abs(gpuVals[i] - cpuRef[i]);
+            if (d > maxDiff) {
+              maxDiff = d;
+              diffIdx = i;
+            }
+          }
+          log.warn('DequantProbe',
+            `tensor="${name}" shape=[${location.shape}] ` +
+            `location.size=${location.size} numBlocks=${numBlocks} outputDtype=${outputDtype} ` +
+            `bytesPerElem=${bytesPerElem} requestedOutputBytes=${requestedOutputBytes} bufSize=${dequantized.size} ` +
+            `runParity=true sampleCount=${sampleCount}`
+          );
+          log.warn('DequantProbe',
+            `parity: maxDiff=${maxDiff.toFixed(8)} at idx=${diffIdx} ` +
+            `gpu[0..3]=[${gpuVals.slice(0, 4).map((v) => v.toFixed(6))}] ` +
+            `cpu[0..3]=[${cpuRef.slice(0, 4).map((v) => v.toFixed(6))}]`
+          );
+        } catch (e) {
+          log.warn('DequantProbe', `Readback failed: ${e.message}`);
+        }
+      }
+    }
     releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
     ownsQuantBuffer = false;
-    const layout = getWeightLayout(location, config);
     const dtype = outputDtype;
     return {
@@ -309,8 +430,9 @@ export async function loadBF16(shardData, location, name, config) {
     const numElements = location.size / 2;
     const caps = config.gpuCapabilities || getKernelCapabilities();
     const isMatmulWeight = shouldDequantizeToF16(location);
+    const keepF32Weights = config.keepF32Weights === true;
-    if (caps?.hasF16 && isMatmulWeight) {
+    if (caps?.hasF16 && isMatmulWeight && !keepF32Weights) {
       const f16Tensor = await runBF16ToF16(srcBuffer, [numElements], name);
       resultBuffer = f16Tensor.buffer;
       releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
@@ -327,6 +449,10 @@ export async function loadBF16(shardData, location, name, config) {
       };
     }
+    if (isMatmulWeight && keepF32Weights) {
+      debugTrace.loader(`Keeping BF16 matmul weight in f32: ${name} (keepF32Weights=true)`);
+    }
     const dstBuffer = await convertBF16ToF32GPU(srcBuffer, numElements, name);
     resultBuffer = dstBuffer;
     releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);

package/src/rules/inference/dtype.rules.json CHANGED Viewed

@@ -59,6 +59,11 @@
     { "match": { "useF16": true }, "value": "f16" },
     { "match": {}, "value": { "context": "fallback" } }
   ],
+  "attentionProjectionOutputDtype": [
+    { "match": { "forceF32": true }, "value": "f32" },
+    { "match": { "useF16": true }, "value": "f16" },
+    { "match": {}, "value": { "context": "fallback" } }
+  ],
   "bytesPerElement": [
     { "match": { "dtype": "f16" }, "value": 2 },
     { "match": {}, "value": 4 }

package/src/rules/inference/kernel-path.rules.json CHANGED Viewed

@@ -46,7 +46,7 @@
         "hasSubgroups": false,
         "kernelPathRef": "lfm2-q4k-dequant-f32a-online"
       },
-      "value": "gemma3-q4k-dequant-f32a-nosubgroups"
+      "value": "lfm2-q4k-dequant-f32a-nosubgroups"
     },
     {
       "match": {
@@ -77,7 +77,7 @@
     },
     {
       "match": { "kernelPathId": "lfm2-q4k-dequant-f32a-online" },
-      "value": "gemma3-q4k-dequant-f32a-nosubgroups"
+      "value": "lfm2-q4k-dequant-f32a-nosubgroups"
     },
     {
       "match": { "kernelPathId": "gemma2-f16-f16a" },

package/src/rules/kernels/moe.rules.mixtral.json ADDED Viewed

@@ -0,0 +1,75 @@
+{
+  "vendorQuirkProfile": [
+    {
+      "match": {
+        "vendor": {
+          "contains": ["intel", "amd"]
+        }
+      },
+      "value": {
+        "preferVec4Dequant": false,
+        "dequantTileShape": "scalar",
+        "routerWorkgroupSize": 128,
+        "maxTokensPerExpertScale": 0.85
+      }
+    },
+    {
+      "match": {
+        "vendor": {
+          "contains": ["nvidia", "apple", "qualcomm"]
+        }
+      },
+      "value": {
+        "preferVec4Dequant": false,
+        "dequantTileShape": "scalar",
+        "routerWorkgroupSize": 256,
+        "maxTokensPerExpertScale": 1.0
+      }
+    },
+    {
+      "match": {},
+      "value": {
+        "preferVec4Dequant": false,
+        "dequantTileShape": "scalar",
+        "routerWorkgroupSize": 128,
+        "maxTokensPerExpertScale": 1.0
+      }
+    }
+  ],
+  "routerTopKVariant": [
+    {
+      "match": { "modelType": "mixtral", "hasF16": true, "hasSubgroups": true, "routerDtype": "f32" },
+      "value": "softmax_topk_f32_subgroup"
+    },
+    {
+      "match": { "modelType": "mixtral", "routerDtype": "f32" },
+      "value": "softmax_topk_f32"
+    },
+    {
+      "match": { "modelType": "mixtral" },
+      "value": "softmax_topk_f32"
+    }
+  ],
+  "dequantVariant": [
+    {
+      "match": { "modelType": "mixtral", "weightsDtype": "q4k", "hasF16": true, "hasSubgroups": true, "outputDtype": "f32" },
+      "value": "q4k_expert_dequant_f32_subgroup"
+    },
+    {
+      "match": { "modelType": "mixtral", "weightsDtype": "q4k", "outputDtype": "f16", "hasF16": true },
+      "value": "q4k_expert_dequant_f16"
+    },
+    {
+      "match": { "modelType": "mixtral", "weightsDtype": "q4k" },
+      "value": "q4k_expert_dequant_f32"
+    },
+    {
+      "match": { "modelType": "mixtral", "weightsDtype": "f16", "outputDtype": "f16", "hasF16": true },
+      "value": "f16_expert_passthrough"
+    },
+    {
+      "match": { "modelType": "mixtral" },
+      "value": "f16_expert_upcast_f32"
+    }
+  ]
+}

package/src/rules/kernels/softmax.rules.json CHANGED Viewed

@@ -16,6 +16,8 @@
       },
       "value": "gptoss_router_topk"
     },
+    { "match": { "modelType": "mixtral", "inputDtype": "f16", "weightsDtype": "f16" }, "value": "fused_f16_w16" },
+    { "match": { "modelType": "mixtral" }, "value": "fused" },
     { "match": { "inputDtype": "f16", "weightsDtype": "f16" }, "value": "fused_f16_w16" },
     { "match": { "inputDtype": "f16" }, "value": "fused_f16" },
     { "match": {}, "value": "fused" }

package/src/rules/kernels/split-qg.rules.json ADDED Viewed

@@ -0,0 +1,6 @@
+{
+  "variant": [
+    { "match": { "outputDtype": "f16" }, "value": "f16" },
+    { "match": {}, "value": "default" }
+  ]
+}

package/src/rules/rule-registry.d.ts CHANGED Viewed

@@ -20,6 +20,7 @@ type KernelRuleGroup =
   | 'matmul'
   | 'moe'
   | 'moeGptoss'
+  | 'moeMixtral'
   | 'residual'
   | 'rmsnorm'
   | 'rope'

package/src/rules/rule-registry.js CHANGED Viewed

@@ -38,6 +38,7 @@ const layernormRules = await loadJson('./kernels/layernorm.rules.json', import.m
 const matmulRules = await loadJson('./kernels/matmul.rules.json', import.meta.url, 'Failed to load rules');
 const kernelMoeRules = await loadJson('./kernels/moe.rules.json', import.meta.url, 'Failed to load rules');
 const kernelMoeGptOssRules = await loadJson('./kernels/moe.rules.gptoss.json', import.meta.url, 'Failed to load rules');
+const kernelMoeMixtralRules = await loadJson('./kernels/moe.rules.mixtral.json', import.meta.url, 'Failed to load rules');
 const modulateRules = await loadJson('./kernels/modulate.rules.json', import.meta.url, 'Failed to load rules');
 const pixelShuffleRules = await loadJson('./kernels/pixel_shuffle.rules.json', import.meta.url, 'Failed to load rules');
 const repeatChannelsRules = await loadJson('./kernels/repeat-channels.rules.json', import.meta.url, 'Failed to load rules');
@@ -50,6 +51,7 @@ const sampleRules = await loadJson('./kernels/sample.rules.json', import.meta.ur
 const scaleRules = await loadJson('./kernels/scale.rules.json', import.meta.url, 'Failed to load rules');
 const siluRules = await loadJson('./kernels/silu.rules.json', import.meta.url, 'Failed to load rules');
 const splitQkvRules = await loadJson('./kernels/split-qkv.rules.json', import.meta.url, 'Failed to load rules');
+const splitQgRules = await loadJson('./kernels/split-qg.rules.json', import.meta.url, 'Failed to load rules');
 const softmaxRules = await loadJson('./kernels/softmax.rules.json', import.meta.url, 'Failed to load rules');
 const upsample2dRules = await loadJson('./kernels/upsample2d.rules.json', import.meta.url, 'Failed to load rules');
 const configRules = await loadJson('./inference/config.rules.json', import.meta.url, 'Failed to load rules');
@@ -112,6 +114,7 @@ const RULE_SETS = {
     matmul: matmulRules,
     moe: kernelMoeRules,
     moeGptoss: kernelMoeGptOssRules,
+    moeMixtral: kernelMoeMixtralRules,
     modulate: modulateRules,
     pixel_shuffle: pixelShuffleRules,
     repeatChannels: repeatChannelsRules,
@@ -124,6 +127,7 @@ const RULE_SETS = {
     scale: scaleRules,
     silu: siluRules,
     splitQkv: splitQkvRules,
+    splitQg: splitQgRules,
     softmax: softmaxRules,
     upsample2d: upsample2dRules,
   },

package/src/storage/downloader.js CHANGED Viewed

@@ -2,6 +2,7 @@
 import {
   parseManifest,
+  getExpectedShardHash,
   getManifestUrl,
 } from '../formats/rdrr/index.js';
@@ -726,7 +727,7 @@ export async function downloadModel(
         if (!algorithm) {
           throw new Error('Manifest missing hashAlgorithm for download verification.');
         }
-        const expectedHash = shardInfo.hash;
+        const expectedHash = getExpectedShardHash(shardInfo, algorithm);
         if (!expectedHash) {
           throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
         }

package/src/storage/quickstart-downloader.d.ts CHANGED Viewed

@@ -13,6 +13,7 @@
 import type { DownloadProgress } from './downloader.js';
 import type { PreflightResult, ModelRequirements } from './preflight.js';
+import type { HfResolveConfig } from '../utils/hf-resolve-url.js';
 /**
  * Remote model configuration
@@ -24,6 +25,8 @@ export interface RemoteModelConfig {
   displayName: string;
   /** Base URL for shards (any static CDN) */
   baseUrl?: string | null;
+  /** Hosted Hugging Face source used when baseUrl is omitted */
+  hf?: HfResolveConfig | null;
   /** Model requirements for pre-flight checks */
   requirements: ModelRequirements;
 }

package/src/storage/quickstart-downloader.js CHANGED Viewed

@@ -7,6 +7,7 @@ import {
 } from './preflight.js';
 import { formatBytes } from './quota.js';
 import { getCdnBasePath } from './download-types.js';
+import { buildHfResolveBaseUrl, DEFAULT_HF_CDN_BASE_URL } from '../utils/hf-resolve-url.js';
 // ============================================================================
 // Model Registry
@@ -15,40 +16,14 @@ import { getCdnBasePath } from './download-types.js';
 let cdnBaseOverride = null;
-function getEffectiveCDNBaseUrl() {
-  const runtimeBase = getCdnBasePath();
-  const base = cdnBaseOverride ?? runtimeBase ?? '';
-  if (base) return base;
-  // Auto-detect: use same origin for Firebase Hosting or local dev
-  if (typeof globalThis.location !== 'undefined') {
-    const path = globalThis.location.pathname || '';
-    if (
-      path === '/d' ||
-      path.startsWith('/d/') ||
-      path === '/doppler' ||
-      path.startsWith('/doppler/') ||
-      path === '/dr' ||
-      path.startsWith('/dr/') ||
-      globalThis.location.host.includes('replo')
-    ) {
-      return `${globalThis.location.origin}/doppler/models`;
-    }
-    return `${globalThis.location.origin}/models`;
-  }
-  // Fallback for non-browser-global contexts
-  return '/models';
-}
 export function setCDNBaseUrl(url) {
-  cdnBaseOverride = url.replace(/\/$/, ''); // Remove trailing slash
+  const normalized = typeof url === 'string' ? url.trim().replace(/\/$/, '') : '';
+  cdnBaseOverride = normalized || null;
 }
 export function getCDNBaseUrl() {
-  return getEffectiveCDNBaseUrl();
+  return cdnBaseOverride ?? getCdnBasePath() ?? DEFAULT_HF_CDN_BASE_URL;
 }
@@ -57,12 +32,22 @@ export const QUICKSTART_MODELS = {
     modelId: 'gemma-3-270m-it-q4k-ehf16-af32',
     displayName: 'Gemma 3 270M IT (Q4K)',
     baseUrl: null,
+    hf: {
+      repoId: 'Clocksmith/rdrr',
+      revision: 'ca6f0dbdf3882d3893a65cf48f2bb6f1520df162',
+      path: 'models/gemma-3-270m-it-q4k-ehf16-af32',
+    },
     requirements: MODEL_REQUIREMENTS['gemma-3-270m-it-q4k-ehf16-af32'],
   },
   'google-embeddinggemma-300m-q4k-ehf16-af32': {
     modelId: 'google-embeddinggemma-300m-q4k-ehf16-af32',
     displayName: 'EmbeddingGemma 300M (Q4K)',
     baseUrl: null,
+    hf: {
+      repoId: 'Clocksmith/rdrr',
+      revision: '7e79c466d54455bd370c81685956ea9abae0fd30',
+      path: 'models/google-embeddinggemma-300m-q4k-ehf16-af32',
+    },
     requirements: MODEL_REQUIREMENTS['google-embeddinggemma-300m-q4k-ehf16-af32'],
   },
 };
@@ -82,6 +67,18 @@ export function registerQuickStartModel(config) {
   QUICKSTART_MODELS[config.modelId] = config;
 }
+function resolveQuickStartModelBaseUrl(config) {
+  if (typeof config?.baseUrl === 'string' && config.baseUrl.trim().length > 0) {
+    return config.baseUrl.trim().replace(/\/$/, '');
+  }
+  if (config?.hf) {
+    return buildHfResolveBaseUrl(config.hf, { cdnBasePath: getCDNBaseUrl() });
+  }
+  throw new Error(
+    `Quickstart model "${config?.modelId ?? 'unknown'}" is missing an explicit baseUrl or hosted Hugging Face source.`
+  );
+}
 // ============================================================================
 // Download Functions
 // ============================================================================
@@ -190,7 +187,7 @@ export async function downloadQuickStartModel(
       signal,
     };
-    const baseUrl = config.baseUrl ?? `${getEffectiveCDNBaseUrl()}/${config.modelId}`;
+    const baseUrl = resolveQuickStartModelBaseUrl(config);
     const success = await downloadModel(
       baseUrl,
       onProgress,

package/src/storage/shard-manager.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import {
   getManifest,
+  getExpectedShardHash,
   getShardInfo,
   getShardCount,
   generateShardFilename,
@@ -280,7 +281,7 @@ export async function writeShard(shardIndex, data, options = { verify: true }) {
       const manifest = getManifest();
       const algorithm = requireManifestHashAlgorithm(manifest, 'shard write');
       const hash = await computeHash(bytes, algorithm);
-      const expectedHash = shardInfo.hash;
+      const expectedHash = getExpectedShardHash(shardInfo, algorithm);
       if (!expectedHash) {
         await backend.deleteFile(shardInfo.filename);
         throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
@@ -369,7 +370,7 @@ export async function loadShard(shardIndex, options = { verify: false }) {
       const manifest = getManifest();
       const algorithm = requireManifestHashAlgorithm(manifest, 'shard load');
       const hash = await computeHash(buffer, algorithm);
-      const expectedHash = shardInfo.hash;
+      const expectedHash = getExpectedShardHash(shardInfo, algorithm);
       if (!expectedHash) {
         throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
       }
@@ -531,7 +532,7 @@ export async function verifyIntegrity(options = {}) {
         const buffer = await loadShard(i, { verify: false });
         const hash = await computeHash(buffer, algorithm);
         const shardInfo = getShardInfo(i);
-        const expectedHash = shardInfo?.hash;
+        const expectedHash = getExpectedShardHash(shardInfo, algorithm);
         if (!expectedHash) {
           corruptShards.push(i);
           continue;

package/src/tooling/conversion-config-materializer.js CHANGED Viewed

@@ -2,6 +2,7 @@ import path from 'node:path';
 import { createConverterConfig } from '../config/schema/index.js';
 import { resolveConversionPlan } from '../converter/conversion-plan.js';
+import { normalizeQuantTag } from '../converter/quantization-info.js';
 function toSafeString(value) {
   if (typeof value !== 'string') return '';
@@ -10,10 +11,7 @@ function toSafeString(value) {
 }
 function normalizeQuantizationTag(value) {
-  const raw = toSafeString(value).toUpperCase();
-  if (!raw) return 'f16';
-  if (raw === 'Q4_K_M' || raw === 'Q4_K') return 'q4k';
-  return raw.toLowerCase();
+  return normalizeQuantTag(toSafeString(value));
 }
 function resolveArchitectureHint(architecture) {
@@ -37,7 +35,7 @@ function extractSourceQuantization(manifest) {
   if (explicitWeights) return explicitWeights;
   const explicitQuant = toSafeString(manifest?.quantization);
   if (explicitQuant) return explicitQuant;
-  return 'f16';
+  return normalizeQuantTag(null);
 }
 function buildRefreshRawConfig(manifest) {

package/src/tooling/node-converter.js CHANGED Viewed

@@ -541,18 +541,24 @@ async function listRelativeFiles(rootDir, relDir = '', out = []) {
   return out;
 }
-async function clearExistingShardFiles(outputDir) {
+async function clearExistingConversionOutputs(outputDir) {
   let entries;
   try {
     entries = await fs.readdir(outputDir, { withFileTypes: true });
   } catch {
     return;
   }
-  const shardFiles = entries
-    .filter((entry) => entry.isFile() && /^shard_\d{5}\.bin$/i.test(entry.name))
+  const artifactFiles = entries
+    .filter((entry) => (
+      entry.isFile()
+      && (
+        /^shard_\d{5}\.bin$/i.test(entry.name)
+        || entry.name === 'manifest.json'
+      )
+    ))
     .map((entry) => path.join(outputDir, entry.name));
-  if (shardFiles.length === 0) return;
-  await Promise.all(shardFiles.map((filePath) => fs.unlink(filePath)));
+  if (artifactFiles.length === 0) return;
+  await Promise.all(artifactFiles.map((filePath) => fs.unlink(filePath)));
 }
 function createNodeConvertIO(outputDir, options) {
@@ -875,6 +881,7 @@ export async function convertSafetensorsDirectory(options) {
   let sourceQuantization = null;
   let tokenizerJson = null;
   let tokenizerConfig = null;
+  let generationConfig = null;
   let hasTokenizerModel = false;
   let tokenizerModelPath = null;
   let diffusionAuxFiles = [];
@@ -1101,6 +1108,7 @@ export async function convertSafetensorsDirectory(options) {
       },
     });
     config = parsedTransformer.config;
+    generationConfig = parsedTransformer.generationConfig ?? null;
     tensors = parsedTransformer.tensors;
     architectureHint = parsedTransformer.architectureHint;
     architecture = extractArchitecture(config, null);
@@ -1151,7 +1159,7 @@ export async function convertSafetensorsDirectory(options) {
   const outputDir = resolveOutputDir(outputDirOverride, converterConfig, modelId);
   await fs.mkdir(outputDir, { recursive: true });
-  await clearExistingShardFiles(outputDir);
+  await clearExistingConversionOutputs(outputDir);
   const model = {
     name: path.basename(inputDir),
@@ -1169,6 +1177,7 @@ export async function convertSafetensorsDirectory(options) {
     quantization: targetQuantization,
     tokenizerJson,
     tokenizerConfig,
+    generationConfig,
     tokenizerModel: hasTokenizerModel ? 'tokenizer.model' : null,
   };
@@ -1177,6 +1186,15 @@ export async function convertSafetensorsDirectory(options) {
     computeHash,
     readRange: fileRangeReader.readRange,
   });
+  const deferredManifestState = {
+    manifest: null,
+  };
+  const convertIo = {
+    ...io,
+    async writeManifest(manifest) {
+      deferredManifestState.manifest = manifest;
+    },
+  };
   const manifestArchitecture = modelKind === 'diffusion' ? 'diffusion' : architecture;
   let workerPool = null;
   let workerTensorTransformer = null;
@@ -1241,7 +1259,7 @@ export async function convertSafetensorsDirectory(options) {
     }));
     const convertTimer = createStageTimer('Convert tensors');
-    result = await convertModel(model, io, {
+    result = await convertModel(model, convertIo, {
       modelId,
       modelType: resolvedModelType,
       quantization: targetQuantization,
@@ -1279,6 +1297,9 @@ export async function convertSafetensorsDirectory(options) {
   }
   normalizeTokenizerManifest(result.manifest);
+  if (!deferredManifestState.manifest) {
+    throw new Error('node convert: convert core did not produce a manifest.');
+  }
   await io.writeManifest(result.manifest);
   const report = buildConvertReport(result, {