@simulatte/doppler 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +25 -6
- package/package.json +25 -38
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-api.browser.js +6 -0
- package/src/client/doppler-api.d.ts +3 -0
- package/src/client/doppler-api.js +11 -2
- package/src/client/doppler-registry.js +3 -5
- package/src/client/doppler-registry.json +2 -2
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/kernel-ref-digests.js +23 -21
- package/src/config/kernels/moe/mixtral.paths.json +46 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +9 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/platforms/loader.js +3 -1
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +21 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +4 -1
- package/src/config/presets/models/gemma4.json +61 -0
- package/src/config/presets/models/granite-docling.json +70 -0
- package/src/config/presets/models/lfm2.json +6 -1
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/models/qwen3_vl.json +40 -0
- package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
- package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
- package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/presets/runtime/modes/trace-layers.json +1 -0
- package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
- package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
- package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
- package/src/config/runtime.js +3 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/debug.schema.d.ts +40 -0
- package/src/config/schema/debug.schema.js +28 -0
- package/src/config/schema/index.js +2 -0
- package/src/config/schema/inference-defaults.schema.js +1 -1
- package/src/config/schema/kernel-path.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/memory-limits.schema.js +2 -2
- package/src/config/schema/storage.schema.js +2 -2
- package/src/converter/conversion-plan.js +11 -3
- package/src/converter/core.js +19 -8
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.d.ts +5 -0
- package/src/converter/quantizer.js +34 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +40 -1
- package/src/formats/rdrr/classification.js +32 -0
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernel-runtime.js +4 -2
- package/src/gpu/kernels/attention.js +2 -1
- package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
- package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
- package/src/gpu/kernels/dequant_shared.wgsl +4 -2
- package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
- package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
- package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
- package/src/gpu/kernels/gated-short-conv.js +284 -0
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/linear-attention-core.js +37 -17
- package/src/gpu/kernels/matmul-selection.js +48 -4
- package/src/gpu/kernels/matmul.d.ts +5 -0
- package/src/gpu/kernels/matmul.js +71 -2
- package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/sample.js +1 -3
- package/src/gpu/kernels/sample.wgsl +39 -9
- package/src/gpu/kernels/sample_f16.wgsl +38 -8
- package/src/gpu/kernels/shader-cache.js +9 -4
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/kv-cache/base.js +3 -10
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/pipeline.js +2 -1
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
- package/src/inference/pipelines/text/attention/projections.js +54 -13
- package/src/inference/pipelines/text/attention/record.js +16 -6
- package/src/inference/pipelines/text/attention/run.js +59 -6
- package/src/inference/pipelines/text/config.d.ts +1 -0
- package/src/inference/pipelines/text/config.js +46 -4
- package/src/inference/pipelines/text/embed.js +26 -7
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
- package/src/inference/pipelines/text/execution-v0.js +12 -1
- package/src/inference/pipelines/text/generator-helpers.js +1 -0
- package/src/inference/pipelines/text/generator-runtime.js +19 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
- package/src/inference/pipelines/text/generator-steps.js +71 -26
- package/src/inference/pipelines/text/generator.d.ts +5 -0
- package/src/inference/pipelines/text/generator.js +353 -166
- package/src/inference/pipelines/text/init.d.ts +15 -0
- package/src/inference/pipelines/text/init.js +35 -10
- package/src/inference/pipelines/text/layer.js +38 -8
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/moe-gpu.js +21 -3
- package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
- package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
- package/src/inference/pipelines/text/ops.js +123 -53
- package/src/inference/pipelines/text/probes.js +1 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/pipelines/text/state.js +2 -0
- package/src/inference/pipelines/text.d.ts +5 -0
- package/src/inference/pipelines/text.js +59 -1
- package/src/inference/pipelines/vision/encoder.js +386 -0
- package/src/inference/pipelines/vision/image-preprocess.js +151 -0
- package/src/inference/pipelines/vision/index.js +173 -0
- package/src/inference/pipelines/vision/ops.js +78 -0
- package/src/inference/pipelines/vision/patch-embed.js +151 -0
- package/src/inference/test-harness.js +11 -9
- package/src/loader/doppler-loader.d.ts +3 -0
- package/src/loader/doppler-loader.js +20 -3
- package/src/loader/experts/expert-cache.js +6 -2
- package/src/loader/experts/expert-loader.js +6 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/layer-loader.js +42 -3
- package/src/loader/manifest-config.js +3 -1
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.d.ts +3 -0
- package/src/loader/tensors/tensor-loader.js +130 -4
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/moe.rules.mixtral.json +75 -0
- package/src/rules/kernels/softmax.rules.json +2 -0
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.d.ts +1 -0
- package/src/rules/rule-registry.js +4 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/quickstart-downloader.d.ts +3 -0
- package/src/storage/quickstart-downloader.js +27 -30
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +28 -7
- package/src/tooling/node-source-runtime.js +65 -5
- package/src/tooling/node-webgpu.js +24 -7
- package/src/types/model.d.ts +5 -0
- package/src/utils/hf-resolve-url.d.ts +16 -0
- package/src/utils/hf-resolve-url.js +17 -0
- package/src/version.js +1 -1
- package/tools/doppler-cli.js +6 -1
- package/src/tooling/node-convert.d.ts +0 -54
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
|
|
2
2
|
|
|
3
3
|
import { getDevice, getKernelCapabilities } from '../../gpu/device.js';
|
|
4
|
-
import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
|
|
4
|
+
import { acquireBuffer, releaseBuffer, readBuffer } from '../../memory/buffer-pool.js';
|
|
5
5
|
import { dequantize, dequantizeRowwise, dequantizeQ6K, castF16ToF32, runBF16ToF16 } from '../../gpu/kernel-selector.js';
|
|
6
6
|
import { createTensor } from '../../gpu/tensor.js';
|
|
7
7
|
import { createWeightBuffer } from '../../gpu/weight-buffer.js';
|
|
@@ -9,6 +9,7 @@ import { f16ToF32, convertBF16ToF32GPU, shouldDequantizeToF16, applyBufferLayout
|
|
|
9
9
|
import { QK_K, Q4K_BLOCK_BYTES, Q6K_BLOCK_BYTES } from '../quantization-constants.js';
|
|
10
10
|
import { log, trace as debugTrace } from '../../debug/index.js';
|
|
11
11
|
import { selectRuleValue } from '../../rules/rule-registry.js';
|
|
12
|
+
import { dequantizeQ4KM, dequantizeQ4KMRowWise } from '../../converter/quantizer.js';
|
|
12
13
|
|
|
13
14
|
// ============================================================================
|
|
14
15
|
// Q4K Detection
|
|
@@ -31,6 +32,24 @@ function releaseOwnedGpuBuffer(buffer, owned) {
|
|
|
31
32
|
releaseBuffer(buffer);
|
|
32
33
|
}
|
|
33
34
|
|
|
35
|
+
/**
 * Normalizes the optional `config.loaderDebug` settings into a plain object of
 * strict booleans plus a clamped parity sample count.
 *
 * @param {object|null|undefined} config - Loader config; only `config.loaderDebug` is read.
 * @returns {{
 *   enabled: boolean,
 *   forceGpuDequant: boolean,
 *   preferCpuDequant: boolean,
 *   failOnCpuDequantPath: boolean,
 *   runQ4KDequantParity: boolean,
 *   q4kDequantParitySamples: number,
 * } | null} `null` when `loaderDebug` is missing or is not a non-array object.
 */
function normalizeLoaderDebugConfig(config) {
  const debug = config?.loaderDebug;
  // Arrays satisfy `typeof === 'object'` but can never carry the named flags,
  // so they are treated the same as "no debug config".
  if (!debug || typeof debug !== 'object' || Array.isArray(debug)) {
    return null;
  }

  const rawSamples = debug.q4kDequantParitySamples;

  // Every flag is opt-in: only a literal `true` enables it, so truthy values
  // such as 1 or 'true' are deliberately ignored.
  return {
    enabled: debug.enabled === true,
    forceGpuDequant: debug.forceGpuDequant === true,
    preferCpuDequant: debug.preferCpuDequant === true,
    failOnCpuDequantPath: debug.failOnCpuDequantPath === true,
    runQ4KDequantParity: debug.runQ4KDequantParity === true,
    // Non-numeric / non-finite values fall back to 256; finite values are
    // truncated to an integer and clamped into [1, 4096].
    q4kDequantParitySamples: Number.isFinite(rawSamples)
      ? Math.min(4096, Math.max(1, Math.trunc(rawSamples)))
      : 256,
  };
}
|
|
52
|
+
|
|
34
53
|
function logF32UpcastNonMatmul(name, numElements, bufferSize) {
|
|
35
54
|
if (loggedF32UpcastNonMatmul) {
|
|
36
55
|
return;
|
|
@@ -199,11 +218,52 @@ export async function loadQ4KDequant(shardData, location, name, config) {
|
|
|
199
218
|
}
|
|
200
219
|
|
|
201
220
|
const outputDtype = getQ4KOutputDtype(location, config);
|
|
221
|
+
const loaderDebug = normalizeLoaderDebugConfig(config);
|
|
222
|
+
const debugEnabled = loaderDebug?.enabled === true;
|
|
223
|
+
const forceGpuDequant = loaderDebug?.forceGpuDequant === true;
|
|
224
|
+
const failOnCpuDequantPath = loaderDebug?.failOnCpuDequantPath === true;
|
|
225
|
+
const runQ4KDequantParity = loaderDebug?.runQ4KDequantParity === true;
|
|
226
|
+
const paritySamples = loaderDebug?.q4kDequantParitySamples ?? 256;
|
|
202
227
|
|
|
203
228
|
const is2DMatrix = Array.isArray(location.shape) && location.shape.length === 2;
|
|
204
229
|
const K = is2DMatrix ? location.shape[1] : 0;
|
|
205
230
|
const needsRowwise = is2DMatrix && K > 0 && K % QK_K !== 0;
|
|
231
|
+
const layout = getWeightLayout(location, config);
|
|
232
|
+
const preferCpuDequant = loaderDebug?.preferCpuDequant === true;
|
|
233
|
+
const canUseCpuReference = !forceGpuDequant && preferCpuDequant && (
|
|
234
|
+
outputDtype === 'f32'
|
|
235
|
+
&& !isGpuBufferInstance(shardData)
|
|
236
|
+
&& (!needsRowwise || layout === 'row')
|
|
237
|
+
);
|
|
238
|
+
|
|
239
|
+
if (canUseCpuReference && failOnCpuDequantPath) {
|
|
240
|
+
throw new Error(
|
|
241
|
+
`[LoaderDebug] CPU dequant path taken for ${name}; this run is configured fail-closed. ` +
|
|
242
|
+
'Set runtime.shared.debug.loader.forceGpuDequant=true to isolate GPU dequant.'
|
|
243
|
+
);
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
if (canUseCpuReference) {
|
|
247
|
+
const quantizedBytes = toUint8View(shardData);
|
|
248
|
+
const numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
|
|
249
|
+
debugTrace.loader(
|
|
250
|
+
`Dequantizing ${name} with CPU reference path: ` +
|
|
251
|
+
`shape=[${location.shape.join(',')}], layout=${layout}, needsRowwise=${needsRowwise}`
|
|
252
|
+
);
|
|
253
|
+
const f32Weights = needsRowwise
|
|
254
|
+
? dequantizeQ4KMRowWise(quantizedBytes, location.shape)
|
|
255
|
+
: dequantizeQ4KM(quantizedBytes, numBlocks, location.shape);
|
|
256
|
+
const outputBuffer = acquireAlignedBuffer(f32Weights.byteLength, `dequant_cpu_${name}`);
|
|
257
|
+
writeBufferAligned(device, outputBuffer, new Uint8Array(f32Weights.buffer));
|
|
258
|
+
releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
|
|
259
|
+
ownsQuantBuffer = false;
|
|
260
|
+
return {
|
|
261
|
+
data: createWeightBuffer(outputBuffer, 'f32', layout, location.shape, name),
|
|
262
|
+
allocatedBuffers: [outputBuffer],
|
|
263
|
+
};
|
|
264
|
+
}
|
|
206
265
|
|
|
266
|
+
let numBlocks = null;
|
|
207
267
|
let dequantizedTensor;
|
|
208
268
|
if (needsRowwise) {
|
|
209
269
|
const rows = location.shape[0];
|
|
@@ -213,7 +273,7 @@ export async function loadQ4KDequant(shardData, location, name, config) {
|
|
|
213
273
|
);
|
|
214
274
|
dequantizedTensor = await dequantizeRowwise(quantBuffer, rows, K, { outputDtype });
|
|
215
275
|
} else {
|
|
216
|
-
|
|
276
|
+
numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
|
|
217
277
|
debugTrace.loader(
|
|
218
278
|
`Dequantizing ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
|
|
219
279
|
`outputDtype=${outputDtype}, expectedOutput=${numBlocks * QK_K * (outputDtype === 'f16' ? 2 : 4)}`
|
|
@@ -223,10 +283,71 @@ export async function loadQ4KDequant(shardData, location, name, config) {
|
|
|
223
283
|
dequantized = dequantizedTensor.buffer;
|
|
224
284
|
|
|
225
285
|
debugTrace.loader(`Dequantized ${name}: resultSize=${dequantized.size}`);
|
|
286
|
+
|
|
287
|
+
if (runQ4KDequantParity && !isGpuBufferInstance(shardData) && dequantized && numBlocks !== null) {
|
|
288
|
+
const isProbeTarget = debugEnabled &&
|
|
289
|
+
(name.includes('.self_attn.q_proj.weight') || name.includes('.self_attn.k_proj.weight') ||
|
|
290
|
+
name.includes('.self_attn.v_proj.weight') || name.includes('.self_attn.qkv_proj.weight'));
|
|
291
|
+
|
|
292
|
+
if (isProbeTarget) {
|
|
293
|
+
try {
|
|
294
|
+
const bytesPerElem = outputDtype === 'f16' ? 2 : 4;
|
|
295
|
+
const requestedOutputBytes = numBlocks * QK_K * bytesPerElem;
|
|
296
|
+
const sampleCount = paritySamples;
|
|
297
|
+
const readSize = Math.min(sampleCount * bytesPerElem, dequantized.size);
|
|
298
|
+
const gpuRaw = await readBuffer(dequantized, readSize);
|
|
299
|
+
const gpuBytes = gpuRaw instanceof ArrayBuffer
|
|
300
|
+
? new Uint8Array(gpuRaw)
|
|
301
|
+
: new Uint8Array(gpuRaw.buffer, gpuRaw.byteOffset, gpuRaw.byteLength);
|
|
302
|
+
|
|
303
|
+
let gpuVals;
|
|
304
|
+
if (outputDtype === 'f16') {
|
|
305
|
+
const u16 = new Uint16Array(gpuBytes.buffer, gpuBytes.byteOffset,
|
|
306
|
+
Math.min(sampleCount, Math.floor(gpuBytes.byteLength / 2)));
|
|
307
|
+
gpuVals = Array.from(u16, (half) => f16ToF32(half));
|
|
308
|
+
} else {
|
|
309
|
+
const f32 = new Float32Array(gpuBytes.buffer, gpuBytes.byteOffset,
|
|
310
|
+
Math.min(sampleCount, Math.floor(gpuBytes.byteLength / 4)));
|
|
311
|
+
gpuVals = Array.from(f32);
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
const quantizedBytes = toUint8View(shardData);
|
|
315
|
+
const cpuRef = Array.from(
|
|
316
|
+
needsRowwise
|
|
317
|
+
? dequantizeQ4KMRowWise(quantizedBytes, location.shape)
|
|
318
|
+
: dequantizeQ4KM(quantizedBytes, numBlocks, location.shape)
|
|
319
|
+
).slice(0, gpuVals.length);
|
|
320
|
+
|
|
321
|
+
let maxDiff = 0;
|
|
322
|
+
let diffIdx = -1;
|
|
323
|
+
for (let i = 0; i < gpuVals.length && i < cpuRef.length; i++) {
|
|
324
|
+
const d = Math.abs(gpuVals[i] - cpuRef[i]);
|
|
325
|
+
if (d > maxDiff) {
|
|
326
|
+
maxDiff = d;
|
|
327
|
+
diffIdx = i;
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
log.warn('DequantProbe',
|
|
332
|
+
`tensor="${name}" shape=[${location.shape}] ` +
|
|
333
|
+
`location.size=${location.size} numBlocks=${numBlocks} outputDtype=${outputDtype} ` +
|
|
334
|
+
`bytesPerElem=${bytesPerElem} requestedOutputBytes=${requestedOutputBytes} bufSize=${dequantized.size} ` +
|
|
335
|
+
`runParity=true sampleCount=${sampleCount}`
|
|
336
|
+
);
|
|
337
|
+
log.warn('DequantProbe',
|
|
338
|
+
`parity: maxDiff=${maxDiff.toFixed(8)} at idx=${diffIdx} ` +
|
|
339
|
+
`gpu[0..3]=[${gpuVals.slice(0, 4).map((v) => v.toFixed(6))}] ` +
|
|
340
|
+
`cpu[0..3]=[${cpuRef.slice(0, 4).map((v) => v.toFixed(6))}]`
|
|
341
|
+
);
|
|
342
|
+
} catch (e) {
|
|
343
|
+
log.warn('DequantProbe', `Readback failed: ${e.message}`);
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
226
348
|
releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
|
|
227
349
|
ownsQuantBuffer = false;
|
|
228
350
|
|
|
229
|
-
const layout = getWeightLayout(location, config);
|
|
230
351
|
const dtype = outputDtype;
|
|
231
352
|
|
|
232
353
|
return {
|
|
@@ -309,8 +430,9 @@ export async function loadBF16(shardData, location, name, config) {
|
|
|
309
430
|
const numElements = location.size / 2;
|
|
310
431
|
const caps = config.gpuCapabilities || getKernelCapabilities();
|
|
311
432
|
const isMatmulWeight = shouldDequantizeToF16(location);
|
|
433
|
+
const keepF32Weights = config.keepF32Weights === true;
|
|
312
434
|
|
|
313
|
-
if (caps?.hasF16 && isMatmulWeight) {
|
|
435
|
+
if (caps?.hasF16 && isMatmulWeight && !keepF32Weights) {
|
|
314
436
|
const f16Tensor = await runBF16ToF16(srcBuffer, [numElements], name);
|
|
315
437
|
resultBuffer = f16Tensor.buffer;
|
|
316
438
|
releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
|
|
@@ -327,6 +449,10 @@ export async function loadBF16(shardData, location, name, config) {
|
|
|
327
449
|
};
|
|
328
450
|
}
|
|
329
451
|
|
|
452
|
+
if (isMatmulWeight && keepF32Weights) {
|
|
453
|
+
debugTrace.loader(`Keeping BF16 matmul weight in f32: ${name} (keepF32Weights=true)`);
|
|
454
|
+
}
|
|
455
|
+
|
|
330
456
|
const dstBuffer = await convertBF16ToF32GPU(srcBuffer, numElements, name);
|
|
331
457
|
resultBuffer = dstBuffer;
|
|
332
458
|
releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
|
|
@@ -59,6 +59,11 @@
|
|
|
59
59
|
{ "match": { "useF16": true }, "value": "f16" },
|
|
60
60
|
{ "match": {}, "value": { "context": "fallback" } }
|
|
61
61
|
],
|
|
62
|
+
"attentionProjectionOutputDtype": [
|
|
63
|
+
{ "match": { "forceF32": true }, "value": "f32" },
|
|
64
|
+
{ "match": { "useF16": true }, "value": "f16" },
|
|
65
|
+
{ "match": {}, "value": { "context": "fallback" } }
|
|
66
|
+
],
|
|
62
67
|
"bytesPerElement": [
|
|
63
68
|
{ "match": { "dtype": "f16" }, "value": 2 },
|
|
64
69
|
{ "match": {}, "value": 4 }
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
"hasSubgroups": false,
|
|
47
47
|
"kernelPathRef": "lfm2-q4k-dequant-f32a-online"
|
|
48
48
|
},
|
|
49
|
-
"value": "
|
|
49
|
+
"value": "lfm2-q4k-dequant-f32a-nosubgroups"
|
|
50
50
|
},
|
|
51
51
|
{
|
|
52
52
|
"match": {
|
|
@@ -77,7 +77,7 @@
|
|
|
77
77
|
},
|
|
78
78
|
{
|
|
79
79
|
"match": { "kernelPathId": "lfm2-q4k-dequant-f32a-online" },
|
|
80
|
-
"value": "
|
|
80
|
+
"value": "lfm2-q4k-dequant-f32a-nosubgroups"
|
|
81
81
|
},
|
|
82
82
|
{
|
|
83
83
|
"match": { "kernelPathId": "gemma2-f16-f16a" },
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
{
|
|
2
|
+
"vendorQuirkProfile": [
|
|
3
|
+
{
|
|
4
|
+
"match": {
|
|
5
|
+
"vendor": {
|
|
6
|
+
"contains": ["intel", "amd"]
|
|
7
|
+
}
|
|
8
|
+
},
|
|
9
|
+
"value": {
|
|
10
|
+
"preferVec4Dequant": false,
|
|
11
|
+
"dequantTileShape": "scalar",
|
|
12
|
+
"routerWorkgroupSize": 128,
|
|
13
|
+
"maxTokensPerExpertScale": 0.85
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"match": {
|
|
18
|
+
"vendor": {
|
|
19
|
+
"contains": ["nvidia", "apple", "qualcomm"]
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
"value": {
|
|
23
|
+
"preferVec4Dequant": false,
|
|
24
|
+
"dequantTileShape": "scalar",
|
|
25
|
+
"routerWorkgroupSize": 256,
|
|
26
|
+
"maxTokensPerExpertScale": 1.0
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"match": {},
|
|
31
|
+
"value": {
|
|
32
|
+
"preferVec4Dequant": false,
|
|
33
|
+
"dequantTileShape": "scalar",
|
|
34
|
+
"routerWorkgroupSize": 128,
|
|
35
|
+
"maxTokensPerExpertScale": 1.0
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
],
|
|
39
|
+
"routerTopKVariant": [
|
|
40
|
+
{
|
|
41
|
+
"match": { "modelType": "mixtral", "hasF16": true, "hasSubgroups": true, "routerDtype": "f32" },
|
|
42
|
+
"value": "softmax_topk_f32_subgroup"
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
"match": { "modelType": "mixtral", "routerDtype": "f32" },
|
|
46
|
+
"value": "softmax_topk_f32"
|
|
47
|
+
},
|
|
48
|
+
{
|
|
49
|
+
"match": { "modelType": "mixtral" },
|
|
50
|
+
"value": "softmax_topk_f32"
|
|
51
|
+
}
|
|
52
|
+
],
|
|
53
|
+
"dequantVariant": [
|
|
54
|
+
{
|
|
55
|
+
"match": { "modelType": "mixtral", "weightsDtype": "q4k", "hasF16": true, "hasSubgroups": true, "outputDtype": "f32" },
|
|
56
|
+
"value": "q4k_expert_dequant_f32_subgroup"
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
"match": { "modelType": "mixtral", "weightsDtype": "q4k", "outputDtype": "f16", "hasF16": true },
|
|
60
|
+
"value": "q4k_expert_dequant_f16"
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"match": { "modelType": "mixtral", "weightsDtype": "q4k" },
|
|
64
|
+
"value": "q4k_expert_dequant_f32"
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"match": { "modelType": "mixtral", "weightsDtype": "f16", "outputDtype": "f16", "hasF16": true },
|
|
68
|
+
"value": "f16_expert_passthrough"
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
"match": { "modelType": "mixtral" },
|
|
72
|
+
"value": "f16_expert_upcast_f32"
|
|
73
|
+
}
|
|
74
|
+
]
|
|
75
|
+
}
|
|
@@ -16,6 +16,8 @@
|
|
|
16
16
|
},
|
|
17
17
|
"value": "gptoss_router_topk"
|
|
18
18
|
},
|
|
19
|
+
{ "match": { "modelType": "mixtral", "inputDtype": "f16", "weightsDtype": "f16" }, "value": "fused_f16_w16" },
|
|
20
|
+
{ "match": { "modelType": "mixtral" }, "value": "fused" },
|
|
19
21
|
{ "match": { "inputDtype": "f16", "weightsDtype": "f16" }, "value": "fused_f16_w16" },
|
|
20
22
|
{ "match": { "inputDtype": "f16" }, "value": "fused_f16" },
|
|
21
23
|
{ "match": {}, "value": "fused" }
|
|
@@ -38,6 +38,7 @@ const layernormRules = await loadJson('./kernels/layernorm.rules.json', import.m
|
|
|
38
38
|
const matmulRules = await loadJson('./kernels/matmul.rules.json', import.meta.url, 'Failed to load rules');
|
|
39
39
|
const kernelMoeRules = await loadJson('./kernels/moe.rules.json', import.meta.url, 'Failed to load rules');
|
|
40
40
|
const kernelMoeGptOssRules = await loadJson('./kernels/moe.rules.gptoss.json', import.meta.url, 'Failed to load rules');
|
|
41
|
+
const kernelMoeMixtralRules = await loadJson('./kernels/moe.rules.mixtral.json', import.meta.url, 'Failed to load rules');
|
|
41
42
|
const modulateRules = await loadJson('./kernels/modulate.rules.json', import.meta.url, 'Failed to load rules');
|
|
42
43
|
const pixelShuffleRules = await loadJson('./kernels/pixel_shuffle.rules.json', import.meta.url, 'Failed to load rules');
|
|
43
44
|
const repeatChannelsRules = await loadJson('./kernels/repeat-channels.rules.json', import.meta.url, 'Failed to load rules');
|
|
@@ -50,6 +51,7 @@ const sampleRules = await loadJson('./kernels/sample.rules.json', import.meta.ur
|
|
|
50
51
|
const scaleRules = await loadJson('./kernels/scale.rules.json', import.meta.url, 'Failed to load rules');
|
|
51
52
|
const siluRules = await loadJson('./kernels/silu.rules.json', import.meta.url, 'Failed to load rules');
|
|
52
53
|
const splitQkvRules = await loadJson('./kernels/split-qkv.rules.json', import.meta.url, 'Failed to load rules');
|
|
54
|
+
const splitQgRules = await loadJson('./kernels/split-qg.rules.json', import.meta.url, 'Failed to load rules');
|
|
53
55
|
const softmaxRules = await loadJson('./kernels/softmax.rules.json', import.meta.url, 'Failed to load rules');
|
|
54
56
|
const upsample2dRules = await loadJson('./kernels/upsample2d.rules.json', import.meta.url, 'Failed to load rules');
|
|
55
57
|
const configRules = await loadJson('./inference/config.rules.json', import.meta.url, 'Failed to load rules');
|
|
@@ -112,6 +114,7 @@ const RULE_SETS = {
|
|
|
112
114
|
matmul: matmulRules,
|
|
113
115
|
moe: kernelMoeRules,
|
|
114
116
|
moeGptoss: kernelMoeGptOssRules,
|
|
117
|
+
moeMixtral: kernelMoeMixtralRules,
|
|
115
118
|
modulate: modulateRules,
|
|
116
119
|
pixel_shuffle: pixelShuffleRules,
|
|
117
120
|
repeatChannels: repeatChannelsRules,
|
|
@@ -124,6 +127,7 @@ const RULE_SETS = {
|
|
|
124
127
|
scale: scaleRules,
|
|
125
128
|
silu: siluRules,
|
|
126
129
|
splitQkv: splitQkvRules,
|
|
130
|
+
splitQg: splitQgRules,
|
|
127
131
|
softmax: softmaxRules,
|
|
128
132
|
upsample2d: upsample2dRules,
|
|
129
133
|
},
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import {
|
|
4
4
|
parseManifest,
|
|
5
|
+
getExpectedShardHash,
|
|
5
6
|
getManifestUrl,
|
|
6
7
|
} from '../formats/rdrr/index.js';
|
|
7
8
|
|
|
@@ -726,7 +727,7 @@ export async function downloadModel(
|
|
|
726
727
|
if (!algorithm) {
|
|
727
728
|
throw new Error('Manifest missing hashAlgorithm for download verification.');
|
|
728
729
|
}
|
|
729
|
-
const expectedHash = shardInfo
|
|
730
|
+
const expectedHash = getExpectedShardHash(shardInfo, algorithm);
|
|
730
731
|
if (!expectedHash) {
|
|
731
732
|
throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
|
|
732
733
|
}
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
|
|
14
14
|
import type { DownloadProgress } from './downloader.js';
|
|
15
15
|
import type { PreflightResult, ModelRequirements } from './preflight.js';
|
|
16
|
+
import type { HfResolveConfig } from '../utils/hf-resolve-url.js';
|
|
16
17
|
|
|
17
18
|
/**
|
|
18
19
|
* Remote model configuration
|
|
@@ -24,6 +25,8 @@ export interface RemoteModelConfig {
|
|
|
24
25
|
displayName: string;
|
|
25
26
|
/** Base URL for shards (any static CDN) */
|
|
26
27
|
baseUrl?: string | null;
|
|
28
|
+
/** Hosted Hugging Face source used when baseUrl is omitted */
|
|
29
|
+
hf?: HfResolveConfig | null;
|
|
27
30
|
/** Model requirements for pre-flight checks */
|
|
28
31
|
requirements: ModelRequirements;
|
|
29
32
|
}
|
|
@@ -7,6 +7,7 @@ import {
|
|
|
7
7
|
} from './preflight.js';
|
|
8
8
|
import { formatBytes } from './quota.js';
|
|
9
9
|
import { getCdnBasePath } from './download-types.js';
|
|
10
|
+
import { buildHfResolveBaseUrl, DEFAULT_HF_CDN_BASE_URL } from '../utils/hf-resolve-url.js';
|
|
10
11
|
|
|
11
12
|
// ============================================================================
|
|
12
13
|
// Model Registry
|
|
@@ -15,40 +16,14 @@ import { getCdnBasePath } from './download-types.js';
|
|
|
15
16
|
|
|
16
17
|
let cdnBaseOverride = null;
|
|
17
18
|
|
|
18
|
-
|
|
19
|
-
function getEffectiveCDNBaseUrl() {
|
|
20
|
-
const runtimeBase = getCdnBasePath();
|
|
21
|
-
const base = cdnBaseOverride ?? runtimeBase ?? '';
|
|
22
|
-
if (base) return base;
|
|
23
|
-
|
|
24
|
-
// Auto-detect: use same origin for Firebase Hosting or local dev
|
|
25
|
-
if (typeof globalThis.location !== 'undefined') {
|
|
26
|
-
const path = globalThis.location.pathname || '';
|
|
27
|
-
if (
|
|
28
|
-
path === '/d' ||
|
|
29
|
-
path.startsWith('/d/') ||
|
|
30
|
-
path === '/doppler' ||
|
|
31
|
-
path.startsWith('/doppler/') ||
|
|
32
|
-
path === '/dr' ||
|
|
33
|
-
path.startsWith('/dr/') ||
|
|
34
|
-
globalThis.location.host.includes('replo')
|
|
35
|
-
) {
|
|
36
|
-
return `${globalThis.location.origin}/doppler/models`;
|
|
37
|
-
}
|
|
38
|
-
return `${globalThis.location.origin}/models`;
|
|
39
|
-
}
|
|
40
|
-
// Fallback for non-browser-global contexts
|
|
41
|
-
return '/models';
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
|
|
45
19
|
/**
 * Sets (or clears) the module-level CDN base URL override.
 *
 * The value is trimmed and all trailing slashes are removed. Anything that is
 * not a string, or that is empty after normalization, clears the override
 * (stored as `null`).
 *
 * @param {string|null|undefined} url - New CDN base URL, or a blank/non-string value to clear it.
 * @returns {void}
 */
export function setCDNBaseUrl(url) {
  // `/\/+$/` (not `/\/$/`) so inputs such as "https://host/models//" do not
  // keep a dangling slash.
  const normalized = typeof url === 'string' ? url.trim().replace(/\/+$/, '') : '';
  cdnBaseOverride = normalized || null;
}
|
|
48
23
|
|
|
49
24
|
|
|
50
25
|
/**
 * Returns the effective CDN base URL.
 *
 * Precedence: the override set through `setCDNBaseUrl`, then the value from
 * `getCdnBasePath()`, then `DEFAULT_HF_CDN_BASE_URL`.
 *
 * @returns {string} The first non-empty candidate in the precedence order above.
 */
export function getCDNBaseUrl() {
  // `||` rather than `??`: an empty-string base is treated as "not configured"
  // so it falls through to the next candidate instead of being returned as-is.
  // NOTE(review): assumes getCdnBasePath() can yield '' — confirm against its implementation.
  return cdnBaseOverride || getCdnBasePath() || DEFAULT_HF_CDN_BASE_URL;
}
|
|
53
28
|
|
|
54
29
|
|
|
@@ -57,12 +32,22 @@ export const QUICKSTART_MODELS = {
|
|
|
57
32
|
modelId: 'gemma-3-270m-it-q4k-ehf16-af32',
|
|
58
33
|
displayName: 'Gemma 3 270M IT (Q4K)',
|
|
59
34
|
baseUrl: null,
|
|
35
|
+
hf: {
|
|
36
|
+
repoId: 'Clocksmith/rdrr',
|
|
37
|
+
revision: 'ca6f0dbdf3882d3893a65cf48f2bb6f1520df162',
|
|
38
|
+
path: 'models/gemma-3-270m-it-q4k-ehf16-af32',
|
|
39
|
+
},
|
|
60
40
|
requirements: MODEL_REQUIREMENTS['gemma-3-270m-it-q4k-ehf16-af32'],
|
|
61
41
|
},
|
|
62
42
|
'google-embeddinggemma-300m-q4k-ehf16-af32': {
|
|
63
43
|
modelId: 'google-embeddinggemma-300m-q4k-ehf16-af32',
|
|
64
44
|
displayName: 'EmbeddingGemma 300M (Q4K)',
|
|
65
45
|
baseUrl: null,
|
|
46
|
+
hf: {
|
|
47
|
+
repoId: 'Clocksmith/rdrr',
|
|
48
|
+
revision: '7e79c466d54455bd370c81685956ea9abae0fd30',
|
|
49
|
+
path: 'models/google-embeddinggemma-300m-q4k-ehf16-af32',
|
|
50
|
+
},
|
|
66
51
|
requirements: MODEL_REQUIREMENTS['google-embeddinggemma-300m-q4k-ehf16-af32'],
|
|
67
52
|
},
|
|
68
53
|
};
|
|
@@ -82,6 +67,18 @@ export function registerQuickStartModel(config) {
|
|
|
82
67
|
QUICKSTART_MODELS[config.modelId] = config;
|
|
83
68
|
}
|
|
84
69
|
|
|
70
|
+
/**
 * Resolves the base URL used to download a quickstart model.
 *
 * Resolution order:
 *   1. A non-blank string `config.baseUrl`, trimmed and with all trailing
 *      slashes removed.
 *   2. `config.hf`, handed to `buildHfResolveBaseUrl` together with the
 *      current CDN base from `getCDNBaseUrl()`.
 *
 * @param {{ modelId?: string, baseUrl?: string|null, hf?: object|null }} config - Quickstart model entry.
 * @returns {string} The explicit base URL, or whatever `buildHfResolveBaseUrl` returns
 *   (presumably a URL string — verify against that helper).
 * @throws {Error} When the entry has neither a usable `baseUrl` nor an `hf` source.
 */
function resolveQuickStartModelBaseUrl(config) {
  const explicitBase = typeof config?.baseUrl === 'string' ? config.baseUrl.trim() : '';
  if (explicitBase.length > 0) {
    // Strip every trailing slash, not just one, so "…/models//" normalizes cleanly.
    return explicitBase.replace(/\/+$/, '');
  }

  if (config?.hf) {
    return buildHfResolveBaseUrl(config.hf, { cdnBasePath: getCDNBaseUrl() });
  }

  throw new Error(
    `Quickstart model "${config?.modelId ?? 'unknown'}" is missing an explicit baseUrl or hosted Hugging Face source.`
  );
}
|
|
81
|
+
|
|
85
82
|
// ============================================================================
|
|
86
83
|
// Download Functions
|
|
87
84
|
// ============================================================================
|
|
@@ -190,7 +187,7 @@ export async function downloadQuickStartModel(
|
|
|
190
187
|
signal,
|
|
191
188
|
};
|
|
192
189
|
|
|
193
|
-
const baseUrl = config
|
|
190
|
+
const baseUrl = resolveQuickStartModelBaseUrl(config);
|
|
194
191
|
const success = await downloadModel(
|
|
195
192
|
baseUrl,
|
|
196
193
|
onProgress,
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
getManifest,
|
|
3
|
+
getExpectedShardHash,
|
|
3
4
|
getShardInfo,
|
|
4
5
|
getShardCount,
|
|
5
6
|
generateShardFilename,
|
|
@@ -280,7 +281,7 @@ export async function writeShard(shardIndex, data, options = { verify: true }) {
|
|
|
280
281
|
const manifest = getManifest();
|
|
281
282
|
const algorithm = requireManifestHashAlgorithm(manifest, 'shard write');
|
|
282
283
|
const hash = await computeHash(bytes, algorithm);
|
|
283
|
-
const expectedHash = shardInfo
|
|
284
|
+
const expectedHash = getExpectedShardHash(shardInfo, algorithm);
|
|
284
285
|
if (!expectedHash) {
|
|
285
286
|
await backend.deleteFile(shardInfo.filename);
|
|
286
287
|
throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
|
|
@@ -369,7 +370,7 @@ export async function loadShard(shardIndex, options = { verify: false }) {
|
|
|
369
370
|
const manifest = getManifest();
|
|
370
371
|
const algorithm = requireManifestHashAlgorithm(manifest, 'shard load');
|
|
371
372
|
const hash = await computeHash(buffer, algorithm);
|
|
372
|
-
const expectedHash = shardInfo
|
|
373
|
+
const expectedHash = getExpectedShardHash(shardInfo, algorithm);
|
|
373
374
|
if (!expectedHash) {
|
|
374
375
|
throw new Error(`Shard ${shardIndex} is missing hash in manifest`);
|
|
375
376
|
}
|
|
@@ -531,7 +532,7 @@ export async function verifyIntegrity(options = {}) {
|
|
|
531
532
|
const buffer = await loadShard(i, { verify: false });
|
|
532
533
|
const hash = await computeHash(buffer, algorithm);
|
|
533
534
|
const shardInfo = getShardInfo(i);
|
|
534
|
-
const expectedHash = shardInfo
|
|
535
|
+
const expectedHash = getExpectedShardHash(shardInfo, algorithm);
|
|
535
536
|
if (!expectedHash) {
|
|
536
537
|
corruptShards.push(i);
|
|
537
538
|
continue;
|
|
@@ -2,6 +2,7 @@ import path from 'node:path';
|
|
|
2
2
|
|
|
3
3
|
import { createConverterConfig } from '../config/schema/index.js';
|
|
4
4
|
import { resolveConversionPlan } from '../converter/conversion-plan.js';
|
|
5
|
+
import { normalizeQuantTag } from '../converter/quantization-info.js';
|
|
5
6
|
|
|
6
7
|
function toSafeString(value) {
|
|
7
8
|
if (typeof value !== 'string') return '';
|
|
@@ -10,10 +11,7 @@ function toSafeString(value) {
|
|
|
10
11
|
}
|
|
11
12
|
|
|
12
13
|
function normalizeQuantizationTag(value) {
|
|
13
|
-
|
|
14
|
-
if (!raw) return 'f16';
|
|
15
|
-
if (raw === 'Q4_K_M' || raw === 'Q4_K') return 'q4k';
|
|
16
|
-
return raw.toLowerCase();
|
|
14
|
+
return normalizeQuantTag(toSafeString(value));
|
|
17
15
|
}
|
|
18
16
|
|
|
19
17
|
function resolveArchitectureHint(architecture) {
|
|
@@ -37,7 +35,7 @@ function extractSourceQuantization(manifest) {
|
|
|
37
35
|
if (explicitWeights) return explicitWeights;
|
|
38
36
|
const explicitQuant = toSafeString(manifest?.quantization);
|
|
39
37
|
if (explicitQuant) return explicitQuant;
|
|
40
|
-
return
|
|
38
|
+
return normalizeQuantTag(null);
|
|
41
39
|
}
|
|
42
40
|
|
|
43
41
|
function buildRefreshRawConfig(manifest) {
|
|
@@ -541,18 +541,24 @@ async function listRelativeFiles(rootDir, relDir = '', out = []) {
|
|
|
541
541
|
return out;
|
|
542
542
|
}
|
|
543
543
|
|
|
544
|
-
async function
|
|
544
|
+
async function clearExistingConversionOutputs(outputDir) {
|
|
545
545
|
let entries;
|
|
546
546
|
try {
|
|
547
547
|
entries = await fs.readdir(outputDir, { withFileTypes: true });
|
|
548
548
|
} catch {
|
|
549
549
|
return;
|
|
550
550
|
}
|
|
551
|
-
const
|
|
552
|
-
.filter((entry) =>
|
|
551
|
+
const artifactFiles = entries
|
|
552
|
+
.filter((entry) => (
|
|
553
|
+
entry.isFile()
|
|
554
|
+
&& (
|
|
555
|
+
/^shard_\d{5}\.bin$/i.test(entry.name)
|
|
556
|
+
|| entry.name === 'manifest.json'
|
|
557
|
+
)
|
|
558
|
+
))
|
|
553
559
|
.map((entry) => path.join(outputDir, entry.name));
|
|
554
|
-
if (
|
|
555
|
-
await Promise.all(
|
|
560
|
+
if (artifactFiles.length === 0) return;
|
|
561
|
+
await Promise.all(artifactFiles.map((filePath) => fs.unlink(filePath)));
|
|
556
562
|
}
|
|
557
563
|
|
|
558
564
|
function createNodeConvertIO(outputDir, options) {
|
|
@@ -875,6 +881,7 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
875
881
|
let sourceQuantization = null;
|
|
876
882
|
let tokenizerJson = null;
|
|
877
883
|
let tokenizerConfig = null;
|
|
884
|
+
let generationConfig = null;
|
|
878
885
|
let hasTokenizerModel = false;
|
|
879
886
|
let tokenizerModelPath = null;
|
|
880
887
|
let diffusionAuxFiles = [];
|
|
@@ -1101,6 +1108,7 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
1101
1108
|
},
|
|
1102
1109
|
});
|
|
1103
1110
|
config = parsedTransformer.config;
|
|
1111
|
+
generationConfig = parsedTransformer.generationConfig ?? null;
|
|
1104
1112
|
tensors = parsedTransformer.tensors;
|
|
1105
1113
|
architectureHint = parsedTransformer.architectureHint;
|
|
1106
1114
|
architecture = extractArchitecture(config, null);
|
|
@@ -1151,7 +1159,7 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
1151
1159
|
const outputDir = resolveOutputDir(outputDirOverride, converterConfig, modelId);
|
|
1152
1160
|
|
|
1153
1161
|
await fs.mkdir(outputDir, { recursive: true });
|
|
1154
|
-
await
|
|
1162
|
+
await clearExistingConversionOutputs(outputDir);
|
|
1155
1163
|
|
|
1156
1164
|
const model = {
|
|
1157
1165
|
name: path.basename(inputDir),
|
|
@@ -1169,6 +1177,7 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
1169
1177
|
quantization: targetQuantization,
|
|
1170
1178
|
tokenizerJson,
|
|
1171
1179
|
tokenizerConfig,
|
|
1180
|
+
generationConfig,
|
|
1172
1181
|
tokenizerModel: hasTokenizerModel ? 'tokenizer.model' : null,
|
|
1173
1182
|
};
|
|
1174
1183
|
|
|
@@ -1177,6 +1186,15 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
1177
1186
|
computeHash,
|
|
1178
1187
|
readRange: fileRangeReader.readRange,
|
|
1179
1188
|
});
|
|
1189
|
+
const deferredManifestState = {
|
|
1190
|
+
manifest: null,
|
|
1191
|
+
};
|
|
1192
|
+
const convertIo = {
|
|
1193
|
+
...io,
|
|
1194
|
+
async writeManifest(manifest) {
|
|
1195
|
+
deferredManifestState.manifest = manifest;
|
|
1196
|
+
},
|
|
1197
|
+
};
|
|
1180
1198
|
const manifestArchitecture = modelKind === 'diffusion' ? 'diffusion' : architecture;
|
|
1181
1199
|
let workerPool = null;
|
|
1182
1200
|
let workerTensorTransformer = null;
|
|
@@ -1241,7 +1259,7 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
1241
1259
|
}));
|
|
1242
1260
|
|
|
1243
1261
|
const convertTimer = createStageTimer('Convert tensors');
|
|
1244
|
-
result = await convertModel(model,
|
|
1262
|
+
result = await convertModel(model, convertIo, {
|
|
1245
1263
|
modelId,
|
|
1246
1264
|
modelType: resolvedModelType,
|
|
1247
1265
|
quantization: targetQuantization,
|
|
@@ -1279,6 +1297,9 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
1279
1297
|
}
|
|
1280
1298
|
|
|
1281
1299
|
normalizeTokenizerManifest(result.manifest);
|
|
1300
|
+
if (!deferredManifestState.manifest) {
|
|
1301
|
+
throw new Error('node convert: convert core did not produce a manifest.');
|
|
1302
|
+
}
|
|
1282
1303
|
await io.writeManifest(result.manifest);
|
|
1283
1304
|
|
|
1284
1305
|
const report = buildConvertReport(result, {
|