@simulatte/doppler 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/package.json +21 -36
- package/src/browser/browser-converter.js +5 -0
- package/src/client/doppler-registry.json +1 -17
- package/src/config/kernel-path-loader.d.ts +5 -0
- package/src/config/kernel-path-loader.js +13 -0
- package/src/config/kernels/registry.json +74 -0
- package/src/config/loader.js +3 -0
- package/src/config/merge-contract-check.js +7 -0
- package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
- package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
- package/src/config/presets/kernel-paths/registry.json +14 -0
- package/src/config/presets/models/gemma2.json +2 -1
- package/src/config/presets/models/gemma3.json +2 -0
- package/src/config/presets/models/qwen3.json +4 -3
- package/src/config/presets/models/qwen3_5.json +16 -0
- package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
- package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
- package/src/config/schema/conversion.schema.d.ts +1 -0
- package/src/config/schema/manifest.schema.d.ts +1 -1
- package/src/config/schema/manifest.schema.js +1 -1
- package/src/config/schema/storage.schema.js +1 -1
- package/src/converter/conversion-plan.js +10 -2
- package/src/converter/core.js +2 -0
- package/src/converter/manifest-inference.js +12 -22
- package/src/converter/parsers/transformer.js +4 -0
- package/src/converter/quantization-info.js +5 -1
- package/src/converter/quantizer.js +19 -12
- package/src/converter/rope-config.js +8 -6
- package/src/converter/tokenizer-utils.d.ts +1 -0
- package/src/converter/tokenizer-utils.js +4 -1
- package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
- package/src/distribution/shard-delivery.js +6 -1
- package/src/formats/rdrr/parsing.d.ts +4 -0
- package/src/formats/rdrr/parsing.js +14 -1
- package/src/gpu/kernels/index.d.ts +8 -0
- package/src/gpu/kernels/index.js +6 -0
- package/src/gpu/kernels/matmul-selection.js +47 -4
- package/src/gpu/kernels/matmul.d.ts +2 -0
- package/src/gpu/kernels/matmul.js +1 -1
- package/src/gpu/kernels/rmsnorm.js +9 -2
- package/src/gpu/kernels/split_qg.d.ts +50 -0
- package/src/gpu/kernels/split_qg.js +46 -0
- package/src/gpu/kernels/split_qg.wgsl +58 -0
- package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
- package/src/gpu/weight-buffer.d.ts +1 -1
- package/src/gpu/weight-buffer.js +1 -1
- package/src/inference/browser-harness.d.ts +2 -0
- package/src/inference/browser-harness.js +20 -1
- package/src/inference/pipelines/diffusion/helpers.js +3 -0
- package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
- package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
- package/src/inference/pipelines/text/attention/output-projection.js +8 -0
- package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
- package/src/inference/pipelines/text/attention/projections.js +41 -11
- package/src/inference/pipelines/text/attention/record.js +15 -6
- package/src/inference/pipelines/text/attention/run.js +50 -6
- package/src/inference/pipelines/text/config.js +14 -0
- package/src/inference/pipelines/text/execution-plan.js +5 -4
- package/src/inference/pipelines/text/generator-runtime.js +5 -0
- package/src/inference/pipelines/text/generator-steps.d.ts +6 -0
- package/src/inference/pipelines/text/generator-steps.js +43 -15
- package/src/inference/pipelines/text/generator.js +50 -17
- package/src/inference/pipelines/text/init.d.ts +13 -0
- package/src/inference/pipelines/text/init.js +16 -5
- package/src/inference/pipelines/text/layer.js +1 -0
- package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
- package/src/inference/pipelines/text/linear-attention.js +33 -3
- package/src/inference/pipelines/text/logits/gpu.js +2 -2
- package/src/inference/pipelines/text/logits/index.d.ts +6 -1
- package/src/inference/pipelines/text/logits/index.js +3 -1
- package/src/inference/pipelines/text/model-load.js +3 -0
- package/src/inference/pipelines/text/sampling.js +52 -6
- package/src/inference/test-harness.js +2 -2
- package/src/loader/final-weights-loader.js +2 -0
- package/src/loader/shard-cache.js +3 -2
- package/src/loader/tensors/tensor-loader.js +6 -1
- package/src/rules/inference/dtype.rules.json +5 -0
- package/src/rules/inference/kernel-path.rules.json +2 -2
- package/src/rules/kernels/split-qg.rules.json +6 -0
- package/src/rules/rule-registry.js +2 -0
- package/src/storage/downloader.js +2 -1
- package/src/storage/shard-manager.js +4 -3
- package/src/tooling/conversion-config-materializer.js +3 -5
- package/src/tooling/node-converter.js +3 -0
- package/src/tooling/node-source-runtime.js +36 -0
- package/src/types/model.d.ts +5 -0
- package/tools/doppler-cli.js +6 -1
|
@@ -2,6 +2,7 @@ import path from 'node:path';
|
|
|
2
2
|
|
|
3
3
|
import { createConverterConfig } from '../config/schema/index.js';
|
|
4
4
|
import { resolveConversionPlan } from '../converter/conversion-plan.js';
|
|
5
|
+
import { normalizeQuantTag } from '../converter/quantization-info.js';
|
|
5
6
|
|
|
6
7
|
function toSafeString(value) {
|
|
7
8
|
if (typeof value !== 'string') return '';
|
|
@@ -10,10 +11,7 @@ function toSafeString(value) {
|
|
|
10
11
|
}
|
|
11
12
|
|
|
12
13
|
function normalizeQuantizationTag(value) {
|
|
13
|
-
|
|
14
|
-
if (!raw) return 'f16';
|
|
15
|
-
if (raw === 'Q4_K_M' || raw === 'Q4_K') return 'q4k';
|
|
16
|
-
return raw.toLowerCase();
|
|
14
|
+
return normalizeQuantTag(toSafeString(value));
|
|
17
15
|
}
|
|
18
16
|
|
|
19
17
|
function resolveArchitectureHint(architecture) {
|
|
@@ -37,7 +35,7 @@ function extractSourceQuantization(manifest) {
|
|
|
37
35
|
if (explicitWeights) return explicitWeights;
|
|
38
36
|
const explicitQuant = toSafeString(manifest?.quantization);
|
|
39
37
|
if (explicitQuant) return explicitQuant;
|
|
40
|
-
return
|
|
38
|
+
return normalizeQuantTag(null);
|
|
41
39
|
}
|
|
42
40
|
|
|
43
41
|
function buildRefreshRawConfig(manifest) {
|
|
@@ -875,6 +875,7 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
875
875
|
let sourceQuantization = null;
|
|
876
876
|
let tokenizerJson = null;
|
|
877
877
|
let tokenizerConfig = null;
|
|
878
|
+
let generationConfig = null;
|
|
878
879
|
let hasTokenizerModel = false;
|
|
879
880
|
let tokenizerModelPath = null;
|
|
880
881
|
let diffusionAuxFiles = [];
|
|
@@ -1101,6 +1102,7 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
1101
1102
|
},
|
|
1102
1103
|
});
|
|
1103
1104
|
config = parsedTransformer.config;
|
|
1105
|
+
generationConfig = parsedTransformer.generationConfig ?? null;
|
|
1104
1106
|
tensors = parsedTransformer.tensors;
|
|
1105
1107
|
architectureHint = parsedTransformer.architectureHint;
|
|
1106
1108
|
architecture = extractArchitecture(config, null);
|
|
@@ -1169,6 +1171,7 @@ export async function convertSafetensorsDirectory(options) {
|
|
|
1169
1171
|
quantization: targetQuantization,
|
|
1170
1172
|
tokenizerJson,
|
|
1171
1173
|
tokenizerConfig,
|
|
1174
|
+
generationConfig,
|
|
1172
1175
|
tokenizerModel: hasTokenizerModel ? 'tokenizer.model' : null,
|
|
1173
1176
|
};
|
|
1174
1177
|
|
|
@@ -411,6 +411,39 @@ function buildNodeFileReaders() {
|
|
|
411
411
|
};
|
|
412
412
|
}
|
|
413
413
|
|
|
414
|
+
// Source dtype → compute precision mapping for source-runtime inference.
|
|
415
|
+
// BF16/F32 sources require f32 compute (BF16 has no native WebGPU support).
|
|
416
|
+
// Quantized formats require f32 compute for dequantization accuracy.
|
|
417
|
+
// F16 sources can use f16 compute directly.
|
|
418
|
+
const SOURCE_QUANT_COMPUTE_MAP = {
|
|
419
|
+
'F16': 'f16',
|
|
420
|
+
'BF16': 'f32',
|
|
421
|
+
'F32': 'f32',
|
|
422
|
+
'Q4_K': 'f32',
|
|
423
|
+
'Q4_K_M': 'f32',
|
|
424
|
+
'Q6_K': 'f32',
|
|
425
|
+
};
|
|
426
|
+
const SOURCE_COMPUTE_DEFAULT = 'f16';
|
|
427
|
+
|
|
428
|
+
function resolveSourceRuntimeComputePrecision(tensors, sourceQuantization) {
|
|
429
|
+
const dtypes = new Set();
|
|
430
|
+
for (const tensor of Array.isArray(tensors) ? tensors : []) {
|
|
431
|
+
const dtype = String(tensor?.dtype || '').trim().toUpperCase();
|
|
432
|
+
if (dtype) {
|
|
433
|
+
dtypes.add(dtype);
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
// If any tensor requires f32 compute, use f32 for all.
|
|
437
|
+
for (const dtype of dtypes) {
|
|
438
|
+
if (SOURCE_QUANT_COMPUTE_MAP[dtype] === 'f32') {
|
|
439
|
+
return 'f32';
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
const normalized = String(sourceQuantization || '').trim().toUpperCase();
|
|
444
|
+
return SOURCE_QUANT_COMPUTE_MAP[normalized] ?? SOURCE_COMPUTE_DEFAULT;
|
|
445
|
+
}
|
|
446
|
+
|
|
414
447
|
async function addHashesToFileEntries(entries, hashAlgorithm) {
|
|
415
448
|
const normalized = [];
|
|
416
449
|
for (const entry of Array.isArray(entries) ? entries : []) {
|
|
@@ -473,6 +506,9 @@ export async function resolveNodeSourceRuntimeBundle(options = {}) {
|
|
|
473
506
|
assertSupportedSourceDtypes(parsed.tensors, parsed.sourceKind);
|
|
474
507
|
|
|
475
508
|
const converterConfig = createConverterConfig({
|
|
509
|
+
quantization: {
|
|
510
|
+
computePrecision: resolveSourceRuntimeComputePrecision(parsed.tensors, parsed.sourceQuantization),
|
|
511
|
+
},
|
|
476
512
|
output: {
|
|
477
513
|
modelBaseId: options.modelId || null,
|
|
478
514
|
},
|
package/src/types/model.d.ts
CHANGED
|
@@ -9,7 +9,11 @@ export type ModelArchitecture =
|
|
|
9
9
|
| 'gemma'
|
|
10
10
|
| 'gemma2'
|
|
11
11
|
| 'gemma3'
|
|
12
|
+
| 'embeddinggemma'
|
|
12
13
|
| 'functiongemma'
|
|
14
|
+
| 'janus_text'
|
|
15
|
+
| 'lfm2'
|
|
16
|
+
| 'modernbert'
|
|
13
17
|
| 'qwen2'
|
|
14
18
|
| 'qwen3'
|
|
15
19
|
| 'phi3'
|
|
@@ -19,6 +23,7 @@ export type ModelArchitecture =
|
|
|
19
23
|
| 'deepseek'
|
|
20
24
|
| 'mamba'
|
|
21
25
|
| 'kimi_k2'
|
|
26
|
+
| 'translategemma'
|
|
22
27
|
| 'transformer';
|
|
23
28
|
|
|
24
29
|
/** Attention type variants */
|
package/tools/doppler-cli.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
|
+
import { existsSync } from 'node:fs';
|
|
3
4
|
import fs from 'node:fs/promises';
|
|
4
5
|
import path from 'node:path';
|
|
5
6
|
import { fileURLToPath, pathToFileURL } from 'node:url';
|
|
@@ -13,7 +14,8 @@ import { createToolingErrorEnvelope } from '../src/tooling/command-envelope.js';
|
|
|
13
14
|
|
|
14
15
|
const NODE_WEBGPU_INCOMPLETE_MESSAGE = 'node command: WebGPU runtime is incomplete in Node';
|
|
15
16
|
const CLI_POLICY_PATH = fileURLToPath(new URL('./configs/cli/doppler-cli-policy.json', import.meta.url));
|
|
16
|
-
const DEFAULT_EXTERNAL_MODELS_ROOT = process.env.DOPPLER_EXTERNAL_MODELS_ROOT
|
|
17
|
+
const DEFAULT_EXTERNAL_MODELS_ROOT = process.env.DOPPLER_EXTERNAL_MODELS_ROOT
|
|
18
|
+
|| (existsSync('/Volumes/models') ? '/Volumes/models' : '/media/x/models');
|
|
17
19
|
const DEFAULT_EXTERNAL_RDRR_ROOT = path.join(DEFAULT_EXTERNAL_MODELS_ROOT, 'rdrr');
|
|
18
20
|
const DEFAULT_CLI_POLICY = {
|
|
19
21
|
defaults: {
|
|
@@ -1260,6 +1262,9 @@ function printMetricsSummary(result) {
|
|
|
1260
1262
|
`prefill=${formatNumber(metrics.prefillTokensPerSec)} ` +
|
|
1261
1263
|
`decode=${formatNumber(metrics.decodeTokensPerSec)}`
|
|
1262
1264
|
);
|
|
1265
|
+
if (typeof result.output === 'string' && result.output.length > 0) {
|
|
1266
|
+
console.log(`[output] ${quoteOneLine(result.output)}`);
|
|
1267
|
+
}
|
|
1263
1268
|
printExecutionContractSummary(result);
|
|
1264
1269
|
printExecutionV0GraphSummary(metrics.executionV0GraphContractArtifact);
|
|
1265
1270
|
return;
|