@simulatte/doppler 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/CHANGELOG.md +19 -0
  2. package/package.json +21 -36
  3. package/src/browser/browser-converter.js +5 -0
  4. package/src/client/doppler-registry.json +1 -17
  5. package/src/config/kernel-path-loader.d.ts +5 -0
  6. package/src/config/kernel-path-loader.js +13 -0
  7. package/src/config/kernels/registry.json +74 -0
  8. package/src/config/loader.js +3 -0
  9. package/src/config/merge-contract-check.js +7 -0
  10. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  11. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  12. package/src/config/presets/kernel-paths/registry.json +14 -0
  13. package/src/config/presets/models/gemma2.json +2 -1
  14. package/src/config/presets/models/gemma3.json +2 -0
  15. package/src/config/presets/models/qwen3.json +4 -3
  16. package/src/config/presets/models/qwen3_5.json +16 -0
  17. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  18. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  19. package/src/config/schema/conversion.schema.d.ts +1 -0
  20. package/src/config/schema/manifest.schema.d.ts +1 -1
  21. package/src/config/schema/manifest.schema.js +1 -1
  22. package/src/config/schema/storage.schema.js +1 -1
  23. package/src/converter/conversion-plan.js +10 -2
  24. package/src/converter/core.js +2 -0
  25. package/src/converter/manifest-inference.js +12 -22
  26. package/src/converter/parsers/transformer.js +4 -0
  27. package/src/converter/quantization-info.js +5 -1
  28. package/src/converter/quantizer.js +19 -12
  29. package/src/converter/rope-config.js +8 -6
  30. package/src/converter/tokenizer-utils.d.ts +1 -0
  31. package/src/converter/tokenizer-utils.js +4 -1
  32. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  33. package/src/distribution/shard-delivery.js +6 -1
  34. package/src/formats/rdrr/parsing.d.ts +4 -0
  35. package/src/formats/rdrr/parsing.js +14 -1
  36. package/src/gpu/kernels/index.d.ts +8 -0
  37. package/src/gpu/kernels/index.js +6 -0
  38. package/src/gpu/kernels/matmul-selection.js +47 -4
  39. package/src/gpu/kernels/matmul.d.ts +2 -0
  40. package/src/gpu/kernels/matmul.js +1 -1
  41. package/src/gpu/kernels/rmsnorm.js +9 -2
  42. package/src/gpu/kernels/split_qg.d.ts +50 -0
  43. package/src/gpu/kernels/split_qg.js +46 -0
  44. package/src/gpu/kernels/split_qg.wgsl +58 -0
  45. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  46. package/src/gpu/weight-buffer.d.ts +1 -1
  47. package/src/gpu/weight-buffer.js +1 -1
  48. package/src/inference/browser-harness.d.ts +2 -0
  49. package/src/inference/browser-harness.js +20 -1
  50. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  51. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
  52. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  53. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  54. package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
  55. package/src/inference/pipelines/text/attention/projections.js +41 -11
  56. package/src/inference/pipelines/text/attention/record.js +15 -6
  57. package/src/inference/pipelines/text/attention/run.js +50 -6
  58. package/src/inference/pipelines/text/config.js +14 -0
  59. package/src/inference/pipelines/text/execution-plan.js +5 -4
  60. package/src/inference/pipelines/text/generator-runtime.js +5 -0
  61. package/src/inference/pipelines/text/generator-steps.d.ts +6 -0
  62. package/src/inference/pipelines/text/generator-steps.js +43 -15
  63. package/src/inference/pipelines/text/generator.js +50 -17
  64. package/src/inference/pipelines/text/init.d.ts +13 -0
  65. package/src/inference/pipelines/text/init.js +16 -5
  66. package/src/inference/pipelines/text/layer.js +1 -0
  67. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  68. package/src/inference/pipelines/text/linear-attention.js +33 -3
  69. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  70. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  71. package/src/inference/pipelines/text/logits/index.js +3 -1
  72. package/src/inference/pipelines/text/model-load.js +3 -0
  73. package/src/inference/pipelines/text/sampling.js +52 -6
  74. package/src/inference/test-harness.js +2 -2
  75. package/src/loader/final-weights-loader.js +2 -0
  76. package/src/loader/shard-cache.js +3 -2
  77. package/src/loader/tensors/tensor-loader.js +6 -1
  78. package/src/rules/inference/dtype.rules.json +5 -0
  79. package/src/rules/inference/kernel-path.rules.json +2 -2
  80. package/src/rules/kernels/split-qg.rules.json +6 -0
  81. package/src/rules/rule-registry.js +2 -0
  82. package/src/storage/downloader.js +2 -1
  83. package/src/storage/shard-manager.js +4 -3
  84. package/src/tooling/conversion-config-materializer.js +3 -5
  85. package/src/tooling/node-converter.js +3 -0
  86. package/src/tooling/node-source-runtime.js +36 -0
  87. package/src/types/model.d.ts +5 -0
  88. package/tools/doppler-cli.js +6 -1
package/src/tooling/conversion-config-materializer.js +3 -5
@@ -2,6 +2,7 @@ import path from 'node:path';
2
2
 
3
3
  import { createConverterConfig } from '../config/schema/index.js';
4
4
  import { resolveConversionPlan } from '../converter/conversion-plan.js';
5
+ import { normalizeQuantTag } from '../converter/quantization-info.js';
5
6
 
6
7
  function toSafeString(value) {
7
8
  if (typeof value !== 'string') return '';
@@ -10,10 +11,7 @@ function toSafeString(value) {
10
11
  }
11
12
 
12
13
  function normalizeQuantizationTag(value) {
13
- const raw = toSafeString(value).toUpperCase();
14
- if (!raw) return 'f16';
15
- if (raw === 'Q4_K_M' || raw === 'Q4_K') return 'q4k';
16
- return raw.toLowerCase();
14
+ return normalizeQuantTag(toSafeString(value));
17
15
  }
18
16
 
19
17
  function resolveArchitectureHint(architecture) {
@@ -37,7 +35,7 @@ function extractSourceQuantization(manifest) {
37
35
  if (explicitWeights) return explicitWeights;
38
36
  const explicitQuant = toSafeString(manifest?.quantization);
39
37
  if (explicitQuant) return explicitQuant;
40
- return 'f16';
38
+ return normalizeQuantTag(null);
41
39
  }
42
40
 
43
41
  function buildRefreshRawConfig(manifest) {
package/src/tooling/node-converter.js +3 -0
@@ -875,6 +875,7 @@ export async function convertSafetensorsDirectory(options) {
875
875
  let sourceQuantization = null;
876
876
  let tokenizerJson = null;
877
877
  let tokenizerConfig = null;
878
+ let generationConfig = null;
878
879
  let hasTokenizerModel = false;
879
880
  let tokenizerModelPath = null;
880
881
  let diffusionAuxFiles = [];
@@ -1101,6 +1102,7 @@ export async function convertSafetensorsDirectory(options) {
1101
1102
  },
1102
1103
  });
1103
1104
  config = parsedTransformer.config;
1105
+ generationConfig = parsedTransformer.generationConfig ?? null;
1104
1106
  tensors = parsedTransformer.tensors;
1105
1107
  architectureHint = parsedTransformer.architectureHint;
1106
1108
  architecture = extractArchitecture(config, null);
@@ -1169,6 +1171,7 @@ export async function convertSafetensorsDirectory(options) {
1169
1171
  quantization: targetQuantization,
1170
1172
  tokenizerJson,
1171
1173
  tokenizerConfig,
1174
+ generationConfig,
1172
1175
  tokenizerModel: hasTokenizerModel ? 'tokenizer.model' : null,
1173
1176
  };
1174
1177
 
package/src/tooling/node-source-runtime.js +36 -0
@@ -411,6 +411,39 @@ function buildNodeFileReaders() {
411
411
  };
412
412
  }
413
413
 
414
+ // Source dtype → compute precision mapping for source-runtime inference.
415
+ // BF16/F32 sources require f32 compute (BF16 has no native WebGPU support).
416
+ // Quantized formats require f32 compute for dequantization accuracy.
417
+ // F16 sources can use f16 compute directly.
418
+ const SOURCE_QUANT_COMPUTE_MAP = {
419
+ 'F16': 'f16',
420
+ 'BF16': 'f32',
421
+ 'F32': 'f32',
422
+ 'Q4_K': 'f32',
423
+ 'Q4_K_M': 'f32',
424
+ 'Q6_K': 'f32',
425
+ };
426
+ const SOURCE_COMPUTE_DEFAULT = 'f16';
427
+
428
+ function resolveSourceRuntimeComputePrecision(tensors, sourceQuantization) {
429
+ const dtypes = new Set();
430
+ for (const tensor of Array.isArray(tensors) ? tensors : []) {
431
+ const dtype = String(tensor?.dtype || '').trim().toUpperCase();
432
+ if (dtype) {
433
+ dtypes.add(dtype);
434
+ }
435
+ }
436
+ // If any tensor requires f32 compute, use f32 for all.
437
+ for (const dtype of dtypes) {
438
+ if (SOURCE_QUANT_COMPUTE_MAP[dtype] === 'f32') {
439
+ return 'f32';
440
+ }
441
+ }
442
+
443
+ const normalized = String(sourceQuantization || '').trim().toUpperCase();
444
+ return SOURCE_QUANT_COMPUTE_MAP[normalized] ?? SOURCE_COMPUTE_DEFAULT;
445
+ }
446
+
414
447
  async function addHashesToFileEntries(entries, hashAlgorithm) {
415
448
  const normalized = [];
416
449
  for (const entry of Array.isArray(entries) ? entries : []) {
@@ -473,6 +506,9 @@ export async function resolveNodeSourceRuntimeBundle(options = {}) {
473
506
  assertSupportedSourceDtypes(parsed.tensors, parsed.sourceKind);
474
507
 
475
508
  const converterConfig = createConverterConfig({
509
+ quantization: {
510
+ computePrecision: resolveSourceRuntimeComputePrecision(parsed.tensors, parsed.sourceQuantization),
511
+ },
476
512
  output: {
477
513
  modelBaseId: options.modelId || null,
478
514
  },
package/src/types/model.d.ts +5 -0
@@ -9,7 +9,11 @@ export type ModelArchitecture =
9
9
  | 'gemma'
10
10
  | 'gemma2'
11
11
  | 'gemma3'
12
+ | 'embeddinggemma'
12
13
  | 'functiongemma'
14
+ | 'janus_text'
15
+ | 'lfm2'
16
+ | 'modernbert'
13
17
  | 'qwen2'
14
18
  | 'qwen3'
15
19
  | 'phi3'
@@ -19,6 +23,7 @@ export type ModelArchitecture =
19
23
  | 'deepseek'
20
24
  | 'mamba'
21
25
  | 'kimi_k2'
26
+ | 'translategemma'
22
27
  | 'transformer';
23
28
 
24
29
  /** Attention type variants */
package/tools/doppler-cli.js +6 -1
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env node
2
2
 
3
+ import { existsSync } from 'node:fs';
3
4
  import fs from 'node:fs/promises';
4
5
  import path from 'node:path';
5
6
  import { fileURLToPath, pathToFileURL } from 'node:url';
@@ -13,7 +14,8 @@ import { createToolingErrorEnvelope } from '../src/tooling/command-envelope.js';
13
14
 
14
15
  const NODE_WEBGPU_INCOMPLETE_MESSAGE = 'node command: WebGPU runtime is incomplete in Node';
15
16
  const CLI_POLICY_PATH = fileURLToPath(new URL('./configs/cli/doppler-cli-policy.json', import.meta.url));
16
- const DEFAULT_EXTERNAL_MODELS_ROOT = process.env.DOPPLER_EXTERNAL_MODELS_ROOT || '/media/x/models';
17
+ const DEFAULT_EXTERNAL_MODELS_ROOT = process.env.DOPPLER_EXTERNAL_MODELS_ROOT
18
+ || (existsSync('/Volumes/models') ? '/Volumes/models' : '/media/x/models');
17
19
  const DEFAULT_EXTERNAL_RDRR_ROOT = path.join(DEFAULT_EXTERNAL_MODELS_ROOT, 'rdrr');
18
20
  const DEFAULT_CLI_POLICY = {
19
21
  defaults: {
@@ -1260,6 +1262,9 @@ function printMetricsSummary(result) {
1260
1262
  `prefill=${formatNumber(metrics.prefillTokensPerSec)} ` +
1261
1263
  `decode=${formatNumber(metrics.decodeTokensPerSec)}`
1262
1264
  );
1265
+ if (typeof result.output === 'string' && result.output.length > 0) {
1266
+ console.log(`[output] ${quoteOneLine(result.output)}`);
1267
+ }
1263
1268
  printExecutionContractSummary(result);
1264
1269
  printExecutionV0GraphSummary(metrics.executionV0GraphContractArtifact);
1265
1270
  return;