@simulatte/doppler 0.1.8 → 0.1.9

Files changed (116)
  1. package/CHANGELOG.md +14 -1
  2. package/README.md +25 -6
  3. package/package.json +5 -3
  4. package/src/client/doppler-api.browser.js +6 -0
  5. package/src/client/doppler-api.d.ts +3 -0
  6. package/src/client/doppler-api.js +11 -2
  7. package/src/client/doppler-registry.js +3 -5
  8. package/src/client/doppler-registry.json +16 -0
  9. package/src/config/kernels/kernel-ref-digests.js +23 -21
  10. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  11. package/src/config/loader.js +6 -0
  12. package/src/config/platforms/loader.js +3 -1
  13. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  14. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  15. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  16. package/src/config/presets/kernel-paths/registry.json +7 -0
  17. package/src/config/presets/models/gemma3.json +2 -1
  18. package/src/config/presets/models/gemma4.json +61 -0
  19. package/src/config/presets/models/granite-docling.json +70 -0
  20. package/src/config/presets/models/lfm2.json +6 -1
  21. package/src/config/presets/models/qwen3_vl.json +40 -0
  22. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  23. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  24. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  25. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  26. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  27. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  28. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  29. package/src/config/runtime.js +3 -0
  30. package/src/config/schema/debug.schema.d.ts +40 -0
  31. package/src/config/schema/debug.schema.js +28 -0
  32. package/src/config/schema/index.js +2 -0
  33. package/src/config/schema/inference-defaults.schema.js +1 -1
  34. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  35. package/src/config/schema/memory-limits.schema.js +2 -2
  36. package/src/config/schema/storage.schema.js +1 -1
  37. package/src/converter/conversion-plan.js +1 -1
  38. package/src/converter/core.js +17 -8
  39. package/src/converter/quantizer.d.ts +5 -0
  40. package/src/converter/quantizer.js +15 -0
  41. package/src/distribution/shard-delivery.js +34 -0
  42. package/src/formats/rdrr/classification.js +32 -0
  43. package/src/gpu/kernel-runtime.js +4 -2
  44. package/src/gpu/kernels/attention.js +2 -1
  45. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  46. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  47. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  48. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  49. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  50. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  51. package/src/gpu/kernels/gated-short-conv.js +284 -0
  52. package/src/gpu/kernels/linear-attention-core.js +37 -17
  53. package/src/gpu/kernels/matmul-selection.js +1 -0
  54. package/src/gpu/kernels/matmul.d.ts +3 -0
  55. package/src/gpu/kernels/matmul.js +70 -1
  56. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  57. package/src/gpu/kernels/sample.js +1 -3
  58. package/src/gpu/kernels/sample.wgsl +39 -9
  59. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  60. package/src/gpu/kernels/shader-cache.js +9 -4
  61. package/src/inference/kv-cache/base.js +3 -10
  62. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  63. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
  64. package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
  65. package/src/inference/pipelines/text/attention/projections.js +13 -2
  66. package/src/inference/pipelines/text/attention/record.js +1 -0
  67. package/src/inference/pipelines/text/attention/run.js +9 -0
  68. package/src/inference/pipelines/text/config.d.ts +1 -0
  69. package/src/inference/pipelines/text/config.js +32 -4
  70. package/src/inference/pipelines/text/embed.js +26 -7
  71. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  72. package/src/inference/pipelines/text/execution-v0.js +12 -1
  73. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  74. package/src/inference/pipelines/text/generator-runtime.js +14 -0
  75. package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
  76. package/src/inference/pipelines/text/generator-steps.js +46 -29
  77. package/src/inference/pipelines/text/generator.d.ts +5 -0
  78. package/src/inference/pipelines/text/generator.js +320 -166
  79. package/src/inference/pipelines/text/init.d.ts +2 -0
  80. package/src/inference/pipelines/text/init.js +19 -5
  81. package/src/inference/pipelines/text/layer.js +37 -8
  82. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  83. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  84. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  85. package/src/inference/pipelines/text/ops.js +123 -53
  86. package/src/inference/pipelines/text/probes.js +1 -0
  87. package/src/inference/pipelines/text/state.js +2 -0
  88. package/src/inference/pipelines/text.d.ts +5 -0
  89. package/src/inference/pipelines/text.js +59 -1
  90. package/src/inference/pipelines/vision/encoder.js +386 -0
  91. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  92. package/src/inference/pipelines/vision/index.js +173 -0
  93. package/src/inference/pipelines/vision/ops.js +78 -0
  94. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  95. package/src/inference/test-harness.js +9 -7
  96. package/src/loader/doppler-loader.d.ts +3 -0
  97. package/src/loader/doppler-loader.js +20 -3
  98. package/src/loader/experts/expert-cache.js +6 -2
  99. package/src/loader/experts/expert-loader.js +6 -2
  100. package/src/loader/layer-loader.js +42 -3
  101. package/src/loader/manifest-config.js +3 -1
  102. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  103. package/src/loader/tensors/tensor-loader.js +124 -3
  104. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  105. package/src/rules/kernels/softmax.rules.json +2 -0
  106. package/src/rules/rule-registry.d.ts +1 -0
  107. package/src/rules/rule-registry.js +2 -0
  108. package/src/storage/quickstart-downloader.d.ts +3 -0
  109. package/src/storage/quickstart-downloader.js +27 -30
  110. package/src/tooling/node-converter.js +25 -7
  111. package/src/tooling/node-source-runtime.js +29 -5
  112. package/src/tooling/node-webgpu.js +24 -7
  113. package/src/utils/hf-resolve-url.d.ts +16 -0
  114. package/src/utils/hf-resolve-url.js +17 -0
  115. package/src/version.js +1 -1
  116. package/src/tooling/node-convert.d.ts +0 -54
@@ -1,6 +1,6 @@
 import { getDevice, setTrackSubmits } from '../../../gpu/device.js';
 import { releaseBuffer, readBuffer } from '../../../memory/buffer-pool.js';
-import { runArgmax, runGPUSample, recordArgmax, recordGPUSample, isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
+import { recordArgmax, recordGPUSample, isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
 import { recordCheckStop } from '../../../gpu/kernels/check-stop.js';
 import { resetSubmitStats, logSubmitStats } from '../../../gpu/submit-tracker.js';
 import { createCommandRecorder, createProfilingRecorder, CommandRecorder } from '../../../gpu/command-recorder.js';
@@ -20,6 +20,7 @@ import { decodeReadback } from './debug-utils/index.js';
 import { getFinalNormWeights, extractEmbeddingFromHidden } from './generator-runtime.js';
 import { parseFinitenessStatusWords } from './finiteness-guard-status.js';
 import { hasLinearAttentionLayers } from './linear-attention.js';
+import { hasConvLayers } from './layer.js';
 
 const UNKNOWN_TOKEN_TEXT = '<unknown>';
 
@@ -91,6 +92,13 @@ export function shouldUseBatchDecode(config) {
   return isBatchDecodeEnabled(config);
 }
 
+export function shouldUseFusedDecodeSampling(config) {
+  return config.recorderEnabled === true
+    && config.gpuSamplingEnabled === true
+    && config.fusedDecodeDisabled !== true
+    && !hasConvLayers(config.layerTypes ?? []);
+}
+
 export function resolveBatchStop(tokens, stopFlags, stopTokenIds, eosTokenId) {
   let actualCount = tokens.length;
   if (stopFlags) {
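
Note: the new predicate centralizes the conditions the old inline check expressed. A minimal usage sketch follows; the module path and the 'conv' layer-type string are assumptions not confirmed by this diff.

// Sketch only: import path and the 'conv' layer-type string are assumptions.
import { shouldUseFusedDecodeSampling } from './generator-steps.js';

const useFused = shouldUseFusedDecodeSampling({
  recorderEnabled: true,       // a command recorder is active
  gpuSamplingEnabled: true,    // GPU sampling kernels are available
  fusedDecodeDisabled: false,  // fused decode has not been force-disabled
  layerTypes: ['attention', 'conv'],
});
// If hasConvLayers treats 'conv' as a conv layer, useFused is false here;
// with layerTypes: ['attention'] alone it would be true.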
@@ -403,7 +411,12 @@ export async function decodeStep(state, currentIds, opts, helpers) {
   const padTokenId = state.tokenizer?.getSpecialTokens?.()?.pad ?? null;
   const lmHeadIsCpu = isCpuWeightBuffer(state.weights.get('lm_head'));
   const useGPUSampling = state.useGPU && isGPUSamplingAvailable() && !lmHeadIsCpu;
-  const useFusedDecode = recorder && useGPUSampling && !state.disableFusedDecode;
+  const useFusedDecode = shouldUseFusedDecodeSampling({
+    recorderEnabled: Boolean(recorder),
+    gpuSamplingEnabled: useGPUSampling,
+    fusedDecodeDisabled: state.disableFusedDecode,
+    layerTypes: config.layerTypes,
+  });
 
   if (useFusedDecode) {
     const ring = state.decodeRing;
@@ -631,36 +644,35 @@ export async function decodeStep(state, currentIds, opts, helpers) {
     );
     if (logitsResult) {
       const { logitsBuffer, vocabSize, logitsDtype } = logitsResult;
+      const logitsBytes = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: logitsDtype });
+      const logitsData = await readBuffer(logitsBuffer, numTokens * vocabSize * logitsBytes);
+      releaseBuffer(logitsBuffer);
 
-      const nextToken = opts.temperature < samplingDefaults.greedyThreshold
-        ? await runArgmax(logitsBuffer, vocabSize, { padTokenId, logitSoftcap, logitsDtype, outputIndex: 0 })
-        : await runGPUSample(logitsBuffer, vocabSize, {
-          temperature: opts.temperature,
-          topK: opts.topK,
-          padTokenId,
-          logitSoftcap,
-          logitsDtype,
-          outputIndex: 0,
-          greedyThreshold: samplingDefaults.greedyThreshold,
-          randomSeed: opts.seed,
-        });
+      const rawLogits = decodeReadback(logitsData, logitsDtype);
+      const finalizedLogits = await finalizeLogits(
+        rawLogits,
+        numTokens,
+        vocabSize,
+        config.vocabSize,
+        config,
+        state.runtimeConfig.shared.debug.probes
+      );
+      const sampledLogits = extractLastPositionLogits(finalizedLogits, numTokens, config.vocabSize);
 
-      releaseBuffer(logitsBuffer);
-      const invalidGpuToken = nextToken >= config.vocabSize
-        || (padTokenId != null && nextToken === padTokenId)
-        || (padTokenId == null && nextToken === 0);
-      if (!invalidGpuToken) {
-        if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
-          releaseBuffer(hiddenStates);
-        }
-        state.currentSeqLen++;
-        return nextToken;
+      applyRepetitionPenalty(sampledLogits, currentIds, opts.repetitionPenalty);
+      const nextToken = sample(sampledLogits, {
+        temperature: opts.temperature,
+        topP: opts.topP,
+        topK: opts.topK,
+        padTokenId,
+        seed: opts.seed,
+      });
+
+      if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
+        releaseBuffer(hiddenStates);
       }
-      state.disableFusedDecode = true;
-      log.warn(
-        'Decode',
-        `GPU sampling produced invalid token ${nextToken} (vocabSize=${config.vocabSize}, step=${state.decodeStepCount}); falling back to CPU sampling.`
-      );
+      state.currentSeqLen++;
+      return nextToken;
     }
   }
 
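Note: the new path reads numTokens × vocabSize × bytes-per-element back from the GPU each step before sampling on the CPU. A quick sketch of that arithmetic; the dtype byte widths and the vocabulary size are illustrative assumptions, not values from this diff.

// Sketch only: dtype widths and vocabSize are illustrative assumptions.
const bytesFromDtype = { f32: 4, f16: 2 };
const numTokens = 1;                      // single decode step
const vocabSize = 262144;                 // illustrative vocabulary size
const logitsBytes = bytesFromDtype.f16;   // 2 bytes per logit
const readbackBytes = numTokens * vocabSize * logitsBytes;
console.log(readbackBytes);               // 524288 bytes (~512 KiB) per step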
@@ -887,6 +899,11 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
       '[Pipeline] Batch decode path is disabled for linear_attention models; use single-token decode.'
     );
   }
+  if (hasConvLayers(config.layerTypes)) {
+    throw new Error(
+      '[Pipeline] Batch decode path is disabled for conv models; use single-token decode.'
+    );
+  }
   const samplingDefaults = state.runtimeConfig.inference.sampling;
   const executionPlan = opts.executionPlan;
   const batchSize = executionPlan?.batchSize ?? opts.batchSize ?? state.runtimeConfig.inference.batching.batchSize;
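Note: callers choosing between batch and single-token decode can check the same layer-type predicates up front instead of catching the throw. A small sketch; the import paths mirror the hunks above and are assumptions about where these helpers live.

// Sketch only: import paths are assumptions based on the imports shown above.
import { hasLinearAttentionLayers } from './linear-attention.js';
import { hasConvLayers } from './layer.js';

function canBatchDecode(layerTypes) {
  // Both linear-attention and conv models are restricted to single-token decode.
  return !hasLinearAttentionLayers(layerTypes) && !hasConvLayers(layerTypes);
}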
@@ -27,6 +27,11 @@ export declare class PipelineGenerator {
    * Batching and readback cadence are controlled by runtime.inference.batching.
    */
   generate(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<string, void, void>;
+  generateTokens(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<number, void, void>;
+  generateTokenIds(
+    prompt: PromptInput,
+    options?: GenerateOptions
+  ): Promise<{ tokenIds: number[]; stats: import('./types.js').PipelineStats }>;
   prefillKVOnly(prompt: PromptInput, options?: GenerateOptions): Promise<KVCacheSnapshot>;
   prefillWithEmbedding(prompt: PromptInput, options?: GenerateOptions): Promise<PrefillEmbeddingResult>;
   prefillWithLogits(prompt: PromptInput, options?: GenerateOptions): Promise<PrefillResult>;
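
Note: the two new declarations add token-level access alongside the existing string generator. A hedged usage sketch; obtainGenerator() stands in for whatever constructs a PipelineGenerator (not shown in this diff), and maxTokens is an assumed GenerateOptions field.

// Sketch only: obtainGenerator() and the maxTokens option are assumptions.
const generator = await obtainGenerator();

// Stream raw token ids instead of decoded text:
for await (const tokenId of generator.generateTokens('Hello', { maxTokens: 16 })) {
  console.log(tokenId);
}

// Or collect all ids at once, together with pipeline stats:
const { tokenIds, stats } = await generator.generateTokenIds('Hello', { maxTokens: 16 });
console.log(tokenIds.length, stats);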