@simulatte/doppler 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (392) hide show
  1. package/CHANGELOG.md +126 -0
  2. package/README.md +25 -17
  3. package/package.json +20 -4
  4. package/src/adapters/adapter-registry.js +12 -1
  5. package/src/adapters/lora-loader.js +23 -6
  6. package/src/bridge/extension-client.d.ts +5 -0
  7. package/src/bridge/extension-client.js +40 -0
  8. package/src/bridge/index.d.ts +2 -1
  9. package/src/bridge/index.js +6 -4
  10. package/src/browser/browser-converter.js +26 -1
  11. package/src/browser/file-picker.js +6 -0
  12. package/src/browser/safetensors-parser-browser.js +84 -1
  13. package/src/browser/shard-io-browser.js +2 -2
  14. package/src/browser/tensor-source-download.js +8 -2
  15. package/src/browser/tensor-source-http.d.ts +1 -0
  16. package/src/browser/tensor-source-http.js +5 -1
  17. package/src/client/doppler-api.browser.js +20 -4
  18. package/src/client/doppler-api.js +19 -3
  19. package/src/client/doppler-provider/generation.js +12 -0
  20. package/src/client/doppler-provider/model-manager.d.ts +10 -0
  21. package/src/client/doppler-provider/model-manager.js +91 -19
  22. package/src/client/doppler-provider/source-runtime.d.ts +2 -1
  23. package/src/client/doppler-provider/source-runtime.js +132 -13
  24. package/src/client/doppler-registry.json +8 -7
  25. package/src/config/backward-registry-loader.js +17 -2
  26. package/src/config/execution-v0-contract-check.js +113 -15
  27. package/src/config/kernel-path-contract-check.js +57 -29
  28. package/src/config/kernel-path-loader.js +5 -36
  29. package/src/config/kernels/kernel-ref-digests.js +39 -39
  30. package/src/config/kernels/registry.js +14 -1
  31. package/src/config/kernels/registry.json +49 -7
  32. package/src/config/loader.d.ts +1 -1
  33. package/src/config/loader.js +43 -4
  34. package/src/config/merge-contract-check.js +59 -4
  35. package/src/config/merge-helpers.js +128 -7
  36. package/src/config/merge.d.ts +1 -0
  37. package/src/config/merge.js +28 -0
  38. package/src/config/param-validator.js +47 -2
  39. package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
  40. package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
  41. package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
  42. package/src/config/presets/kernel-paths/registry.json +29 -8
  43. package/src/config/presets/models/gemma2.json +2 -2
  44. package/src/config/presets/models/qwen3.json +9 -2
  45. package/src/config/presets/models/transformer.json +5 -0
  46. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
  47. package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
  48. package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
  49. package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
  50. package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
  51. package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
  52. package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
  53. package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
  54. package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
  55. package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
  56. package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
  57. package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
  58. package/src/config/required-inference-fields-contract-check.js +6 -0
  59. package/src/config/runtime.js +6 -1
  60. package/src/config/schema/debug.schema.d.ts +5 -0
  61. package/src/config/schema/doppler.schema.js +16 -21
  62. package/src/config/schema/inference-defaults.schema.js +6 -3
  63. package/src/config/schema/inference.schema.d.ts +9 -0
  64. package/src/config/schema/kernel-path.schema.d.ts +11 -1
  65. package/src/config/schema/kernel-thresholds.schema.js +12 -4
  66. package/src/config/schema/manifest.schema.d.ts +8 -1
  67. package/src/config/schema/manifest.schema.js +19 -3
  68. package/src/config/training-defaults.js +30 -22
  69. package/src/converter/conversion-plan.js +94 -9
  70. package/src/converter/core.d.ts +7 -0
  71. package/src/converter/core.js +14 -9
  72. package/src/converter/execution-v0-manifest.js +4 -1
  73. package/src/converter/index.d.ts +1 -0
  74. package/src/converter/index.js +1 -0
  75. package/src/converter/manifest-inference.js +43 -12
  76. package/src/converter/parsers/diffusion.js +0 -3
  77. package/src/converter/quantization-info.js +35 -15
  78. package/src/converter/rope-config.js +42 -0
  79. package/src/converter/shard-packer.d.ts +1 -1
  80. package/src/converter/shard-packer.js +4 -1
  81. package/src/debug/config.js +123 -11
  82. package/src/debug/signals.js +7 -1
  83. package/src/debug/tensor.d.ts +2 -0
  84. package/src/debug/tensor.js +13 -2
  85. package/src/distribution/p2p-control-plane.js +52 -12
  86. package/src/distribution/p2p-observability.js +43 -7
  87. package/src/distribution/p2p-webrtc-browser.js +20 -0
  88. package/src/distribution/shard-delivery.js +77 -26
  89. package/src/formats/gguf/types.js +33 -16
  90. package/src/formats/rdrr/groups.d.ts +12 -4
  91. package/src/formats/rdrr/groups.js +3 -6
  92. package/src/formats/rdrr/parsing.js +39 -2
  93. package/src/formats/rdrr/types.d.ts +2 -1
  94. package/src/gpu/command-recorder.js +86 -61
  95. package/src/gpu/device.d.ts +1 -0
  96. package/src/gpu/device.js +131 -19
  97. package/src/gpu/kernel-tuner/benchmarks.js +326 -316
  98. package/src/gpu/kernel-tuner/cache.js +71 -4
  99. package/src/gpu/kernel-tuner/tuner.js +22 -4
  100. package/src/gpu/kernels/attention.js +113 -34
  101. package/src/gpu/kernels/backward/adam.js +62 -58
  102. package/src/gpu/kernels/backward/attention_backward.js +257 -169
  103. package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
  104. package/src/gpu/kernels/bias_add.wgsl +8 -6
  105. package/src/gpu/kernels/bias_add_f16.wgsl +8 -5
  106. package/src/gpu/kernels/cast.js +191 -149
  107. package/src/gpu/kernels/check-stop.js +33 -44
  108. package/src/gpu/kernels/conv2d.js +27 -17
  109. package/src/gpu/kernels/conv2d.wgsl +7 -8
  110. package/src/gpu/kernels/conv2d_f16.wgsl +7 -8
  111. package/src/gpu/kernels/cross_entropy_loss.js +21 -15
  112. package/src/gpu/kernels/depthwise_conv2d.js +37 -26
  113. package/src/gpu/kernels/depthwise_conv2d.wgsl +6 -9
  114. package/src/gpu/kernels/depthwise_conv2d_f16.wgsl +6 -9
  115. package/src/gpu/kernels/dequant.js +178 -126
  116. package/src/gpu/kernels/energy.d.ts +3 -21
  117. package/src/gpu/kernels/energy.js +111 -88
  118. package/src/gpu/kernels/feature-check.js +1 -1
  119. package/src/gpu/kernels/fused_ffn.js +84 -65
  120. package/src/gpu/kernels/fused_matmul_residual.js +56 -33
  121. package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
  122. package/src/gpu/kernels/gather.js +33 -15
  123. package/src/gpu/kernels/gelu.js +19 -11
  124. package/src/gpu/kernels/grouped_pointwise_conv2d.js +34 -23
  125. package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl +6 -9
  126. package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl +6 -9
  127. package/src/gpu/kernels/groupnorm.js +34 -23
  128. package/src/gpu/kernels/kv-quantize.js +5 -2
  129. package/src/gpu/kernels/layernorm.js +35 -19
  130. package/src/gpu/kernels/logit-merge.js +5 -3
  131. package/src/gpu/kernels/matmul.js +83 -39
  132. package/src/gpu/kernels/modulate.js +23 -15
  133. package/src/gpu/kernels/moe.js +221 -175
  134. package/src/gpu/kernels/pixel_shuffle.js +22 -14
  135. package/src/gpu/kernels/pixel_shuffle.wgsl +4 -5
  136. package/src/gpu/kernels/pixel_shuffle_f16.wgsl +4 -5
  137. package/src/gpu/kernels/relu.js +31 -10
  138. package/src/gpu/kernels/relu.wgsl +2 -1
  139. package/src/gpu/kernels/relu_f16.wgsl +2 -1
  140. package/src/gpu/kernels/repeat_channels.js +25 -17
  141. package/src/gpu/kernels/repeat_channels.wgsl +4 -5
  142. package/src/gpu/kernels/repeat_channels_f16.wgsl +4 -5
  143. package/src/gpu/kernels/residual.js +69 -23
  144. package/src/gpu/kernels/residual.wgsl +6 -3
  145. package/src/gpu/kernels/residual_f16.wgsl +2 -1
  146. package/src/gpu/kernels/residual_f16_vec4.wgsl +2 -1
  147. package/src/gpu/kernels/residual_vec4.wgsl +2 -1
  148. package/src/gpu/kernels/rmsnorm.js +96 -28
  149. package/src/gpu/kernels/rmsnorm.wgsl +14 -6
  150. package/src/gpu/kernels/rmsnorm_f16.wgsl +10 -2
  151. package/src/gpu/kernels/rope.d.ts +2 -0
  152. package/src/gpu/kernels/rope.js +14 -1
  153. package/src/gpu/kernels/rope.wgsl +56 -40
  154. package/src/gpu/kernels/sample.js +27 -38
  155. package/src/gpu/kernels/sana_linear_attention.js +19 -12
  156. package/src/gpu/kernels/sana_linear_attention_apply.wgsl +4 -5
  157. package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl +4 -5
  158. package/src/gpu/kernels/sana_linear_attention_summary.wgsl +4 -0
  159. package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl +4 -0
  160. package/src/gpu/kernels/scale.js +18 -11
  161. package/src/gpu/kernels/shader-cache.js +4 -2
  162. package/src/gpu/kernels/silu.d.ts +1 -0
  163. package/src/gpu/kernels/silu.js +148 -82
  164. package/src/gpu/kernels/silu.wgsl +19 -9
  165. package/src/gpu/kernels/silu_f16.wgsl +19 -9
  166. package/src/gpu/kernels/softmax.js +44 -25
  167. package/src/gpu/kernels/split_qkv.js +23 -13
  168. package/src/gpu/kernels/transpose.js +31 -10
  169. package/src/gpu/kernels/transpose.wgsl +6 -5
  170. package/src/gpu/kernels/upsample2d.js +22 -13
  171. package/src/gpu/kernels/upsample2d.wgsl +6 -9
  172. package/src/gpu/kernels/upsample2d_f16.wgsl +6 -9
  173. package/src/gpu/kernels/utils.js +35 -13
  174. package/src/gpu/partitioned-buffer-pool.js +10 -2
  175. package/src/gpu/perf-guards.js +2 -9
  176. package/src/gpu/profiler.js +27 -22
  177. package/src/gpu/readback-utils.d.ts +16 -0
  178. package/src/gpu/readback-utils.js +41 -0
  179. package/src/gpu/submit-tracker.js +13 -0
  180. package/src/gpu/uniform-cache.d.ts +1 -0
  181. package/src/gpu/uniform-cache.js +30 -9
  182. package/src/hotswap/intent-bundle.js +6 -0
  183. package/src/hotswap/manifest.d.ts +10 -1
  184. package/src/hotswap/manifest.js +12 -2
  185. package/src/hotswap/runtime.js +30 -8
  186. package/src/index-browser.d.ts +44 -0
  187. package/src/index-browser.js +14 -0
  188. package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
  189. package/src/inference/browser-harness-contract-helpers.js +28 -0
  190. package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
  191. package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
  192. package/src/inference/browser-harness-model-helpers.d.ts +16 -0
  193. package/src/inference/browser-harness-model-helpers.js +217 -0
  194. package/src/inference/browser-harness-report-helpers.d.ts +7 -0
  195. package/src/inference/browser-harness-report-helpers.js +42 -0
  196. package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
  197. package/src/inference/browser-harness-runtime-helpers.js +415 -0
  198. package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
  199. package/src/inference/browser-harness-suite-helpers.js +268 -0
  200. package/src/inference/browser-harness-text-helpers.d.ts +27 -0
  201. package/src/inference/browser-harness-text-helpers.js +788 -0
  202. package/src/inference/browser-harness.d.ts +6 -0
  203. package/src/inference/browser-harness.js +130 -1950
  204. package/src/inference/kv-cache/base.js +140 -94
  205. package/src/inference/kv-cache/tiered.js +5 -3
  206. package/src/inference/moe-router.js +88 -56
  207. package/src/inference/multi-model-network.js +5 -3
  208. package/src/inference/network-evolution.d.ts +11 -2
  209. package/src/inference/network-evolution.js +20 -21
  210. package/src/inference/pipelines/context.d.ts +3 -0
  211. package/src/inference/pipelines/context.js +142 -2
  212. package/src/inference/pipelines/diffusion/helpers.js +7 -2
  213. package/src/inference/pipelines/diffusion/pipeline.js +17 -7
  214. package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
  215. package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +5 -0
  216. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +27 -15
  217. package/src/inference/pipelines/diffusion/vae.js +3 -7
  218. package/src/inference/pipelines/energy/pipeline.js +27 -21
  219. package/src/inference/pipelines/energy/quintel.d.ts +5 -0
  220. package/src/inference/pipelines/energy/quintel.js +11 -0
  221. package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
  222. package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
  223. package/src/inference/pipelines/text/attention/projections.js +151 -101
  224. package/src/inference/pipelines/text/attention/record.js +73 -10
  225. package/src/inference/pipelines/text/attention/run.js +73 -10
  226. package/src/inference/pipelines/text/chat-format.js +25 -1
  227. package/src/inference/pipelines/text/config.d.ts +4 -0
  228. package/src/inference/pipelines/text/config.js +71 -5
  229. package/src/inference/pipelines/text/embed.js +2 -8
  230. package/src/inference/pipelines/text/execution-plan.js +64 -50
  231. package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
  232. package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
  233. package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
  234. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
  235. package/src/inference/pipelines/text/execution-v0.js +78 -1002
  236. package/src/inference/pipelines/text/ffn/standard.js +3 -0
  237. package/src/inference/pipelines/text/generator-steps.d.ts +46 -0
  238. package/src/inference/pipelines/text/generator-steps.js +298 -207
  239. package/src/inference/pipelines/text/generator.js +6 -23
  240. package/src/inference/pipelines/text/init.d.ts +4 -0
  241. package/src/inference/pipelines/text/init.js +134 -29
  242. package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
  243. package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
  244. package/src/inference/pipelines/text/kernel-trace.js +6 -0
  245. package/src/inference/pipelines/text/layer.js +14 -9
  246. package/src/inference/pipelines/text/linear-attention.d.ts +10 -0
  247. package/src/inference/pipelines/text/linear-attention.js +80 -6
  248. package/src/inference/pipelines/text/logits/gpu.js +10 -5
  249. package/src/inference/pipelines/text/logits/index.js +10 -11
  250. package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
  251. package/src/inference/pipelines/text/logits/utils.js +9 -0
  252. package/src/inference/pipelines/text/lora-apply.js +50 -32
  253. package/src/inference/pipelines/text/model-load.js +279 -104
  254. package/src/inference/pipelines/text/moe-cache.js +5 -4
  255. package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
  256. package/src/inference/pipelines/text/moe-cpu.js +42 -38
  257. package/src/inference/pipelines/text/moe-gpu.js +110 -86
  258. package/src/inference/pipelines/text/ops.js +90 -90
  259. package/src/inference/pipelines/text/probes.js +9 -9
  260. package/src/inference/pipelines/text/weights.js +17 -7
  261. package/src/inference/pipelines/text.js +17 -1
  262. package/src/inference/speculative.d.ts +2 -2
  263. package/src/inference/speculative.js +4 -18
  264. package/src/inference/test-harness.d.ts +1 -1
  265. package/src/inference/test-harness.js +15 -5
  266. package/src/inference/tokenizer.d.ts +0 -5
  267. package/src/inference/tokenizer.js +4 -23
  268. package/src/inference/tokenizers/bpe.js +9 -0
  269. package/src/inference/tokenizers/bundled.js +176 -33
  270. package/src/inference/tokenizers/sentencepiece.js +12 -0
  271. package/src/loader/doppler-loader.js +38 -22
  272. package/src/loader/dtype-utils.js +3 -44
  273. package/src/loader/embedding-loader.js +7 -3
  274. package/src/loader/experts/expert-cache.js +13 -6
  275. package/src/loader/experts/expert-loader.js +10 -6
  276. package/src/loader/final-weights-loader.js +8 -4
  277. package/src/loader/layer-loader.js +2 -1
  278. package/src/loader/loader-state.js +2 -2
  279. package/src/loader/memory-monitor.js +8 -0
  280. package/src/loader/multi-model-loader.d.ts +14 -0
  281. package/src/loader/multi-model-loader.js +70 -24
  282. package/src/loader/shard-cache.js +81 -12
  283. package/src/loader/shard-resolver.js +25 -3
  284. package/src/loader/tensors/tensor-loader.js +209 -144
  285. package/src/loader/tensors/tensor-reader.js +76 -19
  286. package/src/loader/weight-downcast.js +1 -1
  287. package/src/memory/buffer-pool.d.ts +9 -1
  288. package/src/memory/buffer-pool.js +109 -44
  289. package/src/memory/unified-detect.js +1 -1
  290. package/src/rules/inference/kernel-path.rules.json +24 -8
  291. package/src/rules/rule-registry.js +25 -1
  292. package/src/rules/tooling/command-runtime.rules.json +18 -0
  293. package/src/storage/backends/opfs-store.js +68 -24
  294. package/src/storage/downloader.js +364 -83
  295. package/src/storage/index.d.ts +3 -0
  296. package/src/storage/index.js +3 -0
  297. package/src/storage/preflight.d.ts +2 -2
  298. package/src/storage/preflight.js +24 -2
  299. package/src/storage/quickstart-downloader.js +11 -5
  300. package/src/storage/registry.js +10 -4
  301. package/src/storage/reports.js +1 -1
  302. package/src/storage/shard-manager.d.ts +15 -1
  303. package/src/storage/shard-manager.js +51 -3
  304. package/src/storage/source-artifact-store.d.ts +52 -0
  305. package/src/storage/source-artifact-store.js +234 -0
  306. package/src/tooling/command-api-constants.d.ts +9 -0
  307. package/src/tooling/command-api-constants.js +9 -0
  308. package/src/tooling/command-api-family-normalizers.d.ts +9 -0
  309. package/src/tooling/command-api-family-normalizers.js +343 -0
  310. package/src/tooling/command-api-helpers.d.ts +25 -0
  311. package/src/tooling/command-api-helpers.js +262 -0
  312. package/src/tooling/command-api.d.ts +27 -1
  313. package/src/tooling/command-api.js +26 -473
  314. package/src/tooling/command-envelope.js +4 -1
  315. package/src/tooling/command-runner-shared.js +52 -18
  316. package/src/tooling/lean-execution-contract.js +150 -3
  317. package/src/tooling/node-browser-command-runner.d.ts +4 -0
  318. package/src/tooling/node-browser-command-runner.js +218 -273
  319. package/src/tooling/node-command-runner.js +44 -3
  320. package/src/tooling/node-converter.js +27 -1
  321. package/src/tooling/node-source-runtime.d.ts +1 -1
  322. package/src/tooling/node-source-runtime.js +84 -3
  323. package/src/tooling/node-webgpu.js +30 -105
  324. package/src/tooling/opfs-cache.js +21 -4
  325. package/src/tooling/runtime-input-composition.d.ts +38 -0
  326. package/src/tooling/runtime-input-composition.js +86 -0
  327. package/src/tooling/source-runtime-bundle.d.ts +40 -5
  328. package/src/tooling/source-runtime-bundle.js +261 -34
  329. package/src/tooling/source-runtime-materializer.d.ts +6 -0
  330. package/src/tooling/source-runtime-materializer.js +93 -0
  331. package/src/training/attention-backward.js +32 -17
  332. package/src/training/autograd.js +80 -52
  333. package/src/training/checkpoint-watch.d.ts +8 -0
  334. package/src/training/checkpoint-watch.js +139 -0
  335. package/src/training/checkpoint.d.ts +6 -1
  336. package/src/training/checkpoint.js +46 -7
  337. package/src/training/clip.js +2 -1
  338. package/src/training/datasets/token-batch.js +20 -8
  339. package/src/training/distillation/artifacts.d.ts +71 -0
  340. package/src/training/distillation/artifacts.js +132 -0
  341. package/src/training/distillation/checkpoint-watch.d.ts +10 -0
  342. package/src/training/distillation/checkpoint-watch.js +58 -0
  343. package/src/training/distillation/dataset.d.ts +59 -0
  344. package/src/training/distillation/dataset.js +337 -0
  345. package/src/training/distillation/eval.d.ts +34 -0
  346. package/src/training/distillation/eval.js +310 -0
  347. package/src/training/distillation/index.d.ts +29 -0
  348. package/src/training/distillation/index.js +29 -0
  349. package/src/training/distillation/runtime.d.ts +20 -0
  350. package/src/training/distillation/runtime.js +121 -0
  351. package/src/training/distillation/scoreboard.d.ts +6 -0
  352. package/src/training/distillation/scoreboard.js +8 -0
  353. package/src/training/distillation/stage-a.d.ts +45 -0
  354. package/src/training/distillation/stage-a.js +338 -0
  355. package/src/training/distillation/stage-b.d.ts +24 -0
  356. package/src/training/distillation/stage-b.js +20 -0
  357. package/src/training/distillation/student-fixture.d.ts +22 -0
  358. package/src/training/distillation/student-fixture.js +846 -0
  359. package/src/training/distillation/suite-data.d.ts +45 -0
  360. package/src/training/distillation/suite-data.js +189 -0
  361. package/src/training/index.d.ts +10 -0
  362. package/src/training/index.js +10 -0
  363. package/src/training/lora-pipeline.d.ts +40 -0
  364. package/src/training/lora-pipeline.js +793 -0
  365. package/src/training/lora.js +26 -12
  366. package/src/training/loss.js +5 -6
  367. package/src/training/objectives/cross_entropy.js +2 -5
  368. package/src/training/objectives/distill_kd.js +4 -8
  369. package/src/training/objectives/distill_triplet.js +4 -8
  370. package/src/training/objectives/ul_stage2_base.js +4 -8
  371. package/src/training/operator-artifacts.d.ts +62 -0
  372. package/src/training/operator-artifacts.js +140 -0
  373. package/src/training/operator-command.d.ts +5 -0
  374. package/src/training/operator-command.js +455 -0
  375. package/src/training/operator-eval.d.ts +48 -0
  376. package/src/training/operator-eval.js +230 -0
  377. package/src/training/operator-scoreboard.d.ts +5 -0
  378. package/src/training/operator-scoreboard.js +44 -0
  379. package/src/training/optimizer.js +19 -7
  380. package/src/training/runner.d.ts +52 -0
  381. package/src/training/runner.js +31 -5
  382. package/src/training/suite.d.ts +112 -0
  383. package/src/training/suite.js +24 -984
  384. package/src/training/tensor-factory.d.ts +9 -0
  385. package/src/training/tensor-factory.js +13 -0
  386. package/src/training/trainer.js +3 -5
  387. package/src/training/ul_dataset.js +3 -5
  388. package/src/training/workloads.d.ts +164 -0
  389. package/src/training/workloads.js +530 -0
  390. package/src/version.js +1 -1
  391. package/tools/convert-safetensors-node.js +22 -16
  392. package/tools/doppler-cli.js +179 -63
@@ -16,6 +16,21 @@ import { selectRuleValue } from '../../rules/rule-registry.js';
16
16
 
17
17
  let loggedF32UpcastNonMatmul = false;
18
18
 
19
+ function isGpuBufferInstance(value) {
20
+ return typeof GPUBuffer !== 'undefined' && value instanceof GPUBuffer;
21
+ }
22
+
23
+ function isReleasableBuffer(value) {
24
+ return typeof value === 'object' && value !== null && 'size' in value;
25
+ }
26
+
27
+ function releaseOwnedGpuBuffer(buffer, owned) {
28
+ if (!owned || !isReleasableBuffer(buffer)) {
29
+ return;
30
+ }
31
+ releaseBuffer(buffer);
32
+ }
33
+
19
34
  function logF32UpcastNonMatmul(name, numElements, bufferSize) {
20
35
  if (loggedF32UpcastNonMatmul) {
21
36
  return;
@@ -152,66 +167,80 @@ export function convertF16ToF32CPU(f16Data) {
152
167
 
153
168
  export async function loadQ4KFused(shardData, location, name) {
154
169
  const device = getDevice();
155
- const buffer = shardData instanceof GPUBuffer
170
+ const ownsBuffer = !isGpuBufferInstance(shardData);
171
+ const buffer = isGpuBufferInstance(shardData)
156
172
  ? shardData
157
173
  : acquireAlignedBuffer(location.size, `q4k_${name}`);
158
- if (!(shardData instanceof GPUBuffer)) {
159
- writeBufferAligned(device, buffer, shardData);
174
+ try {
175
+ if (ownsBuffer) {
176
+ writeBufferAligned(device, buffer, shardData);
177
+ }
178
+ return {
179
+ data: createWeightBuffer(buffer, 'q4k', 'row', location.shape, name),
180
+ allocatedBuffers: [buffer],
181
+ };
182
+ } catch (error) {
183
+ releaseOwnedGpuBuffer(buffer, ownsBuffer);
184
+ throw error;
160
185
  }
161
-
162
- return {
163
- data: createWeightBuffer(buffer, 'q4k', 'row', location.shape, name),
164
- allocatedBuffers: [buffer],
165
- };
166
186
  }
167
187
 
168
188
 
169
189
  export async function loadQ4KDequant(shardData, location, name, config) {
170
190
  const device = getDevice();
171
- const quantBuffer = shardData instanceof GPUBuffer
191
+ let ownsQuantBuffer = !isGpuBufferInstance(shardData);
192
+ const quantBuffer = isGpuBufferInstance(shardData)
172
193
  ? shardData
173
194
  : acquireAlignedBuffer(location.size, `quant_${name}`);
174
- if (!(shardData instanceof GPUBuffer)) {
175
- writeBufferAligned(device, quantBuffer, shardData);
176
- }
177
-
178
- const outputDtype = getQ4KOutputDtype(location, config);
179
-
180
- // Check if this is a 2D matrix with K (columns) not aligned to QK_K (256).
181
- // If so, we need row-wise dequant to produce proper row-major output.
182
- const is2DMatrix = Array.isArray(location.shape) && location.shape.length === 2;
183
- const K = is2DMatrix ? location.shape[1] : 0;
184
- const needsRowwise = is2DMatrix && K > 0 && K % QK_K !== 0;
195
+ let dequantized = null;
196
+ try {
197
+ if (ownsQuantBuffer) {
198
+ writeBufferAligned(device, quantBuffer, shardData);
199
+ }
185
200
 
186
- let dequantizedTensor;
187
- if (needsRowwise) {
188
- const rows = location.shape[0];
189
- debugTrace.loader(
190
- `Dequantizing ${name} (row-wise): [${rows},${K}], K not 256-aligned, ` +
191
- `outputDtype=${outputDtype}`
192
- );
193
- dequantizedTensor = await dequantizeRowwise(quantBuffer, rows, K, { outputDtype });
194
- } else {
195
- const numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
196
- debugTrace.loader(
197
- `Dequantizing ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
198
- `outputDtype=${outputDtype}, expectedOutput=${numBlocks * QK_K * (outputDtype === 'f16' ? 2 : 4)}`
199
- );
200
- dequantizedTensor = await dequantize(quantBuffer, numBlocks, { outputDtype });
201
- }
202
- const dequantized = dequantizedTensor.buffer;
201
+ const outputDtype = getQ4KOutputDtype(location, config);
202
+
203
+ const is2DMatrix = Array.isArray(location.shape) && location.shape.length === 2;
204
+ const K = is2DMatrix ? location.shape[1] : 0;
205
+ const needsRowwise = is2DMatrix && K > 0 && K % QK_K !== 0;
206
+
207
+ let dequantizedTensor;
208
+ if (needsRowwise) {
209
+ const rows = location.shape[0];
210
+ debugTrace.loader(
211
+ `Dequantizing ${name} (row-wise): [${rows},${K}], K not 256-aligned, ` +
212
+ `outputDtype=${outputDtype}`
213
+ );
214
+ dequantizedTensor = await dequantizeRowwise(quantBuffer, rows, K, { outputDtype });
215
+ } else {
216
+ const numBlocks = Math.ceil(location.size / Q4K_BLOCK_BYTES);
217
+ debugTrace.loader(
218
+ `Dequantizing ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
219
+ `outputDtype=${outputDtype}, expectedOutput=${numBlocks * QK_K * (outputDtype === 'f16' ? 2 : 4)}`
220
+ );
221
+ dequantizedTensor = await dequantize(quantBuffer, numBlocks, { outputDtype });
222
+ }
223
+ dequantized = dequantizedTensor.buffer;
203
224
 
204
- debugTrace.loader(`Dequantized ${name}: resultSize=${dequantized.size}`);
205
- releaseBuffer(quantBuffer);
225
+ debugTrace.loader(`Dequantized ${name}: resultSize=${dequantized.size}`);
226
+ releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
227
+ ownsQuantBuffer = false;
206
228
 
207
- const layout = getWeightLayout(location, config);
208
-
209
- const dtype = outputDtype;
229
+ const layout = getWeightLayout(location, config);
230
+ const dtype = outputDtype;
210
231
 
211
- return {
212
- data: createWeightBuffer(dequantized, dtype, layout, location.shape, name),
213
- allocatedBuffers: [dequantized],
214
- };
232
+ return {
233
+ data: createWeightBuffer(dequantized, dtype, layout, location.shape, name),
234
+ allocatedBuffers: [dequantized],
235
+ };
236
+ } catch (error) {
237
+ if (isReleasableBuffer(dequantized)) {
238
+ releaseBuffer(dequantized);
239
+ }
240
+ throw error;
241
+ } finally {
242
+ releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
243
+ }
215
244
  }
216
245
 
217
246
 
@@ -219,97 +248,119 @@ export async function loadQ6K(shardData, location, name) {
219
248
  const device = getDevice();
220
249
 
221
250
  debugTrace.loader(`Loading Q6_K tensor "${name}", size=${location.size}`);
222
- const quantBuffer = shardData instanceof GPUBuffer
251
+ let ownsQuantBuffer = !isGpuBufferInstance(shardData);
252
+ const quantBuffer = isGpuBufferInstance(shardData)
223
253
  ? shardData
224
254
  : acquireAlignedBuffer(location.size, `quant_${name}`);
225
- if (!(shardData instanceof GPUBuffer)) {
226
- writeBufferAligned(device, quantBuffer, shardData);
227
- }
255
+ let dequantized = null;
256
+ try {
257
+ if (ownsQuantBuffer) {
258
+ writeBufferAligned(device, quantBuffer, shardData);
259
+ }
228
260
 
229
- const numBlocks = Math.floor(location.size / Q6K_BLOCK_BYTES);
230
- debugTrace.loader(
231
- `Dequantizing Q6_K ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
232
- `expectedOutput=${numBlocks * 256 * 2} (f16)`
233
- );
261
+ const numBlocks = Math.floor(location.size / Q6K_BLOCK_BYTES);
262
+ debugTrace.loader(
263
+ `Dequantizing Q6_K ${name}: size=${location.size}, numBlocks=${numBlocks}, ` +
264
+ `expectedOutput=${numBlocks * 256 * 2} (f16)`
265
+ );
234
266
 
235
- const dequantizedTensor = await dequantizeQ6K(quantBuffer, numBlocks, { outputDtype: 'f16' });
236
- const dequantized = dequantizedTensor.buffer;
267
+ const dequantizedTensor = await dequantizeQ6K(quantBuffer, numBlocks, { outputDtype: 'f16' });
268
+ dequantized = dequantizedTensor.buffer;
237
269
 
238
- debugTrace.loader(`Dequantized Q6_K ${name}: resultSize=${dequantized.size}`);
239
- releaseBuffer(quantBuffer);
270
+ debugTrace.loader(`Dequantized Q6_K ${name}: resultSize=${dequantized.size}`);
271
+ releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
272
+ ownsQuantBuffer = false;
273
+
274
+ const isMatmulWeight = shouldDequantizeToF16(location);
275
+ if (isMatmulWeight) {
276
+ return {
277
+ data: createWeightBuffer(dequantized, 'f16', 'row', location.shape, name),
278
+ allocatedBuffers: [dequantized],
279
+ };
280
+ }
240
281
 
241
- const isMatmulWeight = shouldDequantizeToF16(location);
242
- if (isMatmulWeight) {
243
282
  return {
244
- data: createWeightBuffer(dequantized, 'f16', 'row', location.shape, name),
283
+ data: applyBufferLayout(dequantized, location, 'f16'),
245
284
  allocatedBuffers: [dequantized],
246
285
  };
286
+ } catch (error) {
287
+ if (isReleasableBuffer(dequantized)) {
288
+ releaseBuffer(dequantized);
289
+ }
290
+ throw error;
291
+ } finally {
292
+ releaseOwnedGpuBuffer(quantBuffer, ownsQuantBuffer);
247
293
  }
248
-
249
- return {
250
- data: applyBufferLayout(dequantized, location, 'f16'),
251
- allocatedBuffers: [dequantized],
252
- };
253
294
  }
254
295
 
255
296
 
256
297
  export async function loadBF16(shardData, location, name, config) {
257
298
  const device = getDevice();
258
- const srcBuffer = shardData instanceof GPUBuffer
299
+ let ownsSrcBuffer = !isGpuBufferInstance(shardData);
300
+ const srcBuffer = isGpuBufferInstance(shardData)
259
301
  ? shardData
260
302
  : acquireAlignedBuffer(location.size, `${name}_bf16`);
261
- if (!(shardData instanceof GPUBuffer)) {
262
- writeBufferAligned(device, srcBuffer, shardData);
263
- }
264
-
265
- const numElements = location.size / 2;
266
- const caps = config.gpuCapabilities || getKernelCapabilities();
267
- const isMatmulWeight = shouldDequantizeToF16(location);
303
+ let resultBuffer = null;
304
+ try {
305
+ if (ownsSrcBuffer) {
306
+ writeBufferAligned(device, srcBuffer, shardData);
307
+ }
268
308
 
269
- // For matmul weights with F16 support: BF16 -> F16 directly
270
- if (caps?.hasF16 && isMatmulWeight) {
271
- const f16Tensor = await runBF16ToF16(srcBuffer, [numElements], name);
272
- releaseBuffer(srcBuffer);
273
- debugTrace.loader(`BF16->F16 for matmul weight: ${name} (${numElements} elements)`);
309
+ const numElements = location.size / 2;
310
+ const caps = config.gpuCapabilities || getKernelCapabilities();
311
+ const isMatmulWeight = shouldDequantizeToF16(location);
274
312
 
275
-
276
- const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
277
- layout: location.layout ?? null,
278
- useColumnWise: false,
279
- });
280
- return {
281
- data: createWeightBuffer(f16Tensor.buffer, 'f16', layout, location.shape, name),
282
- allocatedBuffers: [f16Tensor.buffer],
283
- };
284
- }
313
+ if (caps?.hasF16 && isMatmulWeight) {
314
+ const f16Tensor = await runBF16ToF16(srcBuffer, [numElements], name);
315
+ resultBuffer = f16Tensor.buffer;
316
+ releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
317
+ ownsSrcBuffer = false;
318
+ debugTrace.loader(`BF16->F16 for matmul weight: ${name} (${numElements} elements)`);
285
319
 
286
- // Standard path: BF16 -> F32
287
- const dstBuffer = await convertBF16ToF32GPU(srcBuffer, numElements, name);
288
- releaseBuffer(srcBuffer);
289
-
290
- if (dstBuffer instanceof GPUBuffer) {
291
- if (isMatmulWeight) {
292
-
293
320
  const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
294
321
  layout: location.layout ?? null,
295
322
  useColumnWise: false,
296
323
  });
297
324
  return {
298
- data: createWeightBuffer(dstBuffer, 'f32', layout, location.shape, name),
325
+ data: createWeightBuffer(f16Tensor.buffer, 'f16', layout, location.shape, name),
326
+ allocatedBuffers: [f16Tensor.buffer],
327
+ };
328
+ }
329
+
330
+ const dstBuffer = await convertBF16ToF32GPU(srcBuffer, numElements, name);
331
+ resultBuffer = dstBuffer;
332
+ releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
333
+ ownsSrcBuffer = false;
334
+
335
+ if (isGpuBufferInstance(dstBuffer)) {
336
+ if (isMatmulWeight) {
337
+ const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
338
+ layout: location.layout ?? null,
339
+ useColumnWise: false,
340
+ });
341
+ return {
342
+ data: createWeightBuffer(dstBuffer, 'f32', layout, location.shape, name),
343
+ allocatedBuffers: [dstBuffer],
344
+ };
345
+ }
346
+ return {
347
+ data: applyBufferLayout(dstBuffer, location, 'f32'),
299
348
  allocatedBuffers: [dstBuffer],
300
349
  };
301
350
  }
351
+
302
352
  return {
303
- data: applyBufferLayout(dstBuffer, location, 'f32'),
304
- allocatedBuffers: [dstBuffer],
353
+ data: dstBuffer,
354
+ allocatedBuffers: [],
305
355
  };
356
+ } catch (error) {
357
+ if (isReleasableBuffer(resultBuffer)) {
358
+ releaseBuffer(resultBuffer);
359
+ }
360
+ throw error;
361
+ } finally {
362
+ releaseOwnedGpuBuffer(srcBuffer, ownsSrcBuffer);
306
363
  }
307
-
308
- // Float32Array returned (shouldn't happen in GPU path)
309
- return {
310
- data: dstBuffer,
311
- allocatedBuffers: [],
312
- };
313
364
  }
314
365
 
315
366
 
@@ -318,55 +369,69 @@ export async function loadFloat(shardData, location, name, config) {
318
369
  throw new Error('Tensor load config is required.');
319
370
  }
320
371
  const device = getDevice();
321
- const buffer = shardData instanceof GPUBuffer
372
+ let ownsBuffer = !isGpuBufferInstance(shardData);
373
+ const buffer = isGpuBufferInstance(shardData)
322
374
  ? shardData
323
375
  : acquireAlignedBuffer(location.size, name);
324
- if (!(shardData instanceof GPUBuffer)) {
325
- writeBufferAligned(device, buffer, shardData);
326
- }
327
-
328
- const dtype = selectRuleValue('loader', 'weights', 'floatLocationDtype', {
329
- locationDtype: location.dtype,
330
- });
331
- const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
332
- layout: location.layout ?? null,
333
- useColumnWise: false,
334
- });
335
- const isMatmulWeight = shouldDequantizeToF16(location);
376
+ let resultBuffer = null;
377
+ try {
378
+ if (ownsBuffer) {
379
+ writeBufferAligned(device, buffer, shardData);
380
+ }
336
381
 
337
- // Return WeightBuffer for matmul weights
338
- if (isMatmulWeight) {
339
- return {
340
- data: createWeightBuffer(buffer, dtype, layout, location.shape, name),
341
- allocatedBuffers: [buffer],
342
- };
343
- }
382
+ const dtype = selectRuleValue('loader', 'weights', 'floatLocationDtype', {
383
+ locationDtype: location.dtype,
384
+ });
385
+ const layout = selectRuleValue('loader', 'weights', 'weightLayout', {
386
+ layout: location.layout ?? null,
387
+ useColumnWise: false,
388
+ });
389
+ const isMatmulWeight = shouldDequantizeToF16(location);
344
390
 
345
- // Non-matmul F16 weights need upcast to F32
346
- if (dtype === 'f16') {
347
- if (config.allowF32UpcastNonMatmul === false) {
391
+ if (isMatmulWeight) {
392
+ ownsBuffer = false;
348
393
  return {
349
- data: applyBufferLayout(buffer, location, 'f16'),
394
+ data: createWeightBuffer(buffer, dtype, layout, location.shape, name),
350
395
  allocatedBuffers: [buffer],
351
396
  };
352
397
  }
353
- const numElements = location.shape.reduce((a, b) => a * b, 1);
354
- logF32UpcastNonMatmul(name, numElements, buffer.size);
355
- debugTrace.loader(`F16->F32 upcast for non-matmul: ${name} (${numElements} elements, bufSize=${buffer.size})`);
356
- const inputTensor = createTensor(buffer, 'f16', [numElements], `${name}_f16`);
357
- const f32Tensor = await castF16ToF32(inputTensor);
358
- debugTrace.loader(`F16->F32 complete: ${name} resultSize=${f32Tensor.buffer.size}`);
359
- releaseBuffer(buffer);
398
+
399
+ if (dtype === 'f16') {
400
+ if (config.allowF32UpcastNonMatmul === false) {
401
+ ownsBuffer = false;
402
+ return {
403
+ data: applyBufferLayout(buffer, location, 'f16'),
404
+ allocatedBuffers: [buffer],
405
+ };
406
+ }
407
+ const numElements = location.shape.reduce((a, b) => a * b, 1);
408
+ logF32UpcastNonMatmul(name, numElements, buffer.size);
409
+ debugTrace.loader(`F16->F32 upcast for non-matmul: ${name} (${numElements} elements, bufSize=${buffer.size})`);
410
+ const inputTensor = createTensor(buffer, 'f16', [numElements], `${name}_f16`);
411
+ const f32Tensor = await castF16ToF32(inputTensor);
412
+ resultBuffer = f32Tensor.buffer;
413
+ debugTrace.loader(`F16->F32 complete: ${name} resultSize=${f32Tensor.buffer.size}`);
414
+ releaseOwnedGpuBuffer(buffer, ownsBuffer);
415
+ ownsBuffer = false;
416
+ return {
417
+ data: applyBufferLayout(f32Tensor.buffer, location, 'f32'),
418
+ allocatedBuffers: [f32Tensor.buffer],
419
+ };
420
+ }
421
+
422
+ ownsBuffer = false;
360
423
  return {
361
- data: applyBufferLayout(f32Tensor.buffer, location, 'f32'),
362
- allocatedBuffers: [f32Tensor.buffer],
424
+ data: applyBufferLayout(buffer, location, dtype),
425
+ allocatedBuffers: [buffer],
363
426
  };
427
+ } catch (error) {
428
+ if (isReleasableBuffer(resultBuffer)) {
429
+ releaseBuffer(resultBuffer);
430
+ }
431
+ throw error;
432
+ } finally {
433
+ releaseOwnedGpuBuffer(buffer, ownsBuffer);
364
434
  }
365
-
366
- return {
367
- data: applyBufferLayout(buffer, location, dtype),
368
- allocatedBuffers: [buffer],
369
- };
370
435
  }
371
436
 
372
437
  // ============================================================================
@@ -2,30 +2,84 @@
2
2
 
3
3
  import { trace } from '../../debug/index.js';
4
4
 
5
+ function resolveSpanShardIndex(span, name, spanIndex) {
6
+ const shardIndex = typeof span?.shardIndex === 'number'
7
+ ? span.shardIndex
8
+ : span?.shard;
9
+ if (!Number.isInteger(shardIndex) || shardIndex < 0) {
10
+ throw new Error(
11
+ `[DopplerLoader] Tensor "${name}" span[${spanIndex}] has invalid shard index.`
12
+ );
13
+ }
14
+ return shardIndex;
15
+ }
16
+
17
+ function validateSpanField(value, field, name, spanIndex) {
18
+ if (!Number.isInteger(value) || value < 0) {
19
+ throw new Error(
20
+ `[DopplerLoader] Tensor "${name}" span[${spanIndex}] has invalid ${field}.`
21
+ );
22
+ }
23
+ return value;
24
+ }
25
+
26
+ function getLocationSpans(location) {
27
+ if (!Array.isArray(location?.spans) || location.spans.length === 0) {
28
+ return null;
29
+ }
30
+ return location.spans;
31
+ }
32
+
33
+ function resolveLocationShardIndex(location, name) {
34
+ const shardIndex = typeof location?.shardIndex === 'number'
35
+ ? location.shardIndex
36
+ : location?.shard;
37
+ if (!Number.isInteger(shardIndex) || shardIndex < 0) {
38
+ throw new Error(`[DopplerLoader] Tensor "${name}" has invalid shard index.`);
39
+ }
40
+ return shardIndex;
41
+ }
42
+
43
+ function validateLocationField(location, field, name) {
44
+ const value = location?.[field];
45
+ if (!Number.isInteger(value) || value < 0) {
46
+ throw new Error(`[DopplerLoader] Tensor "${name}" has invalid ${field}.`);
47
+ }
48
+ return value;
49
+ }
5
50
 
6
51
  export async function assembleShardData(location, name, loadShard, loadShardRange = null) {
7
- if (location.spans) {
8
- trace.loader(`Assembling tensor "${name}" from ${location.spans.length} spans`);
52
+ const spans = getLocationSpans(location);
53
+ if (spans) {
54
+ trace.loader(`Assembling tensor "${name}" from ${spans.length} spans`);
9
55
 
10
- const chunks = await Promise.all(location.spans.map(async (span) => {
56
+ const chunks = await Promise.all(spans.map(async (span, spanIndex) => {
57
+ const shardIndex = resolveSpanShardIndex(span, name, spanIndex);
58
+ const offset = validateSpanField(span.offset, 'offset', name, spanIndex);
59
+ const size = validateSpanField(span.size, 'size', name, spanIndex);
11
60
  if (loadShardRange) {
12
- const data = await loadShardRange(span.shardIndex, span.offset, span.size);
13
- if (span.size > data.byteLength) {
61
+ const data = await loadShardRange(shardIndex, offset, size);
62
+ if (size > data.byteLength) {
14
63
  throw new Error(
15
- `[DopplerLoader] Shard ${span.shardIndex} too small for tensor "${name}" span.`
64
+ `[DopplerLoader] Shard ${shardIndex} too small for tensor "${name}" span.`
16
65
  );
17
66
  }
18
- return new Uint8Array(data, 0, span.size);
67
+ return new Uint8Array(data, 0, size);
19
68
  }
20
- const data = await loadShard(span.shardIndex);
21
- if (span.offset + span.size > data.byteLength) {
69
+ const data = await loadShard(shardIndex);
70
+ if (offset + size > data.byteLength) {
22
71
  throw new Error(
23
- `[DopplerLoader] Shard ${span.shardIndex} too small for tensor "${name}" span.`
72
+ `[DopplerLoader] Shard ${shardIndex} too small for tensor "${name}" span.`
24
73
  );
25
74
  }
26
- return new Uint8Array(data, span.offset, span.size);
75
+ return new Uint8Array(data, offset, size);
27
76
  }));
28
77
  const totalSize = chunks.reduce((s, c) => s + c.length, 0);
78
+ if (Number.isInteger(location?.size) && totalSize !== location.size) {
79
+ throw new Error(
80
+ `[DopplerLoader] Tensor "${name}" spans total ${totalSize} bytes, expected ${location.size}.`
81
+ );
82
+ }
29
83
  const combined = new Uint8Array(totalSize);
30
84
  let offset = 0;
31
85
  for (const chunk of chunks) {
@@ -36,21 +90,24 @@ export async function assembleShardData(location, name, loadShard, loadShardRang
36
90
  }
37
91
 
38
92
  // Single shard - use view to avoid copying
93
+ const shardIndex = resolveLocationShardIndex(location, name);
94
+ const offset = validateLocationField(location, 'offset', name);
95
+ const size = validateLocationField(location, 'size', name);
39
96
  if (loadShardRange) {
40
- const slice = await loadShardRange(location.shardIndex, location.offset, location.size);
41
- if (location.size > slice.byteLength) {
97
+ const slice = await loadShardRange(shardIndex, offset, size);
98
+ if (size > slice.byteLength) {
42
99
  throw new Error(
43
- `[DopplerLoader] Shard ${location.shardIndex} too small for tensor "${name}" (offset=${location.offset}, size=${location.size}, shard=${slice.byteLength})`
100
+ `[DopplerLoader] Shard ${shardIndex} too small for tensor "${name}" (offset=${offset}, size=${size}, shard=${slice.byteLength})`
44
101
  );
45
102
  }
46
- return new Uint8Array(slice, 0, location.size);
103
+ return new Uint8Array(slice, 0, size);
47
104
  }
48
105
 
49
- const fullShard = await loadShard(location.shardIndex);
50
- if (location.offset + location.size > fullShard.byteLength) {
106
+ const fullShard = await loadShard(shardIndex);
107
+ if (offset + size > fullShard.byteLength) {
51
108
  throw new Error(
52
- `[DopplerLoader] Shard ${location.shardIndex} too small for tensor "${name}" (offset=${location.offset}, size=${location.size}, shard=${fullShard.byteLength})`
109
+ `[DopplerLoader] Shard ${shardIndex} too small for tensor "${name}" (offset=${offset}, size=${size}, shard=${fullShard.byteLength})`
53
110
  );
54
111
  }
55
- return new Uint8Array(fullShard, location.offset, location.size);
112
+ return new Uint8Array(fullShard, offset, size);
56
113
  }
@@ -47,7 +47,7 @@ export async function maybeDowncastToF16(buf, options) {
47
47
  }
48
48
 
49
49
  // Handle raw GPUBuffer
50
- if (buf instanceof GPUBuffer) {
50
+ if (typeof GPUBuffer !== 'undefined' && buf instanceof GPUBuffer) {
51
51
  return downcastGPUBuffer(buf, options);
52
52
  }
53
53
 
@@ -80,6 +80,12 @@ export declare class BufferPool {
80
80
  */
81
81
  release(buffer: GPUBuffer): void;
82
82
 
83
+ /**
84
+ * Force-dispose an active buffer instead of returning it to the pool.
85
+ * Use for error paths where the buffer contents or device state may be invalid.
86
+ */
87
+ discard(buffer: GPUBuffer): void;
88
+
83
89
  /**
84
90
  * Check if a buffer is currently tracked as active by the pool
85
91
  */
@@ -159,7 +165,8 @@ export declare class BufferPool {
159
165
  }
160
166
 
161
167
  /**
162
- * Get the global buffer pool
168
+ * Get the global buffer pool for the current device epoch.
169
+ * If the active device has changed or was lost, a fresh global pool is created.
163
170
  */
164
171
  export function getBufferPool(): BufferPool;
165
172
 
@@ -179,6 +186,7 @@ export declare const createUploadBuffer: (size: number) => GPUBuffer;
179
186
  export declare const createUniformBuffer: (size: number) => GPUBuffer;
180
187
  export declare const acquireBuffer: (size: number, usage?: GPUBufferUsageFlags, label?: string) => GPUBuffer;
181
188
  export declare const releaseBuffer: (buffer: GPUBuffer) => void;
189
+ export declare const discardBuffer: (buffer: GPUBuffer) => void;
182
190
  export declare const isBufferActive: (buffer: GPUBuffer) => boolean;
183
191
  export declare const getBufferRequestedSize: (buffer: GPUBuffer) => number;
184
192
  export declare const uploadData: (buffer: GPUBuffer, data: ArrayBuffer | ArrayBufferView, offset?: number) => void;