@simulatte/doppler 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316)
  1. package/CHANGELOG.md +126 -0
  2. package/README.md +16 -23
  3. package/package.json +14 -1
  4. package/src/adapters/adapter-registry.js +12 -1
  5. package/src/adapters/lora-loader.js +23 -6
  6. package/src/bridge/extension-client.d.ts +5 -0
  7. package/src/bridge/extension-client.js +40 -0
  8. package/src/bridge/index.d.ts +2 -1
  9. package/src/bridge/index.js +6 -4
  10. package/src/browser/browser-converter.js +26 -1
  11. package/src/browser/file-picker.js +6 -0
  12. package/src/browser/safetensors-parser-browser.js +84 -1
  13. package/src/browser/shard-io-browser.js +2 -2
  14. package/src/browser/tensor-source-download.js +8 -2
  15. package/src/browser/tensor-source-http.d.ts +1 -0
  16. package/src/browser/tensor-source-http.js +5 -1
  17. package/src/client/doppler-api.browser.js +20 -4
  18. package/src/client/doppler-api.js +19 -3
  19. package/src/client/doppler-provider/generation.js +12 -0
  20. package/src/client/doppler-provider/model-manager.d.ts +10 -0
  21. package/src/client/doppler-provider/model-manager.js +91 -19
  22. package/src/client/doppler-provider/source-runtime.d.ts +2 -1
  23. package/src/client/doppler-provider/source-runtime.js +132 -13
  24. package/src/client/doppler-registry.json +8 -7
  25. package/src/config/backward-registry-loader.js +17 -2
  26. package/src/config/execution-v0-contract-check.js +113 -15
  27. package/src/config/kernel-path-contract-check.js +57 -29
  28. package/src/config/kernel-path-loader.js +5 -36
  29. package/src/config/kernels/kernel-ref-digests.js +1 -1
  30. package/src/config/kernels/registry.js +14 -1
  31. package/src/config/kernels/registry.json +7 -5
  32. package/src/config/loader.d.ts +1 -1
  33. package/src/config/loader.js +12 -2
  34. package/src/config/merge-contract-check.js +59 -4
  35. package/src/config/merge-helpers.js +128 -7
  36. package/src/config/merge.d.ts +1 -0
  37. package/src/config/merge.js +10 -0
  38. package/src/config/param-validator.js +47 -2
  39. package/src/config/presets/kernel-paths/{gemma2-q4k-dequant-f32a.json → gemma2-q4k-dequant-f32a-nosubgroups.json} +3 -3
  40. package/src/config/presets/kernel-paths/gemma3-f16-fused-f32a-online-streamingprefill.json +223 -0
  41. package/src/config/presets/kernel-paths/{gemma3-q4k-dequant-f32a.json → gemma3-q4k-dequant-f32a-nosubgroups.json} +3 -3
  42. package/src/config/presets/kernel-paths/registry.json +29 -8
  43. package/src/config/presets/models/gemma2.json +2 -2
  44. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +1 -1
  45. package/src/config/presets/runtime/experiments/debug/gemma3-debug-q4k.json +1 -1
  46. package/src/config/presets/runtime/experiments/verify/gemma3-verify.json +1 -1
  47. package/src/config/presets/runtime/kernels/dequant-f16-q4k.json +6 -13
  48. package/src/config/presets/runtime/kernels/dequant-f32-q4k.json +6 -13
  49. package/src/config/presets/runtime/kernels/embeddinggemma-q4k-dequant-f32a.json +37 -0
  50. package/src/config/presets/runtime/kernels/fused-q4k.json +6 -13
  51. package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f16a.json +33 -0
  52. package/src/config/presets/runtime/kernels/gemma2-q4k-dequant-f32a-nosubgroups.json +33 -0
  53. package/src/config/presets/runtime/kernels/gemma2-q4k-fused-f32a.json +33 -0
  54. package/src/config/presets/runtime/kernels/safe-q4k.json +6 -13
  55. package/src/config/presets/runtime/platform/metal-apple-q4k.json +1 -1
  56. package/src/config/runtime.js +6 -1
  57. package/src/config/schema/debug.schema.d.ts +5 -0
  58. package/src/config/schema/doppler.schema.js +16 -21
  59. package/src/config/schema/inference-defaults.schema.js +3 -3
  60. package/src/config/schema/kernel-path.schema.d.ts +5 -1
  61. package/src/config/schema/kernel-thresholds.schema.js +12 -4
  62. package/src/config/schema/manifest.schema.d.ts +2 -1
  63. package/src/config/schema/manifest.schema.js +16 -3
  64. package/src/config/training-defaults.js +30 -22
  65. package/src/converter/conversion-plan.js +94 -9
  66. package/src/converter/core.d.ts +7 -0
  67. package/src/converter/core.js +14 -9
  68. package/src/converter/execution-v0-manifest.js +4 -1
  69. package/src/converter/index.d.ts +1 -0
  70. package/src/converter/index.js +1 -0
  71. package/src/converter/manifest-inference.js +43 -12
  72. package/src/converter/parsers/diffusion.js +0 -3
  73. package/src/converter/quantization-info.js +35 -15
  74. package/src/converter/shard-packer.d.ts +1 -1
  75. package/src/converter/shard-packer.js +4 -1
  76. package/src/debug/config.js +123 -11
  77. package/src/debug/signals.js +7 -1
  78. package/src/debug/tensor.d.ts +2 -0
  79. package/src/debug/tensor.js +13 -2
  80. package/src/distribution/p2p-control-plane.js +52 -12
  81. package/src/distribution/p2p-observability.js +43 -7
  82. package/src/distribution/p2p-webrtc-browser.js +20 -0
  83. package/src/distribution/shard-delivery.js +77 -26
  84. package/src/formats/gguf/types.js +33 -16
  85. package/src/formats/rdrr/groups.d.ts +12 -4
  86. package/src/formats/rdrr/groups.js +3 -6
  87. package/src/formats/rdrr/parsing.js +39 -2
  88. package/src/formats/rdrr/types.d.ts +2 -1
  89. package/src/gpu/command-recorder.js +86 -61
  90. package/src/gpu/device.d.ts +1 -0
  91. package/src/gpu/device.js +73 -19
  92. package/src/gpu/kernel-tuner/benchmarks.js +326 -316
  93. package/src/gpu/kernel-tuner/cache.js +71 -4
  94. package/src/gpu/kernel-tuner/tuner.js +22 -4
  95. package/src/gpu/kernels/attention.js +15 -34
  96. package/src/gpu/kernels/backward/adam.js +62 -58
  97. package/src/gpu/kernels/backward/attention_backward.js +257 -169
  98. package/src/gpu/kernels/backward/conv2d_backward.js +14 -1
  99. package/src/gpu/kernels/cast.js +191 -149
  100. package/src/gpu/kernels/check-stop.js +33 -44
  101. package/src/gpu/kernels/conv2d.js +27 -17
  102. package/src/gpu/kernels/cross_entropy_loss.js +21 -15
  103. package/src/gpu/kernels/depthwise_conv2d.js +36 -26
  104. package/src/gpu/kernels/dequant.js +178 -126
  105. package/src/gpu/kernels/energy.d.ts +3 -21
  106. package/src/gpu/kernels/energy.js +111 -88
  107. package/src/gpu/kernels/feature-check.js +1 -1
  108. package/src/gpu/kernels/fused_ffn.js +84 -65
  109. package/src/gpu/kernels/fused_matmul_residual.js +56 -33
  110. package/src/gpu/kernels/fused_matmul_rmsnorm.js +62 -45
  111. package/src/gpu/kernels/gather.js +33 -15
  112. package/src/gpu/kernels/gelu.js +19 -11
  113. package/src/gpu/kernels/grouped_pointwise_conv2d.js +33 -23
  114. package/src/gpu/kernels/groupnorm.js +34 -23
  115. package/src/gpu/kernels/kv-quantize.js +5 -2
  116. package/src/gpu/kernels/layernorm.js +35 -19
  117. package/src/gpu/kernels/logit-merge.js +5 -3
  118. package/src/gpu/kernels/matmul.js +58 -39
  119. package/src/gpu/kernels/modulate.js +23 -15
  120. package/src/gpu/kernels/moe.js +221 -175
  121. package/src/gpu/kernels/pixel_shuffle.js +22 -14
  122. package/src/gpu/kernels/relu.js +18 -10
  123. package/src/gpu/kernels/repeat_channels.js +25 -17
  124. package/src/gpu/kernels/residual.js +37 -27
  125. package/src/gpu/kernels/rmsnorm.js +57 -41
  126. package/src/gpu/kernels/rope.js +3 -0
  127. package/src/gpu/kernels/sample.js +27 -38
  128. package/src/gpu/kernels/sana_linear_attention.js +18 -10
  129. package/src/gpu/kernels/scale.js +18 -11
  130. package/src/gpu/kernels/shader-cache.js +4 -2
  131. package/src/gpu/kernels/silu.js +120 -72
  132. package/src/gpu/kernels/softmax.js +44 -25
  133. package/src/gpu/kernels/split_qkv.js +23 -13
  134. package/src/gpu/kernels/transpose.js +18 -10
  135. package/src/gpu/kernels/transpose.wgsl +5 -3
  136. package/src/gpu/kernels/upsample2d.js +21 -13
  137. package/src/gpu/kernels/utils.js +20 -13
  138. package/src/gpu/partitioned-buffer-pool.js +10 -2
  139. package/src/gpu/perf-guards.js +2 -9
  140. package/src/gpu/profiler.js +27 -22
  141. package/src/gpu/readback-utils.d.ts +16 -0
  142. package/src/gpu/readback-utils.js +41 -0
  143. package/src/gpu/submit-tracker.js +13 -0
  144. package/src/gpu/uniform-cache.d.ts +1 -0
  145. package/src/gpu/uniform-cache.js +30 -9
  146. package/src/hotswap/intent-bundle.js +6 -0
  147. package/src/hotswap/manifest.d.ts +10 -1
  148. package/src/hotswap/manifest.js +12 -2
  149. package/src/hotswap/runtime.js +30 -8
  150. package/src/index-browser.d.ts +44 -0
  151. package/src/index-browser.js +14 -0
  152. package/src/inference/browser-harness-contract-helpers.d.ts +5 -0
  153. package/src/inference/browser-harness-contract-helpers.js +28 -0
  154. package/src/inference/browser-harness-diffusion-energy-suites.d.ts +2 -0
  155. package/src/inference/browser-harness-diffusion-energy-suites.js +269 -0
  156. package/src/inference/browser-harness-model-helpers.d.ts +16 -0
  157. package/src/inference/browser-harness-model-helpers.js +217 -0
  158. package/src/inference/browser-harness-report-helpers.d.ts +7 -0
  159. package/src/inference/browser-harness-report-helpers.js +42 -0
  160. package/src/inference/browser-harness-runtime-helpers.d.ts +61 -0
  161. package/src/inference/browser-harness-runtime-helpers.js +415 -0
  162. package/src/inference/browser-harness-suite-helpers.d.ts +28 -0
  163. package/src/inference/browser-harness-suite-helpers.js +268 -0
  164. package/src/inference/browser-harness-text-helpers.d.ts +27 -0
  165. package/src/inference/browser-harness-text-helpers.js +788 -0
  166. package/src/inference/browser-harness.d.ts +6 -0
  167. package/src/inference/browser-harness.js +130 -1996
  168. package/src/inference/kv-cache/base.js +140 -94
  169. package/src/inference/kv-cache/tiered.js +5 -3
  170. package/src/inference/moe-router.js +88 -56
  171. package/src/inference/multi-model-network.js +5 -3
  172. package/src/inference/network-evolution.d.ts +11 -2
  173. package/src/inference/network-evolution.js +20 -21
  174. package/src/inference/pipelines/context.d.ts +3 -0
  175. package/src/inference/pipelines/context.js +142 -2
  176. package/src/inference/pipelines/diffusion/helpers.js +7 -2
  177. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  178. package/src/inference/pipelines/diffusion/sd3-transformer.js +10 -10
  179. package/src/inference/pipelines/diffusion/vae.js +3 -7
  180. package/src/inference/pipelines/energy/pipeline.js +27 -21
  181. package/src/inference/pipelines/energy/quintel.d.ts +5 -0
  182. package/src/inference/pipelines/energy/quintel.js +11 -0
  183. package/src/inference/pipelines/energy-head/row-head-pipeline.js +17 -13
  184. package/src/inference/pipelines/structured/json-head-pipeline.js +26 -11
  185. package/src/inference/pipelines/text/attention/projections.js +151 -101
  186. package/src/inference/pipelines/text/attention/record.js +62 -8
  187. package/src/inference/pipelines/text/attention/run.js +62 -8
  188. package/src/inference/pipelines/text/config.js +3 -4
  189. package/src/inference/pipelines/text/embed.js +2 -8
  190. package/src/inference/pipelines/text/execution-plan.js +41 -19
  191. package/src/inference/pipelines/text/execution-v0-contract-helpers.d.ts +59 -0
  192. package/src/inference/pipelines/text/execution-v0-contract-helpers.js +937 -0
  193. package/src/inference/pipelines/text/execution-v0-runtime-builders.d.ts +15 -0
  194. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +279 -0
  195. package/src/inference/pipelines/text/execution-v0.js +62 -1013
  196. package/src/inference/pipelines/text/generator-steps.d.ts +46 -0
  197. package/src/inference/pipelines/text/generator-steps.js +298 -207
  198. package/src/inference/pipelines/text/generator.js +6 -23
  199. package/src/inference/pipelines/text/init.js +78 -20
  200. package/src/inference/pipelines/text/kernel-path-auto-select.js +2 -0
  201. package/src/inference/pipelines/text/kernel-trace.d.ts +2 -0
  202. package/src/inference/pipelines/text/kernel-trace.js +6 -0
  203. package/src/inference/pipelines/text/layer.js +3 -9
  204. package/src/inference/pipelines/text/linear-attention.d.ts +10 -0
  205. package/src/inference/pipelines/text/linear-attention.js +80 -6
  206. package/src/inference/pipelines/text/logits/gpu.js +10 -5
  207. package/src/inference/pipelines/text/logits/index.js +10 -11
  208. package/src/inference/pipelines/text/logits/utils.d.ts +7 -0
  209. package/src/inference/pipelines/text/logits/utils.js +9 -0
  210. package/src/inference/pipelines/text/lora-apply.js +50 -32
  211. package/src/inference/pipelines/text/model-load.js +279 -104
  212. package/src/inference/pipelines/text/moe-cache.js +5 -4
  213. package/src/inference/pipelines/text/moe-cpu-gptoss.js +74 -69
  214. package/src/inference/pipelines/text/moe-cpu.js +42 -38
  215. package/src/inference/pipelines/text/moe-gpu.js +110 -86
  216. package/src/inference/pipelines/text/ops.js +90 -90
  217. package/src/inference/pipelines/text/probes.js +9 -9
  218. package/src/inference/pipelines/text/weights.js +17 -7
  219. package/src/inference/pipelines/text.js +13 -1
  220. package/src/inference/speculative.d.ts +2 -2
  221. package/src/inference/speculative.js +4 -18
  222. package/src/inference/test-harness.d.ts +1 -1
  223. package/src/inference/test-harness.js +15 -5
  224. package/src/inference/tokenizer.d.ts +0 -5
  225. package/src/inference/tokenizer.js +4 -23
  226. package/src/inference/tokenizers/bpe.js +9 -0
  227. package/src/inference/tokenizers/bundled.js +20 -0
  228. package/src/inference/tokenizers/sentencepiece.js +12 -0
  229. package/src/loader/doppler-loader.js +38 -22
  230. package/src/loader/dtype-utils.js +3 -44
  231. package/src/loader/embedding-loader.js +7 -3
  232. package/src/loader/experts/expert-cache.js +13 -6
  233. package/src/loader/experts/expert-loader.js +10 -6
  234. package/src/loader/final-weights-loader.js +8 -4
  235. package/src/loader/layer-loader.js +2 -1
  236. package/src/loader/loader-state.js +2 -2
  237. package/src/loader/memory-monitor.js +8 -0
  238. package/src/loader/multi-model-loader.d.ts +14 -0
  239. package/src/loader/multi-model-loader.js +70 -24
  240. package/src/loader/shard-cache.js +81 -12
  241. package/src/loader/shard-resolver.js +25 -3
  242. package/src/loader/tensors/tensor-loader.js +209 -144
  243. package/src/loader/tensors/tensor-reader.js +76 -19
  244. package/src/loader/weight-downcast.js +1 -1
  245. package/src/memory/buffer-pool.d.ts +9 -1
  246. package/src/memory/buffer-pool.js +109 -44
  247. package/src/memory/unified-detect.js +1 -1
  248. package/src/rules/inference/kernel-path.rules.json +24 -8
  249. package/src/rules/rule-registry.js +25 -1
  250. package/src/storage/backends/opfs-store.js +68 -24
  251. package/src/storage/downloader.js +364 -83
  252. package/src/storage/index.d.ts +3 -0
  253. package/src/storage/index.js +3 -0
  254. package/src/storage/preflight.d.ts +2 -2
  255. package/src/storage/preflight.js +24 -2
  256. package/src/storage/quickstart-downloader.js +11 -5
  257. package/src/storage/registry.js +10 -4
  258. package/src/storage/reports.js +1 -1
  259. package/src/storage/shard-manager.d.ts +15 -1
  260. package/src/storage/shard-manager.js +51 -3
  261. package/src/storage/source-artifact-store.d.ts +52 -0
  262. package/src/storage/source-artifact-store.js +234 -0
  263. package/src/tooling/command-api-constants.d.ts +9 -0
  264. package/src/tooling/command-api-constants.js +9 -0
  265. package/src/tooling/command-api-family-normalizers.d.ts +9 -0
  266. package/src/tooling/command-api-family-normalizers.js +343 -0
  267. package/src/tooling/command-api-helpers.d.ts +25 -0
  268. package/src/tooling/command-api-helpers.js +262 -0
  269. package/src/tooling/command-api.js +16 -602
  270. package/src/tooling/command-envelope.js +4 -1
  271. package/src/tooling/command-runner-shared.js +52 -18
  272. package/src/tooling/lean-execution-contract.js +150 -3
  273. package/src/tooling/node-browser-command-runner.js +161 -271
  274. package/src/tooling/node-command-runner.js +29 -3
  275. package/src/tooling/node-converter.js +27 -1
  276. package/src/tooling/node-source-runtime.d.ts +1 -1
  277. package/src/tooling/node-source-runtime.js +84 -3
  278. package/src/tooling/node-webgpu.js +24 -21
  279. package/src/tooling/opfs-cache.js +21 -4
  280. package/src/tooling/runtime-input-composition.d.ts +38 -0
  281. package/src/tooling/runtime-input-composition.js +86 -0
  282. package/src/tooling/source-runtime-bundle.d.ts +40 -5
  283. package/src/tooling/source-runtime-bundle.js +261 -34
  284. package/src/tooling/source-runtime-materializer.d.ts +6 -0
  285. package/src/tooling/source-runtime-materializer.js +93 -0
  286. package/src/training/attention-backward.js +32 -17
  287. package/src/training/autograd.js +80 -52
  288. package/src/training/checkpoint-watch.d.ts +2 -1
  289. package/src/training/checkpoint-watch.js +39 -6
  290. package/src/training/checkpoint.js +40 -11
  291. package/src/training/clip.js +2 -1
  292. package/src/training/datasets/token-batch.js +20 -8
  293. package/src/training/distillation/checkpoint-watch.js +1 -0
  294. package/src/training/distillation/student-fixture.d.ts +22 -0
  295. package/src/training/distillation/student-fixture.js +846 -0
  296. package/src/training/distillation/suite-data.d.ts +45 -0
  297. package/src/training/distillation/suite-data.js +189 -0
  298. package/src/training/lora-pipeline.js +4 -7
  299. package/src/training/lora.js +26 -12
  300. package/src/training/loss.js +5 -6
  301. package/src/training/objectives/cross_entropy.js +2 -5
  302. package/src/training/objectives/distill_kd.js +4 -8
  303. package/src/training/objectives/distill_triplet.js +4 -8
  304. package/src/training/objectives/ul_stage2_base.js +4 -8
  305. package/src/training/operator-command.js +2 -0
  306. package/src/training/optimizer.js +19 -7
  307. package/src/training/runner.js +2 -1
  308. package/src/training/suite.js +18 -978
  309. package/src/training/tensor-factory.d.ts +9 -0
  310. package/src/training/tensor-factory.js +13 -0
  311. package/src/training/trainer.js +3 -5
  312. package/src/training/ul_dataset.js +3 -5
  313. package/src/training/workloads.js +70 -79
  314. package/src/version.js +1 -1
  315. package/tools/convert-safetensors-node.js +22 -16
  316. package/tools/doppler-cli.js +44 -25
@@ -1,7 +1,7 @@
1
1
 
2
2
 
3
3
  import { getDevice } from '../device.js';
4
- import { acquireBuffer, getBufferRequestedSize } from '../../memory/buffer-pool.js';
4
+ import { acquireBuffer, getBufferRequestedSize, releaseBuffer } from '../../memory/buffer-pool.js';
5
5
  import { createTensor } from '../tensor.js';
6
6
  import { getBuffer } from '../weight-buffer.js';
7
7
  import { dispatch, recordDispatch } from './dispatch.js';
@@ -91,7 +91,8 @@ export async function runMatmulRMSNormFused(
91
91
  // Output buffer: [1, N] - size depends on dtype
92
92
  const bytesPerElement = dtype === 'f16' ? 2 : 4;
93
93
  const outputSize = N * bytesPerElement;
94
- const output = outputBuffer || acquireBuffer(outputSize, undefined, 'matmul_rmsnorm_fused_output');
94
+ const ownedOutput = outputBuffer ? null : acquireBuffer(outputSize, undefined, 'matmul_rmsnorm_fused_output');
95
+ const output = outputBuffer || ownedOutput;
95
96
 
96
97
  // Create uniform buffer (8 u32/f32 = 32 bytes, padded for alignment)
97
98
  const uniformBuffer = createUniformBufferWithView(
@@ -110,36 +111,44 @@ export async function runMatmulRMSNormFused(
110
111
  );
111
112
 
112
113
  // Create placeholder for residual if not provided
114
+ const ownsResidualBuffer = !residual;
113
115
  const residualBuffer = residual || device.createBuffer({
114
116
  label: 'matmul_rmsnorm_residual_placeholder',
115
117
  size: 4,
116
118
  usage: GPUBufferUsage.STORAGE,
117
119
  });
118
120
 
119
- // Create bind group
120
- const bindGroup = device.createBindGroup({
121
- label: 'matmul_rmsnorm_fused_bind_group',
122
- layout: pipeline.getBindGroupLayout(0),
123
- entries: [
124
- { binding: 0, resource: { buffer: uniformBuffer } },
125
- { binding: 1, resource: { buffer: input.buffer } },
126
- { binding: 2, resource: { buffer: weightBuffer } },
127
- { binding: 3, resource: { buffer: normWeightBuffer } },
128
- { binding: 4, resource: { buffer: output } },
129
- { binding: 5, resource: { buffer: residualBuffer } },
130
- ],
131
- });
132
-
133
- // Calculate workgroups
134
-
135
- const workgroups = 1;
136
-
137
- const dispatchLabel = label ? `matmul_rmsnorm_fused:${label}` : 'matmul_rmsnorm_fused';
138
- dispatch(device, pipeline, bindGroup, workgroups, dispatchLabel);
121
+ try {
122
+ const bindGroup = device.createBindGroup({
123
+ label: 'matmul_rmsnorm_fused_bind_group',
124
+ layout: pipeline.getBindGroupLayout(0),
125
+ entries: [
126
+ { binding: 0, resource: { buffer: uniformBuffer } },
127
+ { binding: 1, resource: { buffer: input.buffer } },
128
+ { binding: 2, resource: { buffer: weightBuffer } },
129
+ { binding: 3, resource: { buffer: normWeightBuffer } },
130
+ { binding: 4, resource: { buffer: output } },
131
+ { binding: 5, resource: { buffer: residualBuffer } },
132
+ ],
133
+ });
134
+
135
+ const workgroups = 1;
136
+ const dispatchLabel = label ? `matmul_rmsnorm_fused:${label}` : 'matmul_rmsnorm_fused';
137
+ dispatch(device, pipeline, bindGroup, workgroups, dispatchLabel);
138
+ } catch (error) {
139
+ uniformBuffer.destroy();
140
+ if (ownsResidualBuffer) {
141
+ residualBuffer.destroy();
142
+ }
143
+ if (ownedOutput) {
144
+ releaseBuffer(ownedOutput);
145
+ }
146
+ throw error;
147
+ }
139
148
 
140
149
  // Cleanup
141
150
  uniformBuffer.destroy();
142
- if (!residual) residualBuffer.destroy();
151
+ if (ownsResidualBuffer) residualBuffer.destroy();
143
152
 
144
153
  // Output dtype matches input dtype
145
154
  return createTensor(output, input.dtype, [1, N], 'matmul_rmsnorm_fused_output');
@@ -199,7 +208,8 @@ export async function recordMatmulRMSNormFused(
199
208
  // Output buffer - size depends on dtype
200
209
  const bytesPerElement = dtype === 'f16' ? 2 : 4;
201
210
  const outputSize = N * bytesPerElement;
202
- const output = outputBuffer || acquireBuffer(outputSize, undefined, 'matmul_rmsnorm_fused_output');
211
+ const ownedOutput = outputBuffer ? null : acquireBuffer(outputSize, undefined, 'matmul_rmsnorm_fused_output');
212
+ const output = outputBuffer || ownedOutput;
203
213
 
204
214
  // Uniform buffer via recorder (8 u32/f32 = 32 bytes, padded for alignment)
205
215
  const uniformBuffer = createUniformBufferWithView(
@@ -217,35 +227,42 @@ export async function recordMatmulRMSNormFused(
217
227
  );
218
228
 
219
229
  // Placeholder for residual
230
+ const ownsResidualBuffer = !residual;
220
231
  const residualBuffer = residual || device.createBuffer({
221
232
  label: 'matmul_rmsnorm_residual_placeholder',
222
233
  size: 4,
223
234
  usage: GPUBufferUsage.STORAGE,
224
235
  });
225
236
 
226
- // Bind group
227
- const bindGroup = device.createBindGroup({
228
- label: 'matmul_rmsnorm_fused_bind_group',
229
- layout: pipeline.getBindGroupLayout(0),
230
- entries: [
231
- { binding: 0, resource: { buffer: uniformBuffer } },
232
- { binding: 1, resource: { buffer: input.buffer } },
233
- { binding: 2, resource: { buffer: weightBuffer } },
234
- { binding: 3, resource: { buffer: normWeightBuffer } },
235
- { binding: 4, resource: { buffer: output } },
236
- { binding: 5, resource: { buffer: residualBuffer } },
237
- ],
238
- });
239
-
240
- // Calculate workgroups
241
-
242
- const workgroups = 1;
243
-
244
- const dispatchLabel = label ? `matmul_rmsnorm_fused:${label}` : 'matmul_rmsnorm_fused';
245
- recordDispatch(recorder, pipeline, bindGroup, workgroups, dispatchLabel);
237
+ try {
238
+ const bindGroup = device.createBindGroup({
239
+ label: 'matmul_rmsnorm_fused_bind_group',
240
+ layout: pipeline.getBindGroupLayout(0),
241
+ entries: [
242
+ { binding: 0, resource: { buffer: uniformBuffer } },
243
+ { binding: 1, resource: { buffer: input.buffer } },
244
+ { binding: 2, resource: { buffer: weightBuffer } },
245
+ { binding: 3, resource: { buffer: normWeightBuffer } },
246
+ { binding: 4, resource: { buffer: output } },
247
+ { binding: 5, resource: { buffer: residualBuffer } },
248
+ ],
249
+ });
250
+
251
+ const workgroups = 1;
252
+ const dispatchLabel = label ? `matmul_rmsnorm_fused:${label}` : 'matmul_rmsnorm_fused';
253
+ recordDispatch(recorder, pipeline, bindGroup, workgroups, dispatchLabel);
254
+ } catch (error) {
255
+ if (ownsResidualBuffer) {
256
+ residualBuffer.destroy();
257
+ }
258
+ if (ownedOutput) {
259
+ releaseBuffer(ownedOutput);
260
+ }
261
+ throw error;
262
+ }
246
263
 
247
264
  // Track placeholder for cleanup
248
- if (!residual) {
265
+ if (ownsResidualBuffer) {
249
266
  recorder.trackTemporaryBuffer(residualBuffer);
250
267
  }
251
268
 
@@ -1,5 +1,5 @@
1
1
  import { getKernelCapabilities } from '../device.js';
2
- import { acquireBuffer } from '../../memory/buffer-pool.js';
2
+ import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
3
3
  import { WORKGROUP_SIZES, VEC4_ELEMENTS_PER_WG } from './constants.js';
4
4
  import { unifiedKernelWrapper } from './utils.js';
5
5
  import { trace } from '../../debug/index.js';
@@ -26,7 +26,6 @@ async function _gather(
26
26
  options = {}
27
27
  ) {
28
28
  const {
29
- useVec4 = true,
30
29
  outputBuffer = null,
31
30
  embeddingDtype,
32
31
  outputDtype,
@@ -43,9 +42,22 @@ async function _gather(
43
42
  if (outputDtype == null) {
44
43
  throw new Error('[Gather] outputDtype is required.');
45
44
  }
45
+ if (embeddingDtype === 'f16' && !caps.hasF16) {
46
+ throw new Error('[Gather] embeddingDtype=f16 requires shader-f16 support.');
47
+ }
48
+ if (outputDtype === 'f16' && !caps.hasF16) {
49
+ throw new Error('[Gather] outputDtype=f16 requires shader-f16 support.');
50
+ }
46
51
 
47
- const useF16Input = embeddingDtype === 'f16' && caps.hasF16;
48
- const useF16Output = outputDtype === 'f16' && caps.hasF16;
52
+ const requestedVec4 = options.useVec4;
53
+ const wantsVec4 = requestedVec4 ?? true;
54
+ if (requestedVec4 === true && hiddenSize % 4 !== 0) {
55
+ throw new Error('[Gather] useVec4=true requires hiddenSize to be divisible by 4.');
56
+ }
57
+
58
+ const useF16Input = embeddingDtype === 'f16';
59
+ const useF16Output = outputDtype === 'f16';
60
+ const useVec4 = wantsVec4 && hiddenSize % 4 === 0;
49
61
 
50
62
  trace.embed(
51
63
  `Gather: numTokens=${numTokens}, hiddenSize=${hiddenSize}, vocabSize=${vocabSize}, ` +
@@ -64,6 +76,7 @@ async function _gather(
64
76
  const paddedHiddenSize = padToQ4KBlock(hiddenSize);
65
77
  const outputSize = numTokens * paddedHiddenSize * bytesPerElement;
66
78
  const output = outputBuffer || acquireBuffer(outputSize, undefined, 'gather_output');
79
+ const ownedOutput = outputBuffer ? null : output;
67
80
 
68
81
  const uniforms = {
69
82
  num_tokens: numTokens,
@@ -82,16 +95,22 @@ async function _gather(
82
95
  ? Math.ceil((numTokens * hiddenSize) / VEC4_ELEMENTS_PER_WG)
83
96
  : Math.ceil((numTokens * hiddenSize) / WORKGROUP_SIZES.DEFAULT));
84
97
 
85
- await unifiedKernelWrapper(
86
- 'gather',
87
- target,
88
- variant,
89
- [indices, embeddings, output],
90
- uniforms,
91
- workgroups
92
- );
93
-
94
- return createTensor(output, actualDtype, [numTokens, hiddenSize], 'gather_output');
98
+ try {
99
+ await unifiedKernelWrapper(
100
+ 'gather',
101
+ target,
102
+ variant,
103
+ [indices, embeddings, output],
104
+ uniforms,
105
+ workgroups
106
+ );
107
+ return createTensor(output, actualDtype, [numTokens, hiddenSize], 'gather_output');
108
+ } catch (error) {
109
+ if (ownedOutput) {
110
+ releaseBuffer(ownedOutput);
111
+ }
112
+ throw error;
113
+ }
95
114
  }
96
115
 
97
116
  export async function runGather(
@@ -116,4 +135,3 @@ export async function recordGather(
116
135
  ) {
117
136
  return _gather(recorder, indices, embeddings, numTokens, hiddenSize, vocabSize, options);
118
137
  }
119
-
@@ -1,5 +1,5 @@
1
1
 
2
- import { acquireBuffer } from '../../memory/buffer-pool.js';
2
+ import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
3
3
  import { createTensor, dtypeBytes } from '../tensor.js';
4
4
  import { WORKGROUP_SIZES } from './constants.js';
5
5
  import { unifiedKernelWrapper } from './utils.js';
@@ -26,16 +26,24 @@ async function _gelu(target, input, options = {}) {
26
26
  const outputSize = inferredSize * bytesPerElement;
27
27
  const output = outputBuffer || acquireBuffer(outputSize, undefined, 'gelu_output');
28
28
  const gateBuffer = gate ?? input;
29
-
30
- await unifiedKernelWrapper(
31
- 'gelu', target, variant,
32
- [input, output, gateBuffer],
33
- { size: inferredSize, rowsplit_dim: 0 },
34
- Math.ceil(inferredSize / WORKGROUP_SIZES.DEFAULT),
35
- overrides
36
- );
37
-
38
- return createTensor(output, input.dtype, [inferredSize], 'gelu_output');
29
+ const ownedOutput = outputBuffer ? null : output;
30
+
31
+ try {
32
+ await unifiedKernelWrapper(
33
+ 'gelu', target, variant,
34
+ [input, output, gateBuffer],
35
+ { size: inferredSize, rowsplit_dim: 0 },
36
+ Math.ceil(inferredSize / WORKGROUP_SIZES.DEFAULT),
37
+ overrides
38
+ );
39
+
40
+ return createTensor(output, input.dtype, [inferredSize], 'gelu_output');
41
+ } catch (error) {
42
+ if (ownedOutput) {
43
+ releaseBuffer(ownedOutput);
44
+ }
45
+ throw error;
46
+ }
39
47
  }
40
48
 
41
49
  export async function runGeLU(input, options = {}) {
@@ -55,33 +55,43 @@ async function _groupedPointwiseConv2D(target, input, weight, bias, options = {}
55
55
  device.queue.writeBuffer(biasBuffer, 0, new Uint8Array(paddedSize));
56
56
  }
57
57
 
58
- await unifiedKernelWrapper(
59
- 'grouped_pointwise_conv2d',
60
- target,
61
- variant,
62
- [input, weightBuffer, biasBuffer, output],
63
- {
64
- in_channels: inChannels,
65
- out_channels: outChannels,
66
- height,
67
- width,
68
- groups,
69
- _pad0: 0,
70
- _pad1: 0,
71
- _pad2: 0,
72
- },
73
- [Math.ceil(spatial / WORKGROUP_SIZES.DEFAULT), outChannels, 1]
74
- );
58
+ try {
59
+ await unifiedKernelWrapper(
60
+ 'grouped_pointwise_conv2d',
61
+ target,
62
+ variant,
63
+ [input, weightBuffer, biasBuffer, output],
64
+ {
65
+ in_channels: inChannels,
66
+ out_channels: outChannels,
67
+ height,
68
+ width,
69
+ groups,
70
+ _pad0: 0,
71
+ _pad1: 0,
72
+ _pad2: 0,
73
+ },
74
+ [Math.ceil(spatial / WORKGROUP_SIZES.DEFAULT), outChannels, 1]
75
+ );
76
+
77
+ if (tempBias) {
78
+ if (recorder) {
79
+ recorder.trackTemporaryBuffer(tempBias);
80
+ } else {
81
+ releaseBuffer(tempBias);
82
+ }
83
+ }
75
84
 
76
- if (tempBias) {
77
- if (recorder) {
78
- recorder.trackTemporaryBuffer(tempBias);
79
- } else {
85
+ return createTensor(output, input.dtype, [outChannels, height, width], 'grouped_pointwise_conv2d_output');
86
+ } catch (error) {
87
+ if (tempBias) {
80
88
  releaseBuffer(tempBias);
81
89
  }
90
+ if (!outputBuffer) {
91
+ releaseBuffer(output);
92
+ }
93
+ throw error;
82
94
  }
83
-
84
- return createTensor(output, input.dtype, [outChannels, height, width], 'grouped_pointwise_conv2d_output');
85
95
  }
86
96
 
87
97
  export async function runGroupedPointwiseConv2D(input, weight, bias, options = {}) {
@@ -17,6 +17,9 @@ function validateOptions(options) {
17
17
  if (!Number.isFinite(numGroups) || numGroups <= 0) {
18
18
  throw new Error('GroupNorm requires numGroups > 0.');
19
19
  }
20
+ if (channels % numGroups !== 0) {
21
+ throw new Error('GroupNorm requires channels to be divisible by numGroups.');
22
+ }
20
23
  if (!Number.isFinite(eps)) {
21
24
  throw new Error('GroupNorm requires eps.');
22
25
  }
@@ -44,34 +47,42 @@ async function _groupNorm(target, input, weight, bias, options = {}) {
44
47
 
45
48
  const statsSize = numGroups * 2 * 4;
46
49
  const statsBuffer = acquireBuffer(statsSize, undefined, 'groupnorm_stats');
47
-
48
- await unifiedKernelWrapper(
49
- 'groupnorm_stats',
50
- target,
51
- statsVariant,
52
- [input, statsBuffer],
53
- uniforms,
54
- numGroups
55
- );
56
-
57
50
  const bytesPerElement = dtypeBytes(input.dtype);
58
51
  const outputSize = channels * height * width * bytesPerElement;
59
- const output = outputBuffer || acquireBuffer(outputSize, undefined, 'groupnorm_output');
52
+ const ownedOutput = outputBuffer ? null : acquireBuffer(outputSize, undefined, 'groupnorm_output');
53
+ const output = outputBuffer || ownedOutput;
60
54
 
61
- const weightBuffer = getBuffer(weight);
62
- const biasBuffer = getBuffer(bias);
55
+ try {
56
+ await unifiedKernelWrapper(
57
+ 'groupnorm_stats',
58
+ target,
59
+ statsVariant,
60
+ [input, statsBuffer],
61
+ uniforms,
62
+ numGroups
63
+ );
63
64
 
64
- const total = channels * height * width;
65
- const workgroups = Math.ceil(total / WORKGROUP_SIZES.DEFAULT);
65
+ const weightBuffer = getBuffer(weight);
66
+ const biasBuffer = getBuffer(bias);
66
67
 
67
- await unifiedKernelWrapper(
68
- 'groupnorm_apply',
69
- target,
70
- applyVariant,
71
- [input, statsBuffer, weightBuffer, biasBuffer, output],
72
- uniforms,
73
- workgroups
74
- );
68
+ const total = channels * height * width;
69
+ const workgroups = Math.ceil(total / WORKGROUP_SIZES.DEFAULT);
70
+
71
+ await unifiedKernelWrapper(
72
+ 'groupnorm_apply',
73
+ target,
74
+ applyVariant,
75
+ [input, statsBuffer, weightBuffer, biasBuffer, output],
76
+ uniforms,
77
+ workgroups
78
+ );
79
+ } catch (error) {
80
+ releaseBuffer(statsBuffer);
81
+ if (ownedOutput) {
82
+ releaseBuffer(ownedOutput);
83
+ }
84
+ throw error;
85
+ }
75
86
 
76
87
  if (recorder) {
77
88
  recorder.trackTemporaryBuffer(statsBuffer);
@@ -78,8 +78,11 @@ export async function runKVQuantize(
78
78
  });
79
79
 
80
80
  const workgroups = [numKVHeads, numTokens, 1];
81
- dispatch(device, pipeline, bindGroup, workgroups, 'kv_quantize');
82
- uniformBuffer.destroy();
81
+ try {
82
+ dispatch(device, pipeline, bindGroup, workgroups, 'kv_quantize');
83
+ } finally {
84
+ uniformBuffer.destroy();
85
+ }
83
86
  }
84
87
 
85
88
 
@@ -1,6 +1,6 @@
1
1
 
2
2
  import { getKernelCapabilities } from '../device.js';
3
- import { acquireBuffer } from '../../memory/buffer-pool.js';
3
+ import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
4
4
  import { createTensor } from '../tensor.js';
5
5
  import { padToQ4KBlock } from '../../config/schema/index.js';
6
6
  import { selectRuleValue } from './rule-registry.js';
@@ -36,17 +36,25 @@ export async function runLayerNorm(
36
36
  const paddedHiddenSize = padToQ4KBlock(inferredHiddenSize);
37
37
  const outputSize = batchSize * paddedHiddenSize * bytesPerElement;
38
38
  const outputBuf = outputBuffer || acquireBuffer(outputSize, undefined, 'layernorm_output');
39
+ const ownedOutput = outputBuffer ? null : outputBuf;
39
40
 
40
- await unifiedKernelWrapper(
41
- 'layernorm',
42
- null,
43
- variant,
44
- [input, weight, bias, outputBuf],
45
- { hidden_size: inferredHiddenSize, num_tokens: batchSize, eps },
46
- batchSize
47
- );
41
+ try {
42
+ await unifiedKernelWrapper(
43
+ 'layernorm',
44
+ null,
45
+ variant,
46
+ [input, weight, bias, outputBuf],
47
+ { hidden_size: inferredHiddenSize, num_tokens: batchSize, eps },
48
+ batchSize
49
+ );
48
50
 
49
- return createTensor(outputBuf, input.dtype, [batchSize, inferredHiddenSize], 'layernorm_output');
51
+ return createTensor(outputBuf, input.dtype, [batchSize, inferredHiddenSize], 'layernorm_output');
52
+ } catch (error) {
53
+ if (ownedOutput) {
54
+ releaseBuffer(ownedOutput);
55
+ }
56
+ throw error;
57
+ }
50
58
  }
51
59
 
52
60
  export async function recordLayerNorm(
@@ -66,15 +74,23 @@ export async function recordLayerNorm(
66
74
  const paddedHiddenSize = padToQ4KBlock(inferredHiddenSize);
67
75
  const outputSize = batchSize * paddedHiddenSize * bytesPerElement;
68
76
  const outputBuf = outputBuffer || acquireBuffer(outputSize, undefined, 'layernorm_output');
77
+ const ownedOutput = outputBuffer ? null : outputBuf;
69
78
 
70
- await unifiedKernelWrapper(
71
- 'layernorm',
72
- recorder,
73
- variant,
74
- [input, weight, bias, outputBuf],
75
- { hidden_size: inferredHiddenSize, num_tokens: batchSize, eps },
76
- batchSize
77
- );
79
+ try {
80
+ await unifiedKernelWrapper(
81
+ 'layernorm',
82
+ recorder,
83
+ variant,
84
+ [input, weight, bias, outputBuf],
85
+ { hidden_size: inferredHiddenSize, num_tokens: batchSize, eps },
86
+ batchSize
87
+ );
78
88
 
79
- return createTensor(outputBuf, input.dtype, [batchSize, inferredHiddenSize], 'layernorm_output');
89
+ return createTensor(outputBuf, input.dtype, [batchSize, inferredHiddenSize], 'layernorm_output');
90
+ } catch (error) {
91
+ if (ownedOutput) {
92
+ releaseBuffer(ownedOutput);
93
+ }
94
+ throw error;
95
+ }
80
96
  }
@@ -266,9 +266,11 @@ export class LogitMergeKernel {
266
266
  pass.end();
267
267
 
268
268
  this.#device.queue.submit([encoder.finish()]);
269
-
270
- // Cleanup temporary buffer
271
- paramsBuffer.destroy();
269
+ this.#device.queue.onSubmittedWorkDone()
270
+ .catch(() => {})
271
+ .finally(() => {
272
+ paramsBuffer.destroy();
273
+ });
272
274
 
273
275
  return mergedBuffer;
274
276
  }
@@ -1,4 +1,4 @@
1
- import { getDevice } from '../device.js';
1
+ import { getDevice, getKernelCapabilities } from '../device.js';
2
2
  import { createTensor } from '../tensor.js';
3
3
  import { getBuffer, getLayout, getWeightDtype } from '../weight-buffer.js';
4
4
  import { log, trace, isTraceEnabled } from '../../debug/index.js';
@@ -110,6 +110,7 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
110
110
  const mode = isRecord ? 'record' : 'run';
111
111
  const opLabel = isRecord ? 'recordMatmul' : 'runMatmul';
112
112
  const device = recorder?.device || getDevice();
113
+ const capabilities = getKernelCapabilities();
113
114
 
114
115
  const {
115
116
  alpha = 1.0,
@@ -139,6 +140,13 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
139
140
  const bDtype = toMatmulDtype(weightDtype ?? options.bDtype);
140
141
  const requestedOutputDtype = options.outputDtype || A.dtype;
141
142
 
143
+ if (bDtype === 'f16' && capabilities?.hasF16 !== true) {
144
+ throw new Error(`[${opLabel}] f16 weights require shader-f16 support.`);
145
+ }
146
+ if (requestedOutputDtype === 'f16' && capabilities?.hasF16 !== true) {
147
+ throw new Error(`[${opLabel}] f16 output requires shader-f16 support.`);
148
+ }
149
+
142
150
  if (!isRecord && isTraceEnabled('kernels') && !weightDtype && !options.bDtype && M <= 2) {
143
151
  log.warn('Matmul', `runMatmul: B buffer dtype unknown! size=${bBuffer.size}, M=${M}, N=${N}, K=${K}. Assuming f32.`);
144
152
  }
@@ -228,6 +236,7 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
228
236
  N,
229
237
  outputBuffer
230
238
  );
239
+ const ownsOutput = outputBuffer == null;
231
240
 
232
241
  if (!Number.isFinite(outputSize) || outputSize <= 0) {
233
242
  throw new Error(`[${opLabel}] Invalid output size: ${outputSize} (M=${M}, N=${N})`);
@@ -239,50 +248,60 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
239
248
  }
240
249
 
241
250
  const dispatchPlan = calculateMatmulDispatch(variant, useQ4KFused, useGemv, M, N, config);
242
- const uniformBuffer = createMatmulUniformBuffer(
243
- 'matmul_uniforms',
244
- M,
245
- N,
246
- K,
247
- alpha,
248
- useQ4KFused,
249
- transposeB,
250
- dispatchPlan.uniformWorkgroupsX,
251
- recorder || null,
252
- device
253
- );
251
+ let uniformBuffer = null;
252
+ let completed = false;
253
+ try {
254
+ uniformBuffer = createMatmulUniformBuffer(
255
+ 'matmul_uniforms',
256
+ M,
257
+ N,
258
+ K,
259
+ alpha,
260
+ useQ4KFused,
261
+ transposeB,
262
+ dispatchPlan.uniformWorkgroupsX,
263
+ recorder || null,
264
+ device
265
+ );
254
266
 
255
- const entries = createMatmulBindGroupEntries(
256
- variant,
257
- uniformBuffer,
258
- matmulInput,
259
- bBuffer,
260
- C,
261
- { aOffset, bOffset, cOffset },
262
- {
263
- aBindingSize: bindingSizes.aBindingSize,
264
- bBindingSize: bindingSizes.bBindingSize,
265
- cBindingSize,
266
- }
267
- );
267
+ const entries = createMatmulBindGroupEntries(
268
+ variant,
269
+ uniformBuffer,
270
+ matmulInput,
271
+ bBuffer,
272
+ C,
273
+ { aOffset, bOffset, cOffset },
274
+ {
275
+ aBindingSize: bindingSizes.aBindingSize,
276
+ bBindingSize: bindingSizes.bBindingSize,
277
+ cBindingSize,
278
+ }
279
+ );
268
280
 
269
- const bindGroup = device.createBindGroup({
270
- label: 'matmul_bind_group',
271
- layout: pipeline.getBindGroupLayout(0),
272
- entries,
273
- });
281
+ const bindGroup = device.createBindGroup({
282
+ label: 'matmul_bind_group',
283
+ layout: pipeline.getBindGroupLayout(0),
284
+ entries,
285
+ });
274
286
 
275
- if (isRecord) {
276
- kernel.record(recorder, pipeline, bindGroup, dispatchPlan.workgroups, buildProfileLabel(options));
277
- } else {
278
- kernel.dispatch(pipeline, bindGroup, dispatchPlan.workgroups);
279
- releaseUniformBuffer(uniformBuffer);
280
- if (castedInput) {
287
+ if (isRecord) {
288
+ kernel.record(recorder, pipeline, bindGroup, dispatchPlan.workgroups, buildProfileLabel(options));
289
+ } else {
290
+ kernel.dispatch(pipeline, bindGroup, dispatchPlan.workgroups);
291
+ }
292
+ completed = true;
293
+ return createTensor(C, actualOutputDtype, [M, N], 'matmul_output');
294
+ } finally {
295
+ if (!isRecord && uniformBuffer) {
296
+ releaseUniformBuffer(uniformBuffer);
297
+ }
298
+ if (!isRecord && castedInput) {
281
299
  releaseBuffer(castedInput.buffer);
282
300
  }
301
+ if (!completed && ownsOutput) {
302
+ releaseBuffer(C);
303
+ }
283
304
  }
284
-
285
- return createTensor(C, actualOutputDtype, [M, N], 'matmul_output');
286
305
  }
287
306
 
288
307