tensorgrad 0.0.16 → 0.0.17

```diff
@@ -148,34 +148,50 @@ ${k.wgsl}
     }
     queue.writeBuffer(buffers.get(bufId), 0, data);
   }
-  const encoder = device2.createCommandEncoder({ label: "tensorgrad-step" });
-  for (let i = 0; i < kernels.length; i++) {
-    const k = kernels[i];
-    if (!k.wgsl || k.threads === 0) continue;
-    const pipeline = pipelines[i];
-    const bindGroup = bindGroups[i];
-    const pass = encoder.beginComputePass({ label: k.opKind });
-    pass.setPipeline(pipeline);
-    pass.setBindGroup(0, bindGroup);
-    const wgCount = Math.max(1, Math.ceil(k.threads / k.workgroupSize));
-    const MAX_X = 65535;
-    const wgX = Math.min(wgCount, MAX_X);
-    const wgY = Math.ceil(wgCount / MAX_X);
-    pass.dispatchWorkgroups(wgX, wgY, 1);
-    pass.end();
-  }
-  for (const wb of plan.writebacks) {
-    encoder.copyBufferToBuffer(buffers.get(wb.source), 0, buffers.get(wb.dest), 0, wb.bytes);
-  }
-  encoder.copyBufferToBuffer(buffers.get(lossBufferId), 0, outputReadback, 0, outputSpec.byteSize);
+  const CHUNK_SIZE = 32;
   let layout = null;
   if (wantCaptures) {
     layout = ensureCaptureStaging();
-    for (const s of layout.slices) {
-      encoder.copyBufferToBuffer(buffers.get(s.bufId), 0, layout.buffer, s.offset, s.byteSize);
+  }
+  let kernelIdx = 0;
+  while (kernelIdx < kernels.length) {
+    const chunkEnd = Math.min(kernelIdx + CHUNK_SIZE, kernels.length);
+    const isLast = chunkEnd === kernels.length;
+    const encoder = device2.createCommandEncoder({
+      label: kernels.length > CHUNK_SIZE ? `tensorgrad-chunk-${kernelIdx}` : "tensorgrad-step"
+    });
+    for (let i = kernelIdx; i < chunkEnd; i++) {
+      const k = kernels[i];
+      if (!k.wgsl || k.threads === 0) continue;
+      const pipeline = pipelines[i];
+      const bindGroup = bindGroups[i];
+      const pass = encoder.beginComputePass({ label: k.opKind });
+      pass.setPipeline(pipeline);
+      pass.setBindGroup(0, bindGroup);
+      const wgCount = Math.max(1, Math.ceil(k.threads / k.workgroupSize));
+      const MAX_X = 65535;
+      const wgX = Math.min(wgCount, MAX_X);
+      const wgY = Math.ceil(wgCount / MAX_X);
+      pass.dispatchWorkgroups(wgX, wgY, 1);
+      pass.end();
     }
+    if (isLast) {
+      for (const wb of plan.writebacks) {
+        encoder.copyBufferToBuffer(buffers.get(wb.source), 0, buffers.get(wb.dest), 0, wb.bytes);
+      }
+      encoder.copyBufferToBuffer(buffers.get(lossBufferId), 0, outputReadback, 0, outputSpec.byteSize);
+      if (layout) {
+        for (const s of layout.slices) {
+          encoder.copyBufferToBuffer(buffers.get(s.bufId), 0, layout.buffer, s.offset, s.byteSize);
+        }
+      }
+    }
+    queue.submit([encoder.finish()]);
+    if (!isLast) {
+      await queue.onSubmittedWorkDone();
+    }
+    kernelIdx = chunkEnd;
   }
-  queue.submit([encoder.finish()]);
   if (!opts2.readback) return null;
   await outputReadback.mapAsync(GPUMapMode.READ);
   const output = new Float32Array(outputReadback.getMappedRange().slice(0));
```
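
The per-kernel dispatch math in the hunk above is easy to miss among the chunking changes, so here is a minimal standalone sketch of the same 2D split, assuming a 256-thread workgroup. The names (`dispatchFlat`, `data`) and the doubling kernel body are illustrative only, not tensorgrad's actual codegen; only the split arithmetic mirrors the diff.

```ts
// Sketch: folding a flat N-thread kernel into a 2D dispatch, since WebGPU
// caps each dispatch dimension at 65535 workgroups. Hypothetical example,
// not tensorgrad source.
const WORKGROUP_SIZE = 256
const MAX_X = 65535

// WGSL rebuilds the flat index from the 2D grid with the stride
// 65535 * workgroup_size, matching what the host dispatches below.
const wgsl = /* wgsl */ `
@group(0) @binding(0) var<storage, read_write> data: array<f32>;

@compute @workgroup_size(${WORKGROUP_SIZE})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let idx = gid.x + gid.y * ${MAX_X * WORKGROUP_SIZE}u;
  if (idx >= arrayLength(&data)) { return; }  // guard the over-dispatch tail
  data[idx] = data[idx] * 2.0;
}`

function dispatchFlat(pass: GPUComputePassEncoder, threads: number) {
  const wgCount = Math.max(1, Math.ceil(threads / WORKGROUP_SIZE))
  const wgX = Math.min(wgCount, MAX_X) // row width, capped at 65535
  const wgY = Math.ceil(wgCount / MAX_X) // extra rows only when wgCount > 65535
  pass.dispatchWorkgroups(wgX, wgY, 1)
}
```

When a dispatch fits in one row, `wgY` is 1 and `gid.y` is always 0, so the small-kernel case pays nothing for the 2D scheme.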
package/package.json CHANGED
```diff
@@ -1,6 +1,6 @@
 {
   "name": "tensorgrad",
-  "version": "0.0.16",
+  "version": "0.0.17",
   "description": "Tiny TypeScript-native tensor library with autograd, compiling to WebGPU. Train small models in the browser without hand-writing kernels.",
   "license": "MIT",
   "author": "Ben Albahari",
```
package/src/runtime.ts CHANGED
```diff
@@ -348,42 +348,67 @@ export async function createRuntime(
     queue.writeBuffer(buffers.get(bufId)!, 0, data as unknown as BufferSource)
   }
 
-  const encoder = device.createCommandEncoder({ label: 'tensorgrad-step' })
-  for (let i = 0; i < kernels.length; i++) {
-    const k = kernels[i]!
-    if (!k.wgsl || k.threads === 0) continue
-    const pipeline = pipelines[i]!
-    const bindGroup = bindGroups[i]!
-    const pass = encoder.beginComputePass({ label: k.opKind })
-    pass.setPipeline(pipeline)
-    pass.setBindGroup(0, bindGroup)
-    // WebGPU caps each dispatch dimension at 65535 workgroups. Split into 2D
-    // when a kernel needs more than that on the X axis. Kernels compute their
-    // global index as `gid.x + gid.y * (65535 * workgroup_size)`, matching the
-    // stride we set here. For dispatches that fit in one row, gid.y is 0.
-    const wgCount = Math.max(1, Math.ceil(k.threads / k.workgroupSize))
-    const MAX_X = 65535
-    const wgX = Math.min(wgCount, MAX_X)
-    const wgY = Math.ceil(wgCount / MAX_X)
-    pass.dispatchWorkgroups(wgX, wgY, 1)
-    pass.end()
-  }
-  // After all dispatches: writebacks (Adam state, updated params). Empty for
-  // forward-only compiles.
-  for (const wb of plan.writebacks) {
-    encoder.copyBufferToBuffer(buffers.get(wb.source)!, 0, buffers.get(wb.dest)!, 0, wb.bytes)
-  }
-  encoder.copyBufferToBuffer(buffers.get(lossBufferId)!, 0, outputReadback, 0, outputSpec.byteSize)
-  // Capture readbacks (only when opted in). All captures concatenate into
-  // a single staging buffer so we mapAsync once instead of N times.
+  // Chunked submit. One queue.submit() of all 240 kernels monopolizes the
+  // GPU for the full step duration, blocking compositor frames the entire
+  // time. Splitting into chunks with an explicit GPU-drain await between
+  // them gives the compositor a slot at each chunk boundary. On graphs
+  // smaller than CHUNK_SIZE this collapses to a single submit (no
+  // overhead). See specs/WorkerArchitecture.md / mobile-jank investigation.
+  const CHUNK_SIZE = 32
   let layout: CaptureLayout | null = null
   if (wantCaptures) {
+    // Compute layout up front so the last chunk can append capture copies.
    layout = ensureCaptureStaging()
-    for (const s of layout.slices) {
-      encoder.copyBufferToBuffer(buffers.get(s.bufId)!, 0, layout.buffer, s.offset, s.byteSize)
+  }
+
+  let kernelIdx = 0
+  while (kernelIdx < kernels.length) {
+    const chunkEnd = Math.min(kernelIdx + CHUNK_SIZE, kernels.length)
+    const isLast = chunkEnd === kernels.length
+    const encoder = device.createCommandEncoder({
+      label: kernels.length > CHUNK_SIZE ? `tensorgrad-chunk-${kernelIdx}` : 'tensorgrad-step',
+    })
+    for (let i = kernelIdx; i < chunkEnd; i++) {
+      const k = kernels[i]!
+      if (!k.wgsl || k.threads === 0) continue
+      const pipeline = pipelines[i]!
+      const bindGroup = bindGroups[i]!
+      const pass = encoder.beginComputePass({ label: k.opKind })
+      pass.setPipeline(pipeline)
+      pass.setBindGroup(0, bindGroup)
+      // WebGPU caps each dispatch dimension at 65535 workgroups. Split into 2D
+      // when a kernel needs more than that on the X axis. Kernels compute their
+      // global index as `gid.x + gid.y * (65535 * workgroup_size)`, matching the
+      // stride we set here. For dispatches that fit in one row, gid.y is 0.
+      const wgCount = Math.max(1, Math.ceil(k.threads / k.workgroupSize))
+      const MAX_X = 65535
+      const wgX = Math.min(wgCount, MAX_X)
+      const wgY = Math.ceil(wgCount / MAX_X)
+      pass.dispatchWorkgroups(wgX, wgY, 1)
+      pass.end()
     }
+    if (isLast) {
+      // Writebacks (Adam state, updated params; empty for forward-only) +
+      // output readback copy + capture readback copies all go into the
+      // final chunk so a single mapAsync below sees everything.
+      for (const wb of plan.writebacks) {
+        encoder.copyBufferToBuffer(buffers.get(wb.source)!, 0, buffers.get(wb.dest)!, 0, wb.bytes)
+      }
+      encoder.copyBufferToBuffer(buffers.get(lossBufferId)!, 0, outputReadback, 0, outputSpec.byteSize)
+      if (layout) {
+        for (const s of layout.slices) {
+          encoder.copyBufferToBuffer(buffers.get(s.bufId)!, 0, layout.buffer, s.offset, s.byteSize)
+        }
+      }
+    }
+    queue.submit([encoder.finish()])
+    if (!isLast) {
+      // Drain the chunk before queuing the next one. This is the moment
+      // the compositor can interleave its own frame work onto the GPU.
+      await queue.onSubmittedWorkDone()
+    }
+    kernelIdx = chunkEnd
   }
-  queue.submit([encoder.finish()])
 
   // readback=false: training fire-and-forget. The encoder still copied
   // loss → outputReadback (and captures → staging), but we don't await
```
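
For reference, the chunked-submit pattern this release introduces reduces to a small skeleton. In this sketch, `submitChunked` and `encodeKernel` are hypothetical names standing in for tensorgrad's pipeline and bind-group work; only `createCommandEncoder`, `queue.submit`, and `queue.onSubmittedWorkDone` are the WebGPU APIs the diff actually uses.

```ts
// Sketch of chunked command submission with a GPU drain between chunks.
// Hypothetical helper, not tensorgrad source.
async function submitChunked(
  device: GPUDevice,
  kernelCount: number,
  encodeKernel: (encoder: GPUCommandEncoder, i: number) => void,
  chunkSize = 32,
) {
  for (let start = 0; start < kernelCount; start += chunkSize) {
    const end = Math.min(start + chunkSize, kernelCount)
    const encoder = device.createCommandEncoder()
    for (let i = start; i < end; i++) encodeKernel(encoder, i)
    device.queue.submit([encoder.finish()])
    // Drain between chunks, but not after the last one: the await is the
    // window where the compositor can slot its frame work onto the GPU.
    if (end < kernelCount) await device.queue.onSubmittedWorkDone()
  }
}
```

The design trade-off, per the comment in runtime.ts, is throughput for responsiveness: each drain inserts a GPU bubble, but a graph with fewer kernels than the chunk size degenerates to a single uninterrupted submit, so small models pay no overhead. Keeping the writeback, loss-readback, and capture copies in the final chunk preserves the 0.0.16 behavior of a single `mapAsync` seeing all results.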