npm - @simulatte/webgpu - Versions diffs - 0.2.4 → 0.3.0 - Mend

@simulatte/webgpu 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/CHANGELOG.md +21 -0
package/README.md +263 -71
package/api-contract.md +70 -139
package/assets/package-layers.svg +63 -0
package/examples/direct-webgpu/compute-dispatch.js +66 -0
package/examples/direct-webgpu/explicit-bind-group.js +85 -0
package/examples/direct-webgpu/request-device.js +10 -0
package/examples/doe-api/buffers-readback.js +9 -0
package/examples/doe-api/compile-and-dispatch.js +30 -0
package/examples/doe-api/compute-dispatch.js +25 -0
package/examples/doe-routines/compute-once-like-input.js +36 -0
package/examples/doe-routines/compute-once-matmul.js +53 -0
package/examples/doe-routines/compute-once-multiple-inputs.js +27 -0
package/examples/doe-routines/compute-once.js +23 -0
package/headless-webgpu-comparison.md +2 -2
package/layering-plan.md +1 -1
package/native/doe_napi.c +102 -12
package/package.json +2 -1
package/prebuilds/darwin-arm64/doe_napi.node +0 -0
package/prebuilds/darwin-arm64/libwebgpu_doe.dylib +0 -0
package/prebuilds/darwin-arm64/metadata.json +6 -6
package/prebuilds/linux-x64/doe_napi.node +0 -0
package/prebuilds/linux-x64/libwebgpu_doe.so +0 -0
package/prebuilds/linux-x64/metadata.json +5 -5
package/scripts/generate-readme-assets.js +79 -6
package/scripts/prebuild.js +23 -19
package/src/auto_bind_group_layout.js +32 -0
package/src/bun-ffi.js +93 -12
package/src/bun.js +23 -2
package/src/compute.d.ts +2 -1
package/src/compute.js +671 -33
package/src/doe.d.ts +127 -27
package/src/doe.js +480 -114
package/src/full.d.ts +8 -1
package/src/full.js +28 -3
package/src/index.js +1013 -38

package/api-contract.md CHANGED Viewed

@@ -3,16 +3,47 @@
 Contract version: `v1`
 Scope: current headless WebGPU package contract for Node.js and Bun, with a
-default `full` surface, an explicit `compute` subpath, and Doe runtime helpers
-used by benchmarking, CI, and artifact-backed comparison workflows.
+default `full` surface, an explicit `compute` subpath, and the Doe API / Doe
+routines surface used by benchmarking, CI, and artifact-backed comparison
+workflows.
+Terminology in this contract is explicit:
+- `Doe runtime`
+  the Zig/native WebGPU runtime underneath the package
+- `Doe API`
+  the explicit JS convenience surface under `doe.bind(...)`, `gpu.buffers.*`,
+  `gpu.compute.run(...)`, and `gpu.compute.compile(...)`
+- `Doe routines`
+  the narrower, more opinionated JS flows layered on that same runtime;
+  currently `gpu.compute.once(...)`
 For the current `compute` vs `full` support split, see
 [`./support-contracts.md`](./support-contracts.md).
+Exact type and method shapes live in:
+- [`./src/full.d.ts`](./src/full.d.ts)
+- [`./src/compute.d.ts`](./src/compute.d.ts)
+- [`./src/doe.d.ts`](./src/doe.d.ts)
 This contract covers package-surface GPU access, provider metadata, and helper
 entrypoints. It does not promise DOM/canvas ownership or browser-process
 parity.
+## API styles
+The current package surface is organized around three API styles:
+- `Direct WebGPU`
+  raw `requestAdapter(...)`, `requestDevice(...)`, and direct `device.*` usage
+- `Doe API`
+  the package's explicit JS convenience surface under `doe.bind(...)`,
+  `gpu.buffers.*`, `gpu.compute.run(...)`, and `gpu.compute.compile(...)`
+- `Doe routines`
+  the package's more opinionated precomposed flows; currently
+  `gpu.compute.once(...)`
 ## Export surfaces
 ### `@simulatte/webgpu`
@@ -22,8 +53,8 @@ Default package surface.
 Contract:
 - headless `full` surface
-- includes compute plus render/sampler/surface APIs already exposed by the package runtime
-- also exports the `doe` ergonomic namespace
+- includes compute plus render/sampler/surface APIs already exposed by the Doe runtime package surface
+- also exports the shared `doe` namespace for the Doe API and Doe routines surface
 ### `@simulatte/webgpu/compute`
@@ -33,7 +64,7 @@ Contract:
 - sized for AI workloads and other buffer/dispatch-heavy headless execution
 - excludes render/sampler/surface methods from the public JS facade
-- also exports the same `doe` ergonomic namespace
+- also exports the same `doe` namespace for the Doe API and Doe routines surface
 ## Shared runtime API
@@ -42,175 +73,75 @@ Modules:
 - `@simulatte/webgpu`
 - `@simulatte/webgpu/compute`
-### `create(createArgs?)`
-Input:
-- `createArgs?: string[]` (currently ignored by the default Doe-native provider)
-Behavior:
-- loads the Doe-native N-API addon and `libwebgpu_doe`
-- returns a GPU object backed by the in-tree Doe provider
-Output:
-- `GPU` object with `requestAdapter(...)`
-### `globals`
-Output:
-- provider globals object suitable for `Object.assign(globalThis, globals)`
-### `setupGlobals(target?, createArgs?)`
-Input:
-- `target?: object` (default: `globalThis`)
-- `createArgs?: string[]`
-Behavior:
-- installs provider globals if missing
-- installs `navigator.gpu` if missing
-Output:
+### Top-level package API
-- `GPU` object
+The exact signatures are defined in the `.d.ts` files above. At the contract
+level:
-### `requestAdapter(adapterOptions?, createArgs?)`
+- `create(...)` loads the Doe-native addon/runtime and returns a package-local
+  `GPU` object.
+- `globals` exposes provider globals suitable for `Object.assign(...)` or
+  bootstrap wiring.
+- `setupGlobals(...)` installs globals and `navigator.gpu` when missing.
+- `requestAdapter(...)` and `requestDevice(...)` are the `Direct WebGPU` entry
+  points.
-Output:
-- `Promise<GPUAdapter | null>`
-### `requestDevice(options?)`
-Input:
-- `options.adapterOptions?: object`
-- `options.deviceDescriptor?: object`
-- `options.createArgs?: string[]`
-Output:
-- `Promise<GPUDevice>`
-On `@simulatte/webgpu/compute`, the returned device is a compute-only facade:
+On `@simulatte/webgpu/compute`, the returned device is intentionally
+compute-only:
 - buffer / bind group / compute pipeline / command encoder / queue methods are available
 - render / sampler / surface methods are intentionally absent from the facade
 ### `providerInfo()`
-Output object:
-- `module: string`
-- `loaded: boolean`
-- `loadError: string`
-- `defaultCreateArgs: string[]`
-- `doeNative: boolean`
-- `libraryFlavor: string`
-- `doeLibraryPath: string`
-- `buildMetadataSource: string`
-- `buildMetadataPath: string`
-- `leanVerifiedBuild: boolean | null`
-- `proofArtifactSha256: string | null`
 Behavior:
 - reports package-surface library provenance when prebuild metadata or Zig build
   metadata is available
 - does not guess: if metadata is unavailable, `leanVerifiedBuild` is `null`
+- reports whether the Doe-native path is loaded and where build metadata came from
 ### `doe`
-Output object:
-- `bind(device)`
-- `createBuffer(device, options)`
-- `createBufferFromData(device, data, options?)`
-- `readBuffer(device, buffer, TypedArray, options?)`
-- `runCompute(device, options)`
-- `compileCompute(device, options)`
 Behavior:
-- provides an ergonomic JS surface for common headless compute tasks
+- provides the `Doe API` and `Doe routines` surface for common headless
+  compute tasks
+- the exported `doe` namespace is the JS convenience surface, distinct from
+  the underlying Doe runtime
+- `doe.requestDevice(options?)` resolves the package-local `requestDevice(...)` and returns
+  the device-bound helper object directly
 - supports both static helper calls and `doe.bind(device)` for device-bound workflows
-- infers `runCompute(...).bindings` access from Doe helper-created buffer usage when that
-  usage maps to one bindable access mode (`uniform`, `storage-read`, `storage-readwrite`)
+- helper methods are grouped under `buffers.*` and `compute.*`
+- `buffers.*`, `compute.run(...)`, and `compute.compile(...)` are the main
+  `Doe API` surface
+- `compute.once(...)` is the first `Doe routines` path and stays intentionally
+  narrow: typed-array/headless one-call execution, not a replacement for
+  explicit reusable resource ownership
+- infers `compute.run(...).bindings` access from Doe helper-created buffer usage when that
+  usage maps to one bindable access mode (`uniform`, `storageRead`, `storageReadWrite`)
+- `compute.once(...)` accepts Doe usage tokens only; raw numeric WebGPU usage flags stay on
+  the more explicit `Doe API` surface
 - fails fast for bare bindings that do not carry Doe helper usage metadata or whose
   usage is non-bindable/ambiguous; callers must pass `{ buffer, access }` explicitly
 - additive only; it does not replace the raw WebGPU-facing package API
 ### `createDoeRuntime(options?)`
-Input:
-- `options.binPath?: string`
-- `options.libPath?: string`
-Output object:
-- `binPath: string`
-- `libPath: string | null`
-- `runRaw(args: string[], spawnOptions?): RunResult`
-- `runBench(options: BenchOptions): BenchResult`
-`BenchOptions`:
-- `commandsPath: string` (required)
-- `quirksPath?: string`
-- `vendor?: string`
-- `api?: string`
-- `family?: string`
-- `driver?: string`
-- `traceJsonlPath?: string`
-- `traceMetaPath?: string`
-- `uploadBufferUsage?: string`
-- `uploadSubmitEvery?: number`
-- `queueWaitMode?: string`
-- `queueSyncMode?: string`
-- `extraArgs?: string[]`
-`RunResult`:
-- `ok: boolean`
-- `exitCode: number`
-- `stdout: string`
-- `stderr: string`
-- `signal: string | null`
-- `command: string[]`
-`BenchResult` extends `RunResult` with:
+Behavior:
-- `traceJsonlPath: string | null`
-- `traceMetaPath: string | null`
-- `traceMeta: object | null`
+- returns the local Doe runtime/CLI wrapper used for command-stream execution
+  and benchmark orchestration from Node/Bun environments
+- preserves explicit file-path ownership for the binary/library locations rather
+  than hiding them behind package-only assumptions
 ### `runDawnVsDoeCompare(options)`
-Input:
-- `repoRoot?: string`
-- `compareScriptPath?: string`
-- `pythonBin?: string`
-- `configPath?: string`
-- `outPath?: string`
-- `extraArgs?: string[]`
-- `env?: Record<string, string>`
 Behavior:
 - wraps `bench/compare_dawn_vs_doe.py`
 - requires either `configPath` or `--config` in `extraArgs`
-Output:
-- `RunResult`
 ## CLI contract
 ### `fawn-webgpu-bench`

package/assets/package-layers.svg ADDED Viewed

@@ -0,0 +1,63 @@
+<!-- Generated by scripts/generate-readme-assets.js. Do not edit by hand. -->
+<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="470" viewBox="0 0 1200 470" role="img" aria-labelledby="layers-title layers-desc">
+  <title id="layers-title">@simulatte/webgpu layered package graph</title>
+  <desc id="layers-desc">Layered package graph showing direct WebGPU, Doe API, and Doe routines over the same package surfaces.</desc>
+  <defs>
+    <linearGradient id="layers-bg" x1="0%" y1="0%" x2="100%" y2="100%">
+      <stop offset="0%" stop-color="#050816"/>
+      <stop offset="100%" stop-color="#140c1f"/>
+    </linearGradient>
+    <radialGradient id="layers-glow-top" cx="25%" cy="18%" r="55%">
+      <stop offset="0%" stop-color="#ef444430"/>
+      <stop offset="55%" stop-color="#7c3aed18"/>
+      <stop offset="100%" stop-color="#00000000"/>
+    </radialGradient>
+    <radialGradient id="layers-glow-bottom" cx="78%" cy="84%" r="52%">
+      <stop offset="0%" stop-color="#f59e0b26"/>
+      <stop offset="60%" stop-color="#f9731618"/>
+      <stop offset="100%" stop-color="#00000000"/>
+    </radialGradient>
+    <linearGradient id="layers-root" x1="0%" y1="0%" x2="100%" y2="100%">
+      <stop offset="0%" stop-color="#7c3aed"/>
+      <stop offset="100%" stop-color="#ef4444"/>
+    </linearGradient>
+    <linearGradient id="layers-direct" x1="0%" y1="0%" x2="100%" y2="100%">
+      <stop offset="0%" stop-color="#ef4444"/>
+      <stop offset="100%" stop-color="#f97316"/>
+    </linearGradient>
+    <linearGradient id="layers-api" x1="0%" y1="0%" x2="100%" y2="100%">
+      <stop offset="0%" stop-color="#f97316"/>
+      <stop offset="100%" stop-color="#f59e0b"/>
+    </linearGradient>
+    <linearGradient id="layers-routines" x1="0%" y1="0%" x2="100%" y2="100%">
+      <stop offset="0%" stop-color="#f59e0b"/>
+      <stop offset="100%" stop-color="#eab308"/>
+    </linearGradient>
+    <filter id="shadow" x="-20%" y="-20%" width="140%" height="140%">
+      <feDropShadow dx="0" dy="10" stdDeviation="14" flood-color="#000000" flood-opacity="0.32"/>
+    </filter>
+    <style>
+      .title { font: 700 34px "Segoe UI", "Helvetica Neue", Arial, sans-serif; fill: #ffffff; paint-order: stroke fill; stroke: #000000; stroke-width: 2px; stroke-linejoin: round; }
+      .subtitle { font: 500 18px "Segoe UI", "Helvetica Neue", Arial, sans-serif; fill: #cbd5e1; paint-order: stroke fill; stroke: #000000; stroke-width: 2px; stroke-linejoin: round; }
+      .nodeTitle { font: 700 22px "Segoe UI", "Helvetica Neue", Arial, sans-serif; fill: #ffffff; paint-order: stroke fill; stroke: #000000; stroke-width: 2px; stroke-linejoin: round; }
+      .box { stroke-width: 2.5; filter: url(#shadow); }
+    </style>
+  </defs>
+  <rect width="1200" height="470" fill="url(#layers-bg)"/>
+  <rect width="1200" height="470" fill="url(#layers-glow-top)"/>
+  <rect width="1200" height="470" fill="url(#layers-glow-bottom)"/>
+  <text x="64" y="62" class="title">Same package, four layers</text>
+  <text x="64" y="94" class="subtitle">The package surface stays the same while the API gets progressively higher-level.</text>
+  <rect x="170" y="122" width="860" height="64" rx="20" fill="url(#layers-root)" stroke="#c4b5fd" class="box"/>
+  <text x="600" y="162" text-anchor="middle" class="nodeTitle">@simulatte/webgpu / @simulatte/webgpu/compute</text>
+  <rect x="220" y="222" width="760" height="52" rx="18" fill="url(#layers-direct)" stroke="#fca5a5" class="box"/>
+  <text x="600" y="255" text-anchor="middle" class="nodeTitle">Direct WebGPU</text>
+  <rect x="280" y="310" width="640" height="52" rx="18" fill="url(#layers-api)" stroke="#fdba74" class="box"/>
+  <text x="600" y="343" text-anchor="middle" class="nodeTitle">Doe API</text>
+  <rect x="360" y="398" width="480" height="52" rx="18" fill="url(#layers-routines)" stroke="#fde68a" class="box"/>
+  <text x="600" y="431" text-anchor="middle" class="nodeTitle">Doe routines</text>
+</svg>

package/examples/direct-webgpu/compute-dispatch.js ADDED Viewed

@@ -0,0 +1,66 @@
+import { globals, requestDevice } from "@simulatte/webgpu";
+const device = await requestDevice();
+const input = new Float32Array([1, 2, 3, 4]);
+const inputBuffer = device.createBuffer({
+  size: input.byteLength,
+  usage: globals.GPUBufferUsage.STORAGE | globals.GPUBufferUsage.COPY_DST,
+});
+device.queue.writeBuffer(inputBuffer, 0, input);
+const outputBuffer = device.createBuffer({
+  size: input.byteLength,
+  usage: globals.GPUBufferUsage.STORAGE | globals.GPUBufferUsage.COPY_SRC,
+});
+const readbackBuffer = device.createBuffer({
+  size: input.byteLength,
+  usage: globals.GPUBufferUsage.COPY_DST | globals.GPUBufferUsage.MAP_READ,
+});
+const shader = device.createShaderModule({
+  code: `
+    @group(0) @binding(0) var<storage, read> src: array<f32>;
+    @group(0) @binding(1) var<storage, read_write> dst: array<f32>;
+    @compute @workgroup_size(4)
+    fn main(@builtin(global_invocation_id) gid: vec3u) {
+      let i = gid.x;
+      dst[i] = src[i] * 2.0;
+    }
+  `,
+});
+const pipeline = device.createComputePipeline({
+  layout: "auto",
+  compute: {
+    module: shader,
+    entryPoint: "main",
+  },
+});
+const bindGroup = device.createBindGroup({
+  layout: pipeline.getBindGroupLayout(0),
+  entries: [
+    { binding: 0, resource: { buffer: inputBuffer } },
+    { binding: 1, resource: { buffer: outputBuffer } },
+  ],
+});
+const encoder = device.createCommandEncoder();
+const pass = encoder.beginComputePass();
+pass.setPipeline(pipeline);
+pass.setBindGroup(0, bindGroup);
+pass.dispatchWorkgroups(1);
+pass.end();
+encoder.copyBufferToBuffer(outputBuffer, 0, readbackBuffer, 0, input.byteLength);
+device.queue.submit([encoder.finish()]);
+await device.queue.onSubmittedWorkDone();
+await readbackBuffer.mapAsync(globals.GPUMapMode.READ);
+const result = new Float32Array(readbackBuffer.getMappedRange().slice(0));
+readbackBuffer.unmap();
+console.log(JSON.stringify(Array.from(result)));

package/examples/direct-webgpu/explicit-bind-group.js ADDED Viewed

@@ -0,0 +1,85 @@
+import { globals, requestDevice } from "@simulatte/webgpu";
+const device = await requestDevice();
+const input = new Float32Array([1, 2, 3, 4]);
+const inputBuffer = device.createBuffer({
+  size: input.byteLength,
+  usage: globals.GPUBufferUsage.STORAGE | globals.GPUBufferUsage.COPY_DST,
+});
+device.queue.writeBuffer(inputBuffer, 0, input);
+const outputBuffer = device.createBuffer({
+  size: input.byteLength,
+  usage: globals.GPUBufferUsage.STORAGE | globals.GPUBufferUsage.COPY_SRC,
+});
+const readbackBuffer = device.createBuffer({
+  size: input.byteLength,
+  usage: globals.GPUBufferUsage.COPY_DST | globals.GPUBufferUsage.MAP_READ,
+});
+const shader = device.createShaderModule({
+  code: `
+    @group(0) @binding(0) var<storage, read> src: array<f32>;
+    @group(0) @binding(1) var<storage, read_write> dst: array<f32>;
+    @compute @workgroup_size(4)
+    fn main(@builtin(global_invocation_id) gid: vec3u) {
+      let i = gid.x;
+      dst[i] = src[i] * 4.0;
+    }
+  `,
+});
+const bindGroupLayout = device.createBindGroupLayout({
+  entries: [
+    {
+      binding: 0,
+      visibility: globals.GPUShaderStage.COMPUTE,
+      buffer: { type: "read-only-storage" },
+    },
+    {
+      binding: 1,
+      visibility: globals.GPUShaderStage.COMPUTE,
+      buffer: { type: "storage" },
+    },
+  ],
+});
+const pipelineLayout = device.createPipelineLayout({
+  bindGroupLayouts: [bindGroupLayout],
+});
+const pipeline = device.createComputePipeline({
+  layout: pipelineLayout,
+  compute: {
+    module: shader,
+    entryPoint: "main",
+  },
+});
+const bindGroup = device.createBindGroup({
+  layout: bindGroupLayout,
+  entries: [
+    { binding: 0, resource: { buffer: inputBuffer } },
+    { binding: 1, resource: { buffer: outputBuffer } },
+  ],
+});
+const encoder = device.createCommandEncoder();
+const pass = encoder.beginComputePass();
+pass.setPipeline(pipeline);
+pass.setBindGroup(0, bindGroup);
+pass.dispatchWorkgroups(1);
+pass.end();
+encoder.copyBufferToBuffer(outputBuffer, 0, readbackBuffer, 0, input.byteLength);
+device.queue.submit([encoder.finish()]);
+await device.queue.onSubmittedWorkDone();
+await readbackBuffer.mapAsync(globals.GPUMapMode.READ);
+const result = new Float32Array(readbackBuffer.getMappedRange().slice(0));
+readbackBuffer.unmap();
+console.log(JSON.stringify(Array.from(result)));

package/examples/direct-webgpu/request-device.js ADDED Viewed

@@ -0,0 +1,10 @@
+import { requestDevice } from "@simulatte/webgpu";
+const device = await requestDevice();
+console.log(JSON.stringify({
+  createBuffer: typeof device.createBuffer === "function",
+  createComputePipeline: typeof device.createComputePipeline === "function",
+  createRenderPipeline: typeof device.createRenderPipeline === "function",
+  writeBuffer: typeof device.queue?.writeBuffer === "function",
+}));

package/examples/doe-api/buffers-readback.js ADDED Viewed

@@ -0,0 +1,9 @@
+import { doe } from "@simulatte/webgpu/compute";
+const gpu = await doe.requestDevice();
+const src = gpu.buffers.fromData(new Float32Array([1, 2, 3, 4]), {
+  usage: ["storageRead", "readback"],
+});
+const result = await gpu.buffers.read(src, Float32Array);
+console.log(JSON.stringify(Array.from(result)));

package/examples/doe-api/compile-and-dispatch.js ADDED Viewed

@@ -0,0 +1,30 @@
+import { doe } from "@simulatte/webgpu/compute";
+const gpu = await doe.requestDevice();
+const src = gpu.buffers.fromData(new Float32Array([1, 2, 3, 4]));
+const dst = gpu.buffers.like(src, {
+  usage: "storageReadWrite",
+});
+const kernel = gpu.compute.compile({
+  code: `
+    @group(0) @binding(0) var<storage, read> src: array<f32>;
+    @group(0) @binding(1) var<storage, read_write> dst: array<f32>;
+    @compute @workgroup_size(4)
+    fn main(@builtin(global_invocation_id) gid: vec3u) {
+      let i = gid.x;
+      dst[i] = src[i] * 5.0;
+    }
+  `,
+  bindings: [src, dst],
+  workgroups: 1,
+});
+await kernel.dispatch({
+  bindings: [src, dst],
+  workgroups: 1,
+});
+const result = await gpu.buffers.read(dst, Float32Array);
+console.log(JSON.stringify(Array.from(result)));

package/examples/doe-api/compute-dispatch.js ADDED Viewed

@@ -0,0 +1,25 @@
+import { doe } from "@simulatte/webgpu/compute";
+const gpu = await doe.requestDevice();
+const src = gpu.buffers.fromData(new Float32Array([1, 2, 3, 4]));
+const dst = gpu.buffers.like(src, {
+  usage: "storageReadWrite",
+});
+await gpu.compute.run({
+  code: `
+    @group(0) @binding(0) var<storage, read> src: array<f32>;
+    @group(0) @binding(1) var<storage, read_write> dst: array<f32>;
+    @compute @workgroup_size(4)
+    fn main(@builtin(global_invocation_id) gid: vec3u) {
+      let i = gid.x;
+      dst[i] = src[i] * 2.0;
+    }
+  `,
+  bindings: [src, dst],
+  workgroups: 1,
+});
+const result = await gpu.buffers.read(dst, Float32Array);
+console.log(JSON.stringify(Array.from(result)));

package/examples/doe-routines/compute-once-like-input.js ADDED Viewed

@@ -0,0 +1,36 @@
+import { doe } from "@simulatte/webgpu/compute";
+const gpu = await doe.requestDevice();
+const result = await gpu.compute.once({
+  code: `
+    struct Scale {
+      value: f32,
+    };
+    @group(0) @binding(0) var<uniform> scale: Scale;
+    @group(0) @binding(1) var<storage, read> src: array<f32>;
+    @group(0) @binding(2) var<storage, read_write> dst: array<f32>;
+    @compute @workgroup_size(4)
+    fn main(@builtin(global_invocation_id) gid: vec3u) {
+      let i = gid.x;
+      dst[i] = src[i] * scale.value;
+    }
+  `,
+  inputs: [
+    {
+      data: new Float32Array([2]),
+      usage: "uniform",
+      access: "uniform",
+    },
+    new Float32Array([1, 2, 3, 4]),
+  ],
+  output: {
+    type: Float32Array,
+    likeInput: 1,
+  },
+  workgroups: [1, 1],
+});
+console.log(JSON.stringify(Array.from(result)));

package/examples/doe-routines/compute-once-matmul.js ADDED Viewed

@@ -0,0 +1,53 @@
+import { doe } from "@simulatte/webgpu/compute";
+const gpu = await doe.requestDevice();
+const M = 256;
+const K = 512;
+const N = 256;
+const lhs = Float32Array.from({ length: M * K }, (_, i) => (i % 17) / 17);
+const rhs = Float32Array.from({ length: K * N }, (_, i) => (i % 13) / 13);
+const dims = new Uint32Array([M, K, N, 0]);
+const result = await gpu.compute.once({
+  code: `
+    struct Dims {
+      m: u32,
+      k: u32,
+      n: u32,
+      _pad: u32,
+    };
+    @group(0) @binding(0) var<uniform> dims: Dims;
+    @group(0) @binding(1) var<storage, read> lhs: array<f32>;
+    @group(0) @binding(2) var<storage, read> rhs: array<f32>;
+    @group(0) @binding(3) var<storage, read_write> out: array<f32>;
+    @compute @workgroup_size(8, 8)
+    fn main(@builtin(global_invocation_id) gid: vec3u) {
+      let row = gid.y;
+      let col = gid.x;
+      if (row >= dims.m || col >= dims.n) {
+        return;
+      }
+      var acc = 0.0;
+      for (var i = 0u; i < dims.k; i = i + 1u) {
+        acc += lhs[row * dims.k + i] * rhs[i * dims.n + col];
+      }
+      out[row * dims.n + col] = acc;
+    }
+  `,
+  inputs: [
+    { data: dims, usage: "uniform", access: "uniform" },
+    lhs,
+    rhs,
+  ],
+  output: {
+    type: Float32Array,
+    size: M * N * Float32Array.BYTES_PER_ELEMENT,
+  },
+  workgroups: [Math.ceil(N / 8), Math.ceil(M / 8)],
+});
+console.log(result.subarray(0, 8));