@simulatte/webgpu 0.2.1 → 0.2.3

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
package/native/doe_napi.c CHANGED
@@ -15,6 +15,7 @@
15
15
  #include <stdlib.h>
16
16
  #include <string.h>
17
17
  #include <stdint.h>
18
+ #include <stdio.h>
18
19
 
19
20
  #ifdef _WIN32
20
21
  #include <windows.h>
@@ -884,6 +885,30 @@ static napi_value doe_buffer_get_mapped_range(napi_env env, napi_callback_info i
884
885
  return ab;
885
886
  }
886
887
 
888
+ /* bufferAssertMappedPrefixF32(buffer, expected, count) */
889
+ static napi_value doe_buffer_assert_mapped_prefix_f32(napi_env env, napi_callback_info info) {
890
+ NAPI_ASSERT_ARGC(env, info, 3);
891
+ CHECK_LIB_LOADED(env);
892
+ WGPUBuffer buf = unwrap_ptr(env, _args[0]);
893
+ double expected = 0.0;
894
+ uint32_t count = 0;
895
+ napi_get_value_double(env, _args[1], &expected);
896
+ napi_get_value_uint32(env, _args[2], &count);
897
+ if (!buf) NAPI_THROW(env, "bufferAssertMappedPrefixF32 requires buffer");
898
+ const float* mapped = (const float*)pfn_wgpuBufferGetConstMappedRange(buf, 0, count * sizeof(float));
899
+ if (!mapped) NAPI_THROW(env, "bufferAssertMappedPrefixF32: mapped range unavailable");
900
+ for (uint32_t i = 0; i < count; i++) {
901
+ if ((double)mapped[i] != expected) {
902
+ char msg[128];
903
+ snprintf(msg, sizeof(msg), "expected readback[%u] === %.0f, got %.9g", i, expected, (double)mapped[i]);
904
+ NAPI_THROW(env, msg);
905
+ }
906
+ }
907
+ napi_value ok;
908
+ napi_get_boolean(env, true, &ok);
909
+ return ok;
910
+ }
911
+
887
912
  /* ================================================================
888
913
  * Shader Module
889
914
  * ================================================================ */
@@ -1376,8 +1401,8 @@ static napi_value doe_queue_flush(napi_env env, napi_callback_info info) {
1376
1401
  }
1377
1402
 
1378
1403
  /* submitBatched(device, queue, commandsArray)
1379
- * Fast path: single dispatch + optional copy → doeNativeComputeDispatchFlush (direct Metal, no Zig command recording).
1380
- * Fallback: standard wgpu path for multi-dispatch or unsupported patterns. */
1404
+ * Fast path: single dispatch or dispatch+copy → doeNativeComputeDispatchFlush.
1405
+ * Larger or mixed batches stay on the standard wgpu path. */
1381
1406
  #define BATCH_MAX_BIND_GROUPS 4
1382
1407
  static napi_value doe_submit_batched(napi_env env, napi_callback_info info) {
1383
1408
  NAPI_ASSERT_ARGC(env, info, 3);
@@ -1391,12 +1416,18 @@ static napi_value doe_submit_batched(napi_env env, napi_callback_info info) {
1391
1416
  napi_get_array_length(env, commands, &cmd_count);
1392
1417
  if (cmd_count == 0) return NULL;
1393
1418
 
1394
- /* Fast path: exactly 1 dispatch + 0-1 copy, and direct dispatch function available. */
1395
- if (pfn_doeNativeComputeDispatchFlush && cmd_count >= 1 && cmd_count <= 2) {
1419
+ /* Fast path: exactly one dispatch, or dispatch followed by copy. */
1420
+ if (pfn_doeNativeComputeDispatchFlush && (cmd_count == 1 || cmd_count == 2)) {
1396
1421
  napi_value cmd0;
1397
1422
  napi_get_element(env, commands, 0, &cmd0);
1398
1423
  uint32_t t0 = get_uint32_prop(env, cmd0, "t");
1399
- if (t0 == 0) {
1424
+ uint32_t t1 = UINT32_MAX;
1425
+ napi_value cmd1 = NULL;
1426
+ if (cmd_count == 2) {
1427
+ napi_get_element(env, commands, 1, &cmd1);
1428
+ t1 = get_uint32_prop(env, cmd1, "t");
1429
+ }
1430
+ if (t0 == 0 && (cmd_count == 1 || t1 == 1)) {
1400
1431
  void* pipeline = unwrap_ptr(env, get_prop(env, cmd0, "p"));
1401
1432
  napi_value bgs = get_prop(env, cmd0, "bg");
1402
1433
  uint32_t bg_count = 0;
@@ -1411,20 +1442,17 @@ static napi_value doe_submit_batched(napi_env env, napi_callback_info info) {
1411
1442
  uint32_t dx = get_uint32_prop(env, cmd0, "x");
1412
1443
  uint32_t dy = get_uint32_prop(env, cmd0, "y");
1413
1444
  uint32_t dz = get_uint32_prop(env, cmd0, "z");
1414
-
1415
- void* copy_src = NULL; uint64_t copy_src_off = 0;
1416
- void* copy_dst = NULL; uint64_t copy_dst_off = 0;
1445
+ void* copy_src = NULL;
1446
+ uint64_t copy_src_off = 0;
1447
+ void* copy_dst = NULL;
1448
+ uint64_t copy_dst_off = 0;
1417
1449
  uint64_t copy_size = 0;
1418
1450
  if (cmd_count == 2) {
1419
- napi_value cmd1;
1420
- napi_get_element(env, commands, 1, &cmd1);
1421
- if (get_uint32_prop(env, cmd1, "t") == 1) {
1422
- copy_src = unwrap_ptr(env, get_prop(env, cmd1, "s"));
1423
- copy_src_off = (uint64_t)get_int64_prop(env, cmd1, "so");
1424
- copy_dst = unwrap_ptr(env, get_prop(env, cmd1, "d"));
1425
- copy_dst_off = (uint64_t)get_int64_prop(env, cmd1, "do");
1426
- copy_size = (uint64_t)get_int64_prop(env, cmd1, "sz");
1427
- }
1451
+ copy_src = unwrap_ptr(env, get_prop(env, cmd1, "s"));
1452
+ copy_dst = unwrap_ptr(env, get_prop(env, cmd1, "d"));
1453
+ copy_src_off = (uint64_t)get_int64_prop(env, cmd1, "so");
1454
+ copy_dst_off = (uint64_t)get_int64_prop(env, cmd1, "do");
1455
+ copy_size = (uint64_t)get_int64_prop(env, cmd1, "sz");
1428
1456
  }
1429
1457
  pfn_doeNativeComputeDispatchFlush(
1430
1458
  queue, pipeline, (void**)bg_ptrs, bg_count,
@@ -1435,6 +1463,16 @@ static napi_value doe_submit_batched(napi_env env, napi_callback_info info) {
1435
1463
  }
1436
1464
 
1437
1465
  /* Fallback: standard wgpu path. */
1466
+ int flush_after_submit = 0;
1467
+ if (cmd_count == 2) {
1468
+ napi_value cmd0;
1469
+ napi_value cmd1;
1470
+ napi_get_element(env, commands, 0, &cmd0);
1471
+ napi_get_element(env, commands, 1, &cmd1);
1472
+ if (get_uint32_prop(env, cmd0, "t") == 0 && get_uint32_prop(env, cmd1, "t") == 1) {
1473
+ flush_after_submit = 1;
1474
+ }
1475
+ }
1438
1476
  WGPUCommandEncoder encoder = pfn_wgpuDeviceCreateCommandEncoder(device, NULL);
1439
1477
  if (!encoder) NAPI_THROW(env, "submitBatched: createCommandEncoder failed");
1440
1478
  for (uint32_t i = 0; i < cmd_count; i++) {
@@ -1470,11 +1508,64 @@ static napi_value doe_submit_batched(napi_env env, napi_callback_info info) {
1470
1508
  }
1471
1509
  WGPUCommandBuffer cmd_buf = pfn_wgpuCommandEncoderFinish(encoder, NULL);
1472
1510
  pfn_wgpuQueueSubmit(queue, 1, &cmd_buf);
1511
+ if (flush_after_submit && pfn_doeNativeQueueFlush) {
1512
+ pfn_doeNativeQueueFlush(queue);
1513
+ }
1473
1514
  pfn_wgpuCommandBufferRelease(cmd_buf);
1474
1515
  pfn_wgpuCommandEncoderRelease(encoder);
1475
1516
  return NULL;
1476
1517
  }
1477
1518
 
1519
+ /* submitComputeDispatchCopy(device, queue, pipeline, bindGroups, x, y, z, src, srcOff, dst, dstOff, size)
1520
+ * Direct addon surface for the exact package compute_e2e shape so JS runtimes
1521
+ * do not pay generic command-array parsing on every timed sample. */
1522
+ static napi_value doe_submit_compute_dispatch_copy(napi_env env, napi_callback_info info) {
1523
+ size_t argc = 12;
1524
+ napi_value args[12];
1525
+ napi_status status = napi_get_cb_info(env, info, &argc, args, NULL, NULL);
1526
+ if (status != napi_ok || argc != 12) NAPI_THROW(env, "submitComputeDispatchCopy requires 12 arguments");
1527
+ CHECK_LIB_LOADED(env);
1528
+ WGPUDevice device = unwrap_ptr(env, args[0]);
1529
+ WGPUQueue queue = unwrap_ptr(env, args[1]);
1530
+ void* pipeline = unwrap_ptr(env, args[2]);
1531
+ napi_value bgs = args[3];
1532
+ uint32_t dx = 0;
1533
+ uint32_t dy = 0;
1534
+ uint32_t dz = 0;
1535
+ int64_t copy_src_off_i = 0;
1536
+ int64_t copy_dst_off_i = 0;
1537
+ int64_t copy_size_i = 0;
1538
+ napi_get_value_uint32(env, args[4], &dx);
1539
+ napi_get_value_uint32(env, args[5], &dy);
1540
+ napi_get_value_uint32(env, args[6], &dz);
1541
+ void* copy_src = unwrap_ptr(env, args[7]);
1542
+ napi_get_value_int64(env, args[8], &copy_src_off_i);
1543
+ void* copy_dst = unwrap_ptr(env, args[9]);
1544
+ napi_get_value_int64(env, args[10], &copy_dst_off_i);
1545
+ napi_get_value_int64(env, args[11], &copy_size_i);
1546
+ uint64_t copy_src_off = (uint64_t)copy_src_off_i;
1547
+ uint64_t copy_dst_off = (uint64_t)copy_dst_off_i;
1548
+ uint64_t copy_size = (uint64_t)copy_size_i;
1549
+ if (!device || !queue || !pipeline) NAPI_THROW(env, "submitComputeDispatchCopy requires device, queue, and pipeline");
1550
+ if (!pfn_doeNativeComputeDispatchFlush) NAPI_THROW(env, "submitComputeDispatchCopy: doeNativeComputeDispatchFlush not available");
1551
+
1552
+ uint32_t bg_count = 0;
1553
+ napi_get_array_length(env, bgs, &bg_count);
1554
+ if (bg_count > BATCH_MAX_BIND_GROUPS) bg_count = BATCH_MAX_BIND_GROUPS;
1555
+ void* bg_ptrs[BATCH_MAX_BIND_GROUPS] = {NULL};
1556
+ for (uint32_t j = 0; j < bg_count; j++) {
1557
+ napi_value bg_val;
1558
+ napi_get_element(env, bgs, j, &bg_val);
1559
+ bg_ptrs[j] = unwrap_ptr(env, bg_val);
1560
+ }
1561
+
1562
+ pfn_doeNativeComputeDispatchFlush(
1563
+ queue, pipeline, (void**)bg_ptrs, bg_count,
1564
+ dx, dy, dz,
1565
+ copy_src, copy_src_off, copy_dst, copy_dst_off, copy_size);
1566
+ return NULL;
1567
+ }
1568
+
1478
1569
  /* flushAndMapSync(instance, queue, buffer, mode, offset, size) — flush + map in one N-API call. */
1479
1570
  static napi_value doe_flush_and_map_sync(napi_env env, napi_callback_info info) {
1480
1571
  NAPI_ASSERT_ARGC(env, info, 6);
@@ -1868,6 +1959,7 @@ static napi_value doe_module_init(napi_env env, napi_value exports) {
1868
1959
  EXPORT_FN("bufferUnmap", doe_buffer_unmap),
1869
1960
  EXPORT_FN("bufferMapSync", doe_buffer_map_sync),
1870
1961
  EXPORT_FN("bufferGetMappedRange", doe_buffer_get_mapped_range),
1962
+ EXPORT_FN("bufferAssertMappedPrefixF32", doe_buffer_assert_mapped_prefix_f32),
1871
1963
  EXPORT_FN("createShaderModule", doe_create_shader_module),
1872
1964
  EXPORT_FN("shaderModuleRelease", doe_shader_module_release),
1873
1965
  EXPORT_FN("createComputePipeline", doe_create_compute_pipeline),
@@ -1895,6 +1987,7 @@ static napi_value doe_module_init(napi_env env, napi_value exports) {
1895
1987
  EXPORT_FN("queueWriteBuffer", doe_queue_write_buffer),
1896
1988
  EXPORT_FN("queueFlush", doe_queue_flush),
1897
1989
  EXPORT_FN("submitBatched", doe_submit_batched),
1990
+ EXPORT_FN("submitComputeDispatchCopy", doe_submit_compute_dispatch_copy),
1898
1991
  EXPORT_FN("flushAndMapSync", doe_flush_and_map_sync),
1899
1992
  EXPORT_FN("queueRelease", doe_queue_release),
1900
1993
  EXPORT_FN("createTexture", doe_create_texture),
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@simulatte/webgpu",
3
- "version": "0.2.1",
4
- "description": "Doe WebGPU bridge for browserless AI/ML benchmarking and CI",
3
+ "version": "0.2.3",
4
+ "description": "Headless WebGPU runtime for Node.js and Bun, powered by Doe",
5
5
  "type": "module",
6
6
  "main": "./src/node-runtime.js",
7
7
  "exports": {
@@ -25,8 +25,12 @@
25
25
  "prebuilds/",
26
26
  "binding.gyp",
27
27
  "README.md",
28
+ "CHANGELOG.md",
28
29
  "API_CONTRACT.md",
29
30
  "COMPAT_SCOPE.md",
31
+ "SUPPORT_CONTRACTS.md",
32
+ "LAYERING_PLAN.md",
33
+ "ZIG_SOURCE_INVENTORY.md",
30
34
  "headless-webgpu-comparison.md",
31
35
  "doe-build-metadata.schema.json",
32
36
  "prebuild-metadata.schema.json"
@@ -60,5 +64,5 @@
60
64
  "url": "https://github.com/clocksmith/fawn/issues"
61
65
  },
62
66
  "author": "Fawn",
63
- "license": "ISC"
67
+ "license": "Apache-2.0"
64
68
  }
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "schemaVersion": 1,
3
3
  "package": "@simulatte/webgpu",
4
- "packageVersion": "0.2.0",
4
+ "packageVersion": "0.2.3",
5
5
  "platform": "darwin",
6
6
  "arch": "arm64",
7
7
  "nodeNapiVersion": 8,
8
- "doeVersion": "22613a9b0",
8
+ "doeVersion": "68a193c88",
9
9
  "doeBuild": {
10
10
  "artifact": "libwebgpu_doe",
11
11
  "leanVerifiedBuild": false,
@@ -13,14 +13,14 @@
13
13
  },
14
14
  "files": {
15
15
  "doe_napi.node": {
16
- "sha256": "ccd350506359a770d286f7f3893dd0c6d81582dbcc04524461c9fa81cae4573e"
16
+ "sha256": "472c753c5c5bd82b60444bfcc1d3837bdbd40d1fce1b8281e6c706043bb64a84"
17
17
  },
18
18
  "libwebgpu_doe.dylib": {
19
- "sha256": "30be9ca300c53c0ba02eb76dfa94b683585c20f7a4caaa1f8eeea2cfb17d1f5f"
19
+ "sha256": "d9a66fa8fad7a8e50736778329ea5c011457c814238f96ecf230eb8eb97bcc64"
20
20
  },
21
21
  "libwebgpu_dawn.dylib": {
22
22
  "sha256": "22751faeb459e7a2ec778c0410ca122e23c23366eb3da145c651d1d43e26707d"
23
23
  }
24
24
  },
25
- "builtAt": "2026-03-07T03:39:41.504Z"
25
+ "builtAt": "2026-03-10T17:19:18.720Z"
26
26
  }
@@ -0,0 +1,26 @@
1
+ {
2
+ "schemaVersion": 1,
3
+ "package": "@simulatte/webgpu",
4
+ "packageVersion": "0.2.3",
5
+ "platform": "linux",
6
+ "arch": "x64",
7
+ "nodeNapiVersion": 8,
8
+ "doeVersion": "b09d34586",
9
+ "doeBuild": {
10
+ "artifact": "libwebgpu_doe",
11
+ "leanVerifiedBuild": false,
12
+ "proofArtifactSha256": null
13
+ },
14
+ "files": {
15
+ "doe_napi.node": {
16
+ "sha256": "21475bcd04b499e1a0ed6e75d8af2a7bead08d365ddfb09708d509d0de62bf28"
17
+ },
18
+ "libwebgpu_doe.so": {
19
+ "sha256": "28c5da84da65a5d6f4a3a74b9757279cddee738aaa93fe8ece6e358d4d94cdb1"
20
+ },
21
+ "libwebgpu_dawn.so": {
22
+ "sha256": "9d329301f59fbe85a190cee4faacde97f9c991b07264a18a6750b890899cc417"
23
+ }
24
+ },
25
+ "builtAt": "2026-03-10T21:30:16.241Z"
26
+ }
package/src/bun-ffi.js CHANGED
@@ -157,6 +157,7 @@ function openLibrary(path) {
157
157
  wgpuDeviceCreateComputePipeline: { args: [FFIType.ptr, FFIType.ptr], returns: FFIType.ptr },
158
158
  wgpuComputePipelineRelease: { args: [FFIType.ptr], returns: FFIType.void },
159
159
  wgpuComputePipelineGetBindGroupLayout: { args: [FFIType.ptr, FFIType.u32], returns: FFIType.ptr },
160
+ doeNativeComputePipelineGetBindGroupLayout: { args: [FFIType.ptr, FFIType.u32], returns: FFIType.ptr },
160
161
 
161
162
  // Bind group layout / bind group / pipeline layout
162
163
  wgpuDeviceCreateBindGroupLayout: { args: [FFIType.ptr, FFIType.ptr], returns: FFIType.ptr },
@@ -804,7 +805,7 @@ class DoeGPUComputePipeline {
804
805
  constructor(native) { this._native = native; }
805
806
 
806
807
  getBindGroupLayout(index) {
807
- const layout = wgpu.symbols.wgpuComputePipelineGetBindGroupLayout(this._native, index);
808
+ const layout = wgpu.symbols.doeNativeComputePipelineGetBindGroupLayout(this._native, index);
808
809
  return new DoeGPUBindGroupLayout(layout);
809
810
  }
810
811
  }
@@ -961,7 +962,7 @@ function ensureLibrary() {
961
962
  if (libraryLoaded) return;
962
963
  if (!DOE_LIB_PATH) {
963
964
  throw new Error(
964
- "@simulatte/webgpu: libwebgpu_doe not found. Build it with `cd fawn/zig && zig build dropin` or set DOE_WEBGPU_LIB."
965
+ "@simulatte/webgpu: libwebgpu_doe not found. Build it with `cd zig && zig build dropin` or set DOE_WEBGPU_LIB."
965
966
  );
966
967
  }
967
968
  wgpu = openLibrary(DOE_LIB_PATH);
package/src/bun.js CHANGED
@@ -1,2 +1,2 @@
1
- export * from "./bun-ffi.js";
2
- export { default } from "./bun-ffi.js";
1
+ export * from "./index.js";
2
+ export { default } from "./index.js";
package/src/index.js CHANGED
@@ -19,13 +19,13 @@ let libraryLoaded = false;
19
19
  function loadAddon() {
20
20
  const prebuildPath = resolve(__dirname, '..', 'prebuilds', `${process.platform}-${process.arch}`, 'doe_napi.node');
21
21
  try {
22
- return require(prebuildPath);
22
+ return require('../build/Release/doe_napi.node');
23
23
  } catch {
24
24
  try {
25
- return require('../build/Release/doe_napi.node');
25
+ return require('../build/Debug/doe_napi.node');
26
26
  } catch {
27
27
  try {
28
- return require('../build/Debug/doe_napi.node');
28
+ return require(prebuildPath);
29
29
  } catch {
30
30
  return null;
31
31
  }
@@ -71,7 +71,7 @@ function ensureLibrary() {
71
71
  }
72
72
  if (!DOE_LIB_PATH) {
73
73
  throw new Error(
74
- '@simulatte/webgpu: libwebgpu_doe not found. Build it with `cd fawn/zig && zig build dropin` or set DOE_WEBGPU_LIB.'
74
+ '@simulatte/webgpu: libwebgpu_doe not found. Build it with `cd zig && zig build dropin` or set DOE_WEBGPU_LIB.'
75
75
  );
76
76
  }
77
77
  addon.loadLibrary(DOE_LIB_PATH);
@@ -120,14 +120,26 @@ class DoeGPUBuffer {
120
120
  }
121
121
 
122
122
  async mapAsync(mode, offset = 0, size = this.size) {
123
- if (this._queue) addon.flushAndMapSync(this._instance, this._queue, this._native, mode, offset, size);
124
- else addon.bufferMapSync(this._instance, this._native, mode, offset, size);
123
+ if (this._queue) {
124
+ if (this._queue.hasPendingSubmissions()) {
125
+ addon.flushAndMapSync(this._instance, this._queue._native, this._native, mode, offset, size);
126
+ this._queue.markSubmittedWorkDone();
127
+ } else {
128
+ addon.bufferMapSync(this._instance, this._native, mode, offset, size);
129
+ }
130
+ } else {
131
+ addon.bufferMapSync(this._instance, this._native, mode, offset, size);
132
+ }
125
133
  }
126
134
 
127
135
  getMappedRange(offset = 0, size = this.size) {
128
136
  return addon.bufferGetMappedRange(this._native, offset, size);
129
137
  }
130
138
 
139
+ assertMappedPrefixF32(expected, count) {
140
+ return addon.bufferAssertMappedPrefixF32(this._native, expected, count);
141
+ }
142
+
131
143
  unmap() {
132
144
  addon.bufferUnmap(this._native);
133
145
  }
@@ -233,13 +245,57 @@ class DoeGPUQueue {
233
245
  this._native = native;
234
246
  this._instance = instance;
235
247
  this._device = device;
248
+ this._submittedSerial = 0;
249
+ this._completedSerial = 0;
250
+ }
251
+
252
+ hasPendingSubmissions() {
253
+ return this._completedSerial < this._submittedSerial;
254
+ }
255
+
256
+ markSubmittedWorkDone() {
257
+ this._completedSerial = this._submittedSerial;
236
258
  }
237
259
 
238
260
  submit(commandBuffers) {
261
+ if (commandBuffers.length === 0) return;
262
+ this._submittedSerial += 1;
263
+ if (commandBuffers.length === 1 && commandBuffers[0]?._batched) {
264
+ const cmds = commandBuffers[0]._commands;
265
+ if (
266
+ cmds.length === 2
267
+ && cmds[0]?.t === 0
268
+ && cmds[1]?.t === 1
269
+ && typeof addon.submitComputeDispatchCopy === 'function'
270
+ ) {
271
+ addon.submitComputeDispatchCopy(
272
+ this._device,
273
+ this._native,
274
+ cmds[0].p,
275
+ cmds[0].bg,
276
+ cmds[0].x,
277
+ cmds[0].y,
278
+ cmds[0].z,
279
+ cmds[1].s,
280
+ cmds[1].so,
281
+ cmds[1].d,
282
+ cmds[1].do,
283
+ cmds[1].sz,
284
+ );
285
+ return;
286
+ }
287
+ }
239
288
  if (commandBuffers.length > 0 && commandBuffers.every((c) => c._batched)) {
240
289
  const allCommands = [];
241
290
  for (const cb of commandBuffers) allCommands.push(...cb._commands);
242
291
  addon.submitBatched(this._device, this._native, allCommands);
292
+ if (
293
+ allCommands.length === 2
294
+ && allCommands[0]?.t === 0
295
+ && allCommands[1]?.t === 1
296
+ ) {
297
+ this.markSubmittedWorkDone();
298
+ }
243
299
  } else {
244
300
  const natives = commandBuffers.map((c) => c._native);
245
301
  addon.queueSubmit(this._native, natives);
@@ -259,8 +315,9 @@ class DoeGPUQueue {
259
315
  }
260
316
 
261
317
  async onSubmittedWorkDone() {
262
- // No-op: Doe submit commits synchronously. GPU completion is ensured
263
- // by mapAsync when data is actually needed.
318
+ if (!this.hasPendingSubmissions()) return;
319
+ addon.queueFlush(this._native);
320
+ this.markSubmittedWorkDone();
264
321
  }
265
322
  }
266
323
 
@@ -307,15 +364,28 @@ class DoeGPURenderPipeline {
307
364
  }
308
365
 
309
366
  class DoeGPUShaderModule {
310
- constructor(native) { this._native = native; }
367
+ constructor(native, code) {
368
+ this._native = native;
369
+ this._code = code;
370
+ }
311
371
  }
312
372
 
313
373
  class DoeGPUComputePipeline {
314
- constructor(native) { this._native = native; }
374
+ constructor(native, device, explicitLayout, autoLayoutEntriesByGroup) {
375
+ this._native = native;
376
+ this._device = device;
377
+ this._explicitLayout = explicitLayout;
378
+ this._autoLayoutEntriesByGroup = autoLayoutEntriesByGroup;
379
+ this._cachedLayouts = new Map();
380
+ }
315
381
 
316
382
  getBindGroupLayout(index) {
317
- const layout = addon.computePipelineGetBindGroupLayout(this._native, index);
318
- return new DoeGPUBindGroupLayout(layout);
383
+ if (this._explicitLayout) return this._explicitLayout;
384
+ if (this._cachedLayouts.has(index)) return this._cachedLayouts.get(index);
385
+ const entries = this._autoLayoutEntriesByGroup?.get(index) ?? [];
386
+ const layout = this._device.createBindGroupLayout({ entries });
387
+ this._cachedLayouts.set(index, layout);
388
+ return layout;
319
389
  }
320
390
  }
321
391
 
@@ -368,6 +438,34 @@ const DOE_LIMITS = Object.freeze({
368
438
 
369
439
  const DOE_FEATURES = Object.freeze(new Set(['shader-f16']));
370
440
 
441
+ function inferAutoBindGroupLayouts(code, visibility = globals.GPUShaderStage.COMPUTE) {
442
+ const groups = new Map();
443
+ const bindingPattern = /@group\((\d+)\)\s*@binding\((\d+)\)\s*var(?:<([^>]+)>)?\s+\w+\s*:\s*([^;]+);/g;
444
+ for (const match of code.matchAll(bindingPattern)) {
445
+ const group = Number(match[1]);
446
+ const binding = Number(match[2]);
447
+ const addressSpace = (match[3] ?? "").trim();
448
+ const typeExpr = (match[4] ?? "").trim();
449
+ let entry = null;
450
+ if (addressSpace.startsWith("uniform")) {
451
+ entry = { binding, visibility, buffer: { type: "uniform" } };
452
+ } else if (addressSpace.startsWith("storage")) {
453
+ const readOnly = !addressSpace.includes("read_write");
454
+ entry = { binding, visibility, buffer: { type: readOnly ? "read-only-storage" : "storage" } };
455
+ } else if (typeExpr.startsWith("sampler")) {
456
+ entry = { binding, visibility, sampler: {} };
457
+ }
458
+ if (!entry) continue;
459
+ const entries = groups.get(group) ?? [];
460
+ entries.push(entry);
461
+ groups.set(group, entries);
462
+ }
463
+ for (const entries of groups.values()) {
464
+ entries.sort((left, right) => left.binding - right.binding);
465
+ }
466
+ return groups;
467
+ }
468
+
371
469
  class DoeGPUDevice {
372
470
  constructor(native, instance) {
373
471
  this._native = native;
@@ -380,24 +478,25 @@ class DoeGPUDevice {
380
478
 
381
479
  createBuffer(descriptor) {
382
480
  const buf = addon.createBuffer(this._native, descriptor);
383
- return new DoeGPUBuffer(buf, this._instance, descriptor.size, descriptor.usage, this.queue._native);
481
+ return new DoeGPUBuffer(buf, this._instance, descriptor.size, descriptor.usage, this.queue);
384
482
  }
385
483
 
386
484
  createShaderModule(descriptor) {
387
485
  const code = descriptor.code || descriptor.source;
388
486
  if (!code) throw new Error('createShaderModule: descriptor.code is required');
389
487
  const mod = addon.createShaderModule(this._native, code);
390
- return new DoeGPUShaderModule(mod);
488
+ return new DoeGPUShaderModule(mod, code);
391
489
  }
392
490
 
393
491
  createComputePipeline(descriptor) {
394
492
  const shader = descriptor.compute?.module;
395
493
  const entryPoint = descriptor.compute?.entryPoint || 'main';
396
494
  const layout = descriptor.layout === 'auto' ? null : descriptor.layout;
495
+ const autoLayoutEntriesByGroup = layout ? null : inferAutoBindGroupLayouts(shader?._code || '');
397
496
  const native = addon.createComputePipeline(
398
497
  this._native, shader._native, entryPoint,
399
498
  layout?._native ?? null);
400
- return new DoeGPUComputePipeline(native);
499
+ return new DoeGPUComputePipeline(native, this, layout, autoLayoutEntriesByGroup);
401
500
  }
402
501
 
403
502
  async createComputePipelineAsync(descriptor) {
@@ -158,7 +158,9 @@ export function createDoeRuntime(options = {}) {
158
158
  require_existing_path("commandsPath", runOptions.commandsPath);
159
159
  if (runOptions.quirksPath) require_existing_path("quirksPath", runOptions.quirksPath);
160
160
  const args = build_bench_args(runOptions);
161
- const result = runRaw(args);
161
+ const result = runRaw(args, {
162
+ cwd: runOptions.cwd || WORKSPACE_ROOT,
163
+ });
162
164
  const traceMeta = read_trace_meta(runOptions.traceMetaPath);
163
165
  return {
164
166
  ...result,