@cognipilot/rumoca-core 0.9.4 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/rumoca_gpu.js CHANGED
@@ -82,16 +82,24 @@ export async function probeGpu() {
82
82
  return adapter;
83
83
  }
84
84
 
85
- // Integrate a prepared model on the GPU with fixed-step RK4.
85
+ // Build a reusable GPU program for a prepared model: a WebGPU device, the
86
+ // compiled WGSL modules, compute pipelines, device buffers, and bind groups,
87
+ // plus a per-run `simulate(prepNow, onPhaseNow)` closure.
88
+ //
89
+ // Everything built here is fully determined by the rendered shader and layout
90
+ // (i.e. the model source) and never by parameter *values*, so a parameter-only
91
+ // re-run can reuse the whole program and just re-upload y0/p0. `runGpuSimulation`
92
+ // caches the program keyed on `prep.wgsl`; call this directly only if you want
93
+ // to manage the program lifetime yourself.
86
94
  //
87
95
  // adapter : GPUAdapter (from `probeGpu`)
88
96
  // prep : the parsed JSON from WASM `prepare_gpu_simulation`
89
97
  // ({ wgsl, layout, n_states, y0, p0, t_start, t_end, dt })
90
98
  // onPhase : optional (message, fraction|null) progress callback
91
99
  //
92
- // Returns { payload: { names, allData, nStates, simDetails }, metrics }
93
- // shaped like `simulate_model` so plots/viz scripts work unchanged.
94
- export async function runGpuSimulation(adapter, prep, onPhase = () => {}) {
100
+ // Returns { device, simulate } where `simulate(prepNow, onPhaseNow)` runs the
101
+ // RK4 loop and resolves to a result shaped like `simulate_model`.
102
+ export async function buildGpuProgram(adapter, prep, onPhase = () => {}) {
95
103
  const layout = prep.layout || {};
96
104
  const nStates = prep.n_states | 0;
97
105
  const yLen = Math.max(layout.y_len | 0, 1);
@@ -160,10 +168,6 @@ export async function runGpuSimulation(adapter, prep, onPhase = () => {}) {
160
168
  const yStage = storage(yLen, 'y-stage');
161
169
  const pBuf = storage(Math.max(layout.p_len | 0, 1), 'p');
162
170
  const kBufs = [0, 1, 2, 3].map((i) => storage(rows, `k${i + 1}`));
163
- const y0 = new Float32Array(prep.y0 || []);
164
- device.queue.writeBuffer(yBuf, 0, y0);
165
- device.queue.writeBuffer(yStage, 0, y0);
166
- device.queue.writeBuffer(pBuf, 0, new Float32Array(prep.p0 || []));
167
171
 
168
172
  const timeUniform = device.createBuffer({
169
173
  size: 16, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
@@ -254,82 +258,138 @@ export async function runGpuSimulation(adapter, prep, onPhase = () => {}) {
254
258
  const writeTime = (t) => device.queue.writeBuffer(
255
259
  timeUniform, 0, new Float32Array([t, 0, 0, 0]));
256
260
 
257
- const times = [tStart];
258
- const samples = [Array.from(y0)];
259
- onPhase(`Simulating on WebGPU (0/${steps} steps)`, 0);
260
- const wallStart = performance.now();
261
- // One readback per step keeps the driver simple; the GPU work per
262
- // step is small enough that this is not the bottleneck yet.
263
- for (let step = 0; step < steps; step++) {
264
- const t = tStart + step * dt;
265
- const enc = device.createCommandEncoder();
266
- writeTime(t);
267
- dispatchDer(enc, 0);
268
- dispatchStage(enc, axpyPipeline, axpyBinds[0]);
269
- device.queue.submit([enc.finish()]);
270
- const enc2 = device.createCommandEncoder();
271
- writeTime(t + dt / 2);
272
- dispatchDer(enc2, 1);
273
- dispatchStage(enc2, axpyPipeline, axpyBinds[1]);
274
- device.queue.submit([enc2.finish()]);
275
- const enc3 = device.createCommandEncoder();
276
- dispatchDer(enc3, 2);
277
- dispatchStage(enc3, axpyPipeline, axpyBinds[2]);
278
- device.queue.submit([enc3.finish()]);
279
- const enc4 = device.createCommandEncoder();
280
- writeTime(t + dt);
281
- dispatchDer(enc4, 3);
282
- dispatchStage(enc4, combinePipeline, combineBind);
283
- enc4.copyBufferToBuffer(yBuf, 0, readback, 0, yLen * 4);
284
- device.queue.submit([enc4.finish()]);
285
- await readback.mapAsync(GPUMapMode.READ);
286
- samples.push(Array.from(new Float32Array(readback.getMappedRange())));
287
- readback.unmap();
288
- times.push(t + dt);
289
- if (step % 5 === 4 || step === steps - 1) {
290
- onPhase(
291
- `Simulating on WebGPU (${step + 1}/${steps} steps)`,
292
- (step + 1) / steps
293
- );
261
+ // Per-run execution. Only y0/p0 change when a parameter slider moves, so only
262
+ // those are read from `prepNow` (tStart, dt, steps, and tEnd come from the
263
+ // enclosing scope); this re-uploads them and steps the RK4 loop, while the device, modules, pipelines, buffers, and bind groups above are reused untouched.
264
+ async function simulate(prepNow, onPhaseNow = () => {}) {
265
+ const y0 = new Float32Array(prepNow.y0 || []);
266
+ device.queue.writeBuffer(yBuf, 0, y0);
267
+ device.queue.writeBuffer(yStage, 0, y0);
268
+ device.queue.writeBuffer(pBuf, 0, new Float32Array(prepNow.p0 || []));
269
+
270
+ const times = [tStart];
271
+ const samples = [Array.from(y0)];
272
+ onPhaseNow(`Simulating on WebGPU (0/${steps} steps)`, 0);
273
+ const wallStart = performance.now();
274
+ // One readback per step keeps the driver simple; the GPU work per
275
+ // step is small enough that this is not the bottleneck yet.
276
+ for (let step = 0; step < steps; step++) {
277
+ const t = tStart + step * dt;
278
+ const enc = device.createCommandEncoder();
279
+ writeTime(t);
280
+ dispatchDer(enc, 0);
281
+ dispatchStage(enc, axpyPipeline, axpyBinds[0]);
282
+ device.queue.submit([enc.finish()]);
283
+ const enc2 = device.createCommandEncoder();
284
+ writeTime(t + dt / 2);
285
+ dispatchDer(enc2, 1);
286
+ dispatchStage(enc2, axpyPipeline, axpyBinds[1]);
287
+ device.queue.submit([enc2.finish()]);
288
+ const enc3 = device.createCommandEncoder();
289
+ dispatchDer(enc3, 2);
290
+ dispatchStage(enc3, axpyPipeline, axpyBinds[2]);
291
+ device.queue.submit([enc3.finish()]);
292
+ const enc4 = device.createCommandEncoder();
293
+ writeTime(t + dt);
294
+ dispatchDer(enc4, 3);
295
+ dispatchStage(enc4, combinePipeline, combineBind);
296
+ enc4.copyBufferToBuffer(yBuf, 0, readback, 0, yLen * 4);
297
+ device.queue.submit([enc4.finish()]);
298
+ await readback.mapAsync(GPUMapMode.READ);
299
+ samples.push(Array.from(new Float32Array(readback.getMappedRange())));
300
+ readback.unmap();
301
+ times.push(t + dt);
302
+ if (step % 5 === 4 || step === steps - 1) {
303
+ onPhaseNow(
304
+ `Simulating on WebGPU (${step + 1}/${steps} steps)`,
305
+ (step + 1) / steps
306
+ );
307
+ }
294
308
  }
295
- }
296
- const gpuSeconds = (performance.now() - wallStart) / 1000;
297
- device.destroy();
309
+ const gpuSeconds = (performance.now() - wallStart) / 1000;
298
310
 
299
- // Shape the result like simulate_model so plots and viz scripts work
300
- // unchanged. Names come from the layout bindings (y-kind slots).
301
- // Bindings include bare base-name aliases ("u" -> 0) alongside the
302
- // indexed names ("u[1,1]" -> 0); prefer indexed names so array
303
- // models keep their element naming.
304
- const names = new Array(yLen).fill(null);
305
- for (const [name, slot] of Object.entries(layout.bindings || {})) {
306
- if (!slot || slot.kind !== 'y' || slot.index >= yLen) {
307
- continue;
311
+ // Shape the result like simulate_model so plots and viz scripts work
312
+ // unchanged. Names come from the layout bindings (y-kind slots).
313
+ // Bindings include bare base-name aliases ("u" -> 0) alongside the
314
+ // indexed names ("u[1,1]" -> 0); prefer indexed names so array
315
+ // models keep their element naming.
316
+ const names = new Array(yLen).fill(null);
317
+ for (const [name, slot] of Object.entries(layout.bindings || {})) {
318
+ if (!slot || slot.kind !== 'y' || slot.index >= yLen) {
319
+ continue;
320
+ }
321
+ const existing = names[slot.index];
322
+ if (!existing || (!existing.includes('[') && name.includes('['))) {
323
+ names[slot.index] = name;
324
+ }
308
325
  }
309
- const existing = names[slot.index];
310
- if (!existing || (!existing.includes('[') && name.includes('['))) {
311
- names[slot.index] = name;
326
+ for (let i = 0; i < yLen; i++) {
327
+ if (!names[i]) names[i] = `y[${i}]`;
312
328
  }
329
+ const allData = [times];
330
+ for (let i = 0; i < yLen; i++) {
331
+ allData.push(samples.map((row) => row[i]));
332
+ }
333
+ const eventNote = (layout.runtime_event_roots | 0) > 0
334
+ ? ' · events frozen (GPU v1)' : '';
335
+ return {
336
+ payload: {
337
+ names,
338
+ allData,
339
+ nStates,
340
+ simDetails: {
341
+ actual: { t_start: tStart, t_end: times[times.length - 1], points: times.length, variables: names.length },
342
+ requested: { solver: `wgsl-solve RK4 (f32)${eventNote}`, t_start: tStart, t_end: tEnd, dt },
343
+ },
344
+ },
345
+ metrics: { simulateSeconds: gpuSeconds },
346
+ };
313
347
  }
314
- for (let i = 0; i < yLen; i++) {
315
- if (!names[i]) names[i] = `y[${i}]`;
348
+
349
+ return { device, simulate };
350
+ }
351
+
352
+ // Module-level fallback cache for callers that do not supply their own. Pass an
353
+ // explicit per-instance `cache` object (e.g. one per widget) when running
354
+ // independent models concurrently so they do not evict each other.
355
+ const sharedGpuCache = {};
356
+
357
+ // Integrate a prepared model on the GPU with fixed-step RK4, reusing a compiled
358
+ // program across runs.
359
+ //
360
+ // adapter : GPUAdapter (from `probeGpu`)
361
+ // prep : the parsed JSON from WASM `prepare_gpu_simulation`
362
+ // onPhase : optional (message, fraction|null) progress callback
363
+ // cache : caller-owned `{ program?, wgsl? }` holder; defaults to a shared
364
+ // module-level cache
365
+ //
366
+ // The program (device, modules, pipelines, buffers, bind groups) is fully
367
+ // determined by `prep.wgsl`, so a parameter-only re-run (same shader, new
368
+ // y0/p0) reuses the cached program and skips the shader recompile + pipeline
369
+ // rebuild entirely. A source edit re-renders the shader (new key -> rebuild,
370
+ // destroying the old device). If a run throws — e.g. because a reused device
371
+ // was lost (context loss, tab backgrounding) — the cache is dropped and the
372
+ // error is rethrown, so the next run rebuilds from a fresh device.
373
+ //
374
+ // Returns { payload: { names, allData, nStates, simDetails }, metrics } shaped
375
+ // like `simulate_model` so plots/viz scripts work unchanged.
376
+ export async function runGpuSimulation(adapter, prep, onPhase = () => {}, cache = sharedGpuCache) {
377
+ if (!cache.program || cache.wgsl !== prep.wgsl) {
378
+ if (cache.program) {
379
+ try { cache.program.device.destroy(); } catch (err) { /* device already lost */ }
380
+ cache.program = null;
381
+ }
382
+ cache.program = await buildGpuProgram(adapter, prep, onPhase);
383
+ cache.wgsl = prep.wgsl;
316
384
  }
317
- const allData = [times];
318
- for (let i = 0; i < yLen; i++) {
319
- allData.push(samples.map((row) => row[i]));
385
+ try {
386
+ return await cache.program.simulate(prep, onPhase);
387
+ } catch (err) {
388
+ // A reused device can be lost (context loss, tab backgrounding).
389
+ // Drop the cache so the next run rebuilds from a fresh device,
390
+ // restoring the self-healing the per-run rebuild used to give.
391
+ cache.program = null;
392
+ cache.wgsl = null;
393
+ throw err;
320
394
  }
321
- const eventNote = (layout.runtime_event_roots | 0) > 0
322
- ? ' · events frozen (GPU v1)' : '';
323
- return {
324
- payload: {
325
- names,
326
- allData,
327
- nStates,
328
- simDetails: {
329
- actual: { t_start: tStart, t_end: times[times.length - 1], points: times.length, variables: names.length },
330
- requested: { solver: `wgsl-solve RK4 (f32)${eventNote}`, t_start: tStart, t_end: tEnd, dt },
331
- },
332
- },
333
- metrics: { simulateSeconds: gpuSeconds },
334
- };
335
395
  }