@cognipilot/rumoca 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +22 -0
- package/package.json +46 -0
- package/parse_worker.js +53 -0
- package/rumoca_bind_wasm.d.ts +295 -0
- package/rumoca_bind_wasm.js +1473 -0
- package/rumoca_bind_wasm_bg.wasm +0 -0
- package/rumoca_bind_wasm_diffsol.d.ts +49 -0
- package/rumoca_bind_wasm_diffsol.js +270 -0
- package/rumoca_bind_wasm_diffsol_bg.wasm +0 -0
- package/rumoca_diffsol.js +109 -0
- package/rumoca_gpu.js +335 -0
- package/rumoca_package_meta.json +3 -0
- package/rumoca_worker.js +394 -0
package/rumoca_gpu.js
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
// Rumoca WebGPU RK4 driver.
//
// Canonical, packaged runtime helper for the GPU simulation path. The
// compiler emits per-state derivative kernels via the `wgsl-solve` target
// (WASM `prepare_gpu_simulation`); this module wraps a fixed-step classic
// RK4 integrator around them on the GPU. The RK4 stage/combine algebra runs
// in the two small hand-written kernels below.
//
// v1 semantics: only the first `n_states` slots of y integrate; algebraic
// slots and all parameters (including relation memory) stay frozen at their
// prepared initial values, so event-driven behavior does not fire on this
// path.
//
// Consumed from npm as `@cognipilot/rumoca/gpu` and by the mdBook live
// examples (imported from the same pkg base as the WASM glue).
|
|
16
|
+
|
|
17
|
+
// WGSL source of the RK4 stage kernel (entry point `axpy`, 64 invocations
// per workgroup). Bindings in group 0:
//   0: base (read-only storage)   1: k (read-only storage)
//   2: dst  (read-write storage)  3: su (uniform: scale, n, two pad words)
// Each invocation whose index is below `su.n` stores
// `base[i] + su.scale * k[i]` into `dst[i]`; all other slots of `dst` are
// left as they were.
const GPU_STAGE_WGSL = `
struct StageUniforms { scale: f32, n: u32, _pad0: u32, _pad1: u32 }
@group(0) @binding(0) var<storage, read> base: array<f32>;
@group(0) @binding(1) var<storage, read> k: array<f32>;
@group(0) @binding(2) var<storage, read_write> dst: array<f32>;
@group(0) @binding(3) var<uniform> su: StageUniforms;

// dst[i] = base[i] + scale * k[i] (first n slots only)
@compute @workgroup_size(64)
fn axpy(@builtin(global_invocation_id) gid: vec3<u32>) {
    let i = gid.x;
    if (i >= su.n) { return; }
    dst[i] = base[i] + su.scale * k[i];
}
`;
|
|
32
|
+
|
|
33
|
+
// WGSL source of the RK4 combine kernel (entry point `combine`, 64
// invocations per workgroup). Bindings in group 0:
//   0..3: k1..k4 (read-only storage)
//   4:    ystate (read-write storage, updated in place)
//   5:    cu     (uniform: h6, n, two pad words)
// Each invocation whose index is below `cu.n` adds
// `cu.h6 * (k1 + 2*k2 + 2*k3 + k4)` to its slot of `ystate`; the host is
// expected to supply h/6 in `cu.h6`.
const GPU_COMBINE_WGSL = `
struct CombineUniforms { h6: f32, n: u32, _pad0: u32, _pad1: u32 }
@group(0) @binding(0) var<storage, read> k1: array<f32>;
@group(0) @binding(1) var<storage, read> k2: array<f32>;
@group(0) @binding(2) var<storage, read> k3: array<f32>;
@group(0) @binding(3) var<storage, read> k4: array<f32>;
@group(0) @binding(4) var<storage, read_write> ystate: array<f32>;
@group(0) @binding(5) var<uniform> cu: CombineUniforms;

// y[i] += (h/6) * (k1 + 2 k2 + 2 k3 + k4)[i]
@compute @workgroup_size(64)
fn combine(@builtin(global_invocation_id) gid: vec3<u32>) {
    let i = gid.x;
    if (i >= cu.n) { return; }
    ystate[i] = ystate[i]
        + cu.h6 * (k1[i] + 2.0 * k2[i] + 2.0 * k3[i] + k4[i]);
}
`;
|
|
51
|
+
|
|
52
|
+
async function compileGpuModule(device, code, label) {
|
|
53
|
+
const module = device.createShaderModule({ code, label });
|
|
54
|
+
const info = await module.getCompilationInfo();
|
|
55
|
+
const errors = info.messages.filter((m) => m.type === 'error');
|
|
56
|
+
if (errors.length > 0) {
|
|
57
|
+
throw new Error(`${label} WGSL error: ${errors[0].message}`);
|
|
58
|
+
}
|
|
59
|
+
return module;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// Acquire a WebGPU adapter, throwing actionable errors when WebGPU is
|
|
63
|
+
// unavailable. Returns a GPUAdapter suitable for `runGpuSimulation`.
|
|
64
|
+
export async function probeGpu() {
|
|
65
|
+
if (!navigator.gpu) {
|
|
66
|
+
throw new Error(
|
|
67
|
+
'GPU requested but WebGPU is not available in this '
|
|
68
|
+
+ 'browser. Uncheck GPU to run on the CPU (WASM) path.'
|
|
69
|
+
);
|
|
70
|
+
}
|
|
71
|
+
const adapter = await navigator.gpu.requestAdapter()
|
|
72
|
+
|| await navigator.gpu.requestAdapter({ forceFallbackAdapter: true });
|
|
73
|
+
if (!adapter) {
|
|
74
|
+
throw new Error(
|
|
75
|
+
'GPU requested but no WebGPU adapter was found. '
|
|
76
|
+
+ 'On Linux Chrome, WebGPU is off by default: enable '
|
|
77
|
+
+ 'chrome://flags/#enable-unsafe-webgpu (or launch with '
|
|
78
|
+
+ '--enable-unsafe-webgpu --enable-features=Vulkan) and '
|
|
79
|
+
+ 'reload. Or uncheck GPU to run on the CPU (WASM) path.'
|
|
80
|
+
);
|
|
81
|
+
}
|
|
82
|
+
return adapter;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// Integrate a prepared model on the GPU with fixed-step RK4.
|
|
86
|
+
//
|
|
87
|
+
// adapter : GPUAdapter (from `probeGpu`)
|
|
88
|
+
// prep : the parsed JSON from WASM `prepare_gpu_simulation`
|
|
89
|
+
// ({ wgsl, layout, n_states, y0, p0, t_start, t_end, dt })
|
|
90
|
+
// onPhase : optional (message, fraction|null) progress callback
|
|
91
|
+
//
|
|
92
|
+
// Returns { payload: { names, allData, nStates, simDetails }, metrics }
|
|
93
|
+
// shaped like `simulate_model` so plots/viz scripts work unchanged.
|
|
94
|
+
export async function runGpuSimulation(adapter, prep, onPhase = () => {}) {
|
|
95
|
+
const layout = prep.layout || {};
|
|
96
|
+
const nStates = prep.n_states | 0;
|
|
97
|
+
const yLen = Math.max(layout.y_len | 0, 1);
|
|
98
|
+
const rows = Math.max(layout.rows | 0, 0);
|
|
99
|
+
if (rows === 0 || nStates === 0) {
|
|
100
|
+
throw new Error('Model has no continuous states to integrate on the GPU.');
|
|
101
|
+
}
|
|
102
|
+
if (rows !== nStates) {
|
|
103
|
+
throw new Error(
|
|
104
|
+
`GPU path expects one derivative row per state (rows=${rows}, `
|
|
105
|
+
+ `states=${nStates}); this model is not supported yet.`
|
|
106
|
+
);
|
|
107
|
+
}
|
|
108
|
+
const tStart = Number(prep.t_start) || 0;
|
|
109
|
+
const tEnd = Number(prep.t_end) || 1;
|
|
110
|
+
const dt = Number(prep.dt) > 0
|
|
111
|
+
? Number(prep.dt) : (tEnd - tStart) / 500;
|
|
112
|
+
const steps = Math.max(1, Math.round((tEnd - tStart) / dt));
|
|
113
|
+
|
|
114
|
+
const device = await adapter.requestDevice();
|
|
115
|
+
onPhase('Parsing GPU kernels (WGSL)', null);
|
|
116
|
+
const derModule = await compileGpuModule(device, prep.wgsl, 'wgsl-solve');
|
|
117
|
+
const stageModule = await compileGpuModule(device, GPU_STAGE_WGSL, 'rk4-stage');
|
|
118
|
+
const combineModule = await compileGpuModule(device, GPU_COMBINE_WGSL, 'rk4-combine');
|
|
119
|
+
|
|
120
|
+
// Kernel inventory: stencil-family kernels + residual chunks from
|
|
121
|
+
// the layout manifest.
|
|
122
|
+
if (!Array.isArray(layout.kernels) || layout.kernels.length === 0) {
|
|
123
|
+
throw new Error(
|
|
124
|
+
'GPU layout manifest has no kernel inventory; the WASM package '
|
|
125
|
+
+ 'predates stencil emission. Rebuild it from the wgsl-backend '
|
|
126
|
+
+ 'sources (wasm-pack build crates/rumoca-bind-wasm).'
|
|
127
|
+
);
|
|
128
|
+
}
|
|
129
|
+
const kernelList = layout.kernels;
|
|
130
|
+
let pipelinesBuilt = 0;
|
|
131
|
+
onPhase(`Building GPU pipelines (0/${kernelList.length})`, 0);
|
|
132
|
+
const derPipelines = await Promise.all(
|
|
133
|
+
kernelList.map((kernel) => device.createComputePipelineAsync({
|
|
134
|
+
layout: 'auto',
|
|
135
|
+
compute: { module: derModule, entryPoint: kernel.entry },
|
|
136
|
+
}).then((pipeline) => {
|
|
137
|
+
pipelinesBuilt += 1;
|
|
138
|
+
onPhase(
|
|
139
|
+
`Building GPU pipelines (${pipelinesBuilt}/${kernelList.length})`,
|
|
140
|
+
pipelinesBuilt / kernelList.length
|
|
141
|
+
);
|
|
142
|
+
return pipeline;
|
|
143
|
+
}))
|
|
144
|
+
);
|
|
145
|
+
const kernelWorkgroups = kernelList.map(
|
|
146
|
+
(kernel) => Math.max(1, Math.ceil((kernel.rows | 0) / 64)));
|
|
147
|
+
const axpyPipeline = await device.createComputePipelineAsync({
|
|
148
|
+
layout: 'auto', compute: { module: stageModule, entryPoint: 'axpy' },
|
|
149
|
+
});
|
|
150
|
+
const combinePipeline = await device.createComputePipelineAsync({
|
|
151
|
+
layout: 'auto', compute: { module: combineModule, entryPoint: 'combine' },
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
const storage = (len, label) => device.createBuffer({
|
|
155
|
+
label,
|
|
156
|
+
size: Math.max(16, len * 4),
|
|
157
|
+
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
|
|
158
|
+
});
|
|
159
|
+
const yBuf = storage(yLen, 'y');
|
|
160
|
+
const yStage = storage(yLen, 'y-stage');
|
|
161
|
+
const pBuf = storage(Math.max(layout.p_len | 0, 1), 'p');
|
|
162
|
+
const kBufs = [0, 1, 2, 3].map((i) => storage(rows, `k${i + 1}`));
|
|
163
|
+
const y0 = new Float32Array(prep.y0 || []);
|
|
164
|
+
device.queue.writeBuffer(yBuf, 0, y0);
|
|
165
|
+
device.queue.writeBuffer(yStage, 0, y0);
|
|
166
|
+
device.queue.writeBuffer(pBuf, 0, new Float32Array(prep.p0 || []));
|
|
167
|
+
|
|
168
|
+
const timeUniform = device.createBuffer({
|
|
169
|
+
size: 16, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
|
170
|
+
});
|
|
171
|
+
const axpyUniforms = [0.5, 0.5, 1.0].map((scale) => {
|
|
172
|
+
const buffer = device.createBuffer({
|
|
173
|
+
size: 16, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
|
174
|
+
});
|
|
175
|
+
const data = new ArrayBuffer(16);
|
|
176
|
+
new Float32Array(data, 0, 1)[0] = scale * dt;
|
|
177
|
+
new Uint32Array(data, 4, 1)[0] = nStates;
|
|
178
|
+
device.queue.writeBuffer(buffer, 0, data);
|
|
179
|
+
return buffer;
|
|
180
|
+
});
|
|
181
|
+
const combineUniform = device.createBuffer({
|
|
182
|
+
size: 16, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
|
183
|
+
});
|
|
184
|
+
{
|
|
185
|
+
const data = new ArrayBuffer(16);
|
|
186
|
+
new Float32Array(data, 0, 1)[0] = dt / 6.0;
|
|
187
|
+
new Uint32Array(data, 4, 1)[0] = nStates;
|
|
188
|
+
device.queue.writeBuffer(combineUniform, 0, data);
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
const derBind = (yIn, kOut) => derPipelines.map((pipe) => device.createBindGroup({
|
|
192
|
+
layout: pipe.getBindGroupLayout(0),
|
|
193
|
+
entries: [
|
|
194
|
+
{ binding: 0, resource: { buffer: yIn } },
|
|
195
|
+
{ binding: 1, resource: { buffer: pBuf } },
|
|
196
|
+
{ binding: 2, resource: { buffer: kOut } },
|
|
197
|
+
{ binding: 3, resource: { buffer: timeUniform } },
|
|
198
|
+
],
|
|
199
|
+
}));
|
|
200
|
+
const derBinds = [
|
|
201
|
+
derBind(yBuf, kBufs[0]), // k1 = f(t, y)
|
|
202
|
+
derBind(yStage, kBufs[1]), // k2 = f(t + h/2, y + h/2 k1)
|
|
203
|
+
derBind(yStage, kBufs[2]), // k3 = f(t + h/2, y + h/2 k2)
|
|
204
|
+
derBind(yStage, kBufs[3]), // k4 = f(t + h, y + h k3)
|
|
205
|
+
];
|
|
206
|
+
const axpyBind = (kBuf, uniform) => device.createBindGroup({
|
|
207
|
+
layout: axpyPipeline.getBindGroupLayout(0),
|
|
208
|
+
entries: [
|
|
209
|
+
{ binding: 0, resource: { buffer: yBuf } },
|
|
210
|
+
{ binding: 1, resource: { buffer: kBuf } },
|
|
211
|
+
{ binding: 2, resource: { buffer: yStage } },
|
|
212
|
+
{ binding: 3, resource: { buffer: uniform } },
|
|
213
|
+
],
|
|
214
|
+
});
|
|
215
|
+
const axpyBinds = [
|
|
216
|
+
axpyBind(kBufs[0], axpyUniforms[0]),
|
|
217
|
+
axpyBind(kBufs[1], axpyUniforms[1]),
|
|
218
|
+
axpyBind(kBufs[2], axpyUniforms[2]),
|
|
219
|
+
];
|
|
220
|
+
const combineBind = device.createBindGroup({
|
|
221
|
+
layout: combinePipeline.getBindGroupLayout(0),
|
|
222
|
+
entries: [
|
|
223
|
+
{ binding: 0, resource: { buffer: kBufs[0] } },
|
|
224
|
+
{ binding: 1, resource: { buffer: kBufs[1] } },
|
|
225
|
+
{ binding: 2, resource: { buffer: kBufs[2] } },
|
|
226
|
+
{ binding: 3, resource: { buffer: kBufs[3] } },
|
|
227
|
+
{ binding: 4, resource: { buffer: yBuf } },
|
|
228
|
+
{ binding: 5, resource: { buffer: combineUniform } },
|
|
229
|
+
],
|
|
230
|
+
});
|
|
231
|
+
|
|
232
|
+
const stageGroups = Math.ceil(nStates / 64);
|
|
233
|
+
const dispatchDer = (enc, stage) => {
|
|
234
|
+
const pass = enc.beginComputePass();
|
|
235
|
+
derPipelines.forEach((pipe, c) => {
|
|
236
|
+
pass.setPipeline(pipe);
|
|
237
|
+
pass.setBindGroup(0, derBinds[stage][c]);
|
|
238
|
+
pass.dispatchWorkgroups(kernelWorkgroups[c]);
|
|
239
|
+
});
|
|
240
|
+
pass.end();
|
|
241
|
+
};
|
|
242
|
+
const dispatchStage = (enc, pipeline, bind) => {
|
|
243
|
+
const pass = enc.beginComputePass();
|
|
244
|
+
pass.setPipeline(pipeline);
|
|
245
|
+
pass.setBindGroup(0, bind);
|
|
246
|
+
pass.dispatchWorkgroups(stageGroups);
|
|
247
|
+
pass.end();
|
|
248
|
+
};
|
|
249
|
+
|
|
250
|
+
const readback = device.createBuffer({
|
|
251
|
+
size: Math.max(16, yLen * 4),
|
|
252
|
+
usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
|
|
253
|
+
});
|
|
254
|
+
const writeTime = (t) => device.queue.writeBuffer(
|
|
255
|
+
timeUniform, 0, new Float32Array([t, 0, 0, 0]));
|
|
256
|
+
|
|
257
|
+
const times = [tStart];
|
|
258
|
+
const samples = [Array.from(y0)];
|
|
259
|
+
onPhase(`Simulating on WebGPU (0/${steps} steps)`, 0);
|
|
260
|
+
const wallStart = performance.now();
|
|
261
|
+
// One readback per step keeps the driver simple; the GPU work per
|
|
262
|
+
// step is small enough that this is not the bottleneck yet.
|
|
263
|
+
for (let step = 0; step < steps; step++) {
|
|
264
|
+
const t = tStart + step * dt;
|
|
265
|
+
const enc = device.createCommandEncoder();
|
|
266
|
+
writeTime(t);
|
|
267
|
+
dispatchDer(enc, 0);
|
|
268
|
+
dispatchStage(enc, axpyPipeline, axpyBinds[0]);
|
|
269
|
+
device.queue.submit([enc.finish()]);
|
|
270
|
+
const enc2 = device.createCommandEncoder();
|
|
271
|
+
writeTime(t + dt / 2);
|
|
272
|
+
dispatchDer(enc2, 1);
|
|
273
|
+
dispatchStage(enc2, axpyPipeline, axpyBinds[1]);
|
|
274
|
+
device.queue.submit([enc2.finish()]);
|
|
275
|
+
const enc3 = device.createCommandEncoder();
|
|
276
|
+
dispatchDer(enc3, 2);
|
|
277
|
+
dispatchStage(enc3, axpyPipeline, axpyBinds[2]);
|
|
278
|
+
device.queue.submit([enc3.finish()]);
|
|
279
|
+
const enc4 = device.createCommandEncoder();
|
|
280
|
+
writeTime(t + dt);
|
|
281
|
+
dispatchDer(enc4, 3);
|
|
282
|
+
dispatchStage(enc4, combinePipeline, combineBind);
|
|
283
|
+
enc4.copyBufferToBuffer(yBuf, 0, readback, 0, yLen * 4);
|
|
284
|
+
device.queue.submit([enc4.finish()]);
|
|
285
|
+
await readback.mapAsync(GPUMapMode.READ);
|
|
286
|
+
samples.push(Array.from(new Float32Array(readback.getMappedRange())));
|
|
287
|
+
readback.unmap();
|
|
288
|
+
times.push(t + dt);
|
|
289
|
+
if (step % 5 === 4 || step === steps - 1) {
|
|
290
|
+
onPhase(
|
|
291
|
+
`Simulating on WebGPU (${step + 1}/${steps} steps)`,
|
|
292
|
+
(step + 1) / steps
|
|
293
|
+
);
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
const gpuSeconds = (performance.now() - wallStart) / 1000;
|
|
297
|
+
device.destroy();
|
|
298
|
+
|
|
299
|
+
// Shape the result like simulate_model so plots and viz scripts work
|
|
300
|
+
// unchanged. Names come from the layout bindings (y-kind slots).
|
|
301
|
+
// Bindings include bare base-name aliases ("u" -> 0) alongside the
|
|
302
|
+
// indexed names ("u[1,1]" -> 0); prefer indexed names so array
|
|
303
|
+
// models keep their element naming.
|
|
304
|
+
const names = new Array(yLen).fill(null);
|
|
305
|
+
for (const [name, slot] of Object.entries(layout.bindings || {})) {
|
|
306
|
+
if (!slot || slot.kind !== 'y' || slot.index >= yLen) {
|
|
307
|
+
continue;
|
|
308
|
+
}
|
|
309
|
+
const existing = names[slot.index];
|
|
310
|
+
if (!existing || (!existing.includes('[') && name.includes('['))) {
|
|
311
|
+
names[slot.index] = name;
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
for (let i = 0; i < yLen; i++) {
|
|
315
|
+
if (!names[i]) names[i] = `y[${i}]`;
|
|
316
|
+
}
|
|
317
|
+
const allData = [times];
|
|
318
|
+
for (let i = 0; i < yLen; i++) {
|
|
319
|
+
allData.push(samples.map((row) => row[i]));
|
|
320
|
+
}
|
|
321
|
+
const eventNote = (layout.runtime_event_roots | 0) > 0
|
|
322
|
+
? ' · events frozen (GPU v1)' : '';
|
|
323
|
+
return {
|
|
324
|
+
payload: {
|
|
325
|
+
names,
|
|
326
|
+
allData,
|
|
327
|
+
nStates,
|
|
328
|
+
simDetails: {
|
|
329
|
+
actual: { t_start: tStart, t_end: times[times.length - 1], points: times.length, variables: names.length },
|
|
330
|
+
requested: { solver: `wgsl-solve RK4 (f32)${eventNote}`, t_start: tStart, t_end: tEnd, dt },
|
|
331
|
+
},
|
|
332
|
+
},
|
|
333
|
+
metrics: { simulateSeconds: gpuSeconds },
|
|
334
|
+
};
|
|
335
|
+
}
|