@gridspace/raster-path 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.html +1 -1
- package/build/raster-path.js +4 -12
- package/build/raster-worker.js +2450 -0
- package/package.json +8 -4
- package/scripts/build-shaders.js +32 -8
- package/src/core/path-planar.js +788 -0
- package/src/core/path-radial.js +651 -0
- package/src/core/raster-config.js +185 -0
- package/src/{index.js → core/raster-path.js} +4 -12
- package/src/core/raster-planar.js +754 -0
- package/src/core/raster-tool.js +104 -0
- package/src/core/raster-worker.js +152 -0
- package/src/core/workload-calibrate.js +416 -0
- package/src/shaders/workload-calibrate.wgsl +106 -0
- package/src/test/calibrate-test.cjs +136 -0
- package/src/test/extreme-work-test.cjs +167 -0
- package/src/test/radial-thread-limit-test.cjs +152 -0
- package/src/web/index.html +1 -1
- package/build/webgpu-worker.js +0 -2800
- package/src/web/webgpu-worker.js +0 -2303
package/src/core/raster-tool.js
@@ -0,0 +1,104 @@
+/**
+ * ═══════════════════════════════════════════════════════════════════════════
+ * Raster Tool - Sparse Tool Representation
+ * ═══════════════════════════════════════════════════════════════════════════
+ *
+ * Utilities for creating sparse tool representations from rasterized tool data.
+ * Converts dense grid data into compact offset-based format for GPU toolpath
+ * generation.
+ *
+ * EXPORTS:
+ * ────────
+ * Functions:
+ *   - createSparseToolFromPoints(points) - Convert tool points to sparse format
+ *
+ * DATA FORMATS:
+ * ─────────────
+ * Input (Dense Grid):
+ *   - Float32Array of [gridX, gridY, Z, gridX, gridY, Z, ...]
+ *   - gridX, gridY are integer grid indices (not world coordinates)
+ *   - Z is height in world units (mm)
+ *
+ * Output (Sparse Tool):
+ *   {
+ *     count: number,          // Number of tool points
+ *     xOffsets: Int32Array,   // X offset from tool center (grid cells)
+ *     yOffsets: Int32Array,   // Y offset from tool center (grid cells)
+ *     zValues: Float32Array,  // Z height (mm)
+ *     referenceZ: number      // Tool tip Z (lowest point)
+ *   }
+ *
+ * ALGORITHM:
+ * ──────────
+ *   1. Find bounding box in grid space (integer coordinates)
+ *   2. Calculate tool center (grid coordinates)
+ *   3. Convert each point to offset from center
+ *   4. Store as parallel arrays for GPU consumption
+ *
+ * This sparse representation reduces memory usage and improves GPU cache
+ * coherency during toolpath generation, as the tool can be "stamped" at
+ * each position using simple offset arithmetic.
+ *
+ * ═══════════════════════════════════════════════════════════════════════════
+ */
+
+// Create sparse tool representation from rasterized points
+// Points come from GPU as [gridX, gridY, Z] - pure integer grid coordinates for X/Y
+export function createSparseToolFromPoints(points) {
+  if (!points || points.length === 0) {
+    throw new Error('No tool points provided');
+  }
+
+  // Points are [gridX, gridY, Z] where gridX/gridY are grid indices (floats but integer values)
+  // Find bounds in grid space and tool tip Z
+  let minGridX = Infinity, minGridY = Infinity, minZ = Infinity;
+  let maxGridX = -Infinity, maxGridY = -Infinity;
+
+  for (let i = 0; i < points.length; i += 3) {
+    const gridX = points[i];     // Already a grid index
+    const gridY = points[i + 1]; // Already a grid index
+    const z = points[i + 2];
+
+    minGridX = Math.min(minGridX, gridX);
+    maxGridX = Math.max(maxGridX, gridX);
+    minGridY = Math.min(minGridY, gridY);
+    maxGridY = Math.max(maxGridY, gridY);
+    minZ = Math.min(minZ, z);
+  }
+
+  // Calculate tool center in grid coordinates (pure integer)
+  const width = Math.floor(maxGridX - minGridX) + 1;
+  const height = Math.floor(maxGridY - minGridY) + 1;
+  const centerX = Math.floor(minGridX) + Math.floor(width / 2);
+  const centerY = Math.floor(minGridY) + Math.floor(height / 2);
+
+  // Convert each point to offset from center (integer arithmetic only)
+  const xOffsets = [];
+  const yOffsets = [];
+  const zValues = [];
+
+  for (let i = 0; i < points.length; i += 3) {
+    const gridX = Math.floor(points[i]);     // Grid index (ensure integer)
+    const gridY = Math.floor(points[i + 1]); // Grid index (ensure integer)
+    const z = points[i + 2];
+
+    // Calculate offset from tool center (pure integer arithmetic)
+    const xOffset = gridX - centerX;
+    const yOffset = gridY - centerY;
+    // Z relative to tool tip: tip=0, points above tip are positive
+    // minZ is the lowest Z (tip), so z - minZ gives positive offsets upward
+    const zValue = z;// - minZ;
+
+    xOffsets.push(xOffset);
+    yOffsets.push(yOffset);
+    zValues.push(zValue);
+  }
+
+  return {
+    count: xOffsets.length,
+    xOffsets: new Int32Array(xOffsets),
+    yOffsets: new Int32Array(yOffsets),
+    zValues: new Float32Array(zValues),
+    referenceZ: minZ
+  };
+}
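For reference, a minimal sketch (not part of the diff) of feeding the documented [gridX, gridY, Z] triplet format into createSparseToolFromPoints above. The import path and the sample values are assumptions, not package documentation.

// Import path is an assumption; adjust to however the package exposes the module.
import { createSparseToolFromPoints } from './src/core/raster-tool.js';

// Three tool points as [gridX, gridY, Z] triplets: integer grid indices for X/Y, mm for Z.
const toolPoints = new Float32Array([
  10, 10, 0.0,  // tool tip (lowest Z)
  11, 10, 0.5,
  10, 11, 0.5,
]);

const sparseTool = createSparseToolFromPoints(toolPoints);
console.log(sparseTool.count);      // 3
console.log(sparseTool.xOffsets);   // Int32Array of offsets from the computed grid center
console.log(sparseTool.referenceZ); // 0 - the lowest Z in the input

Note that the shipped code stores zValues as-is (the `- minZ` normalization is commented out), so referenceZ carries the tip height separately rather than zValues being tip-relative.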
package/src/core/raster-worker.js
@@ -0,0 +1,152 @@
+/**
+ * ═══════════════════════════════════════════════════════════════════════════
+ * WebGPU Worker - GPU Compute Operations
+ * ═══════════════════════════════════════════════════════════════════════════
+ *
+ * Offloads all WebGPU compute operations to a worker thread to prevent UI blocking.
+ * Handles both planar (XY grid) and radial (cylindrical) rasterization modes.
+ *
+ * MESSAGE PROTOCOL:
+ * ─────────────────
+ * Main Thread → Worker:
+ *   'init'                      - Initialize WebGPU device
+ *   'rasterize-planar'          - Rasterize geometry to XY grid
+ *   'generate-toolpath-planar'  - Generate planar toolpath from rasters
+ *   'radial-generate-toolpaths' - Generate radial toolpaths (does rasterization + toolpath)
+ *   'calibrate'                 - Run GPU workload calibration
+ *
+ * Worker → Main Thread:
+ *   'webgpu-ready'              - Initialization complete
+ *   'rasterize-complete'        - Planar rasterization complete
+ *   'rasterize-progress'        - Progress update (0-1)
+ *   'toolpath-complete'         - Planar toolpath complete
+ *   'toolpath-progress'         - Progress update (0-1)
+ *   'radial-toolpaths-complete' - Radial toolpaths complete
+ *   'calibrate-complete'        - Calibration results
+ *
+ * ARCHITECTURE:
+ * ─────────────
+ * 1. PLANAR MODE:
+ *    - Rasterize terrain: XY grid, keep max Z per cell
+ *    - Rasterize tool: XY grid, keep min Z per cell
+ *    - Generate toolpath: Scan tool over terrain, compute Z-heights
+ *
+ * 2. RADIAL MODE:
+ *    - Batched processing: 360 angles per batch
+ *    - X-bucketing: Spatial partitioning to reduce triangle tests
+ *    - For each angle:
+ *      * Cast ray from origin
+ *      * Rasterize terrain triangles along ray
+ *      * Calculate tool-terrain collision
+ *      * Output Z-heights along X-axis
+ *
+ * MEMORY MANAGEMENT:
+ * ──────────────────
+ * - All GPU buffers are preallocated to known maximum sizes
+ * - Triangle data transferred once per operation
+ * - Output buffers mapped asynchronously to avoid blocking
+ * - Worker maintains pipeline cache to avoid recompilation
+ *
+ * ═══════════════════════════════════════════════════════════════════════════
+ */
+
+import { initWebGPU, setConfig, updateConfig, deviceCapabilities, debug, device } from './raster-config.js';
+import { rasterizeMesh } from './raster-planar.js';
+import { generateToolpath } from './path-planar.js';
+import { generateRadialToolpaths } from './path-radial.js';
+import { calibrateGPU } from './workload-calibrate.js';
+
+// Global error handler for uncaught errors in worker
+self.addEventListener('error', (event) => {
+  debug.error('Uncaught error:', event.error || event.message);
+  debug.error('Stack:', event.error?.stack);
+});
+
+self.addEventListener('unhandledrejection', (event) => {
+  debug.error('Unhandled promise rejection:', event.reason);
+});
+
+// Handle messages from main thread
+self.onmessage = async function(e) {
+  const { type, data } = e.data;
+
+  try {
+    switch (type) {
+      case 'init':
+        // Store config
+        setConfig(data?.config || {
+          maxGPUMemoryMB: 256,
+          gpuMemorySafetyMargin: 0.8,
+          tileOverlapMM: 10,
+          autoTiling: true,
+          batchDivisor: 1, // For testing batching overhead: 1=optimal, 2=2x batches, 4=4x batches, etc.
+          maxConcurrentThreads: 32768 // GPU watchdog limit: max threads across all workgroups in a dispatch
+        });
+        const success = await initWebGPU();
+        self.postMessage({
+          type: 'webgpu-ready',
+          data: {
+            success,
+            capabilities: deviceCapabilities
+          }
+        });
+        break;
+
+      case 'update-config':
+        updateConfig(data.config);
+        debug.log('Config updated');
+        break;
+
+      case 'rasterize':
+        const { triangles, stepSize, filterMode, boundsOverride } = data;
+        const rasterOptions = boundsOverride || {};
+        const rasterResult = await rasterizeMesh(triangles, stepSize, filterMode, rasterOptions);
+        self.postMessage({
+          type: 'rasterize-complete',
+          data: rasterResult,
+        }, [rasterResult.positions.buffer]);
+        break;
+
+      case 'generate-toolpath':
+        const { terrainPositions, toolPositions, xStep, yStep, zFloor, gridStep, terrainBounds, singleScanline } = data;
+        const toolpathResult = await generateToolpath(
+          terrainPositions, toolPositions, xStep, yStep, zFloor, gridStep, terrainBounds, singleScanline
+        );
+        self.postMessage({
+          type: 'toolpath-complete',
+          data: toolpathResult
+        }, [toolpathResult.pathData.buffer]);
+        break;
+
+      case 'radial-generate-toolpaths':
+        const radialToolpathResult = await generateRadialToolpaths(data);
+        const toolpathTransferBuffers = radialToolpathResult.strips.map(strip => strip.pathData.buffer);
+        self.postMessage({
+          type: 'radial-toolpaths-complete',
+          data: radialToolpathResult
+        }, toolpathTransferBuffers);
+        break;
+
+      case 'calibrate':
+        const calibrationResult = await calibrateGPU(device, data?.options || {});
+        self.postMessage({
+          type: 'calibrate-complete',
+          data: calibrationResult
+        });
+        break;
+
+      default:
+        self.postMessage({
+          type: 'error',
+          message: 'Unknown message type: ' + type
+        });
+    }
+  } catch (error) {
+    debug.error('Error:', error);
+    self.postMessage({
+      type: 'error',
+      message: error.message,
+      stack: error.stack
+    });
+  }
+};
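A minimal main-thread sketch (not part of the diff) of driving this worker through the message protocol above. The worker path build/raster-worker.js, the module worker type, the triangle layout, and the 'max' filter mode are assumptions; the message names and payload fields come from the switch cases above.

const worker = new Worker('build/raster-worker.js', { type: 'module' }); // path and type assumed

// Triangle data in whatever layout raster-planar.js expects (assumed: x,y,z per vertex).
const triangles = new Float32Array([0, 0, 0, 10, 0, 0, 0, 10, 5]);

worker.onmessage = (e) => {
  const { type, data } = e.data;
  if (type === 'webgpu-ready') {
    worker.postMessage(
      { type: 'rasterize', data: { triangles, stepSize: 0.1, filterMode: 'max' } },
      [triangles.buffer] // transfer, matching how the worker transfers results back
    );
  } else if (type === 'rasterize-complete') {
    console.log('raster positions:', data.positions.length);
  } else if (type === 'error') {
    console.error('worker error:', e.data.message);
  }
};

// Initialize the WebGPU device first; config fields mirror the worker defaults.
worker.postMessage({ type: 'init', data: { config: { maxGPUMemoryMB: 256, autoTiling: true } } });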
package/src/core/workload-calibrate.js
@@ -0,0 +1,416 @@
+/**
+ * ═══════════════════════════════════════════════════════════════════════════
+ * Workload Calibrate - GPU Watchdog Limit Detection
+ * ═══════════════════════════════════════════════════════════════════════════
+ *
+ * Calibrates GPU capabilities by testing workgroup configurations until
+ * watchdog kills are detected. Uses actual ray-triangle intersection code
+ * to simulate real workload characteristics.
+ *
+ * EXPORTS:
+ * ────────
+ * Functions:
+ *   - calibrateGPU(device, options) - Run calibration and return limits
+ *
+ * DETECTION STRATEGY:
+ * ───────────────────
+ * Problem: Watchdog kills are SILENT - threads just stop executing
+ * Solution: Initialize output buffer to zeros, each thread writes 1 on completion
+ * Detection: Any zeros remaining = that thread was killed by watchdog
+ *
+ * TEST PARAMETERS:
+ * ────────────────
+ * 1. Workgroup dimensions (x, y, z)
+ *    - Start: 16x16x1 (256 threads)
+ *    - Increase: 32x32x1, 64x64x1, etc.
+ *
+ * 2. Work intensity (triangle_tests)
+ *    - Start: 1000 intersection tests per thread
+ *    - Increase: 2000, 5000, 10000, etc.
+ *
+ * BINARY SEARCH:
+ * ──────────────
+ * For each workgroup size:
+ *   1. Start with low work intensity (known to pass)
+ *   2. Binary search for max intensity before watchdog kill
+ *   3. Record (workgroup_size, max_intensity) pair
+ *
+ * OUTPUT:
+ * ───────
+ * {
+ *   maxWorkgroupSize: { x: 64, y: 64, z: 1 },  // Largest safe config
+ *   maxWorkPerThread: 50000,                   // Max intersection tests
+ *   safeWorkloadMatrix: [                      // Safe configs tested
+ *     { workgroupSize: [16,16,1], maxWork: 100000, timingMs: 45 },
+ *     { workgroupSize: [32,32,1], maxWork: 50000, timingMs: 123 },
+ *     ...
+ *   ],
+ *   deviceInfo: {
+ *     maxComputeWorkgroupSizeX: 256,
+ *     maxComputeWorkgroupsPerDimension: 65535,
+ *     ...
+ *   }
+ * }
+ *
+ * USAGE:
+ * ──────
+ * const limits = await calibrateGPU(device, {
+ *   minWorkgroupSize: [8, 8, 1],
+ *   maxWorkgroupSize: [64, 64, 1],
+ *   minWork: 1000,
+ *   maxWork: 100000,
+ * });
+ *
+ * ═══════════════════════════════════════════════════════════════════════════
+ */
+
+const calibrateShaderCode = 'SHADER:workload-calibrate';
+
+// Test multiple workgroup dispatches (simulates real-world usage)
+async function testWorkloadDispatch(device, pipeline, workgroupSize, triangleTests, dispatchCount) {
+  const [x, y, z] = workgroupSize;
+  const threadsPerWorkgroup = x * y * z;
+  const totalThreads = threadsPerWorkgroup * dispatchCount;
+
+  // Create completion flags buffer for ALL workgroups
+  const completionBuffer = device.createBuffer({
+    size: totalThreads * 4,
+    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
+  });
+
+  // Initialize to zeros
+  const zeroData = new Uint32Array(totalThreads);
+  device.queue.writeBuffer(completionBuffer, 0, zeroData);
+
+  // Create uniforms
+  const uniformData = new Uint32Array([x, y, z, triangleTests]);
+  const uniformBuffer = device.createBuffer({
+    size: uniformData.byteLength,
+    usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
+  });
+  device.queue.writeBuffer(uniformBuffer, 0, uniformData);
+
+  await device.queue.onSubmittedWorkDone();
+
+  // Create bind group
+  const bindGroup = device.createBindGroup({
+    layout: pipeline.getBindGroupLayout(0),
+    entries: [
+      { binding: 0, resource: { buffer: completionBuffer } },
+      { binding: 1, resource: { buffer: uniformBuffer } },
+    ],
+  });
+
+  // Dispatch multiple workgroups
+  const startTime = performance.now();
+  const commandEncoder = device.createCommandEncoder();
+  const passEncoder = commandEncoder.beginComputePass();
+  passEncoder.setPipeline(pipeline);
+  passEncoder.setBindGroup(0, bindGroup);
+
+  // Dispatch NxN workgroups (e.g., 10×10 = 100 workgroups)
+  const dispatchX = Math.ceil(Math.sqrt(dispatchCount));
+  const dispatchY = Math.ceil(dispatchCount / dispatchX);
+  passEncoder.dispatchWorkgroups(dispatchX, dispatchY, 1);
+  passEncoder.end();
+
+  // Readback
+  const stagingBuffer = device.createBuffer({
+    size: totalThreads * 4,
+    usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
+  });
+
+  commandEncoder.copyBufferToBuffer(completionBuffer, 0, stagingBuffer, 0, totalThreads * 4);
+  device.queue.submit([commandEncoder.finish()]);
+
+  await device.queue.onSubmittedWorkDone();
+  const elapsed = performance.now() - startTime;
+
+  await stagingBuffer.mapAsync(GPUMapMode.READ);
+  const completionData = new Uint32Array(stagingBuffer.getMappedRange());
+  const completionCopy = new Uint32Array(completionData);
+  stagingBuffer.unmap();
+
+  // Check for failures
+  let failedThreads = 0;
+  for (let i = 0; i < totalThreads; i++) {
+    if (completionCopy[i] === 0) {
+      failedThreads++;
+    }
+  }
+
+  // Cleanup
+  completionBuffer.destroy();
+  uniformBuffer.destroy();
+  stagingBuffer.destroy();
+
+  return {
+    success: failedThreads === 0,
+    failedThreads,
+    totalThreads,
+    dispatchCount,
+    elapsed,
+  };
+}
+
+// Test a specific workload configuration (single workgroup)
+async function testWorkload(device, pipeline, workgroupSize, triangleTests) {
+  const [x, y, z] = workgroupSize;
+  const totalThreads = x * y * z;
+
+  // Create completion flags buffer (initialized to zeros)
+  const completionBuffer = device.createBuffer({
+    size: totalThreads * 4, // u32 per thread
+    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
+  });
+
+  // Initialize to zeros (so we can detect threads that never completed)
+  const zeroData = new Uint32Array(totalThreads);
+  device.queue.writeBuffer(completionBuffer, 0, zeroData);
+
+  // Create uniforms
+  const uniformData = new Uint32Array([x, y, z, triangleTests]);
+  const uniformBuffer = device.createBuffer({
+    size: uniformData.byteLength,
+    usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
+  });
+  device.queue.writeBuffer(uniformBuffer, 0, uniformData);
+
+  // Wait for writes to complete
+  await device.queue.onSubmittedWorkDone();
+
+  // Create bind group
+  const bindGroup = device.createBindGroup({
+    layout: pipeline.getBindGroupLayout(0),
+    entries: [
+      { binding: 0, resource: { buffer: completionBuffer } },
+      { binding: 1, resource: { buffer: uniformBuffer } },
+    ],
+  });
+
+  // Dispatch compute shader
+  const startTime = performance.now();
+  const commandEncoder = device.createCommandEncoder();
+  const passEncoder = commandEncoder.beginComputePass();
+  passEncoder.setPipeline(pipeline);
+  passEncoder.setBindGroup(0, bindGroup);
+
+  // Dispatch exactly 1 workgroup (16x16x1 = 256 threads by default)
+  // The shader itself is parameterized with the workgroup size to test
+  passEncoder.dispatchWorkgroups(1, 1, 1);
+  passEncoder.end();
+
+  // Create staging buffer for readback
+  const stagingBuffer = device.createBuffer({
+    size: totalThreads * 4,
+    usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
+  });
+
+  commandEncoder.copyBufferToBuffer(completionBuffer, 0, stagingBuffer, 0, totalThreads * 4);
+  device.queue.submit([commandEncoder.finish()]);
+
+  // Wait for GPU to complete
+  await device.queue.onSubmittedWorkDone();
+  const elapsed = performance.now() - startTime;
+
+  // Read back completion flags
+  await stagingBuffer.mapAsync(GPUMapMode.READ);
+  const completionData = new Uint32Array(stagingBuffer.getMappedRange());
+  const completionCopy = new Uint32Array(completionData);
+  stagingBuffer.unmap();
+
+  // Check for failures (any zeros = thread didn't complete)
+  let failedThreads = 0;
+  for (let i = 0; i < totalThreads; i++) {
+    if (completionCopy[i] === 0) {
+      failedThreads++;
+    }
+  }
+
+  // Cleanup
+  completionBuffer.destroy();
+  uniformBuffer.destroy();
+  stagingBuffer.destroy();
+
+  return {
+    success: failedThreads === 0,
+    failedThreads,
+    totalThreads,
+    elapsed,
+  };
+}
+
+// Binary search for max work intensity at a given workgroup size
+async function findMaxWork(device, pipeline, workgroupSize, minWork, maxWork) {
+  let low = minWork;
+  let high = maxWork;
+  let lastSuccess = minWork;
+
+  while (low <= high) {
+    const mid = Math.floor((low + high) / 2);
+    const result = await testWorkload(device, pipeline, workgroupSize, mid);
+
+    if (result.success) {
+      lastSuccess = mid;
+      low = mid + 1;
+    } else {
+      high = mid - 1;
+    }
+  }
+
+  return lastSuccess;
+}
+
+// Calibrate dispatch count limits (how many workgroups can be queued)
+export async function calibrateDispatchLimits(device, options = {}) {
+  const {
+    workgroupSize = [16, 16, 1], // Use known-good size
+    triangleTests = 10000,       // Moderate work per thread
+    minDispatch = 1,
+    maxDispatch = 100000,        // Test up to 100k workgroups
+    verbose = true,
+  } = options;
+
+  const shaderModule = device.createShaderModule({ code: calibrateShaderCode });
+  const pipeline = device.createComputePipeline({
+    layout: 'auto',
+    compute: { module: shaderModule, entryPoint: 'main' },
+  });
+
+  if (verbose) {
+    console.log('[Calibrate] Testing dispatch count limits...');
+    console.log(`[Calibrate] Workgroup size: ${workgroupSize.join('x')}, work: ${triangleTests} tests/thread`);
+  }
+
+  // Binary search for max dispatch count
+  let low = minDispatch;
+  let high = maxDispatch;
+  let lastSuccess = minDispatch;
+  const results = [];
+
+  while (low <= high) {
+    const mid = Math.floor((low + high) / 2);
+
+    if (verbose) {
+      console.log(`[Calibrate] Testing ${mid} workgroups...`);
+    }
+
+    const result = await testWorkloadDispatch(device, pipeline, workgroupSize, triangleTests, mid);
+    results.push({ ...result, dispatchCount: mid });
+
+    if (result.success) {
+      lastSuccess = mid;
+      if (verbose) {
+        const totalThreads = result.totalThreads.toLocaleString();
+        const totalWork = (result.totalThreads * triangleTests).toLocaleString();
+        console.log(`[Calibrate] ✓ ${mid} workgroups OK (${totalThreads} threads, ${totalWork} total tests) in ${result.elapsed.toFixed(1)}ms`);
+      }
+      low = mid + 1;
+    } else {
+      if (verbose) {
+        console.log(`[Calibrate] ❌ ${mid} workgroups FAILED (${result.failedThreads}/${result.totalThreads} threads killed)`);
+      }
+      high = mid - 1;
+    }
+  }
+
+  if (verbose) {
+    console.log(`[Calibrate] Max safe dispatch count: ${lastSuccess} workgroups`);
+  }
+
+  return {
+    maxSafeDispatchCount: lastSuccess,
+    workgroupSize,
+    triangleTests,
+    results,
+  };
+}
+
+// Main calibration function
+export async function calibrateGPU(device, options = {}) {
+  const {
+    workgroupSizes = [
+      [8, 8, 1],
+      [16, 16, 1],
+      [32, 32, 1],
+      [64, 64, 1],
+    ],
+    minWork = 1000,
+    maxWork = 100000,
+    verbose = true,
+  } = options;
+
+  // Compile calibration shader
+  const shaderModule = device.createShaderModule({ code: calibrateShaderCode });
+  const pipeline = device.createComputePipeline({
+    layout: 'auto',
+    compute: { module: shaderModule, entryPoint: 'main' },
+  });
+
+  const results = [];
+
+  if (verbose) {
+    console.log('[Calibrate] Starting GPU calibration...');
+    console.log('[Calibrate] Testing workgroup sizes:', workgroupSizes);
+  }
+
+  for (const size of workgroupSizes) {
+    const [x, y, z] = size;
+    const totalThreads = x * y * z;
+
+    if (verbose) {
+      console.log(`[Calibrate] Testing ${x}x${y}x${z} (${totalThreads} threads)...`);
+    }
+
+    // First, verify minimal work succeeds
+    const minTest = await testWorkload(device, pipeline, size, minWork);
+    if (!minTest.success) {
+      if (verbose) {
+        console.log(`[Calibrate] ❌ Failed even at minimum work (${minWork} tests)`);
+      }
+      break; // This workgroup size is too large
+    }
+
+    // Binary search for maximum work
+    const maxWorkFound = await findMaxWork(device, pipeline, size, minWork, maxWork);
+    const finalTest = await testWorkload(device, pipeline, size, maxWorkFound);
+
+    results.push({
+      workgroupSize: size,
+      totalThreads,
+      maxWork: maxWorkFound,
+      timingMs: finalTest.elapsed,
+      msPerThread: finalTest.elapsed / totalThreads,
+      testsPerSecond: (maxWorkFound * totalThreads) / (finalTest.elapsed / 1000),
+    });
+
+    if (verbose) {
+      console.log(`[Calibrate] ✓ Max work: ${maxWorkFound} tests (${finalTest.elapsed.toFixed(1)}ms)`);
+      console.log(`[Calibrate]   ${(maxWorkFound * totalThreads).toLocaleString()} total ray-triangle tests`);
+    }
+  }
+
+  // Determine overall limits
+  const maxWorkgroupResult = results[results.length - 1];
+  const minWorkPerThread = Math.min(...results.map(r => r.maxWork));
+
+  const calibration = {
+    maxWorkgroupSize: maxWorkgroupResult.workgroupSize,
+    maxWorkPerThread: minWorkPerThread, // Conservative: min across all sizes
+    safeWorkloadMatrix: results,
+    deviceInfo: {
+      maxComputeWorkgroupSizeX: device.limits.maxComputeWorkgroupSizeX,
+      maxComputeWorkgroupSizeY: device.limits.maxComputeWorkgroupSizeY,
+      maxComputeWorkgroupSizeZ: device.limits.maxComputeWorkgroupSizeZ,
+      maxComputeWorkgroupsPerDimension: device.limits.maxComputeWorkgroupsPerDimension,
+    },
+  };
+
+  if (verbose) {
+    console.log('[Calibrate] Calibration complete:');
+    console.log(`[Calibrate]   Max safe workgroup: ${maxWorkgroupResult.workgroupSize.join('x')}`);
+    console.log(`[Calibrate]   Max work per thread: ${minWorkPerThread.toLocaleString()}`);
+  }
+
+  return calibration;
+}
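One way a caller might consume the calibration result; this is a sketch, not code from the package. It assumes the 'SHADER:workload-calibrate' placeholder has already been substituted by scripts/build-shaders.js (as it would be in the built worker), that the environment supports WebGPU, and that the import path is correct; the split policy at the end is illustrative only. Note that the code above returns maxWorkgroupSize as an array (e.g. [64, 64, 1]) even though the header comment sketches an { x, y, z } object.

// Import path is an assumption; adjust to however the package exposes the module.
import { calibrateGPU } from './src/core/workload-calibrate.js';

const adapter = await navigator.gpu.requestAdapter();
const device = await adapter.requestDevice();

const limits = await calibrateGPU(device, { minWork: 1000, maxWork: 100000, verbose: false });

const [wx, wy, wz] = limits.maxWorkgroupSize; // array form, per the code above
const threadsPerGroup = wx * wy * wz;

// Split a job of `totalTests` ray-triangle tests so no thread exceeds the measured
// per-thread budget. This batching helper is hypothetical, not part of the package.
function workgroupsNeeded(totalTests) {
  const testsPerGroup = limits.maxWorkPerThread * threadsPerGroup;
  return Math.ceil(totalTests / testsPerGroup);
}

console.log(workgroupsNeeded(5_000_000));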