@manycore/aholo-splat-transform 1.2.8 → 1.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. package/CHANGELOG.md +120 -113
  2. package/README.md +39 -39
  3. package/THIRD_PARTY_LICENSES.txt +1373 -1373
  4. package/bin/cli.js +125 -118
  5. package/dist/SplatData.d.ts +67 -67
  6. package/dist/SplatData.js +167 -150
  7. package/dist/constant.d.ts +3 -3
  8. package/dist/constant.js +13 -13
  9. package/dist/file/IFile.d.ts +5 -5
  10. package/dist/file/IFile.js +1 -1
  11. package/dist/file/esz.d.ts +11 -11
  12. package/dist/file/esz.js +337 -322
  13. package/dist/file/index.d.ts +8 -8
  14. package/dist/file/index.js +7 -7
  15. package/dist/file/ksplat.d.ts +12 -12
  16. package/dist/file/ksplat.js +293 -231
  17. package/dist/file/lcc.d.ts +11 -11
  18. package/dist/file/lcc.js +161 -158
  19. package/dist/file/ply.d.ts +13 -13
  20. package/dist/file/ply.js +439 -390
  21. package/dist/file/sog.d.ts +80 -80
  22. package/dist/file/sog.js +525 -494
  23. package/dist/file/splat.d.ts +6 -6
  24. package/dist/file/splat.js +119 -99
  25. package/dist/file/spz.d.ts +11 -11
  26. package/dist/file/spz.js +597 -583
  27. package/dist/file/voxel.d.ts +43 -37
  28. package/dist/file/voxel.js +411 -280
  29. package/dist/index.d.ts +33 -33
  30. package/dist/index.js +54 -54
  31. package/dist/native/index.d.ts +54 -54
  32. package/dist/native/index.js +122 -129
  33. package/dist/native/utils.d.ts +1 -0
  34. package/dist/native/utils.js +54 -0
  35. package/dist/tasks/AutoChunkLodTask.d.ts +13 -13
  36. package/dist/tasks/AutoChunkLodTask.js +117 -117
  37. package/dist/tasks/AutoLodTask.d.ts +10 -10
  38. package/dist/tasks/AutoLodTask.js +20 -20
  39. package/dist/tasks/BaseTask.d.ts +15 -15
  40. package/dist/tasks/BaseTask.js +5 -5
  41. package/dist/tasks/FlexLodTask.d.ts +12 -12
  42. package/dist/tasks/FlexLodTask.js +54 -44
  43. package/dist/tasks/ModifyTask.d.ts +9 -9
  44. package/dist/tasks/ModifyTask.js +166 -156
  45. package/dist/tasks/ReadTask.d.ts +9 -9
  46. package/dist/tasks/ReadTask.js +29 -29
  47. package/dist/tasks/SkeletonLodTask.d.ts +10 -10
  48. package/dist/tasks/SkeletonLodTask.js +176 -156
  49. package/dist/tasks/VoxelTask.d.ts +35 -30
  50. package/dist/tasks/VoxelTask.js +40 -37
  51. package/dist/tasks/WriteTask.d.ts +12 -12
  52. package/dist/tasks/WriteTask.js +70 -70
  53. package/dist/utils/BufferReader.d.ts +12 -12
  54. package/dist/utils/BufferReader.js +45 -45
  55. package/dist/utils/Logger.d.ts +11 -11
  56. package/dist/utils/Logger.js +40 -40
  57. package/dist/utils/StreamChunkDecoder.d.ts +16 -16
  58. package/dist/utils/StreamChunkDecoder.js +31 -31
  59. package/dist/utils/index.d.ts +27 -27
  60. package/dist/utils/index.js +101 -101
  61. package/dist/utils/k-means.d.ts +4 -4
  62. package/dist/utils/k-means.js +340 -341
  63. package/dist/utils/math.d.ts +46 -46
  64. package/dist/utils/math.js +350 -346
  65. package/dist/utils/quantize-1d.d.ts +4 -4
  66. package/dist/utils/quantize-1d.js +164 -164
  67. package/dist/utils/sh-rotate.d.ts +2 -2
  68. package/dist/utils/sh-rotate.js +236 -175
  69. package/dist/utils/splat.d.ts +21 -21
  70. package/dist/utils/splat.js +397 -387
  71. package/dist/utils/voxel/binary.d.ts +8 -0
  72. package/dist/utils/voxel/binary.js +176 -0
  73. package/dist/utils/voxel/common.d.ts +178 -162
  74. package/dist/utils/voxel/common.js +1752 -1682
  75. package/dist/utils/voxel/coplanar-merge.d.ts +63 -63
  76. package/dist/utils/voxel/coplanar-merge.js +818 -819
  77. package/dist/utils/voxel/filter-cluster.d.ts +20 -0
  78. package/dist/utils/voxel/filter-cluster.js +628 -0
  79. package/dist/utils/voxel/gpu-dilation.d.ts +2 -2
  80. package/dist/utils/voxel/gpu-dilation.js +677 -656
  81. package/dist/utils/voxel/marching-cubes.d.ts +42 -42
  82. package/dist/utils/voxel/marching-cubes.js +1645 -1657
  83. package/dist/utils/voxel/mesh.d.ts +3 -3
  84. package/dist/utils/voxel/mesh.js +130 -130
  85. package/dist/utils/voxel/nav.d.ts +29 -29
  86. package/dist/utils/voxel/nav.js +1068 -1043
  87. package/dist/utils/voxel/postprocess.d.ts +23 -23
  88. package/dist/utils/voxel/postprocess.js +408 -375
  89. package/dist/utils/voxel/voxel-faces.d.ts +18 -18
  90. package/dist/utils/voxel/voxel-faces.js +662 -663
  91. package/dist/utils/voxel/voxelize.d.ts +34 -33
  92. package/dist/utils/voxel/voxelize.js +1208 -1193
  93. package/dist/utils/webgpu.d.ts +8 -8
  94. package/dist/utils/webgpu.js +122 -122
  95. package/package.json +37 -39
  96. package/dist/native/cpp/bin/linux/binding.node +0 -0
  97. package/dist/native/cpp/bin/windows/binding.node +0 -0
@@ -1,1193 +1,1208 @@
1
- import { getOrCreateDevice } from '../webgpu.js';
2
- import { ALPHA_THRESHOLD, BlockMaskBuffer, GaussianBVH, LEAF_SIZE } from './common.js';
3
- import { availableParallelism } from 'node:os';
4
- import { Worker } from 'node:worker_threads';
5
- /** Per gaussian: increment overlap count for each coarse batch cell its AABB touches (GPU atomics). */
6
- const buildPerBatchCountsWgsl = () => /* wgsl */ `
7
- struct Uniforms {
8
- gridMinX: f32,
9
- gridMinY: f32,
10
- gridMinZ: f32,
11
- batchWorldSize: f32,
12
- numBatchX: u32,
13
- numBatchY: u32,
14
- numBatchZ: u32,
15
- gaussianCount: u32
16
- }
17
-
18
- struct Gaussian {
19
- posX: f32, posY: f32, posZ: f32, opacity: f32,
20
- rotW: f32, rotX: f32, rotY: f32, rotZ: f32,
21
- scaleX: f32, scaleY: f32, scaleZ: f32,
22
- extentX: f32, extentY: f32, extentZ: f32,
23
- _padding0: f32, _padding1: f32
24
- }
25
-
26
- @group(0) @binding(0) var<uniform> uniforms: Uniforms;
27
- @group(0) @binding(1) var<storage, read> allGaussians: array<Gaussian>;
28
- @group(0) @binding(2) var<storage, read_write> batchCounts: array<atomic<u32>>;
29
-
30
- @compute @workgroup_size(256)
31
- fn main(@builtin(global_invocation_id) global_id: vec3u) {
32
- let gaussianIdx = global_id.x;
33
- if (gaussianIdx >= uniforms.gaussianCount) { return; }
34
- let g = allGaussians[gaussianIdx];
35
- if (g.opacity <= 0.0) { return; }
36
- let gMinX = g.posX - g.extentX - uniforms.gridMinX;
37
- let gMinY = g.posY - g.extentY - uniforms.gridMinY;
38
- let gMinZ = g.posZ - g.extentZ - uniforms.gridMinZ;
39
- let gMaxX = g.posX + g.extentX - uniforms.gridMinX;
40
- let gMaxY = g.posY + g.extentY - uniforms.gridMinY;
41
- let gMaxZ = g.posZ + g.extentZ - uniforms.gridMinZ;
42
- let maxWorldX = uniforms.batchWorldSize * f32(uniforms.numBatchX);
43
- let maxWorldY = uniforms.batchWorldSize * f32(uniforms.numBatchY);
44
- let maxWorldZ = uniforms.batchWorldSize * f32(uniforms.numBatchZ);
45
- if (gMaxX < 0.0 || gMinX > maxWorldX || gMaxY < 0.0 || gMinY > maxWorldY || gMaxZ < 0.0 || gMinZ > maxWorldZ) { return; }
46
- let minBx = clamp(i32(floor(gMinX / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchX) - 1);
47
- let minBy = clamp(i32(floor(gMinY / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchY) - 1);
48
- let minBz = clamp(i32(floor(gMinZ / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchZ) - 1);
49
- let maxBx = clamp(i32(floor(gMaxX / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchX) - 1);
50
- let maxBy = clamp(i32(floor(gMaxY / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchY) - 1);
51
- let maxBz = clamp(i32(floor(gMaxZ / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchZ) - 1);
52
- for (var bz = minBz; bz <= maxBz; bz++) {
53
- for (var by = minBy; by <= maxBy; by++) {
54
- for (var bx = minBx; bx <= maxBx; bx++) {
55
- let batchId = u32(bz) * uniforms.numBatchX * uniforms.numBatchY + u32(by) * uniforms.numBatchX + u32(bx);
56
- atomicAdd(&batchCounts[batchId], 1u);
57
- }
58
- }
59
- }
60
- }
61
- `;
62
- /** Scatter gaussian indices into packed `indices` using prefix `batchOffsets` and per-batch atomic write heads. */
63
- const fillPerBatchCandidatesWgsl = () => /* wgsl */ `
64
- struct Uniforms {
65
- gridMinX: f32,
66
- gridMinY: f32,
67
- gridMinZ: f32,
68
- batchWorldSize: f32,
69
- numBatchX: u32,
70
- numBatchY: u32,
71
- numBatchZ: u32,
72
- gaussianCount: u32
73
- }
74
-
75
- struct Gaussian {
76
- posX: f32, posY: f32, posZ: f32, opacity: f32,
77
- rotW: f32, rotX: f32, rotY: f32, rotZ: f32,
78
- scaleX: f32, scaleY: f32, scaleZ: f32,
79
- extentX: f32, extentY: f32, extentZ: f32,
80
- _padding0: f32, _padding1: f32
81
- }
82
-
83
- @group(0) @binding(0) var<uniform> uniforms: Uniforms;
84
- @group(0) @binding(1) var<storage, read> allGaussians: array<Gaussian>;
85
- @group(0) @binding(2) var<storage, read> batchOffsets: array<u32>;
86
- @group(0) @binding(3) var<storage, read_write> batchWriteHeads: array<atomic<u32>>;
87
- @group(0) @binding(4) var<storage, read_write> indices: array<u32>;
88
-
89
- @compute @workgroup_size(256)
90
- fn main(@builtin(global_invocation_id) global_id: vec3u) {
91
- let gaussianIdx = global_id.x;
92
- if (gaussianIdx >= uniforms.gaussianCount) { return; }
93
- let g = allGaussians[gaussianIdx];
94
- if (g.opacity <= 0.0) { return; }
95
- let gMinX = g.posX - g.extentX - uniforms.gridMinX;
96
- let gMinY = g.posY - g.extentY - uniforms.gridMinY;
97
- let gMinZ = g.posZ - g.extentZ - uniforms.gridMinZ;
98
- let gMaxX = g.posX + g.extentX - uniforms.gridMinX;
99
- let gMaxY = g.posY + g.extentY - uniforms.gridMinY;
100
- let gMaxZ = g.posZ + g.extentZ - uniforms.gridMinZ;
101
- let maxWorldX = uniforms.batchWorldSize * f32(uniforms.numBatchX);
102
- let maxWorldY = uniforms.batchWorldSize * f32(uniforms.numBatchY);
103
- let maxWorldZ = uniforms.batchWorldSize * f32(uniforms.numBatchZ);
104
- if (gMaxX < 0.0 || gMinX > maxWorldX || gMaxY < 0.0 || gMinY > maxWorldY || gMaxZ < 0.0 || gMinZ > maxWorldZ) { return; }
105
- let minBx = clamp(i32(floor(gMinX / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchX) - 1);
106
- let minBy = clamp(i32(floor(gMinY / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchY) - 1);
107
- let minBz = clamp(i32(floor(gMinZ / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchZ) - 1);
108
- let maxBx = clamp(i32(floor(gMaxX / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchX) - 1);
109
- let maxBy = clamp(i32(floor(gMaxY / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchY) - 1);
110
- let maxBz = clamp(i32(floor(gMaxZ / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchZ) - 1);
111
- for (var bz = minBz; bz <= maxBz; bz++) {
112
- for (var by = minBy; by <= maxBy; by++) {
113
- for (var bx = minBx; bx <= maxBx; bx++) {
114
- let batchId = u32(bz) * uniforms.numBatchX * uniforms.numBatchY + u32(by) * uniforms.numBatchX + u32(bx);
115
- let local = atomicAdd(&batchWriteHeads[batchId], 1u);
116
- let dst = batchOffsets[batchId] + local;
117
- indices[dst] = gaussianIdx;
118
- }
119
- }
120
- }
121
- }
122
- `;
123
- /**
124
- * From https://github.com/playcanvas/splat-transform/blob/8f3b843efdc378f97d4f6a66a3a90a2de6d479a4/src/lib/gpu/gpu-voxelization.ts
125
- * WGSL shader for multi-batch voxelization of 4x4x4 blocks.
126
- *
127
- * Each workgroup processes one block in one batch.
128
- * - workgroup_id.z = batch index
129
- * - workgroup_id.x = flat block index within the batch
130
- * Per-batch metadata (index range, block origin, dimensions) comes from a storage buffer,
131
- * allowing many batches to be dispatched in a single GPU call.
132
- */
133
- const voxelizeMultiBatchWgsl = () => /* wgsl */ `
134
- struct Uniforms {
135
- opacityCutoff: f32,
136
- voxelResolution: f32,
137
- maxBlocksPerBatch: u32
138
- }
139
-
140
- struct BatchInfo {
141
- indexOffset: u32,
142
- indexCount: u32,
143
- numBlocksX: u32,
144
- numBlocksY: u32,
145
- numBlocksZ: u32,
146
- blockMinX: f32,
147
- blockMinY: f32,
148
- blockMinZ: f32
149
- }
150
-
151
- struct Gaussian {
152
- posX: f32,
153
- posY: f32,
154
- posZ: f32,
155
- opacity: f32,
156
- rotW: f32,
157
- rotX: f32,
158
- rotY: f32,
159
- rotZ: f32,
160
- scaleX: f32,
161
- scaleY: f32,
162
- scaleZ: f32,
163
- extentX: f32,
164
- extentY: f32,
165
- extentZ: f32,
166
- _padding0: f32,
167
- _padding1: f32
168
- }
169
-
170
- @group(0) @binding(0) var<uniform> uniforms: Uniforms;
171
- @group(0) @binding(1) var<storage, read> allGaussians: array<Gaussian>;
172
- @group(0) @binding(2) var<storage, read> indices: array<u32>;
173
- @group(0) @binding(3) var<storage, read_write> results: array<atomic<u32>>;
174
- @group(0) @binding(4) var<storage, read> batchInfos: array<BatchInfo>;
175
-
176
- // Shared memory for cooperative Gaussian loading.
177
- // All 64 threads in a workgroup load one Gaussian each, then all threads
178
- // evaluate against the shared chunk (reducing global memory reads by 64x).
179
- // 64 Gaussians * 64 bytes each = 4 KB (well within 16 KB WebGPU minimum).
180
- const tileSize = 64u;
181
- var<workgroup> sharedGaussians: array<Gaussian, tileSize>;
182
- var<workgroup> blockMasks: array<atomic<u32>, 2>;
183
-
184
- fn mortonToXYZ(m: u32) -> vec3u {
185
- return vec3u(
186
- (m & 1u) | ((m >> 2u) & 2u),
187
- ((m >> 1u) & 1u) | ((m >> 3u) & 2u),
188
- ((m >> 2u) & 1u) | ((m >> 4u) & 2u)
189
- );
190
- }
191
-
192
- fn evaluateGaussianForVoxel(voxelCenter: vec3f, voxelHalfSize: f32, g: Gaussian) -> f32 {
193
- let gaussianCenter = vec3f(g.posX, g.posY, g.posZ);
194
- let diff = voxelCenter - gaussianCenter;
195
- // Use pre-computed world-space AABB half-extents (3-sigma, accounts for rotation)
196
- let extent = vec3f(g.extentX, g.extentY, g.extentZ);
197
- // Per-axis AABB overlap check
198
- if (any(abs(diff) > (extent + voxelHalfSize))) {
199
- return 0.0;
200
- }
201
- // Find closest point in voxel to Gaussian center
202
- let closestPoint = clamp(gaussianCenter, voxelCenter - voxelHalfSize, voxelCenter + voxelHalfSize);
203
- let closestDiff = closestPoint - gaussianCenter;
204
- // Inverse rotation using cross-product formula (Rodrigues rotation)
205
- // For inverse: negate xyz components of quaternion
206
- let qxyz = vec3f(-g.rotX, -g.rotY, -g.rotZ);
207
- let t = 2.0 * cross(qxyz, closestDiff);
208
- let localDiff = closestDiff + g.rotW * t + cross(qxyz, t);
209
- // Calculate Mahalanobis distance squared
210
- let invScale = vec3f(1.0 / max(g.scaleX, 1e-8), 1.0 / max(g.scaleY, 1e-8), 1.0 / max(g.scaleZ, 1e-8));
211
- let scaled = localDiff * invScale;
212
- let d2 = dot(scaled, scaled);
213
- return g.opacity * exp(-0.5 * d2);
214
- }
215
-
216
- @compute @workgroup_size(64)
217
- fn main(
218
- @builtin(local_invocation_index) local_invocation_index: u32,
219
- @builtin(workgroup_id) workgroup_id: vec3u
220
- ) {
221
- let batchIdx = workgroup_id.z;
222
- let flatBlockId = workgroup_id.x;
223
- let info = batchInfos[batchIdx];
224
- // Skip padded workgroups beyond the batch's actual block count
225
- let totalBlocks = info.numBlocksX * info.numBlocksY * info.numBlocksZ;
226
- if (flatBlockId >= totalBlocks) { return; }
227
-
228
- // Decompose flat block ID to 3D coordinates within the batch
229
- let blockX = flatBlockId % info.numBlocksX;
230
- let blockY = (flatBlockId / info.numBlocksX) % info.numBlocksY;
231
- let blockZ = flatBlockId / (info.numBlocksX * info.numBlocksY);
232
- let localPos = mortonToXYZ(local_invocation_index);
233
-
234
- let blockMin = vec3f(info.blockMinX, info.blockMinY, info.blockMinZ);
235
- let blockOffset = vec3f(f32(blockX), f32(blockY), f32(blockZ)) * 4.0 * uniforms.voxelResolution;
236
- let voxelCenter = blockMin + blockOffset + (vec3f(localPos) + 0.5) * uniforms.voxelResolution;
237
- let voxelHalfSize = uniforms.voxelResolution * 0.5;
238
- if (local_invocation_index < 2u) {
239
- atomicStore(&blockMasks[local_invocation_index], 0u);
240
- }
241
- workgroupBarrier();
242
-
243
- var totalSigma = 0.0;
244
- let numIndices = info.indexCount;
245
- let numTiles = (numIndices + tileSize - 1u) / tileSize;
246
- for (var tile = 0u; tile < numTiles; tile++) {
247
- // Cooperative load: each thread loads one Gaussian into shared memory
248
- let loadIdx = tile * tileSize + local_invocation_index;
249
- if (loadIdx < numIndices) {
250
- let gaussianIdx = indices[info.indexOffset + loadIdx];
251
- sharedGaussians[local_invocation_index] = allGaussians[gaussianIdx];
252
- }
253
- // Wait for all threads to finish loading the tile
254
- workgroupBarrier();
255
-
256
- if (totalSigma < 7.0) {
257
- let thisTileSize = min(tileSize, numIndices - tile * tileSize);
258
- for (var c = 0u; c < thisTileSize; c++) {
259
- totalSigma += evaluateGaussianForVoxel(voxelCenter, voxelHalfSize, sharedGaussians[c]);
260
- if (totalSigma >= 7.0) { break; }
261
- }
262
- }
263
- // Wait before next tile overwrites shared memory
264
- workgroupBarrier();
265
- }
266
-
267
- // Convert accumulated density to opacity using Beer-Lambert law
268
- let finalOpacity = 1.0 - exp(-totalSigma);
269
- let isSolid = finalOpacity >= uniforms.opacityCutoff;
270
- // Accumulate block bits in workgroup-local atomics first to reduce global atomic contention.
271
- if (isSolid) {
272
- let linearIdx = localPos.z * 16u + localPos.y * 4u + localPos.x;
273
- atomicOr(&blockMasks[linearIdx >> 5u], 1u << (linearIdx & 31u));
274
- }
275
- workgroupBarrier();
276
- if (local_invocation_index < 2u) {
277
- let batchResultBase = batchIdx * uniforms.maxBlocksPerBatch * 2u;
278
- let wordIndex = batchResultBase + flatBlockId * 2u + local_invocation_index;
279
- atomicStore(&results[wordIndex], atomicLoad(&blockMasks[local_invocation_index]));
280
- }
281
- }
282
- `;
283
- const GPU_BUFFER_USAGE_STORAGE = 128;
284
- const GPU_BUFFER_USAGE_COPY_DST = 8;
285
- const GPU_BUFFER_USAGE_COPY_SRC = 4;
286
- const GPU_BUFFER_USAGE_UNIFORM = 64;
287
- const GPU_BUFFER_USAGE_MAP_READ = 1;
288
- const GPU_MAP_MODE_READ = 1;
289
- /**
290
- * CPU voxelization fallback (simplified path).
291
- * Iterates candidate gaussians per batch and writes occupied voxel bits directly.
292
- */
293
- const CPU_VOXEL_PARALLEL_MIN_GAUSSIANS = 0;
294
- const parsePositiveInteger = (value) => {
295
- if (typeof value === 'number') {
296
- return Number.isFinite(value) && value > 0 ? Math.floor(value) : undefined;
297
- }
298
- if (typeof value !== 'string' || value.trim() === '') {
299
- return undefined;
300
- }
301
- const parsed = Number(value);
302
- return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : undefined;
303
- };
304
- const resolveCpuVoxelWorkerCount = (override) => {
305
- if (override !== undefined) {
306
- if (override === -1) {
307
- return Math.max(1, availableParallelism() - 1);
308
- }
309
- return parsePositiveInteger(override) ?? Math.max(1, availableParallelism() - 1);
310
- }
311
- return parsePositiveInteger(process.env.SPLAT_CPU_VOXEL_WORKERS) ??
312
- parsePositiveInteger(process.env.CPU_VOXEL_WORKERS) ??
313
- Math.max(1, availableParallelism() - 1);
314
- };
315
- const cpuVoxelizeWorkerScript = `
316
- const { parentPort, workerData } = require('node:worker_threads');
317
- const {
318
- voxelResolution, opacityCutoff, alphaThreshold, gridMinX, gridMinY, gridMinZ,
319
- nBlockX, nBlockY, nBlockXY,
320
- xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents
321
- } = workerData;
322
- const x = new Float32Array(xCol);
323
- const y = new Float32Array(yCol);
324
- const z = new Float32Array(zCol);
325
- const sx = new Float32Array(sxCol);
326
- const sy = new Float32Array(syCol);
327
- const sz = new Float32Array(szCol);
328
- const qx = new Float32Array(qxCol);
329
- const qy = new Float32Array(qyCol);
330
- const qz = new Float32Array(qzCol);
331
- const qw = new Float32Array(qwCol);
332
- const a = new Float32Array(aCol);
333
- const ext = new Float32Array(extents);
334
- const half = voxelResolution * 0.5;
335
- const sigmaCutoff = opacityCutoff <= 0 ? 0 : -Math.log1p(-Math.min(opacityCutoff, 1 - 1e-8));
336
- const SPEC_STRIDE = 8;
337
- const BATCH_BLOCK_SIZE = 4;
338
- const BATCH_VOXEL_SIZE = BATCH_BLOCK_SIZE * 4;
339
- const MAX_BATCH_VOXELS = BATCH_VOXEL_SIZE * BATCH_VOXEL_SIZE * BATCH_VOXEL_SIZE;
340
- const MAX_BATCH_BLOCKS = BATCH_BLOCK_SIZE * BATCH_BLOCK_SIZE * BATCH_BLOCK_SIZE;
341
- const sigmaBuffer = new Float32Array(MAX_BATCH_VOXELS);
342
- const solidBuffer = new Uint8Array(MAX_BATCH_VOXELS);
343
- const masksLoBuffer = new Uint32Array(MAX_BATCH_BLOCKS);
344
- const masksHiBuffer = new Uint32Array(MAX_BATCH_BLOCKS);
345
- const runBatchSet = (batchSpecs, candidateIndices) => {
346
- const specs = new Uint32Array(batchSpecs);
347
- const candidates = new Uint32Array(candidateIndices);
348
- const packedBlocks = [];
349
- const batchCount = specs.length / SPEC_STRIDE;
350
- for (let specIdx = 0; specIdx < batchCount; specIdx++) {
351
- const specBase = specIdx * SPEC_STRIDE;
352
- const batchBlockX = specs[specBase + 0];
353
- const batchBlockY = specs[specBase + 1];
354
- const batchBlockZ = specs[specBase + 2];
355
- const numBlocksX = specs[specBase + 3];
356
- const numBlocksY = specs[specBase + 4];
357
- const numBlocksZ = specs[specBase + 5];
358
- const indexOffset = specs[specBase + 6];
359
- const indexCount = specs[specBase + 7];
360
- const numVoxelsX = numBlocksX * 4;
361
- const numVoxelsY = numBlocksY * 4;
362
- const numVoxelsZ = numBlocksZ * 4;
363
- const totalVoxels = numVoxelsX * numVoxelsY * numVoxelsZ;
364
- const totalBlocks = numBlocksX * numBlocksY * numBlocksZ;
365
- sigmaBuffer.fill(0, 0, totalVoxels);
366
- solidBuffer.fill(0, 0, totalVoxels);
367
- masksLoBuffer.fill(0, 0, totalBlocks);
368
- masksHiBuffer.fill(0, 0, totalBlocks);
369
- const batchMinVoxelX = batchBlockX * 4;
370
- const batchMinVoxelY = batchBlockY * 4;
371
- const batchMinVoxelZ = batchBlockZ * 4;
372
- const batchMaxVoxelX = batchMinVoxelX + numVoxelsX - 1;
373
- const batchMaxVoxelY = batchMinVoxelY + numVoxelsY - 1;
374
- const batchMaxVoxelZ = batchMinVoxelZ + numVoxelsZ - 1;
375
- for (let c = 0; c < indexCount; c++) {
376
- const i = candidates[indexOffset + c];
377
- const xi = x[i];
378
- const yi = y[i];
379
- const zi = z[i];
380
- const opacity = a[i];
381
- if (opacity <= 0) continue;
382
- const maxContributionD2 = alphaThreshold <= 0 ? Infinity : (opacity <= alphaThreshold ? 0 : -2 * Math.log(alphaThreshold / opacity));
383
- if (maxContributionD2 <= 0) continue;
384
- const ex = ext[i * 3];
385
- const ey = ext[i * 3 + 1];
386
- const ez = ext[i * 3 + 2];
387
- const minIx = Math.max(batchMinVoxelX, Math.floor((xi - ex - gridMinX) / voxelResolution));
388
- const minIy = Math.max(batchMinVoxelY, Math.floor((yi - ey - gridMinY) / voxelResolution));
389
- const minIz = Math.max(batchMinVoxelZ, Math.floor((zi - ez - gridMinZ) / voxelResolution));
390
- const maxIx = Math.min(batchMaxVoxelX, Math.ceil((xi + ex - gridMinX) / voxelResolution));
391
- const maxIy = Math.min(batchMaxVoxelY, Math.ceil((yi + ey - gridMinY) / voxelResolution));
392
- const maxIz = Math.min(batchMaxVoxelZ, Math.ceil((zi + ez - gridMinZ) / voxelResolution));
393
- if (minIx > maxIx || minIy > maxIy || minIz > maxIz) continue;
394
- const iqx = -qx[i], iqy = -qy[i], iqz = -qz[i], iqw = qw[i];
395
- const isx = sx[i] > 1e-8 ? 1 / sx[i] : 1e8;
396
- const isy = sy[i] > 1e-8 ? 1 / sy[i] : 1e8;
397
- const isz = sz[i] > 1e-8 ? 1 / sz[i] : 1e8;
398
- for (let iz = minIz; iz <= maxIz; iz++) {
399
- const localZ = iz - batchMinVoxelZ;
400
- const vz = gridMinZ + (iz + 0.5) * voxelResolution;
401
- for (let iy = minIy; iy <= maxIy; iy++) {
402
- const localY = iy - batchMinVoxelY;
403
- const vy = gridMinY + (iy + 0.5) * voxelResolution;
404
- for (let ix = minIx; ix <= maxIx; ix++) {
405
- const localX = ix - batchMinVoxelX;
406
- const localIndex = localX + localY * numVoxelsX + localZ * numVoxelsX * numVoxelsY;
407
- if (solidBuffer[localIndex]) continue;
408
- const vx = gridMinX + (ix + 0.5) * voxelResolution;
409
- const px = Math.min(Math.max(xi, vx - half), vx + half);
410
- const py = Math.min(Math.max(yi, vy - half), vy + half);
411
- const pz = Math.min(Math.max(zi, vz - half), vz + half);
412
- const dx = px - xi;
413
- const dy = py - yi;
414
- const dz = pz - zi;
415
- const tx = 2 * (iqy * dz - iqz * dy);
416
- const ty = 2 * (iqz * dx - iqx * dz);
417
- const tz = 2 * (iqx * dy - iqy * dx);
418
- const lx = dx + iqw * tx + (iqy * tz - iqz * ty);
419
- const ly = dy + iqw * ty + (iqz * tx - iqx * tz);
420
- const lz = dz + iqw * tz + (iqx * ty - iqy * tx);
421
- const sxv = lx * isx;
422
- const syv = ly * isy;
423
- const szv = lz * isz;
424
- const d2 = sxv * sxv + syv * syv + szv * szv;
425
- if (d2 > maxContributionD2) continue;
426
- const contribution = opacity * Math.exp(-0.5 * d2);
427
- if (contribution <= 0) continue;
428
- const total = sigmaBuffer[localIndex] + contribution;
429
- sigmaBuffer[localIndex] = total;
430
- if (total < sigmaCutoff) continue;
431
- solidBuffer[localIndex] = 1;
432
- const localBlockX = localX >> 2;
433
- const localBlockY = localY >> 2;
434
- const localBlockZ = localZ >> 2;
435
- const localBlock = localBlockX + localBlockY * numBlocksX + localBlockZ * numBlocksX * numBlocksY;
436
- const bitIdx = (localX & 3) + ((localY & 3) << 2) + ((localZ & 3) << 4);
437
- if (bitIdx < 32) masksLoBuffer[localBlock] = (masksLoBuffer[localBlock] | (1 << bitIdx)) >>> 0;
438
- else masksHiBuffer[localBlock] = (masksHiBuffer[localBlock] | (1 << (bitIdx - 32))) >>> 0;
439
- }
440
- }
441
- }
442
- }
443
- for (let localBlock = 0; localBlock < totalBlocks; localBlock++) {
444
- const lo = masksLoBuffer[localBlock];
445
- const hi = masksHiBuffer[localBlock];
446
- if ((lo | hi) === 0) continue;
447
- const localBlockX = localBlock % numBlocksX;
448
- const localBlockY = ((localBlock / numBlocksX) | 0) % numBlocksY;
449
- const localBlockZ = (localBlock / (numBlocksX * numBlocksY)) | 0;
450
- const blockIdx = (batchBlockX + localBlockX) + (batchBlockY + localBlockY) * nBlockX + (batchBlockZ + localBlockZ) * nBlockXY;
451
- packedBlocks.push(blockIdx >>> 0, lo >>> 0, hi >>> 0);
452
- }
453
- }
454
- const packed = new Uint32Array(packedBlocks.length);
455
- packed.set(packedBlocks);
456
- return packed.buffer;
457
- };
458
- parentPort.on('message', (msg) => {
459
- if (!msg || typeof msg !== 'object') return;
460
- if (msg.type === 'shutdown') {
461
- process.exit(0);
462
- return;
463
- }
464
- if (msg.type !== 'run') return;
465
- const taskId = msg.taskId;
466
- const packed = runBatchSet(msg.batchSpecs, msg.candidateIndices);
467
- parentPort.postMessage({ taskId, packed }, [packed]);
468
- });
469
- `;
470
- const toSharedFloat32 = (src) => {
471
- const sab = new SharedArrayBuffer(src.byteLength);
472
- new Float32Array(sab).set(src);
473
- return sab;
474
- };
475
- const cpuVoxelizeSingleThread = (xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents, gridBounds, voxelResolution, opacityCutoff) => {
476
- const nx = Math.max(4, Math.round((gridBounds.max.x - gridBounds.min.x) / voxelResolution));
477
- const ny = Math.max(4, Math.round((gridBounds.max.y - gridBounds.min.y) / voxelResolution));
478
- const nz = Math.max(4, Math.round((gridBounds.max.z - gridBounds.min.z) / voxelResolution));
479
- const gridMinX = gridBounds.min.x;
480
- const gridMinY = gridBounds.min.y;
481
- const gridMinZ = gridBounds.min.z;
482
- const cullMinX = gridBounds.min.x;
483
- const cullMinY = gridBounds.min.y;
484
- const cullMinZ = gridBounds.min.z;
485
- const cullMaxX = gridBounds.max.x;
486
- const cullMaxY = gridBounds.max.y;
487
- const cullMaxZ = gridBounds.max.z;
488
- const half = voxelResolution * 0.5;
489
- const nBlockX = (nx + 3) >> 2;
490
- const nBlockY = (ny + 3) >> 2;
491
- const nBlockXY = nBlockX * nBlockY;
492
- const opacityThreshold = Math.min(Math.max(opacityCutoff, 0), 1);
493
- const blockMasks = {};
494
- for (let i = 0; i < xCol.length; i++) {
495
- const xi = xCol[i];
496
- const yi = yCol[i];
497
- const zi = zCol[i];
498
- const opacity = aCol[i];
499
- if (opacity <= 0) {
500
- continue;
501
- }
502
- if (xi < cullMinX || xi > cullMaxX ||
503
- yi < cullMinY || yi > cullMaxY ||
504
- zi < cullMinZ || zi > cullMaxZ) {
505
- continue;
506
- }
507
- const maxD2 = opacityThreshold <= 0 ? Infinity : (opacity <= opacityThreshold ? 0 : -2 * Math.log(opacityThreshold / opacity));
508
- if (maxD2 <= 0) {
509
- continue;
510
- }
511
- const ex = extents[i * 3];
512
- const ey = extents[i * 3 + 1];
513
- const ez = extents[i * 3 + 2];
514
- const minIx = Math.max(0, Math.floor((xi - ex - gridMinX) / voxelResolution));
515
- const minIy = Math.max(0, Math.floor((yi - ey - gridMinY) / voxelResolution));
516
- const minIz = Math.max(0, Math.floor((zi - ez - gridMinZ) / voxelResolution));
517
- const maxIx = Math.min(nx - 1, Math.ceil((xi + ex - gridMinX) / voxelResolution));
518
- const maxIy = Math.min(ny - 1, Math.ceil((yi + ey - gridMinY) / voxelResolution));
519
- const maxIz = Math.min(nz - 1, Math.ceil((zi + ez - gridMinZ) / voxelResolution));
520
- if (minIx > maxIx || minIy > maxIy || minIz > maxIz) {
521
- continue;
522
- }
523
- const qx = qxCol[i];
524
- const qy = qyCol[i];
525
- const qz = qzCol[i];
526
- const qw = qwCol[i];
527
- // Input quaternions are already normalized.
528
- const iqx = -qx;
529
- const iqy = -qy;
530
- const iqz = -qz;
531
- const iqw = qw;
532
- const isx = sxCol[i] > 1e-8 ? 1 / sxCol[i] : 1e8;
533
- const isy = syCol[i] > 1e-8 ? 1 / syCol[i] : 1e8;
534
- const isz = szCol[i] > 1e-8 ? 1 / szCol[i] : 1e8;
535
- for (let iz = minIz; iz <= maxIz; iz++) {
536
- const vz = gridMinZ + (iz + 0.5) * voxelResolution;
537
- for (let iy = minIy; iy <= maxIy; iy++) {
538
- const vy = gridMinY + (iy + 0.5) * voxelResolution;
539
- for (let ix = minIx; ix <= maxIx; ix++) {
540
- const vx = gridMinX + (ix + 0.5) * voxelResolution;
541
- const px = Math.min(Math.max(xi, vx - half), vx + half);
542
- const py = Math.min(Math.max(yi, vy - half), vy + half);
543
- const pz = Math.min(Math.max(zi, vz - half), vz + half);
544
- const dx = px - xi;
545
- const dy = py - yi;
546
- const dz = pz - zi;
547
- const tx = 2 * (iqy * dz - iqz * dy);
548
- const ty = 2 * (iqz * dx - iqx * dz);
549
- const tz = 2 * (iqx * dy - iqy * dx);
550
- const lx = dx + iqw * tx + (iqy * tz - iqz * ty);
551
- const ly = dy + iqw * ty + (iqz * tx - iqx * tz);
552
- const lz = dz + iqw * tz + (iqx * ty - iqy * tx);
553
- const sxv = lx * isx;
554
- const syv = ly * isy;
555
- const szv = lz * isz;
556
- const d2 = sxv * sxv + syv * syv + szv * szv;
557
- if (d2 > maxD2) {
558
- continue;
559
- }
560
- const blockX = ix >> 2;
561
- const blockY = iy >> 2;
562
- const blockZ = iz >> 2;
563
- const blockLinear = blockX + blockY * nBlockX + blockZ * nBlockXY;
564
- const bitIdx = (ix & 3) + ((iy & 3) << 2) + ((iz & 3) << 4);
565
- const curr = blockMasks[blockLinear] ?? [0, 0];
566
- if (bitIdx < 32) {
567
- curr[0] = (curr[0] | (1 << bitIdx)) >>> 0;
568
- }
569
- else {
570
- curr[1] = (curr[1] | (1 << (bitIdx - 32))) >>> 0;
571
- }
572
- blockMasks[blockLinear] = curr;
573
- }
574
- }
575
- }
576
- }
577
- const output = new BlockMaskBuffer();
578
- for (const [blockLinearRaw, [lo, hi]] of Object.entries(blockMasks)) {
579
- const blockLinear = Number(blockLinearRaw);
580
- output.addBlock(blockLinear, lo, hi);
581
- }
582
- return output;
583
- };
584
/**
 * Voxelizes gaussians on the CPU and returns the occupied voxels as a `BlockMaskBuffer`
 * (one 64-bit mask, split into lo/hi u32 halves, per 4x4x4 voxel block).
 *
 * Inputs with fewer than `CPU_VOXEL_PARALLEL_MIN_GAUSSIANS` gaussians are processed on the
 * calling thread. Larger inputs are split into batches of up to 4x4x4 blocks; for each batch the
 * BVH supplies the candidate gaussians, several batches are packed into one task, and tasks are
 * distributed over a pool of worker threads. Each worker answers with a packed `Uint32Array`
 * buffer of `(blockLinear, lo, hi)` triples.
 *
 * If anything in the parallel path throws (worker threads unavailable, a worker crashes, shared
 * memory cannot be allocated), every worker that was started is terminated and the whole job is
 * redone with `cpuVoxelizeSingleThread`.
 *
 * @param xCol,yCol,zCol Per-gaussian positions.
 * @param sxCol,syCol,szCol Per-gaussian scales.
 * @param qxCol,qyCol,qzCol,qwCol Per-gaussian rotation quaternion components.
 * @param aCol Per-gaussian opacity column.
 * @param extents Per-gaussian extents, handed to the BVH and to the workers.
 * @param gridBounds Grid bounds with `min`/`max` points (`x`, `y`, `z`).
 * @param voxelResolution Edge length of one voxel.
 * @param opacityCutoff Opacity cutoff forwarded to the voxelization kernel.
 * @param options Optional settings; `options.workerCount` is passed to `resolveCpuVoxelWorkerCount`.
 * @returns The populated `BlockMaskBuffer`.
 */
export const cpuVoxelize = async (xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents, gridBounds, voxelResolution, opacityCutoff, options) => {
    const runSingleThread = () => cpuVoxelizeSingleThread(xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents, gridBounds, voxelResolution, opacityCutoff);
    if (xCol.length < CPU_VOXEL_PARALLEL_MIN_GAUSSIANS) {
        return runSingleThread();
    }
    if (xCol.length === 0) {
        return new BlockMaskBuffer();
    }
    const nx = Math.max(4, Math.round((gridBounds.max.x - gridBounds.min.x) / voxelResolution));
    const ny = Math.max(4, Math.round((gridBounds.max.y - gridBounds.min.y) / voxelResolution));
    const nz = Math.max(4, Math.round((gridBounds.max.z - gridBounds.min.z) / voxelResolution));
    const gridMinX = gridBounds.min.x;
    const gridMinY = gridBounds.min.y;
    const gridMinZ = gridBounds.min.z;
    // Always start at least one worker: with an empty pool `Promise.race([])` would never settle.
    const requestedWorkers = resolveCpuVoxelWorkerCount(options?.workerCount);
    const workers = requestedWorkers >= 1 ? Math.min(Math.floor(requestedWorkers), xCol.length) : 1;
    const nBlockX = (nx + 3) >> 2;
    const nBlockY = (ny + 3) >> 2;
    const nBlockXY = nBlockX * nBlockY;
    const totalBlockZ = Math.max(1, (nz + 3) >> 2);
    const batchBlockSize = 4;
    const numBatchX = Math.ceil(nBlockX / batchBlockSize);
    const numBatchY = Math.ceil(nBlockY / batchBlockSize);
    const numBatchZ = Math.ceil(totalBlockZ / batchBlockSize);
    const bvh = new GaussianBVH(xCol, yCol, zCol, extents);
    // The pool lives outside the `try` so the failure path can still reach (and stop) every
    // worker that was started, including the ones created before a later constructor threw.
    const pool = [];
    const terminatePool = () => Promise.allSettled(pool.map((slot) => slot.worker.terminate()));
    let nextTaskId = 1;
    /** Starts one worker and returns it together with a single-flight `runTask` function. */
    const createSlot = (slotId, shared) => {
        const worker = new Worker(cpuVoxelizeWorkerScript, {
            eval: true,
            workerData: {
                workerId: slotId,
                voxelResolution,
                opacityCutoff,
                alphaThreshold: ALPHA_THRESHOLD,
                gridMinX,
                gridMinY,
                gridMinZ,
                nBlockX,
                nBlockY,
                nBlockXY,
                ...shared
            }
        });
        let currentResolve;
        let currentReject;
        const clearPending = () => {
            currentResolve = undefined;
            currentReject = undefined;
        };
        worker.on('message', (message) => {
            if (!currentResolve) {
                return;
            }
            const resolve = currentResolve;
            clearPending();
            // Workers may answer with either `{ packed }` or the packed buffer itself.
            const packed = message && typeof message === 'object' && 'packed' in message ? message.packed : message;
            resolve({ packed });
        });
        worker.on('error', (error) => {
            currentReject?.(error);
            clearPending();
        });
        worker.on('exit', (code) => {
            // Any exit while a task is in flight (even a "clean" code 0) means the result will
            // never arrive; reject so the caller falls back instead of waiting forever.
            currentReject?.(new Error(`cpu voxel worker exited with code ${code}`));
            clearPending();
        });
        const runTask = (batchSpecs, candidateIndices) => new Promise((resolve, reject) => {
            if (currentResolve) {
                reject(new Error(`cpu voxel worker ${slotId} received concurrent task`));
                return;
            }
            currentResolve = resolve;
            currentReject = reject;
            const taskId = nextTaskId++;
            const batchSpecsBuffer = batchSpecs.buffer;
            const candidateIndicesBuffer = candidateIndices.buffer;
            // Both buffers are private copies, so they are transferred instead of cloned.
            worker.postMessage({
                type: 'run',
                taskId,
                workerId: slotId,
                batchSpecs: batchSpecsBuffer,
                candidateIndices: candidateIndicesBuffer
            }, [batchSpecsBuffer, candidateIndicesBuffer]);
        });
        return { worker, runTask };
    };
    try {
        const shared = {
            xCol: toSharedFloat32(xCol),
            yCol: toSharedFloat32(yCol),
            zCol: toSharedFloat32(zCol),
            sxCol: toSharedFloat32(sxCol),
            syCol: toSharedFloat32(syCol),
            szCol: toSharedFloat32(szCol),
            qxCol: toSharedFloat32(qxCol),
            qyCol: toSharedFloat32(qyCol),
            qzCol: toSharedFloat32(qzCol),
            qwCol: toSharedFloat32(qwCol),
            aCol: toSharedFloat32(aCol),
            extents: toSharedFloat32(extents)
        };
        const output = new BlockMaskBuffer();
        for (let slotId = 0; slotId < workers; slotId++) {
            pool.push(createSlot(slotId, shared));
        }
        /** Merges one worker result: consecutive (blockLinear, lo, hi) u32 triples. */
        const addPackedResult = (buf) => {
            const packed = new Uint32Array(buf);
            for (let i = 0; i < packed.length; i += 3) {
                output.addBlock(packed[i], packed[i + 1], packed[i + 2]);
            }
        };
        // One promise per slot; each resolves to its slot id once that worker is idle again.
        const availableSlots = pool.map((_slot, slotId) => Promise.resolve(slotId));
        const dispatchTask = async (batchSpecs, candidateIndices) => {
            const slotId = await Promise.race(availableSlots);
            const task = pool[slotId].runTask(batchSpecs, candidateIndices).then((result) => {
                addPackedResult(result.packed);
                return slotId;
            });
            // Mark the promise as handled. The failure still surfaces through the next
            // `Promise.race` / `Promise.all`, but it can no longer become an unhandled
            // rejection if we bail out for another reason first.
            task.catch(() => undefined);
            availableSlots[slotId] = task;
        };
        const maxPendingBatches = 256;
        const maxPendingIndices = 2 * 1024 * 1024;
        // Eight u32 per batch: blockX, blockY, blockZ, numBlocksX, numBlocksY, numBlocksZ,
        // candidateOffset, candidateCount.
        let pendingSpecs = [];
        let pendingCandidates = new Uint32Array(Math.min(Math.max(1024, xCol.length), maxPendingIndices));
        let pendingCandidateCount = 0;
        const ensurePendingCandidateCapacity = (needed) => {
            if (needed <= pendingCandidates.length) {
                return;
            }
            const next = new Uint32Array(Math.max(needed, pendingCandidates.length * 2));
            next.set(pendingCandidates.subarray(0, pendingCandidateCount));
            pendingCandidates = next;
        };
        const flushPendingTask = async () => {
            if (pendingSpecs.length === 0) {
                return;
            }
            const batchSpecs = new Uint32Array(pendingSpecs);
            // `slice` copies, so the scratch buffer can be reused while the copy is transferred.
            const candidateIndices = pendingCandidates.slice(0, pendingCandidateCount);
            pendingSpecs = [];
            pendingCandidateCount = 0;
            await dispatchTask(batchSpecs, candidateIndices);
        };
        for (let bz = 0; bz < numBatchZ; bz++) {
            for (let by = 0; by < numBatchY; by++) {
                for (let bx = 0; bx < numBatchX; bx++) {
                    const blockX = bx * batchBlockSize;
                    const blockY = by * batchBlockSize;
                    const blockZ = bz * batchBlockSize;
                    const numBlocksX = Math.min(batchBlockSize, nBlockX - blockX);
                    const numBlocksY = Math.min(batchBlockSize, nBlockY - blockY);
                    const numBlocksZ = Math.min(batchBlockSize, totalBlockZ - blockZ);
                    if (numBlocksX <= 0 || numBlocksY <= 0 || numBlocksZ <= 0) {
                        continue;
                    }
                    const minX = gridMinX + blockX * LEAF_SIZE * voxelResolution;
                    const minY = gridMinY + blockY * LEAF_SIZE * voxelResolution;
                    const minZ = gridMinZ + blockZ * LEAF_SIZE * voxelResolution;
                    const maxX = Math.min(gridBounds.max.x, minX + numBlocksX * LEAF_SIZE * voxelResolution);
                    const maxY = Math.min(gridBounds.max.y, minY + numBlocksY * LEAF_SIZE * voxelResolution);
                    const maxZ = Math.min(gridBounds.max.z, minZ + numBlocksZ * LEAF_SIZE * voxelResolution);
                    // NOTE(review): the query is repeated below after a flush or a buffer growth,
                    // which implies it reports the full overlap count even when the output
                    // buffer is too small to hold every index — confirm against GaussianBVH.
                    const queryCandidates = () => bvh.queryOverlappingRawInto(minX, minY, minZ, maxX, maxY, maxZ, pendingCandidates, pendingCandidateCount);
                    let overlappingCount = queryCandidates();
                    if (overlappingCount === 0) {
                        continue;
                    }
                    if (pendingSpecs.length > 0 &&
                        (pendingSpecs.length / 8 >= maxPendingBatches ||
                            pendingCandidateCount + overlappingCount > maxPendingIndices)) {
                        // The flush resets the write offset, so the candidates must be re-queried.
                        await flushPendingTask();
                        overlappingCount = queryCandidates();
                    }
                    const needed = pendingCandidateCount + overlappingCount;
                    if (needed > pendingCandidates.length) {
                        ensurePendingCandidateCapacity(needed);
                        overlappingCount = queryCandidates();
                    }
                    pendingSpecs.push(blockX, blockY, blockZ, numBlocksX, numBlocksY, numBlocksZ, pendingCandidateCount, overlappingCount);
                    pendingCandidateCount += overlappingCount;
                    if (pendingSpecs.length / 8 >= maxPendingBatches || pendingCandidateCount >= maxPendingIndices) {
                        await flushPendingTask();
                    }
                }
            }
        }
        await flushPendingTask();
        await Promise.all(availableSlots);
        for (const slot of pool) {
            slot.worker.postMessage({ type: 'shutdown' });
        }
        await terminatePool();
        return output;
    }
    catch (_e) {
        // Fallback when worker threads are unavailable or fail. Stop the workers first so they
        // neither compete with the single-threaded run nor keep the process alive afterwards.
        await terminatePool();
        return runSingleThread();
    }
};
786
/**
 * GPU voxelization path using tiled multi-batch WGSL dispatch.
 *
 * Pipeline:
 *  1. Upload all gaussians as 16 interleaved floats each (position, opacity, normalized rotation,
 *     scale, extents, 2 floats of padding).
 *  2. Count pass: the GPU counts, per coarse batch, how many gaussians touch it.
 *  3. The CPU turns the counts into exclusive prefix offsets.
 *  4. Fill pass: the GPU scatters gaussian indices into the per-batch segments of `indexBuffer`.
 *     Steps 2-4 take the place of the BVH overlap queries used by the CPU path.
 *  5. Non-empty batches are packed into mega-dispatches. Two slots are used alternately so the
 *     read-back of one dispatch overlaps with the submission of the next. Results come back as
 *     per-block 64-bit masks (lo/hi u32) and are added to the returned `BlockMaskBuffer`.
 *
 * Every GPU buffer created here is destroyed before the function settles, on success and on
 * failure alike.
 *
 * @returns The populated `BlockMaskBuffer` (empty when there is nothing to voxelize).
 * @throws Error when the gaussian buffer would exceed the device's `maxBufferSize`, or when the
 *         binning passes would need more workgroups than `maxComputeWorkgroupsPerDimension`.
 */
export const gpuVoxelize = async (xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents, gridBounds, voxelResolution, opacityCutoff) => {
    const FLOATS_PER_GAUSSIAN = 16;
    const UPLOAD_CHUNK_GAUSSIANS = 1 << 18;
    const WORKGROUP_SIZE = 256;
    // Tuning knobs: trade off submit overhead vs. peak memory/latency per mega-dispatch.
    // Upstream caps 16^3-block batches at 256; with this port's 4^3-block
    // batches, 16384 preserves the same max blocks per mega-dispatch.
    const MEGA_MAX_BATCHES = 16384;
    const MEGA_MAX_INDICES = 2 * 1024 * 1024;
    const BATCH_SIZE = 4;
    const MAX_BLOCKS_PER_BATCH = BATCH_SIZE * BATCH_SIZE * BATCH_SIZE;
    // BatchInfo struct in WGSL: 5xu32 + 3xf32 packed as 8xu32 per batch.
    const BATCH_INFO_U32S = 8;
    const blockSize = LEAF_SIZE * voxelResolution;
    const numBlocksX = Math.round((gridBounds.max.x - gridBounds.min.x) / blockSize);
    const numBlocksY = Math.round((gridBounds.max.y - gridBounds.min.y) / blockSize);
    const numBlocksZ = Math.round((gridBounds.max.z - gridBounds.min.z) / blockSize);
    const numBatchX = Math.ceil(numBlocksX / BATCH_SIZE);
    const numBatchY = Math.ceil(numBlocksY / BATCH_SIZE);
    const numBatchZ = Math.ceil(numBlocksZ / BATCH_SIZE);
    const totalBatchCount = numBatchX * numBatchY * numBatchZ;
    const gridMinX = gridBounds.min.x;
    const gridMinY = gridBounds.min.y;
    const gridMinZ = gridBounds.min.z;
    const gaussianCount = xCol.length;
    const batchWorldSize = blockSize * BATCH_SIZE;
    const blockBuffer = new BlockMaskBuffer();
    // Nothing to voxelize: avoid creating zero-sized GPU buffers/bindings (a validation error).
    // `!(x > 0)` also catches a NaN batch count caused by degenerate bounds.
    if (gaussianCount === 0 || !(totalBatchCount > 0)) {
        return blockBuffer;
    }
    const device = await getOrCreateDevice();
    const gaussianBufferBytes = gaussianCount * FLOATS_PER_GAUSSIAN * 4;
    const maxBufferSize = Number(device.limits.maxBufferSize);
    if (gaussianBufferBytes > maxBufferSize) {
        throw new Error(`gpuVoxelize: gaussian buffer size ${gaussianBufferBytes} exceeds device maxBufferSize ${maxBufferSize} ` +
            `(gaussianCount=${gaussianCount}, bytesPerGaussian=${FLOATS_PER_GAUSSIAN * 4}).`);
    }
    // The count/fill passes use a 1D dispatch with one invocation per gaussian. Exceeding the
    // per-dimension limit would only raise a GPU validation error and silently yield no voxels.
    const binningWorkgroups = Math.ceil(gaussianCount / WORKGROUP_SIZE);
    const maxWorkgroupsPerDimension = Number(device.limits.maxComputeWorkgroupsPerDimension);
    if (binningWorkgroups > maxWorkgroupsPerDimension) {
        throw new Error(`gpuVoxelize: ${binningWorkgroups} workgroups exceed device maxComputeWorkgroupsPerDimension ` +
            `${maxWorkgroupsPerDimension} (gaussianCount=${gaussianCount}, workgroupSize=${WORKGROUP_SIZE}).`);
    }
    const batchCountPipeline = device.createComputePipeline({
        layout: 'auto',
        compute: { module: device.createShaderModule({ code: buildPerBatchCountsWgsl() }), entryPoint: 'main' }
    });
    const batchFillPipeline = device.createComputePipeline({
        layout: 'auto',
        compute: { module: device.createShaderModule({ code: fillPerBatchCandidatesWgsl() }), entryPoint: 'main' }
    });
    const voxelPipeline = device.createComputePipeline({
        layout: 'auto',
        compute: { module: device.createShaderModule({ code: voxelizeMultiBatchWgsl() }), entryPoint: 'main' }
    });
    // Every buffer is registered here so the `finally` below can release whatever exists,
    // no matter where the function exits.
    const ownedBuffers = new Set();
    const createBuffer = (descriptor) => {
        const buffer = device.createBuffer(descriptor);
        ownedBuffers.add(buffer);
        return buffer;
    };
    const destroyBuffer = (buffer) => {
        ownedBuffers.delete(buffer);
        buffer.destroy();
    };
    try {
        const gaussianBuffer = createBuffer({
            size: gaussianBufferBytes,
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST
        });
        // Upload in chunks so the interleaving scratch array stays bounded.
        const chunkRows = Math.min(gaussianCount, UPLOAD_CHUNK_GAUSSIANS);
        const interleavedChunk = new Float32Array(chunkRows * FLOATS_PER_GAUSSIAN);
        for (let chunkStart = 0; chunkStart < gaussianCount; chunkStart += chunkRows) {
            const chunkCount = Math.min(chunkRows, gaussianCount - chunkStart);
            for (let j = 0; j < chunkCount; j++) {
                const i = chunkStart + j;
                const offset = j * FLOATS_PER_GAUSSIAN;
                interleavedChunk[offset + 0] = xCol[i];
                interleavedChunk[offset + 1] = yCol[i];
                interleavedChunk[offset + 2] = zCol[i];
                interleavedChunk[offset + 3] = aCol[i];
                const rotW = qwCol[i];
                const rotX = qxCol[i];
                const rotY = qyCol[i];
                const rotZ = qzCol[i];
                // Normalize the quaternion; a zero-length quaternion is uploaded as all zeros.
                const qlen = Math.sqrt(rotW * rotW + rotX * rotX + rotY * rotY + rotZ * rotZ);
                const invLen = qlen > 0 ? 1 / qlen : 0;
                interleavedChunk[offset + 4] = rotW * invLen;
                interleavedChunk[offset + 5] = rotX * invLen;
                interleavedChunk[offset + 6] = rotY * invLen;
                interleavedChunk[offset + 7] = rotZ * invLen;
                interleavedChunk[offset + 8] = sxCol[i];
                interleavedChunk[offset + 9] = syCol[i];
                interleavedChunk[offset + 10] = szCol[i];
                interleavedChunk[offset + 11] = extents[i * 3 + 0];
                interleavedChunk[offset + 12] = extents[i * 3 + 1];
                interleavedChunk[offset + 13] = extents[i * 3 + 2];
                interleavedChunk[offset + 14] = 0;
                interleavedChunk[offset + 15] = 0;
            }
            device.queue.writeBuffer(gaussianBuffer, chunkStart * FLOATS_PER_GAUSSIAN * 4, interleavedChunk.buffer, 0, chunkCount * FLOATS_PER_GAUSSIAN * 4);
        }
        const batchUniformBuffer = createBuffer({
            size: 256,
            usage: GPU_BUFFER_USAGE_UNIFORM | GPU_BUFFER_USAGE_COPY_DST
        });
        const batchCountsBuffer = createBuffer({
            size: Math.max(4, totalBatchCount * 4),
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST | GPU_BUFFER_USAGE_COPY_SRC
        });
        const batchCountsReadBuffer = createBuffer({
            size: Math.max(4, totalBatchCount * 4),
            usage: GPU_BUFFER_USAGE_COPY_DST | GPU_BUFFER_USAGE_MAP_READ
        });
        // Uniform layout shared by the count and fill shaders: 4 x f32 followed by 4 x u32.
        const batchUniformRaw = new Uint32Array(16);
        const batchUniformFloats = new Float32Array(batchUniformRaw.buffer);
        batchUniformFloats[0] = gridMinX;
        batchUniformFloats[1] = gridMinY;
        batchUniformFloats[2] = gridMinZ;
        batchUniformFloats[3] = batchWorldSize;
        batchUniformRaw[4] = numBatchX;
        batchUniformRaw[5] = numBatchY;
        batchUniformRaw[6] = numBatchZ;
        batchUniformRaw[7] = gaussianCount;
        device.queue.writeBuffer(batchUniformBuffer, 0, batchUniformRaw.buffer, 0, 32);
        const countBindGroup = device.createBindGroup({
            layout: batchCountPipeline.getBindGroupLayout(0),
            entries: [
                { binding: 0, resource: { buffer: batchUniformBuffer } },
                { binding: 1, resource: { buffer: gaussianBuffer } },
                { binding: 2, resource: { buffer: batchCountsBuffer } }
            ]
        });
        const zeroBatchCounts = new Uint32Array(Math.max(1, totalBatchCount));
        device.queue.writeBuffer(batchCountsBuffer, 0, zeroBatchCounts);
        // Count overlaps per coarse batch on GPU; copy out for CPU exclusive-prefix into batchCandidateOffsets.
        {
            const encoder = device.createCommandEncoder();
            const pass = encoder.beginComputePass();
            pass.setPipeline(batchCountPipeline);
            pass.setBindGroup(0, countBindGroup);
            pass.dispatchWorkgroups(binningWorkgroups, 1, 1);
            pass.end();
            encoder.copyBufferToBuffer(batchCountsBuffer, 0, batchCountsReadBuffer, 0, totalBatchCount * 4);
            device.queue.submit([encoder.finish()]);
        }
        await batchCountsReadBuffer.mapAsync(GPU_MAP_MODE_READ);
        const countsMapped = new Uint32Array(batchCountsReadBuffer.getMappedRange());
        // Copy out before unmapping: the mapped view is detached by `unmap()`.
        const batchCandidateCounts = new Uint32Array(totalBatchCount);
        batchCandidateCounts.set(countsMapped.subarray(0, totalBatchCount));
        batchCountsReadBuffer.unmap();
        const batchCandidateOffsets = new Uint32Array(totalBatchCount);
        let totalCandidateCount = 0;
        for (let i = 0; i < totalBatchCount; i++) {
            batchCandidateOffsets[i] = totalCandidateCount;
            totalCandidateCount += batchCandidateCounts[i];
        }
        if (totalCandidateCount === 0) {
            return blockBuffer;
        }
        const batchOffsetsBuffer = createBuffer({
            size: batchCandidateOffsets.byteLength,
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST
        });
        device.queue.writeBuffer(batchOffsetsBuffer, 0, batchCandidateOffsets);
        const batchWriteHeadsBuffer = createBuffer({
            size: Math.max(4, totalBatchCount * 4),
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST
        });
        device.queue.writeBuffer(batchWriteHeadsBuffer, 0, zeroBatchCounts);
        // Packed gaussian indices for all batches (size = totalCandidateCount); filled by GPU scatter pass.
        const indexBuffer = createBuffer({
            size: totalCandidateCount * 4,
            usage: GPU_BUFFER_USAGE_STORAGE
        });
        // GPU scatter pass: write gaussian indices into each batch segment of `indexBuffer`.
        const fillBindGroup = device.createBindGroup({
            layout: batchFillPipeline.getBindGroupLayout(0),
            entries: [
                { binding: 0, resource: { buffer: batchUniformBuffer } },
                { binding: 1, resource: { buffer: gaussianBuffer } },
                { binding: 2, resource: { buffer: batchOffsetsBuffer } },
                { binding: 3, resource: { buffer: batchWriteHeadsBuffer } },
                { binding: 4, resource: { buffer: indexBuffer } }
            ]
        });
        {
            const encoder = device.createCommandEncoder();
            const pass = encoder.beginComputePass();
            pass.setPipeline(batchFillPipeline);
            pass.setBindGroup(0, fillBindGroup);
            pass.dispatchWorkgroups(binningWorkgroups, 1, 1);
            pass.end();
            device.queue.submit([encoder.finish()]);
        }
        const RESULT_U32S_PER_BATCH = MAX_BLOCKS_PER_BATCH * 2;
        const createResultsBuffer = (size) => createBuffer({
            size,
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_SRC | GPU_BUFFER_USAGE_COPY_DST
        });
        const createReadBuffer = (size) => createBuffer({
            size,
            usage: GPU_BUFFER_USAGE_COPY_DST | GPU_BUFFER_USAGE_MAP_READ
        });
        const createBatchInfoBuffer = (size) => createBuffer({
            size,
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST
        });
        /** (Re)builds the voxel-pass bind group from the slot's current buffers. */
        const createVoxelBindGroup = (slot) => device.createBindGroup({
            layout: voxelPipeline.getBindGroupLayout(0),
            entries: [
                { binding: 0, resource: { buffer: slot.uniformBuffer } },
                { binding: 1, resource: { buffer: gaussianBuffer } },
                { binding: 2, resource: { buffer: indexBuffer } },
                { binding: 3, resource: { buffer: slot.resultsBuffer } },
                { binding: 4, resource: { buffer: slot.batchInfoBuffer } }
            ]
        });
        const createSlot = () => {
            const resultsBufferSize = MEGA_MAX_BATCHES * RESULT_U32S_PER_BATCH * 4;
            const batchInfoCapacityBytes = MEGA_MAX_BATCHES * BATCH_INFO_U32S * 4;
            const slot = {
                uniformBuffer: createBuffer({
                    size: 256,
                    usage: GPU_BUFFER_USAGE_UNIFORM | GPU_BUFFER_USAGE_COPY_DST
                }),
                resultsBuffer: createResultsBuffer(resultsBufferSize),
                readBuffer: createReadBuffer(resultsBufferSize),
                batchInfoBuffer: createBatchInfoBuffer(batchInfoCapacityBytes),
                bindGroup: undefined,
                resultsBufferSize,
                batchInfoCapacityBytes
            };
            slot.bindGroup = createVoxelBindGroup(slot);
            return slot;
        };
        // Two slots, used alternately: one is being read back while the other is submitted.
        const slots = [createSlot(), createSlot()];
        let currentSlot = 0;
        let inflight;
        const ensureSlotCapacity = (slot, batchCount) => {
            const resultBytes = Math.max(8, batchCount * RESULT_U32S_PER_BATCH * 4);
            const batchInfoBytes = Math.max(32, batchCount * BATCH_INFO_U32S * 4);
            let rebind = false;
            if (resultBytes > slot.resultsBufferSize) {
                destroyBuffer(slot.resultsBuffer);
                destroyBuffer(slot.readBuffer);
                // Growth (at least x2) avoids frequent GPU buffer reallocations when batch sizes fluctuate.
                slot.resultsBufferSize = Math.max(slot.resultsBufferSize * 2, resultBytes);
                slot.resultsBuffer = createResultsBuffer(slot.resultsBufferSize);
                slot.readBuffer = createReadBuffer(slot.resultsBufferSize);
                rebind = true;
            }
            if (batchInfoBytes > slot.batchInfoCapacityBytes) {
                destroyBuffer(slot.batchInfoBuffer);
                // Same growth policy as results/read buffers.
                slot.batchInfoCapacityBytes = Math.max(slot.batchInfoCapacityBytes * 2, batchInfoBytes);
                slot.batchInfoBuffer = createBatchInfoBuffer(slot.batchInfoCapacityBytes);
                rebind = true;
            }
            if (rebind) {
                slot.bindGroup = createVoxelBindGroup(slot);
            }
        };
        /** Adds every non-empty block mask of a finished mega-dispatch to `blockBuffer`. */
        const processResults = (masks, batches) => {
            for (let b = 0; b < batches.length; b++) {
                const batch = batches[b];
                const batchResultOffset = b * RESULT_U32S_PER_BATCH;
                const totalBatchBlocks = batch.numBlocksX * batch.numBlocksY * batch.numBlocksZ;
                for (let blockIdx = 0; blockIdx < totalBatchBlocks; blockIdx++) {
                    const maskLo = masks[batchResultOffset + blockIdx * 2];
                    const maskHi = masks[batchResultOffset + blockIdx * 2 + 1];
                    if (maskLo === 0 && maskHi === 0) {
                        continue;
                    }
                    // Un-flatten the batch-local block index (x fastest, then y, then z).
                    const localX = blockIdx % batch.numBlocksX;
                    const localY = Math.floor(blockIdx / batch.numBlocksX) % batch.numBlocksY;
                    const localZ = Math.floor(blockIdx / (batch.numBlocksX * batch.numBlocksY));
                    const blockLinear = (batch.bx + localX) + (batch.by + localY) * numBlocksX + (batch.bz + localZ) * numBlocksX * numBlocksY;
                    blockBuffer.addBlock(blockLinear, maskLo, maskHi);
                }
            }
        };
        let pendingBatches = [];
        let megaIndexSpan = 0;
        const flushPendingBatches = async () => {
            if (pendingBatches.length === 0) {
                return;
            }
            const submitSlot = currentSlot;
            currentSlot = (currentSlot + 1) & 1;
            const batchesToSubmit = pendingBatches;
            pendingBatches = [];
            megaIndexSpan = 0;
            const slot = slots[submitSlot];
            ensureSlotCapacity(slot, batchesToSubmit.length);
            const resultsU32Count = batchesToSubmit.length * RESULT_U32S_PER_BATCH;
            const batchInfoU32Count = batchesToSubmit.length * BATCH_INFO_U32S;
            // Two typed views over one buffer: the struct mixes u32 and f32 fields.
            const batchInfoF32 = new Float32Array(batchInfoU32Count);
            const batchInfoU32 = new Uint32Array(batchInfoF32.buffer);
            for (let i = 0; i < batchesToSubmit.length; i++) {
                const batch = batchesToSubmit[i];
                const base = i * BATCH_INFO_U32S;
                batchInfoU32[base + 0] = batch.indexOffset;
                batchInfoU32[base + 1] = batch.indexCount;
                batchInfoU32[base + 2] = batch.numBlocksX;
                batchInfoU32[base + 3] = batch.numBlocksY;
                batchInfoU32[base + 4] = batch.numBlocksZ;
                batchInfoF32[base + 5] = batch.blockMinX;
                batchInfoF32[base + 6] = batch.blockMinY;
                batchInfoF32[base + 7] = batch.blockMinZ;
            }
            device.queue.writeBuffer(slot.batchInfoBuffer, 0, batchInfoU32.buffer, batchInfoU32.byteOffset, batchInfoU32.byteLength);
            const uniform = new Uint32Array(16);
            const uf = new Float32Array(uniform.buffer);
            uf[0] = opacityCutoff;
            uf[1] = voxelResolution;
            uniform[2] = MAX_BLOCKS_PER_BATCH;
            device.queue.writeBuffer(slot.uniformBuffer, 0, uniform.buffer, 0, 12);
            const encoder = device.createCommandEncoder();
            encoder.clearBuffer(slot.resultsBuffer, 0, resultsU32Count * 4);
            const pass = encoder.beginComputePass();
            pass.setPipeline(voxelPipeline);
            pass.setBindGroup(0, slot.bindGroup);
            // x = flat block index within a batch, z = batch index within this mega-dispatch.
            pass.dispatchWorkgroups(MAX_BLOCKS_PER_BATCH, 1, batchesToSubmit.length);
            pass.end();
            encoder.copyBufferToBuffer(slot.resultsBuffer, 0, slot.readBuffer, 0, resultsU32Count * 4);
            device.queue.submit([encoder.finish()]);
            const taskPromise = (async () => {
                await slot.readBuffer.mapAsync(GPU_MAP_MODE_READ);
                const mapped = new Uint32Array(slot.readBuffer.getMappedRange());
                const copied = new Uint32Array(resultsU32Count);
                copied.set(mapped.subarray(0, resultsU32Count));
                slot.readBuffer.unmap();
                return { masks: copied, batches: batchesToSubmit };
            })();
            // Mark as handled: the promise is awaited only on the next flush (or at the end), and
            // it may reject earlier, e.g. when the buffers are destroyed after another failure.
            // The rejection is still observed by the later `await`.
            taskPromise.catch(() => undefined);
            if (inflight) {
                const done = await inflight.taskId;
                processResults(done.masks, done.batches);
            }
            inflight = { taskId: taskPromise };
        };
        for (let bz = 0; bz < numBatchZ; bz++) {
            for (let by = 0; by < numBatchY; by++) {
                for (let bx = 0; bx < numBatchX; bx++) {
                    const batchId = bz * numBatchX * numBatchY + by * numBatchX + bx;
                    const indexCount = batchCandidateCounts[batchId];
                    if (indexCount === 0) {
                        continue;
                    }
                    const bxBlock = bx * BATCH_SIZE;
                    const byBlock = by * BATCH_SIZE;
                    const bzBlock = bz * BATCH_SIZE;
                    const currBatchX = Math.min(BATCH_SIZE, numBlocksX - bxBlock);
                    const currBatchY = Math.min(BATCH_SIZE, numBlocksY - byBlock);
                    const currBatchZ = Math.min(BATCH_SIZE, numBlocksZ - bzBlock);
                    // World-space origin of this batch (up to BATCH_SIZE^3 blocks); indexOffset/indexCount refer to `indexBuffer`.
                    const blockMinX = gridMinX + bxBlock * blockSize;
                    const blockMinY = gridMinY + byBlock * blockSize;
                    const blockMinZ = gridMinZ + bzBlock * blockSize;
                    pendingBatches.push({
                        indexOffset: batchCandidateOffsets[batchId],
                        indexCount,
                        blockMinX,
                        blockMinY,
                        blockMinZ,
                        numBlocksX: currBatchX,
                        numBlocksY: currBatchY,
                        numBlocksZ: currBatchZ,
                        bx: bxBlock,
                        by: byBlock,
                        bz: bzBlock
                    });
                    megaIndexSpan += indexCount;
                    if (pendingBatches.length >= MEGA_MAX_BATCHES || megaIndexSpan >= MEGA_MAX_INDICES) {
                        await flushPendingBatches();
                    }
                }
            }
        }
        await flushPendingBatches();
        if (inflight) {
            const done = await inflight.taskId;
            inflight = undefined;
            processResults(done.masks, done.batches);
        }
        return blockBuffer;
    }
    finally {
        // Release every GPU buffer that is still alive, on success and on failure.
        for (const buffer of ownedBuffers) {
            buffer.destroy();
        }
        ownedBuffers.clear();
    }
};
1
+ import { getOrCreateDevice } from '../webgpu.js';
2
+ import { logger } from '../Logger.js';
3
+ import { ALPHA_THRESHOLD, BlockMaskBuffer, GaussianBVH, LEAF_SIZE } from './common.js';
4
+ import { availableParallelism } from 'node:os';
5
+ import { Worker } from 'node:worker_threads';
6
/**
 * WGSL declarations shared by the count pass and the fill pass: the uniform block, the
 * 16-float gaussian record, and the two bindings both passes use (0 = uniforms, 1 = gaussians).
 */
function batchBinningDeclarationsWgsl() {
    return /* wgsl */ `
struct Uniforms {
    gridMinX: f32,
    gridMinY: f32,
    gridMinZ: f32,
    batchWorldSize: f32,
    numBatchX: u32,
    numBatchY: u32,
    numBatchZ: u32,
    gaussianCount: u32
}

struct Gaussian {
    posX: f32, posY: f32, posZ: f32, opacity: f32,
    rotW: f32, rotX: f32, rotY: f32, rotZ: f32,
    scaleX: f32, scaleY: f32, scaleZ: f32,
    extentX: f32, extentY: f32, extentZ: f32,
    _padding0: f32, _padding1: f32
}

@group(0) @binding(0) var<uniform> uniforms: Uniforms;
@group(0) @binding(1) var<storage, read> allGaussians: array<Gaussian>;
`;
}
/**
 * WGSL `main` entry point shared by both binning passes. It skips out-of-range invocations,
 * gaussians with opacity <= 0 and gaussians whose AABB lies outside the batch grid, clamps the
 * AABB to a batch index range, and runs `perBatchStatements` once for every touched batch with
 * `gaussianIdx` and `batchId` in scope.
 *
 * Both passes MUST visit exactly the same (gaussian, batch) pairs: the fill pass writes into
 * index segments whose sizes come from the count pass. Generating both from this one template
 * keeps them in lockstep.
 */
function batchBinningMainWgsl(perBatchStatements) {
    return /* wgsl */ `
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) global_id: vec3u) {
    let gaussianIdx = global_id.x;
    if (gaussianIdx >= uniforms.gaussianCount) { return; }
    let g = allGaussians[gaussianIdx];
    if (g.opacity <= 0.0) { return; }
    let gMinX = g.posX - g.extentX - uniforms.gridMinX;
    let gMinY = g.posY - g.extentY - uniforms.gridMinY;
    let gMinZ = g.posZ - g.extentZ - uniforms.gridMinZ;
    let gMaxX = g.posX + g.extentX - uniforms.gridMinX;
    let gMaxY = g.posY + g.extentY - uniforms.gridMinY;
    let gMaxZ = g.posZ + g.extentZ - uniforms.gridMinZ;
    let maxWorldX = uniforms.batchWorldSize * f32(uniforms.numBatchX);
    let maxWorldY = uniforms.batchWorldSize * f32(uniforms.numBatchY);
    let maxWorldZ = uniforms.batchWorldSize * f32(uniforms.numBatchZ);
    if (gMaxX < 0.0 || gMinX > maxWorldX || gMaxY < 0.0 || gMinY > maxWorldY || gMaxZ < 0.0 || gMinZ > maxWorldZ) { return; }
    let minBx = clamp(i32(floor(gMinX / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchX) - 1);
    let minBy = clamp(i32(floor(gMinY / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchY) - 1);
    let minBz = clamp(i32(floor(gMinZ / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchZ) - 1);
    let maxBx = clamp(i32(floor(gMaxX / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchX) - 1);
    let maxBy = clamp(i32(floor(gMaxY / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchY) - 1);
    let maxBz = clamp(i32(floor(gMaxZ / uniforms.batchWorldSize)), 0, i32(uniforms.numBatchZ) - 1);
    for (var bz = minBz; bz <= maxBz; bz++) {
        for (var by = minBy; by <= maxBy; by++) {
            for (var bx = minBx; bx <= maxBx; bx++) {
                let batchId = u32(bz) * uniforms.numBatchX * uniforms.numBatchY + u32(by) * uniforms.numBatchX + u32(bx);
                ${perBatchStatements}
            }
        }
    }
}
`;
}
/** Per gaussian: increment overlap count for each coarse batch cell its AABB touches (GPU atomics). */
function buildPerBatchCountsWgsl() {
    return /* wgsl */ `
${batchBinningDeclarationsWgsl()}
@group(0) @binding(2) var<storage, read_write> batchCounts: array<atomic<u32>>;
${batchBinningMainWgsl('atomicAdd(&batchCounts[batchId], 1u);')}
`;
}
/** Scatter gaussian indices into packed `indices` using prefix `batchOffsets` and per-batch atomic write heads. */
function fillPerBatchCandidatesWgsl() {
    return /* wgsl */ `
${batchBinningDeclarationsWgsl()}
@group(0) @binding(2) var<storage, read> batchOffsets: array<u32>;
@group(0) @binding(3) var<storage, read_write> batchWriteHeads: array<atomic<u32>>;
@group(0) @binding(4) var<storage, read_write> indices: array<u32>;
${batchBinningMainWgsl(`let local = atomicAdd(&batchWriteHeads[batchId], 1u);
                let dst = batchOffsets[batchId] + local;
                indices[dst] = gaussianIdx;`)}
`;
}
128
/**
 * From https://github.com/playcanvas/splat-transform/blob/8f3b843efdc378f97d4f6a66a3a90a2de6d479a4/src/lib/gpu/gpu-voxelization.ts
 * WGSL shader for multi-batch voxelization of 4x4x4 blocks.
 *
 * Each workgroup processes one block in one batch.
 * - workgroup_id.z = batch index
 * - workgroup_id.x = flat block index within the batch
 * Per-batch metadata (index range, block origin, dimensions) comes from a storage buffer,
 * allowing many batches to be dispatched in a single GPU call.
 *
 * Bindings (group 0): 0 = `Uniforms`, 1 = all gaussians, 2 = packed per-batch gaussian indices,
 * 3 = `results` (read/write), 4 = `BatchInfo` array.
 *
 * Per voxel (one of the 64 invocations, mapped to a 4x4x4 position via a Morton decode) the shader sums
 * `opacity * exp(-0.5 * d2)` over the batch's gaussians, where `d2` is the squared scaled distance from
 * the gaussian centre to the closest point of the voxel in the gaussian's local frame. Accumulation stops
 * once the sum reaches 7.0. The sum is converted with `1 - exp(-sum)` and compared against
 * `uniforms.opacityCutoff`.
 *
 * Output layout: two u32 words per block at
 * `results[(batchIdx * maxBlocksPerBatch + flatBlockId) * 2 + word]`; voxel bit index is
 * `z * 16 + y * 4 + x` (bits 0-31 in word 0, bits 32-63 in word 1).
 *
 * @returns WGSL shader source exposing a `main` compute entry point with workgroup size 64.
 */
function voxelizeMultiBatchWgsl() {
    return /* wgsl */ `
        struct Uniforms {
            opacityCutoff: f32,
            voxelResolution: f32,
            maxBlocksPerBatch: u32
        }

        struct BatchInfo {
            indexOffset: u32,
            indexCount: u32,
            numBlocksX: u32,
            numBlocksY: u32,
            numBlocksZ: u32,
            blockMinX: f32,
            blockMinY: f32,
            blockMinZ: f32
        }

        struct Gaussian {
            posX: f32,
            posY: f32,
            posZ: f32,
            opacity: f32,
            rotW: f32,
            rotX: f32,
            rotY: f32,
            rotZ: f32,
            scaleX: f32,
            scaleY: f32,
            scaleZ: f32,
            extentX: f32,
            extentY: f32,
            extentZ: f32,
            _padding0: f32,
            _padding1: f32
        }

        @group(0) @binding(0) var<uniform> uniforms: Uniforms;
        @group(0) @binding(1) var<storage, read> allGaussians: array<Gaussian>;
        @group(0) @binding(2) var<storage, read> indices: array<u32>;
        @group(0) @binding(3) var<storage, read_write> results: array<atomic<u32>>;
        @group(0) @binding(4) var<storage, read> batchInfos: array<BatchInfo>;

        // Shared memory for cooperative Gaussian loading.
        // All 64 threads in a workgroup load one Gaussian each, then all threads
        // evaluate against the shared chunk (reducing global memory reads by 64x).
        // 64 Gaussians * 64 bytes each = 4 KB (well within 16 KB WebGPU minimum).
        const tileSize = 64u;
        var<workgroup> sharedGaussians: array<Gaussian, tileSize>;
        var<workgroup> blockMasks: array<atomic<u32>, 2>;

        fn mortonToXYZ(m: u32) -> vec3u {
            return vec3u(
                (m & 1u) | ((m >> 2u) & 2u),
                ((m >> 1u) & 1u) | ((m >> 3u) & 2u),
                ((m >> 2u) & 1u) | ((m >> 4u) & 2u)
            );
        }

        fn evaluateGaussianForVoxel(voxelCenter: vec3f, voxelHalfSize: f32, g: Gaussian) -> f32 {
            let gaussianCenter = vec3f(g.posX, g.posY, g.posZ);
            let diff = voxelCenter - gaussianCenter;
            // Use pre-computed world-space AABB half-extents (3-sigma, accounts for rotation)
            let extent = vec3f(g.extentX, g.extentY, g.extentZ);
            // Per-axis AABB overlap check
            if (any(abs(diff) > (extent + voxelHalfSize))) {
                return 0.0;
            }
            // Find closest point in voxel to Gaussian center
            let closestPoint = clamp(gaussianCenter, voxelCenter - voxelHalfSize, voxelCenter + voxelHalfSize);
            let closestDiff = closestPoint - gaussianCenter;
            // Inverse rotation using cross-product formula (Rodrigues rotation)
            // For inverse: negate xyz components of quaternion
            let qxyz = vec3f(-g.rotX, -g.rotY, -g.rotZ);
            let t = 2.0 * cross(qxyz, closestDiff);
            let localDiff = closestDiff + g.rotW * t + cross(qxyz, t);
            // Calculate Mahalanobis distance squared
            let invScale = vec3f(1.0 / max(g.scaleX, 1e-8), 1.0 / max(g.scaleY, 1e-8), 1.0 / max(g.scaleZ, 1e-8));
            let scaled = localDiff * invScale;
            let d2 = dot(scaled, scaled);
            return g.opacity * exp(-0.5 * d2);
        }

        @compute @workgroup_size(64)
        fn main(
            @builtin(local_invocation_index) local_invocation_index: u32,
            @builtin(workgroup_id) workgroup_id: vec3u
        ) {
            let batchIdx = workgroup_id.z;
            let flatBlockId = workgroup_id.x;
            let info = batchInfos[batchIdx];
            // Skip padded workgroups beyond the batch's actual block count
            let totalBlocks = info.numBlocksX * info.numBlocksY * info.numBlocksZ;
            if (flatBlockId >= totalBlocks) { return; }

            // Decompose flat block ID to 3D coordinates within the batch
            let blockX = flatBlockId % info.numBlocksX;
            let blockY = (flatBlockId / info.numBlocksX) % info.numBlocksY;
            let blockZ = flatBlockId / (info.numBlocksX * info.numBlocksY);
            let localPos = mortonToXYZ(local_invocation_index);

            let blockMin = vec3f(info.blockMinX, info.blockMinY, info.blockMinZ);
            let blockOffset = vec3f(f32(blockX), f32(blockY), f32(blockZ)) * 4.0 * uniforms.voxelResolution;
            let voxelCenter = blockMin + blockOffset + (vec3f(localPos) + 0.5) * uniforms.voxelResolution;
            let voxelHalfSize = uniforms.voxelResolution * 0.5;
            if (local_invocation_index < 2u) {
                atomicStore(&blockMasks[local_invocation_index], 0u);
            }
            workgroupBarrier();

            var totalSigma = 0.0;
            let numIndices = info.indexCount;
            let numTiles = (numIndices + tileSize - 1u) / tileSize;
            for (var tile = 0u; tile < numTiles; tile++) {
                // Cooperative load: each thread loads one Gaussian into shared memory
                let loadIdx = tile * tileSize + local_invocation_index;
                if (loadIdx < numIndices) {
                    let gaussianIdx = indices[info.indexOffset + loadIdx];
                    sharedGaussians[local_invocation_index] = allGaussians[gaussianIdx];
                }
                // Wait for all threads to finish loading the tile
                workgroupBarrier();

                if (totalSigma < 7.0) {
                    let thisTileSize = min(tileSize, numIndices - tile * tileSize);
                    for (var c = 0u; c < thisTileSize; c++) {
                        totalSigma += evaluateGaussianForVoxel(voxelCenter, voxelHalfSize, sharedGaussians[c]);
                        if (totalSigma >= 7.0) { break; }
                    }
                }
                // Wait before next tile overwrites shared memory
                workgroupBarrier();
            }

            // Convert accumulated density to opacity using Beer-Lambert law
            let finalOpacity = 1.0 - exp(-totalSigma);
            let isSolid = finalOpacity >= uniforms.opacityCutoff;
            // Accumulate block bits in workgroup-local atomics first to reduce global atomic contention.
            if (isSolid) {
                let linearIdx = localPos.z * 16u + localPos.y * 4u + localPos.x;
                atomicOr(&blockMasks[linearIdx >> 5u], 1u << (linearIdx & 31u));
            }
            workgroupBarrier();
            if (local_invocation_index < 2u) {
                let batchResultBase = batchIdx * uniforms.maxBlocksPerBatch * 2u;
                let wordIndex = batchResultBase + flatBlockId * 2u + local_invocation_index;
                atomicStore(&results[wordIndex], atomicLoad(&blockMasks[local_invocation_index]));
            }
        }
    `;
}
290
// Numeric copies of the WebGPU `GPUBufferUsage` / `GPUMapMode` bit flags, kept as plain
// constants so this module does not need the WebGPU globals to be defined at load time.
const GPU_BUFFER_USAGE_MAP_READ = 0x0001;
const GPU_BUFFER_USAGE_COPY_SRC = 0x0004;
const GPU_BUFFER_USAGE_COPY_DST = 0x0008;
const GPU_BUFFER_USAGE_UNIFORM = 0x0040;
const GPU_BUFFER_USAGE_STORAGE = 0x0080;
const GPU_MAP_MODE_READ = 0x0001;
/**
 * CPU voxelization.
 *
 * Inputs with fewer gaussians than this threshold go straight to the simplified
 * single-threaded voxelizer instead of the worker pool. With the value 0 that
 * shortcut is never taken, so the single-threaded path only runs as the error fallback.
 */
const CPU_VOXEL_PARALLEL_MIN_GAUSSIANS = 0;
301
/**
 * Coerces a number or numeric string into a positive integer.
 *
 * Numbers are used as-is; non-blank strings are converted with `Number()`. The value is then
 * floored, and accepted only if the floored result is at least 1. Checking after flooring matters:
 * inputs in (0, 1) such as `0.5` or `"0.5"` floor to 0 and must be rejected, otherwise callers
 * would receive a non-positive "worker count".
 *
 * @param value Candidate value (number, string, or anything else).
 * @returns The floored positive integer, or `undefined` when the input is not a finite value >= 1.
 */
function parsePositiveInteger(value) {
    let numeric;
    if (typeof value === 'number') {
        numeric = value;
    }
    else if (typeof value === 'string' && value.trim() !== '') {
        numeric = Number(value);
    }
    else {
        return undefined;
    }
    if (!Number.isFinite(numeric)) {
        return undefined;
    }
    const floored = Math.floor(numeric);
    return floored >= 1 ? floored : undefined;
}
311
/**
 * Decides how many CPU voxelization workers to start.
 *
 * Resolution order:
 * - explicit `override`: `-1` selects the automatic count; any other value is parsed as a positive
 *   integer and falls back to the automatic count when it cannot be parsed;
 * - no override: `SPLAT_CPU_VOXEL_WORKERS`, then `CPU_VOXEL_WORKERS`, then the automatic count.
 *
 * The automatic count is `availableParallelism() - 1`, but never less than 1.
 *
 * @param override Optional caller-supplied worker count.
 * @returns The number of workers to use.
 */
function resolveCpuVoxelWorkerCount(override) {
    const automaticCount = () => Math.max(1, availableParallelism() - 1);
    if (override === undefined) {
        const fromEnvironment = parsePositiveInteger(process.env.SPLAT_CPU_VOXEL_WORKERS) ??
            parsePositiveInteger(process.env.CPU_VOXEL_WORKERS);
        return fromEnvironment ?? automaticCount();
    }
    if (override === -1) {
        return automaticCount();
    }
    return parsePositiveInteger(override) ?? automaticCount();
}
322
/**
 * Source text of the ES-module worker used by the CPU voxelization pool.
 *
 * The script reads its configuration and the gaussian columns from `workerData` and wraps each column
 * buffer in a `Float32Array` view. It then serves messages from `parentPort`:
 * - `{ type: 'run', taskId, batchSpecs, candidateIndices }`: `batchSpecs` is a `Uint32Array` buffer with
 *   8 values per batch (batch block origin x/y/z, block counts x/y/z, offset into the candidate list,
 *   candidate count) and `candidateIndices` is a `Uint32Array` buffer of gaussian indices. The reply is
 *   `{ taskId, packed }` where `packed` is a transferred `Uint32Array` buffer of
 *   `(global block index, low mask word, high mask word)` triples for every non-empty block.
 * - `{ type: 'shutdown' }`: calls `process.exit(0)`.
 *
 * Within a batch each candidate gaussian adds `opacity * exp(-0.5 * d2)` to the voxels covered by its
 * extent box; a voxel is marked solid once its accumulated sum reaches
 * `-log1p(-opacityCutoff)`, and contributions with `d2` beyond the `alphaThreshold`-derived bound are
 * skipped. Scratch buffers are sized for batches of at most 4x4x4 blocks (16^3 voxels).
 */
const cpuVoxelizeWorkerScript = `
import { parentPort, workerData } from 'node:worker_threads';
const {
    voxelResolution, opacityCutoff, alphaThreshold, gridMinX, gridMinY, gridMinZ,
    nBlockX, nBlockY, nBlockXY,
    xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents
} = workerData;
const x = new Float32Array(xCol);
const y = new Float32Array(yCol);
const z = new Float32Array(zCol);
const sx = new Float32Array(sxCol);
const sy = new Float32Array(syCol);
const sz = new Float32Array(szCol);
const qx = new Float32Array(qxCol);
const qy = new Float32Array(qyCol);
const qz = new Float32Array(qzCol);
const qw = new Float32Array(qwCol);
const a = new Float32Array(aCol);
const ext = new Float32Array(extents);
const half = voxelResolution * 0.5;
const sigmaCutoff = opacityCutoff <= 0 ? 0 : -Math.log1p(-Math.min(opacityCutoff, 1 - 1e-8));
const SPEC_STRIDE = 8;
const BATCH_BLOCK_SIZE = 4;
const BATCH_VOXEL_SIZE = BATCH_BLOCK_SIZE * 4;
const MAX_BATCH_VOXELS = BATCH_VOXEL_SIZE * BATCH_VOXEL_SIZE * BATCH_VOXEL_SIZE;
const MAX_BATCH_BLOCKS = BATCH_BLOCK_SIZE * BATCH_BLOCK_SIZE * BATCH_BLOCK_SIZE;
const sigmaBuffer = new Float32Array(MAX_BATCH_VOXELS);
const solidBuffer = new Uint8Array(MAX_BATCH_VOXELS);
const masksLoBuffer = new Uint32Array(MAX_BATCH_BLOCKS);
const masksHiBuffer = new Uint32Array(MAX_BATCH_BLOCKS);
function runBatchSet(batchSpecs, candidateIndices) {
    const specs = new Uint32Array(batchSpecs);
    const candidates = new Uint32Array(candidateIndices);
    const packedBlocks = [];
    const batchCount = specs.length / SPEC_STRIDE;
    for (let specIdx = 0; specIdx < batchCount; specIdx++) {
        const specBase = specIdx * SPEC_STRIDE;
        const batchBlockX = specs[specBase + 0];
        const batchBlockY = specs[specBase + 1];
        const batchBlockZ = specs[specBase + 2];
        const numBlocksX = specs[specBase + 3];
        const numBlocksY = specs[specBase + 4];
        const numBlocksZ = specs[specBase + 5];
        const indexOffset = specs[specBase + 6];
        const indexCount = specs[specBase + 7];
        const numVoxelsX = numBlocksX * 4;
        const numVoxelsY = numBlocksY * 4;
        const numVoxelsZ = numBlocksZ * 4;
        const totalVoxels = numVoxelsX * numVoxelsY * numVoxelsZ;
        const totalBlocks = numBlocksX * numBlocksY * numBlocksZ;
        sigmaBuffer.fill(0, 0, totalVoxels);
        solidBuffer.fill(0, 0, totalVoxels);
        masksLoBuffer.fill(0, 0, totalBlocks);
        masksHiBuffer.fill(0, 0, totalBlocks);
        const batchMinVoxelX = batchBlockX * 4;
        const batchMinVoxelY = batchBlockY * 4;
        const batchMinVoxelZ = batchBlockZ * 4;
        const batchMaxVoxelX = batchMinVoxelX + numVoxelsX - 1;
        const batchMaxVoxelY = batchMinVoxelY + numVoxelsY - 1;
        const batchMaxVoxelZ = batchMinVoxelZ + numVoxelsZ - 1;
        for (let c = 0; c < indexCount; c++) {
            const i = candidates[indexOffset + c];
            const xi = x[i];
            const yi = y[i];
            const zi = z[i];
            const opacity = a[i];
            if (opacity <= 0) continue;
            const maxContributionD2 = alphaThreshold <= 0 ? Infinity : (opacity <= alphaThreshold ? 0 : -2 * Math.log(alphaThreshold / opacity));
            if (maxContributionD2 <= 0) continue;
            const ex = ext[i * 3];
            const ey = ext[i * 3 + 1];
            const ez = ext[i * 3 + 2];
            const minIx = Math.max(batchMinVoxelX, Math.floor((xi - ex - gridMinX) / voxelResolution));
            const minIy = Math.max(batchMinVoxelY, Math.floor((yi - ey - gridMinY) / voxelResolution));
            const minIz = Math.max(batchMinVoxelZ, Math.floor((zi - ez - gridMinZ) / voxelResolution));
            const maxIx = Math.min(batchMaxVoxelX, Math.ceil((xi + ex - gridMinX) / voxelResolution));
            const maxIy = Math.min(batchMaxVoxelY, Math.ceil((yi + ey - gridMinY) / voxelResolution));
            const maxIz = Math.min(batchMaxVoxelZ, Math.ceil((zi + ez - gridMinZ) / voxelResolution));
            if (minIx > maxIx || minIy > maxIy || minIz > maxIz) continue;
            const iqx = -qx[i], iqy = -qy[i], iqz = -qz[i], iqw = qw[i];
            const isx = sx[i] > 1e-8 ? 1 / sx[i] : 1e8;
            const isy = sy[i] > 1e-8 ? 1 / sy[i] : 1e8;
            const isz = sz[i] > 1e-8 ? 1 / sz[i] : 1e8;
            for (let iz = minIz; iz <= maxIz; iz++) {
                const localZ = iz - batchMinVoxelZ;
                const vz = gridMinZ + (iz + 0.5) * voxelResolution;
                for (let iy = minIy; iy <= maxIy; iy++) {
                    const localY = iy - batchMinVoxelY;
                    const vy = gridMinY + (iy + 0.5) * voxelResolution;
                    for (let ix = minIx; ix <= maxIx; ix++) {
                        const localX = ix - batchMinVoxelX;
                        const localIndex = localX + localY * numVoxelsX + localZ * numVoxelsX * numVoxelsY;
                        if (solidBuffer[localIndex]) continue;
                        const vx = gridMinX + (ix + 0.5) * voxelResolution;
                        const px = Math.min(Math.max(xi, vx - half), vx + half);
                        const py = Math.min(Math.max(yi, vy - half), vy + half);
                        const pz = Math.min(Math.max(zi, vz - half), vz + half);
                        const dx = px - xi;
                        const dy = py - yi;
                        const dz = pz - zi;
                        const tx = 2 * (iqy * dz - iqz * dy);
                        const ty = 2 * (iqz * dx - iqx * dz);
                        const tz = 2 * (iqx * dy - iqy * dx);
                        const lx = dx + iqw * tx + (iqy * tz - iqz * ty);
                        const ly = dy + iqw * ty + (iqz * tx - iqx * tz);
                        const lz = dz + iqw * tz + (iqx * ty - iqy * tx);
                        const sxv = lx * isx;
                        const syv = ly * isy;
                        const szv = lz * isz;
                        const d2 = sxv * sxv + syv * syv + szv * szv;
                        if (d2 > maxContributionD2) continue;
                        const contribution = opacity * Math.exp(-0.5 * d2);
                        if (contribution <= 0) continue;
                        const total = sigmaBuffer[localIndex] + contribution;
                        sigmaBuffer[localIndex] = total;
                        if (total < sigmaCutoff) continue;
                        solidBuffer[localIndex] = 1;
                        const localBlockX = localX >> 2;
                        const localBlockY = localY >> 2;
                        const localBlockZ = localZ >> 2;
                        const localBlock = localBlockX + localBlockY * numBlocksX + localBlockZ * numBlocksX * numBlocksY;
                        const bitIdx = (localX & 3) + ((localY & 3) << 2) + ((localZ & 3) << 4);
                        if (bitIdx < 32) masksLoBuffer[localBlock] = (masksLoBuffer[localBlock] | (1 << bitIdx)) >>> 0;
                        else masksHiBuffer[localBlock] = (masksHiBuffer[localBlock] | (1 << (bitIdx - 32))) >>> 0;
                    }
                }
            }
        }
        for (let localBlock = 0; localBlock < totalBlocks; localBlock++) {
            const lo = masksLoBuffer[localBlock];
            const hi = masksHiBuffer[localBlock];
            if ((lo | hi) === 0) continue;
            const localBlockX = localBlock % numBlocksX;
            const localBlockY = ((localBlock / numBlocksX) | 0) % numBlocksY;
            const localBlockZ = (localBlock / (numBlocksX * numBlocksY)) | 0;
            const blockIdx = (batchBlockX + localBlockX) + (batchBlockY + localBlockY) * nBlockX + (batchBlockZ + localBlockZ) * nBlockXY;
            packedBlocks.push(blockIdx >>> 0, lo >>> 0, hi >>> 0);
        }
    }
    const packed = new Uint32Array(packedBlocks.length);
    packed.set(packedBlocks);
    return packed.buffer;
}
parentPort.on('message', (msg) => {
    if (!msg || typeof msg !== 'object') return;
    if (msg.type === 'shutdown') {
        process.exit(0);
        return;
    }
    if (msg.type !== 'run') return;
    const taskId = msg.taskId;
    const packed = runBatchSet(msg.batchSpecs, msg.candidateIndices);
    parentPort.postMessage({ taskId, packed }, [packed]);
});
`;
477
/**
 * Copies a numeric column into a new `SharedArrayBuffer` holding 32-bit floats.
 *
 * The buffer is sized from the element count (`src.length * 4` bytes) rather than from
 * `src.byteLength`, so sources that are not `Float32Array` are handled correctly: a `Float64Array`
 * no longer produces a buffer twice as long as needed (zero padded), and a plain array, which has
 * no `byteLength`, no longer fails. For `Float32Array` input the result is unchanged.
 *
 * @param src Array-like of numbers (typically a `Float32Array` column).
 * @returns A `SharedArrayBuffer` containing `src.length` float32 values copied from `src`.
 */
function toSharedFloat32(src) {
    const sab = new SharedArrayBuffer(src.length * Float32Array.BYTES_PER_ELEMENT);
    new Float32Array(sab).set(src);
    return sab;
}
482
/**
 * Simplified single-threaded voxelizer.
 *
 * Every gaussian is tested on its own (no density accumulation across gaussians): a voxel is marked
 * occupied when the squared scaled distance from the gaussian centre to the closest point of the voxel,
 * measured in the gaussian's local frame, is within `-2 * ln(cutoff / opacity)`, with `cutoff` being
 * `opacityCutoff` clamped to [0, 1]. Gaussians with non-positive opacity, with opacity at or below the
 * cutoff, or whose centre lies outside `gridBounds` are ignored.
 *
 * @param xCol,yCol,zCol Gaussian centres.
 * @param sxCol,syCol,szCol Gaussian scales (values <= 1e-8 are treated as an inverse scale of 1e8).
 * @param qxCol,qyCol,qzCol,qwCol Gaussian rotations; used as given, without normalization.
 * @param aCol Gaussian opacities.
 * @param extents Half-extents, three consecutive values (x, y, z) per gaussian.
 * @param gridBounds Object with `min` / `max` points (`x`, `y`, `z`) delimiting the voxel grid.
 * @param voxelResolution Edge length of a voxel.
 * @param opacityCutoff Per-gaussian contribution threshold.
 * @returns A `BlockMaskBuffer` with one `(block index, low word, high word)` entry per non-empty 4x4x4 block.
 */
function cpuVoxelizeSingleThread(xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents, gridBounds, voxelResolution, opacityCutoff) {
    const lower = gridBounds.min;
    const upper = gridBounds.max;
    // Each axis has at least 4 voxels, i.e. at least one full block.
    const voxelsAlong = (from, to) => Math.max(4, Math.round((to - from) / voxelResolution));
    const nx = voxelsAlong(lower.x, upper.x);
    const ny = voxelsAlong(lower.y, upper.y);
    const nz = voxelsAlong(lower.z, upper.z);
    const originX = lower.x;
    const originY = lower.y;
    const originZ = lower.z;
    const limitX = upper.x;
    const limitY = upper.y;
    const limitZ = upper.z;
    const halfVoxel = voxelResolution * 0.5;
    const blocksPerRow = (nx + 3) >> 2;
    const blocksPerSlice = blocksPerRow * ((ny + 3) >> 2);
    const cutoff = Math.min(Math.max(opacityCutoff, 0), 1);
    // Linear block index -> [low 32 bits, high 32 bits] of the 64-voxel occupancy mask.
    const masksByBlock = {};
    for (let g = 0; g < xCol.length; g++) {
        const cx = xCol[g];
        const cy = yCol[g];
        const cz = zCol[g];
        const alpha = aCol[g];
        if (alpha <= 0) {
            continue;
        }
        // Only gaussians centred inside the grid are considered.
        if (cx < originX || cx > limitX || cy < originY || cy > limitY || cz < originZ || cz > limitZ) {
            continue;
        }
        // Largest squared distance at which this gaussian still reaches the cutoff.
        let reach;
        if (cutoff <= 0) {
            reach = Infinity;
        }
        else if (alpha <= cutoff) {
            reach = 0;
        }
        else {
            reach = -2 * Math.log(cutoff / alpha);
        }
        if (reach <= 0) {
            continue;
        }
        const extentBase = g * 3;
        const ex = extents[extentBase];
        const ey = extents[extentBase + 1];
        const ez = extents[extentBase + 2];
        // Inclusive voxel index range covered by the extent box, clamped to the grid.
        const firstX = Math.max(0, Math.floor((cx - ex - originX) / voxelResolution));
        const firstY = Math.max(0, Math.floor((cy - ey - originY) / voxelResolution));
        const firstZ = Math.max(0, Math.floor((cz - ez - originZ) / voxelResolution));
        const lastX = Math.min(nx - 1, Math.ceil((cx + ex - originX) / voxelResolution));
        const lastY = Math.min(ny - 1, Math.ceil((cy + ey - originY) / voxelResolution));
        const lastZ = Math.min(nz - 1, Math.ceil((cz + ez - originZ) / voxelResolution));
        if (firstX > lastX || firstY > lastY || firstZ > lastZ) {
            continue;
        }
        // Conjugate quaternion (input quaternions are already normalized) for the inverse rotation.
        const rx = -qxCol[g];
        const ry = -qyCol[g];
        const rz = -qzCol[g];
        const rw = qwCol[g];
        const invSx = sxCol[g] > 1e-8 ? 1 / sxCol[g] : 1e8;
        const invSy = syCol[g] > 1e-8 ? 1 / syCol[g] : 1e8;
        const invSz = szCol[g] > 1e-8 ? 1 / szCol[g] : 1e8;
        for (let kz = firstZ; kz <= lastZ; kz++) {
            const centerZ = originZ + (kz + 0.5) * voxelResolution;
            // Offset from the gaussian centre to the closest point of the voxel, per axis.
            const dz = Math.min(Math.max(cz, centerZ - halfVoxel), centerZ + halfVoxel) - cz;
            for (let ky = firstY; ky <= lastY; ky++) {
                const centerY = originY + (ky + 0.5) * voxelResolution;
                const dy = Math.min(Math.max(cy, centerY - halfVoxel), centerY + halfVoxel) - cy;
                for (let kx = firstX; kx <= lastX; kx++) {
                    const centerX = originX + (kx + 0.5) * voxelResolution;
                    const dx = Math.min(Math.max(cx, centerX - halfVoxel), centerX + halfVoxel) - cx;
                    // Rotate the offset into the gaussian's local frame: v + w*t + q x t, t = 2 (q x v).
                    const tx = 2 * (ry * dz - rz * dy);
                    const ty = 2 * (rz * dx - rx * dz);
                    const tz = 2 * (rx * dy - ry * dx);
                    const localX = dx + rw * tx + (ry * tz - rz * ty);
                    const localY = dy + rw * ty + (rz * tx - rx * tz);
                    const localZ = dz + rw * tz + (rx * ty - ry * tx);
                    const ux = localX * invSx;
                    const uy = localY * invSy;
                    const uz = localZ * invSz;
                    if (ux * ux + uy * uy + uz * uz > reach) {
                        continue;
                    }
                    const blockKey = (kx >> 2) + (ky >> 2) * blocksPerRow + (kz >> 2) * blocksPerSlice;
                    const bit = (kx & 3) + ((ky & 3) << 2) + ((kz & 3) << 4);
                    let mask = masksByBlock[blockKey];
                    if (mask === undefined) {
                        mask = [0, 0];
                        masksByBlock[blockKey] = mask;
                    }
                    const word = bit < 32 ? 0 : 1;
                    mask[word] = (mask[word] | (1 << (bit & 31))) >>> 0;
                }
            }
        }
    }
    const output = new BlockMaskBuffer();
    for (const key of Object.keys(masksByBlock)) {
        const mask = masksByBlock[key];
        output.addBlock(Number(key), mask[0], mask[1]);
    }
    return output;
}
593
/**
 * CPU voxelization using a pool of worker threads.
 *
 * The grid is split into batches of up to 4x4x4 blocks. For each batch the candidate gaussians are
 * gathered with a `GaussianBVH` query, several batches are packed into one task (bounded by
 * `maxPendingBatches` / `maxPendingIndices`), and tasks are handed to whichever worker is free.
 * Workers reply with packed `(block index, low word, high word)` triples that are appended to the
 * returned `BlockMaskBuffer`.
 *
 * Failure handling: any error raised while the pool is running is logged as a warning and the
 * simplified single-threaded voxelizer is used instead. All workers that were started are always
 * terminated, on success and on failure, so a failed run cannot leave worker threads alive.
 *
 * @param xCol,yCol,zCol Gaussian centres.
 * @param sxCol,syCol,szCol Gaussian scales.
 * @param qxCol,qyCol,qzCol,qwCol Gaussian rotations.
 * @param aCol Gaussian opacities.
 * @param extents Half-extents, three consecutive values per gaussian.
 * @param gridBounds Object with `min` / `max` points delimiting the voxel grid.
 * @param voxelResolution Edge length of a voxel.
 * @param opacityCutoff Opacity threshold forwarded to the workers (and to the fallback).
 * @param options Optional settings: `workerCount` (see `resolveCpuVoxelWorkerCount`) and
 *   `alphaThreshold` (defaults to `ALPHA_THRESHOLD`).
 * @returns Promise of the `BlockMaskBuffer` describing the occupied voxels.
 */
export async function cpuVoxelize(xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents, gridBounds, voxelResolution, opacityCutoff, options) {
    if (xCol.length < CPU_VOXEL_PARALLEL_MIN_GAUSSIANS) {
        return cpuVoxelizeSingleThread(xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents, gridBounds, voxelResolution, opacityCutoff);
    }
    const nx = Math.max(4, Math.round((gridBounds.max.x - gridBounds.min.x) / voxelResolution));
    const ny = Math.max(4, Math.round((gridBounds.max.y - gridBounds.min.y) / voxelResolution));
    const nz = Math.max(4, Math.round((gridBounds.max.z - gridBounds.min.z) / voxelResolution));
    const gridMinX = gridBounds.min.x;
    const gridMinY = gridBounds.min.y;
    const gridMinZ = gridBounds.min.z;
    if (xCol.length === 0) {
        return new BlockMaskBuffer();
    }
    // Never run with an empty pool: racing an empty slot list would never settle.
    const workers = Math.max(1, Math.min(resolveCpuVoxelWorkerCount(options?.workerCount), xCol.length));
    const nBlockX = (nx + 3) >> 2;
    const nBlockY = (ny + 3) >> 2;
    const nBlockXY = nBlockX * nBlockY;
    const totalBlockZ = Math.max(1, (nz + 3) >> 2);
    const batchBlockSize = 4;
    const numBatchX = Math.ceil(nBlockX / batchBlockSize);
    const numBatchY = Math.ceil(nBlockY / batchBlockSize);
    const numBatchZ = Math.ceil(totalBlockZ / batchBlockSize);
    const bvh = new GaussianBVH(xCol, yCol, zCol, extents);
    const shared = {
        xCol: toSharedFloat32(xCol),
        yCol: toSharedFloat32(yCol),
        zCol: toSharedFloat32(zCol),
        sxCol: toSharedFloat32(sxCol),
        syCol: toSharedFloat32(syCol),
        szCol: toSharedFloat32(szCol),
        qxCol: toSharedFloat32(qxCol),
        qyCol: toSharedFloat32(qyCol),
        qzCol: toSharedFloat32(qzCol),
        qwCol: toSharedFloat32(qwCol),
        aCol: toSharedFloat32(aCol),
        extents: toSharedFloat32(extents),
    };
    // Declared outside the try blocks so cleanup can reach every worker that was started,
    // including when a later Worker constructor throws.
    const pool = [];
    try {
        try {
            const output = new BlockMaskBuffer();
            let nextTaskId = 1;
            const createSlot = (slotId) => {
                const worker = new Worker(cpuVoxelizeWorkerScript, {
                    eval: true,
                    workerData: {
                        workerId: slotId,
                        voxelResolution,
                        opacityCutoff,
                        alphaThreshold: options?.alphaThreshold ?? ALPHA_THRESHOLD,
                        gridMinX,
                        gridMinY,
                        gridMinZ,
                        nBlockX,
                        nBlockY,
                        nBlockXY,
                        ...shared,
                    },
                });
                // Settlers of the single in-flight task of this worker (at most one at a time).
                let currentResolve;
                let currentReject;
                const clearPending = () => {
                    currentResolve = undefined;
                    currentReject = undefined;
                };
                worker.on('message', message => {
                    if (!currentResolve) {
                        return;
                    }
                    const resolve = currentResolve;
                    clearPending();
                    const packed = message && typeof message === 'object' && 'packed' in message
                        ? message.packed
                        : message;
                    resolve({ packed });
                });
                worker.on('error', error => {
                    currentReject?.(error);
                    clearPending();
                });
                worker.on('exit', code => {
                    // Any exit while a task is pending fails that task; otherwise it would never settle.
                    currentReject?.(new Error(`cpu voxel worker exited with code ${code}`));
                    clearPending();
                });
                function runTask(batchSpecs, candidateIndices) {
                    return new Promise((resolve, reject) => {
                        if (currentResolve) {
                            reject(new Error(`cpu voxel worker ${slotId} received concurrent task`));
                            return;
                        }
                        currentResolve = resolve;
                        currentReject = reject;
                        const taskId = nextTaskId++;
                        const batchSpecsBuffer = batchSpecs.buffer;
                        const candidateIndicesBuffer = candidateIndices.buffer;
                        worker.postMessage({
                            type: 'run',
                            taskId,
                            workerId: slotId,
                            batchSpecs: batchSpecsBuffer,
                            candidateIndices: candidateIndicesBuffer,
                        }, [batchSpecsBuffer, candidateIndicesBuffer]);
                    });
                }
                return { worker, runTask };
            };
            for (let slotId = 0; slotId < workers; slotId++) {
                pool.push(createSlot(slotId));
            }
            function addPackedResult(buf) {
                const packed = new Uint32Array(buf);
                for (let i = 0; i < packed.length; i += 3) {
                    output.addBlock(packed[i], packed[i + 1], packed[i + 2]);
                }
            }
            // One promise per worker; each resolves to its slot id once the worker is idle again.
            const availableSlots = pool.map((_slot, slotId) => Promise.resolve(slotId));
            async function dispatchTask(batchSpecs, candidateIndices) {
                const slotId = await Promise.race(availableSlots);
                const pending = pool[slotId].runTask(batchSpecs, candidateIndices).then(result => {
                    addPackedResult(result.packed);
                    return slotId;
                });
                // Mark the promise as handled so a failure that nobody is awaiting yet (or anymore,
                // after another worker already failed) is not reported as an unhandled rejection.
                // The rejection itself is still observed through `availableSlots`.
                pending.catch(() => { });
                availableSlots[slotId] = pending;
            }
            const maxPendingBatches = 256;
            const maxPendingIndices = 2 * 1024 * 1024;
            let pendingSpecs = [];
            let pendingCandidates = new Uint32Array(Math.min(Math.max(1024, xCol.length), maxPendingIndices));
            let pendingCandidateCount = 0;
            function ensurePendingCandidateCapacity(needed) {
                if (needed <= pendingCandidates.length) {
                    return;
                }
                const next = new Uint32Array(Math.max(needed, pendingCandidates.length * 2));
                next.set(pendingCandidates.subarray(0, pendingCandidateCount));
                pendingCandidates = next;
            }
            async function flushPendingTask() {
                if (pendingSpecs.length === 0) {
                    return;
                }
                const batchSpecs = new Uint32Array(pendingSpecs);
                // Copy: the buffer is transferred to the worker, the scratch array is reused here.
                const candidateIndices = pendingCandidates.slice(0, pendingCandidateCount);
                pendingSpecs = [];
                pendingCandidateCount = 0;
                await dispatchTask(batchSpecs, candidateIndices);
            }
            for (let bz = 0; bz < numBatchZ; bz++) {
                for (let by = 0; by < numBatchY; by++) {
                    for (let bx = 0; bx < numBatchX; bx++) {
                        const blockX = bx * batchBlockSize;
                        const blockY = by * batchBlockSize;
                        const blockZ = bz * batchBlockSize;
                        const numBlocksX = Math.min(batchBlockSize, nBlockX - blockX);
                        const numBlocksY = Math.min(batchBlockSize, nBlockY - blockY);
                        const numBlocksZ = Math.min(batchBlockSize, totalBlockZ - blockZ);
                        if (numBlocksX <= 0 || numBlocksY <= 0 || numBlocksZ <= 0) {
                            continue;
                        }
                        const minX = gridMinX + blockX * LEAF_SIZE * voxelResolution;
                        const minY = gridMinY + blockY * LEAF_SIZE * voxelResolution;
                        const minZ = gridMinZ + blockZ * LEAF_SIZE * voxelResolution;
                        const maxX = Math.min(gridBounds.max.x, minX + numBlocksX * LEAF_SIZE * voxelResolution);
                        const maxY = Math.min(gridBounds.max.y, minY + numBlocksY * LEAF_SIZE * voxelResolution);
                        const maxZ = Math.min(gridBounds.max.z, minZ + numBlocksZ * LEAF_SIZE * voxelResolution);
                        let overlappingCount = bvh.queryOverlappingRawInto(minX, minY, minZ, maxX, maxY, maxZ, pendingCandidates, pendingCandidateCount);
                        if (overlappingCount === 0) {
                            continue;
                        }
                        if (pendingSpecs.length > 0 &&
                            (pendingSpecs.length / 8 >= maxPendingBatches ||
                                pendingCandidateCount + overlappingCount > maxPendingIndices)) {
                            // Flushing resets the write offset, so the batch is queried again at offset 0.
                            await flushPendingTask();
                            overlappingCount = bvh.queryOverlappingRawInto(minX, minY, minZ, maxX, maxY, maxZ, pendingCandidates, pendingCandidateCount);
                        }
                        const needed = pendingCandidateCount + overlappingCount;
                        if (needed > pendingCandidates.length) {
                            // NOTE(review): presumably the query reports the full overlap count even when the
                            // destination is too small, hence the re-query after growing - confirm in GaussianBVH.
                            ensurePendingCandidateCapacity(needed);
                            overlappingCount = bvh.queryOverlappingRawInto(minX, minY, minZ, maxX, maxY, maxZ, pendingCandidates, pendingCandidateCount);
                        }
                        pendingSpecs.push(blockX, blockY, blockZ, numBlocksX, numBlocksY, numBlocksZ, pendingCandidateCount, overlappingCount);
                        pendingCandidateCount += overlappingCount;
                        if (pendingSpecs.length / 8 >= maxPendingBatches || pendingCandidateCount >= maxPendingIndices) {
                            await flushPendingTask();
                        }
                    }
                }
            }
            await flushPendingTask();
            await Promise.all(availableSlots);
            return output;
        }
        finally {
            // Stop every started worker before returning or falling back, so a failed run does not
            // leave threads alive. Shutdown errors are ignored.
            await Promise.allSettled(pool.map(async (slot) => {
                slot.worker.postMessage({ type: 'shutdown' });
                await slot.worker.terminate();
            }));
        }
    }
    catch (e) {
        logger.warn(`cpu voxel worker failed, using simplified single-thread fallback; ` +
            `voxel result may differ from worker/GPU path: ${e instanceof Error ? e.message : String(e)}`);
        return cpuVoxelizeSingleThread(xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents, gridBounds, voxelResolution, opacityCutoff);
    }
}
798
+ /**
799
+ * GPU voxelization path using tiled multi-batch WGSL dispatch.
800
+ * Per-batch Gaussian indices are built on the GPU (count pass, CPU prefix sum, fill pass) into `indexBuffer`,
801
+ * replacing the BVH `queryOverlappingRaw` queries used by the reference implementation. Batches are packed into mega-dispatches, then read back
802
+ * as per-block 64-bit masks to populate `BlockMaskBuffer`.
803
+ */
804
export async function gpuVoxelize(xCol, yCol, zCol, sxCol, syCol, szCol, qxCol, qyCol, qzCol, qwCol, aCol, extents, gridBounds, voxelResolution, opacityCutoff) {
    // Inputs, as consumed below:
    //   xCol / yCol / zCol    - per-gaussian position columns; `xCol.length` defines the gaussian count.
    //   sxCol / syCol / szCol - per-gaussian scale columns, uploaded unchanged.
    //   qxCol .. qwCol        - per-gaussian rotation quaternion, normalised on the CPU before upload.
    //   aCol                  - per-gaussian value stored next to the position
    //                           (presumably opacity, compared against `opacityCutoff` in WGSL - confirm).
    //   extents               - flat array holding three values per gaussian (`i * 3 + {0, 1, 2}`).
    //   gridBounds            - `{ min: {x, y, z}, max: {x, y, z} }`; the grid is cut into blocks of
    //                           `LEAF_SIZE * voxelResolution` world units per axis.
    //   voxelResolution       - world-space scale factor of one voxel; forwarded to the shader.
    //   opacityCutoff         - forwarded to the voxelization shader.
    // Returns the `BlockMaskBuffer` filled with every block whose 64-bit mask is non-zero.
    // Throws `Error` when a buffer or dispatch would exceed the device limits.
    // All GPU buffers created here are destroyed before returning, on success and on failure.
    const FLOATS_PER_GAUSSIAN = 16;
    const BYTES_PER_GAUSSIAN = FLOATS_PER_GAUSSIAN * 4;
    const UPLOAD_CHUNK_GAUSSIANS = 1 << 18;
    const WORKGROUP_SIZE = 256;
    // Tuning knobs: trade off submit overhead vs. peak memory/latency per mega-dispatch.
    // Upstream caps 16^3-block batches at 256; with this port's 4^3-block
    // batches, 16384 preserves the same max blocks per mega-dispatch.
    const MEGA_MAX_BATCHES = 16384;
    const MEGA_MAX_INDICES = 2 * 1024 * 1024;
    const BATCH_SIZE = 4;
    const MAX_BLOCKS_PER_BATCH = BATCH_SIZE * BATCH_SIZE * BATCH_SIZE;
    // BatchInfo struct in WGSL: 5xu32 + 3xf32 packed as 8xu32 per batch.
    const BATCH_INFO_U32S = 8;

    const blockSize = LEAF_SIZE * voxelResolution;
    const numBlocksX = Math.round((gridBounds.max.x - gridBounds.min.x) / blockSize);
    const numBlocksY = Math.round((gridBounds.max.y - gridBounds.min.y) / blockSize);
    const numBlocksZ = Math.round((gridBounds.max.z - gridBounds.min.z) / blockSize);
    const numBatchX = Math.ceil(numBlocksX / BATCH_SIZE);
    const numBatchY = Math.ceil(numBlocksY / BATCH_SIZE);
    const numBatchZ = Math.ceil(numBlocksZ / BATCH_SIZE);
    const totalBatchCount = numBatchX * numBatchY * numBatchZ;
    const gridMinX = gridBounds.min.x;
    const gridMinY = gridBounds.min.y;
    const gridMinZ = gridBounds.min.z;
    const gaussianCount = xCol.length;
    const batchWorldSize = blockSize * BATCH_SIZE;

    const blockBuffer = new BlockMaskBuffer();
    // Nothing can be voxelized without gaussians or without grid cells. Returning here also avoids
    // creating and binding a zero-byte storage buffer, which WebGPU rejects at bind-group creation.
    if (gaussianCount === 0 || totalBatchCount === 0) {
        return blockBuffer;
    }

    const device = await getOrCreateDevice();

    // Limits are read through Number() because some bindings expose them as bigint. A missing limit
    // becomes NaN, for which the comparisons below are false (i.e. the check is skipped).
    const gaussianBufferBytes = gaussianCount * BYTES_PER_GAUSSIAN;
    const maxBufferSize = Number(device.limits.maxBufferSize);
    if (gaussianBufferBytes > maxBufferSize) {
        throw new Error(`gpuVoxelize: gaussian buffer size ${gaussianBufferBytes} exceeds device maxBufferSize ${maxBufferSize} ` +
            `(gaussianCount=${gaussianCount}, bytesPerGaussian=${BYTES_PER_GAUSSIAN}).`);
    }
    // The gaussian buffer is bound whole as a storage buffer, so the binding limit applies as well.
    const maxStorageBindingSize = Number(device.limits.maxStorageBufferBindingSize);
    if (gaussianBufferBytes > maxStorageBindingSize) {
        throw new Error(`gpuVoxelize: gaussian buffer size ${gaussianBufferBytes} exceeds device maxStorageBufferBindingSize ${maxStorageBindingSize} ` +
            `(gaussianCount=${gaussianCount}, bytesPerGaussian=${BYTES_PER_GAUSSIAN}).`);
    }
    // The count and fill passes use a one-dimensional dispatch with one thread per gaussian.
    const gaussianWorkgroups = Math.ceil(gaussianCount / WORKGROUP_SIZE);
    const maxWorkgroupsPerDimension = Number(device.limits.maxComputeWorkgroupsPerDimension);
    if (gaussianWorkgroups > maxWorkgroupsPerDimension) {
        throw new Error(`gpuVoxelize: ${gaussianWorkgroups} workgroups exceed device maxComputeWorkgroupsPerDimension ${maxWorkgroupsPerDimension} ` +
            `(gaussianCount=${gaussianCount}, workgroupSize=${WORKGROUP_SIZE}).`);
    }

    const batchCountPipeline = device.createComputePipeline({
        layout: 'auto',
        compute: { module: device.createShaderModule({ code: buildPerBatchCountsWgsl() }), entryPoint: 'main' },
    });
    const batchFillPipeline = device.createComputePipeline({
        layout: 'auto',
        compute: { module: device.createShaderModule({ code: fillPerBatchCandidatesWgsl() }), entryPoint: 'main' },
    });
    const voxelPipeline = device.createComputePipeline({
        layout: 'auto',
        compute: { module: device.createShaderModule({ code: voxelizeMultiBatchWgsl() }), entryPoint: 'main' },
    });

    // Every buffer goes through allocBuffer/freeBuffer so the `finally` below can release whatever
    // is still alive, no matter where an error interrupts the pipeline.
    const liveBuffers = new Set();
    const allocBuffer = (descriptor) => {
        const buffer = device.createBuffer(descriptor);
        liveBuffers.add(buffer);
        return buffer;
    };
    const freeBuffer = (buffer) => {
        liveBuffers.delete(buffer);
        buffer.destroy();
    };

    try {
        // ---- Upload gaussians, interleaved as 16 floats each, in bounded chunks. ----
        const gaussianBuffer = allocBuffer({
            size: gaussianBufferBytes,
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST,
        });
        const chunkRows = Math.min(gaussianCount, UPLOAD_CHUNK_GAUSSIANS);
        const interleavedChunk = new Float32Array(chunkRows * FLOATS_PER_GAUSSIAN);
        for (let chunkStart = 0; chunkStart < gaussianCount; chunkStart += chunkRows) {
            const chunkCount = Math.min(chunkRows, gaussianCount - chunkStart);
            for (let j = 0; j < chunkCount; j++) {
                const i = chunkStart + j;
                const offset = j * FLOATS_PER_GAUSSIAN;
                interleavedChunk[offset + 0] = xCol[i];
                interleavedChunk[offset + 1] = yCol[i];
                interleavedChunk[offset + 2] = zCol[i];
                interleavedChunk[offset + 3] = aCol[i];
                const rotW = qwCol[i];
                const rotX = qxCol[i];
                const rotY = qyCol[i];
                const rotZ = qzCol[i];
                // Normalise the quaternion; a zero-length quaternion is uploaded as all zeros.
                const qlen = Math.sqrt(rotW * rotW + rotX * rotX + rotY * rotY + rotZ * rotZ);
                const invLen = qlen > 0 ? 1 / qlen : 0;
                interleavedChunk[offset + 4] = rotW * invLen;
                interleavedChunk[offset + 5] = rotX * invLen;
                interleavedChunk[offset + 6] = rotY * invLen;
                interleavedChunk[offset + 7] = rotZ * invLen;
                interleavedChunk[offset + 8] = sxCol[i];
                interleavedChunk[offset + 9] = syCol[i];
                interleavedChunk[offset + 10] = szCol[i];
                interleavedChunk[offset + 11] = extents[i * 3 + 0];
                interleavedChunk[offset + 12] = extents[i * 3 + 1];
                interleavedChunk[offset + 13] = extents[i * 3 + 2];
                // Slots 14 and 15 are padding up to 16 floats.
                interleavedChunk[offset + 14] = 0;
                interleavedChunk[offset + 15] = 0;
            }
            device.queue.writeBuffer(gaussianBuffer, chunkStart * BYTES_PER_GAUSSIAN, interleavedChunk.buffer, 0, chunkCount * BYTES_PER_GAUSSIAN);
        }

        // ---- Pass 1: count overlapping gaussians per coarse batch. ----
        const batchUniformBuffer = allocBuffer({
            size: 256,
            usage: GPU_BUFFER_USAGE_UNIFORM | GPU_BUFFER_USAGE_COPY_DST,
        });
        const batchCountsBuffer = allocBuffer({
            size: Math.max(4, totalBatchCount * 4),
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST | GPU_BUFFER_USAGE_COPY_SRC,
        });
        const batchCountsReadBuffer = allocBuffer({
            size: Math.max(4, totalBatchCount * 4),
            usage: GPU_BUFFER_USAGE_COPY_DST | GPU_BUFFER_USAGE_MAP_READ,
        });
        // Uniform layout: grid origin xyz + batch world size (f32), then batch grid dims + gaussian count (u32).
        const batchUniformRaw = new Uint32Array(16);
        const batchUniformFloats = new Float32Array(batchUniformRaw.buffer);
        batchUniformFloats[0] = gridMinX;
        batchUniformFloats[1] = gridMinY;
        batchUniformFloats[2] = gridMinZ;
        batchUniformFloats[3] = batchWorldSize;
        batchUniformRaw[4] = numBatchX;
        batchUniformRaw[5] = numBatchY;
        batchUniformRaw[6] = numBatchZ;
        batchUniformRaw[7] = gaussianCount;
        device.queue.writeBuffer(batchUniformBuffer, 0, batchUniformRaw.buffer, 0, 32);
        const countBindGroup = device.createBindGroup({
            layout: batchCountPipeline.getBindGroupLayout(0),
            entries: [
                { binding: 0, resource: { buffer: batchUniformBuffer } },
                { binding: 1, resource: { buffer: gaussianBuffer } },
                { binding: 2, resource: { buffer: batchCountsBuffer } },
            ],
        });
        const zeroBatchCounts = new Uint32Array(Math.max(1, totalBatchCount));
        device.queue.writeBuffer(batchCountsBuffer, 0, zeroBatchCounts);
        // Count overlaps per coarse batch on GPU; copy out for CPU exclusive-prefix into batchCandidateOffsets.
        {
            const encoder = device.createCommandEncoder();
            const pass = encoder.beginComputePass();
            pass.setPipeline(batchCountPipeline);
            pass.setBindGroup(0, countBindGroup);
            pass.dispatchWorkgroups(gaussianWorkgroups, 1, 1);
            pass.end();
            encoder.copyBufferToBuffer(batchCountsBuffer, 0, batchCountsReadBuffer, 0, totalBatchCount * 4);
            device.queue.submit([encoder.finish()]);
        }
        await batchCountsReadBuffer.mapAsync(GPU_MAP_MODE_READ);
        const countsMapped = new Uint32Array(batchCountsReadBuffer.getMappedRange());
        const batchCandidateCounts = new Uint32Array(totalBatchCount);
        batchCandidateCounts.set(countsMapped.subarray(0, totalBatchCount));
        batchCountsReadBuffer.unmap();
        // The counts now live on the CPU; the GPU-side copies are no longer referenced.
        freeBuffer(batchCountsReadBuffer);
        freeBuffer(batchCountsBuffer);

        // CPU exclusive prefix sum: start offset of each batch's segment inside `indexBuffer`.
        const batchCandidateOffsets = new Uint32Array(totalBatchCount);
        let totalCandidateCount = 0;
        for (let i = 0; i < totalBatchCount; i++) {
            batchCandidateOffsets[i] = totalCandidateCount;
            totalCandidateCount += batchCandidateCounts[i];
        }
        if (totalCandidateCount === 0) {
            // No gaussian overlaps any batch; the `finally` block releases the buffers.
            return blockBuffer;
        }

        // ---- Pass 2: scatter gaussian indices into per-batch segments. ----
        const indexBufferBytes = totalCandidateCount * 4;
        if (indexBufferBytes > maxBufferSize || indexBufferBytes > maxStorageBindingSize) {
            throw new Error(`gpuVoxelize: index buffer size ${indexBufferBytes} exceeds device limits ` +
                `(maxBufferSize=${maxBufferSize}, maxStorageBufferBindingSize=${maxStorageBindingSize}, ` +
                `totalCandidateCount=${totalCandidateCount}).`);
        }
        const batchOffsetsBuffer = allocBuffer({
            size: batchCandidateOffsets.byteLength,
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST,
        });
        device.queue.writeBuffer(batchOffsetsBuffer, 0, batchCandidateOffsets);
        const batchWriteHeadsBuffer = allocBuffer({
            size: Math.max(4, totalBatchCount * 4),
            usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST,
        });
        device.queue.writeBuffer(batchWriteHeadsBuffer, 0, zeroBatchCounts);
        // Packed gaussian indices for all batches (size = totalCandidateCount); filled by GPU scatter pass.
        const indexBuffer = allocBuffer({
            size: indexBufferBytes,
            usage: GPU_BUFFER_USAGE_STORAGE,
        });
        // GPU scatter pass: write gaussian indices into each batch segment of `indexBuffer`.
        const fillBindGroup = device.createBindGroup({
            layout: batchFillPipeline.getBindGroupLayout(0),
            entries: [
                { binding: 0, resource: { buffer: batchUniformBuffer } },
                { binding: 1, resource: { buffer: gaussianBuffer } },
                { binding: 2, resource: { buffer: batchOffsetsBuffer } },
                { binding: 3, resource: { buffer: batchWriteHeadsBuffer } },
                { binding: 4, resource: { buffer: indexBuffer } },
            ],
        });
        {
            const encoder = device.createCommandEncoder();
            const pass = encoder.beginComputePass();
            pass.setPipeline(batchFillPipeline);
            pass.setBindGroup(0, fillBindGroup);
            pass.dispatchWorkgroups(gaussianWorkgroups, 1, 1);
            pass.end();
            device.queue.submit([encoder.finish()]);
        }

        // ---- Pass 3: voxelize batches in mega-dispatches, double-buffered over two slots. ----
        const initialResultBytes = MEGA_MAX_BATCHES * MAX_BLOCKS_PER_BATCH * 2 * 4;
        const initialBatchInfoBytes = MEGA_MAX_BATCHES * BATCH_INFO_U32S * 4;
        // The voxelization uniform never changes during a call, so it is built once and uploaded
        // when a slot is created: opacity cutoff (f32), voxel resolution (f32), blocks per batch (u32).
        const voxelUniformRaw = new Uint32Array(16);
        const voxelUniformFloats = new Float32Array(voxelUniformRaw.buffer);
        voxelUniformFloats[0] = opacityCutoff;
        voxelUniformFloats[1] = voxelResolution;
        voxelUniformRaw[2] = MAX_BLOCKS_PER_BATCH;

        // Single source of truth for the voxel pass bindings; used on creation and after any resize.
        const buildSlotBindGroup = (slot) => device.createBindGroup({
            layout: voxelPipeline.getBindGroupLayout(0),
            entries: [
                { binding: 0, resource: { buffer: slot.uniformBuffer } },
                { binding: 1, resource: { buffer: gaussianBuffer } },
                { binding: 2, resource: { buffer: indexBuffer } },
                { binding: 3, resource: { buffer: slot.resultsBuffer } },
                { binding: 4, resource: { buffer: slot.batchInfoBuffer } },
            ],
        });
        const createSlot = () => {
            const slot = {
                uniformBuffer: allocBuffer({
                    size: 256,
                    usage: GPU_BUFFER_USAGE_UNIFORM | GPU_BUFFER_USAGE_COPY_DST,
                }),
                resultsBuffer: allocBuffer({
                    size: initialResultBytes,
                    usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_SRC | GPU_BUFFER_USAGE_COPY_DST,
                }),
                readBuffer: allocBuffer({
                    size: initialResultBytes,
                    usage: GPU_BUFFER_USAGE_COPY_DST | GPU_BUFFER_USAGE_MAP_READ,
                }),
                batchInfoBuffer: allocBuffer({
                    size: initialBatchInfoBytes,
                    usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST,
                }),
                bindGroup: undefined,
                resultsBufferSize: initialResultBytes,
                batchInfoCapacityBytes: initialBatchInfoBytes,
            };
            device.queue.writeBuffer(slot.uniformBuffer, 0, voxelUniformRaw.buffer, 0, 12);
            slot.bindGroup = buildSlotBindGroup(slot);
            return slot;
        };
        const slots = [createSlot(), createSlot()];
        let currentSlot = 0;
        // Promise of the most recently submitted mega-dispatch whose results are not yet merged.
        let inflight;

        // Grows a slot's buffers when a mega-dispatch needs more room than currently allocated.
        const ensureSlotCapacity = (slot, batchCount) => {
            const resultBytes = Math.max(8, batchCount * MAX_BLOCKS_PER_BATCH * 2 * 4);
            const batchInfoBytes = Math.max(32, batchCount * BATCH_INFO_U32S * 4);
            let needsRebind = false;
            if (resultBytes > slot.resultsBufferSize) {
                freeBuffer(slot.resultsBuffer);
                freeBuffer(slot.readBuffer);
                // Growth (at least x2) avoids frequent GPU buffer reallocations when batch sizes fluctuate.
                slot.resultsBufferSize = Math.max(slot.resultsBufferSize * 2, resultBytes);
                slot.resultsBuffer = allocBuffer({
                    size: slot.resultsBufferSize,
                    usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_SRC | GPU_BUFFER_USAGE_COPY_DST,
                });
                slot.readBuffer = allocBuffer({
                    size: slot.resultsBufferSize,
                    usage: GPU_BUFFER_USAGE_COPY_DST | GPU_BUFFER_USAGE_MAP_READ,
                });
                needsRebind = true;
            }
            if (batchInfoBytes > slot.batchInfoCapacityBytes) {
                freeBuffer(slot.batchInfoBuffer);
                // Same growth policy as results/read buffers.
                slot.batchInfoCapacityBytes = Math.max(slot.batchInfoCapacityBytes * 2, batchInfoBytes);
                slot.batchInfoBuffer = allocBuffer({
                    size: slot.batchInfoCapacityBytes,
                    usage: GPU_BUFFER_USAGE_STORAGE | GPU_BUFFER_USAGE_COPY_DST,
                });
                needsRebind = true;
            }
            if (needsRebind) {
                slot.bindGroup = buildSlotBindGroup(slot);
            }
        };

        // Merges one mega-dispatch readback (two u32 words per block) into `blockBuffer`.
        const processResults = (masks, batches) => {
            for (let b = 0; b < batches.length; b++) {
                const batch = batches[b];
                const batchResultOffset = b * MAX_BLOCKS_PER_BATCH * 2;
                const totalBatchBlocks = batch.numBlocksX * batch.numBlocksY * batch.numBlocksZ;
                for (let blockIdx = 0; blockIdx < totalBatchBlocks; blockIdx++) {
                    const maskLo = masks[batchResultOffset + blockIdx * 2];
                    const maskHi = masks[batchResultOffset + blockIdx * 2 + 1];
                    if (maskLo === 0 && maskHi === 0) {
                        continue;
                    }
                    // Decode the batch-local linear index (x fastest), then re-linearise in the full grid.
                    const localX = blockIdx % batch.numBlocksX;
                    const localY = Math.floor(blockIdx / batch.numBlocksX) % batch.numBlocksY;
                    const localZ = Math.floor(blockIdx / (batch.numBlocksX * batch.numBlocksY));
                    const blockLinear = batch.bx +
                        localX +
                        (batch.by + localY) * numBlocksX +
                        (batch.bz + localZ) * numBlocksX * numBlocksY;
                    blockBuffer.addBlock(blockLinear, maskLo, maskHi);
                }
            }
        };

        let pendingBatches = [];
        let megaIndexSpan = 0;
        // Submits the queued batches as one mega-dispatch on the next slot, then merges the results
        // of the previously submitted mega-dispatch while the new one runs.
        const flushPendingBatches = async () => {
            if (pendingBatches.length === 0) {
                return;
            }
            const slot = slots[currentSlot];
            currentSlot = (currentSlot + 1) & 1;
            const batchesToSubmit = pendingBatches;
            pendingBatches = [];
            megaIndexSpan = 0;
            ensureSlotCapacity(slot, batchesToSubmit.length);
            const resultsU32Count = batchesToSubmit.length * MAX_BLOCKS_PER_BATCH * 2;
            const batchInfoU32Count = batchesToSubmit.length * BATCH_INFO_U32S;
            // Two views over one allocation: u32 fields 0..4 and f32 fields 5..7 per batch.
            const batchInfoF32 = new Float32Array(batchInfoU32Count);
            const batchInfoU32 = new Uint32Array(batchInfoF32.buffer);
            for (let i = 0; i < batchesToSubmit.length; i++) {
                const batch = batchesToSubmit[i];
                const base = i * BATCH_INFO_U32S;
                batchInfoU32[base + 0] = batch.indexOffset;
                batchInfoU32[base + 1] = batch.indexCount;
                batchInfoU32[base + 2] = batch.numBlocksX;
                batchInfoU32[base + 3] = batch.numBlocksY;
                batchInfoU32[base + 4] = batch.numBlocksZ;
                batchInfoF32[base + 5] = batch.blockMinX;
                batchInfoF32[base + 6] = batch.blockMinY;
                batchInfoF32[base + 7] = batch.blockMinZ;
            }
            device.queue.writeBuffer(slot.batchInfoBuffer, 0, batchInfoU32.buffer, batchInfoU32.byteOffset, batchInfoU32.byteLength);
            const encoder = device.createCommandEncoder();
            encoder.clearBuffer(slot.resultsBuffer, 0, resultsU32Count * 4);
            const pass = encoder.beginComputePass();
            pass.setPipeline(voxelPipeline);
            pass.setBindGroup(0, slot.bindGroup);
            pass.dispatchWorkgroups(MAX_BLOCKS_PER_BATCH, 1, batchesToSubmit.length);
            pass.end();
            encoder.copyBufferToBuffer(slot.resultsBuffer, 0, slot.readBuffer, 0, resultsU32Count * 4);
            device.queue.submit([encoder.finish()]);
            const readback = (async () => {
                await slot.readBuffer.mapAsync(GPU_MAP_MODE_READ);
                const mapped = new Uint32Array(slot.readBuffer.getMappedRange());
                const copied = new Uint32Array(resultsU32Count);
                copied.set(mapped.subarray(0, resultsU32Count));
                slot.readBuffer.unmap();
                return { masks: copied, batches: batchesToSubmit };
            })();
            // Mark the promise as observed so a rejection that lands while the previous task is being
            // awaited (or during cleanup) is not reported as unhandled; the real `await` still rethrows.
            readback.catch(() => { });
            if (inflight) {
                const done = await inflight;
                processResults(done.masks, done.batches);
            }
            inflight = readback;
        };

        for (let bz = 0; bz < numBatchZ; bz++) {
            for (let by = 0; by < numBatchY; by++) {
                for (let bx = 0; bx < numBatchX; bx++) {
                    const batchId = bz * numBatchX * numBatchY + by * numBatchX + bx;
                    const indexCount = batchCandidateCounts[batchId];
                    if (indexCount === 0) {
                        continue;
                    }
                    const bxBlock = bx * BATCH_SIZE;
                    const byBlock = by * BATCH_SIZE;
                    const bzBlock = bz * BATCH_SIZE;
                    // Batches on the far grid boundary may cover fewer than BATCH_SIZE blocks per axis.
                    const currBatchX = Math.min(BATCH_SIZE, numBlocksX - bxBlock);
                    const currBatchY = Math.min(BATCH_SIZE, numBlocksY - byBlock);
                    const currBatchZ = Math.min(BATCH_SIZE, numBlocksZ - bzBlock);
                    // World-space origin of this batch's block grid (up to BATCH_SIZE^3 = 4^3 blocks);
                    // indexOffset/indexCount refer to `indexBuffer`.
                    const blockMinX = gridMinX + bxBlock * blockSize;
                    const blockMinY = gridMinY + byBlock * blockSize;
                    const blockMinZ = gridMinZ + bzBlock * blockSize;
                    pendingBatches.push({
                        indexOffset: batchCandidateOffsets[batchId],
                        indexCount,
                        blockMinX,
                        blockMinY,
                        blockMinZ,
                        numBlocksX: currBatchX,
                        numBlocksY: currBatchY,
                        numBlocksZ: currBatchZ,
                        bx: bxBlock,
                        by: byBlock,
                        bz: bzBlock,
                    });
                    megaIndexSpan += indexCount;
                    if (pendingBatches.length >= MEGA_MAX_BATCHES || megaIndexSpan >= MEGA_MAX_INDICES) {
                        await flushPendingBatches();
                    }
                }
            }
        }
        await flushPendingBatches();
        if (inflight) {
            const done = await inflight;
            processResults(done.masks, done.batches);
        }
        return blockBuffer;
    }
    finally {
        // Runs on success, early return and failure alike, so no GPU buffer outlives the call.
        for (const buffer of liveBuffers) {
            buffer.destroy();
        }
        liveBuffers.clear();
    }
}