rayzee 6.1.0 → 6.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,44 +7,23 @@ import { Ray, HitInfo } from '../TSL/Struct.js';
7
7
  import { traverseBVH } from '../TSL/BVHTraversal.js';
8
8
 
9
9
  /**
10
- * NormalDepth Stage for WebGPU (Compute Shader)
10
+ * NormalDepth primary-ray G-buffer for SVGF gates.
11
11
  *
12
- * Produces a G-buffer containing surface normals and linear depth by casting
13
- * primary rays through the BVH. This is a lightweight pass (~1-2 ms) that
14
- * shares the same BVH / triangle / material storage buffers as the path tracer.
12
+ * RGB = geometric world normal · 0.5 + 0.5, A = linear ray distance (sky=1e6).
13
+ * Geometric (not shading) normals because shading normals carry sub-pixel
14
+ * jitter that breaks the temporal gate's same-pixel-across-frames comparison.
15
+ * The path tracer's MRT already carries shading normals for OIDN; this stage
16
+ * is a separate, cheap, jitter-free signal for the denoiser.
15
17
  *
16
- * The output is required by denoising stages (ASVGF, BilateralFiltering)
17
- * and by the MotionVector.
18
+ * Ping-pong RenderTargets hold current/prev. On a dispatch we swap so prev
19
+ * is last frame's geometry. On a skipped dispatch (static camera) prev
20
+ * aliases current — without that aliasing prev would point at older data
21
+ * while this frame's motion vector reflects zero motion → false rejection.
18
22
  *
19
- * Architecture (copy approach — proven working in PathTracer):
20
- * 1. Compute shader writes to a StorageTexture via textureStore
21
- * 2. After dispatch, copyTextureToTexture transfers StorageTexture → RenderTarget
22
- * 3. RenderTarget texture is published to context (NOT StorageTexture —
23
- * cross-dispatch reads from StorageTexture return zeros in Three.js WebGPU)
24
- *
25
- * Output format (RGBA Float):
26
- * RGB — world-space normal encoded as (N * 0.5 + 0.5)
27
- * A — linear depth (distance along primary ray)
28
- *
29
- * Caching: Only re-renders when the camera moves or the scene is rebuilt.
30
- * During static accumulation the previous result is reused.
31
- *
32
- * Execution mode: ALWAYS (but internal dirty flag skips redundant work)
33
- *
34
- * Events listened:
35
- * camera:moved — mark dirty
36
- * pipeline:reset — mark dirty
37
- *
38
- * Textures published:
39
- * pathtracer:normalDepth — RGBA Float G-buffer (from RenderTarget, not StorageTexture)
23
+ * Publishes: pathtracer:normalDepth, pathtracer:prevNormalDepth
40
24
  */
41
25
  export class NormalDepth extends RenderStage {
42
26
 
43
- /**
44
- * @param {WebGPURenderer} renderer
45
- * @param {Object} options
46
- * @param {Object} options.pathTracer — reference to PathTracer (for shared buffers)
47
- */
48
27
  constructor( renderer, options = {} ) {
49
28
 
50
29
  super( 'NormalDepth', {
@@ -55,62 +34,48 @@ export class NormalDepth extends RenderStage {
55
34
  this.renderer = renderer;
56
35
  this.pathTracer = options.pathTracer;
57
36
 
58
- // Dirty flag — only re-render when true
59
37
  this._dirty = true;
60
38
 
61
- // Own camera uniforms (updated from PathTracer values each frame)
62
39
  this.cameraWorldMatrix = uniform( new Matrix4(), 'mat4' );
63
40
  this.cameraProjectionMatrixInverse = uniform( new Matrix4(), 'mat4' );
64
-
65
- // Resolution uniforms
66
41
  this.resolutionWidth = uniform( options.width || 1 );
67
42
  this.resolutionHeight = uniform( options.height || 1 );
68
43
 
69
44
  const w = options.width || 1;
70
45
  const h = options.height || 1;
71
46
 
72
- // Write-only StorageTexture (compute output)
73
47
  this._outputStorageTex = new StorageTexture( w, h );
74
48
  this._outputStorageTex.type = HalfFloatType;
75
49
  this._outputStorageTex.format = RGBAFormat;
76
50
  this._outputStorageTex.minFilter = NearestFilter;
77
51
  this._outputStorageTex.magFilter = NearestFilter;
78
52
 
79
- // Readable RenderTarget (copy destination published to context)
80
- this.renderTarget = new RenderTarget( w, h, {
53
+ // Ping-pong RTs share format with the StorageTexture so copyTextureToTexture works.
54
+ const rtOpts = {
81
55
  type: HalfFloatType,
82
56
  format: RGBAFormat,
83
57
  minFilter: NearestFilter,
84
58
  magFilter: NearestFilter,
85
59
  depthBuffer: false,
86
60
  stencilBuffer: false
87
- } );
61
+ };
62
+ this._rtA = new RenderTarget( w, h, rtOpts );
63
+ this._rtB = new RenderTarget( w, h, rtOpts );
64
+ this._currentIdx = 0;
65
+ this._hasHistory = false;
88
66
 
89
- // Dispatch dimensions (8x8 workgroups)
90
67
  this._dispatchX = Math.ceil( w / 8 );
91
68
  this._dispatchY = Math.ceil( h / 8 );
92
69
 
93
- // Own storage nodes — created lazily when data is available
94
70
  this._triStorageNode = null;
95
71
  this._bvhStorageNode = null;
96
-
97
- // Last-seen attribute identities. PathTracer replaces these in-place
98
- // across model load / BVH rebuild; the compute's bind group is locked
99
- // to whatever buffer was bound at pipeline compile time, so we rebuild
100
- // when any of them swaps to a new object.
101
72
  this._lastTriAttr = null;
102
73
  this._lastBvhAttr = null;
103
-
104
- // Compute node — built once when storage buffers are ready
105
74
  this._computeNode = null;
106
75
  this._computeBuilt = false;
107
76
 
108
77
  }
109
78
 
110
- // ──────────────────────────────────────────────────
111
- // Pipeline lifecycle
112
- // ──────────────────────────────────────────────────
113
-
114
79
  setupEventListeners() {
115
80
 
116
81
  this.on( 'camera:moved', () => {
@@ -122,40 +87,23 @@ export class NormalDepth extends RenderStage {
122
87
  this.on( 'pipeline:reset', () => {
123
88
 
124
89
  this._dirty = true;
90
+ this._hasHistory = false;
125
91
 
126
92
  } );
127
93
 
128
94
  }
129
95
 
130
- // ──────────────────────────────────────────────────
131
- // Storage buffer synchronisation
132
- // ──────────────────────────────────────────────────
133
-
134
- /**
135
- * Synchronise storage buffer nodes from PathTracer.
136
- *
137
- * Creates own `storage()` nodes pointing at the same underlying
138
- * StorageInstancedBufferAttribute so the GPU buffer is shared,
139
- * but each compute node has its own binding (avoids the module-scope
140
- * TextureNode issue that breaks MRT).
141
- */
142
96
  _syncStorageBuffers() {
143
97
 
144
98
  const pt = this.pathTracer;
145
99
  if ( ! pt ) return false;
146
100
 
147
- // Detect attribute identity swap (PathTracer.setTriangleData /
148
- // setBVHData replace the attribute object on growth). The compute
149
- // node's bind group is locked to the buffer bound at compile time —
150
- // updating the storage node's .value alone leaves the GPU binding
151
- // pointing at the now-discarded buffer, so every traversal misses.
152
101
  const triSwapped = pt.triangleStorageAttr && pt.triangleStorageAttr !== this._lastTriAttr;
153
102
  const bvhSwapped = pt.bvhStorageAttr && pt.bvhStorageAttr !== this._lastBvhAttr;
154
103
 
155
104
  if ( triSwapped || bvhSwapped ) {
156
105
 
157
- // Drop compute + storage nodes so they get rebuilt against the
158
- // current buffers. Cheap: this only happens on model load.
106
+ // Buffer identity changed, so compute's bind group is stale; rebuild.
159
107
  this._computeNode?.dispose?.();
160
108
  this._computeNode = null;
161
109
  this._computeBuilt = false;
@@ -188,10 +136,6 @@ export class NormalDepth extends RenderStage {
188
136
 
189
137
  }
190
138
 
191
- // ──────────────────────────────────────────────────
192
- // Compute node (built once when buffers are ready)
193
- // ──────────────────────────────────────────────────
194
-
195
139
  _buildCompute() {
196
140
 
197
141
  const triStorage = this._triStorageNode;
@@ -204,8 +148,8 @@ export class NormalDepth extends RenderStage {
204
148
 
205
149
  const WG_SIZE = 8;
206
150
 
207
- // Pass mat4 uniforms as Fn parameters so TSL wraps them
208
- // with bracket-indexing support (closure captures don't get this)
151
+ // mat4 uniforms as Fn parameters so TSL emits bracket indexing
152
+ // (closure captures don't get this).
209
153
  const computeFn = Fn( ( [ camWorldMat, camProjInvMat ] ) => {
210
154
 
211
155
  const gx = int( workgroupId.x ).mul( WG_SIZE ).add( int( localId.x ) );
@@ -213,13 +157,13 @@ export class NormalDepth extends RenderStage {
213
157
 
214
158
  If( gx.lessThan( int( resW ) ).and( gy.lessThan( int( resH ) ) ), () => {
215
159
 
216
- // Pixel coordinate → NDC
217
- // Negate Y: in WebGPU, pixel Y=0 at top of screen
160
+ // Pixel center → NDC, Y negated for Three.js WebGPU.
218
161
  const ndcX = float( gx ).add( 0.5 ).div( resW ).mul( 2.0 ).sub( 1.0 );
219
162
  const ndcY = float( gy ).add( 0.5 ).div( resH ).mul( 2.0 ).sub( 1.0 ).negate();
220
163
  const ndcPos = vec3( ndcX, ndcY, 1.0 );
221
164
 
222
- // Camera ray (no DOF)
165
+ // No jitter — deterministic per-pixel ray so the temporal gate
166
+ // sees stable per-pixel normals across frames.
223
167
  const rayDirCS = camProjInvMat.mul( vec4( ndcPos, 1.0 ) );
224
168
  const rayDirWorld = normalize(
225
169
  mat3(
@@ -231,15 +175,11 @@ export class NormalDepth extends RenderStage {
231
175
  const rayOrigin = vec3( camWorldMat[ 3 ] );
232
176
 
233
177
  const ray = Ray( { origin: rayOrigin, direction: rayDirWorld } );
234
-
235
- // BVH traversal (primary ray only) — wrap result for struct field access
236
178
  const hit = HitInfo.wrap( traverseBVH( ray, bvhStorage, triStorage ) );
237
179
 
238
- // Encode: normal * 0.5 + 0.5 in RGB, linear depth in A
239
180
  const encodedNormal = hit.normal.mul( 0.5 ).add( 0.5 );
240
181
  const depth = hit.dst;
241
182
 
242
- // Sky / miss: zero normal, large depth
243
183
  const result = hit.didHit.select(
244
184
  vec4( encodedNormal, depth ),
245
185
  vec4( 0.0, 0.0, 0.0, float( 1e6 ) )
@@ -264,26 +204,15 @@ export class NormalDepth extends RenderStage {
264
204
 
265
205
  }
266
206
 
267
- // ──────────────────────────────────────────────────
268
- // Render
269
- // ──────────────────────────────────────────────────
270
-
271
207
  render( context ) {
272
208
 
273
209
  if ( ! this.enabled ) return;
274
210
 
275
- // Sync storage buffers from path tracer
276
211
  const buffersReady = this._syncStorageBuffers();
277
212
  if ( ! buffersReady ) return;
278
213
 
279
- // Build compute node on first call (deferred until buffers exist)
280
- if ( ! this._computeBuilt ) {
281
-
282
- this._buildCompute();
283
-
284
- }
214
+ if ( ! this._computeBuilt ) this._buildCompute();
285
215
 
286
- // Sync camera uniforms from PathTracer
287
216
  const pt = this.pathTracer;
288
217
  if ( pt ) {
289
218
 
@@ -292,22 +221,24 @@ export class NormalDepth extends RenderStage {
292
221
 
293
222
  }
294
223
 
295
- // Skip if not dirty (camera hasn't moved, scene hasn't changed)
296
- if ( ! this._dirty && this.renderTarget.texture ) {
224
+ // Static camera: republish current and alias prev to current. Without
225
+ // the alias, prev would still hold older geometry while motion vector
226
+ // reflects zero motion → false rejection at every pixel.
227
+ if ( ! this._dirty && this._hasHistory ) {
297
228
 
298
- // Still publish the cached texture
299
- context.setTexture( 'pathtracer:normalDepth', this.renderTarget.texture );
229
+ const currentRT = this._currentIdx === 0 ? this._rtA : this._rtB;
230
+ context.setTexture( 'pathtracer:normalDepth', currentRT.texture );
231
+ context.setTexture( 'pathtracer:prevNormalDepth', currentRT.texture );
300
232
  return;
301
233
 
302
234
  }
303
235
 
304
- // Auto-match size to path tracer output
305
236
  const ptColor = context.getTexture( 'pathtracer:color' );
306
237
  if ( ptColor && ptColor.image ) {
307
238
 
308
239
  const img = ptColor.image;
309
240
  if ( img.width > 0 && img.height > 0 &&
310
- ( img.width !== this.renderTarget.width || img.height !== this.renderTarget.height ) ) {
241
+ ( img.width !== this._rtA.width || img.height !== this._rtA.height ) ) {
311
242
 
312
243
  this.setSize( img.width, img.height );
313
244
 
@@ -315,40 +246,46 @@ export class NormalDepth extends RenderStage {
315
246
 
316
247
  }
317
248
 
318
- // Dispatch compute shader
249
+ // Swap roles: what was current becomes prev, write into the free slot.
250
+ if ( this._hasHistory ) this._currentIdx = 1 - this._currentIdx;
251
+ const writeRT = this._currentIdx === 0 ? this._rtA : this._rtB;
252
+ const prevRT = this._currentIdx === 0 ? this._rtB : this._rtA;
253
+
319
254
  this.renderer.compute( this._computeNode );
255
+ this.renderer.copyTextureToTexture( this._outputStorageTex, writeRT.texture );
256
+
257
+ // First dispatch: seed prev from current so ASVGF doesn't see false
258
+ // disocclusion on frame 1.
259
+ if ( ! this._hasHistory ) {
320
260
 
321
- // Copy StorageTexture → RenderTarget (cross-dispatch reads from
322
- // StorageTexture return zeros — must use RenderTarget for downstream stages)
323
- this.renderer.copyTextureToTexture( this._outputStorageTex, this.renderTarget.texture );
261
+ this.renderer.copyTextureToTexture( this._outputStorageTex, prevRT.texture );
262
+ this._hasHistory = true;
324
263
 
325
- // Publish RenderTarget texture to context
326
- context.setTexture( 'pathtracer:normalDepth', this.renderTarget.texture );
264
+ }
265
+
266
+ context.setTexture( 'pathtracer:normalDepth', writeRT.texture );
267
+ context.setTexture( 'pathtracer:prevNormalDepth', prevRT.texture );
327
268
 
328
- // Clear dirty flag — next frame will reuse cached result
329
269
  this._dirty = false;
330
270
 
331
271
  }
332
272
 
333
- // ──────────────────────────────────────────────────
334
- // Lifecycle
335
- // ──────────────────────────────────────────────────
336
-
337
273
  reset() {
338
274
 
339
275
  this._dirty = true;
276
+ this._hasHistory = false;
340
277
 
341
278
  }
342
279
 
343
280
  setSize( width, height ) {
344
281
 
345
282
  this._outputStorageTex.setSize( width, height );
346
- this.renderTarget.setSize( width, height );
347
- this.renderTarget.texture.needsUpdate = true;
283
+ this._rtA.setSize( width, height );
284
+ this._rtB.setSize( width, height );
285
+ this._hasHistory = false;
348
286
  this.resolutionWidth.value = width;
349
287
  this.resolutionHeight.value = height;
350
288
 
351
- // Update dispatch dimensions
352
289
  this._dispatchX = Math.ceil( width / 8 );
353
290
  this._dispatchY = Math.ceil( height / 8 );
354
291
  if ( this._computeNode ) {
@@ -365,7 +302,8 @@ export class NormalDepth extends RenderStage {
365
302
 
366
303
  this._computeNode?.dispose();
367
304
  this._outputStorageTex?.dispose();
368
- this.renderTarget?.dispose();
305
+ this._rtA?.dispose();
306
+ this._rtB?.dispose();
369
307
 
370
308
  }
371
309
 
@@ -44,37 +44,66 @@ const createStack = () => array( 'int', MAX_STACK_DEPTH ).toVar();
44
44
  // RAY INTERSECTION HELPERS (inlined for BVH traversal performance)
45
45
  // ================================================================================
46
46
 
47
+ // Woop watertight intersection (Woop/Benthin/Wald 2013). Eliminates edge leakage
48
+ // at shared triangle edges that Möller-Trumbore exhibits under FP32. Per-ray shears
49
+ // are precomputed once via computeWoopRayParams; per-triangle test is FMA-friendly
50
+ // and uses sign-aware depth comparison so it works for any det orientation.
47
51
  const RayTriangleGeometry = wgslFn( `
48
- fn RayTriangleGeometry( rayOrigin: vec3f, rayDir: vec3f, pA: vec3f, pB: vec3f, pC: vec3f, closestHitDst: f32 ) -> vec4f {
52
+ fn RayTriangleGeometry( rayOrigin: vec3f, rayDir: vec3f, pA: vec3f, pB: vec3f, pC: vec3f, closestHitDst: f32, woopParams: vec4f ) -> vec4f {
49
53
 
50
- // Returns vec4(t, u, v, hit) where hit > 0.5 means intersection
54
+ // Returns vec4(t, u, v, hit) where hit > 0.5 means intersection.
55
+ // woopParams: (Sx, Sy, Sz, f32(packed kx|ky<<2|kz<<4)) — numeric conversion, not a bitcast
51
56
  var result = vec4f( 1e20f, 0.0f, 0.0f, 0.0f );
52
57
 
53
- let edge1 = pB - pA;
54
- let edge2 = pC - pA;
55
- let h = cross( rayDir, edge2 );
56
- let a = dot( edge1, h );
58
+ let Sx = woopParams.x;
59
+ let Sy = woopParams.y;
60
+ let Sz = woopParams.z;
61
+ // Packed as regular f32 (values 0–42), not bitcast — avoids subnormal FTZ on Apple GPUs.
62
+ let packed = i32( woopParams.w );
63
+ let kx = packed & 3;
64
+ let ky = ( packed >> 2 ) & 3;
65
+ let kz = ( packed >> 4 ) & 3;
57
66
 
58
- if ( abs( a ) >= 1e-8f ) {
67
+ let A = pA - rayOrigin;
68
+ let B = pB - rayOrigin;
69
+ let C = pC - rayOrigin;
59
70
 
60
- let f = 1.0f / a;
61
- let s = rayOrigin - pA;
62
- let u = f * dot( s, h );
71
+ let Akz = A[ kz ];
72
+ let Bkz = B[ kz ];
73
+ let Ckz = C[ kz ];
63
74
 
64
- if ( u >= 0.0f && u <= 1.0f ) {
75
+ let Ax = A[ kx ] - Sx * Akz;
76
+ let Ay = A[ ky ] - Sy * Akz;
77
+ let Bx = B[ kx ] - Sx * Bkz;
78
+ let By = B[ ky ] - Sy * Bkz;
79
+ let Cx = C[ kx ] - Sx * Ckz;
80
+ let Cy = C[ ky ] - Sy * Ckz;
65
81
 
66
- let q = cross( s, edge1 );
67
- let v = f * dot( rayDir, q );
82
+ // Edge function tests — all three must share sign (or be exactly zero) for hit.
83
+ let U = Cx * By - Cy * Bx;
84
+ let V = Ax * Cy - Ay * Cx;
85
+ let W = Bx * Ay - By * Ax;
68
86
 
69
- if ( v >= 0.0f && ( u + v ) <= 1.0f ) {
87
+ let neg = U < 0.0f || V < 0.0f || W < 0.0f;
88
+ let pos = U > 0.0f || V > 0.0f || W > 0.0f;
89
+ if ( !( neg && pos ) ) {
70
90
 
71
- let t = f * dot( edge2, q );
91
+ let det = U + V + W;
92
+ if ( det != 0.0f ) {
72
93
 
73
- if ( t > 0.0f && t < closestHitDst ) {
94
+ let T = U * ( Sz * Akz ) + V * ( Sz * Bkz ) + W * ( Sz * Ckz );
74
95
 
75
- result = vec4f( t, u, v, 1.0f );
96
+ // Sign-aware bounds check on t (multiply both sides by sign(det) once).
97
+ let detSign = select( -1.0f, 1.0f, det > 0.0f );
98
+ let tSigned = T * detSign;
99
+ let detAbs = abs( det );
76
100
 
77
- }
101
+ if ( tSigned > 0.0f && tSigned < closestHitDst * detAbs ) {
102
+
103
+ // Match Möller-Trumbore convention: u = weight of B, v = weight of C.
104
+ // In Woop's edge functions, U → weight of A, V → weight of B, W → weight of C.
105
+ let invDet = 1.0f / det;
106
+ result = vec4f( T * invDet, V * invDet, W * invDet, 1.0f );
78
107
 
79
108
  }
80
109
 
@@ -87,6 +116,40 @@ const RayTriangleGeometry = wgslFn( `
87
116
  }
88
117
  ` );
89
118
 
119
+ // Compute Woop ray-space transform (Woop 2013, §3.1) — runs once per ray and
120
+ // amortizes across hundreds of triangle tests. Returns Sx/Sy/Sz shears plus the
121
+ // permuted axis indices packed into one integer and stored as a plain f32 (not bitcast) in the .w slot.
122
+ const computeWoopRayParams = wgslFn( `
123
+ fn computeWoopRayParams( rayDir: vec3f ) -> vec4f {
124
+
125
+ let absDir = abs( rayDir );
126
+
127
+ // kz = argmax(|dir|)
128
+ var kz: i32 = 0;
129
+ if ( absDir.y >= absDir.x ) { kz = 1; }
130
+ if ( absDir.z >= absDir[ u32( kz ) ] ) { kz = 2; }
131
+
132
+ var kx: i32 = ( kz + 1 ) % 3;
133
+ var ky: i32 = ( kx + 1 ) % 3;
134
+
135
+ // Preserve triangle winding when the dominant axis component is negative.
136
+ if ( rayDir[ u32( kz ) ] < 0.0f ) {
137
+ let tmp = kx;
138
+ kx = ky;
139
+ ky = tmp;
140
+ }
141
+
142
+ let dz = rayDir[ u32( kz ) ];
143
+ let Sx = rayDir[ u32( kx ) ] / dz;
144
+ let Sy = rayDir[ u32( ky ) ] / dz;
145
+ let Sz = 1.0f / dz;
146
+
147
+ let packed = kx | ( ky << 2 ) | ( kz << 4 );
148
+ return vec4f( Sx, Sy, Sz, f32( packed ) );
149
+
150
+ }
151
+ ` );
152
+
90
153
  const fastRayAABBDst = wgslFn( `
91
154
  fn fastRayAABBDst( rayOrigin: vec3f, invDir: vec3f, boxMin: vec3f, boxMax: vec3f ) -> f32 {
92
155
 
@@ -151,6 +214,9 @@ export const traverseBVH = Fn( ( [
151
214
  const rayOrigin = ray.origin;
152
215
  const rayDirection = ray.direction;
153
216
 
217
+ // Woop watertight intersection: precompute per-ray shears + axis permutation.
218
+ const woopParams = computeWoopRayParams( { rayDir: rayDirection } ).toVar();
219
+
154
220
  const iterCount = int( 0 ).toVar();
155
221
 
156
222
  Loop( stackPtr.greaterThan( int( 0 ) ).and( iterCount.lessThan( int( MAX_BVH_ITERATIONS ) ) ), () => {
@@ -186,7 +252,7 @@ export const traverseBVH = Fn( ( [
186
252
  const pB = getDatafromStorageBuffer( triangleBuffer, triIndex, int( 1 ), int( TRI_STRIDE ) ).xyz;
187
253
  const pC = getDatafromStorageBuffer( triangleBuffer, triIndex, int( 2 ), int( TRI_STRIDE ) ).xyz;
188
254
 
189
- const triResult = RayTriangleGeometry( { rayOrigin, rayDir: rayDirection, pA, pB, pC, closestHitDst: closestHit.dst } );
255
+ const triResult = RayTriangleGeometry( { rayOrigin, rayDir: rayDirection, pA, pB, pC, closestHitDst: closestHit.dst, woopParams } );
190
256
 
191
257
  // RayTriangleGeometry already guarantees t < closestHit.dst when w > 0.5
192
258
  If( triResult.w.greaterThan( 0.5 ), () => {
@@ -205,13 +271,15 @@ export const traverseBVH = Fn( ( [
205
271
  const nC = normalCData.xyz;
206
272
  const side = int( normalCData.w ).toVar();
207
273
 
208
- // Interpolate normal
274
+ // Interpolate normal for the side-culling dot product (kept local,
275
+ // not stored on closestHit — re-derived post-loop from closestTriIdx).
209
276
  const w = float( 1.0 ).sub( u ).sub( v );
210
- const normal = normalize( nA.mul( w ).add( nB.mul( u ) ).add( nC.mul( v ) ) ).toVar();
277
+ const rayDotNormal = rayDirection.dot(
278
+ normalize( nA.mul( w ).add( nB.mul( u ) ).add( nC.mul( v ) ) )
279
+ );
211
280
 
212
281
  // Side culling (inline; per-mesh visibility is at the BLAS-pointer level).
213
282
  // 0=front (reject back-facing), 1=back (reject front-facing), 2=double (pass).
214
- const rayDotNormal = rayDirection.dot( normal );
215
283
  const sidePass = side.equal( int( 2 ) )
216
284
  .or( side.equal( int( 0 ) ).and( rayDotNormal.lessThan( - 0.0001 ) ) )
217
285
  .or( side.equal( int( 1 ) ).and( rayDotNormal.greaterThan( 0.0001 ) ) );
@@ -219,10 +287,9 @@ export const traverseBVH = Fn( ( [
219
287
 
220
288
  closestHit.didHit.assign( true );
221
289
  closestHit.dst.assign( t );
222
- closestHit.normal.assign( normal );
223
290
 
224
- // Defer materialIndex/meshIndex/hitPoint/UV to post-traversal
225
- // (all re-derived from closestTriIdx with a single uvData2 fetch below).
291
+ // Defer normal/materialIndex/meshIndex/hitPoint/UV to post-traversal
292
+ // (all re-derived from closestTriIdx after the loop exits).
226
293
  closestTriIdx.assign( triIndex );
227
294
  closestU.assign( u );
228
295
  closestV.assign( v );
@@ -245,11 +312,9 @@ export const traverseBVH = Fn( ( [
245
312
  // BLAS-pointer leaf (marker -2) — push BLAS root onto stack if mesh is visible
246
313
  // nodeData0: [blasRootNodeIndex, meshIndex, visibility, -2]
247
314
  // Visibility is free-fetched with the leaf — no extra storage read.
248
- const blasRoot = int( nodeData0.x ).toVar();
249
-
250
315
  If( nodeData0.z.greaterThan( 0.5 ).and( stackPtr.lessThan( int( MAX_STACK_DEPTH ) ) ), () => {
251
316
 
252
- stack.element( stackPtr ).assign( blasRoot );
317
+ stack.element( stackPtr ).assign( int( nodeData0.x ) );
253
318
  stackPtr.addAssign( 1 );
254
319
 
255
320
  } );
@@ -275,14 +340,11 @@ export const traverseBVH = Fn( ( [
275
340
 
276
341
  // Improved node ordering with fewer conditionals
277
342
  const aCloser = dstA.lessThan( dstB );
278
- const nearChild = select( aCloser, leftChild, rightChild ).toVar();
279
- const farChild = select( aCloser, rightChild, leftChild ).toVar();
280
- const farDst = select( aCloser, dstB, dstA ).toVar();
281
343
 
282
344
  // Push far child first (processed last)
283
- If( farDst.lessThan( closestHit.dst ).and( stackPtr.lessThan( int( MAX_STACK_DEPTH ) ) ), () => {
345
+ If( select( aCloser, dstB, dstA ).lessThan( closestHit.dst ).and( stackPtr.lessThan( int( MAX_STACK_DEPTH ) ) ), () => {
284
346
 
285
- stack.element( stackPtr ).assign( farChild );
347
+ stack.element( stackPtr ).assign( select( aCloser, rightChild, leftChild ) );
286
348
  stackPtr.addAssign( 1 );
287
349
 
288
350
  } );
@@ -290,7 +352,7 @@ export const traverseBVH = Fn( ( [
290
352
  // Push near child second (processed first)
291
353
  If( stackPtr.lessThan( int( MAX_STACK_DEPTH ) ), () => {
292
354
 
293
- stack.element( stackPtr ).assign( nearChild );
355
+ stack.element( stackPtr ).assign( select( aCloser, leftChild, rightChild ) );
294
356
  stackPtr.addAssign( 1 );
295
357
 
296
358
  } );
@@ -301,12 +363,20 @@ export const traverseBVH = Fn( ( [
301
363
 
302
364
  } );
303
365
 
304
- // Deferred: compute hitPoint, UVs, and fetch matIdx/meshIndex once for the final closest hit
366
+ // Deferred: compute normal, hitPoint, UVs, and fetch matIdx/meshIndex once for the final closest hit
305
367
  If( closestHit.didHit, () => {
306
368
 
307
369
  closestHit.hitPoint.assign( ray.origin.add( ray.direction.mul( closestHit.dst ) ) );
308
370
 
309
371
  const w = float( 1.0 ).sub( closestU ).sub( closestV );
372
+
373
+ // Re-fetch the winning triangle's normals — trading 3 storage reads (once)
374
+ // for ~3 regs freed across every BVH iteration.
375
+ const nA = getDatafromStorageBuffer( triangleBuffer, closestTriIdx, int( 3 ), int( TRI_STRIDE ) ).xyz;
376
+ const nB = getDatafromStorageBuffer( triangleBuffer, closestTriIdx, int( 4 ), int( TRI_STRIDE ) ).xyz;
377
+ const nC = getDatafromStorageBuffer( triangleBuffer, closestTriIdx, int( 5 ), int( TRI_STRIDE ) ).xyz;
378
+ closestHit.normal.assign( normalize( nA.mul( w ).add( nB.mul( closestU ) ).add( nC.mul( closestV ) ) ) );
379
+
310
380
  const uvData1 = getDatafromStorageBuffer( triangleBuffer, closestTriIdx, int( 6 ), int( TRI_STRIDE ) );
311
381
  const uvData2 = getDatafromStorageBuffer( triangleBuffer, closestTriIdx, int( 7 ), int( TRI_STRIDE ) );
312
382
  closestHit.uv.assign(
@@ -356,6 +426,9 @@ export const traverseBVHShadow = Fn( ( [
356
426
  lessThan( abs( ray.direction ), vec3( 1e-8 ) )
357
427
  ).toVar();
358
428
 
429
+ // Woop watertight intersection: precompute per-ray shears + axis permutation.
430
+ const woopParams = computeWoopRayParams( { rayDir: ray.direction } ).toVar();
431
+
359
432
  const sIterCount = int( 0 ).toVar();
360
433
 
361
434
  Loop( stackPtr.greaterThan( int( 0 ) ).and( closestHit.didHit.not() ).and( sIterCount.lessThan( int( MAX_BVH_ITERATIONS ) ) ), () => {
@@ -383,7 +456,7 @@ export const traverseBVHShadow = Fn( ( [
383
456
  const pB = getDatafromStorageBuffer( triangleBuffer, triIndex, int( 1 ), int( TRI_STRIDE ) ).xyz;
384
457
  const pC = getDatafromStorageBuffer( triangleBuffer, triIndex, int( 2 ), int( TRI_STRIDE ) ).xyz;
385
458
 
386
- const triResult = RayTriangleGeometry( { rayOrigin: ray.origin, rayDir: ray.direction, pA, pB, pC, closestHitDst: closestHit.dst } );
459
+ const triResult = RayTriangleGeometry( { rayOrigin: ray.origin, rayDir: ray.direction, pA, pB, pC, closestHitDst: closestHit.dst, woopParams } );
387
460
 
388
461
  If( triResult.w.greaterThan( 0.5 ), () => {
389
462
 
@@ -418,11 +491,9 @@ export const traverseBVHShadow = Fn( ( [
418
491
 
419
492
  // BLAS-pointer leaf (marker -2) — push BLAS root onto stack if mesh is visible
420
493
  // nodeData0: [blasRootNodeIndex, meshIndex, visibility, -2]
421
- const blasRoot = int( nodeData0.x ).toVar();
422
-
423
494
  If( nodeData0.z.greaterThan( 0.5 ).and( stackPtr.lessThan( int( MAX_STACK_DEPTH ) ) ), () => {
424
495
 
425
- stack.element( stackPtr ).assign( blasRoot );
496
+ stack.element( stackPtr ).assign( int( nodeData0.x ) );
426
497
  stackPtr.addAssign( 1 );
427
498
 
428
499
  } );
@@ -449,14 +520,11 @@ export const traverseBVHShadow = Fn( ( [
449
520
  If( minDst.lessThan( closestHit.dst ), () => {
450
521
 
451
522
  const aCloser = dstA.lessThan( dstB );
452
- const nearChild = select( aCloser, leftChild, rightChild ).toVar();
453
- const farChild = select( aCloser, rightChild, leftChild ).toVar();
454
- const farDst = select( aCloser, dstB, dstA ).toVar();
455
523
 
456
524
  // Push far child first (processed last)
457
- If( farDst.lessThan( closestHit.dst ).and( stackPtr.lessThan( int( MAX_STACK_DEPTH ) ) ), () => {
525
+ If( select( aCloser, dstB, dstA ).lessThan( closestHit.dst ).and( stackPtr.lessThan( int( MAX_STACK_DEPTH ) ) ), () => {
458
526
 
459
- stack.element( stackPtr ).assign( farChild );
527
+ stack.element( stackPtr ).assign( select( aCloser, rightChild, leftChild ) );
460
528
  stackPtr.addAssign( 1 );
461
529
 
462
530
  } );
@@ -464,7 +532,7 @@ export const traverseBVHShadow = Fn( ( [
464
532
  // Push near child second (processed first)
465
533
  If( stackPtr.lessThan( int( MAX_STACK_DEPTH ) ), () => {
466
534
 
467
- stack.element( stackPtr ).assign( nearChild );
535
+ stack.element( stackPtr ).assign( select( aCloser, leftChild, rightChild ) );
468
536
  stackPtr.addAssign( 1 );
469
537
 
470
538
  } );
@@ -510,9 +578,6 @@ export const generateRayFromCamera = Fn( ( [
510
578
  // Check if DOF is disabled or conditions make it ineffective
511
579
  If( enableDOF.and( focalLength.greaterThan( 0.0 ) ).and( aperture.lessThan( 64.0 ) ).and( focusDistance.greaterThan( 0.001 ) ), () => {
512
580
 
513
- // Calculate focal point - where rays converge
514
- const focalPoint = rayOriginWorld.add( rayDirectionWorld.mul( focusDistance ) ).toVar();
515
-
516
581
  // Physical aperture calculation
517
582
  const effectiveAperture = focalLength.div( aperture );
518
583
  // Apply scene scale to maintain correct physical aperture size
@@ -534,7 +599,7 @@ export const generateRayFromCamera = Fn( ( [
534
599
 
535
600
  // Calculate new ray from offset origin to focal point
536
601
  resultOrigin.assign( rayOriginWorld.add( offset ) );
537
- resultDirection.assign( normalize( focalPoint.sub( resultOrigin ) ) );
602
+ resultDirection.assign( normalize( rayOriginWorld.add( rayDirectionWorld.mul( focusDistance ) ).sub( resultOrigin ) ) );
538
603
 
539
604
  } );
540
605