nv-sgl 0.6.0-cp313-cp313-win_amd64.whl

Files changed (142)
  1. include/tevclient.h +393 -0
  2. nv_sgl-0.6.0.dist-info/LICENSE +29 -0
  3. nv_sgl-0.6.0.dist-info/METADATA +21 -0
  4. nv_sgl-0.6.0.dist-info/RECORD +142 -0
  5. nv_sgl-0.6.0.dist-info/WHEEL +5 -0
  6. nv_sgl-0.6.0.dist-info/top_level.txt +1 -0
  7. sgl/__init__.py +15 -0
  8. sgl/__init__.pyi +6738 -0
  9. sgl/d3d12/D3D12Core.dll +0 -0
  10. sgl/d3d12/d3d12SDKLayers.dll +0 -0
  11. sgl/dxcompiler.dll +0 -0
  12. sgl/dxil.dll +0 -0
  13. sgl/gfx.dll +0 -0
  14. sgl/include/sgl/app/app.h +113 -0
  15. sgl/include/sgl/core/bitmap.h +302 -0
  16. sgl/include/sgl/core/crypto.h +89 -0
  17. sgl/include/sgl/core/data_type.h +46 -0
  18. sgl/include/sgl/core/dds_file.h +103 -0
  19. sgl/include/sgl/core/enum.h +201 -0
  20. sgl/include/sgl/core/error.h +161 -0
  21. sgl/include/sgl/core/file_stream.h +77 -0
  22. sgl/include/sgl/core/file_system_watcher.h +141 -0
  23. sgl/include/sgl/core/format.h +36 -0
  24. sgl/include/sgl/core/fwd.h +90 -0
  25. sgl/include/sgl/core/hash.h +45 -0
  26. sgl/include/sgl/core/input.h +522 -0
  27. sgl/include/sgl/core/logger.h +214 -0
  28. sgl/include/sgl/core/macros.h +184 -0
  29. sgl/include/sgl/core/maths.h +45 -0
  30. sgl/include/sgl/core/memory_mapped_file.h +112 -0
  31. sgl/include/sgl/core/memory_mapped_file_stream.h +32 -0
  32. sgl/include/sgl/core/memory_stream.h +74 -0
  33. sgl/include/sgl/core/object.h +683 -0
  34. sgl/include/sgl/core/platform.h +239 -0
  35. sgl/include/sgl/core/plugin.h +331 -0
  36. sgl/include/sgl/core/resolver.h +39 -0
  37. sgl/include/sgl/core/short_vector.h +141 -0
  38. sgl/include/sgl/core/static_vector.h +111 -0
  39. sgl/include/sgl/core/stream.h +54 -0
  40. sgl/include/sgl/core/string.h +276 -0
  41. sgl/include/sgl/core/struct.h +360 -0
  42. sgl/include/sgl/core/thread.h +28 -0
  43. sgl/include/sgl/core/timer.h +52 -0
  44. sgl/include/sgl/core/traits.h +15 -0
  45. sgl/include/sgl/core/type_utils.h +19 -0
  46. sgl/include/sgl/core/window.h +177 -0
  47. sgl/include/sgl/device/agility_sdk.h +24 -0
  48. sgl/include/sgl/device/blit.h +88 -0
  49. sgl/include/sgl/device/buffer_cursor.h +162 -0
  50. sgl/include/sgl/device/command.h +539 -0
  51. sgl/include/sgl/device/cuda_api.h +766 -0
  52. sgl/include/sgl/device/cuda_interop.h +39 -0
  53. sgl/include/sgl/device/cuda_utils.h +107 -0
  54. sgl/include/sgl/device/cursor_utils.h +129 -0
  55. sgl/include/sgl/device/device.h +668 -0
  56. sgl/include/sgl/device/device_resource.h +37 -0
  57. sgl/include/sgl/device/fence.h +91 -0
  58. sgl/include/sgl/device/formats.h +330 -0
  59. sgl/include/sgl/device/framebuffer.h +85 -0
  60. sgl/include/sgl/device/fwd.h +164 -0
  61. sgl/include/sgl/device/helpers.h +20 -0
  62. sgl/include/sgl/device/hot_reload.h +75 -0
  63. sgl/include/sgl/device/input_layout.h +74 -0
  64. sgl/include/sgl/device/kernel.h +69 -0
  65. sgl/include/sgl/device/memory_heap.h +155 -0
  66. sgl/include/sgl/device/native_formats.h +342 -0
  67. sgl/include/sgl/device/native_handle.h +73 -0
  68. sgl/include/sgl/device/native_handle_traits.h +65 -0
  69. sgl/include/sgl/device/pipeline.h +138 -0
  70. sgl/include/sgl/device/print.h +45 -0
  71. sgl/include/sgl/device/python/cursor_utils.h +853 -0
  72. sgl/include/sgl/device/query.h +52 -0
  73. sgl/include/sgl/device/raytracing.h +84 -0
  74. sgl/include/sgl/device/reflection.h +1254 -0
  75. sgl/include/sgl/device/resource.h +705 -0
  76. sgl/include/sgl/device/sampler.h +57 -0
  77. sgl/include/sgl/device/shader.h +516 -0
  78. sgl/include/sgl/device/shader_cursor.h +85 -0
  79. sgl/include/sgl/device/shader_object.h +94 -0
  80. sgl/include/sgl/device/shader_offset.h +67 -0
  81. sgl/include/sgl/device/shared_handle.h +12 -0
  82. sgl/include/sgl/device/slang_utils.h +54 -0
  83. sgl/include/sgl/device/swapchain.h +74 -0
  84. sgl/include/sgl/device/types.h +782 -0
  85. sgl/include/sgl/math/colorspace.h +56 -0
  86. sgl/include/sgl/math/constants.h +7 -0
  87. sgl/include/sgl/math/float16.h +146 -0
  88. sgl/include/sgl/math/matrix.h +6 -0
  89. sgl/include/sgl/math/matrix_math.h +746 -0
  90. sgl/include/sgl/math/matrix_types.h +207 -0
  91. sgl/include/sgl/math/python/primitivetype.h +33 -0
  92. sgl/include/sgl/math/quaternion.h +6 -0
  93. sgl/include/sgl/math/quaternion_math.h +484 -0
  94. sgl/include/sgl/math/quaternion_types.h +83 -0
  95. sgl/include/sgl/math/ray.h +47 -0
  96. sgl/include/sgl/math/scalar_math.h +249 -0
  97. sgl/include/sgl/math/scalar_types.h +107 -0
  98. sgl/include/sgl/math/vector.h +6 -0
  99. sgl/include/sgl/math/vector_math.h +1796 -0
  100. sgl/include/sgl/math/vector_types.h +336 -0
  101. sgl/include/sgl/python/nanobind.h +489 -0
  102. sgl/include/sgl/python/py_doc.h +11600 -0
  103. sgl/include/sgl/python/sgl_ext_pch.h +8 -0
  104. sgl/include/sgl/sgl.h +21 -0
  105. sgl/include/sgl/sgl_pch.h +6 -0
  106. sgl/include/sgl/stl/bit.h +377 -0
  107. sgl/include/sgl/tests/testing.h +54 -0
  108. sgl/include/sgl/ui/fwd.h +34 -0
  109. sgl/include/sgl/ui/imgui_config.h +43 -0
  110. sgl/include/sgl/ui/ui.h +71 -0
  111. sgl/include/sgl/ui/widgets.h +918 -0
  112. sgl/include/sgl/utils/python/slangpy.h +366 -0
  113. sgl/include/sgl/utils/renderdoc.h +50 -0
  114. sgl/include/sgl/utils/slangpy.h +153 -0
  115. sgl/include/sgl/utils/tev.h +93 -0
  116. sgl/include/sgl/utils/texture_loader.h +106 -0
  117. sgl/math/__init__.pyi +5083 -0
  118. sgl/platform/__init__.pyi +102 -0
  119. sgl/renderdoc/__init__.pyi +51 -0
  120. sgl/sgl.dll +0 -0
  121. sgl/sgl_ext.cp313-win_amd64.pyd +0 -0
  122. sgl/shaders/nvapi/nvHLSLExtns.h +2315 -0
  123. sgl/shaders/nvapi/nvHLSLExtnsInternal.h +758 -0
  124. sgl/shaders/nvapi/nvShaderExtnEnums.h +142 -0
  125. sgl/shaders/sgl/device/blit.slang +93 -0
  126. sgl/shaders/sgl/device/nvapi.slang +5 -0
  127. sgl/shaders/sgl/device/nvapi.slangh +7 -0
  128. sgl/shaders/sgl/device/print.slang +445 -0
  129. sgl/shaders/sgl/math/constants.slang +4 -0
  130. sgl/shaders/sgl/math/ray.slang +29 -0
  131. sgl/shaders/sgl/ui/imgui.slang +49 -0
  132. sgl/slang-glslang.dll +0 -0
  133. sgl/slang-llvm.dll +0 -0
  134. sgl/slang-rt.dll +0 -0
  135. sgl/slang.dll +0 -0
  136. sgl/slangpy/__init__.pyi +268 -0
  137. sgl/tev/__init__.pyi +108 -0
  138. sgl/tevclient.lib +0 -0
  139. sgl/thread/__init__.pyi +4 -0
  140. sgl/ui/__init__.pyi +1118 -0
  141. share/cmake/tevclient/tevclient-config-release.cmake +19 -0
  142. share/cmake/tevclient/tevclient-config.cmake +108 -0
@@ -0,0 +1,2315 @@
1
+ /*********************************************************************************************************\
2
+ |* *|
3
+ |* SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. *|
4
+ |* SPDX-License-Identifier: MIT *|
5
+ |* *|
6
+ |* Permission is hereby granted, free of charge, to any person obtaining a *|
7
+ |* copy of this software and associated documentation files (the "Software"), *|
8
+ |* to deal in the Software without restriction, including without limitation *|
9
+ |* the rights to use, copy, modify, merge, publish, distribute, sublicense, *|
10
+ |* and/or sell copies of the Software, and to permit persons to whom the *|
11
+ |* Software is furnished to do so, subject to the following conditions: *|
12
+ |* *|
13
+ |* The above copyright notice and this permission notice shall be included in *|
14
+ |* all copies or substantial portions of the Software. *|
15
+ |* *|
16
+ |* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *|
17
+ |* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *|
18
+ |* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *|
19
+ |* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER *|
20
+ |* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING *|
21
+ |* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER *|
22
+ |* DEALINGS IN THE SOFTWARE. *|
23
+ |* *|
24
+ |* *|
25
+ \*********************************************************************************************************/
26
+
27
+ ////////////////////////// NVIDIA SHADER EXTENSIONS /////////////////
28
+
29
+ // This file is to be #included in the app's HLSL shader code to make
+ // use of the NVIDIA shader extensions.
31
+
32
+
33
+ #include "nvHLSLExtnsInternal.h"
34
+
35
+ //----------------------------------------------------------------------------//
36
+ //------------------------- Warp Shuffle Functions ---------------------------//
37
+ //----------------------------------------------------------------------------//
38
+
39
+ // All functions have variants with a width parameter, which permits subdivision
+ // of the warp into segments - for example, to exchange data between 4 groups of
+ // 8 lanes in a SIMD manner. If width is less than warpSize, each subsection of
+ // the warp behaves as a separate entity with a starting logical lane ID of 0.
+ // A thread may only exchange data with others in its own subsection. width must
+ // be a power of 2 so that the warp can be subdivided equally; results are
+ // undefined if width is not a power of 2 or is greater than warpSize.
47
+
48
+ //
49
+ // simple variant of SHFL instruction
50
+ // returns val from the specified lane
51
+ // optional width parameter must be a power of two and width <= 32
52
+ //
53
+ int NvShfl(int val, uint srcLane, int width = NV_WARP_SIZE)
54
+ {
55
+ uint index = g_NvidiaExt.IncrementCounter();
56
+ g_NvidiaExt[index].src0u.x = val; // variable to be shuffled
57
+ g_NvidiaExt[index].src0u.y = srcLane; // source lane
58
+ g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width);
59
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL;
60
+
61
+ // result is returned as the return value of IncrementCounter on fake UAV slot
62
+ return g_NvidiaExt.IncrementCounter();
63
+ }
64
+
65
+ int2 NvShfl(int2 val, uint srcLane, int width = NV_WARP_SIZE)
66
+ {
67
+ int x = NvShfl(val.x, srcLane, width);
68
+ int y = NvShfl(val.y, srcLane, width);
69
+ return int2(x, y);
70
+ }
71
+
72
+ int4 NvShfl(int4 val, uint srcLane, int width = NV_WARP_SIZE)
73
+ {
74
+ int x = NvShfl(val.x, srcLane, width);
75
+ int y = NvShfl(val.y, srcLane, width);
76
+ int z = NvShfl(val.z, srcLane, width);
77
+ int w = NvShfl(val.w, srcLane, width);
78
+ return int4(x, y, z, w);
79
+ }
80
+
81
+ //
82
+ // Copy from a lane with lower ID relative to caller
83
+ //
84
+ int NvShflUp(int val, uint delta, int width = NV_WARP_SIZE)
85
+ {
86
+ uint index = g_NvidiaExt.IncrementCounter();
87
+ g_NvidiaExt[index].src0u.x = val; // variable to be shuffled
88
+ g_NvidiaExt[index].src0u.y = delta; // relative lane offset
89
+ g_NvidiaExt[index].src0u.z = (NV_WARP_SIZE - width) << 8; // minIndex = maxIndex for shfl_up (src2[4:0] is expected to be 0)
90
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_UP;
91
+ return g_NvidiaExt.IncrementCounter();
92
+ }
93
+
94
+ //
95
+ // Copy from a lane with higher ID relative to caller
96
+ //
97
+ int NvShflDown(int val, uint delta, int width = NV_WARP_SIZE)
98
+ {
99
+ uint index = g_NvidiaExt.IncrementCounter();
100
+ g_NvidiaExt[index].src0u.x = val; // variable to be shuffled
101
+ g_NvidiaExt[index].src0u.y = delta; // relative lane offset
102
+ g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width);
103
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_DOWN;
104
+ return g_NvidiaExt.IncrementCounter();
105
+ }
106
+
107
+ //
108
+ // Copy from a lane based on bitwise XOR of own lane ID
109
+ //
110
+ int NvShflXor(int val, uint laneMask, int width = NV_WARP_SIZE)
111
+ {
112
+ uint index = g_NvidiaExt.IncrementCounter();
113
+ g_NvidiaExt[index].src0u.x = val; // variable to be shuffled
114
+ g_NvidiaExt[index].src0u.y = laneMask; // laneMask to be XOR'ed with current laneId to get the source lane id
115
+ g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width);
116
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_XOR;
117
+ return g_NvidiaExt.IncrementCounter();
118
+ }
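As a usage sketch (illustrative, not part of this header), the width parameter lets a butterfly reduction run within an 8-lane subsection; the helper name below is hypothetical:

// Illustrative sketch: sum an int across an 8-lane subsection of the warp
// using the XOR shuffle variant with width = 8.
int SubsectionSum8(int v)
{
    [unroll]
    for (uint mask = 4; mask > 0; mask >>= 1)
        v += NvShflXor(v, mask, 8); // add the partner lane's value at each step
    return v;                       // every lane in the subsection now holds the sum
}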
119
+
120
+
121
+ //----------------------------------------------------------------------------//
122
+ //----------------------------- Warp Vote Functions---------------------------//
123
+ //----------------------------------------------------------------------------//
124
+
125
+ // returns 0xFFFFFFFF if the predicate is true for any thread in the warp, returns 0 otherwise
126
+ uint NvAny(int predicate)
127
+ {
128
+ uint index = g_NvidiaExt.IncrementCounter();
129
+ g_NvidiaExt[index].src0u.x = predicate;
130
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_ANY;
131
+ return g_NvidiaExt.IncrementCounter();
132
+ }
133
+
134
+ // returns 0xFFFFFFFF if the predicate is true for ALL threads in the warp, returns 0 otherwise
135
+ uint NvAll(int predicate)
136
+ {
137
+ uint index = g_NvidiaExt.IncrementCounter();
138
+ g_NvidiaExt[index].src0u.x = predicate;
139
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_ALL;
140
+ return g_NvidiaExt.IncrementCounter();
141
+ }
142
+
143
+ // returns a mask of all threads in the warp with bits set for threads that have predicate true
144
+ uint NvBallot(int predicate)
145
+ {
146
+ uint index = g_NvidiaExt.IncrementCounter();
147
+ g_NvidiaExt[index].src0u.x = predicate;
148
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_BALLOT;
149
+ return g_NvidiaExt.IncrementCounter();
150
+ }
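A brief sketch of how the ballot mask is typically consumed (illustrative only; CountLanesWhere is a hypothetical helper):

// Illustrative sketch: count how many lanes in the warp satisfy a condition,
// using the mask returned by NvBallot and the countbits intrinsic.
uint CountLanesWhere(bool condition)
{
    uint mask = NvBallot(condition ? 1 : 0);
    return countbits(mask);
}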
151
+
152
+
153
+ //----------------------------------------------------------------------------//
154
+ //----------------------------- Utility Functions ----------------------------//
155
+ //----------------------------------------------------------------------------//
156
+
157
+ // returns the lane index of the current thread (thread index in warp)
158
+ int NvGetLaneId()
159
+ {
160
+ uint index = g_NvidiaExt.IncrementCounter();
161
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_GET_LANE_ID;
162
+ return g_NvidiaExt.IncrementCounter();
163
+ }
164
+
165
+ // Returns the value of a special register. Specify the subopcode using one of the NV_SPECIALOP_* values defined in nvShaderExtnEnums.h; other subopcodes produce undefined behavior.
166
+ uint NvGetSpecial(uint subOpCode)
167
+ {
168
+ return __NvGetSpecial(subOpCode);
169
+ }
170
+
171
+ //----------------------------------------------------------------------------//
172
+ //----------------------------- FP16 Atomic Functions ------------------------//
+ //----------------------------------------------------------------------------//
+
+ // The functions below perform atomic operations on two consecutive fp16
+ // values in the given raw UAV.
+ // The uint parameter 'fp16x2Val' is treated as two packed fp16 values;
+ // byteAddress must be a multiple of 4.
+ // The returned value is the two fp16 values packed into a single uint.
179
+
180
+ uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
181
+ {
182
+ return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_ADD);
183
+ }
184
+
185
+ uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
186
+ {
187
+ return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MIN);
188
+ }
189
+
190
+ uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
191
+ {
192
+ return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MAX);
193
+ }
194
+
195
+
196
+ // versions of the above functions taking two fp32 values (internally converted to fp16 values)
197
+ uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
198
+ {
199
+ return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
200
+ }
201
+
202
+ uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
203
+ {
204
+ return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
205
+ }
206
+
207
+ uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
208
+ {
209
+ return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
210
+ }
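For illustration only, a caller could pack two halves with the standard f32tof16 intrinsic and accumulate them into a raw UAV; the buffer binding and helper below are assumptions, not part of this header:

// Illustrative sketch: accumulate a float2 into two consecutive fp16 values in
// a raw UAV. Each slot is 4 bytes, so the byte address is a multiple of 4.
RWByteAddressBuffer g_accumBuffer; // assumed binding, not part of this header

void AccumulateHalf2(uint slot, float2 v)
{
    uint packed = f32tof16(v.x) | (f32tof16(v.y) << 16);
    uint previous = NvInterlockedAddFp16x2(g_accumBuffer, slot * 4, packed);
    // 'previous' holds the prior fp16x2 contents of the slot; the float2
    // overload above performs the same conversion internally.
}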
211
+
212
+
213
+ //----------------------------------------------------------------------------//
214
+
215
+ // The functions below perform atomic operations on an R16G16_FLOAT UAV at the given address.
+ // The uint parameter 'fp16x2Val' is treated as two fp16 values.
+ // The returned value is the two fp16 values (.x and .y components) packed into a single uint.
+ // Warning: behaviour of this set of functions is undefined if the UAV is not
+ // of R16G16_FLOAT format (might result in an app crash or TDR).
220
+
221
+ uint NvInterlockedAddFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
222
+ {
223
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
224
+ }
225
+
226
+ uint NvInterlockedMinFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
227
+ {
228
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
229
+ }
230
+
231
+ uint NvInterlockedMaxFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
232
+ {
233
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
234
+ }
235
+
236
+ uint NvInterlockedAddFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
237
+ {
238
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
239
+ }
240
+
241
+ uint NvInterlockedMinFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
242
+ {
243
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
244
+ }
245
+
246
+ uint NvInterlockedMaxFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
247
+ {
248
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
249
+ }
250
+
251
+ uint NvInterlockedAddFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
252
+ {
253
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
254
+ }
255
+
256
+ uint NvInterlockedMinFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
257
+ {
258
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
259
+ }
260
+
261
+ uint NvInterlockedMaxFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
262
+ {
263
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
264
+ }
265
+
266
+
267
+ // versions taking two fp32 values (internally converted to fp16)
268
+ uint NvInterlockedAddFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
269
+ {
270
+ return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
271
+ }
272
+
273
+ uint NvInterlockedMinFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
274
+ {
275
+ return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
276
+ }
277
+
278
+ uint NvInterlockedMaxFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
279
+ {
280
+ return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
281
+ }
282
+
283
+ uint NvInterlockedAddFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
284
+ {
285
+ return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
286
+ }
287
+
288
+ uint NvInterlockedMinFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
289
+ {
290
+ return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
291
+ }
292
+
293
+ uint NvInterlockedMaxFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
294
+ {
295
+ return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
296
+ }
297
+
298
+ uint NvInterlockedAddFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
299
+ {
300
+ return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
301
+ }
302
+
303
+ uint NvInterlockedMinFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
304
+ {
305
+ return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
306
+ }
307
+
308
+ uint NvInterlockedMaxFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
309
+ {
310
+ return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
311
+ }
312
+
313
+
314
+ //----------------------------------------------------------------------------//
315
+
316
+ // The functions below perform atomic operations on an R16G16B16A16_FLOAT UAV at the given address.
+ // The uint2 parameter 'fp16x2Val' is treated as four fp16 values,
+ // i.e. fp16x2Val.x holds uav.xy and fp16x2Val.y holds uav.zw.
+ // The returned value is the four fp16 values (.xyzw components) packed into a uint2.
+ // Warning: behaviour of this set of functions is undefined if the UAV is not
+ // of R16G16B16A16_FLOAT format (might result in an app crash or TDR).
322
+
323
+ uint2 NvInterlockedAddFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
324
+ {
325
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
326
+ }
327
+
328
+ uint2 NvInterlockedMinFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
329
+ {
330
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
331
+ }
332
+
333
+ uint2 NvInterlockedMaxFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
334
+ {
335
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
336
+ }
337
+
338
+ uint2 NvInterlockedAddFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
339
+ {
340
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
341
+ }
342
+
343
+ uint2 NvInterlockedMinFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
344
+ {
345
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
346
+ }
347
+
348
+ uint2 NvInterlockedMaxFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
349
+ {
350
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
351
+ }
352
+
353
+ uint2 NvInterlockedAddFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
354
+ {
355
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
356
+ }
357
+
358
+ uint2 NvInterlockedMinFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
359
+ {
360
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
361
+ }
362
+
363
+ uint2 NvInterlockedMaxFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
364
+ {
365
+ return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
366
+ }
367
+
368
+ // versions taking four fp32 values (internally converted to fp16)
369
+ uint2 NvInterlockedAddFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
370
+ {
371
+ return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
372
+ }
373
+
374
+ uint2 NvInterlockedMinFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
375
+ {
376
+ return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
377
+ }
378
+
379
+ uint2 NvInterlockedMaxFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
380
+ {
381
+ return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
382
+ }
383
+
384
+ uint2 NvInterlockedAddFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
385
+ {
386
+ return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
387
+ }
388
+
389
+ uint2 NvInterlockedMinFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
390
+ {
391
+ return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
392
+ }
393
+
394
+ uint2 NvInterlockedMaxFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
395
+ {
396
+ return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
397
+ }
398
+
399
+ uint2 NvInterlockedAddFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
400
+ {
401
+ return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
402
+ }
403
+
404
+ uint2 NvInterlockedMinFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
405
+ {
406
+ return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
407
+ }
408
+
409
+ uint2 NvInterlockedMaxFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
410
+ {
411
+ return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
412
+ }
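An illustrative sketch of the float4 overloads in use; the texture binding and helper name are assumptions:

// Illustrative sketch: additively blend a color into an R16G16B16A16_FLOAT
// accumulation texture; the float4 overload converts the fp32 inputs to fp16.
RWTexture2D<float4> g_colorAccum; // assumed R16G16B16A16_FLOAT UAV binding

void AccumulateColor(uint2 pixel, float4 color)
{
    uint2 prevPacked = NvInterlockedAddFp16x4(g_colorAccum, pixel, color);
    // prevPacked.x holds the previous .xy components, prevPacked.y the .zw components.
}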
413
+
414
+
415
+ //----------------------------------------------------------------------------//
416
+ //----------------------------- FP32 Atomic Functions ------------------------//
+ //----------------------------------------------------------------------------//
+
+ // The functions below perform an atomic add on the given UAV, treating the value as float.
+ // byteAddress must be a multiple of 4.
+ // The returned value is the value present at the memory location before the atomic add.
422
+
423
+ float NvInterlockedAddFp32(RWByteAddressBuffer uav, uint byteAddress, float val)
424
+ {
425
+ return __NvAtomicAddFP32(uav, byteAddress, val);
426
+ }
427
+
428
+ //----------------------------------------------------------------------------//
429
+
430
+ // The functions below perform an atomic add on an R32_FLOAT UAV at the given address.
+ // The returned value is the value before the atomic add.
+ // Warning: behaviour of this set of functions is undefined if the UAV is not
+ // of R32_FLOAT format (might result in an app crash or TDR).
434
+
435
+ float NvInterlockedAddFp32(RWTexture1D<float> uav, uint address, float val)
436
+ {
437
+ return __NvAtomicAddFP32(uav, address, val);
438
+ }
439
+
440
+ float NvInterlockedAddFp32(RWTexture2D<float> uav, uint2 address, float val)
441
+ {
442
+ return __NvAtomicAddFP32(uav, address, val);
443
+ }
444
+
445
+ float NvInterlockedAddFp32(RWTexture3D<float> uav, uint3 address, float val)
446
+ {
447
+ return __NvAtomicAddFP32(uav, address, val);
448
+ }
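A minimal sketch of the texture variant in use (the binding and helper name are assumed, not part of this header):

// Illustrative sketch: splat a weighted sample into an R32_FLOAT accumulation
// texture from many threads without read-modify-write races.
RWTexture2D<float> g_weightAccum; // assumed R32_FLOAT UAV binding

void SplatWeight(uint2 pixel, float weight)
{
    float previous = NvInterlockedAddFp32(g_weightAccum, pixel, weight);
    // 'previous' is the value stored at 'pixel' before this add.
}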
449
+
450
+
451
+ //----------------------------------------------------------------------------//
452
+ //--------------------------- UINT64 Atomic Functions ------------------------//
+ //----------------------------------------------------------------------------//
+
+ // The functions below perform atomic operations on the given UAV, treating the value as a uint64.
+ // byteAddress must be a multiple of 8.
+ // The returned value is the value present at the memory location before the atomic operation.
+ // The uint2 vector type is used to represent a single uint64 value, with the x component
+ // containing the low 32 bits and the y component the high 32 bits.
459
+
460
+ uint2 NvInterlockedAddUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
461
+ {
462
+ return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_ADD);
463
+ }
464
+
465
+ uint2 NvInterlockedMaxUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
466
+ {
467
+ return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_MAX);
468
+ }
469
+
470
+ uint2 NvInterlockedMinUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
471
+ {
472
+ return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_MIN);
473
+ }
474
+
475
+ uint2 NvInterlockedAndUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
476
+ {
477
+ return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_AND);
478
+ }
479
+
480
+ uint2 NvInterlockedOrUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
481
+ {
482
+ return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_OR);
483
+ }
484
+
485
+ uint2 NvInterlockedXorUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
486
+ {
487
+ return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_XOR);
488
+ }
489
+
490
+ uint2 NvInterlockedCompareExchangeUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 compare_value, uint2 value)
491
+ {
492
+ return __NvAtomicCompareExchangeUINT64(uav, byteAddress, compare_value, value);
493
+ }
494
+
495
+ uint2 NvInterlockedExchangeUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
496
+ {
497
+ return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_SWAP);
498
+ }
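For illustration, a 64-bit counter can use the low/high uint2 encoding described above; the buffer binding and helper below are assumptions:

// Illustrative sketch: fetch-and-increment a 64-bit counter stored at byte
// offset 0 of a raw UAV; uint2(1u, 0u) encodes the 64-bit value 1 (low, high).
RWByteAddressBuffer g_counterBuffer; // assumed binding, not part of this header

uint2 NextTicket64()
{
    // The return value is the previous 64-bit counter as uint2(low, high).
    return NvInterlockedAddUint64(g_counterBuffer, 0, uint2(1u, 0u));
}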
499
+
500
+ //----------------------------------------------------------------------------//
501
+
502
+ // The functions below perform atomic operations on an R32G32_UINT UAV at the given address, treating the value as a uint64.
+ // The returned value is the value before the atomic operation.
+ // The uint2 vector type is used to represent a single uint64 value, with the x component containing the low 32 bits and the y component the high 32 bits.
+ // Warning: behaviour of this set of functions is undefined if the UAV is not of R32G32_UINT format (might result in an app crash or TDR).
506
+
507
+ uint2 NvInterlockedAddUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
508
+ {
509
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_ADD);
510
+ }
511
+
512
+ uint2 NvInterlockedMaxUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
513
+ {
514
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MAX);
515
+ }
516
+
517
+ uint2 NvInterlockedMinUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
518
+ {
519
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MIN);
520
+ }
521
+
522
+ uint2 NvInterlockedAndUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
523
+ {
524
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_AND);
525
+ }
526
+
527
+ uint2 NvInterlockedOrUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
528
+ {
529
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_OR);
530
+ }
531
+
532
+ uint2 NvInterlockedXorUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
533
+ {
534
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_XOR);
535
+ }
536
+
537
+ uint2 NvInterlockedCompareExchangeUint64(RWTexture1D<uint2> uav, uint address, uint2 compare_value, uint2 value)
538
+ {
539
+ return __NvAtomicCompareExchangeUINT64(uav, address, compare_value, value);
540
+ }
541
+
542
+ uint2 NvInterlockedExchangeUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
543
+ {
544
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_SWAP);
545
+ }
546
+
547
+ uint2 NvInterlockedAddUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
548
+ {
549
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_ADD);
550
+ }
551
+
552
+ uint2 NvInterlockedMaxUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
553
+ {
554
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MAX);
555
+ }
556
+
557
+ uint2 NvInterlockedMinUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
558
+ {
559
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MIN);
560
+ }
561
+
562
+ uint2 NvInterlockedAndUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
563
+ {
564
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_AND);
565
+ }
566
+
567
+ uint2 NvInterlockedOrUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
568
+ {
569
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_OR);
570
+ }
571
+
572
+ uint2 NvInterlockedXorUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
573
+ {
574
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_XOR);
575
+ }
576
+
577
+ uint2 NvInterlockedCompareExchangeUint64(RWTexture2D<uint2> uav, uint2 address, uint2 compare_value, uint2 value)
578
+ {
579
+ return __NvAtomicCompareExchangeUINT64(uav, address, compare_value, value);
580
+ }
581
+
582
+ uint2 NvInterlockedExchangeUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
583
+ {
584
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_SWAP);
585
+ }
586
+
587
+ uint2 NvInterlockedAddUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
588
+ {
589
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_ADD);
590
+ }
591
+
592
+ uint2 NvInterlockedMaxUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
593
+ {
594
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MAX);
595
+ }
596
+
597
+ uint2 NvInterlockedMinUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
598
+ {
599
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MIN);
600
+ }
601
+
602
+ uint2 NvInterlockedAndUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
603
+ {
604
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_AND);
605
+ }
606
+
607
+ uint2 NvInterlockedOrUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
608
+ {
609
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_OR);
610
+ }
611
+
612
+ uint2 NvInterlockedXorUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
613
+ {
614
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_XOR);
615
+ }
616
+
617
+ uint2 NvInterlockedCompareExchangeUint64(RWTexture3D<uint2> uav, uint3 address, uint2 compare_value, uint2 value)
618
+ {
619
+ return __NvAtomicCompareExchangeUINT64(uav, address, compare_value, value);
620
+ }
621
+
622
+ uint2 NvInterlockedExchangeUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
623
+ {
624
+ return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_SWAP);
625
+ }
626
+
627
+ //----------------------------------------------------------------------------//
628
+ //--------------------------- VPRS functions ---------------------------------//
629
+ //----------------------------------------------------------------------------//
630
+
631
+ // Returns the shading rate and the number of per-pixel shading passes for the current VPRS pixel
632
+ uint3 NvGetShadingRate()
633
+ {
634
+ uint3 shadingRate = (uint3)0;
635
+ uint index = g_NvidiaExt.IncrementCounter();
636
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_GET_SHADING_RATE;
637
+ g_NvidiaExt[index].numOutputsForIncCounter = 3;
638
+ shadingRate.x = g_NvidiaExt.IncrementCounter();
639
+ shadingRate.y = g_NvidiaExt.IncrementCounter();
640
+ shadingRate.z = g_NvidiaExt.IncrementCounter();
641
+ return shadingRate;
642
+ }
643
+
644
+ float NvEvaluateAttributeAtSampleForVPRS(float attrib, uint sampleIndex, int2 pixelOffset)
645
+ {
646
+ float value = (float)0;
647
+ uint ext = g_NvidiaExt.IncrementCounter();
648
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
649
+ g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
650
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
651
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
652
+ g_NvidiaExt[ext].numOutputsForIncCounter = 1;
653
+ value.x = asfloat(g_NvidiaExt.IncrementCounter());
654
+ return value;
655
+ }
656
+
657
+ float2 NvEvaluateAttributeAtSampleForVPRS(float2 attrib, uint sampleIndex, int2 pixelOffset)
658
+ {
659
+ float2 value = (float2)0;
660
+ uint ext = g_NvidiaExt.IncrementCounter();
661
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
662
+ g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
663
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
664
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
665
+ g_NvidiaExt[ext].numOutputsForIncCounter = 2;
666
+ value.x = asfloat(g_NvidiaExt.IncrementCounter());
667
+ value.y = asfloat(g_NvidiaExt.IncrementCounter());
668
+ return value;
669
+ }
670
+
671
+ float3 NvEvaluateAttributeAtSampleForVPRS(float3 attrib, uint sampleIndex, int2 pixelOffset)
672
+ {
673
+ float3 value = (float3)0;
674
+ uint ext = g_NvidiaExt.IncrementCounter();
675
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
676
+ g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
677
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
678
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
679
+ g_NvidiaExt[ext].numOutputsForIncCounter = 3;
680
+ value.x = asfloat(g_NvidiaExt.IncrementCounter());
681
+ value.y = asfloat(g_NvidiaExt.IncrementCounter());
682
+ value.z = asfloat(g_NvidiaExt.IncrementCounter());
683
+ return value;
684
+ }
685
+
686
+ float4 NvEvaluateAttributeAtSampleForVPRS(float4 attrib, uint sampleIndex, int2 pixelOffset)
687
+ {
688
+ float4 value = (float4)0;
689
+ uint ext = g_NvidiaExt.IncrementCounter();
690
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
691
+ g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
692
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
693
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
694
+ g_NvidiaExt[ext].numOutputsForIncCounter = 4;
695
+ value.x = asfloat(g_NvidiaExt.IncrementCounter());
696
+ value.y = asfloat(g_NvidiaExt.IncrementCounter());
697
+ value.z = asfloat(g_NvidiaExt.IncrementCounter());
698
+ value.w = asfloat(g_NvidiaExt.IncrementCounter());
699
+ return value;
700
+ }
701
+
702
+ int NvEvaluateAttributeAtSampleForVPRS(int attrib, uint sampleIndex, int2 pixelOffset)
703
+ {
704
+ int value = (int)0;
705
+ uint ext = g_NvidiaExt.IncrementCounter();
706
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
707
+ g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
708
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
709
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
710
+ g_NvidiaExt[ext].numOutputsForIncCounter = 1;
711
+ value.x = asint(g_NvidiaExt.IncrementCounter());
712
+ return value;
713
+ }
714
+
715
+ int2 NvEvaluateAttributeAtSampleForVPRS(int2 attrib, uint sampleIndex, int2 pixelOffset)
716
+ {
717
+ int2 value = (int2)0;
718
+ uint ext = g_NvidiaExt.IncrementCounter();
719
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
720
+ g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
721
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
722
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
723
+ g_NvidiaExt[ext].numOutputsForIncCounter = 2;
724
+ value.x = asint(g_NvidiaExt.IncrementCounter());
725
+ value.y = asint(g_NvidiaExt.IncrementCounter());
726
+ return value;
727
+ }
728
+
729
+ int3 NvEvaluateAttributeAtSampleForVPRS(int3 attrib, uint sampleIndex, int2 pixelOffset)
730
+ {
731
+ int3 value = (int3)0;
732
+ uint ext = g_NvidiaExt.IncrementCounter();
733
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
734
+ g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
735
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
736
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
737
+ g_NvidiaExt[ext].numOutputsForIncCounter = 3;
738
+ value.x = asint(g_NvidiaExt.IncrementCounter());
739
+ value.y = asint(g_NvidiaExt.IncrementCounter());
740
+ value.z = asint(g_NvidiaExt.IncrementCounter());
741
+ return value;
742
+ }
743
+
744
+ int4 NvEvaluateAttributeAtSampleForVPRS(int4 attrib, uint sampleIndex, int2 pixelOffset)
745
+ {
746
+ int4 value = (int4)0;
747
+ uint ext = g_NvidiaExt.IncrementCounter();
748
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
749
+ g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
750
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
751
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
752
+ g_NvidiaExt[ext].numOutputsForIncCounter = 4;
753
+ value.x = asint(g_NvidiaExt.IncrementCounter());
754
+ value.y = asint(g_NvidiaExt.IncrementCounter());
755
+ value.z = asint(g_NvidiaExt.IncrementCounter());
756
+ value.w = asint(g_NvidiaExt.IncrementCounter());
757
+ return value;
758
+ }
759
+
760
+ uint NvEvaluateAttributeAtSampleForVPRS(uint attrib, uint sampleIndex, int2 pixelOffset)
761
+ {
762
+ uint value = (uint)0;
763
+ uint ext = g_NvidiaExt.IncrementCounter();
764
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
765
+ g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
766
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
767
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
768
+ g_NvidiaExt[ext].numOutputsForIncCounter = 1;
769
+ value.x = asuint(g_NvidiaExt.IncrementCounter());
770
+ return value;
771
+ }
772
+
773
+ uint2 NvEvaluateAttributeAtSampleForVPRS(uint2 attrib, uint sampleIndex, int2 pixelOffset)
774
+ {
775
+ uint2 value = (uint2)0;
776
+ uint ext = g_NvidiaExt.IncrementCounter();
777
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
778
+ g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
779
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
780
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
781
+ g_NvidiaExt[ext].numOutputsForIncCounter = 2;
782
+ value.x = asuint(g_NvidiaExt.IncrementCounter());
783
+ value.y = asuint(g_NvidiaExt.IncrementCounter());
784
+ return value;
785
+ }
786
+
787
+ uint3 NvEvaluateAttributeAtSampleForVPRS(uint3 attrib, uint sampleIndex, int2 pixelOffset)
788
+ {
789
+ uint3 value = (uint3)0;
790
+ uint ext = g_NvidiaExt.IncrementCounter();
791
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
792
+ g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
793
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
794
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
795
+ g_NvidiaExt[ext].numOutputsForIncCounter = 3;
796
+ value.x = asuint(g_NvidiaExt.IncrementCounter());
797
+ value.y = asuint(g_NvidiaExt.IncrementCounter());
798
+ value.z = asuint(g_NvidiaExt.IncrementCounter());
799
+ return value;
800
+ }
801
+
802
+ uint4 NvEvaluateAttributeAtSampleForVPRS(uint4 attrib, uint sampleIndex, int2 pixelOffset)
803
+ {
804
+ uint4 value = (uint4)0;
805
+ uint ext = g_NvidiaExt.IncrementCounter();
806
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
807
+ g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
808
+ g_NvidiaExt[ext].src1u.x = sampleIndex;
809
+ g_NvidiaExt[ext].src2u.xy = pixelOffset;
810
+ g_NvidiaExt[ext].numOutputsForIncCounter = 4;
811
+ value.x = asuint(g_NvidiaExt.IncrementCounter());
812
+ value.y = asuint(g_NvidiaExt.IncrementCounter());
813
+ value.z = asuint(g_NvidiaExt.IncrementCounter());
814
+ value.w = asuint(g_NvidiaExt.IncrementCounter());
815
+ return value;
816
+ }
817
+
818
+
819
+ float NvEvaluateAttributeSnappedForVPRS(float attrib, uint2 offset)
820
+ {
821
+ float value = (float)0;
822
+ uint ext = g_NvidiaExt.IncrementCounter();
823
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
824
+ g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
825
+ g_NvidiaExt[ext].src1u.xy = offset;
826
+ g_NvidiaExt[ext].numOutputsForIncCounter = 1;
827
+ value.x = asfloat(g_NvidiaExt.IncrementCounter());
828
+ return value;
829
+ }
830
+
831
+ float2 NvEvaluateAttributeSnappedForVPRS(float2 attrib, uint2 offset)
832
+ {
833
+ float2 value = (float2)0;
834
+ uint ext = g_NvidiaExt.IncrementCounter();
835
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
836
+ g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
837
+ g_NvidiaExt[ext].src1u.xy = offset;
838
+ g_NvidiaExt[ext].numOutputsForIncCounter = 2;
839
+ value.x = asfloat(g_NvidiaExt.IncrementCounter());
840
+ value.y = asfloat(g_NvidiaExt.IncrementCounter());
841
+ return value;
842
+ }
843
+
844
+ float3 NvEvaluateAttributeSnappedForVPRS(float3 attrib, uint2 offset)
845
+ {
846
+ float3 value = (float3)0;
847
+ uint ext = g_NvidiaExt.IncrementCounter();
848
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
849
+ g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
850
+ g_NvidiaExt[ext].src1u.xy = offset;
851
+ g_NvidiaExt[ext].numOutputsForIncCounter = 3;
852
+ value.x = asfloat(g_NvidiaExt.IncrementCounter());
853
+ value.y = asfloat(g_NvidiaExt.IncrementCounter());
854
+ value.z = asfloat(g_NvidiaExt.IncrementCounter());
855
+ return value;
856
+ }
857
+
858
+ float4 NvEvaluateAttributeSnappedForVPRS(float4 attrib, uint2 offset)
859
+ {
860
+ float4 value = (float4)0;
861
+ uint ext = g_NvidiaExt.IncrementCounter();
862
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
863
+ g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
864
+ g_NvidiaExt[ext].src1u.xy = offset;
865
+ g_NvidiaExt[ext].numOutputsForIncCounter = 4;
866
+ value.x = asfloat(g_NvidiaExt.IncrementCounter());
867
+ value.y = asfloat(g_NvidiaExt.IncrementCounter());
868
+ value.z = asfloat(g_NvidiaExt.IncrementCounter());
869
+ value.w = asfloat(g_NvidiaExt.IncrementCounter());
870
+ return value;
871
+ }
872
+
873
+ int NvEvaluateAttributeSnappedForVPRS(int attrib, uint2 offset)
874
+ {
875
+ int value = (int)0;
876
+ uint ext = g_NvidiaExt.IncrementCounter();
877
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
878
+ g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
879
+ g_NvidiaExt[ext].src1u.xy = offset;
880
+ g_NvidiaExt[ext].numOutputsForIncCounter = 1;
881
+ value.x = asint(g_NvidiaExt.IncrementCounter());
882
+ return value;
883
+ }
884
+
885
+ int2 NvEvaluateAttributeSnappedForVPRS(int2 attrib, uint2 offset)
886
+ {
887
+ int2 value = (int2)0;
888
+ uint ext = g_NvidiaExt.IncrementCounter();
889
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
890
+ g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
891
+ g_NvidiaExt[ext].src1u.xy = offset;
892
+ g_NvidiaExt[ext].numOutputsForIncCounter = 2;
893
+ value.x = asint(g_NvidiaExt.IncrementCounter());
894
+ value.y = asint(g_NvidiaExt.IncrementCounter());
895
+ return value;
896
+ }
897
+
898
+ int3 NvEvaluateAttributeSnappedForVPRS(int3 attrib, uint2 offset)
899
+ {
900
+ int3 value = (int3)0;
901
+ uint ext = g_NvidiaExt.IncrementCounter();
902
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
903
+ g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
904
+ g_NvidiaExt[ext].src1u.xy = offset;
905
+ g_NvidiaExt[ext].numOutputsForIncCounter = 3;
906
+ value.x = asint(g_NvidiaExt.IncrementCounter());
907
+ value.y = asint(g_NvidiaExt.IncrementCounter());
908
+ value.z = asint(g_NvidiaExt.IncrementCounter());
909
+ return value;
910
+ }
911
+
912
+ int4 NvEvaluateAttributeSnappedForVPRS(int4 attrib, uint2 offset)
913
+ {
914
+ int4 value = (int4)0;
915
+ uint ext = g_NvidiaExt.IncrementCounter();
916
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
917
+ g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
918
+ g_NvidiaExt[ext].src1u.xy = offset;
919
+ g_NvidiaExt[ext].numOutputsForIncCounter = 4;
920
+ value.x = asint(g_NvidiaExt.IncrementCounter());
921
+ value.y = asint(g_NvidiaExt.IncrementCounter());
922
+ value.z = asint(g_NvidiaExt.IncrementCounter());
923
+ value.w = asint(g_NvidiaExt.IncrementCounter());
924
+ return value;
925
+ }
926
+
927
+ uint NvEvaluateAttributeSnappedForVPRS(uint attrib, uint2 offset)
928
+ {
929
+ uint value = (uint)0;
930
+ uint ext = g_NvidiaExt.IncrementCounter();
931
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
932
+ g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
933
+ g_NvidiaExt[ext].src1u.xy = offset;
934
+ g_NvidiaExt[ext].numOutputsForIncCounter = 1;
935
+ value.x = asuint(g_NvidiaExt.IncrementCounter());
936
+ return value;
937
+ }
938
+
939
+ uint2 NvEvaluateAttributeSnappedForVPRS(uint2 attrib, uint2 offset)
940
+ {
941
+ uint2 value = (uint2)0;
942
+ uint ext = g_NvidiaExt.IncrementCounter();
943
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
944
+ g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
945
+ g_NvidiaExt[ext].src1u.xy = offset;
946
+ g_NvidiaExt[ext].numOutputsForIncCounter = 2;
947
+ value.x = asuint(g_NvidiaExt.IncrementCounter());
948
+ value.y = asuint(g_NvidiaExt.IncrementCounter());
949
+ return value;
950
+ }
951
+
952
+ uint3 NvEvaluateAttributeSnappedForVPRS(uint3 attrib, uint2 offset)
953
+ {
954
+ uint3 value = (uint3)0;
955
+ uint ext = g_NvidiaExt.IncrementCounter();
956
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
957
+ g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
958
+ g_NvidiaExt[ext].src1u.xy = offset;
959
+ g_NvidiaExt[ext].numOutputsForIncCounter = 3;
960
+ value.x = asuint(g_NvidiaExt.IncrementCounter());
961
+ value.y = asuint(g_NvidiaExt.IncrementCounter());
962
+ value.z = asuint(g_NvidiaExt.IncrementCounter());
963
+ return value;
964
+ }
965
+
966
+ uint4 NvEvaluateAttributeSnappedForVPRS(uint4 attrib, uint2 offset)
967
+ {
968
+ uint4 value = (uint4)0;
969
+ uint ext = g_NvidiaExt.IncrementCounter();
970
+ g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
971
+ g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
972
+ g_NvidiaExt[ext].src1u.xy = offset;
973
+ g_NvidiaExt[ext].numOutputsForIncCounter = 4;
974
+ value.x = asuint(g_NvidiaExt.IncrementCounter());
975
+ value.y = asuint(g_NvidiaExt.IncrementCounter());
976
+ value.z = asuint(g_NvidiaExt.IncrementCounter());
977
+ value.w = asuint(g_NvidiaExt.IncrementCounter());
978
+ return value;
979
+ }
980
+
981
+ // MATCH instruction variants
982
+ uint NvWaveMatch(uint value)
983
+ {
984
+ uint index = g_NvidiaExt.IncrementCounter();
985
+ g_NvidiaExt[index].src0u.x = value;
986
+ g_NvidiaExt[index].src1u.x = 1;
987
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
988
+ // result is returned as the return value of IncrementCounter on fake UAV slot
989
+ return g_NvidiaExt.IncrementCounter();
990
+ }
991
+
992
+ uint NvWaveMatch(uint2 value)
993
+ {
994
+ uint index = g_NvidiaExt.IncrementCounter();
995
+ g_NvidiaExt[index].src0u.xy = value.xy;
996
+ g_NvidiaExt[index].src1u.x = 2;
997
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
998
+ // result is returned as the return value of IncrementCounter on fake UAV slot
999
+ return g_NvidiaExt.IncrementCounter();
1000
+ }
1001
+
1002
+ uint NvWaveMatch(uint4 value)
1003
+ {
1004
+ uint index = g_NvidiaExt.IncrementCounter();
1005
+ g_NvidiaExt[index].src0u = value;
1006
+ g_NvidiaExt[index].src1u.x = 4;
1007
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
1008
+ // result is returned as the return value of IncrementCounter on fake UAV slot
1009
+ return g_NvidiaExt.IncrementCounter();
1010
+ }
1011
+
1012
+ uint NvWaveMatch(float value)
1013
+ {
1014
+ uint index = g_NvidiaExt.IncrementCounter();
1015
+ g_NvidiaExt[index].src0u.x = asuint(value);
1016
+ g_NvidiaExt[index].src1u.x = 1;
1017
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
1018
+ // result is returned as the return value of IncrementCounter on fake UAV slot
1019
+ return g_NvidiaExt.IncrementCounter();
1020
+ }
1021
+
1022
+ uint NvWaveMatch(float2 value)
1023
+ {
1024
+ uint index = g_NvidiaExt.IncrementCounter();
1025
+ g_NvidiaExt[index].src0u.xy = asuint(value);
1026
+ g_NvidiaExt[index].src1u.x = 2;
1027
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
1028
+ // result is returned as the return value of IncrementCounter on fake UAV slot
1029
+ return g_NvidiaExt.IncrementCounter();
1030
+ }
1031
+
1032
+ uint NvWaveMatch(float4 value)
1033
+ {
1034
+ uint index = g_NvidiaExt.IncrementCounter();
1035
+ g_NvidiaExt[index].src0u = asuint(value);
1036
+ g_NvidiaExt[index].src1u.x = 4;
1037
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
1038
+ // result is returned as the return value of IncrementCounter on fake UAV slot
1039
+ return g_NvidiaExt.IncrementCounter();
1040
+ }
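A hedged sketch of how the match mask is commonly used to partition a wave by key (the helper name is hypothetical):

// Illustrative sketch: partition the wave by a key and elect one lane per
// unique key, using the peer mask returned by NvWaveMatch.
void PerKeyWork(uint key)
{
    uint peers     = NvWaveMatch(key);   // mask of lanes whose key matches ours
    uint firstLane = firstbitlow(peers); // lowest lane ID in this partition
    if ((uint)NvGetLaneId() == firstLane)
    {
        // ...work executed once per unique key present in the wave...
    }
}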
1041
+
1042
+
1043
+ //----------------------------------------------------------------------------//
1044
+ //------------------------------ Footprint functions -------------------------//
1045
+ //----------------------------------------------------------------------------//
1046
+ // texSpace and smpSpace must be immediates; texIndex and smpIndex can be variable.
+ // offset must be an immediate.
+ // The required components of the location and offset fields can be filled depending on the dimension/type of the texture.
+ // texType should be one of 2D or 3D as defined in nvShaderExtnEnums.h and should be an immediate literal.
+ // If the above restrictions are not met, the behaviour of this instruction is undefined.
1051
+
1052
+ uint4 NvFootprintFine(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, int3 offset = int3(0, 0, 0))
1053
+ {
1054
+ return __NvFootprint(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, offset);
1055
+ }
1056
+
1057
+ uint4 NvFootprintCoarse(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, int3 offset = int3(0, 0, 0))
1058
+ {
1059
+ return __NvFootprint(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, offset);
1060
+ }
1061
+
1062
+
1063
+
1064
+ uint4 NvFootprintFineBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float bias, int3 offset = int3(0, 0, 0))
1065
+ {
1066
+ return __NvFootprintBias(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, bias, offset);
1067
+ }
1068
+
1069
+ uint4 NvFootprintCoarseBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float bias, int3 offset = int3(0, 0, 0))
1070
+ {
1071
+ return __NvFootprintBias(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, bias, offset);
1072
+ }
1073
+
1074
+
1075
+
1076
+ uint4 NvFootprintFineLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float lodLevel, int3 offset = int3(0, 0, 0))
1077
+ {
1078
+ return __NvFootprintLevel(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, lodLevel, offset);
1079
+ }
1080
+
1081
+ uint4 NvFootprintCoarseLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float lodLevel, int3 offset = int3(0, 0, 0))
1082
+ {
1083
+ return __NvFootprintLevel(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, lodLevel, offset);
1084
+ }
1085
+
1086
+
1087
+
1088
+ uint4 NvFootprintFineGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float3 ddx, float3 ddy, int3 offset = int3(0, 0, 0))
1089
+ {
1090
+ return __NvFootprintGrad(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, ddx, ddy, offset);
1091
+ }
1092
+
1093
+ uint4 NvFootprintCoarseGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float3 ddx, float3 ddy, int3 offset = int3(0, 0, 0))
1094
+ {
1095
+ return __NvFootprintGrad(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, ddx, ddy, offset);
1096
+ }
1097
+
1098
+ uint NvFootprintExtractLOD(uint4 blob)
1099
+ {
1100
+ return ((blob.w & 0xF000) >> 12);
1101
+ }
1102
+
1103
+ uint NvFootprintExtractReturnGran(uint4 blob)
1104
+ {
1105
+ return ((blob.z & 0xF000000) >> 24);
1106
+ }
1107
+
1108
+ uint2 NvFootprintExtractAnchorTileLoc2D(uint4 blob)
1109
+ {
1110
+ uint2 loc;
1111
+ loc.x = (blob.w & 0xFFF);
1112
+ loc.y = (blob.z & 0xFFF);
1113
+ return loc;
1114
+ }
1115
+
1116
+ uint3 NvFootprintExtractAnchorTileLoc3D(uint4 blob)
1117
+ {
1118
+ uint3 loc;
1119
+ loc.x = (blob.w & 0xFFF);
1120
+ loc.y = ((blob.w & 0xFFF0000) >> 16);
1121
+ loc.z = (blob.z & 0x1FFF);
1122
+ return loc;
1123
+ }
1124
+
1125
+ uint2 NvFootprintExtractOffset2D(uint4 blob)
1126
+ {
1127
+ uint2 loc;
1128
+ loc.x = ((blob.z & 0x070000) >> 16);
1129
+ loc.y = ((blob.z & 0x380000) >> 19);
1130
+ return loc;
1131
+ }
1132
+
1133
+ uint3 NvFootprintExtractOffset3D(uint4 blob)
1134
+ {
1135
+ uint3 loc;
1136
+ loc.x = ((blob.z & 0x030000) >> 16);
1137
+ loc.y = ((blob.z & 0x0C0000) >> 18);
1138
+ loc.z = ((blob.z & 0x300000) >> 20);
1139
+ return loc;
1140
+ }
1141
+
1142
+ uint2 NvFootprintExtractBitmask(uint4 blob)
1143
+ {
1144
+ return blob.xy;
1145
+ }
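A hedged sketch tying the footprint query and the decode helpers together; every immediate argument below (spaces, indices, texType, granularity) is a placeholder rather than a value defined by this header:

// Illustrative sketch: request a fine-grained footprint for a 2D texture and
// decode the returned blob with the helpers above.
uint2 FootprintAnchor2D(float2 uv)
{
    uint4 blob = NvFootprintFine(/*texSpace*/ 0, /*texIndex*/ 0,
                                 /*smpSpace*/ 0, /*smpIndex*/ 0,
                                 /*texType (2D)*/ 2,
                                 float3(uv, 0.0f), /*gran*/ 0);
    uint  lod     = NvFootprintExtractLOD(blob);     // mip level of the footprint
    uint2 bitmask = NvFootprintExtractBitmask(blob); // coverage mask within the tile
    return NvFootprintExtractAnchorTileLoc2D(blob);  // tile-space anchor location
}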
1146
+
1147
+
1148
+ // Variants of the footprint extensions which return isSingleLod (out parameter).
+ // isSingleLod = true -> this footprint request touched texels from only a single LOD.
1150
+ uint4 NvFootprintFine(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1151
+ {
1152
+ uint4 res = __NvFootprint(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, offset);
1153
+ isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1154
+ return res;
1155
+ }
1156
+
1157
+ uint4 NvFootprintCoarse(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1158
+ {
1159
+ uint4 res = __NvFootprint(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, offset);
1160
+ isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1161
+ return res;
1162
+ }
1163
+
1164
+
1165
+
1166
+ uint4 NvFootprintFineBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float bias, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1167
+ {
1168
+ uint4 res = __NvFootprintBias(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, bias, offset);
1169
+ isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1170
+ return res;
1171
+ }
1172
+
1173
+ uint4 NvFootprintCoarseBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float bias, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1174
+ {
1175
+ uint4 res = __NvFootprintBias(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, bias, offset);
1176
+ isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1177
+ return res;
1178
+ }
1179
+
1180
+
1181
+
1182
+ uint4 NvFootprintFineLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float lodLevel, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1183
+ {
1184
+ uint4 res = __NvFootprintLevel(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, lodLevel, offset);
1185
+ isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1186
+ return res;
1187
+ }
1188
+
1189
+ uint4 NvFootprintCoarseLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float lodLevel, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1190
+ {
1191
+ uint4 res = __NvFootprintLevel(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, lodLevel, offset);
1192
+ isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1193
+ return res;
1194
+ }
1195
+
1196
+
1197
+
1198
+ uint4 NvFootprintFineGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float3 ddx, float3 ddy, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1199
+ {
1200
+ uint4 res = __NvFootprintGrad(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, ddx, ddy, offset);
1201
+ isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1202
+ return res;
1203
+ }
1204
+
1205
+ uint4 NvFootprintCoarseGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float3 ddx, float3 ddy, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1206
+ {
1207
+ uint4 res = __NvFootprintGrad(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, ddx, ddy, offset);
1208
+ isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1209
+ return res;
1210
+ }
1211
+
1212
+
1213
+ uint NvActiveThreads()
1214
+ {
1215
+ return NvBallot(1);
1216
+ }
1217
+
1218
+
1219
+ //----------------------------------------------------------------------------//
1220
+ //------------------------------ WaveMultiPrefix functions -------------------//
1221
+ //----------------------------------------------------------------------------//
1222
+
1223
+ // Following are the WaveMultiPrefix functions for different operations (Add, Bitand, BitOr, BitXOr) for different datatypes (uint, uint2, uint4)
1224
+ // This is a set of functions which implement multi-prefix operations among the set of active lanes in the current wave (WARP).
1225
+ // A multi-prefix operation comprises a set of prefix operations, executed in parallel within subsets of lanes identified with the provided bitmasks.
1226
+ // These bitmasks represent partitioning of the set of active lanes in the current wave into N groups (where N is the number of unique masks across all lanes in the wave).
1227
+ // N prefix operations are then performed each within its corresponding group.
1228
+ // The groups are assumed to be non-intersecting (that is, a given lane can be a member of one and only one group),
1229
+ // and bitmasks in all lanes belonging to the same group are required to be the same.
1230
+ // There are 2 types of functions - Exclusive and Inclusive prefix operations.
1231
+ // e.g. For the NvWaveMultiPrefixInclusiveAdd(val, mask) operation, the expected output for each group (i.e. for each set of lanes whose mask input is the same) is:
1232
+ // the i^th thread in a group has value = sum(values of threads 0 to i)
1233
+ // For the Exclusive version of the same operation -
1234
+ // the i^th thread in a group has value = sum(values of threads 0 to i-1), and the 0th thread in the group has value 0
1235
+
1236
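+ // --- Worked illustration (editorial addition, not part of the original header) ---------------
+ // Assumed example: 8 lanes are active, lanes 0-3 pass mask = 0x0F and lanes 4-7 pass mask = 0xF0,
+ // partitioning the wave into two groups. If every lane contributes val = 1, then
+ // NvWaveMultiPrefixInclusiveAdd(1, mask) returns 1,2,3,4 within each group, and the Exclusive
+ // variant returns 0,1,2,3, i.e. each lane's 0-based rank within its group. A minimal code sketch
+ // follows the scalar Add pair below.
+ // ----------------------------------------------------------------------------------------------
+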
+ // Extensions for Add
1237
+ uint NvWaveMultiPrefixInclusiveAdd(uint val, uint mask)
1238
+ {
1239
+ uint temp;
1240
+ uint a = NvActiveThreads();
1241
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1242
+ uint nextLane = firstbithigh(remainingThreads);
1243
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1244
+ {
1245
+ temp = NvShfl(val, nextLane);
1246
+ uint laneValid;
1247
+ // As remainingThreads only contains threads in the group with smaller thread-ids than the current thread, nextLane can never be 31 for any thread in the group except the smallest one.
1248
+ // For the smallest thread in the group, remainingThreads is 0 --> nextLane is ~0 (i.e. 31 when considering only its last 5 bits).
1249
+ // So, passing maskClampValue=30 to __NvShflGeneric makes it return laneValid=false for the smallest thread in the group; val and nextLane are then only updated when laneValid is true.
1250
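+ // Concrete example (editorial, with assumed lanes): if the group is lanes {2, 5, 9}, then for lane 5 remainingThreads selects only lane 2, so nextLane = 2;
+ // for lane 2 (the smallest member) remainingThreads = 0, firstbithigh(0) = ~0, and the clamp to 30 below yields laneValid = false, so its val is never modified.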
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1251
+ if (laneValid) // if nextLane's nextLane is valid
1252
+ {
1253
+ val = val + temp;
1254
+ nextLane = newLane;
1255
+ }
1256
+ }
1257
+ return val;
1258
+ }
1259
+
1260
+ uint NvWaveMultiPrefixExclusiveAdd(uint val, uint mask)
1261
+ {
1262
+ uint temp;
1263
+ uint a = NvActiveThreads();
1264
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1265
+ uint lane = firstbithigh(remainingThreads);
1266
+ temp = NvShfl(val, lane);
1267
+ val = remainingThreads != 0 ? temp : 0;
1268
+ return NvWaveMultiPrefixInclusiveAdd(val, mask);
1269
+ }
1270
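+
+ // Minimal sketch (editorial addition): computing a lane's 0-based rank within its group using the
+ // scalar pair defined above. groupMask is an assumed caller-provided partition mask, identical
+ // across all lanes of the same group.
+ uint ExampleRankWithinGroup(uint groupMask)
+ {
+     return NvWaveMultiPrefixExclusiveAdd(1u, groupMask);
+ }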
+
1271
+ uint2 NvWaveMultiPrefixInclusiveAdd(uint2 val, uint mask)
1272
+ {
1273
+ uint2 temp;
1274
+ uint a = NvActiveThreads();
1275
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1276
+ uint nextLane = firstbithigh(remainingThreads);
1277
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1278
+ {
1279
+ temp = NvShfl(val, nextLane);
1280
+ uint laneValid;
1281
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1282
+ if (laneValid) // if nextLane's nextLane is valid
1283
+ {
1284
+ val = val + temp;
1285
+ nextLane = newLane;
1286
+ }
1287
+ }
1288
+ return val;
1289
+ }
1290
+
1291
+ uint2 NvWaveMultiPrefixExclusiveAdd(uint2 val, uint mask)
1292
+ {
1293
+ uint2 temp;
1294
+ uint a = NvActiveThreads();
1295
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1296
+ uint lane = firstbithigh(remainingThreads);
1297
+ temp = NvShfl(val, lane);
1298
+ val = remainingThreads != 0 ? temp : uint2(0, 0);
1299
+ return NvWaveMultiPrefixInclusiveAdd(val, mask);
1300
+ }
1301
+
1302
+ uint4 NvWaveMultiPrefixInclusiveAdd(uint4 val, uint mask)
1303
+ {
1304
+ uint4 temp;
1305
+ uint a = NvActiveThreads();
1306
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1307
+ uint nextLane = firstbithigh(remainingThreads);
1308
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1309
+ {
1310
+ temp = NvShfl(val, nextLane);
1311
+ uint laneValid;
1312
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1313
+ if (laneValid) // if nextLane's nextLane is valid
1314
+ {
1315
+ val = val + temp;
1316
+ nextLane = newLane;
1317
+ }
1318
+ }
1319
+ return val;
1320
+ }
1321
+
1322
+ uint4 NvWaveMultiPrefixExclusiveAdd(uint4 val, uint mask)
1323
+ {
1324
+ uint4 temp;
1325
+ uint a = NvActiveThreads();
1326
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1327
+ uint lane = firstbithigh(remainingThreads);
1328
+ temp = NvShfl(val, lane);
1329
+ val = remainingThreads != 0 ? temp : uint4(0, 0, 0, 0);
1330
+ return NvWaveMultiPrefixInclusiveAdd(val, mask);
1331
+ }
1332
+
1333
+ // MultiPrefix extensions for Bitand
1334
+ uint NvWaveMultiPrefixInclusiveAnd(uint val, uint mask)
1335
+ {
1336
+ uint temp;
1337
+ uint a = NvActiveThreads();
1338
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1339
+ uint nextLane = firstbithigh(remainingThreads);
1340
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1341
+ {
1342
+ temp = NvShfl(val, nextLane);
1343
+ uint laneValid;
1344
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1345
+ if (laneValid) // if nextLane's nextLane is valid
1346
+ {
1347
+ val = val & temp;
1348
+ nextLane = newLane;
1349
+ }
1350
+ }
1351
+ return val;
1352
+ }
1353
+
1354
+ uint NvWaveMultiPrefixExclusiveAnd(uint val, uint mask)
1355
+ {
1356
+ uint temp;
1357
+ uint a = NvActiveThreads();
1358
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1359
+ uint lane = firstbithigh(remainingThreads);
1360
+ temp = NvShfl(val, lane);
1361
+ val = remainingThreads != 0 ? temp : ~0;
1362
+ return NvWaveMultiPrefixInclusiveAnd(val, mask);
1363
+ }
1364
+
1365
+ uint2 NvWaveMultiPrefixInclusiveAnd(uint2 val, uint mask)
1366
+ {
1367
+ uint2 temp;
1368
+ uint a = NvActiveThreads();
1369
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1370
+ uint nextLane = firstbithigh(remainingThreads);
1371
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1372
+ {
1373
+ temp = NvShfl(val, nextLane);
1374
+ uint laneValid;
1375
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1376
+ if (laneValid) // if nextLane's nextLane is valid
1377
+ {
1378
+ val = val & temp;
1379
+ nextLane = newLane;
1380
+ }
1381
+ }
1382
+ return val;
1383
+ }
1384
+
1385
+ uint2 NvWaveMultiPrefixExclusiveAnd(uint2 val, uint mask)
1386
+ {
1387
+ uint2 temp;
1388
+ uint a = NvActiveThreads();
1389
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1390
+ uint lane = firstbithigh(remainingThreads);
1391
+ temp = NvShfl(val, lane);
1392
+ val = remainingThreads != 0 ? temp : uint2(~0, ~0);
1393
+ return NvWaveMultiPrefixInclusiveAnd(val, mask);
1394
+ }
1395
+
1396
+
1397
+ uint4 NvWaveMultiPrefixInclusiveAnd(uint4 val, uint mask)
1398
+ {
1399
+ uint4 temp;
1400
+ uint a = NvActiveThreads();
1401
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1402
+ uint nextLane = firstbithigh(remainingThreads);
1403
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1404
+ {
1405
+ temp = NvShfl(val, nextLane);
1406
+ uint laneValid;
1407
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1408
+ if (laneValid) // if nextLane's nextLane is valid
1409
+ {
1410
+ val = val & temp;
1411
+ nextLane = newLane;
1412
+ }
1413
+ }
1414
+ return val;
1415
+ }
1416
+
1417
+ uint4 NvWaveMultiPrefixExclusiveAnd(uint4 val, uint mask)
1418
+ {
1419
+ uint4 temp;
1420
+ uint a = NvActiveThreads();
1421
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1422
+ uint lane = firstbithigh(remainingThreads);
1423
+ temp = NvShfl(val, lane);
1424
+ val = remainingThreads != 0 ? temp : uint4(~0, ~0, ~0, ~0);
1425
+ return NvWaveMultiPrefixInclusiveAnd(val, mask);
1426
+ }
1427
+
1428
+
1429
+ // MultiPrefix extensions for BitOr
1430
+ uint NvWaveMultiPrefixInclusiveOr(uint val, uint mask)
1431
+ {
1432
+ uint temp;
1433
+ uint a = NvActiveThreads();
1434
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1435
+ uint nextLane = firstbithigh(remainingThreads);
1436
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1437
+ {
1438
+ temp = NvShfl(val, nextLane);
1439
+ uint laneValid;
1440
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1441
+ if (laneValid) // if nextLane's nextLane is valid
1442
+ {
1443
+ val = val | temp;
1444
+ nextLane = newLane;
1445
+ }
1446
+ }
1447
+ return val;
1448
+ }
1449
+
1450
+ uint NvWaveMultiPrefixExclusiveOr(uint val, uint mask)
1451
+ {
1452
+ uint temp;
1453
+ uint a = NvActiveThreads();
1454
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1455
+ uint lane = firstbithigh(remainingThreads);
1456
+ temp = NvShfl(val, lane);
1457
+ val = remainingThreads != 0 ? temp : 0;
1458
+ return NvWaveMultiPrefixInclusiveOr(val, mask);
1459
+ }
1460
+
1461
+ uint2 NvWaveMultiPrefixInclusiveOr(uint2 val, uint mask)
1462
+ {
1463
+ uint2 temp;
1464
+ uint a = NvActiveThreads();
1465
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1466
+ uint nextLane = firstbithigh(remainingThreads);
1467
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1468
+ {
1469
+ temp = NvShfl(val, nextLane);
1470
+ uint laneValid;
1471
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1472
+ if (laneValid) // if nextLane's nextLane is valid
1473
+ {
1474
+ val = val | temp;
1475
+ nextLane = newLane;
1476
+ }
1477
+ }
1478
+ return val;
1479
+ }
1480
+
1481
+ uint2 NvWaveMultiPrefixExclusiveOr(uint2 val, uint mask)
1482
+ {
1483
+ uint2 temp;
1484
+ uint a = NvActiveThreads();
1485
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1486
+ uint lane = firstbithigh(remainingThreads);
1487
+ temp = NvShfl(val, lane);
1488
+ val = remainingThreads != 0 ? temp : uint2(0, 0);
1489
+ return NvWaveMultiPrefixInclusiveOr(val, mask);
1490
+ }
1491
+
1492
+
1493
+ uint4 NvWaveMultiPrefixInclusiveOr(uint4 val, uint mask)
1494
+ {
1495
+ uint4 temp;
1496
+ uint a = NvActiveThreads();
1497
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1498
+ uint nextLane = firstbithigh(remainingThreads);
1499
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1500
+ {
1501
+ temp = NvShfl(val, nextLane);
1502
+ uint laneValid;
1503
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1504
+ if (laneValid) // if nextLane's nextLane is valid
1505
+ {
1506
+ val = val | temp;
1507
+ nextLane = newLane;
1508
+ }
1509
+ }
1510
+ return val;
1511
+ }
1512
+
1513
+ uint4 NvWaveMultiPrefixExclusiveOr(uint4 val, uint mask)
1514
+ {
1515
+ uint4 temp;
1516
+ uint a = NvActiveThreads();
1517
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1518
+ uint lane = firstbithigh(remainingThreads);
1519
+ temp = NvShfl(val, lane);
1520
+ val = remainingThreads != 0 ? temp : uint4(0, 0, 0, 0);
1521
+ return NvWaveMultiPrefixInclusiveOr(val, mask);
1522
+ }
1523
+
1524
+
1525
+ // MultiPrefix extensions for BitXOr
1526
+ uint NvWaveMultiPrefixInclusiveXOr(uint val, uint mask)
1527
+ {
1528
+ uint temp;
1529
+ uint a = NvActiveThreads();
1530
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1531
+ uint nextLane = firstbithigh(remainingThreads);
1532
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1533
+ {
1534
+ temp = NvShfl(val, nextLane);
1535
+ uint laneValid;
1536
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1537
+ if (laneValid) // if nextLane's nextLane is valid
1538
+ {
1539
+ val = val ^ temp;
1540
+ nextLane = newLane;
1541
+ }
1542
+ }
1543
+ return val;
1544
+ }
1545
+
1546
+ uint NvWaveMultiPrefixExclusiveXOr(uint val, uint mask)
1547
+ {
1548
+ uint temp;
1549
+ uint a = NvActiveThreads();
1550
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1551
+ uint lane = firstbithigh(remainingThreads);
1552
+ temp = NvShfl(val, lane);
1553
+ val = remainingThreads != 0 ? temp : 0;
1554
+ return NvWaveMultiPrefixInclusiveXOr(val, mask);
1555
+ }
1556
+
1557
+ uint2 NvWaveMultiPrefixInclusiveXOr(uint2 val, uint mask)
1558
+ {
1559
+ uint2 temp;
1560
+ uint a = NvActiveThreads();
1561
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1562
+ uint nextLane = firstbithigh(remainingThreads);
1563
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1564
+ {
1565
+ temp = NvShfl(val, nextLane);
1566
+ uint laneValid;
1567
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1568
+ if (laneValid) // if nextLane's nextLane is valid
1569
+ {
1570
+ val = val ^ temp;
1571
+ nextLane = newLane;
1572
+ }
1573
+ }
1574
+ return val;
1575
+ }
1576
+
1577
+ uint2 NvWaveMultiPrefixExclusiveXOr(uint2 val, uint mask)
1578
+ {
1579
+ uint2 temp;
1580
+ uint a = NvActiveThreads();
1581
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1582
+ uint lane = firstbithigh(remainingThreads);
1583
+ temp = NvShfl(val, lane);
1584
+ val = remainingThreads != 0 ? temp : uint2(0, 0);
1585
+ return NvWaveMultiPrefixInclusiveXOr(val, mask);
1586
+ }
1587
+
1588
+
1589
+ uint4 NvWaveMultiPrefixInclusiveXOr(uint4 val, uint mask)
1590
+ {
1591
+ uint4 temp;
1592
+ uint a = NvActiveThreads();
1593
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1594
+ uint nextLane = firstbithigh(remainingThreads);
1595
+ for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1596
+ {
1597
+ temp = NvShfl(val, nextLane);
1598
+ uint laneValid;
1599
+ uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1600
+ if (laneValid) // if nextLane's nextLane is valid
1601
+ {
1602
+ val = val ^ temp;
1603
+ nextLane = newLane;
1604
+ }
1605
+ }
1606
+ return val;
1607
+ }
1608
+
1609
+ uint4 NvWaveMultiPrefixExclusiveXOr(uint4 val, uint mask)
1610
+ {
1611
+ uint4 temp;
1612
+ uint a = NvActiveThreads();
1613
+ uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1614
+ uint lane = firstbithigh(remainingThreads);
1615
+ temp = NvShfl(val, lane);
1616
+ val = remainingThreads != 0 ? temp : uint4(0, 0, 0, 0);
1617
+ return NvWaveMultiPrefixInclusiveXOr(val, mask);
1618
+ }
1619
+
1620
+
1621
+ //----------------------------------------------------------------------------//
1622
+ //------------------------- DXR Micro-map Extension --------------------------//
1623
+ //----------------------------------------------------------------------------//
1624
+
1625
+ float3x3 NvRtTriangleObjectPositions()
1626
+ {
1627
+ uint index = g_NvidiaExt.IncrementCounter();
1628
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_TRIANGLE_OBJECT_POSITIONS;
1629
+
1630
+ float3x3 ret;
1631
+ ret[0][0] = asfloat(g_NvidiaExt.IncrementCounter());
1632
+ ret[0][1] = asfloat(g_NvidiaExt.IncrementCounter());
1633
+ ret[0][2] = asfloat(g_NvidiaExt.IncrementCounter());
1634
+ ret[1][0] = asfloat(g_NvidiaExt.IncrementCounter());
1635
+ ret[1][1] = asfloat(g_NvidiaExt.IncrementCounter());
1636
+ ret[1][2] = asfloat(g_NvidiaExt.IncrementCounter());
1637
+ ret[2][0] = asfloat(g_NvidiaExt.IncrementCounter());
1638
+ ret[2][1] = asfloat(g_NvidiaExt.IncrementCounter());
1639
+ ret[2][2] = asfloat(g_NvidiaExt.IncrementCounter());
1640
+ return ret;
1641
+ }
1642
+
1643
+ float3x3 NvRtMicroTriangleObjectPositions()
1644
+ {
1645
+ uint index = g_NvidiaExt.IncrementCounter();
1646
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_MICRO_TRIANGLE_OBJECT_POSITIONS;
1647
+
1648
+ float3x3 ret;
1649
+ ret[0][0] = asfloat(g_NvidiaExt.IncrementCounter());
1650
+ ret[0][1] = asfloat(g_NvidiaExt.IncrementCounter());
1651
+ ret[0][2] = asfloat(g_NvidiaExt.IncrementCounter());
1652
+ ret[1][0] = asfloat(g_NvidiaExt.IncrementCounter());
1653
+ ret[1][1] = asfloat(g_NvidiaExt.IncrementCounter());
1654
+ ret[1][2] = asfloat(g_NvidiaExt.IncrementCounter());
1655
+ ret[2][0] = asfloat(g_NvidiaExt.IncrementCounter());
1656
+ ret[2][1] = asfloat(g_NvidiaExt.IncrementCounter());
1657
+ ret[2][2] = asfloat(g_NvidiaExt.IncrementCounter());
1658
+ return ret;
1659
+ }
1660
+
1661
+ float3x2 NvRtMicroTriangleBarycentrics()
1662
+ {
1663
+ uint index = g_NvidiaExt.IncrementCounter();
1664
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_MICRO_TRIANGLE_BARYCENTRICS;
1665
+
1666
+ float3x2 ret;
1667
+ ret[0][0] = asfloat(g_NvidiaExt.IncrementCounter());
1668
+ ret[0][1] = asfloat(g_NvidiaExt.IncrementCounter());
1669
+ ret[1][0] = asfloat(g_NvidiaExt.IncrementCounter());
1670
+ ret[1][1] = asfloat(g_NvidiaExt.IncrementCounter());
1671
+ ret[2][0] = asfloat(g_NvidiaExt.IncrementCounter());
1672
+ ret[2][1] = asfloat(g_NvidiaExt.IncrementCounter());
1673
+ return ret;
1674
+ }
1675
+
1676
+ bool NvRtIsMicroTriangleHit()
1677
+ {
1678
+ uint index = g_NvidiaExt.IncrementCounter();
1679
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_IS_MICRO_TRIANGLE_HIT;
1680
+ uint ret = g_NvidiaExt.IncrementCounter();
1681
+ return ret != 0;
1682
+ }
1683
+
1684
+ bool NvRtIsBackFacing()
1685
+ {
1686
+ uint index = g_NvidiaExt.IncrementCounter();
1687
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_IS_BACK_FACING;
1688
+ uint ret = g_NvidiaExt.IncrementCounter();
1689
+ return ret != 0;
1690
+ }
1691
+
1692
+ #if __SHADER_TARGET_MAJOR > 6 || (__SHADER_TARGET_MAJOR == 6 && __SHADER_TARGET_MINOR >= 5)
1693
+
1694
+ float3 NvRtMicroVertexObjectPosition(RaytracingAccelerationStructure AccelerationStructure, uint InstanceIndex, uint GeometryIndex, uint PrimitiveIndex, uint2 UV)
1695
+ {
1696
+ uint index = g_NvidiaExt.IncrementCounter();
1697
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_MICRO_VERTEX_OBJECT_POSITION;
1698
+ g_NvidiaExt[index].src0u.x = InstanceIndex;
1699
+ g_NvidiaExt[index].src0u.y = GeometryIndex;
1700
+ g_NvidiaExt[index].src0u.z = PrimitiveIndex;
1701
+ g_NvidiaExt[index].src0u.w = UV.x;
1702
+ g_NvidiaExt[index].src1u.x = UV.y;
1703
+ uint handle = g_NvidiaExt.IncrementCounter();
1704
+ float3 ret;
1705
+ ret.x = asfloat(g_NvidiaExt.IncrementCounter());
1706
+ ret.y = asfloat(g_NvidiaExt.IncrementCounter());
1707
+ ret.z = asfloat(g_NvidiaExt.IncrementCounter());
1708
+
1709
+ RayQuery<0> rq;
1710
+ rq.TraceRayInline(AccelerationStructure, 0, handle, (RayDesc)0);
1711
+
1712
+ return ret;
1713
+ }
1714
+
1715
+ float2 NvRtMicroVertexBarycentrics(RaytracingAccelerationStructure AccelerationStructure, uint InstanceIndex, uint GeometryIndex, uint PrimitiveIndex, uint2 UV)
1716
+ {
1717
+ uint index = g_NvidiaExt.IncrementCounter();
1718
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_MICRO_VERTEX_BARYCENTRICS;
1719
+ g_NvidiaExt[index].src0u.x = InstanceIndex;
1720
+ g_NvidiaExt[index].src0u.y = GeometryIndex;
1721
+ g_NvidiaExt[index].src0u.z = PrimitiveIndex;
1722
+ g_NvidiaExt[index].src0u.w = UV.x;
1723
+ g_NvidiaExt[index].src1u.x = UV.y;
1724
+ uint handle = g_NvidiaExt.IncrementCounter();
1725
+ float2 ret;
1726
+ ret.x = asfloat(g_NvidiaExt.IncrementCounter());
1727
+ ret.y = asfloat(g_NvidiaExt.IncrementCounter());
1728
+
1729
+ RayQuery<0> rq;
1730
+ rq.TraceRayInline(AccelerationStructure, 0, handle, (RayDesc)0);
1731
+
1732
+ return ret;
1733
+ }
1734
+
1735
+ #endif
1736
+
1737
+ //----------------------------------------------------------------------------//
1738
+ //------------------------- DXR HitObject Extension --------------------------//
1739
+ //----------------------------------------------------------------------------//
1740
+
1741
+ // Support for templates in HLSL requires HLSL 2021+. When using dxc,
1742
+ // pass the -HV 2021 command line argument to enable this language version.
1743
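+ // For example (the shader file name and library target below are illustrative assumptions):
+ //     dxc -T lib_6_3 -HV 2021 my_raytracing_shaders.hlsl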
+ #if defined(__HLSL_VERSION) && (__HLSL_VERSION >= 2021) && !defined(NV_HITOBJECT_USE_MACRO_API)
1744
+
1745
+ struct NvHitObject {
1746
+ uint _handle;
1747
+
1748
+ bool IsMiss()
1749
+ {
1750
+ uint index = g_NvidiaExt.IncrementCounter();
1751
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_MISS;
1752
+ g_NvidiaExt[index].src0u.x = _handle;
1753
+ uint ret = g_NvidiaExt.IncrementCounter();
1754
+ return ret != 0;
1755
+ }
1756
+
1757
+ bool IsHit()
1758
+ {
1759
+ uint index = g_NvidiaExt.IncrementCounter();
1760
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_HIT;
1761
+ g_NvidiaExt[index].src0u.x = _handle;
1762
+ uint ret = g_NvidiaExt.IncrementCounter();
1763
+ return ret != 0;
1764
+ }
1765
+
1766
+ bool IsNop()
1767
+ {
1768
+ uint index = g_NvidiaExt.IncrementCounter();
1769
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_NOP;
1770
+ g_NvidiaExt[index].src0u.x = _handle;
1771
+ uint ret = g_NvidiaExt.IncrementCounter();
1772
+ return ret != 0;
1773
+ }
1774
+
1775
+ uint GetInstanceID()
1776
+ {
1777
+ uint index = g_NvidiaExt.IncrementCounter();
1778
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_INSTANCE_ID;
1779
+ g_NvidiaExt[index].src0u.x = _handle;
1780
+ return g_NvidiaExt.IncrementCounter();
1781
+ }
1782
+
1783
+ uint GetInstanceIndex()
1784
+ {
1785
+ uint index = g_NvidiaExt.IncrementCounter();
1786
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_INSTANCE_INDEX;
1787
+ g_NvidiaExt[index].src0u.x = _handle;
1788
+ return g_NvidiaExt.IncrementCounter();
1789
+ }
1790
+
1791
+ uint GetPrimitiveIndex()
1792
+ {
1793
+ uint index = g_NvidiaExt.IncrementCounter();
1794
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_PRIMITIVE_INDEX;
1795
+ g_NvidiaExt[index].src0u.x = _handle;
1796
+ return g_NvidiaExt.IncrementCounter();
1797
+ }
1798
+
1799
+ uint GetGeometryIndex()
1800
+ {
1801
+ uint index = g_NvidiaExt.IncrementCounter();
1802
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_GEOMETRY_INDEX;
1803
+ g_NvidiaExt[index].src0u.x = _handle;
1804
+ return g_NvidiaExt.IncrementCounter();
1805
+ }
1806
+
1807
+ uint GetHitKind()
1808
+ {
1809
+ uint index = g_NvidiaExt.IncrementCounter();
1810
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_HIT_KIND;
1811
+ g_NvidiaExt[index].src0u.x = _handle;
1812
+ return g_NvidiaExt.IncrementCounter();
1813
+ }
1814
+
1815
+ RayDesc GetRayDesc()
1816
+ {
1817
+ uint index = g_NvidiaExt.IncrementCounter();
1818
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_RAY_DESC;
1819
+ g_NvidiaExt[index].src0u.x = _handle;
1820
+
1821
+ uint tmin = g_NvidiaExt.IncrementCounter();
1822
+ uint tmax = g_NvidiaExt.IncrementCounter();
1823
+ uint rayOrgX = g_NvidiaExt.IncrementCounter();
1824
+ uint rayOrgY = g_NvidiaExt.IncrementCounter();
1825
+ uint rayOrgZ = g_NvidiaExt.IncrementCounter();
1826
+ uint rayDirX = g_NvidiaExt.IncrementCounter();
1827
+ uint rayDirY = g_NvidiaExt.IncrementCounter();
1828
+ uint rayDirZ = g_NvidiaExt.IncrementCounter();
1829
+
1830
+ RayDesc ray;
1831
+ ray.TMin = asfloat(tmin);
1832
+ ray.TMax = asfloat(tmax);
1833
+ ray.Origin.x = asfloat(rayOrgX);
1834
+ ray.Origin.y = asfloat(rayOrgY);
1835
+ ray.Origin.z = asfloat(rayOrgZ);
1836
+ ray.Direction.x = asfloat(rayDirX);
1837
+ ray.Direction.y = asfloat(rayDirY);
1838
+ ray.Direction.z = asfloat(rayDirZ);
1839
+
1840
+ return ray;
1841
+ }
1842
+
1843
+ template <typename T>
1844
+ T GetAttributes()
1845
+ {
1846
+ uint index = g_NvidiaExt.IncrementCounter();
1847
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_ATTRIBUTES;
1848
+ g_NvidiaExt[index].src0u.x = _handle;
1849
+ uint callHandle = g_NvidiaExt.IncrementCounter();
1850
+
1851
+ T attrs;
1852
+ CallShader(callHandle, attrs);
1853
+ return attrs;
1854
+ }
1855
+
1856
+ uint GetShaderTableIndex()
1857
+ {
1858
+ uint index = g_NvidiaExt.IncrementCounter();
1859
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_SHADER_TABLE_INDEX;
1860
+ g_NvidiaExt[index].src0u.x = _handle;
1861
+ return g_NvidiaExt.IncrementCounter();
1862
+ }
1863
+
1864
+ uint LoadLocalRootTableConstant(uint RootConstantOffsetInBytes)
1865
+ {
1866
+ uint index = g_NvidiaExt.IncrementCounter();
1867
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_LOAD_LOCAL_ROOT_TABLE_CONSTANT;
1868
+ g_NvidiaExt[index].src0u.x = _handle;
1869
+ g_NvidiaExt[index].src0u.y = RootConstantOffsetInBytes;
1870
+ return g_NvidiaExt.IncrementCounter();
1871
+ }
1872
+ };
1873
+
1874
+ template<typename T>
1875
+ NvHitObject NvTraceRayHitObject(
1876
+ RaytracingAccelerationStructure AccelerationStructure,
1877
+ uint RayFlags,
1878
+ uint InstanceInclusionMask,
1879
+ uint RayContributionToHitGroupIndex,
1880
+ uint MultiplierForGeometryContributionToHitGroupIndex,
1881
+ uint MissShaderIndex,
1882
+ RayDesc Ray,
1883
+ inout T Payload)
1884
+ {
1885
+ uint index = g_NvidiaExt.IncrementCounter();
1886
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_TRACE_RAY;
1887
+ g_NvidiaExt[index].numOutputsForIncCounter = 2;
1888
+ g_NvidiaExt[index].src0u.x = MissShaderIndex;
1889
+ uint hitHandle = g_NvidiaExt.IncrementCounter();
1890
+ uint traceHandle = g_NvidiaExt.IncrementCounter();
1891
+
1892
+ TraceRay(AccelerationStructure, RayFlags, InstanceInclusionMask, RayContributionToHitGroupIndex, MultiplierForGeometryContributionToHitGroupIndex, traceHandle, Ray, Payload);
1893
+
1894
+ NvHitObject hitObj;
1895
+ hitObj._handle = hitHandle;
1896
+ return hitObj;
1897
+ }
1898
+
1899
+ template <typename T>
1900
+ NvHitObject NvMakeHit(
1901
+ RaytracingAccelerationStructure AccelerationStructure,
1902
+ uint InstanceIndex,
1903
+ uint GeometryIndex,
1904
+ uint PrimitiveIndex,
1905
+ uint HitKind,
1906
+ uint RayContributionToHitGroupIndex,
1907
+ uint MultiplierForGeometryContributionToHitGroupIndex,
1908
+ RayDesc Ray,
1909
+ T Attributes)
1910
+ {
1911
+ uint index = g_NvidiaExt.IncrementCounter();
1912
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_HIT;
1913
+ g_NvidiaExt[index].numOutputsForIncCounter = 2;
1914
+ g_NvidiaExt[index].src0u.x = InstanceIndex;
1915
+ g_NvidiaExt[index].src0u.y = GeometryIndex;
1916
+ g_NvidiaExt[index].src0u.z = PrimitiveIndex;
1917
+ g_NvidiaExt[index].src0u.w = HitKind;
1918
+ g_NvidiaExt[index].src1u.x = RayContributionToHitGroupIndex;
1919
+ g_NvidiaExt[index].src1u.y = MultiplierForGeometryContributionToHitGroupIndex;
1920
+ uint hitHandle = g_NvidiaExt.IncrementCounter();
1921
+ uint traceHandle = g_NvidiaExt.IncrementCounter();
1922
+
1923
+ struct AttrWrapper { T Attrs; };
1924
+ AttrWrapper wrapper;
1925
+ wrapper.Attrs = Attributes;
1926
+ CallShader(traceHandle, wrapper);
1927
+
1928
+ struct DummyPayload { int a; };
1929
+ DummyPayload payload;
1930
+ TraceRay(AccelerationStructure, 0, 0, 0, 0, traceHandle, Ray, payload);
1931
+
1932
+ NvHitObject hitObj;
1933
+ hitObj._handle = hitHandle;
1934
+ return hitObj;
1935
+ }
1936
+
1937
+ template <typename T>
1938
+ NvHitObject NvMakeHitWithRecordIndex(
1939
+ uint HitGroupRecordIndex,
1940
+ RaytracingAccelerationStructure AccelerationStructure,
1941
+ uint InstanceIndex,
1942
+ uint GeometryIndex,
1943
+ uint PrimitiveIndex,
1944
+ uint HitKind,
1945
+ RayDesc Ray,
1946
+ T Attributes)
1947
+ {
1948
+ uint index = g_NvidiaExt.IncrementCounter();
1949
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_HIT_WITH_RECORD_INDEX;
1950
+ g_NvidiaExt[index].numOutputsForIncCounter = 2;
1951
+ g_NvidiaExt[index].src0u.x = InstanceIndex;
1952
+ g_NvidiaExt[index].src0u.y = GeometryIndex;
1953
+ g_NvidiaExt[index].src0u.z = PrimitiveIndex;
1954
+ g_NvidiaExt[index].src0u.w = HitKind;
1955
+ g_NvidiaExt[index].src1u.x = HitGroupRecordIndex;
1956
+ uint hitHandle = g_NvidiaExt.IncrementCounter();
1957
+ uint traceHandle = g_NvidiaExt.IncrementCounter();
1958
+
1959
+ struct AttrWrapper { T Attrs; };
1960
+ AttrWrapper wrapper;
1961
+ wrapper.Attrs = Attributes;
1962
+ CallShader(traceHandle, wrapper);
1963
+
1964
+ struct DummyPayload { int a; };
1965
+ DummyPayload payload;
1966
+ TraceRay(AccelerationStructure, 0, 0, 0, 0, traceHandle, Ray, payload);
1967
+
1968
+ NvHitObject hitObj;
1969
+ hitObj._handle = hitHandle;
1970
+ return hitObj;
1971
+ }
1972
+
1973
+ NvHitObject NvMakeMiss(
1974
+ uint MissShaderIndex,
1975
+ RayDesc Ray)
1976
+ {
1977
+ uint index = g_NvidiaExt.IncrementCounter();
1978
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_MISS;
1979
+ g_NvidiaExt[index].src0u.x = MissShaderIndex;
1980
+ g_NvidiaExt[index].src0u.y = asuint(Ray.TMin);
1981
+ g_NvidiaExt[index].src0u.z = asuint(Ray.TMax);
1982
+ g_NvidiaExt[index].src1u.x = asuint(Ray.Origin.x);
1983
+ g_NvidiaExt[index].src1u.y = asuint(Ray.Origin.y);
1984
+ g_NvidiaExt[index].src1u.z = asuint(Ray.Origin.z);
1985
+ g_NvidiaExt[index].src2u.x = asuint(Ray.Direction.x);
1986
+ g_NvidiaExt[index].src2u.y = asuint(Ray.Direction.y);
1987
+ g_NvidiaExt[index].src2u.z = asuint(Ray.Direction.z);
1988
+ uint hitHandle = g_NvidiaExt.IncrementCounter();
1989
+
1990
+ NvHitObject hitObj;
1991
+ hitObj._handle = hitHandle;
1992
+ return hitObj;
1993
+ }
1994
+
1995
+ NvHitObject NvMakeNop()
1996
+ {
1997
+ uint index = g_NvidiaExt.IncrementCounter();
1998
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_NOP;
1999
+ uint hitHandle = g_NvidiaExt.IncrementCounter();
2000
+
2001
+ NvHitObject hitObj;
2002
+ hitObj._handle = hitHandle;
2003
+ return hitObj;
2004
+ }
2005
+
2006
+ void NvReorderThread(uint CoherenceHint, uint NumCoherenceHintBits)
2007
+ {
2008
+ uint index = g_NvidiaExt.IncrementCounter();
2009
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_REORDER_THREAD;
2010
+ g_NvidiaExt[index].src0u.x = 0;
2011
+ g_NvidiaExt[index].src0u.y = 0;
2012
+ g_NvidiaExt[index].src0u.z = CoherenceHint;
2013
+ g_NvidiaExt[index].src0u.w = NumCoherenceHintBits;
2014
+ g_NvidiaExt.IncrementCounter();
2015
+ }
2016
+
2017
+ void NvReorderThread(NvHitObject HitObj, uint CoherenceHint, uint NumCoherenceHintBits)
2018
+ {
2019
+ uint index = g_NvidiaExt.IncrementCounter();
2020
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_REORDER_THREAD;
2021
+ g_NvidiaExt[index].src0u.x = 1;
2022
+ g_NvidiaExt[index].src0u.y = HitObj._handle;
2023
+ g_NvidiaExt[index].src0u.z = CoherenceHint;
2024
+ g_NvidiaExt[index].src0u.w = NumCoherenceHintBits;
2025
+ g_NvidiaExt.IncrementCounter();
2026
+ }
2027
+
2028
+ void NvReorderThread(NvHitObject HitObj)
2029
+ {
2030
+ NvReorderThread(HitObj, 0, 0);
2031
+ }
2032
+
2033
+ template<typename T>
2034
+ void NvInvokeHitObject(
2035
+ RaytracingAccelerationStructure AccelerationStructure,
2036
+ NvHitObject HitObj,
2037
+ inout T Payload)
2038
+ {
2039
+ uint index = g_NvidiaExt.IncrementCounter();
2040
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_INVOKE;
2041
+ g_NvidiaExt[index].src0u.x = HitObj._handle;
2042
+ uint handle = g_NvidiaExt.IncrementCounter();
2043
+
2044
+ TraceRay(AccelerationStructure, 0, 0, 0, 0, handle, (RayDesc)0, Payload);
2045
+ }
2046
+
2047
+ // Macro-based version of the HitObject API. Use this when HLSL 2021 is not available.
2048
+ // Enable by specifying #define NV_HITOBJECT_USE_MACRO_API before including this header.
2049
+ #elif defined(NV_HITOBJECT_USE_MACRO_API)
2050
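+
+ // --- Illustrative usage sketch (editorial addition, not part of the original header) ----------
+ // With the macro API, results are written through the final macro argument instead of being
+ // returned. A typical call site (resource, payload and ray setup are placeholders/assumptions):
+ //     NvHitObject hit;
+ //     NvTraceRayHitObject(SceneBVH, RAY_FLAG_NONE, 0xFF, 0, 1, 0, ray, payload, hit);
+ //     NvReorderThread(hit);
+ //     NvInvokeHitObject(SceneBVH, hit, payload);
+ // -----------------------------------------------------------------------------------------------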
+
2051
+ struct NvHitObject {
2052
+ uint _handle;
2053
+
2054
+ bool IsMiss()
2055
+ {
2056
+ uint index = g_NvidiaExt.IncrementCounter();
2057
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_MISS;
2058
+ g_NvidiaExt[index].src0u.x = _handle;
2059
+ uint ret = g_NvidiaExt.IncrementCounter();
2060
+ return ret != 0;
2061
+ }
2062
+
2063
+ bool IsHit()
2064
+ {
2065
+ uint index = g_NvidiaExt.IncrementCounter();
2066
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_HIT;
2067
+ g_NvidiaExt[index].src0u.x = _handle;
2068
+ uint ret = g_NvidiaExt.IncrementCounter();
2069
+ return ret != 0;
2070
+ }
2071
+
2072
+ bool IsNop()
2073
+ {
2074
+ uint index = g_NvidiaExt.IncrementCounter();
2075
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_NOP;
2076
+ g_NvidiaExt[index].src0u.x = _handle;
2077
+ uint ret = g_NvidiaExt.IncrementCounter();
2078
+ return ret != 0;
2079
+ }
2080
+
2081
+ uint GetInstanceID()
2082
+ {
2083
+ uint index = g_NvidiaExt.IncrementCounter();
2084
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_INSTANCE_ID;
2085
+ g_NvidiaExt[index].src0u.x = _handle;
2086
+ return g_NvidiaExt.IncrementCounter();
2087
+ }
2088
+
2089
+ uint GetInstanceIndex()
2090
+ {
2091
+ uint index = g_NvidiaExt.IncrementCounter();
2092
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_INSTANCE_INDEX;
2093
+ g_NvidiaExt[index].src0u.x = _handle;
2094
+ return g_NvidiaExt.IncrementCounter();
2095
+ }
2096
+
2097
+ uint GetPrimitiveIndex()
2098
+ {
2099
+ uint index = g_NvidiaExt.IncrementCounter();
2100
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_PRIMITIVE_INDEX;
2101
+ g_NvidiaExt[index].src0u.x = _handle;
2102
+ return g_NvidiaExt.IncrementCounter();
2103
+ }
2104
+
2105
+ uint GetGeometryIndex()
2106
+ {
2107
+ uint index = g_NvidiaExt.IncrementCounter();
2108
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_GEOMETRY_INDEX;
2109
+ g_NvidiaExt[index].src0u.x = _handle;
2110
+ return g_NvidiaExt.IncrementCounter();
2111
+ }
2112
+
2113
+ uint GetHitKind()
2114
+ {
2115
+ uint index = g_NvidiaExt.IncrementCounter();
2116
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_HIT_KIND;
2117
+ g_NvidiaExt[index].src0u.x = _handle;
2118
+ return g_NvidiaExt.IncrementCounter();
2119
+ }
2120
+
2121
+ RayDesc GetRayDesc()
2122
+ {
2123
+ uint index = g_NvidiaExt.IncrementCounter();
2124
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_RAY_DESC;
2125
+ g_NvidiaExt[index].src0u.x = _handle;
2126
+
2127
+ uint tmin = g_NvidiaExt.IncrementCounter();
2128
+ uint tmax = g_NvidiaExt.IncrementCounter();
2129
+ uint rayOrgX = g_NvidiaExt.IncrementCounter();
2130
+ uint rayOrgY = g_NvidiaExt.IncrementCounter();
2131
+ uint rayOrgZ = g_NvidiaExt.IncrementCounter();
2132
+ uint rayDirX = g_NvidiaExt.IncrementCounter();
2133
+ uint rayDirY = g_NvidiaExt.IncrementCounter();
2134
+ uint rayDirZ = g_NvidiaExt.IncrementCounter();
2135
+
2136
+ RayDesc ray;
2137
+ ray.TMin = asfloat(tmin);
2138
+ ray.TMax = asfloat(tmax);
2139
+ ray.Origin.x = asfloat(rayOrgX);
2140
+ ray.Origin.y = asfloat(rayOrgY);
2141
+ ray.Origin.z = asfloat(rayOrgZ);
2142
+ ray.Direction.x = asfloat(rayDirX);
2143
+ ray.Direction.y = asfloat(rayDirY);
2144
+ ray.Direction.z = asfloat(rayDirZ);
2145
+
2146
+ return ray;
2147
+ }
2148
+
2149
+ uint GetShaderTableIndex()
2150
+ {
2151
+ uint index = g_NvidiaExt.IncrementCounter();
2152
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_SHADER_TABLE_INDEX;
2153
+ g_NvidiaExt[index].src0u.x = _handle;
2154
+ return g_NvidiaExt.IncrementCounter();
2155
+ }
2156
+
2157
+ uint LoadLocalRootTableConstant(uint RootConstantOffsetInBytes)
2158
+ {
2159
+ uint index = g_NvidiaExt.IncrementCounter();
2160
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_LOAD_LOCAL_ROOT_TABLE_CONSTANT;
2161
+ g_NvidiaExt[index].src0u.x = _handle;
2162
+ g_NvidiaExt[index].src0u.y = RootConstantOffsetInBytes;
2163
+ return g_NvidiaExt.IncrementCounter();
2164
+ }
2165
+ };
2166
+
2167
+ #define NvTraceRayHitObject(AccelerationStructure,RayFlags,InstanceInclusionMask,RayContributionToHitGroupIndex,MultiplierForGeometryContributionToHitGroupIndex,MissShaderIndex,Ray,Payload,ResultHitObj) \
2168
+ do { \
2169
+ uint _rayFlags = RayFlags; \
2170
+ uint _instanceInclusionMask = InstanceInclusionMask; \
2171
+ uint _rayContributionToHitGroupIndex = RayContributionToHitGroupIndex; \
2172
+ uint _multiplierForGeometryContributionToHitGroupIndex = MultiplierForGeometryContributionToHitGroupIndex; \
2173
+ uint _missShaderIndex = MissShaderIndex; \
2174
+ RayDesc _ray = Ray; \
2175
+ uint _index = g_NvidiaExt.IncrementCounter(); \
2176
+ g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_TRACE_RAY; \
2177
+ g_NvidiaExt[_index].numOutputsForIncCounter = 2; \
2178
+ g_NvidiaExt[_index].src0u.x = _missShaderIndex; \
2179
+ uint _hitHandle = g_NvidiaExt.IncrementCounter(); \
2180
+ uint _traceHandle = g_NvidiaExt.IncrementCounter(); \
2181
+ TraceRay(AccelerationStructure, _rayFlags, _instanceInclusionMask, _rayContributionToHitGroupIndex, _multiplierForGeometryContributionToHitGroupIndex, _traceHandle, _ray, Payload); \
2182
+ ResultHitObj._handle = _hitHandle; \
2183
+ } while(0)
2184
+
2185
+ struct NvHitObjectMacroDummyPayloadType { int a; };
2186
+
2187
+ #define NvMakeHit(AccelerationStructure,InstanceIndex,GeometryIndex,PrimitiveIndex,HitKind,RayContributionToHitGroupIndex,MultiplierForGeometryContributionToHitGroupIndex,Ray,Attributes,ResultHitObj) \
2188
+ do { \
2189
+ uint _instanceIndex = InstanceIndex; \
2190
+ uint _geometryIndex = GeometryIndex; \
2191
+ uint _primitiveIndex = PrimitiveIndex; \
2192
+ uint _hitKind = HitKind; \
2193
+ uint _rayContributionToHitGroupIndex = RayContributionToHitGroupIndex; \
2194
+ uint _multiplierForGeometryContributionToHitGroupIndex = MultiplierForGeometryContributionToHitGroupIndex; \
2195
+ RayDesc _ray = Ray; \
2196
+ uint _index = g_NvidiaExt.IncrementCounter(); \
2197
+ g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_HIT; \
2198
+ g_NvidiaExt[_index].numOutputsForIncCounter = 2; \
2199
+ g_NvidiaExt[_index].src0u.x = _instanceIndex; \
2200
+ g_NvidiaExt[_index].src0u.y = _geometryIndex; \
2201
+ g_NvidiaExt[_index].src0u.z = _primitiveIndex; \
2202
+ g_NvidiaExt[_index].src0u.w = _hitKind; \
2203
+ g_NvidiaExt[_index].src1u.x = _rayContributionToHitGroupIndex; \
2204
+ g_NvidiaExt[_index].src1u.y = _multiplierForGeometryContributionToHitGroupIndex; \
2205
+ uint _hitHandle = g_NvidiaExt.IncrementCounter(); \
2206
+ uint _traceHandle = g_NvidiaExt.IncrementCounter(); \
2207
+ CallShader(_traceHandle, Attributes); \
2208
+ NvHitObjectMacroDummyPayloadType _payload; \
2209
+ TraceRay(AccelerationStructure, 0, 0, 0, 0, _traceHandle, _ray, _payload); \
2210
+ ResultHitObj._handle = _hitHandle; \
2211
+ } while(0)
2212
+
2213
+ #define NvMakeHitWithRecordIndex(HitGroupRecordIndex,AccelerationStructure,InstanceIndex,GeometryIndex,PrimitiveIndex,HitKind,Ray,Attributes,ResultHitObj) \
2214
+ do { \
2215
+ uint _hitGroupRecordIndex = HitGroupRecordIndex; \
2216
+ uint _instanceIndex = InstanceIndex; \
2217
+ uint _geometryIndex = GeometryIndex; \
2218
+ uint _primitiveIndex = PrimitiveIndex; \
2219
+ uint _hitKind = HitKind; \
2220
+ RayDesc _ray = Ray; \
2221
+ uint _index = g_NvidiaExt.IncrementCounter(); \
2222
+ g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_HIT_WITH_RECORD_INDEX; \
2223
+ g_NvidiaExt[_index].numOutputsForIncCounter = 2; \
2224
+ g_NvidiaExt[_index].src0u.x = _instanceIndex; \
2225
+ g_NvidiaExt[_index].src0u.y = _geometryIndex; \
2226
+ g_NvidiaExt[_index].src0u.z = _primitiveIndex; \
2227
+ g_NvidiaExt[_index].src0u.w = _hitKind; \
2228
+ g_NvidiaExt[_index].src1u.x = _hitGroupRecordIndex; \
2229
+ uint _hitHandle = g_NvidiaExt.IncrementCounter(); \
2230
+ uint _traceHandle = g_NvidiaExt.IncrementCounter(); \
2231
+ CallShader(_traceHandle, Attributes); \
2232
+ NvHitObjectMacroDummyPayloadType _payload; \
2233
+ TraceRay(AccelerationStructure, 0, 0, 0, 0, _traceHandle, _ray, _payload); \
2234
+ ResultHitObj._handle = _hitHandle; \
2235
+ } while(0)
2236
+
2237
+ NvHitObject NvMakeMiss(
2238
+ uint MissShaderIndex,
2239
+ RayDesc Ray)
2240
+ {
2241
+ uint index = g_NvidiaExt.IncrementCounter();
2242
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_MISS;
2243
+ g_NvidiaExt[index].src0u.x = MissShaderIndex;
2244
+ g_NvidiaExt[index].src0u.y = asuint(Ray.TMin);
2245
+ g_NvidiaExt[index].src0u.z = asuint(Ray.TMax);
2246
+ g_NvidiaExt[index].src1u.x = asuint(Ray.Origin.x);
2247
+ g_NvidiaExt[index].src1u.y = asuint(Ray.Origin.y);
2248
+ g_NvidiaExt[index].src1u.z = asuint(Ray.Origin.z);
2249
+ g_NvidiaExt[index].src2u.x = asuint(Ray.Direction.x);
2250
+ g_NvidiaExt[index].src2u.y = asuint(Ray.Direction.y);
2251
+ g_NvidiaExt[index].src2u.z = asuint(Ray.Direction.z);
2252
+ uint hitHandle = g_NvidiaExt.IncrementCounter();
2253
+
2254
+ NvHitObject hitObj;
2255
+ hitObj._handle = hitHandle;
2256
+ return hitObj;
2257
+ }
2258
+
2259
+ NvHitObject NvMakeNop()
2260
+ {
2261
+ uint index = g_NvidiaExt.IncrementCounter();
2262
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_NOP;
2263
+ uint hitHandle = g_NvidiaExt.IncrementCounter();
2264
+
2265
+ NvHitObject hitObj;
2266
+ hitObj._handle = hitHandle;
2267
+ return hitObj;
2268
+ }
2269
+
2270
+ #define NvGetAttributesFromHitObject(HitObj,ResultAttributes) \
2271
+ do { \
2272
+ uint _index = g_NvidiaExt.IncrementCounter(); \
2273
+ g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_ATTRIBUTES; \
2274
+ g_NvidiaExt[_index].src0u.x = HitObj._handle; \
2275
+ uint _callHandle = g_NvidiaExt.IncrementCounter(); \
2276
+ CallShader(_callHandle, ResultAttributes); \
2277
+ } while(0)
2278
+
2279
+ void NvReorderThread(uint CoherenceHint, uint NumCoherenceHintBits)
2280
+ {
2281
+ uint index = g_NvidiaExt.IncrementCounter();
2282
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_REORDER_THREAD;
2283
+ g_NvidiaExt[index].src0u.x = 0;
2284
+ g_NvidiaExt[index].src0u.y = 0;
2285
+ g_NvidiaExt[index].src0u.z = CoherenceHint;
2286
+ g_NvidiaExt[index].src0u.w = NumCoherenceHintBits;
2287
+ g_NvidiaExt.IncrementCounter();
2288
+ }
2289
+
2290
+ void NvReorderThread(NvHitObject HitObj, uint CoherenceHint, uint NumCoherenceHintBits)
2291
+ {
2292
+ uint index = g_NvidiaExt.IncrementCounter();
2293
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_REORDER_THREAD;
2294
+ g_NvidiaExt[index].src0u.x = 1;
2295
+ g_NvidiaExt[index].src0u.y = HitObj._handle;
2296
+ g_NvidiaExt[index].src0u.z = CoherenceHint;
2297
+ g_NvidiaExt[index].src0u.w = NumCoherenceHintBits;
2298
+ g_NvidiaExt.IncrementCounter();
2299
+ }
2300
+
2301
+ void NvReorderThread(NvHitObject HitObj)
2302
+ {
2303
+ NvReorderThread(HitObj, 0, 0);
2304
+ }
2305
+
2306
+ #define NvInvokeHitObject(AccelerationStructure,HitObj,Payload) \
2307
+ do { \
2308
+ uint _index = g_NvidiaExt.IncrementCounter(); \
2309
+ g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_INVOKE; \
2310
+ g_NvidiaExt[_index].src0u.x = HitObj._handle; \
2311
+ uint _handle = g_NvidiaExt.IncrementCounter(); \
2312
+ TraceRay(AccelerationStructure, 0, 0, 0, 0, _handle, (RayDesc)0, Payload); \
2313
+ } while(0)
2314
+
2315
+ #endif