nv_sgl-0.6.0-cp313-cp313-win_amd64.whl

Files changed (142)
  1. include/tevclient.h +393 -0
  2. nv_sgl-0.6.0.dist-info/LICENSE +29 -0
  3. nv_sgl-0.6.0.dist-info/METADATA +21 -0
  4. nv_sgl-0.6.0.dist-info/RECORD +142 -0
  5. nv_sgl-0.6.0.dist-info/WHEEL +5 -0
  6. nv_sgl-0.6.0.dist-info/top_level.txt +1 -0
  7. sgl/__init__.py +15 -0
  8. sgl/__init__.pyi +6738 -0
  9. sgl/d3d12/D3D12Core.dll +0 -0
  10. sgl/d3d12/d3d12SDKLayers.dll +0 -0
  11. sgl/dxcompiler.dll +0 -0
  12. sgl/dxil.dll +0 -0
  13. sgl/gfx.dll +0 -0
  14. sgl/include/sgl/app/app.h +113 -0
  15. sgl/include/sgl/core/bitmap.h +302 -0
  16. sgl/include/sgl/core/crypto.h +89 -0
  17. sgl/include/sgl/core/data_type.h +46 -0
  18. sgl/include/sgl/core/dds_file.h +103 -0
  19. sgl/include/sgl/core/enum.h +201 -0
  20. sgl/include/sgl/core/error.h +161 -0
  21. sgl/include/sgl/core/file_stream.h +77 -0
  22. sgl/include/sgl/core/file_system_watcher.h +141 -0
  23. sgl/include/sgl/core/format.h +36 -0
  24. sgl/include/sgl/core/fwd.h +90 -0
  25. sgl/include/sgl/core/hash.h +45 -0
  26. sgl/include/sgl/core/input.h +522 -0
  27. sgl/include/sgl/core/logger.h +214 -0
  28. sgl/include/sgl/core/macros.h +184 -0
  29. sgl/include/sgl/core/maths.h +45 -0
  30. sgl/include/sgl/core/memory_mapped_file.h +112 -0
  31. sgl/include/sgl/core/memory_mapped_file_stream.h +32 -0
  32. sgl/include/sgl/core/memory_stream.h +74 -0
  33. sgl/include/sgl/core/object.h +683 -0
  34. sgl/include/sgl/core/platform.h +239 -0
  35. sgl/include/sgl/core/plugin.h +331 -0
  36. sgl/include/sgl/core/resolver.h +39 -0
  37. sgl/include/sgl/core/short_vector.h +141 -0
  38. sgl/include/sgl/core/static_vector.h +111 -0
  39. sgl/include/sgl/core/stream.h +54 -0
  40. sgl/include/sgl/core/string.h +276 -0
  41. sgl/include/sgl/core/struct.h +360 -0
  42. sgl/include/sgl/core/thread.h +28 -0
  43. sgl/include/sgl/core/timer.h +52 -0
  44. sgl/include/sgl/core/traits.h +15 -0
  45. sgl/include/sgl/core/type_utils.h +19 -0
  46. sgl/include/sgl/core/window.h +177 -0
  47. sgl/include/sgl/device/agility_sdk.h +24 -0
  48. sgl/include/sgl/device/blit.h +88 -0
  49. sgl/include/sgl/device/buffer_cursor.h +162 -0
  50. sgl/include/sgl/device/command.h +539 -0
  51. sgl/include/sgl/device/cuda_api.h +766 -0
  52. sgl/include/sgl/device/cuda_interop.h +39 -0
  53. sgl/include/sgl/device/cuda_utils.h +107 -0
  54. sgl/include/sgl/device/cursor_utils.h +129 -0
  55. sgl/include/sgl/device/device.h +668 -0
  56. sgl/include/sgl/device/device_resource.h +37 -0
  57. sgl/include/sgl/device/fence.h +91 -0
  58. sgl/include/sgl/device/formats.h +330 -0
  59. sgl/include/sgl/device/framebuffer.h +85 -0
  60. sgl/include/sgl/device/fwd.h +164 -0
  61. sgl/include/sgl/device/helpers.h +20 -0
  62. sgl/include/sgl/device/hot_reload.h +75 -0
  63. sgl/include/sgl/device/input_layout.h +74 -0
  64. sgl/include/sgl/device/kernel.h +69 -0
  65. sgl/include/sgl/device/memory_heap.h +155 -0
  66. sgl/include/sgl/device/native_formats.h +342 -0
  67. sgl/include/sgl/device/native_handle.h +73 -0
  68. sgl/include/sgl/device/native_handle_traits.h +65 -0
  69. sgl/include/sgl/device/pipeline.h +138 -0
  70. sgl/include/sgl/device/print.h +45 -0
  71. sgl/include/sgl/device/python/cursor_utils.h +853 -0
  72. sgl/include/sgl/device/query.h +52 -0
  73. sgl/include/sgl/device/raytracing.h +84 -0
  74. sgl/include/sgl/device/reflection.h +1254 -0
  75. sgl/include/sgl/device/resource.h +705 -0
  76. sgl/include/sgl/device/sampler.h +57 -0
  77. sgl/include/sgl/device/shader.h +516 -0
  78. sgl/include/sgl/device/shader_cursor.h +85 -0
  79. sgl/include/sgl/device/shader_object.h +94 -0
  80. sgl/include/sgl/device/shader_offset.h +67 -0
  81. sgl/include/sgl/device/shared_handle.h +12 -0
  82. sgl/include/sgl/device/slang_utils.h +54 -0
  83. sgl/include/sgl/device/swapchain.h +74 -0
  84. sgl/include/sgl/device/types.h +782 -0
  85. sgl/include/sgl/math/colorspace.h +56 -0
  86. sgl/include/sgl/math/constants.h +7 -0
  87. sgl/include/sgl/math/float16.h +146 -0
  88. sgl/include/sgl/math/matrix.h +6 -0
  89. sgl/include/sgl/math/matrix_math.h +746 -0
  90. sgl/include/sgl/math/matrix_types.h +207 -0
  91. sgl/include/sgl/math/python/primitivetype.h +33 -0
  92. sgl/include/sgl/math/quaternion.h +6 -0
  93. sgl/include/sgl/math/quaternion_math.h +484 -0
  94. sgl/include/sgl/math/quaternion_types.h +83 -0
  95. sgl/include/sgl/math/ray.h +47 -0
  96. sgl/include/sgl/math/scalar_math.h +249 -0
  97. sgl/include/sgl/math/scalar_types.h +107 -0
  98. sgl/include/sgl/math/vector.h +6 -0
  99. sgl/include/sgl/math/vector_math.h +1796 -0
  100. sgl/include/sgl/math/vector_types.h +336 -0
  101. sgl/include/sgl/python/nanobind.h +489 -0
  102. sgl/include/sgl/python/py_doc.h +11600 -0
  103. sgl/include/sgl/python/sgl_ext_pch.h +8 -0
  104. sgl/include/sgl/sgl.h +21 -0
  105. sgl/include/sgl/sgl_pch.h +6 -0
  106. sgl/include/sgl/stl/bit.h +377 -0
  107. sgl/include/sgl/tests/testing.h +54 -0
  108. sgl/include/sgl/ui/fwd.h +34 -0
  109. sgl/include/sgl/ui/imgui_config.h +43 -0
  110. sgl/include/sgl/ui/ui.h +71 -0
  111. sgl/include/sgl/ui/widgets.h +918 -0
  112. sgl/include/sgl/utils/python/slangpy.h +366 -0
  113. sgl/include/sgl/utils/renderdoc.h +50 -0
  114. sgl/include/sgl/utils/slangpy.h +153 -0
  115. sgl/include/sgl/utils/tev.h +93 -0
  116. sgl/include/sgl/utils/texture_loader.h +106 -0
  117. sgl/math/__init__.pyi +5083 -0
  118. sgl/platform/__init__.pyi +102 -0
  119. sgl/renderdoc/__init__.pyi +51 -0
  120. sgl/sgl.dll +0 -0
  121. sgl/sgl_ext.cp313-win_amd64.pyd +0 -0
  122. sgl/shaders/nvapi/nvHLSLExtns.h +2315 -0
  123. sgl/shaders/nvapi/nvHLSLExtnsInternal.h +758 -0
  124. sgl/shaders/nvapi/nvShaderExtnEnums.h +142 -0
  125. sgl/shaders/sgl/device/blit.slang +93 -0
  126. sgl/shaders/sgl/device/nvapi.slang +5 -0
  127. sgl/shaders/sgl/device/nvapi.slangh +7 -0
  128. sgl/shaders/sgl/device/print.slang +445 -0
  129. sgl/shaders/sgl/math/constants.slang +4 -0
  130. sgl/shaders/sgl/math/ray.slang +29 -0
  131. sgl/shaders/sgl/ui/imgui.slang +49 -0
  132. sgl/slang-glslang.dll +0 -0
  133. sgl/slang-llvm.dll +0 -0
  134. sgl/slang-rt.dll +0 -0
  135. sgl/slang.dll +0 -0
  136. sgl/slangpy/__init__.pyi +268 -0
  137. sgl/tev/__init__.pyi +108 -0
  138. sgl/tevclient.lib +0 -0
  139. sgl/thread/__init__.pyi +4 -0
  140. sgl/ui/__init__.pyi +1118 -0
  141. share/cmake/tevclient/tevclient-config-release.cmake +19 -0
  142. share/cmake/tevclient/tevclient-config.cmake +108 -0
sgl/shaders/nvapi/nvHLSLExtnsInternal.h
@@ -0,0 +1,758 @@
+ /*********************************************************************************************************\
+ |* *|
+ |* SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. *|
+ |* SPDX-License-Identifier: MIT *|
+ |* *|
+ |* Permission is hereby granted, free of charge, to any person obtaining a *|
+ |* copy of this software and associated documentation files (the "Software"), *|
+ |* to deal in the Software without restriction, including without limitation *|
+ |* the rights to use, copy, modify, merge, publish, distribute, sublicense, *|
+ |* and/or sell copies of the Software, and to permit persons to whom the *|
+ |* Software is furnished to do so, subject to the following conditions: *|
+ |* *|
+ |* The above copyright notice and this permission notice shall be included in *|
+ |* all copies or substantial portions of the Software. *|
+ |* *|
+ |* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *|
+ |* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *|
+ |* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *|
+ |* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER *|
+ |* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING *|
+ |* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER *|
+ |* DEALINGS IN THE SOFTWARE. *|
+ |* *|
+ |* *|
+ \*********************************************************************************************************/
+ ////////////////////////// NVIDIA SHADER EXTENSIONS /////////////////
+ // internal functions
+ // Functions in this file are not expected to be called by apps directly
+
+ #include "nvShaderExtnEnums.h"
+
+ struct NvShaderExtnStruct
+ {
+ uint opcode; // opcode
+ uint rid; // resource ID
+ uint sid; // sampler ID
+
+ uint4 dst1u; // destination operand 1 (for instructions that need extra destination operands)
+ uint4 src3u; // source operand 3
+ uint4 src4u; // source operand 4
+ uint4 src5u; // source operand 5
+
+ uint4 src0u; // uint source operand 0
+ uint4 src1u; // uint source operand 1
+ uint4 src2u; // uint source operand 2
+ uint4 dst0u; // uint destination operand
+
+ uint markUavRef; // the next store to UAV is fake and is used only to identify the uav slot
+ uint numOutputsForIncCounter; // Used for output to IncrementCounter
+ float padding1[27]; // struct size: 256 bytes
+ };
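As a quick check on the size comment: 3 leading uints (12 bytes) + 8 uint4 operands (128 bytes) + markUavRef and numOutputsForIncCounter (8 bytes) + 27 padding floats (108 bytes) = 256 bytes, so padding1[27] is exactly what rounds the struct up to 256 bytes.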
+
+ // RW structured buffer for Nvidia shader extensions
+
+ // Application needs to define NV_SHADER_EXTN_SLOT as an unused slot, which should be
+ // set using NvAPI_D3D11_SetNvShaderExtnSlot() call before creating the first shader that
+ // uses nvidia shader extensions. E.g. before including this file in shader define it as:
+ // #define NV_SHADER_EXTN_SLOT u7
+
+ // For SM5.1, application needs to define NV_SHADER_EXTN_REGISTER_SPACE as register space
+ // E.g. before including this file in shader define it as:
+ // #define NV_SHADER_EXTN_REGISTER_SPACE space2
+
+ // Note that other operations to this UAV will be ignored so the application
+ // should bind a null resource
+
+ #ifdef NV_SHADER_EXTN_REGISTER_SPACE
+ RWStructuredBuffer<NvShaderExtnStruct> g_NvidiaExt : register( NV_SHADER_EXTN_SLOT, NV_SHADER_EXTN_REGISTER_SPACE );
+ #else
+ RWStructuredBuffer<NvShaderExtnStruct> g_NvidiaExt : register( NV_SHADER_EXTN_SLOT );
+ #endif
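As a minimal usage sketch (not part of this file, and assuming the public nvHLSLExtns.h wrapper shipped alongside it is the entry point), shader code defines the slot, and on SM5.1+ the register space, before the include; u7 and space2 below are arbitrary examples that must match the slot reserved through NvAPI_D3D11_SetNvShaderExtnSlot() before shader creation:

    // Illustrative shader-side setup; slot/space values are examples only.
    #define NV_SHADER_EXTN_SLOT u7                  // unused UAV slot reserved for the extension
    #define NV_SHADER_EXTN_REGISTER_SPACE space2    // SM5.1+ only
    #include "nvHLSLExtns.h"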
+
+ //----------------------------------------------------------------------------//
+ // the exposed SHFL instructions accept a mask parameter in src2
+ // To compute lane mask from width of segment:
+ // minLaneID : currentLaneId & src2[12:8]
+ // maxLaneID : minLaneId | (src2[4:0] & ~src2[12:8])
+ // where [minLaneId, maxLaneId] defines the segment where currentLaneId belongs
+ // we always set src2[4:0] to 11111 (0x1F), and set src2[12:8] as (32 - width)
+ int __NvGetShflMaskFromWidth(uint width)
+ {
+ return ((NV_WARP_SIZE - width) << 8) | 0x1F;
+ }
+
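To make the bit layout concrete, a worked example assuming NV_WARP_SIZE is 32: for width = 8 the function returns ((32 - 8) << 8) | 0x1F = 0x1800 | 0x1F = 0x181F, i.e. src2[12:8] = 24 and src2[4:0] = 31. A lane with currentLaneId = 13 then gets minLaneId = 13 & 24 = 8 and maxLaneId = 8 | (31 & ~24) = 15, so the shuffle is confined to the width-8 segment [8, 15].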
+ //----------------------------------------------------------------------------//
+
+ void __NvReferenceUAVForOp(RWByteAddressBuffer uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav.Store(index, 0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture1D<float2> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[index] = float2(0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture2D<float2> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint2(index,index)] = float2(0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture3D<float2> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint3(index,index,index)] = float2(0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture1D<float4> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[index] = float4(0,0,0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture2D<float4> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint2(index,index)] = float4(0,0,0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture3D<float4> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint3(index,index,index)] = float4(0,0,0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture1D<float> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[index] = 0.0f;
+ }
+
+ void __NvReferenceUAVForOp(RWTexture2D<float> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint2(index,index)] = 0.0f;
+ }
+
+ void __NvReferenceUAVForOp(RWTexture3D<float> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint3(index,index,index)] = 0.0f;
+ }
+
+
+ void __NvReferenceUAVForOp(RWTexture1D<uint2> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[index] = uint2(0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture2D<uint2> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint2(index,index)] = uint2(0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture3D<uint2> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint3(index,index,index)] = uint2(0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture1D<uint4> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[index] = uint4(0,0,0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture2D<uint4> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint2(index,index)] = uint4(0,0,0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture3D<uint4> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint3(index,index,index)] = uint4(0,0,0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture1D<uint> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[index] = 0;
+ }
+
+ void __NvReferenceUAVForOp(RWTexture2D<uint> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint2(index,index)] = 0;
+ }
+
+ void __NvReferenceUAVForOp(RWTexture3D<uint> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint3(index,index,index)] = 0;
+ }
+
+ void __NvReferenceUAVForOp(RWTexture1D<int2> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[index] = int2(0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture2D<int2> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint2(index,index)] = int2(0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture3D<int2> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint3(index,index,index)] = int2(0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture1D<int4> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[index] = int4(0,0,0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture2D<int4> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint2(index,index)] = int4(0,0,0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture3D<int4> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint3(index,index,index)] = int4(0,0,0,0);
+ }
+
+ void __NvReferenceUAVForOp(RWTexture1D<int> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[index] = 0;
+ }
+
+ void __NvReferenceUAVForOp(RWTexture2D<int> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint2(index,index)] = 0;
+ }
+
+ void __NvReferenceUAVForOp(RWTexture3D<int> uav)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].markUavRef = 1;
+ uav[uint3(index,index,index)] = 0;
+ }
+
+ //----------------------------------------------------------------------------//
+ // ATOMIC op sub-opcodes
+ #define NV_EXTN_ATOM_AND 0
+ #define NV_EXTN_ATOM_OR 1
+ #define NV_EXTN_ATOM_XOR 2
+
+ #define NV_EXTN_ATOM_ADD 3
+ #define NV_EXTN_ATOM_MAX 6
+ #define NV_EXTN_ATOM_MIN 7
+
+ #define NV_EXTN_ATOM_SWAP 8
+ #define NV_EXTN_ATOM_CAS 9
+
+ //----------------------------------------------------------------------------//
+
+ // performs Atomic operation on two consecutive fp16 values in the given UAV
+ // the uint parameter 'fp16x2Val' is treated as two fp16 values
+ // the passed sub-opcode 'op' should be an immediate constant
+ // byteAddress must be multiple of 4
+ // the returned value is the two fp16 values packed into a single uint
+ uint __NvAtomicOpFP16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = byteAddress;
+ g_NvidiaExt[index].src1u.x = fp16x2Val;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.x;
+ }
+
+ //----------------------------------------------------------------------------//
+
+ // performs Atomic operation on a R16G16_FLOAT UAV at the given address
+ // the uint parameter 'fp16x2Val' is treated as two fp16 values
+ // the passed sub-opcode 'op' should be an immediate constant
+ // the returned value is the two fp16 values (.x and .y components) packed into a single uint
+ // Warning: Behaviour of this set of functions is undefined if the UAV is not
+ // of R16G16_FLOAT format (might result in app crash or TDR)
+
+ uint __NvAtomicOpFP16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = address;
+ g_NvidiaExt[index].src1u.x = fp16x2Val;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.x;
+ }
+
+ uint __NvAtomicOpFP16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xy = address;
+ g_NvidiaExt[index].src1u.x = fp16x2Val;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.x;
+ }
+
+ uint __NvAtomicOpFP16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xyz = address;
+ g_NvidiaExt[index].src1u.x = fp16x2Val;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.x;
+ }
+
+ //----------------------------------------------------------------------------//
+
+ // performs Atomic operation on a R16G16B16A16_FLOAT UAV at the given address
+ // the uint2 parameter 'fp16x2Val' is treated as four fp16 values
+ // i.e., fp16x2Val.x = uav.xy and fp16x2Val.y = uav.zw
+ // the passed sub-opcode 'op' should be an immediate constant
+ // the returned value is the four fp16 values (.xyzw components) packed into uint2
+ // Warning: Behaviour of this set of functions is undefined if the UAV is not
+ // of R16G16B16A16_FLOAT format (might result in app crash or TDR)
+
+ uint2 __NvAtomicOpFP16x2(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ // break it down into two fp16x2 atomic ops
+ uint2 retVal;
+
+ // first op has x-coordinate = x * 2
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = address * 2;
+ g_NvidiaExt[index].src1u.x = fp16x2Val.x;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+ retVal.x = g_NvidiaExt[index].dst0u.x;
+
+ // second op has x-coordinate = x * 2 + 1
+ index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = address * 2 + 1;
+ g_NvidiaExt[index].src1u.x = fp16x2Val.y;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+ retVal.y = g_NvidiaExt[index].dst0u.x;
+
+ return retVal;
+ }
+
+ uint2 __NvAtomicOpFP16x2(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ // break it down into two fp16x2 atomic ops
+ uint2 retVal;
+
+ // first op has x-coordinate = x * 2
+ uint2 addressTemp = uint2(address.x * 2, address.y);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xy = addressTemp;
+ g_NvidiaExt[index].src1u.x = fp16x2Val.x;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+ retVal.x = g_NvidiaExt[index].dst0u.x;
+
+ // second op has x-coordinate = x * 2 + 1
+ addressTemp.x++;
+ index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xy = addressTemp;
+ g_NvidiaExt[index].src1u.x = fp16x2Val.y;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+ retVal.y = g_NvidiaExt[index].dst0u.x;
+
+ return retVal;
+ }
+
+ uint2 __NvAtomicOpFP16x2(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ // break it down into two fp16x2 atomic ops
+ uint2 retVal;
+
+ // first op has x-coordinate = x * 2
+ uint3 addressTemp = uint3(address.x * 2, address.y, address.z);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xyz = addressTemp;
+ g_NvidiaExt[index].src1u.x = fp16x2Val.x;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+ retVal.x = g_NvidiaExt[index].dst0u.x;
+
+ // second op has x-coordinate = x * 2 + 1
+ addressTemp.x++;
+ index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xyz = addressTemp;
+ g_NvidiaExt[index].src1u.x = fp16x2Val.y;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
+ retVal.y = g_NvidiaExt[index].dst0u.x;
+
+ return retVal;
+ }
+
+ uint __fp32x2Tofp16x2(float2 val)
+ {
+ return (f32tof16(val.y)<<16) | f32tof16(val.x) ;
+ }
+
+ uint2 __fp32x4Tofp16x4(float4 val)
+ {
+ return uint2( (f32tof16(val.y)<<16) | f32tof16(val.x), (f32tof16(val.w)<<16) | f32tof16(val.z) ) ;
+ }
+
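A usage sketch (not part of the header) combining the pack helpers with the fp16x2 atomic above; the resource and function names are hypothetical, and the UAV must be R16G16_FLOAT as the earlier warning notes:

    RWTexture2D<float2> gAccum;   // hypothetical R16G16_FLOAT accumulation target

    void AccumulateFp16x2(uint2 pixel, float2 contribution)
    {
        // pack two fp32 values into one uint holding two fp16 values
        uint packed = __fp32x2Tofp16x2(contribution);
        // atomically add both fp16 halves; the sub-opcode must be an immediate constant
        __NvAtomicOpFP16x2(gAccum, pixel, packed, NV_EXTN_ATOM_ADD);
    }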
+ //----------------------------------------------------------------------------//
+
+ // FP32 Atomic functions
+ // performs Atomic operation treating the uav as float (fp32) values
+ // the passed sub-opcode 'op' should be an immediate constant
+ // byteAddress must be multiple of 4
+ float __NvAtomicAddFP32(RWByteAddressBuffer uav, uint byteAddress, float val)
+ {
+ __NvReferenceUAVForOp(uav);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = byteAddress;
+ g_NvidiaExt[index].src1u.x = asuint(val); // passing as uint to make it more convenient for the driver to translate
+ g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;
+
+ return asfloat(g_NvidiaExt[index].dst0u.x);
+ }
+
+ float __NvAtomicAddFP32(RWTexture1D<float> uav, uint address, float val)
+ {
+ __NvReferenceUAVForOp(uav);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = address;
+ g_NvidiaExt[index].src1u.x = asuint(val);
+ g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;
+
+ return asfloat(g_NvidiaExt[index].dst0u.x);
+ }
+
+ float __NvAtomicAddFP32(RWTexture2D<float> uav, uint2 address, float val)
+ {
+ __NvReferenceUAVForOp(uav);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xy = address;
+ g_NvidiaExt[index].src1u.x = asuint(val);
+ g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;
+
+ return asfloat(g_NvidiaExt[index].dst0u.x);
+ }
+
+ float __NvAtomicAddFP32(RWTexture3D<float> uav, uint3 address, float val)
+ {
+ __NvReferenceUAVForOp(uav);
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xyz = address;
+ g_NvidiaExt[index].src1u.x = asuint(val);
+ g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;
+
+ return asfloat(g_NvidiaExt[index].dst0u.x);
+ }
+
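A similar sketch for the FP32 atomic add on a raw buffer; gHistogram and AddSample are hypothetical names, and the byte address must be a multiple of 4 as required above:

    RWByteAddressBuffer gHistogram;   // hypothetical buffer of float bins

    void AddSample(uint bin, float weight)
    {
        // each bin is one 4-byte float, so the byte address is bin * 4
        float previous = __NvAtomicAddFP32(gHistogram, bin * 4, weight);
    }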
+ //----------------------------------------------------------------------------//
+
+ // UINT64 Atomic Functions
+ // The functions below perform atomic operations on the given UAV treating the value as uint64
+ // byteAddress must be multiple of 8
+ // The returned value is the value present in the memory location before the atomic operation
+ // uint2 vector type is used to represent a single uint64 value with the x component containing the low 32 bits and y component the high 32 bits.
+
+ uint2 __NvAtomicCompareExchangeUINT64(RWByteAddressBuffer uav, uint byteAddress, uint2 compareValue, uint2 value)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = byteAddress;
+ g_NvidiaExt[index].src1u.xy = compareValue;
+ g_NvidiaExt[index].src1u.zw = value;
+ g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_CAS;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_UINT64_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.xy;
+ }
+
+ uint2 __NvAtomicOpUINT64(RWByteAddressBuffer uav, uint byteAddress, uint2 value, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = byteAddress;
+ g_NvidiaExt[index].src1u.xy = value;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_UINT64_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.xy;
+ }
+
+ uint2 __NvAtomicCompareExchangeUINT64(RWTexture1D<uint2> uav, uint address, uint2 compareValue, uint2 value)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = address;
+ g_NvidiaExt[index].src1u.xy = compareValue;
+ g_NvidiaExt[index].src1u.zw = value;
+ g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_CAS;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_UINT64_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.xy;
+ }
+
+ uint2 __NvAtomicOpUINT64(RWTexture1D<uint2> uav, uint address, uint2 value, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = address;
+ g_NvidiaExt[index].src1u.xy = value;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_UINT64_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.xy;
+ }
+
+ uint2 __NvAtomicCompareExchangeUINT64(RWTexture2D<uint2> uav, uint2 address, uint2 compareValue, uint2 value)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xy = address;
+ g_NvidiaExt[index].src1u.xy = compareValue;
+ g_NvidiaExt[index].src1u.zw = value;
+ g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_CAS;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_UINT64_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.xy;
+ }
+
+ uint2 __NvAtomicOpUINT64(RWTexture2D<uint2> uav, uint2 address, uint2 value, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xy = address;
+ g_NvidiaExt[index].src1u.xy = value;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_UINT64_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.xy;
+ }
+
+ uint2 __NvAtomicCompareExchangeUINT64(RWTexture3D<uint2> uav, uint3 address, uint2 compareValue, uint2 value)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xyz = address;
+ g_NvidiaExt[index].src1u.xy = compareValue;
+ g_NvidiaExt[index].src1u.zw = value;
+ g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_CAS;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_UINT64_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.xy;
+ }
+
+ uint2 __NvAtomicOpUINT64(RWTexture3D<uint2> uav, uint3 address, uint2 value, uint atomicOpType)
+ {
+ __NvReferenceUAVForOp(uav);
+
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.xyz = address;
+ g_NvidiaExt[index].src1u.xy = value;
+ g_NvidiaExt[index].src2u.x = atomicOpType;
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_UINT64_ATOMIC;
+
+ return g_NvidiaExt[index].dst0u.xy;
+ }
+
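A sketch of the uint2-as-uint64 convention described above (low 32 bits in .x, high 32 bits in .y); the buffer and function names are hypothetical and the byte offset must stay 8-byte aligned:

    RWByteAddressBuffer gCounter;   // hypothetical buffer holding one uint64 at byte offset 0

    void Update64(uint2 candidate, uint2 expected, uint2 desired)
    {
        // 64-bit max; returns the previous value packed as uint2
        uint2 before = __NvAtomicOpUINT64(gCounter, 0, candidate, NV_EXTN_ATOM_MAX);
        // 64-bit compare-and-swap at the same 8-byte-aligned offset
        uint2 original = __NvAtomicCompareExchangeUINT64(gCounter, 0, expected, desired);
    }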
+
+ uint4 __NvFootprint(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint footprintmode, uint gran, int3 offset = int3(0, 0, 0))
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = texIndex;
+ g_NvidiaExt[index].src0u.y = smpIndex;
+ g_NvidiaExt[index].src1u.xyz = asuint(location);
+ g_NvidiaExt[index].src1u.w = gran;
+ g_NvidiaExt[index].src3u.x = texSpace;
+ g_NvidiaExt[index].src3u.y = smpSpace;
+ g_NvidiaExt[index].src3u.z = texType;
+ g_NvidiaExt[index].src3u.w = footprintmode;
+ g_NvidiaExt[index].src4u.xyz = asuint(offset);
+
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FOOTPRINT;
+ g_NvidiaExt[index].numOutputsForIncCounter = 4;
+
+ // result is returned as the return value of IncrementCounter on fake UAV slot
+ uint4 op;
+ op.x = g_NvidiaExt.IncrementCounter();
+ op.y = g_NvidiaExt.IncrementCounter();
+ op.z = g_NvidiaExt.IncrementCounter();
+ op.w = g_NvidiaExt.IncrementCounter();
+ return op;
+ }
+
+ uint4 __NvFootprintBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint footprintmode, uint gran, float bias, int3 offset = int3(0, 0, 0))
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = texIndex;
+ g_NvidiaExt[index].src0u.y = smpIndex;
+ g_NvidiaExt[index].src1u.xyz = asuint(location);
+ g_NvidiaExt[index].src1u.w = gran;
+ g_NvidiaExt[index].src2u.x = asuint(bias);
+ g_NvidiaExt[index].src3u.x = texSpace;
+ g_NvidiaExt[index].src3u.y = smpSpace;
+ g_NvidiaExt[index].src3u.z = texType;
+ g_NvidiaExt[index].src3u.w = footprintmode;
+ g_NvidiaExt[index].src4u.xyz = asuint(offset);
+
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FOOTPRINT_BIAS;
+ g_NvidiaExt[index].numOutputsForIncCounter = 4;
+
+ // result is returned as the return value of IncrementCounter on fake UAV slot
+ uint4 op;
+ op.x = g_NvidiaExt.IncrementCounter();
+ op.y = g_NvidiaExt.IncrementCounter();
+ op.z = g_NvidiaExt.IncrementCounter();
+ op.w = g_NvidiaExt.IncrementCounter();
+ return op;
+ }
+
+ uint4 __NvFootprintLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint footprintmode, uint gran, float lodLevel, int3 offset = int3(0, 0, 0))
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = texIndex;
+ g_NvidiaExt[index].src0u.y = smpIndex;
+ g_NvidiaExt[index].src1u.xyz = asuint(location);
+ g_NvidiaExt[index].src1u.w = gran;
+ g_NvidiaExt[index].src2u.x = asuint(lodLevel);
+ g_NvidiaExt[index].src3u.x = texSpace;
+ g_NvidiaExt[index].src3u.y = smpSpace;
+ g_NvidiaExt[index].src3u.z = texType;
+ g_NvidiaExt[index].src3u.w = footprintmode;
+ g_NvidiaExt[index].src4u.xyz = asuint(offset);
+
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FOOTPRINT_LEVEL;
+ g_NvidiaExt[index].numOutputsForIncCounter = 4;
+
+ // result is returned as the return value of IncrementCounter on fake UAV slot
+ uint4 op;
+ op.x = g_NvidiaExt.IncrementCounter();
+ op.y = g_NvidiaExt.IncrementCounter();
+ op.z = g_NvidiaExt.IncrementCounter();
+ op.w = g_NvidiaExt.IncrementCounter();
+ return op;
+ }
+
+ uint4 __NvFootprintGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint footprintmode, uint gran, float3 ddx, float3 ddy, int3 offset = int3(0, 0, 0))
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].src0u.x = texIndex;
+ g_NvidiaExt[index].src0u.y = smpIndex;
+ g_NvidiaExt[index].src1u.xyz = asuint(location);
+ g_NvidiaExt[index].src1u.w = gran;
+ g_NvidiaExt[index].src2u.xyz = asuint(ddx);
+ g_NvidiaExt[index].src5u.xyz = asuint(ddy);
+ g_NvidiaExt[index].src3u.x = texSpace;
+ g_NvidiaExt[index].src3u.y = smpSpace;
+ g_NvidiaExt[index].src3u.z = texType;
+ g_NvidiaExt[index].src3u.w = footprintmode;
+ g_NvidiaExt[index].src4u.xyz = asuint(offset);
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_FOOTPRINT_GRAD;
+ g_NvidiaExt[index].numOutputsForIncCounter = 4;
+
+ // result is returned as the return value of IncrementCounter on fake UAV slot
+ uint4 op;
+ op.x = g_NvidiaExt.IncrementCounter();
+ op.y = g_NvidiaExt.IncrementCounter();
+ op.z = g_NvidiaExt.IncrementCounter();
+ op.w = g_NvidiaExt.IncrementCounter();
+ return op;
+ }
+
+ // returns the value of a special register - specify the subopcode using one of the NV_SPECIALOP_* values from nvShaderExtnEnums.h - other opcodes result in undefined behavior
+ uint __NvGetSpecial(uint subOpCode)
+ {
+ uint index = g_NvidiaExt.IncrementCounter();
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_GET_SPECIAL;
+ g_NvidiaExt[index].src0u.x = subOpCode;
+ return g_NvidiaExt.IncrementCounter();
+ }
+
745
+
746
+ // predicate is returned in laneValid indicating if srcLane is in range and val from specified lane is returned.
747
+ int __NvShflGeneric(int val, uint srcLane, uint maskClampVal, out uint laneValid)
748
+ {
749
+ uint index = g_NvidiaExt.IncrementCounter();
750
+ g_NvidiaExt[index].src0u.x = val; // variable to be shuffled
751
+ g_NvidiaExt[index].src0u.y = srcLane; // source lane
752
+ g_NvidiaExt[index].src0u.z = maskClampVal;
753
+ g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_GENERIC;
754
+ g_NvidiaExt[index].numOutputsForIncCounter = 2;
755
+
756
+ laneValid = asuint(g_NvidiaExt.IncrementCounter());
757
+ return g_NvidiaExt.IncrementCounter();
758
+ }
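Finally, a hypothetical helper (not the header's own API) showing how __NvGetShflMaskFromWidth and __NvShflGeneric fit together for a width-limited shuffle; the public nvHLSLExtns.h shipped alongside this file layers its user-facing shuffle wrappers on top of these internals:

    // Read 'value' from 'srcLane', restricted to the width-sized segment that
    // contains the current lane (width is typically a power of two up to 32).
    int ShuffleWithinSegment(int value, uint srcLane, uint width)
    {
        uint laneValid;   // nonzero when srcLane is in range for this segment
        uint mask = (uint)__NvGetShflMaskFromWidth(width);
        return __NvShflGeneric(value, srcLane, mask, laneValid);
    }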