warp-lang 1.4.2__py3-none-manylinux2014_aarch64.whl → 1.5.1__py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (166)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1819 -7
  8. warp/codegen.py +197 -61
  9. warp/config.py +2 -2
  10. warp/context.py +379 -107
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/fem/example_adaptive_grid.py +37 -10
  17. warp/examples/fem/example_apic_fluid.py +3 -2
  18. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  19. warp/examples/fem/example_deformed_geometry.py +1 -1
  20. warp/examples/fem/example_diffusion_3d.py +47 -4
  21. warp/examples/fem/example_distortion_energy.py +220 -0
  22. warp/examples/fem/example_magnetostatics.py +127 -85
  23. warp/examples/fem/example_nonconforming_contact.py +5 -5
  24. warp/examples/fem/example_stokes.py +3 -1
  25. warp/examples/fem/example_streamlines.py +12 -19
  26. warp/examples/fem/utils.py +38 -15
  27. warp/examples/sim/example_cloth.py +4 -25
  28. warp/examples/sim/example_quadruped.py +2 -1
  29. warp/examples/tile/example_tile_convolution.py +58 -0
  30. warp/examples/tile/example_tile_fft.py +47 -0
  31. warp/examples/tile/example_tile_filtering.py +105 -0
  32. warp/examples/tile/example_tile_matmul.py +79 -0
  33. warp/examples/tile/example_tile_mlp.py +375 -0
  34. warp/fem/__init__.py +8 -0
  35. warp/fem/cache.py +16 -12
  36. warp/fem/dirichlet.py +1 -1
  37. warp/fem/domain.py +44 -1
  38. warp/fem/field/__init__.py +1 -2
  39. warp/fem/field/field.py +31 -19
  40. warp/fem/field/nodal_field.py +101 -49
  41. warp/fem/field/virtual.py +794 -0
  42. warp/fem/geometry/__init__.py +2 -2
  43. warp/fem/geometry/deformed_geometry.py +3 -105
  44. warp/fem/geometry/element.py +13 -0
  45. warp/fem/geometry/geometry.py +165 -7
  46. warp/fem/geometry/grid_2d.py +3 -6
  47. warp/fem/geometry/grid_3d.py +31 -28
  48. warp/fem/geometry/hexmesh.py +3 -46
  49. warp/fem/geometry/nanogrid.py +3 -2
  50. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  51. warp/fem/geometry/tetmesh.py +2 -43
  52. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  53. warp/fem/integrate.py +683 -261
  54. warp/fem/linalg.py +404 -0
  55. warp/fem/operator.py +101 -18
  56. warp/fem/polynomial.py +5 -5
  57. warp/fem/quadrature/quadrature.py +45 -21
  58. warp/fem/space/__init__.py +45 -11
  59. warp/fem/space/basis_function_space.py +451 -0
  60. warp/fem/space/basis_space.py +58 -11
  61. warp/fem/space/function_space.py +146 -5
  62. warp/fem/space/grid_2d_function_space.py +80 -66
  63. warp/fem/space/grid_3d_function_space.py +113 -68
  64. warp/fem/space/hexmesh_function_space.py +96 -108
  65. warp/fem/space/nanogrid_function_space.py +62 -110
  66. warp/fem/space/quadmesh_function_space.py +208 -0
  67. warp/fem/space/shape/__init__.py +45 -7
  68. warp/fem/space/shape/cube_shape_function.py +328 -54
  69. warp/fem/space/shape/shape_function.py +10 -1
  70. warp/fem/space/shape/square_shape_function.py +328 -60
  71. warp/fem/space/shape/tet_shape_function.py +269 -19
  72. warp/fem/space/shape/triangle_shape_function.py +238 -19
  73. warp/fem/space/tetmesh_function_space.py +69 -37
  74. warp/fem/space/topology.py +38 -0
  75. warp/fem/space/trimesh_function_space.py +179 -0
  76. warp/fem/utils.py +6 -331
  77. warp/jax_experimental.py +3 -1
  78. warp/native/array.h +15 -0
  79. warp/native/builtin.h +66 -26
  80. warp/native/bvh.h +4 -0
  81. warp/native/coloring.cpp +604 -0
  82. warp/native/cuda_util.cpp +68 -51
  83. warp/native/cuda_util.h +2 -1
  84. warp/native/fabric.h +8 -0
  85. warp/native/hashgrid.h +4 -0
  86. warp/native/marching.cu +8 -0
  87. warp/native/mat.h +14 -3
  88. warp/native/mathdx.cpp +59 -0
  89. warp/native/mesh.h +4 -0
  90. warp/native/range.h +13 -1
  91. warp/native/reduce.cpp +9 -1
  92. warp/native/reduce.cu +7 -0
  93. warp/native/runlength_encode.cpp +9 -1
  94. warp/native/runlength_encode.cu +7 -1
  95. warp/native/scan.cpp +8 -0
  96. warp/native/scan.cu +8 -0
  97. warp/native/scan.h +8 -1
  98. warp/native/sparse.cpp +8 -0
  99. warp/native/sparse.cu +8 -0
  100. warp/native/temp_buffer.h +7 -0
  101. warp/native/tile.h +1854 -0
  102. warp/native/tile_gemm.h +341 -0
  103. warp/native/tile_reduce.h +210 -0
  104. warp/native/volume_builder.cu +8 -0
  105. warp/native/volume_builder.h +8 -0
  106. warp/native/warp.cpp +10 -2
  107. warp/native/warp.cu +369 -15
  108. warp/native/warp.h +12 -2
  109. warp/optim/adam.py +39 -4
  110. warp/paddle.py +29 -12
  111. warp/render/render_opengl.py +140 -67
  112. warp/sim/graph_coloring.py +292 -0
  113. warp/sim/import_urdf.py +8 -8
  114. warp/sim/integrator_euler.py +4 -2
  115. warp/sim/integrator_featherstone.py +115 -44
  116. warp/sim/integrator_vbd.py +6 -0
  117. warp/sim/model.py +109 -32
  118. warp/sparse.py +1 -1
  119. warp/stubs.py +569 -4
  120. warp/tape.py +12 -7
  121. warp/tests/assets/pixel.npy +0 -0
  122. warp/tests/aux_test_instancing_gc.py +18 -0
  123. warp/tests/test_array.py +39 -0
  124. warp/tests/test_codegen.py +81 -1
  125. warp/tests/test_codegen_instancing.py +30 -0
  126. warp/tests/test_collision.py +110 -0
  127. warp/tests/test_coloring.py +251 -0
  128. warp/tests/test_context.py +34 -0
  129. warp/tests/test_examples.py +21 -5
  130. warp/tests/test_fem.py +453 -113
  131. warp/tests/test_func.py +34 -4
  132. warp/tests/test_generics.py +52 -0
  133. warp/tests/test_iter.py +68 -0
  134. warp/tests/test_lerp.py +13 -87
  135. warp/tests/test_mat_scalar_ops.py +1 -1
  136. warp/tests/test_matmul.py +6 -9
  137. warp/tests/test_matmul_lite.py +6 -11
  138. warp/tests/test_mesh_query_point.py +1 -1
  139. warp/tests/test_module_hashing.py +23 -0
  140. warp/tests/test_overwrite.py +45 -0
  141. warp/tests/test_paddle.py +27 -87
  142. warp/tests/test_print.py +56 -1
  143. warp/tests/test_smoothstep.py +17 -83
  144. warp/tests/test_spatial.py +1 -1
  145. warp/tests/test_static.py +3 -3
  146. warp/tests/test_tile.py +744 -0
  147. warp/tests/test_tile_mathdx.py +144 -0
  148. warp/tests/test_tile_mlp.py +383 -0
  149. warp/tests/test_tile_reduce.py +374 -0
  150. warp/tests/test_tile_shared_memory.py +190 -0
  151. warp/tests/test_vbd.py +12 -20
  152. warp/tests/test_volume.py +43 -0
  153. warp/tests/unittest_suites.py +19 -2
  154. warp/tests/unittest_utils.py +4 -2
  155. warp/types.py +340 -74
  156. warp/utils.py +23 -3
  157. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/METADATA +32 -7
  158. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/RECORD +161 -134
  159. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/WHEEL +1 -1
  160. warp/fem/field/test.py +0 -180
  161. warp/fem/field/trial.py +0 -183
  162. warp/fem/space/collocated_function_space.py +0 -102
  163. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  164. warp/fem/space/trimesh_2d_function_space.py +0 -153
  165. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/LICENSE.md +0 -0
  166. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/top_level.txt +0 -0
warp/native/cuda_util.cpp CHANGED
@@ -100,6 +100,8 @@ static PFN_cuGraphicsUnmapResources_v3000 pfn_cuGraphicsUnmapResources;
100
100
  static PFN_cuGraphicsResourceGetMappedPointer_v3020 pfn_cuGraphicsResourceGetMappedPointer;
101
101
  static PFN_cuGraphicsGLRegisterBuffer_v3000 pfn_cuGraphicsGLRegisterBuffer;
102
102
  static PFN_cuGraphicsUnregisterResource_v3000 pfn_cuGraphicsUnregisterResource;
103
+ static PFN_cuModuleGetGlobal_v3020 pfn_cuModuleGetGlobal;
104
+ static PFN_cuFuncSetAttribute_v9000 pfn_cuFuncSetAttribute;
103
105
 
104
106
  static bool cuda_driver_initialized = false;
105
107
 
@@ -118,15 +120,17 @@ static inline int get_minor(int version)
118
120
  return (version % 1000) / 10;
119
121
  }
120
122
 
121
- static bool get_driver_entry_point(const char* name, void** pfn)
123
+ // Get versioned driver entry point. The version argument should match the function pointer type.
124
+ // For example, to initialize PFN_cuCtxCreate_v3020 use version 3020.
125
+ static bool get_driver_entry_point(const char* name, int version, void** pfn)
122
126
  {
123
127
  if (!pfn_cuGetProcAddress || !name || !pfn)
124
128
  return false;
125
129
 
126
130
  #if CUDA_VERSION < 12000
127
- CUresult r = pfn_cuGetProcAddress(name, pfn, WP_CUDA_DRIVER_VERSION, CU_GET_PROC_ADDRESS_DEFAULT);
131
+ CUresult r = pfn_cuGetProcAddress(name, pfn, version, CU_GET_PROC_ADDRESS_DEFAULT);
128
132
  #else
129
- CUresult r = pfn_cuGetProcAddress(name, pfn, WP_CUDA_DRIVER_VERSION, CU_GET_PROC_ADDRESS_DEFAULT, NULL);
133
+ CUresult r = pfn_cuGetProcAddress(name, pfn, version, CU_GET_PROC_ADDRESS_DEFAULT, NULL);
130
134
  #endif
131
135
 
132
136
  if (r != CUDA_SUCCESS)
@@ -168,7 +172,8 @@ bool init_cuda_driver()
168
172
 
169
173
  // check the CUDA driver version and report an error if it's too low
170
174
  int driver_version = 0;
171
- if (get_driver_entry_point("cuDriverGetVersion", &(void*&)pfn_cuDriverGetVersion) && check_cu(pfn_cuDriverGetVersion(&driver_version)))
175
+ if (get_driver_entry_point("cuDriverGetVersion", 2020, &(void*&)pfn_cuDriverGetVersion) &&
176
+ check_cu(pfn_cuDriverGetVersion(&driver_version)))
172
177
  {
173
178
  if (driver_version < WP_CUDA_DRIVER_VERSION)
174
179
  {
@@ -184,53 +189,55 @@ bool init_cuda_driver()
184
189
  }
185
190
 
186
191
  // initialize driver entry points
187
- get_driver_entry_point("cuGetErrorString", &(void*&)pfn_cuGetErrorString);
188
- get_driver_entry_point("cuGetErrorName", &(void*&)pfn_cuGetErrorName);
189
- get_driver_entry_point("cuInit", &(void*&)pfn_cuInit);
190
- get_driver_entry_point("cuDeviceGet", &(void*&)pfn_cuDeviceGet);
191
- get_driver_entry_point("cuDeviceGetCount", &(void*&)pfn_cuDeviceGetCount);
192
- get_driver_entry_point("cuDeviceGetName", &(void*&)pfn_cuDeviceGetName);
193
- get_driver_entry_point("cuDeviceGetAttribute", &(void*&)pfn_cuDeviceGetAttribute);
194
- get_driver_entry_point("cuDeviceGetUuid", &(void*&)pfn_cuDeviceGetUuid);
195
- get_driver_entry_point("cuDevicePrimaryCtxRetain", &(void*&)pfn_cuDevicePrimaryCtxRetain);
196
- get_driver_entry_point("cuDevicePrimaryCtxRelease", &(void*&)pfn_cuDevicePrimaryCtxRelease);
197
- get_driver_entry_point("cuDeviceCanAccessPeer", &(void*&)pfn_cuDeviceCanAccessPeer);
198
- get_driver_entry_point("cuMemGetInfo", &(void*&)pfn_cuMemGetInfo);
199
- get_driver_entry_point("cuCtxSetCurrent", &(void*&)pfn_cuCtxSetCurrent);
200
- get_driver_entry_point("cuCtxGetCurrent", &(void*&)pfn_cuCtxGetCurrent);
201
- get_driver_entry_point("cuCtxPushCurrent", &(void*&)pfn_cuCtxPushCurrent);
202
- get_driver_entry_point("cuCtxPopCurrent", &(void*&)pfn_cuCtxPopCurrent);
203
- get_driver_entry_point("cuCtxSynchronize", &(void*&)pfn_cuCtxSynchronize);
204
- get_driver_entry_point("cuCtxGetDevice", &(void*&)pfn_cuCtxGetDevice);
205
- get_driver_entry_point("cuCtxCreate", &(void*&)pfn_cuCtxCreate);
206
- get_driver_entry_point("cuCtxDestroy", &(void*&)pfn_cuCtxDestroy);
207
- get_driver_entry_point("cuCtxEnablePeerAccess", &(void*&)pfn_cuCtxEnablePeerAccess);
208
- get_driver_entry_point("cuCtxDisablePeerAccess", &(void*&)pfn_cuCtxDisablePeerAccess);
209
- get_driver_entry_point("cuStreamCreate", &(void*&)pfn_cuStreamCreate);
210
- get_driver_entry_point("cuStreamDestroy", &(void*&)pfn_cuStreamDestroy);
211
- get_driver_entry_point("cuStreamSynchronize", &(void*&)pfn_cuStreamSynchronize);
212
- get_driver_entry_point("cuStreamWaitEvent", &(void*&)pfn_cuStreamWaitEvent);
213
- get_driver_entry_point("cuStreamGetCtx", &(void*&)pfn_cuStreamGetCtx);
214
- get_driver_entry_point("cuStreamGetCaptureInfo", &(void*&)pfn_cuStreamGetCaptureInfo);
215
- get_driver_entry_point("cuStreamUpdateCaptureDependencies", &(void*&)pfn_cuStreamUpdateCaptureDependencies);
216
- get_driver_entry_point("cuStreamCreateWithPriority", &(void*&)pfn_cuStreamCreateWithPriority);
217
- get_driver_entry_point("cuStreamGetPriority", &(void*&)pfn_cuStreamGetPriority);
218
- get_driver_entry_point("cuEventCreate", &(void*&)pfn_cuEventCreate);
219
- get_driver_entry_point("cuEventDestroy", &(void*&)pfn_cuEventDestroy);
220
- get_driver_entry_point("cuEventRecord", &(void*&)pfn_cuEventRecord);
221
- get_driver_entry_point("cuEventRecordWithFlags", &(void*&)pfn_cuEventRecordWithFlags);
222
- get_driver_entry_point("cuEventSynchronize", &(void*&)pfn_cuEventSynchronize);
223
- get_driver_entry_point("cuModuleLoadDataEx", &(void*&)pfn_cuModuleLoadDataEx);
224
- get_driver_entry_point("cuModuleUnload", &(void*&)pfn_cuModuleUnload);
225
- get_driver_entry_point("cuModuleGetFunction", &(void*&)pfn_cuModuleGetFunction);
226
- get_driver_entry_point("cuLaunchKernel", &(void*&)pfn_cuLaunchKernel);
227
- get_driver_entry_point("cuMemcpyPeerAsync", &(void*&)pfn_cuMemcpyPeerAsync);
228
- get_driver_entry_point("cuPointerGetAttribute", &(void*&)pfn_cuPointerGetAttribute);
229
- get_driver_entry_point("cuGraphicsMapResources", &(void*&)pfn_cuGraphicsMapResources);
230
- get_driver_entry_point("cuGraphicsUnmapResources", &(void*&)pfn_cuGraphicsUnmapResources);
231
- get_driver_entry_point("cuGraphicsResourceGetMappedPointer", &(void*&)pfn_cuGraphicsResourceGetMappedPointer);
232
- get_driver_entry_point("cuGraphicsGLRegisterBuffer", &(void*&)pfn_cuGraphicsGLRegisterBuffer);
233
- get_driver_entry_point("cuGraphicsUnregisterResource", &(void*&)pfn_cuGraphicsUnregisterResource);
192
+ get_driver_entry_point("cuGetErrorString", 6000, &(void*&)pfn_cuGetErrorString);
193
+ get_driver_entry_point("cuGetErrorName", 6000, &(void*&)pfn_cuGetErrorName);
194
+ get_driver_entry_point("cuInit", 2000, &(void*&)pfn_cuInit);
195
+ get_driver_entry_point("cuDeviceGet", 2000, &(void*&)pfn_cuDeviceGet);
196
+ get_driver_entry_point("cuDeviceGetCount", 2000, &(void*&)pfn_cuDeviceGetCount);
197
+ get_driver_entry_point("cuDeviceGetName", 2000, &(void*&)pfn_cuDeviceGetName);
198
+ get_driver_entry_point("cuDeviceGetAttribute", 2000, &(void*&)pfn_cuDeviceGetAttribute);
199
+ get_driver_entry_point("cuDeviceGetUuid", 110400, &(void*&)pfn_cuDeviceGetUuid);
200
+ get_driver_entry_point("cuDevicePrimaryCtxRetain", 7000, &(void*&)pfn_cuDevicePrimaryCtxRetain);
201
+ get_driver_entry_point("cuDevicePrimaryCtxRelease", 11000, &(void*&)pfn_cuDevicePrimaryCtxRelease);
202
+ get_driver_entry_point("cuDeviceCanAccessPeer", 4000, &(void*&)pfn_cuDeviceCanAccessPeer);
203
+ get_driver_entry_point("cuMemGetInfo", 3020, &(void*&)pfn_cuMemGetInfo);
204
+ get_driver_entry_point("cuCtxSetCurrent", 4000, &(void*&)pfn_cuCtxSetCurrent);
205
+ get_driver_entry_point("cuCtxGetCurrent", 4000, &(void*&)pfn_cuCtxGetCurrent);
206
+ get_driver_entry_point("cuCtxPushCurrent", 4000, &(void*&)pfn_cuCtxPushCurrent);
207
+ get_driver_entry_point("cuCtxPopCurrent", 4000, &(void*&)pfn_cuCtxPopCurrent);
208
+ get_driver_entry_point("cuCtxSynchronize", 2000, &(void*&)pfn_cuCtxSynchronize);
209
+ get_driver_entry_point("cuCtxGetDevice", 2000, &(void*&)pfn_cuCtxGetDevice);
210
+ get_driver_entry_point("cuCtxCreate", 3020, &(void*&)pfn_cuCtxCreate);
211
+ get_driver_entry_point("cuCtxDestroy", 4000, &(void*&)pfn_cuCtxDestroy);
212
+ get_driver_entry_point("cuCtxEnablePeerAccess", 4000, &(void*&)pfn_cuCtxEnablePeerAccess);
213
+ get_driver_entry_point("cuCtxDisablePeerAccess", 4000, &(void*&)pfn_cuCtxDisablePeerAccess);
214
+ get_driver_entry_point("cuStreamCreate", 2000, &(void*&)pfn_cuStreamCreate);
215
+ get_driver_entry_point("cuStreamDestroy", 4000, &(void*&)pfn_cuStreamDestroy);
216
+ get_driver_entry_point("cuStreamSynchronize", 2000, &(void*&)pfn_cuStreamSynchronize);
217
+ get_driver_entry_point("cuStreamWaitEvent", 3020, &(void*&)pfn_cuStreamWaitEvent);
218
+ get_driver_entry_point("cuStreamGetCtx", 9020, &(void*&)pfn_cuStreamGetCtx);
219
+ get_driver_entry_point("cuStreamGetCaptureInfo", 11030, &(void*&)pfn_cuStreamGetCaptureInfo);
220
+ get_driver_entry_point("cuStreamUpdateCaptureDependencies", 11030, &(void*&)pfn_cuStreamUpdateCaptureDependencies);
221
+ get_driver_entry_point("cuStreamCreateWithPriority", 5050, &(void*&)pfn_cuStreamCreateWithPriority);
222
+ get_driver_entry_point("cuStreamGetPriority", 5050, &(void*&)pfn_cuStreamGetPriority);
223
+ get_driver_entry_point("cuEventCreate", 2000, &(void*&)pfn_cuEventCreate);
224
+ get_driver_entry_point("cuEventDestroy", 4000, &(void*&)pfn_cuEventDestroy);
225
+ get_driver_entry_point("cuEventRecord", 2000, &(void*&)pfn_cuEventRecord);
226
+ get_driver_entry_point("cuEventRecordWithFlags", 11010, &(void*&)pfn_cuEventRecordWithFlags);
227
+ get_driver_entry_point("cuEventSynchronize", 2000, &(void*&)pfn_cuEventSynchronize);
228
+ get_driver_entry_point("cuModuleLoadDataEx", 2010, &(void*&)pfn_cuModuleLoadDataEx);
229
+ get_driver_entry_point("cuModuleUnload", 2000, &(void*&)pfn_cuModuleUnload);
230
+ get_driver_entry_point("cuModuleGetFunction", 2000, &(void*&)pfn_cuModuleGetFunction);
231
+ get_driver_entry_point("cuLaunchKernel", 4000, &(void*&)pfn_cuLaunchKernel);
232
+ get_driver_entry_point("cuMemcpyPeerAsync", 4000, &(void*&)pfn_cuMemcpyPeerAsync);
233
+ get_driver_entry_point("cuPointerGetAttribute", 4000, &(void*&)pfn_cuPointerGetAttribute);
234
+ get_driver_entry_point("cuGraphicsMapResources", 3000, &(void*&)pfn_cuGraphicsMapResources);
235
+ get_driver_entry_point("cuGraphicsUnmapResources", 3000, &(void*&)pfn_cuGraphicsUnmapResources);
236
+ get_driver_entry_point("cuGraphicsResourceGetMappedPointer", 3020, &(void*&)pfn_cuGraphicsResourceGetMappedPointer);
237
+ get_driver_entry_point("cuGraphicsGLRegisterBuffer", 3000, &(void*&)pfn_cuGraphicsGLRegisterBuffer);
238
+ get_driver_entry_point("cuGraphicsUnregisterResource", 3000, &(void*&)pfn_cuGraphicsUnregisterResource);
239
+ get_driver_entry_point("cuModuleGetGlobal", 3020, &(void*&)pfn_cuModuleGetGlobal);
240
+ get_driver_entry_point("cuFuncSetAttribute", 9000, &(void*&)pfn_cuFuncSetAttribute);
234
241
 
235
242
  if (pfn_cuInit)
236
243
  cuda_driver_initialized = check_cu(pfn_cuInit(0));
@@ -568,4 +575,14 @@ CUresult cuGraphicsUnregisterResource_f(CUgraphicsResource resource)
568
575
  return pfn_cuGraphicsUnregisterResource ? pfn_cuGraphicsUnregisterResource(resource) : DRIVER_ENTRY_POINT_ERROR;
569
576
  }
570
577
 
578
+ CUresult cuModuleGetGlobal_f(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name )
579
+ {
580
+ return pfn_cuModuleGetGlobal ? pfn_cuModuleGetGlobal(dptr, bytes, hmod, name) : DRIVER_ENTRY_POINT_ERROR;
581
+ }
582
+
583
+ CUresult cuFuncSetAttribute_f(CUfunction hfunc, CUfunction_attribute attrib, int value)
584
+ {
585
+ return pfn_cuFuncSetAttribute ? pfn_cuFuncSetAttribute(hfunc, attrib, value) : DRIVER_ENTRY_POINT_ERROR;
586
+ }
587
+
571
588
  #endif // WP_ENABLE_CUDA
warp/native/cuda_util.h CHANGED
@@ -99,7 +99,8 @@ CUresult cuGraphicsUnmapResources_f(unsigned int count, CUgraphicsResource* reso
99
99
  CUresult cuGraphicsResourceGetMappedPointer_f(CUdeviceptr* pDevPtr, size_t* pSize, CUgraphicsResource resource);
100
100
  CUresult cuGraphicsGLRegisterBuffer_f(CUgraphicsResource *pCudaResource, unsigned int buffer, unsigned int flags);
101
101
  CUresult cuGraphicsUnregisterResource_f(CUgraphicsResource resource);
102
-
102
+ CUresult cuModuleGetGlobal_f(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name );
103
+ CUresult cuFuncSetAttribute_f(CUfunction hfunc, CUfunction_attribute attrib, int value);
103
104
 
104
105
  bool init_cuda_driver();
105
106
  bool is_cuda_driver_initialized();
warp/native/fabric.h CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #pragma once
2
10
 
3
11
  #include "builtin.h"
warp/native/hashgrid.h CHANGED
@@ -209,6 +209,10 @@ CUDA_CALLABLE inline hash_grid_query_t iter_reverse(const hash_grid_query_t& que
209
209
  return query;
210
210
  }
211
211
 
212
+ CUDA_CALLABLE inline void adj_iter_reverse(const hash_grid_query_t& query, hash_grid_query_t& adj_query, hash_grid_query_t& adj_ret)
213
+ {
214
+ }
215
+
212
216
 
213
217
 
214
218
  CUDA_CALLABLE inline int hash_grid_point_id(uint64_t id, int& index)
warp/native/marching.cu CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #include "warp.h"
2
10
  #include "cuda_util.h"
3
11
  #include "scan.h"
warp/native/mat.h CHANGED
@@ -210,6 +210,12 @@ inline CUDA_CALLABLE mat_t<Rows, Rows, Type> identity()
210
210
  return m;
211
211
  }
212
212
 
213
+ template<unsigned Rows, typename Type>
214
+ inline CUDA_CALLABLE void adj_identity(const mat_t<Rows, Rows, Type>& adj_ret)
215
+ {
216
+ // nop
217
+ }
218
+
213
219
  template<unsigned Rows, unsigned Cols, typename Type>
214
220
  inline CUDA_CALLABLE bool operator==(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
215
221
  {
@@ -650,13 +656,18 @@ inline CUDA_CALLABLE mat_t<Rows,ColsOut,Type> mul(const mat_t<Rows,Cols,Type>& a
650
656
  {
651
657
  mat_t<Rows,ColsOut,Type> t(0);
652
658
  for (unsigned i=0; i < Rows; ++i)
653
- {
654
- for (unsigned j=0; j < ColsOut; ++j)
659
+ {
660
+ for (unsigned j=0; j < ColsOut; ++j)
655
661
  {
662
+ Type sum(0.0);
663
+
656
664
  for (unsigned k=0; k < Cols; ++k)
657
665
  {
658
- t.data[i][j] += a.data[i][k]*b.data[k][j];
666
+ //t.data[i][j] += a.data[i][k]*b.data[k][j];
667
+ sum = fmaf(a.data[i][k], b.data[k][j], sum);
659
668
  }
669
+
670
+ t.data[i][j] = sum;
660
671
  }
661
672
  }
662
673
 
warp/native/mathdx.cpp ADDED
@@ -0,0 +1,59 @@
1
+ /** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
9
+ #include "builtin.h"
10
+
11
+ // stubs for platforms where there is no CUDA
12
+ #if !WP_ENABLE_CUDA || !WP_ENABLE_MATHDX
13
+
14
+ extern "C"
15
+ {
16
+
17
+ WP_API
18
+ bool cuda_compile_fft(
19
+ const char* ltoir_output_path,
20
+ const char* symbol_name, int num_include_dirs,
21
+ const char** include_dirs,
22
+ const char* mathdx_include_dir,
23
+ int arch,
24
+ int size,
25
+ int elements_per_thread,
26
+ int direction,
27
+ int precision,
28
+ int* shared_memory_size)
29
+ {
30
+ printf("CUDA is disabled and/or Warp was not compiled with MathDx support.\n");
31
+ return false;
32
+ }
33
+
34
+ WP_API bool cuda_compile_dot(
35
+ const char* ltoir_output_path,
36
+ const char* symbol_name,
37
+ int num_include_dirs,
38
+ const char** include_dirs,
39
+ const char* mathdx_include_dir,
40
+ int arch,
41
+ int M,
42
+ int N,
43
+ int K,
44
+ int precision_A,
45
+ int precision_B,
46
+ int precision_C,
47
+ int type,
48
+ int a_arrangement,
49
+ int b_arrangement,
50
+ int c_arrangement,
51
+ int num_threads)
52
+ {
53
+ printf("CUDA is disabled and/or Warp was not compiled with MathDx support.\n");
54
+ return false;
55
+ }
56
+
57
+ } // extern "C"
58
+
59
+ #endif // !WP_ENABLE_CUDA || !WP_ENABLE_MATHDX
warp/native/mesh.h CHANGED
@@ -1693,6 +1693,10 @@ CUDA_CALLABLE inline mesh_query_aabb_t iter_reverse(const mesh_query_aabb_t& que
1693
1693
  return query;
1694
1694
  }
1695
1695
 
1696
+ CUDA_CALLABLE inline void adj_iter_reverse(const mesh_query_aabb_t& query, mesh_query_aabb_t& adj_query, mesh_query_aabb_t& adj_ret)
1697
+ {
1698
+ }
1699
+
1696
1700
 
1697
1701
  // stub
1698
1702
  CUDA_CALLABLE inline void adj_mesh_query_aabb_next(mesh_query_aabb_t& query, int& index, mesh_query_aabb_t&, int&, bool&)
warp/native/range.h CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #pragma once
2
10
 
3
11
  namespace wp
@@ -115,4 +123,8 @@ CUDA_CALLABLE inline range_t iter_reverse(const range_t& r)
115
123
  return rev;
116
124
  }
117
125
 
118
- } // namespace wp
126
+ CUDA_CALLABLE inline void adj_iter_reverse(const range_t& r, range_t& adj_r, range_t& adj_ret)
127
+ {
128
+ }
129
+
130
+ } // namespace wp
warp/native/reduce.cpp CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #include "warp.h"
2
10
 
3
11
  namespace
@@ -154,4 +162,4 @@ void array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride
154
162
  void array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
155
163
  {
156
164
  }
157
- #endif
165
+ #endif
warp/native/reduce.cu CHANGED
@@ -1,3 +1,10 @@
1
+ /** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
1
8
 
2
9
  #include "cuda_util.h"
3
10
  #include "warp.h"
warp/native/runlength_encode.cpp CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #include "warp.h"
2
10
 
3
11
  #include <cstdint>
@@ -59,4 +67,4 @@ void runlength_encode_int_device(
59
67
  int n)
60
68
  {
61
69
  }
62
- #endif
70
+ #endif
warp/native/runlength_encode.cu CHANGED
@@ -1,4 +1,10 @@
1
-
1
+ /** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
2
8
 
3
9
  #include "warp.h"
4
10
  #include "cuda_util.h"
warp/native/scan.cpp CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #include "scan.h"
2
10
 
3
11
  #include <numeric>
warp/native/scan.cu CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #include "warp.h"
2
10
  #include "scan.h"
3
11
 
warp/native/scan.h CHANGED
@@ -1,7 +1,14 @@
1
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #pragma once
2
10
 
3
11
  template<typename T>
4
12
  void scan_host(const T* values_in, T* values_out, int n, bool inclusive = true);
5
13
  template<typename T>
6
14
  void scan_device(const T* values_in, T* values_out, int n, bool inclusive = true);
7
-
warp/native/sparse.cpp CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #include "warp.h"
2
10
 
3
11
  #include <algorithm>
warp/native/sparse.cu CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #include "cuda_util.h"
2
10
  #include "warp.h"
3
11
 
warp/native/temp_buffer.h CHANGED
@@ -1,3 +1,10 @@
1
+ /** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
1
8
 
2
9
  #pragma once
3
10