warp-lang 0.11.0__py3-none-manylinux2014_x86_64.whl → 1.0.0__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (170)
  1. warp/__init__.py +8 -0
  2. warp/bin/warp-clang.so +0 -0
  3. warp/bin/warp.so +0 -0
  4. warp/build.py +7 -6
  5. warp/build_dll.py +70 -79
  6. warp/builtins.py +10 -6
  7. warp/codegen.py +51 -19
  8. warp/config.py +7 -8
  9. warp/constants.py +3 -0
  10. warp/context.py +948 -245
  11. warp/dlpack.py +198 -113
  12. warp/examples/assets/bunny.usd +0 -0
  13. warp/examples/assets/cartpole.urdf +110 -0
  14. warp/examples/assets/crazyflie.usd +0 -0
  15. warp/examples/assets/cube.usda +42 -0
  16. warp/examples/assets/nv_ant.xml +92 -0
  17. warp/examples/assets/nv_humanoid.xml +183 -0
  18. warp/examples/assets/quadruped.urdf +268 -0
  19. warp/examples/assets/rocks.nvdb +0 -0
  20. warp/examples/assets/rocks.usd +0 -0
  21. warp/examples/assets/sphere.usda +56 -0
  22. warp/examples/assets/torus.usda +105 -0
  23. warp/examples/benchmarks/benchmark_api.py +383 -0
  24. warp/examples/benchmarks/benchmark_cloth.py +279 -0
  25. warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -0
  26. warp/examples/benchmarks/benchmark_cloth_jax.py +100 -0
  27. warp/examples/benchmarks/benchmark_cloth_numba.py +142 -0
  28. warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -0
  29. warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -0
  30. warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -0
  31. warp/examples/benchmarks/benchmark_cloth_warp.py +146 -0
  32. warp/examples/benchmarks/benchmark_launches.py +295 -0
  33. warp/examples/core/example_dem.py +221 -0
  34. warp/examples/core/example_fluid.py +267 -0
  35. warp/examples/core/example_graph_capture.py +129 -0
  36. warp/examples/core/example_marching_cubes.py +177 -0
  37. warp/examples/core/example_mesh.py +154 -0
  38. warp/examples/core/example_mesh_intersect.py +193 -0
  39. warp/examples/core/example_nvdb.py +169 -0
  40. warp/examples/core/example_raycast.py +89 -0
  41. warp/examples/core/example_raymarch.py +178 -0
  42. warp/examples/core/example_render_opengl.py +141 -0
  43. warp/examples/core/example_sph.py +389 -0
  44. warp/examples/core/example_torch.py +181 -0
  45. warp/examples/core/example_wave.py +249 -0
  46. warp/examples/fem/bsr_utils.py +380 -0
  47. warp/examples/fem/example_apic_fluid.py +391 -0
  48. warp/examples/fem/example_convection_diffusion.py +168 -0
  49. warp/examples/fem/example_convection_diffusion_dg.py +209 -0
  50. warp/examples/fem/example_convection_diffusion_dg0.py +194 -0
  51. warp/examples/fem/example_deformed_geometry.py +159 -0
  52. warp/examples/fem/example_diffusion.py +173 -0
  53. warp/examples/fem/example_diffusion_3d.py +152 -0
  54. warp/examples/fem/example_diffusion_mgpu.py +214 -0
  55. warp/examples/fem/example_mixed_elasticity.py +222 -0
  56. warp/examples/fem/example_navier_stokes.py +243 -0
  57. warp/examples/fem/example_stokes.py +192 -0
  58. warp/examples/fem/example_stokes_transfer.py +249 -0
  59. warp/examples/fem/mesh_utils.py +109 -0
  60. warp/examples/fem/plot_utils.py +287 -0
  61. warp/examples/optim/example_bounce.py +248 -0
  62. warp/examples/optim/example_cloth_throw.py +210 -0
  63. warp/examples/optim/example_diffray.py +535 -0
  64. warp/examples/optim/example_drone.py +850 -0
  65. warp/examples/optim/example_inverse_kinematics.py +169 -0
  66. warp/examples/optim/example_inverse_kinematics_torch.py +170 -0
  67. warp/examples/optim/example_spring_cage.py +234 -0
  68. warp/examples/optim/example_trajectory.py +201 -0
  69. warp/examples/sim/example_cartpole.py +128 -0
  70. warp/examples/sim/example_cloth.py +184 -0
  71. warp/examples/sim/example_granular.py +113 -0
  72. warp/examples/sim/example_granular_collision_sdf.py +185 -0
  73. warp/examples/sim/example_jacobian_ik.py +213 -0
  74. warp/examples/sim/example_particle_chain.py +106 -0
  75. warp/examples/sim/example_quadruped.py +179 -0
  76. warp/examples/sim/example_rigid_chain.py +191 -0
  77. warp/examples/sim/example_rigid_contact.py +176 -0
  78. warp/examples/sim/example_rigid_force.py +126 -0
  79. warp/examples/sim/example_rigid_gyroscopic.py +97 -0
  80. warp/examples/sim/example_rigid_soft_contact.py +124 -0
  81. warp/examples/sim/example_soft_body.py +178 -0
  82. warp/fabric.py +29 -20
  83. warp/fem/cache.py +0 -1
  84. warp/fem/dirichlet.py +0 -2
  85. warp/fem/integrate.py +0 -1
  86. warp/jax.py +45 -0
  87. warp/jax_experimental.py +339 -0
  88. warp/native/builtin.h +12 -0
  89. warp/native/bvh.cu +18 -18
  90. warp/native/clang/clang.cpp +8 -3
  91. warp/native/cuda_util.cpp +94 -5
  92. warp/native/cuda_util.h +35 -6
  93. warp/native/cutlass_gemm.cpp +1 -1
  94. warp/native/cutlass_gemm.cu +4 -1
  95. warp/native/error.cpp +66 -0
  96. warp/native/error.h +27 -0
  97. warp/native/mesh.cu +2 -2
  98. warp/native/reduce.cu +4 -4
  99. warp/native/runlength_encode.cu +2 -2
  100. warp/native/scan.cu +2 -2
  101. warp/native/sparse.cu +0 -1
  102. warp/native/temp_buffer.h +2 -2
  103. warp/native/warp.cpp +95 -60
  104. warp/native/warp.cu +1053 -218
  105. warp/native/warp.h +49 -32
  106. warp/optim/linear.py +33 -16
  107. warp/render/render_opengl.py +202 -101
  108. warp/render/render_usd.py +82 -40
  109. warp/sim/__init__.py +13 -4
  110. warp/sim/articulation.py +4 -5
  111. warp/sim/collide.py +320 -175
  112. warp/sim/import_mjcf.py +25 -30
  113. warp/sim/import_urdf.py +94 -63
  114. warp/sim/import_usd.py +51 -36
  115. warp/sim/inertia.py +3 -2
  116. warp/sim/integrator.py +233 -0
  117. warp/sim/integrator_euler.py +447 -469
  118. warp/sim/integrator_featherstone.py +1991 -0
  119. warp/sim/integrator_xpbd.py +1420 -640
  120. warp/sim/model.py +765 -487
  121. warp/sim/particles.py +2 -1
  122. warp/sim/render.py +35 -13
  123. warp/sim/utils.py +222 -11
  124. warp/stubs.py +8 -0
  125. warp/tape.py +16 -1
  126. warp/tests/aux_test_grad_customs.py +23 -0
  127. warp/tests/test_array.py +190 -1
  128. warp/tests/test_async.py +656 -0
  129. warp/tests/test_bool.py +50 -0
  130. warp/tests/test_dlpack.py +164 -11
  131. warp/tests/test_examples.py +166 -74
  132. warp/tests/test_fem.py +8 -1
  133. warp/tests/test_generics.py +15 -5
  134. warp/tests/test_grad.py +1 -1
  135. warp/tests/test_grad_customs.py +172 -12
  136. warp/tests/test_jax.py +254 -0
  137. warp/tests/test_large.py +29 -6
  138. warp/tests/test_launch.py +25 -0
  139. warp/tests/test_linear_solvers.py +20 -3
  140. warp/tests/test_matmul.py +61 -16
  141. warp/tests/test_matmul_lite.py +13 -13
  142. warp/tests/test_mempool.py +186 -0
  143. warp/tests/test_multigpu.py +3 -0
  144. warp/tests/test_options.py +16 -2
  145. warp/tests/test_peer.py +137 -0
  146. warp/tests/test_print.py +3 -1
  147. warp/tests/test_quat.py +23 -0
  148. warp/tests/test_sim_kinematics.py +97 -0
  149. warp/tests/test_snippet.py +126 -3
  150. warp/tests/test_streams.py +108 -79
  151. warp/tests/test_torch.py +16 -8
  152. warp/tests/test_utils.py +32 -27
  153. warp/tests/test_verify_fp.py +65 -0
  154. warp/tests/test_volume.py +1 -1
  155. warp/tests/unittest_serial.py +2 -0
  156. warp/tests/unittest_suites.py +12 -0
  157. warp/tests/unittest_utils.py +14 -7
  158. warp/thirdparty/unittest_parallel.py +15 -3
  159. warp/torch.py +10 -8
  160. warp/types.py +363 -246
  161. warp/utils.py +143 -19
  162. warp_lang-1.0.0.dist-info/LICENSE.md +126 -0
  163. warp_lang-1.0.0.dist-info/METADATA +394 -0
  164. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/RECORD +167 -86
  165. warp/sim/optimizer.py +0 -138
  166. warp_lang-0.11.0.dist-info/LICENSE.md +0 -36
  167. warp_lang-0.11.0.dist-info/METADATA +0 -238
  168. /warp/tests/{walkthough_debug.py → walkthrough_debug.py} +0 -0
  169. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/WHEEL +0 -0
  170. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,383 @@
1
+ # Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
2
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ # and proprietary rights in and to this software, related documentation
4
+ # and any modifications thereto. Any use, reproduction, disclosure or
5
+ # distribution of this software and related documentation without an express
6
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+
8
+ import gc
9
+ import statistics as stats
10
+ import warp as wp
11
+
12
+
13
# Switches applied to every CUDA device (and device pair) during setup below.
ENABLE_MEMPOOLS = False
ENABLE_PEER_ACCESS = False
ENABLE_MEMPOOL_ACCESS = False
ENABLE_MEMPOOL_RELEASE_THRESHOLD = False

# Value passed to wp.set_mempool_release_threshold() when the switch above is on
# (presumably a byte count, i.e. 1 GiB -- confirm against the Warp docs).
MEMPOOL_RELEASE_THRESHOLD = 1024**3

# DO_SYNC: synchronize the device inside each timed region.
# VERBOSE / USE_NVTX: forwarded to wp.ScopedTimer as print= / use_nvtx=.
DO_SYNC = False
VERBOSE = False
USE_NVTX = False

# Array length used by every operation, number of timed repetitions, and the
# number of samples dropped from each end of the sorted timings by print_stat().
num_elems = 10000
num_runs = 10000
trim_runs = 2500
27
+
28
+
29
@wp.kernel
def inc_kernel(a: wp.array(dtype=float)):
    # each thread adds 1.0 to the element at its own thread index
    i = wp.tid()
    a[i] = a[i] + 1.0
33
+
34
+
35
wp.init()

# Configure devices: apply the mempool switches to each CUDA device, then the
# peer-access switches to each (target, peer) pair. A failing call is reported
# and skipped so the remaining devices are still configured.
all_cuda_devices = wp.get_cuda_devices()

for target in all_cuda_devices:
    try:
        wp.set_mempool_enabled(target, ENABLE_MEMPOOLS)
        if ENABLE_MEMPOOL_RELEASE_THRESHOLD:
            wp.set_mempool_release_threshold(target, MEMPOOL_RELEASE_THRESHOLD)
    except Exception as err:
        print(f"Error: {err}")

    for peer in all_cuda_devices:
        try:
            wp.set_peer_access_enabled(target, peer, ENABLE_PEER_ACCESS)
        except Exception as err:
            print(f"Error: {err}")

        try:
            wp.set_mempool_access_enabled(target, peer, ENABLE_MEMPOOL_ACCESS)
        except Exception as err:
            print(f"Error: {err}")
56
+
57
cuda_device_count = wp.get_cuda_device_count()

cuda0 = wp.get_device("cuda:0")

# Buffers reused by the copy and launch benchmarks, so those timings
# do not include any allocation.
arr_host = wp.zeros(num_elems, dtype=float, device="cpu", pinned=False)
arr_host_pinned = wp.zeros(num_elems, dtype=float, device="cpu", pinned=True)
arr_cuda0 = wp.zeros(num_elems, dtype=float, device=cuda0)
arr_cuda0_src = wp.zeros(num_elems, dtype=float, device=cuda0)
arr_cuda0_dst = wp.zeros(num_elems, dtype=float, device=cuda0)

# A second GPU is optional; it is only needed by the peer-to-peer copies.
if cuda_device_count > 1:
    cuda1 = wp.get_device("cuda:1")
    arr_cuda1 = wp.zeros(num_elems, dtype=float, device=cuda1)

# Stream used by the "... stream" benchmark variants.
stream0 = wp.Stream(cuda0)

# Load modules up front so that loading is not part of the first timed launch.
wp.force_load(cuda0)
if cuda_device_count > 1:
    wp.force_load(cuda1)
79
+
80
# Record a single inc_kernel launch on cuda0 as a graph; the graph benchmarks
# replay it with wp.capture_launch().
with wp.ScopedDevice(cuda0):
    wp.capture_begin()
    wp.launch(inc_kernel, dim=arr_cuda0.size, inputs=[arr_cuda0])
    graph0 = wp.capture_end()


# One slot per run: holds the arrays created by test_alloc()/test_zeros()
# until test_free() (or the cleanup loop) drops them.
g_allocs = [None for _ in range(num_runs)]
88
+
89
+
90
def test_alloc(num_elems, device, idx):
    """Time one ``wp.empty`` allocation of ``num_elems`` floats on ``device``.

    The new array is stored in ``g_allocs[idx]`` so it stays alive after the
    call. Returns the ``elapsed`` value of the ScopedTimer.
    """
    # finish any outstanding work so it is not charged to this measurement
    wp.synchronize()

    with wp.ScopedTimer("alloc", print=VERBOSE, use_nvtx=USE_NVTX) as alloc_timer:
        g_allocs[idx] = wp.empty(num_elems, dtype=float, device=device)
        if DO_SYNC:
            # optionally wait for the device inside the timed region
            wp.synchronize_device(device)

    return alloc_timer.elapsed
100
+
101
+
102
def test_free(device, idx):
    """Time dropping the array reference held in ``g_allocs[idx]``.

    Returns the ``elapsed`` value of the ScopedTimer.
    """
    # finish any outstanding work so it is not charged to this measurement
    wp.synchronize()

    with wp.ScopedTimer("free", print=VERBOSE, use_nvtx=USE_NVTX) as free_timer:
        # clearing the slot releases this script's reference to the array
        g_allocs[idx] = None
        if DO_SYNC:
            wp.synchronize_device(device)

    return free_timer.elapsed
112
+
113
+
114
def test_zeros(num_elems, device, idx):
    """Time one ``wp.zeros`` allocation of ``num_elems`` floats on ``device``.

    The new array is stored in ``g_allocs[idx]``. Returns the ``elapsed`` value
    of the ScopedTimer.
    """
    # finish any outstanding work so it is not charged to this measurement
    wp.synchronize()

    with wp.ScopedTimer("zeros", print=VERBOSE, use_nvtx=USE_NVTX) as zeros_timer:
        g_allocs[idx] = wp.zeros(num_elems, dtype=float, device=device)
        if DO_SYNC:
            wp.synchronize_device(device)

    return zeros_timer.elapsed
124
+
125
+
126
def test_h2d(num_elems, device):
    """Time a copy from the non-pinned host array ``arr_host`` into ``arr_cuda0``.

    ``num_elems`` is not used: the copy always moves the preallocated arrays.
    ``device`` is only used for the optional synchronization. Returns the
    ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("h2d", print=VERBOSE, use_nvtx=USE_NVTX) as copy_timer:
        wp.copy(arr_cuda0, arr_host)  # wp.copy(dest, src)
        if DO_SYNC:
            wp.synchronize_device(device)

    return copy_timer.elapsed
136
+
137
+
138
def test_d2h(num_elems, device):
    """Time a copy from ``arr_cuda0`` into the non-pinned host array ``arr_host``.

    ``num_elems`` is not used: the copy always moves the preallocated arrays.
    ``device`` is only used for the optional synchronization. Returns the
    ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("d2h", print=VERBOSE, use_nvtx=USE_NVTX) as copy_timer:
        wp.copy(arr_host, arr_cuda0)  # wp.copy(dest, src)
        if DO_SYNC:
            wp.synchronize_device(device)

    return copy_timer.elapsed
148
+
149
+
150
def test_h2d_pinned(num_elems, device):
    """Time a copy from the pinned host array ``arr_host_pinned`` into ``arr_cuda0``.

    ``num_elems`` is not used: the copy always moves the preallocated arrays.
    ``device`` is only used for the optional synchronization. Returns the
    ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("h2d pinned", print=VERBOSE, use_nvtx=USE_NVTX) as copy_timer:
        wp.copy(arr_cuda0, arr_host_pinned)  # wp.copy(dest, src)
        if DO_SYNC:
            wp.synchronize_device(device)

    return copy_timer.elapsed
160
+
161
+
162
def test_d2h_pinned(num_elems, device):
    """Time a copy from ``arr_cuda0`` into the pinned host array ``arr_host_pinned``.

    ``num_elems`` is not used: the copy always moves the preallocated arrays.
    ``device`` is only used for the optional synchronization. Returns the
    ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("d2h pinned", print=VERBOSE, use_nvtx=USE_NVTX) as copy_timer:
        wp.copy(arr_host_pinned, arr_cuda0)  # wp.copy(dest, src)
        if DO_SYNC:
            wp.synchronize_device(device)

    return copy_timer.elapsed
172
+
173
+
174
def test_d2d(num_elems, device):
    """Time a copy between two arrays on cuda0 (``arr_cuda0_src`` -> ``arr_cuda0_dst``).

    ``num_elems`` is not used: the copy always moves the preallocated arrays.
    ``device`` is only used for the optional synchronization. Returns the
    ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("d2d", print=VERBOSE, use_nvtx=USE_NVTX) as copy_timer:
        wp.copy(arr_cuda0_dst, arr_cuda0_src)  # wp.copy(dest, src)
        if DO_SYNC:
            wp.synchronize_device(device)

    return copy_timer.elapsed
184
+
185
+
186
def test_p2p(num_elems, src_device, dst_device):
    """Time a copy from ``arr_cuda1`` into ``arr_cuda0``.

    ``num_elems`` is not used. ``src_device`` and ``dst_device`` are only used
    for the optional synchronization; the arrays involved are fixed. Requires
    ``arr_cuda1``, which only exists when more than one CUDA device is present.
    Returns the ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("p2p", print=VERBOSE, use_nvtx=USE_NVTX) as copy_timer:
        wp.copy(arr_cuda0, arr_cuda1)  # wp.copy(dest, src)
        if DO_SYNC:
            for endpoint in (src_device, dst_device):
                wp.synchronize_device(endpoint)

    return copy_timer.elapsed
197
+
198
+
199
def test_p2p_stream(num_elems, src_device, dst_device):
    """Time a copy from ``arr_cuda1`` into ``arr_cuda0`` issued on ``stream0``.

    ``num_elems`` is not used. ``src_device`` and ``dst_device`` are only used
    for the optional synchronization; the arrays involved are fixed. Requires
    ``arr_cuda1``, which only exists when more than one CUDA device is present.
    Returns the ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("p2p stream", print=VERBOSE, use_nvtx=USE_NVTX) as copy_timer:
        wp.copy(arr_cuda0, arr_cuda1, stream=stream0)  # wp.copy(dest, src)
        if DO_SYNC:
            for endpoint in (src_device, dst_device):
                wp.synchronize_device(endpoint)

    return copy_timer.elapsed
212
+
213
+
214
def test_launch(num_elems, device):
    """Time one ``inc_kernel`` launch over ``arr_cuda0`` on ``device``.

    ``num_elems`` is not used; the launch dimension is the array's size.
    Returns the ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("launch", print=VERBOSE, use_nvtx=USE_NVTX) as launch_timer:
        wp.launch(inc_kernel, dim=arr_cuda0.size, inputs=[arr_cuda0], device=device)
        if DO_SYNC:
            wp.synchronize_device(device)

    return launch_timer.elapsed
226
+
227
+
228
def test_launch_stream(num_elems, device):
    """Time one ``inc_kernel`` launch over ``arr_cuda0`` issued on ``stream0``.

    ``num_elems`` is not used; the launch dimension is the array's size.
    ``device`` is only used for the optional synchronization. Returns the
    ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("launch stream", print=VERBOSE, use_nvtx=USE_NVTX) as launch_timer:
        wp.launch(inc_kernel, dim=arr_cuda0.size, inputs=[arr_cuda0], stream=stream0)
        if DO_SYNC:
            wp.synchronize_device(device)

    return launch_timer.elapsed
241
+
242
+
243
def test_graph(num_elems, device):
    """Time one replay of the captured graph ``graph0``.

    ``num_elems`` is not used. ``device`` is only used for the optional
    synchronization. Returns the ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    with wp.ScopedTimer("graph", print=VERBOSE, use_nvtx=USE_NVTX) as graph_timer:
        wp.capture_launch(graph0)
        if DO_SYNC:
            wp.synchronize_device(device)

    return graph_timer.elapsed
253
+
254
+
255
def test_graph_stream(num_elems, device):
    """Time one replay of the captured graph ``graph0`` issued on ``stream0``.

    ``num_elems`` is not used. ``device`` is only used for the optional
    synchronization. Returns the ``elapsed`` value of the ScopedTimer.
    """
    wp.synchronize()

    # Labelled "graph stream" (not "graph") so verbose output and NVTX ranges can
    # be told apart from test_graph(), matching the "launch"/"launch stream" and
    # "p2p"/"p2p stream" pairs.
    with wp.ScopedTimer("graph stream", print=VERBOSE, use_nvtx=USE_NVTX) as timer:
        wp.capture_launch(graph0, stream=stream0)

        if DO_SYNC:
            wp.synchronize_device(device)

    return timer.elapsed
265
+
266
+
267
# One slot per run for every measured operation. Each name gets its own list.
# memory management
alloc_times, free_times, zeros_times = ([0] * num_runs for _ in range(3))
# host <-> device copies, pageable and pinned
d2h_times, h2d_times = ([0] * num_runs for _ in range(2))
d2h_pinned_times, h2d_pinned_times = ([0] * num_runs for _ in range(2))
# device <-> device copies
d2d_times, p2p_times, p2p_stream_times = ([0] * num_runs for _ in range(3))
# kernel launches and graph replays
launch_times, launch_stream_times = ([0] * num_runs for _ in range(2))
graph_times, graph_stream_times = ([0] * num_runs for _ in range(2))

wp.set_device(cuda0)
283
+
284
# Every measurement runs with the Python garbage collector switched off so a
# collection cannot land inside a timed region.

# alloc
for run in range(num_runs):
    gc.disable()
    alloc_times[run] = test_alloc(num_elems, cuda0, run)
    gc.enable()

# free
for run in range(num_runs):
    gc.disable()
    free_times[run] = test_free(cuda0, run)
    gc.enable()

# zeros
for run in range(num_runs):
    gc.disable()
    zeros_times[run] = test_zeros(num_elems, cuda0, run)
    gc.enable()

# free zeros (untimed cleanup of the arrays created by the zeros pass)
for run in range(num_runs):
    g_allocs[run] = None

# h2d, d2h pageable copy
for run in range(num_runs):
    gc.disable()
    h2d_times[run] = test_h2d(num_elems, cuda0)
    d2h_times[run] = test_d2h(num_elems, cuda0)
    gc.enable()

# h2d, d2h pinned copy
for run in range(num_runs):
    gc.disable()
    h2d_pinned_times[run] = test_h2d_pinned(num_elems, cuda0)
    d2h_pinned_times[run] = test_d2h_pinned(num_elems, cuda0)
    gc.enable()

# d2d copy
for run in range(num_runs):
    gc.disable()
    d2d_times[run] = test_d2d(num_elems, cuda0)
    gc.enable()

# p2p copy (needs a second GPU; otherwise the p2p timings stay at zero)
if cuda_device_count > 1:
    for run in range(num_runs):
        gc.disable()
        p2p_times[run] = test_p2p(num_elems, cuda1, cuda0)
        p2p_stream_times[run] = test_p2p_stream(num_elems, cuda1, cuda0)
        gc.enable()

# launch
for run in range(num_runs):
    gc.disable()
    launch_times[run] = test_launch(num_elems, cuda0)
    launch_stream_times[run] = test_launch_stream(num_elems, cuda0)
    gc.enable()

# graph
for run in range(num_runs):
    gc.disable()
    graph_times[run] = test_graph(num_elems, cuda0)
    graph_stream_times[run] = test_graph_stream(num_elems, cuda0)
    gc.enable()
347
+
348
+
349
def print_stat(name, data, trim=trim_runs):
    """Print the trimmed mean of ``data``, multiplied by 1,000,000, next to ``name``.

    When ``trim`` is positive, the ``trim`` smallest and ``trim`` largest samples
    are dropped before averaging. At least one sample must remain after trimming.
    """
    assert len(data) > 2 * trim
    kept = sorted(data)[trim:-trim] if trim > 0 else data
    print(f"{name:15s} {1000000 * stats.mean(kept):.0f}")
354
+
355
+
356
print("=========================")
# (label, samples) pairs in the order they are reported
for stat_label, stat_samples in (
    ("Alloc", alloc_times),
    ("Free", free_times),
    ("Zeros", zeros_times),
    ("H2D", h2d_times),
    ("D2H", d2h_times),
    ("H2D pinned", h2d_pinned_times),
    ("D2H pinned", d2h_pinned_times),
    ("D2D", d2d_times),
    ("P2P", p2p_times),
    ("P2P stream", p2p_stream_times),
    ("Launch", launch_times),
    ("Launch stream", launch_stream_times),
    ("Graph", graph_times),
    ("Graph stream", graph_stream_times),
):
    print_stat(stat_label, stat_samples)
371
+
372
+
373
+ # ========= profiling ==========
374
+
375
+ # from pyinstrument import Profiler
376
+ # profiler = Profiler()
377
+ # profiler.start()
378
+ # for i in range(10):
379
+ # # test_alloc(num_elems, cuda0)
380
+ # # test_h2d(num_elems, cuda0)
381
+ # test_p2p(num_elems, cuda0, cuda1)
382
+ # profiler.stop()
383
+ # print(profiler.output_text(show_all=True))
@@ -0,0 +1,279 @@
1
+ # Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
2
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ # and proprietary rights in and to this software, related documentation
4
+ # and any modifications thereto. Any use, reproduction, disclosure or
5
+ # distribution of this software and related documentation without an express
6
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+
8
+ # include parent path
9
+ import os
10
+ import sys, getopt
11
+ import numpy as np
12
+ import math
13
+ import ctypes
14
+
15
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
16
+
17
+ from pxr import Usd, UsdGeom, Gf, Sdf
18
+
19
+ import warp as wp
20
+
21
+
22
class Cloth:
    """Rectangular ``dx`` x ``dy`` particle grid connected by springs.

    Particles are laid out row by row in the x/z plane starting at ``lower``,
    spaced ``radius`` apart; particle ``(x, y)`` has index ``y * dx + x``.
    After construction the instance exposes:

    * ``triangles``: flat Python list of vertex indices, two triangles per cell
    * ``positions``, ``velocities``: float32 arrays with one row per particle
    * ``inv_masses``: float32 array, ``1 / mass`` per particle (0 for pinned ones)
    * ``spring_indices``: int32 array holding two particle indices per spring
    * ``spring_lengths``, ``spring_stiffness``, ``spring_damping``: float32 arrays
    * ``num_particles``, ``num_springs``, ``num_tris``: element counts
    """

    def __init__(
        self, lower, dx, dy, radius, stretch_stiffness, bend_stiffness, shear_stiffness, mass, fix_corners=True
    ):
        origin = np.array(lower)

        triangles = []
        positions = []
        velocities = []
        inv_masses = []

        spring_indices = []
        spring_lengths = []
        spring_stiffness = []
        spring_damping = []

        def vertex(col, row):
            # row-major particle index
            return row * dx + col

        def connect(a, b, stiffness, damp=10.0):
            # the rest length is the initial distance between the two particles
            rest_length = np.linalg.norm(positions[a] - positions[b])

            spring_indices.extend((a, b))
            spring_lengths.append(rest_length)
            spring_stiffness.append(stiffness)
            spring_damping.append(damp)

        # particles and triangles
        for row in range(dy):
            for col in range(dx):
                positions.append(origin + radius * np.array((float(col), 0.0, float(row))))
                velocities.append(np.zeros(3))

                if col > 0 and row > 0:
                    # two triangles covering the cell whose far corner is (col, row)
                    near = vertex(col - 1, row - 1)
                    far = vertex(col, row)
                    triangles.extend((near, vertex(col, row - 1), far))
                    triangles.extend((near, far, vertex(col - 1, row)))

                # the two corners of the first row are pinned (zero inverse mass)
                pinned = fix_corners and row == 0 and col in (0, dx - 1)
                inv_masses.append(0.0 if pinned else 1.0 / mass)

        # horizontal stretch/bend springs and both diagonal shear springs
        for row in range(dy):
            for col in range(dx):
                here = vertex(col, row)

                if col > 0:
                    connect(here, here - 1, stretch_stiffness)

                if col > 1 and bend_stiffness > 0.0:
                    connect(here, here - 2, bend_stiffness)

                if row > 0 and col < dx - 1 and shear_stiffness > 0.0:
                    connect(here, here - dx + 1, shear_stiffness)

                if row > 0 and col > 0 and shear_stiffness > 0.0:
                    connect(here, here - dx - 1, shear_stiffness)

        # vertical stretch/bend springs
        for col in range(dx):
            for row in range(dy):
                here = vertex(col, row)

                if row > 0:
                    connect(here, here - dx, stretch_stiffness)

                if row > 1 and bend_stiffness > 0.0:
                    connect(here, here - 2 * dx, bend_stiffness)

        # publish the results; everything except the triangle list becomes an ndarray
        self.triangles = triangles
        self.positions = np.array(positions, dtype=np.float32)
        self.velocities = np.array(velocities, dtype=np.float32)
        self.inv_masses = np.array(inv_masses, dtype=np.float32)
        self.spring_indices = np.array(spring_indices, dtype=np.int32)
        self.spring_lengths = np.array(spring_lengths, dtype=np.float32)
        self.spring_stiffness = np.array(spring_stiffness, dtype=np.float32)
        self.spring_damping = np.array(spring_damping, dtype=np.float32)

        self.num_particles = len(self.positions)
        self.num_springs = len(self.spring_lengths)
        self.num_tris = len(triangles) // 3
118
+
119
+
120
def run_benchmark(mode, dim, timers, render=False):
    """Simulate a ``dim`` x ``dim`` cloth for one second with the selected backend.

    Args:
        mode: Backend selector, one of "warp_cpu", "warp_gpu", "taichi_cpu",
            "taichi_gpu", "numpy", "cupy", "numba", "torch_cpu", "torch_gpu",
            "jax_cpu" or "jax_gpu".
        dim: Number of particles along each side of the cloth grid.
        timers: Dictionary passed to ``wp.ScopedTimer`` (``dict=``) for the
            "Initialization" timer and the per-frame "Dim (<dim>^2)" timer.
        render: When true, the positions of every frame are written to
            ``outputs/benchmark.usd`` next to this file.

    Raises:
        RuntimeError: If ``mode`` is not one of the known backends.
    """
    # simulation parameters
    sim_fps = 60.0
    sim_substeps = 16
    sim_duration = 1.0
    sim_frames = int(sim_duration * sim_fps)
    sim_dt = 1.0 / sim_fps
    sim_time = 0.0

    # spring constants (k_damp is defined but not passed on anywhere)
    k_stretch = 1000.0
    k_shear = 1000.0
    k_bend = 1000.0
    k_damp = 0.0

    cloth = Cloth(
        lower=(0.0, 0.0, 0.0),
        dx=dim,
        dy=dim,
        radius=0.1,
        stretch_stiffness=k_stretch,
        bend_stiffness=k_bend,
        shear_stiffness=k_shear,
        mass=0.1,
        fix_corners=True,
    )

    if render:
        # set up the mesh used for visualization
        usd_path = os.path.join(os.path.dirname(__file__), "outputs/benchmark.usd")
        stage = Usd.Stage.CreateNew(usd_path)
        stage.SetStartTimeCode(0.0)
        stage.SetEndTimeCode(sim_duration * sim_fps)
        stage.SetTimeCodesPerSecond(sim_fps)

        grid = UsdGeom.Mesh.Define(stage, "/root")
        grid.GetPointsAttr().Set(cloth.positions, 0.0)
        grid.GetFaceVertexIndicesAttr().Set(cloth.triangles, 0.0)
        grid.GetFaceVertexCountsAttr().Set([3] * cloth.num_tris, 0.0)

    # Backend modules are imported lazily so only the selected framework has to
    # be installed; the import itself is part of the "Initialization" timing.
    # NOTE(review): the `examples.*` module paths are resolved through sys.path --
    # confirm they still match the package layout.
    with wp.ScopedTimer("Initialization", dict=timers):
        if mode in ("warp_cpu", "warp_gpu"):
            import examples.benchmark_cloth_warp as backend

            integrator = backend.WpIntegrator(cloth, "cpu" if mode == "warp_cpu" else "cuda")

        elif mode in ("taichi_cpu", "taichi_gpu"):
            import examples.benchmark_cloth_taichi as backend

            integrator = backend.TiIntegrator(cloth, "cpu" if mode == "taichi_cpu" else "cuda")

        elif mode == "numpy":
            import examples.benchmark_cloth_numpy as backend

            integrator = backend.NpIntegrator(cloth)

        elif mode == "cupy":
            import examples.benchmark_cloth_cupy as backend

            integrator = backend.CpIntegrator(cloth)

        elif mode == "numba":
            import examples.benchmark_cloth_numba as backend

            integrator = backend.NbIntegrator(cloth)

        elif mode in ("torch_cpu", "torch_gpu"):
            import examples.benchmark_cloth_pytorch as backend

            integrator = backend.TrIntegrator(cloth, "cpu" if mode == "torch_cpu" else "cuda")

        elif mode in ("jax_cpu", "jax_gpu"):
            # the platform has to be selected before the JAX-based module is imported
            os.environ["JAX_PLATFORM_NAME"] = "cpu" if mode == "jax_cpu" else "gpu"

            import examples.benchmark_cloth_jax as backend

            integrator = backend.JxIntegrator(cloth)

        else:
            raise RuntimeError("Unknown simulation backend")

        # run one warm-up iteration to accurately measure initialization time (some engines do lazy init)
        positions = integrator.simulate(sim_dt, sim_substeps)

    label = f"Dim ({dim}^2)"

    # run simulation; only the simulate() call is timed, not the optional USD write
    for _ in range(sim_frames):
        with wp.ScopedTimer(label, dict=timers):
            positions = integrator.simulate(sim_dt, sim_substeps)

        if render:
            grid.GetPointsAttr().Set(positions, sim_time * sim_fps)

        sim_time += sim_dt

    if render:
        stage.Save()
243
+
244
+
245
# record profiling information (filled in by run_benchmark through wp.ScopedTimer)
timers = {}

# the backend comes from the first command-line argument, defaulting to Warp on the GPU
mode = sys.argv[1] if len(sys.argv) > 1 else "warp_gpu"

for cloth_dim in (32, 64, 128):
    run_benchmark(mode, cloth_dim, timers, render=False)
256
+
257
# write results
import csv

# console summary: one line per timer with min / max / mean of its samples
for k, v in timers.items():
    print("{:16} min: {:8.2f} max: {:8.2f} avg: {:8.2f}".format(k, np.min(v), np.max(v), np.mean(v)))

# open() in append mode creates the file but not its folder, so create it first
report_path = os.path.join(os.path.dirname(__file__), "outputs/benchmark.csv")
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# newline="" lets the csv module control line endings itself; the `with` block
# closes the file even if one of the timer lookups below raises
with open(report_path, "a", newline="") as report:
    writer = csv.writer(report, delimiter=",")

    # a fresh (empty) report gets a header row before the first result
    if report.tell() == 0:
        writer.writerow(["Name", "Init", "Dim (32^2)", "Dim (64^2)", "Dim (128^2)"])

    # one row per invocation: slowest initialization and mean frame time per size
    writer.writerow(
        [
            mode,
            np.max(timers["Initialization"]),
            np.mean(timers["Dim (32^2)"]),
            np.mean(timers["Dim (64^2)"]),
            np.mean(timers["Dim (128^2)"]),
        ]
    )