warp-lang 1.4.2-py3-none-manylinux2014_x86_64.whl → 1.5.0-py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (158)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1783 -2
  8. warp/codegen.py +177 -45
  9. warp/config.py +2 -2
  10. warp/context.py +321 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/fem/example_adaptive_grid.py +37 -10
  17. warp/examples/fem/example_apic_fluid.py +3 -2
  18. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  19. warp/examples/fem/example_deformed_geometry.py +1 -1
  20. warp/examples/fem/example_diffusion_3d.py +47 -4
  21. warp/examples/fem/example_distortion_energy.py +220 -0
  22. warp/examples/fem/example_magnetostatics.py +127 -85
  23. warp/examples/fem/example_nonconforming_contact.py +5 -5
  24. warp/examples/fem/example_stokes.py +3 -1
  25. warp/examples/fem/example_streamlines.py +12 -19
  26. warp/examples/fem/utils.py +38 -15
  27. warp/examples/sim/example_cloth.py +2 -25
  28. warp/examples/sim/example_quadruped.py +2 -1
  29. warp/examples/tile/example_tile_convolution.py +58 -0
  30. warp/examples/tile/example_tile_fft.py +47 -0
  31. warp/examples/tile/example_tile_filtering.py +105 -0
  32. warp/examples/tile/example_tile_matmul.py +79 -0
  33. warp/examples/tile/example_tile_mlp.py +375 -0
  34. warp/fem/__init__.py +8 -0
  35. warp/fem/cache.py +16 -12
  36. warp/fem/dirichlet.py +1 -1
  37. warp/fem/domain.py +44 -1
  38. warp/fem/field/__init__.py +1 -2
  39. warp/fem/field/field.py +31 -19
  40. warp/fem/field/nodal_field.py +101 -49
  41. warp/fem/field/virtual.py +794 -0
  42. warp/fem/geometry/__init__.py +2 -2
  43. warp/fem/geometry/deformed_geometry.py +3 -105
  44. warp/fem/geometry/element.py +13 -0
  45. warp/fem/geometry/geometry.py +165 -5
  46. warp/fem/geometry/grid_2d.py +3 -6
  47. warp/fem/geometry/grid_3d.py +31 -28
  48. warp/fem/geometry/hexmesh.py +3 -46
  49. warp/fem/geometry/nanogrid.py +3 -2
  50. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  51. warp/fem/geometry/tetmesh.py +2 -43
  52. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  53. warp/fem/integrate.py +683 -261
  54. warp/fem/linalg.py +404 -0
  55. warp/fem/operator.py +101 -18
  56. warp/fem/polynomial.py +5 -5
  57. warp/fem/quadrature/quadrature.py +45 -21
  58. warp/fem/space/__init__.py +45 -11
  59. warp/fem/space/basis_function_space.py +451 -0
  60. warp/fem/space/basis_space.py +58 -11
  61. warp/fem/space/function_space.py +146 -5
  62. warp/fem/space/grid_2d_function_space.py +80 -66
  63. warp/fem/space/grid_3d_function_space.py +113 -68
  64. warp/fem/space/hexmesh_function_space.py +96 -108
  65. warp/fem/space/nanogrid_function_space.py +62 -110
  66. warp/fem/space/quadmesh_function_space.py +208 -0
  67. warp/fem/space/shape/__init__.py +45 -7
  68. warp/fem/space/shape/cube_shape_function.py +328 -54
  69. warp/fem/space/shape/shape_function.py +10 -1
  70. warp/fem/space/shape/square_shape_function.py +328 -60
  71. warp/fem/space/shape/tet_shape_function.py +269 -19
  72. warp/fem/space/shape/triangle_shape_function.py +238 -19
  73. warp/fem/space/tetmesh_function_space.py +69 -37
  74. warp/fem/space/topology.py +38 -0
  75. warp/fem/space/trimesh_function_space.py +179 -0
  76. warp/fem/utils.py +6 -331
  77. warp/jax_experimental.py +3 -1
  78. warp/native/array.h +15 -0
  79. warp/native/builtin.h +66 -26
  80. warp/native/bvh.h +4 -0
  81. warp/native/coloring.cpp +600 -0
  82. warp/native/cuda_util.cpp +14 -0
  83. warp/native/cuda_util.h +2 -1
  84. warp/native/fabric.h +8 -0
  85. warp/native/hashgrid.h +4 -0
  86. warp/native/marching.cu +8 -0
  87. warp/native/mat.h +14 -3
  88. warp/native/mathdx.cpp +59 -0
  89. warp/native/mesh.h +4 -0
  90. warp/native/range.h +13 -1
  91. warp/native/reduce.cpp +9 -1
  92. warp/native/reduce.cu +7 -0
  93. warp/native/runlength_encode.cpp +9 -1
  94. warp/native/runlength_encode.cu +7 -1
  95. warp/native/scan.cpp +8 -0
  96. warp/native/scan.cu +8 -0
  97. warp/native/scan.h +8 -1
  98. warp/native/sparse.cpp +8 -0
  99. warp/native/sparse.cu +8 -0
  100. warp/native/temp_buffer.h +7 -0
  101. warp/native/tile.h +1857 -0
  102. warp/native/tile_gemm.h +341 -0
  103. warp/native/tile_reduce.h +210 -0
  104. warp/native/volume_builder.cu +8 -0
  105. warp/native/volume_builder.h +8 -0
  106. warp/native/warp.cpp +10 -2
  107. warp/native/warp.cu +369 -15
  108. warp/native/warp.h +12 -2
  109. warp/optim/adam.py +39 -4
  110. warp/paddle.py +29 -12
  111. warp/render/render_opengl.py +137 -65
  112. warp/sim/graph_coloring.py +292 -0
  113. warp/sim/integrator_euler.py +4 -2
  114. warp/sim/integrator_featherstone.py +115 -44
  115. warp/sim/integrator_vbd.py +6 -0
  116. warp/sim/model.py +88 -15
  117. warp/stubs.py +569 -4
  118. warp/tape.py +12 -7
  119. warp/tests/assets/pixel.npy +0 -0
  120. warp/tests/aux_test_instancing_gc.py +18 -0
  121. warp/tests/test_array.py +39 -0
  122. warp/tests/test_codegen.py +81 -1
  123. warp/tests/test_codegen_instancing.py +30 -0
  124. warp/tests/test_collision.py +110 -0
  125. warp/tests/test_coloring.py +241 -0
  126. warp/tests/test_context.py +34 -0
  127. warp/tests/test_examples.py +18 -4
  128. warp/tests/test_fem.py +453 -113
  129. warp/tests/test_func.py +13 -0
  130. warp/tests/test_generics.py +52 -0
  131. warp/tests/test_iter.py +68 -0
  132. warp/tests/test_mat_scalar_ops.py +1 -1
  133. warp/tests/test_mesh_query_point.py +1 -1
  134. warp/tests/test_module_hashing.py +23 -0
  135. warp/tests/test_paddle.py +27 -87
  136. warp/tests/test_print.py +56 -1
  137. warp/tests/test_spatial.py +1 -1
  138. warp/tests/test_tile.py +700 -0
  139. warp/tests/test_tile_mathdx.py +144 -0
  140. warp/tests/test_tile_mlp.py +383 -0
  141. warp/tests/test_tile_reduce.py +374 -0
  142. warp/tests/test_tile_shared_memory.py +190 -0
  143. warp/tests/test_vbd.py +12 -20
  144. warp/tests/test_volume.py +43 -0
  145. warp/tests/unittest_suites.py +19 -2
  146. warp/tests/unittest_utils.py +4 -0
  147. warp/types.py +338 -72
  148. warp/utils.py +22 -1
  149. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  150. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/RECORD +153 -126
  151. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  152. warp/fem/field/test.py +0 -180
  153. warp/fem/field/trial.py +0 -183
  154. warp/fem/space/collocated_function_space.py +0 -102
  155. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  156. warp/fem/space/trimesh_2d_function_space.py +0 -153
  157. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  158. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
warp/tests/test_tile_reduce.py ADDED
@@ -0,0 +1,374 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ import unittest
+
+ import numpy as np
+
+ import warp as wp
+ from warp.tests.unittest_utils import *
+
+ TILE_M = wp.constant(8)
+ TILE_N = wp.constant(4)
+ TILE_K = wp.constant(8)
+
+ # num threads per-tile
+ TILE_DIM = 64
+
+
+ @wp.kernel
+ def tile_sum_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+     # output tile index
+     i = wp.tid()
+
+     n = input.shape[1]
+     count = int(n / TILE_DIM)
+
+     s = wp.tile_zeros(m=1, n=1, dtype=float)
+
+     for j in range(count):
+         a = wp.tile_load(input, i, j, m=1, n=TILE_DIM)
+         s += wp.tile_sum(a) * 0.5
+
+     wp.tile_store(output, i, s)
+
+
+ def test_tile_reduce_sum(test, device):
+     batch_count = 56
+
+     N = TILE_DIM * 3
+
+     rng = np.random.default_rng(42)
+     input = rng.random((batch_count, N), dtype=np.float32)
+
+     input_wp = wp.array(input, requires_grad=True, device=device)
+     output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(
+             tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+         )
+
+     sum_wp = output_wp.numpy()
+     for i in range(batch_count):
+         sum_np = np.sum(input[i]) * 0.5
+         test.assertAlmostEqual(sum_wp[i], sum_np, places=4)
+
+     output_wp.grad.fill_(1.0)
+
+     tape.backward()
+
+     assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.0e-4)
+
+
+ @wp.kernel
+ def tile_min_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+     # output tile index
+     i = wp.tid()
+
+     a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)
+     m = wp.tile_min(a)
+
+     wp.tile_store(output, i, m)
+
+
+ def test_tile_reduce_min(test, device):
+     batch_count = 56
+
+     N = TILE_DIM
+
+     rng = np.random.default_rng(42)
+     input = rng.random((batch_count, N), dtype=np.float32)
+
+     input_wp = wp.array(input, requires_grad=True, device=device)
+     output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(
+             tile_min_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+         )
+
+     min_wp = output_wp.numpy()
+     for i in range(batch_count):
+         min_np = np.min(input[i])
+         test.assertAlmostEqual(min_wp[i], min_np, places=4)
+
+
+ @wp.kernel
+ def tile_max_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+     # output tile index
+     i = wp.tid()
+
+     a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)
+     m = wp.tile_max(a)
+
+     wp.tile_store(output, i, m)
+
+
+ def test_tile_reduce_max(test, device):
+     batch_count = 56
+
+     N = TILE_DIM
+
+     rng = np.random.default_rng(42)
+     input = rng.random((batch_count, N), dtype=np.float32)
+
+     input_wp = wp.array(input, requires_grad=True, device=device)
+     output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(
+             tile_max_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+         )
+
+     max_wp = output_wp.numpy()
+     for i in range(batch_count):
+         max_np = np.max(input[i])
+         test.assertAlmostEqual(max_wp[i], max_np, places=4)
+
+
+ @wp.kernel
+ def tile_reduce_custom_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+     # output tile index
+     i = wp.tid()
+
+     a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)
+     m = wp.tile_reduce(wp.mul, a)
+
+     wp.tile_store(output, i, m)
+
+
+ def test_tile_reduce_custom(test, device):
+     batch_count = 56
+
+     N = TILE_DIM
+
+     rng = np.random.default_rng(42)
+     input = rng.random((batch_count, N), dtype=np.float32)
+
+     input_wp = wp.array(input, requires_grad=True, device=device)
+     output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(
+             tile_reduce_custom_kernel,
+             dim=[batch_count],
+             inputs=[input_wp, output_wp],
+             block_dim=TILE_DIM,
+             device=device,
+         )
+
+     prod_wp = output_wp.numpy()
+     for i in range(batch_count):
+         prod_np = np.prod(input[i])
+         test.assertAlmostEqual(prod_wp[i], prod_np, places=4)
+
+
+ @wp.kernel
+ def tile_grouped_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)):
+     # output tile index
+     i = wp.tid()
+
+     a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N)
+     s = wp.tile_sum(a) * 0.5
+
+     wp.tile_store(output, i, s)
+
+
+ def test_tile_reduce_grouped_sum(test, device):
+     batch_count = 56
+
+     M = TILE_M
+     N = TILE_N
+
+     rng = np.random.default_rng(42)
+     input = rng.random((batch_count, M, N), dtype=np.float32)
+
+     input_wp = wp.array(input, requires_grad=True, device=device)
+     output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(
+             tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+         )
+
+     sum_wp = output_wp.numpy()
+     for i in range(batch_count):
+         sum_np = np.sum(input[i]) * 0.5
+         test.assertAlmostEqual(sum_wp[i], sum_np, places=4)
+
+     output_wp.grad.fill_(1.0)
+
+     tape.backward()
+
+     assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.0e-4)
+
+
+ @wp.kernel
+ def tile_reduce_simt_kernel(output: wp.array(dtype=int)):
+     # thread index
+     i = wp.tid()
+
+     t = wp.tile(i)  # convert to block wide tile
+     s = wp.tile_sum(t)  # sum over block
+
+     # update global sum
+     wp.tile_atomic_add(output, 0, 0, s)
+
+
+ def test_tile_reduce_simt(test, device):
+     # use an unaligned grid dimension
+     N = TILE_DIM * 4 + 5
+
+     output = wp.zeros(shape=1, dtype=int, requires_grad=True, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch(tile_reduce_simt_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device)
+
+     test.assertEqual(output.numpy()[0], np.sum(np.arange(N)))
+
+
+ @wp.kernel
+ def tile_untile_kernel(output: wp.array(dtype=int)):
+     # thread index
+     i = wp.tid()
+
+     # convert to block wide tile
+     t = wp.tile(i) * 2
+     s = wp.untile(t)
+
+     output[i] = s
+
+
+ def test_tile_untile(test, device):
+     # use an unaligned grid dimension
+     N = TILE_DIM * 4 + 5
+
+     output = wp.zeros(shape=N, dtype=int, requires_grad=True, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch(tile_untile_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device)
+
+     assert_np_equal(output.numpy(), np.arange(N) * 2)
+
+
+ @wp.kernel
+ def tile_untile_scalar_kernel(output: wp.array(dtype=int)):
+     # thread index
+     i = wp.tid()
+
+     # convert to block wide tile
+     t = wp.tile(i) * 2
+     s = wp.untile(t)
+
+     output[i] = s
+
+
+ def test_tile_untile_scalar(test, device):
+     # use an unaligned grid dimension
+     N = TILE_DIM * 4 + 5
+
+     output = wp.zeros(shape=N, dtype=int, requires_grad=True, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch(tile_untile_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device)
+
+     assert_np_equal(output.numpy(), np.arange(N) * 2)
+
+
+ @wp.kernel
+ def test_untile_vector_kernel(input: wp.array(dtype=wp.vec3), output: wp.array(dtype=wp.vec3)):
+     i = wp.tid()
+
+     v = input[i] * 0.5
+
+     t = wp.tile(v)
+     u = wp.untile(t)
+
+     output[i] = u * 2.0
+
+
+ def test_tile_untile_vector(test, device):
+     input = wp.full(16, wp.vec3(1.0, 2.0, 3.0), requires_grad=True, device=device)
+     output = wp.zeros_like(input, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch(test_untile_vector_kernel, dim=16, inputs=[input, output], block_dim=16, device=device)
+
+     output.grad = wp.ones_like(output, device=device)
+     tape.backward()
+
+     assert_np_equal(output.numpy(), input.numpy())
+     assert_np_equal(input.grad.numpy(), np.ones((16, 3)))
+
+
+ @wp.kernel
+ def tile_ones_kernel(out: wp.array(dtype=float)):
+     i = wp.tid()
+
+     t = wp.tile_ones(dtype=float, m=16, n=16)
+     s = wp.tile_sum(t)
+
+     wp.tile_store(out, 0, s)
+
+
+ def test_tile_ones(test, device):
+     output = wp.zeros(1, dtype=float, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(tile_ones_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device)
+
+     test.assertAlmostEqual(output.numpy()[0], 256.0)
+
+
+ @wp.kernel
+ def tile_arange_kernel(out: wp.array2d(dtype=int)):
+     i = wp.tid()
+
+     a = wp.tile_arange(17, dtype=int)
+     b = wp.tile_arange(5, 23, dtype=int)
+     c = wp.tile_arange(0, 34, 2, dtype=int)
+
+     wp.tile_store(out, 0, 0, a)
+     wp.tile_store(out, 1, 0, b)
+     wp.tile_store(out, 2, 0, c)
+
+
+ def test_tile_arange(test, device):
+     N = 17
+
+     output = wp.zeros(shape=(3, N), dtype=int, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(tile_arange_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device)
+
+     assert_np_equal(output.numpy()[0], np.arange(17))
+     assert_np_equal(output.numpy()[1], np.arange(5, 22))
+     assert_np_equal(output.numpy()[2], np.arange(0, 34, 2))
+
+
+ devices = get_cuda_test_devices()
+
+
+ class TestTileReduce(unittest.TestCase):
+     pass
+
+
+ add_function_test(TestTileReduce, "test_tile_reduce_sum", test_tile_reduce_sum, devices=devices)
+ add_function_test(TestTileReduce, "test_tile_reduce_min", test_tile_reduce_min, devices=devices)
+ add_function_test(TestTileReduce, "test_tile_reduce_max", test_tile_reduce_max, devices=devices)
+ add_function_test(TestTileReduce, "test_tile_reduce_custom", test_tile_reduce_custom, devices=devices)
+ add_function_test(TestTileReduce, "test_tile_reduce_grouped_sum", test_tile_reduce_sum, devices=devices)
+ add_function_test(TestTileReduce, "test_tile_reduce_simt", test_tile_reduce_simt, devices=devices)
+ add_function_test(TestTileReduce, "test_tile_ones", test_tile_ones, devices=devices)
+ add_function_test(TestTileReduce, "test_tile_arange", test_tile_arange, devices=devices)
+ add_function_test(TestTileReduce, "test_tile_untile_scalar", test_tile_untile_scalar, devices=devices)
+ add_function_test(TestTileReduce, "test_tile_untile_vector", test_tile_untile_vector, devices=devices)
+
+ if __name__ == "__main__":
+     wp.clear_kernel_cache()
+     unittest.main(verbosity=2, failfast=True)
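For context: the tile primitives exercised by this new test file follow one pattern. Each block cooperatively loads a tile with wp.tile_load, reduces it with wp.tile_sum / wp.tile_min / wp.tile_max (or wp.tile_reduce with a custom operator such as wp.mul), and writes the result back with wp.tile_store, launched via the new wp.launch_tiled. A minimal sketch distilled from the tests above (hypothetical shapes; the tests only register CUDA devices via get_cuda_test_devices(), so a GPU is assumed):

import numpy as np
import warp as wp

TILE_DIM = 64  # threads cooperating on each tile

@wp.kernel
def row_sum(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
    i = wp.tid()                                    # one tile per input row
    a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)  # cooperative load of row i
    s = wp.tile_sum(a)                              # block-wide reduction to a 1x1 tile
    wp.tile_store(output, i, s)

x = wp.array(np.random.rand(8, TILE_DIM).astype(np.float32))
y = wp.zeros(8, dtype=float)
# launch_tiled assigns TILE_DIM threads to each of the 8 tiles
wp.launch_tiled(row_sum, dim=[8], inputs=[x, y], block_dim=TILE_DIM)
print(y.numpy())  # expected to match x.numpy().sum(axis=1)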
warp/tests/test_tile_shared_memory.py ADDED
@@ -0,0 +1,190 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ import unittest
+
+ import numpy as np
+
+ import warp as wp
+ from warp.tests.unittest_utils import *
+
+
+ # checks that we can configure shared memory to the expected size
+ def test_tile_shared_mem_size(test, device):
+     DIM_M = 32
+     DIM_N = 32
+
+     BLOCK_DIM = 256
+
+     @wp.kernel
+     def compute(out: wp.array2d(dtype=float)):
+         a = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared")
+         b = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared") * 2.0
+
+         c = a + b
+         wp.tile_store(out, 0, 0, c)
+
+     out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+
+     wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+
+     # check output
+     assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
+
+     # check required shared memory
+     expected_forward_bytes = DIM_M * DIM_N * 4 * 2
+     expected_backward_bytes = expected_forward_bytes * 2
+
+     # check shared memory for kernel on the device
+     module_exec = compute.module.load(device, BLOCK_DIM)
+     hooks = module_exec.get_kernel_hooks(compute)
+
+     assert hooks.forward_smem_bytes == expected_forward_bytes
+     assert hooks.backward_smem_bytes == expected_backward_bytes
+
+
+ # checks that we can configure shared memory > 48kb default
+ def test_tile_shared_mem_large(test, device):
+     # set dimensions that require 64kb for the forward kernel
+     DIM_M = 64
+     DIM_N = 128
+
+     BLOCK_DIM = 256
+
+     # we disable backward kernel gen since 128k is not supported on most architectures
+     @wp.kernel(enable_backward=False)
+     def compute(out: wp.array2d(dtype=float)):
+         a = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared")
+         b = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared") * 2.0
+
+         c = a + b
+         wp.tile_store(out, 0, 0, c)
+
+     out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+
+     wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+
+     # check output
+     assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
+
+     # check required shared memory
+     expected_forward_bytes = DIM_M * DIM_N * 4 * 2
+     expected_backward_bytes = expected_forward_bytes * 2
+
+     assert expected_forward_bytes == 2**16
+
+     # check shared memory for kernel on the device
+     module_exec = compute.module.load(device, BLOCK_DIM)
+     hooks = module_exec.get_kernel_hooks(compute)
+
+     assert hooks.forward_smem_bytes == expected_forward_bytes
+     assert hooks.backward_smem_bytes == expected_backward_bytes
+
+
+ # checks that we can configure dynamic shared memory during graph capture
+ def test_tile_shared_mem_graph(test, device):
+     DIM_M = 32
+     DIM_N = 32
+
+     BLOCK_DIM = 256
+
+     @wp.kernel
+     def compute(out: wp.array2d(dtype=float)):
+         a = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared")
+         b = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared") * 2.0
+
+         c = a + b
+         wp.tile_store(out, 0, 0, c)
+
+     out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+
+     wp.load_module(device=device)
+
+     wp.capture_begin(device, force_module_load=False)
+     wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+     graph = wp.capture_end(device)
+
+     wp.capture_launch(graph)
+
+     # check output
+     assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
+
+     # check required shared memory
+     expected_forward_bytes = DIM_M * DIM_N * 4 * 2
+     expected_backward_bytes = expected_forward_bytes * 2
+
+     # check shared memory for kernel on the device
+     module_exec = compute.module.load(device, BLOCK_DIM)
+     hooks = module_exec.get_kernel_hooks(compute)
+
+     assert hooks.forward_smem_bytes == expected_forward_bytes
+     assert hooks.backward_smem_bytes == expected_backward_bytes
+
+
+ # checks that stack allocations work for user functions
+ def test_tile_shared_mem_func(test, device):
+     DIM_M = 32
+     DIM_N = 32
+
+     BLOCK_DIM = 256
+
+     @wp.func
+     def add_tile_small():
+         a = wp.tile_ones(16, 16, dtype=float, storage="shared")
+         b = wp.tile_ones(16, 16, dtype=float, storage="shared") * 2.0
+
+         return a + b
+
+     @wp.func
+     def add_tile_big():
+         a = wp.tile_ones(64, 64, dtype=float, storage="shared")
+         b = wp.tile_ones(64, 64, dtype=float, storage="shared") * 2.0
+
+         return a + b
+
+     @wp.kernel
+     def compute(out: wp.array2d(dtype=float)):
+         s = add_tile_small()
+         b = add_tile_big()
+
+         wp.tile_store(out, 0, 0, b)
+
+     out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+
+     wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+
+     # check shared memory for kernel on the device
+     module_exec = compute.module.load(device, BLOCK_DIM)
+     hooks = module_exec.get_kernel_hooks(compute)
+
+     # ensure that total required dynamic shared is the larger of the two tiles
+     expected_required_shared = 64 * 64 * 4 * 2
+
+     assert hooks.forward_smem_bytes == expected_required_shared
+     assert hooks.backward_smem_bytes == expected_required_shared * 2
+
+
+ devices = get_cuda_test_devices()
+
+
+ class TestTileSharedMemory(unittest.TestCase):
+     pass
+
+
+ add_function_test(
+     TestTileSharedMemory, "test_tile_shared_mem_size", test_tile_shared_mem_size, devices=devices, check_output=False
+ )
+ add_function_test(
+     TestTileSharedMemory, "test_tile_shared_mem_large", test_tile_shared_mem_large, devices=devices, check_output=False
+ )
+ add_function_test(TestTileSharedMemory, "test_tile_shared_mem_graph", test_tile_shared_mem_graph, devices=devices)
+ add_function_test(TestTileSharedMemory, "test_tile_shared_mem_func", test_tile_shared_mem_func, devices=devices)
+
+
+ if __name__ == "__main__":
+     wp.clear_kernel_cache()
+     unittest.main(verbosity=2, failfast=True)
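The byte counts asserted in these tests follow directly from the tile shapes: each storage="shared" tile occupies m * n * sizeof(dtype) bytes, and the generated backward kernel roughly doubles the requirement to hold adjoint tiles. A short worked version of that arithmetic (a sketch restating the tests' own numbers, not a separate API):

DIM_M, DIM_N = 32, 32
BYTES_PER_FLOAT = 4  # float32
NUM_TILES = 2        # tiles a and b, both storage="shared"

forward_bytes = DIM_M * DIM_N * BYTES_PER_FLOAT * NUM_TILES  # 8192 bytes
backward_bytes = forward_bytes * 2                           # 16384 bytes with adjoints

# the "large" test picks 64 x 128 so the forward pass needs exactly 64 KiB,
# beyond the 48 KB default shared-memory limit on most GPUs:
assert 64 * 128 * BYTES_PER_FLOAT * NUM_TILES == 2**16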
warp/tests/test_vbd.py CHANGED
@@ -5,6 +5,8 @@
  # distribution of this software and related documentation without an express
  # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+ import contextlib
+ import io
  import unittest
 
  import warp as wp
@@ -287,14 +289,6 @@ class VBDClothSim:
              89, 99, 100
          ]
 
-         self.coloring = [
-             [9, 12, 17, 24, 31, 38, 43, 46, 50, 62, 65, 68, 80, 84, 89, 92],
-             [6, 20, 25, 32, 37, 44, 51, 56, 59, 63, 70, 75, 82, 88, 90, 94, 96],
-             [2, 8, 10, 14, 26, 29, 33, 40, 48, 52, 55, 67, 73, 79, 86, 91, 98],
-             [4, 11, 16, 23, 28, 30, 35, 42, 49, 54, 57, 71, 74, 76, 78, 93, 97],
-             [3, 15, 18, 22, 34, 36, 39, 41, 53, 58, 60, 66, 72, 85, 99, 0, 87],
-             [7, 21, 27, 45, 47, 61, 64, 69, 77, 81, 83, 95, 1, 5, 13, 19],
-         ]
          # fmt: on
 
          self.dt = 1 / 60
@@ -323,6 +317,7 @@ class VBDClothSim:
              tri_ka=stiffness,
              tri_kd=kd,
          )
+         builder.color()
 
          self.model = builder.finalize(device=device)
          self.model.ground = True
@@ -331,11 +326,6 @@ class VBDClothSim:
          self.model.soft_contact_ke = 1.0e4
          self.model.soft_contact_kd = 1.0e2
 
-         coloring_wp = []
-         for color in self.coloring:
-             coloring_wp.append(wp.array(color, dtype=wp.int32, device=self.model.device))
-         self.model.coloring = coloring_wp
-
          self.dt = self.dt / self.num_substeps
          self.fixed_particles = [0, 9]
 
@@ -367,19 +357,21 @@ class VBDClothSim:
          model.particle_flags = wp.array(flags, device=model.device)
 
 
- def test_vbd_cloth(test, device):
-     example = VBDClothSim(device)
-     example.run(test)
-
-
  devices = get_test_devices()
 
 
  class TestVBD(unittest.TestCase):
-     pass
+     def test_vbd_cloth(self):
+         for device in devices:
+             with contextlib.redirect_stdout(io.StringIO()) as f:
+                 example = VBDClothSim(device)
+             self.assertRegex(
+                 f.getvalue(),
+                 r"Warp UserWarning: The graph is not optimizable anymore, terminated with a max/min ratio: 2.0 without reaching the target ratio: 1.1",
+             )
 
+             example.run(self)
 
- add_function_test(TestVBD, "test_vbd_cloth", test_vbd_cloth, devices=devices)
 
 
  if __name__ == "__main__":
      wp.clear_kernel_cache()
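The change above swaps VBDClothSim's hand-written coloring table for the new ModelBuilder.color() API (backed by the new warp/sim/graph_coloring.py and warp/native/coloring.cpp in this release). A minimal sketch of the updated workflow, assuming a builder already populated with cloth particles and triangles; VBDIntegrator is shown as the eventual consumer of the coloring:

import warp as wp
import warp.sim

builder = wp.sim.ModelBuilder()
# ... add a cloth grid / particles / triangles as VBDClothSim does ...
builder.color()                            # compute the particle graph coloring
model = builder.finalize(device="cuda:0")
integrator = wp.sim.VBDIntegrator(model)   # uses the coloring stored on the model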
warp/tests/test_volume.py CHANGED
@@ -5,6 +5,8 @@
  # distribution of this software and related documentation without an express
  # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+ import os
+ import tempfile
  import unittest
  from typing import Any
 
@@ -890,6 +892,46 @@ def test_volume_aniso_transform(test, device):
      assert_np_equal(transform, np.array(volume.get_grid_info().transform_matrix).reshape(3, 3))
 
 
+ def test_volume_write(test, device):
+     codecs = ["none", "zip", "blosc"]
+     try:
+         import blosc  # noqa: F401 I001
+     except ImportError:
+         codecs.pop()
+
+     for volume_name in ("float", "vec3f", "index"):
+         for codec in codecs:
+             with test.subTest(volume_name=volume_name, codec=codec):
+                 volume = volumes[volume_name][device.alias]
+                 fd, file_path = tempfile.mkstemp(suffix=".nvdb")
+                 os.close(fd)
+                 try:
+                     volume.save_to_nvdb(file_path, codec=codec)
+                     with open(file_path, "rb") as f:
+                         volume_2 = wp.Volume.load_from_nvdb(f)
+                     next_volume = volume
+                     while next_volume:
+                         np.testing.assert_array_equal(next_volume.array().numpy(), volume_2.array().numpy())
+                         next_volume = next_volume.load_next_grid()
+                         volume_2 = volume_2.load_next_grid()
+
+                 finally:
+                     os.remove(file_path)
+
+     with test.subTest(volume_write="unsupported"):
+         volume = volumes["index"][device.alias]
+         volume = volume.load_next_grid()
+
+         fd, file_path = tempfile.mkstemp(suffix=".nvdb")
+         os.close(fd)
+
+         try:
+             with test.assertRaises(RuntimeError):
+                 volume.save_to_nvdb(file_path, codec=codec)
+         finally:
+             os.remove(file_path)
+
+
  class TestVolume(unittest.TestCase):
      def test_volume_new_del(self):
          # test the scenario in which a volume is created but not initialized before gc
@@ -930,6 +972,7 @@ add_function_test(
  add_function_test(TestVolume, "test_volume_multiple_grids", test_volume_multiple_grids, devices=devices)
  add_function_test(TestVolume, "test_volume_feature_array", test_volume_feature_array, devices=devices)
  add_function_test(TestVolume, "test_volume_sample_index", test_volume_sample_index, devices=devices)
+ add_function_test(TestVolume, "test_volume_write", test_volume_write, devices=[wp.get_device("cpu")])
 
  points = {}
  points_jittered = {}
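The new test_volume_write covers the other user-facing addition here: an NVDB save/load round trip via volume.save_to_nvdb(path, codec=...) with codec "none", "zip", or "blosc" (blosc only when the Python package is installed). A minimal sketch of that round trip, assuming an existing wp.Volume on the CPU (the test is registered for the CPU device only):

import os
import tempfile

import warp as wp

def nvdb_roundtrip(volume: wp.Volume) -> wp.Volume:
    fd, path = tempfile.mkstemp(suffix=".nvdb")
    os.close(fd)
    try:
        volume.save_to_nvdb(path, codec="zip")  # serialize, optionally compressed
        with open(path, "rb") as f:
            return wp.Volume.load_from_nvdb(f)  # read the grid back
    finally:
        os.remove(path)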