warp-lang 1.7.2rc1-py3-none-manylinux_2_34_aarch64.whl → 1.8.0-py3-none-manylinux_2_34_aarch64.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- warp/__init__.py +3 -1
- warp/__init__.pyi +3489 -1
- warp/autograd.py +45 -122
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +241 -252
- warp/build_dll.py +125 -26
- warp/builtins.py +1907 -384
- warp/codegen.py +257 -101
- warp/config.py +12 -1
- warp/constants.py +1 -1
- warp/context.py +657 -223
- warp/dlpack.py +1 -1
- warp/examples/benchmarks/benchmark_cloth.py +2 -2
- warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
- warp/examples/core/example_sample_mesh.py +1 -1
- warp/examples/core/example_spin_lock.py +93 -0
- warp/examples/core/example_work_queue.py +118 -0
- warp/examples/fem/example_adaptive_grid.py +5 -5
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +1 -1
- warp/examples/fem/example_convection_diffusion.py +9 -6
- warp/examples/fem/example_darcy_ls_optimization.py +489 -0
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion.py +2 -2
- warp/examples/fem/example_diffusion_3d.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_elastic_shape_optimization.py +387 -0
- warp/examples/fem/example_magnetostatics.py +5 -3
- warp/examples/fem/example_mixed_elasticity.py +5 -3
- warp/examples/fem/example_navier_stokes.py +11 -9
- warp/examples/fem/example_nonconforming_contact.py +5 -3
- warp/examples/fem/example_streamlines.py +8 -3
- warp/examples/fem/utils.py +9 -8
- warp/examples/interop/example_jax_ffi_callback.py +2 -2
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/sim/example_cloth.py +1 -1
- warp/examples/sim/example_cloth_self_contact.py +48 -54
- warp/examples/tile/example_tile_block_cholesky.py +502 -0
- warp/examples/tile/example_tile_cholesky.py +2 -1
- warp/examples/tile/example_tile_convolution.py +1 -1
- warp/examples/tile/example_tile_filtering.py +1 -1
- warp/examples/tile/example_tile_matmul.py +1 -1
- warp/examples/tile/example_tile_mlp.py +2 -0
- warp/fabric.py +7 -7
- warp/fem/__init__.py +5 -0
- warp/fem/adaptivity.py +1 -1
- warp/fem/cache.py +152 -63
- warp/fem/dirichlet.py +2 -2
- warp/fem/domain.py +136 -6
- warp/fem/field/field.py +141 -99
- warp/fem/field/nodal_field.py +85 -39
- warp/fem/field/virtual.py +97 -52
- warp/fem/geometry/adaptive_nanogrid.py +91 -86
- warp/fem/geometry/closest_point.py +13 -0
- warp/fem/geometry/deformed_geometry.py +102 -40
- warp/fem/geometry/element.py +56 -2
- warp/fem/geometry/geometry.py +323 -22
- warp/fem/geometry/grid_2d.py +157 -62
- warp/fem/geometry/grid_3d.py +116 -20
- warp/fem/geometry/hexmesh.py +86 -20
- warp/fem/geometry/nanogrid.py +166 -86
- warp/fem/geometry/partition.py +59 -25
- warp/fem/geometry/quadmesh.py +86 -135
- warp/fem/geometry/tetmesh.py +47 -119
- warp/fem/geometry/trimesh.py +77 -270
- warp/fem/integrate.py +107 -52
- warp/fem/linalg.py +25 -58
- warp/fem/operator.py +124 -27
- warp/fem/quadrature/pic_quadrature.py +36 -14
- warp/fem/quadrature/quadrature.py +40 -16
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +66 -46
- warp/fem/space/basis_space.py +17 -4
- warp/fem/space/dof_mapper.py +1 -1
- warp/fem/space/function_space.py +2 -2
- warp/fem/space/grid_2d_function_space.py +4 -1
- warp/fem/space/hexmesh_function_space.py +4 -2
- warp/fem/space/nanogrid_function_space.py +3 -1
- warp/fem/space/partition.py +11 -2
- warp/fem/space/quadmesh_function_space.py +4 -1
- warp/fem/space/restriction.py +5 -2
- warp/fem/space/shape/__init__.py +10 -8
- warp/fem/space/tetmesh_function_space.py +4 -1
- warp/fem/space/topology.py +52 -21
- warp/fem/space/trimesh_function_space.py +4 -1
- warp/fem/utils.py +53 -8
- warp/jax.py +1 -2
- warp/jax_experimental/ffi.py +12 -17
- warp/jax_experimental/xla_ffi.py +37 -24
- warp/math.py +171 -1
- warp/native/array.h +99 -0
- warp/native/builtin.h +174 -31
- warp/native/coloring.cpp +1 -1
- warp/native/exports.h +118 -63
- warp/native/intersect.h +3 -3
- warp/native/mat.h +5 -10
- warp/native/mathdx.cpp +11 -5
- warp/native/matnn.h +1 -123
- warp/native/quat.h +28 -4
- warp/native/sparse.cpp +121 -258
- warp/native/sparse.cu +181 -274
- warp/native/spatial.h +305 -17
- warp/native/tile.h +583 -72
- warp/native/tile_radix_sort.h +1108 -0
- warp/native/tile_reduce.h +237 -2
- warp/native/tile_scan.h +240 -0
- warp/native/tuple.h +189 -0
- warp/native/vec.h +6 -16
- warp/native/warp.cpp +36 -4
- warp/native/warp.cu +574 -51
- warp/native/warp.h +47 -74
- warp/optim/linear.py +5 -1
- warp/paddle.py +7 -8
- warp/py.typed +0 -0
- warp/render/render_opengl.py +58 -29
- warp/render/render_usd.py +124 -61
- warp/sim/__init__.py +9 -0
- warp/sim/collide.py +252 -78
- warp/sim/graph_coloring.py +8 -1
- warp/sim/import_mjcf.py +4 -3
- warp/sim/import_usd.py +11 -7
- warp/sim/integrator.py +5 -2
- warp/sim/integrator_euler.py +1 -1
- warp/sim/integrator_featherstone.py +1 -1
- warp/sim/integrator_vbd.py +751 -320
- warp/sim/integrator_xpbd.py +1 -1
- warp/sim/model.py +265 -260
- warp/sim/utils.py +10 -7
- warp/sparse.py +303 -166
- warp/tape.py +52 -51
- warp/tests/cuda/test_conditional_captures.py +1046 -0
- warp/tests/cuda/test_streams.py +1 -1
- warp/tests/geometry/test_volume.py +2 -2
- warp/tests/interop/test_dlpack.py +9 -9
- warp/tests/interop/test_jax.py +0 -1
- warp/tests/run_coverage_serial.py +1 -1
- warp/tests/sim/disabled_kinematics.py +2 -2
- warp/tests/sim/{test_vbd.py → test_cloth.py} +296 -113
- warp/tests/sim/test_collision.py +159 -51
- warp/tests/sim/test_coloring.py +15 -1
- warp/tests/test_array.py +254 -2
- warp/tests/test_array_reduce.py +2 -2
- warp/tests/test_atomic_cas.py +299 -0
- warp/tests/test_codegen.py +142 -19
- warp/tests/test_conditional.py +47 -1
- warp/tests/test_ctypes.py +0 -20
- warp/tests/test_devices.py +8 -0
- warp/tests/test_fabricarray.py +4 -2
- warp/tests/test_fem.py +58 -25
- warp/tests/test_func.py +42 -1
- warp/tests/test_grad.py +1 -1
- warp/tests/test_lerp.py +1 -3
- warp/tests/test_map.py +481 -0
- warp/tests/test_mat.py +1 -24
- warp/tests/test_quat.py +6 -15
- warp/tests/test_rounding.py +10 -38
- warp/tests/test_runlength_encode.py +7 -7
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +51 -2
- warp/tests/test_spatial.py +507 -1
- warp/tests/test_struct.py +2 -2
- warp/tests/test_tuple.py +265 -0
- warp/tests/test_types.py +2 -2
- warp/tests/test_utils.py +24 -18
- warp/tests/tile/test_tile.py +420 -1
- warp/tests/tile/test_tile_mathdx.py +518 -14
- warp/tests/tile/test_tile_reduce.py +213 -0
- warp/tests/tile/test_tile_shared_memory.py +130 -1
- warp/tests/tile/test_tile_sort.py +117 -0
- warp/tests/unittest_suites.py +4 -6
- warp/types.py +462 -308
- warp/utils.py +647 -86
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/METADATA +20 -6
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/RECORD +178 -166
- warp/stubs.py +0 -3381
- warp/tests/sim/test_xpbd.py +0 -399
- warp/tests/test_mlp.py +0 -282
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/top_level.txt +0 -0
warp/dlpack.py CHANGED

@@ -65,7 +65,7 @@ class _DLPackTensorHolder:
     """Class responsible for deleting DLManagedTensor memory after ownership is transferred from a capsule."""

     def __new__(cls, *args, **kwargs):
-        instance = super(
+        instance = super().__new__(cls)
         instance.mem_ptr = None
         return instance

warp/examples/benchmarks/benchmark_cloth.py CHANGED

@@ -243,7 +243,7 @@ def run_benchmark(mode, dim, timers, render=False):
     # run one warm-up iteration to accurately measure initialization time (some engines do lazy init)
     positions = integrator.simulate(sim_dt, sim_substeps)

-    label = "Dim ({}^2)"
+    label = f"Dim ({dim}^2)"

     # run simulation
     for _i in range(sim_frames):
@@ -275,7 +275,7 @@ run_benchmark(mode, 128, timers, render=False)
 # write results

 for k, v in timers.items():
-    print("{:16} min: {:8.2f} max: {:8.2f} avg: {:8.2f}"
+    print(f"{k:16} min: {np.min(v):8.2f} max: {np.max(v):8.2f} avg: {np.mean(v):8.2f}")

 report = open(os.path.join("benchmark.csv"), "a")
 writer = csv.writer(report, delimiter=",")
warp/examples/benchmarks/benchmark_tile_sort.py ADDED

@@ -0,0 +1,155 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+import numpy as np
+
+import warp as wp
+
+BLOCK_DIM = 128
+
+
+def create_test_kernel(KEY_TYPE, MAX_SORT_LENGTH):
+    @wp.kernel
+    def tile_sort_kernel(
+        input_keys: wp.array(dtype=KEY_TYPE, ndim=2),
+        input_values: wp.array(dtype=wp.int32, ndim=2),
+        output_keys: wp.array(dtype=KEY_TYPE, ndim=2),
+        output_values: wp.array(dtype=wp.int32, ndim=2),
+    ):
+        batch_id, i = wp.tid()
+
+        # Load input into shared memory
+        keys = wp.tile_load(input_keys[batch_id], shape=MAX_SORT_LENGTH, storage="shared")
+        values = wp.tile_load(input_values[batch_id], shape=MAX_SORT_LENGTH, storage="shared")
+
+        # Perform in-place sorting
+        wp.tile_sort(keys, values)
+
+        # Store sorted shared memory into output arrays
+        wp.tile_store(output_keys[batch_id], keys)
+        wp.tile_store(output_values[batch_id], values)
+
+    return tile_sort_kernel
+
+
+if __name__ == "__main__":
+    wp.config.quiet = True
+    wp.init()
+    wp.clear_kernel_cache()
+    wp.set_module_options({"fast_math": True, "enable_backward": False})
+
+    iterations = 100
+    rng = np.random.default_rng(42)
+
+    shared_benchmark_data = {}
+    cub_segmented_sort_benchmark_data = {}
+
+    array_length = list(range(16, 257, 16))
+
+    print(
+        f"{'Type':<12s} {'Batch Size':<12s} {'Length':<12s} {'Tile Sort (ms)':<16s} {'Cub Segmented Sort (ms)':<24s} {'CubTime/TileTime':<16s}"
+    )
+    print("-" * 100)
+
+    for dtype in [int, float]:
+        for batch_size_exponent in range(5, 11):
+            batch_size = 2**batch_size_exponent
+            for length in array_length:
+                if dtype == int:
+                    np_keys = rng.choice(1000000000, size=(batch_size, length), replace=False)
+                else:  # dtype == float
+                    np_keys = rng.choice(1000000, size=(batch_size, length), replace=False).astype(np.float32)
+
+                np_values = np.tile(np.arange(length), (batch_size, 1))
+
+                # Sort using NumPy for validation
+                np_sorted_keys = np.zeros_like(np_keys)
+                np_sorted_values = np.zeros_like(np_values)
+                for b in range(batch_size):
+                    sorted_indices = np.argsort(np_keys[b])
+                    np_sorted_keys[b] = np_keys[b][sorted_indices]
+                    np_sorted_values[b] = np_values[b][sorted_indices]
+
+                # Generate random keys and iota indexer
+                input_keys = wp.array(np_keys, dtype=dtype, ndim=2, device="cuda")
+                input_values = wp.array(np_values, dtype=int, ndim=2, device="cuda")
+                output_keys = wp.zeros_like(input_keys, device="cuda")
+                output_values = wp.zeros_like(input_values, device="cuda")
+
+                kernel = create_test_kernel(dtype, length)
+
+                cmd = wp.launch_tiled(
+                    kernel,
+                    dim=batch_size,
+                    inputs=[input_keys, input_values, output_keys, output_values],
+                    block_dim=BLOCK_DIM,
+                    record_cmd=True,
+                )
+                # Warmup
+                for _ in range(5):
+                    cmd.launch()
+
+                with wp.ScopedTimer("benchmark", cuda_filter=wp.TIMING_KERNEL, print=False, synchronize=True) as timer:
+                    for _ in range(iterations):
+                        cmd.launch()
+                wp.synchronize()
+
+                if dtype == int:
+                    keys_match = np.array_equal(output_keys.numpy(), np_sorted_keys)
+                else:  # dtype == float
+                    keys_match = np.allclose(output_keys.numpy(), np_sorted_keys, atol=1e-6)  # Use tolerance for floats
+
+                values_match = np.array_equal(output_values.numpy(), np_sorted_values)
+
+                # Validate results
+                assert keys_match, f"Key sorting mismatch for dtype={dtype}!"
+                assert values_match, f"Value sorting mismatch for dtype={dtype}!"
+
+                timing_results = [result.elapsed for result in timer.timing_results]
+                mean_timing = np.mean(timing_results)
+
+                shared_benchmark_data[length] = mean_timing
+
+                # Allocate memory
+                input_keys = wp.zeros(shape=(batch_size * 2, length), dtype=dtype, device="cuda")
+                input_values = wp.zeros(shape=(batch_size * 2, length), dtype=int, device="cuda")
+
+                # Copy data
+                input_keys.assign(np_keys)
+                input_values.assign(np_values)
+
+                input_keys = input_keys.reshape(-1)
+                input_values = input_values.reshape(-1)
+
+                segments = wp.array(np.arange(0, batch_size + 1) * length, dtype=int, device="cuda")
+
+                # Compare with cub segmented radix sort
+                # Warmup
+                for _ in range(5):
+                    wp.utils.segmented_sort_pairs(input_keys, input_values, batch_size * length, segments)
+
+                t1 = time.time_ns()
+                for _ in range(iterations):
+                    wp.utils.segmented_sort_pairs(input_keys, input_values, batch_size * length, segments)
+                wp.synchronize()
+                t2 = time.time_ns()
+                cub_segmented_sort_benchmark_data[length] = (t2 - t1) / (1_000_000 * iterations)
+
+                # Print results
+                print(
+                    f"{dtype!s:<12s} {batch_size:<12d} {length:<12d} {shared_benchmark_data[length]:<16.4g} {cub_segmented_sort_benchmark_data[length]:<24.4g} {cub_segmented_sort_benchmark_data[length] / shared_benchmark_data[length]:<16.4g}"
                )
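The new benchmark above exercises the 1.8.0 tile-sort path, which reduces to three calls inside a tiled kernel: wp.tile_load into shared storage, an in-place wp.tile_sort over the key/value tiles, and wp.tile_store back to global memory. Below is a minimal sketch condensed from that kernel; it is not part of the package diff, and the kernel name, SORT_LENGTH, the single batch of 64 int32 keys, and the block size are placeholder choices for illustration only.

import numpy as np
import warp as wp

SORT_LENGTH = 64  # arbitrary tile length for this sketch


@wp.kernel
def sort_one_batch(
    keys_in: wp.array(dtype=wp.int32, ndim=2),
    vals_in: wp.array(dtype=wp.int32, ndim=2),
    keys_out: wp.array(dtype=wp.int32, ndim=2),
    vals_out: wp.array(dtype=wp.int32, ndim=2),
):
    batch_id, i = wp.tid()

    # load one batch into shared-memory tiles
    keys = wp.tile_load(keys_in[batch_id], shape=SORT_LENGTH, storage="shared")
    vals = wp.tile_load(vals_in[batch_id], shape=SORT_LENGTH, storage="shared")

    # cooperative in-place key/value sort within the tile
    wp.tile_sort(keys, vals)

    wp.tile_store(keys_out[batch_id], keys)
    wp.tile_store(vals_out[batch_id], vals)


if __name__ == "__main__":
    wp.init()
    rng = np.random.default_rng(0)
    keys = wp.array(rng.integers(0, 1_000_000, size=(1, SORT_LENGTH)), dtype=wp.int32, device="cuda")
    vals = wp.array(np.arange(SORT_LENGTH).reshape(1, -1), dtype=wp.int32, device="cuda")
    keys_out = wp.zeros_like(keys)
    vals_out = wp.zeros_like(vals)

    wp.launch_tiled(sort_one_batch, dim=1, inputs=[keys, vals, keys_out, vals_out], block_dim=128)

    # the sorted keys should be non-decreasing
    assert np.all(np.diff(keys_out.numpy()[0]) >= 0)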
warp/examples/core/example_sample_mesh.py CHANGED

@@ -176,7 +176,7 @@ def sample_mesh(
     sample = wp.randf(rng)
     tri = wp.lower_bound(cdf, sample)

-    # Sample the location in that triangle using random barycentric
+    # Sample the location in that triangle using random barycentric coordinates.
     ru = wp.randf(rng)
     rv = wp.randf(rng)
     tri_u = 1.0 - wp.sqrt(ru)
warp/examples/core/example_spin_lock.py ADDED

@@ -0,0 +1,93 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###########################################################################
+# Example Spin Lock
+#
+# Shows how to use a spin lock to synchronize access to a shared resource.
+#
+###########################################################################
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+
+@wp.func
+def spinlock_acquire(lock: wp.array(dtype=wp.int32)):
+    # Try to acquire the lock by setting it to 1 if it's 0
+    while wp.atomic_cas(lock, 0, 0, 1) == 1:
+        pass
+
+
+@wp.func
+def spinlock_release(lock: wp.array(dtype=wp.int32)):
+    # Release the lock by setting it back to 0
+    wp.atomic_exch(lock, 0, 0)
+
+
+@wp.func
+def volatile_read(ptr: wp.array(dtype=wp.int32), index: int):
+    value = wp.atomic_exch(ptr, index, 0)
+    wp.atomic_exch(ptr, index, value)
+    return value
+
+
+@wp.kernel
+def test_spinlock_counter(
+    counter: wp.array(dtype=wp.int32), atomic_counter: wp.array(dtype=wp.int32), lock: wp.array(dtype=wp.int32)
+):
+    # Try to acquire the lock
+    spinlock_acquire(lock)
+
+    # Critical section - increment counter
+    # counter[0] = counter[0] + 1 # This gives wrong results - counter should be marked as volatile
+
+    # Work around since warp arrays cannot be marked as volatile
+    value = volatile_read(counter, 0)
+    counter[0] = value + 1
+
+    # Release the lock
+    spinlock_release(lock)
+
+    # Increment atomic counter for comparison
+    wp.atomic_add(atomic_counter, 0, 1)
+
+
+def test_spinlock(device):
+    # Create a lock array initialized to 0 (unlocked)
+    lock = wp.array([0], dtype=wp.int32, device=device)
+
+    # Create counter arrays initialized to 0
+    counter = wp.array([0], dtype=wp.int32, device=device)
+    atomic_counter = wp.array([0], dtype=wp.int32, device=device)
+
+    # Number of threads to test with
+    n = 1024
+
+    # Launch the test kernel
+    wp.launch(test_spinlock_counter, dim=n, inputs=[counter, atomic_counter, lock], device=device)
+
+    # Verify results
+    assert atomic_counter.numpy()[0] == n, f"Atomic counter should be {n}, got {atomic_counter.numpy()[0]}"
+    assert counter.numpy()[0] == n, f"Counter should be {n}, got {counter.numpy()[0]}"
+    assert lock.numpy()[0] == 0, "Lock was not properly released"
+
+    print(f"Final counter value: {counter.numpy()[0]}")
+    print(f"Final atomic counter value: {atomic_counter.numpy()[0]}")
+
+
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+    test_spinlock(device="cuda")
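The spin-lock example above is built on the new wp.atomic_cas builtin; as used there, wp.atomic_cas(arr, index, compare, value) stores value only when the current element equals compare and returns the element's previous value. A minimal sketch assuming only that behavior (not part of the package; kernel and array names are placeholders) that uses the same primitive as a one-shot claim flag rather than a lock:

import warp as wp


@wp.kernel
def claim_once(flag: wp.array(dtype=wp.int32), winner: wp.array(dtype=wp.int32)):
    tid = wp.tid()
    # only the single thread that observes the old value 0 wins the claim
    if wp.atomic_cas(flag, 0, 0, 1) == 0:
        winner[0] = tid


if __name__ == "__main__":
    wp.init()
    flag = wp.zeros(1, dtype=wp.int32, device="cuda")
    winner = wp.full(1, -1, dtype=wp.int32, device="cuda")
    wp.launch(claim_once, dim=1024, inputs=[flag, winner], device="cuda")
    print(flag.numpy()[0], winner.numpy()[0])  # flag == 1, winner set by exactly one thread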
warp/examples/core/example_work_queue.py ADDED

@@ -0,0 +1,118 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###########################################################################
+# Example Work Queue
+#
+# Shows how to use a work queue to synchronize access to a shared resource.
+#
+###########################################################################
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+
+@wp.func
+def volatile_read(ptr: wp.array(dtype=wp.int32), index: int):
+    value = wp.atomic_add(ptr, index, 0)
+    return value
+
+
+@wp.struct
+class WorkQueue:
+    buffer: wp.array(dtype=wp.int32)
+    capacity: int
+    head: wp.array(dtype=wp.int32)
+    tail: wp.array(dtype=wp.int32)
+
+
+@wp.func
+def enqueue(queue: WorkQueue, item: int) -> bool:
+    while True:
+        # Read current head and tail atomically
+        current_tail = volatile_read(queue.tail, 0)
+        current_head = volatile_read(queue.head, 0)
+
+        # Check if queue is full
+        if (current_tail - current_head) >= queue.capacity:
+            return False
+
+        # Try to increment tail atomically
+        index = current_tail % queue.capacity
+        if wp.atomic_cas(queue.tail, 0, current_tail, current_tail + 1) == current_tail:
+            queue.buffer[index] = item
+            return True
+
+        # Retry if another thread changed tail
+
+
+@wp.func
+def dequeue(queue: WorkQueue) -> tuple[bool, int]:
+    while True:
+        # Read current head and tail atomically
+        current_head = volatile_read(queue.head, 0)
+        current_tail = volatile_read(queue.tail, 0)
+
+        # Check if queue is empty
+        if current_head >= current_tail:
+            return False, 0
+
+        # Get item at current head
+        index = current_head % queue.capacity
+        item = queue.buffer[index]
+
+        # Try to increment head atomically
+        if wp.atomic_cas(queue.head, 0, current_head, current_head + 1) == current_head:
+            return True, item
+
+        # Retry if another thread changed head
+
+
+@wp.kernel
+def process_queue(queue: WorkQueue):
+    counter = int(0)
+    while True:
+        success, item = dequeue(queue)
+        if not success:
+            break
+        wp.printf("Processed item: %d\n", item)
+        if item < 1000000:
+            if not enqueue(queue, item + 1000000):
+                wp.printf("Failed to enqueue item: %d\n", item + 1000000)
+        counter = counter + 1
+
+
+def test_work_queue(device):
+    # Create a work queue with capacity 1024
+    capacity = 8192
+    head = wp.array([0], dtype=wp.int32, device=device)
+    tail = wp.array([4096], dtype=wp.int32, device=device)
+    buffer = wp.array(np.arange(4096, dtype=np.int32), dtype=wp.int32, device=device)
+
+    queue = WorkQueue()
+    queue.capacity = capacity
+    queue.head = head
+    queue.tail = tail
+    queue.buffer = buffer
+
+    # Launch processing kernel
+    wp.launch(process_queue, dim=1024, inputs=[queue], device=device)
+
+    wp.synchronize()
+
+
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+    test_work_queue(device="cuda")
@@ -70,7 +70,7 @@ def mass_form(
 @fem.integrand
 def side_divergence_form(s: fem.Sample, domain: fem.Domain, u: fem.Field, psi: fem.Field):
     # normal velocity jump (non-zero at resolution boundaries)
-    return -wp.dot(fem.jump(u, s), fem.normal(domain, s)) * psi
+    return -wp.dot(fem.jump(u, s), fem.normal(domain, s)) * fem.average(psi, s)


 @wp.func
@@ -173,7 +173,7 @@ class Example:
         bd_test = fem.make_test(u_space, domain=boundary)
         bd_trial = fem.make_trial(u_space, domain=boundary)
         dirichlet_projector = fem.integrate(
-            noslip_projector_form, fields={"u": bd_test, "v": bd_trial}, nodal
+            noslip_projector_form, fields={"u": bd_test, "v": bd_trial}, assembly="nodal", output_dtype=float
         )
         fem.normalize_dirichlet_projector(dirichlet_projector)

@@ -187,7 +187,7 @@ class Example:
         rho_trial = fem.make_trial(rho_space)

         inv_mass_matrix = fem.integrate(
-            mass_form, fields={"u": rho_trial, "v": rho_test}, nodal
+            mass_form, fields={"u": rho_trial, "v": rho_test}, assembly="nodal", output_dtype=float
         )
         fem_example_utils.invert_diagonal_bsr_matrix(inv_mass_matrix)

@@ -269,8 +269,8 @@ if __name__ == "__main__":

         stage = Usd.Stage.Open(os.path.join(warp.examples.get_asset_directory(), "rocks.usd"))
         mesh = UsdGeom.Mesh(stage.GetPrimAtPath("/root/rocks"))
-        points = np.array(
-        counts = np.array(
+        points = np.array(mesh.GetPointsAttr().Get())
+        counts = np.array(mesh.GetFaceVertexCountsAttr().Get())
         indices = np.array(mesh.GetFaceVertexIndicesAttr().Get())
         ref_geom = (points, counts, indices)
     except Exception:
@@ -290,7 +290,7 @@ class Example:
         vel_projector = fem.integrate(
             velocity_boundary_projector_form,
             fields={"u": velocity_trial, "v": velocity_test},
-            nodal
+            assembly="nodal",
             output_dtype=float,
         )
         fem.normalize_dirichlet_projector(vel_projector)
@@ -146,7 +146,7 @@ class Example:
         # For simplicity, use nodal integration so that inertia matrix is diagonal
         trial = fem.make_trial(space=vector_space, domain=domain)
         matrix_inertia = fem.integrate(
-            vel_mass_form, fields={"u": trial, "v": self._test}, output_dtype=wp.float32, nodal
+            vel_mass_form, fields={"u": trial, "v": self._test}, output_dtype=wp.float32, assembly="nodal"
         )
         self._inv_mass_matrix = wp.sparse.bsr_copy(matrix_inertia)
         fem_example_utils.invert_diagonal_bsr_matrix(self._inv_mass_matrix)
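The FEM hunks above repeat one mechanical API migration: the old nodal flag to fem.integrate (truncated in the removed lines of this view) is replaced by the keyword shown in the added lines, assembly="nodal", typically alongside an explicit output_dtype. The following is a hedged, self-contained sketch of the new call-site pattern only; the grid resolution, polynomial degree, and the body of mass_form are generic placeholders rather than code taken from the diff.

import warp as wp
import warp.fem as fem


@fem.integrand
def mass_form(s: fem.Sample, u: fem.Field, v: fem.Field):
    # placeholder bilinear form: u * v evaluated at the sample point
    return u(s) * v(s)


if __name__ == "__main__":
    wp.init()
    geo = fem.Grid2D(res=wp.vec2i(8))
    space = fem.make_polynomial_space(geo, degree=1)
    domain = fem.Cells(geometry=geo)
    test = fem.make_test(space, domain=domain)
    trial = fem.make_trial(space, domain=domain)

    # 1.7.x spelling (removed lines, truncated in this view): fem.integrate(..., nodal=...)
    # 1.8.0 spelling (added lines):
    matrix = fem.integrate(
        mass_form,
        fields={"u": trial, "v": test},
        assembly="nodal",
        output_dtype=float,
    )

    # with nodal assembly the result is a (block-)diagonal BSR matrix, one entry per node
    print(matrix.nrow, matrix.ncol)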
@@ -82,7 +82,7 @@ def diffusion_and_inertia_form(s: fem.Sample, phi: fem.Field, psi: fem.Field, dt


 class Example:
-    def __init__(self, quiet=False, degree=2, resolution=50,
+    def __init__(self, quiet=False, degree=2, resolution=50, mesh: str = "grid", viscosity=0.001, ang_vel=1.0):
         self._quiet = quiet

         self._ang_vel = ang_vel
@@ -91,11 +91,14 @@ class Example:
         self.sim_dt = 1.0 / (ang_vel * res)
         self.current_frame = 0

-        if
-            positions, tri_vidx = fem_example_utils.gen_trimesh(res=wp.vec2i(
+        if mesh == "tri":
+            positions, tri_vidx = fem_example_utils.gen_trimesh(res=wp.vec2i(resolution))
             geo = fem.Trimesh2D(tri_vertex_indices=tri_vidx, positions=positions, build_bvh=True)
+        elif mesh == "quad":
+            positions, quad_vidx = fem_example_utils.gen_quadmesh(res=wp.vec2i(resolution))
+            geo = fem.Quadmesh2D(quad_vertex_indices=quad_vidx, positions=positions, build_bvh=True)
         else:
-            geo = fem.Grid2D(res=wp.vec2i(
+            geo = fem.Grid2D(res=wp.vec2i(resolution))

         domain = fem.Cells(geometry=geo)
         scalar_space = fem.make_polynomial_space(geo, degree=degree)
@@ -149,7 +152,7 @@ if __name__ == "__main__":
     parser.add_argument("--num_frames", type=int, default=250, help="Total number of frames.")
     parser.add_argument("--viscosity", type=float, default=0.001, help="Fluid viscosity parameter.")
     parser.add_argument("--ang_vel", type=float, default=1.0, help="Angular velocity.")
-    parser.add_argument("--
+    parser.add_argument("--mesh", choices=("grid", "tri", "quad"), default="grid", help="Mesh type.")
     parser.add_argument(
         "--headless",
         action="store_true",
@@ -164,7 +167,7 @@ if __name__ == "__main__":
            quiet=args.quiet,
            degree=args.degree,
            resolution=args.resolution,
-
+           mesh=args.mesh,
            viscosity=args.viscosity,
            ang_vel=args.ang_vel,
        )
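The final set of hunks threads a new mesh option through one of the FEM examples: an extra constructor parameter with "grid", "tri", and "quad" choices, a matching --mesh command-line flag, and a branch that builds either a fem.Grid2D, a fem.Trimesh2D, or a fem.Quadmesh2D. Below is a small hypothetical sketch of just that CLI surface; the --mesh choices are copied from the hunk, the help strings and print messages are placeholders, and no FEM solve is performed.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--resolution", type=int, default=50, help="Grid resolution along each dimension.")
parser.add_argument("--mesh", choices=("grid", "tri", "quad"), default="grid", help="Mesh type.")
args = parser.parse_args()

if args.mesh == "tri":
    print(f"would build a {args.resolution}x{args.resolution} triangle mesh (fem.Trimesh2D)")
elif args.mesh == "quad":
    print(f"would build a {args.resolution}x{args.resolution} quad mesh (fem.Quadmesh2D)")
else:
    print(f"would build a regular {args.resolution}x{args.resolution} grid (fem.Grid2D)")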