warp-lang 1.3.0__py3-none-manylinux2014_x86_64.whl → 1.3.2__py3-none-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/autograd.py +6 -6
- warp/bin/warp.so +0 -0
- warp/builtins.py +46 -43
- warp/codegen.py +27 -38
- warp/config.py +1 -1
- warp/context.py +160 -111
- warp/examples/fem/example_mixed_elasticity.py +33 -23
- warp/fem/field/nodal_field.py +1 -1
- warp/fem/quadrature/quadrature.py +1 -0
- warp/native/builtin.h +3 -3
- warp/native/bvh.h +1 -1
- warp/native/svd.h +22 -7
- warp/native/warp.cpp +1 -0
- warp/native/warp.cu +5 -0
- warp/native/warp.h +1 -0
- warp/sim/collide.py +1 -1
- warp/sim/model.py +16 -3
- warp/sim/utils.py +1 -1
- warp/stubs.py +112 -112
- warp/tape.py +3 -3
- warp/tests/test_array.py +11 -0
- warp/tests/test_async.py +3 -1
- warp/tests/test_bvh.py +33 -8
- warp/tests/test_codegen.py +25 -0
- warp/tests/test_compile_consts.py +15 -0
- warp/tests/test_examples.py +6 -1
- warp/tests/test_fem.py +51 -0
- warp/tests/test_grad_debug.py +2 -1
- warp/tests/test_model.py +55 -0
- warp/tests/test_point_triangle_closest_point.py +143 -0
- warp/tests/test_reload.py +28 -0
- warp/tests/test_struct.py +48 -30
- warp/types.py +4 -2
- {warp_lang-1.3.0.dist-info → warp_lang-1.3.2.dist-info}/METADATA +14 -14
- {warp_lang-1.3.0.dist-info → warp_lang-1.3.2.dist-info}/RECORD +38 -37
- {warp_lang-1.3.0.dist-info → warp_lang-1.3.2.dist-info}/WHEEL +1 -1
- {warp_lang-1.3.0.dist-info → warp_lang-1.3.2.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.3.0.dist-info → warp_lang-1.3.2.dist-info}/top_level.txt +0 -0
warp/context.py
CHANGED
|
@@ -1411,12 +1411,65 @@ class ModuleBuilder:
|
|
|
1411
1411
|
return source
|
|
1412
1412
|
|
|
1413
1413
|
|
|
1414
|
+
# ModuleExec holds the compiled executable code for a specific device.
|
|
1415
|
+
# It can be used to obtain kernel hooks on that device and serves
|
|
1416
|
+
# as a reference-counted wrapper of the loaded module.
|
|
1417
|
+
# Clients can keep a reference to a ModuleExec object to prevent the
|
|
1418
|
+
# executable code from being unloaded prematurely.
|
|
1419
|
+
# For example, the Graph class retains references to all the CUDA modules
|
|
1420
|
+
# needed by a graph. This ensures that graphs remain valid even if
|
|
1421
|
+
# the original Modules get reloaded.
|
|
1422
|
+
class ModuleExec:
|
|
1423
|
+
def __new__(cls, *args, **kwargs):
|
|
1424
|
+
instance = super(ModuleExec, cls).__new__(cls)
|
|
1425
|
+
instance.handle = None
|
|
1426
|
+
return instance
|
|
1427
|
+
|
|
1428
|
+
def __init__(self, handle, device):
|
|
1429
|
+
self.handle = handle
|
|
1430
|
+
self.device = device
|
|
1431
|
+
self.kernel_hooks = {}
|
|
1432
|
+
|
|
1433
|
+
# release the loaded module
|
|
1434
|
+
def __del__(self):
|
|
1435
|
+
if self.handle is not None:
|
|
1436
|
+
if self.device.is_cuda:
|
|
1437
|
+
# use CUDA context guard to avoid side effects during garbage collection
|
|
1438
|
+
with self.device.context_guard:
|
|
1439
|
+
runtime.core.cuda_unload_module(self.device.context, self.handle)
|
|
1440
|
+
else:
|
|
1441
|
+
runtime.llvm.unload_obj(self.handle.encode("utf-8"))
|
|
1442
|
+
|
|
1443
|
+
# lookup and cache kernel entry points
|
|
1444
|
+
def get_kernel_hooks(self, kernel):
|
|
1445
|
+
hooks = self.kernel_hooks.get(kernel)
|
|
1446
|
+
if hooks is not None:
|
|
1447
|
+
return hooks
|
|
1448
|
+
|
|
1449
|
+
name = kernel.get_mangled_name()
|
|
1450
|
+
|
|
1451
|
+
if self.device.is_cuda:
|
|
1452
|
+
forward = runtime.core.cuda_get_kernel(
|
|
1453
|
+
self.device.context, self.handle, (name + "_cuda_kernel_forward").encode("utf-8")
|
|
1454
|
+
)
|
|
1455
|
+
backward = runtime.core.cuda_get_kernel(
|
|
1456
|
+
self.device.context, self.handle, (name + "_cuda_kernel_backward").encode("utf-8")
|
|
1457
|
+
)
|
|
1458
|
+
else:
|
|
1459
|
+
func = ctypes.CFUNCTYPE(None)
|
|
1460
|
+
forward = func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8")))
|
|
1461
|
+
backward = func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8")))
|
|
1462
|
+
|
|
1463
|
+
hooks = KernelHooks(forward, backward)
|
|
1464
|
+
self.kernel_hooks[kernel] = hooks
|
|
1465
|
+
|
|
1466
|
+
return hooks
|
|
1467
|
+
|
|
1468
|
+
|
|
1414
1469
|
# -----------------------------------------------------
|
|
1415
1470
|
# stores all functions and kernels for a Python module
|
|
1416
1471
|
# creates a hash of the function to use for checking
|
|
1417
1472
|
# build cache
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
1473
|
class Module:
|
|
1421
1474
|
def __init__(self, name, loader):
|
|
1422
1475
|
self.name = name
|
|
@@ -1427,8 +1480,8 @@ class Module:
|
|
|
1427
1480
|
self.constants = {} # Any constants referenced in this module including those defined in other modules
|
|
1428
1481
|
self.structs = {}
|
|
1429
1482
|
|
|
1430
|
-
self.
|
|
1431
|
-
self.
|
|
1483
|
+
self.cpu_exec = None # executable CPU module
|
|
1484
|
+
self.cuda_execs = {} # executable CUDA module lookup by CUDA context
|
|
1432
1485
|
|
|
1433
1486
|
self.cpu_build_failed = False
|
|
1434
1487
|
self.cuda_build_failed = False
|
|
@@ -1441,11 +1494,6 @@ class Module:
|
|
|
1441
1494
|
"mode": warp.config.mode,
|
|
1442
1495
|
}
|
|
1443
1496
|
|
|
1444
|
-
# kernel hook lookup per device
|
|
1445
|
-
# hooks are stored with the module so they can be easily cleared when the module is reloaded.
|
|
1446
|
-
# -> See ``Module.get_kernel_hooks()``
|
|
1447
|
-
self.kernel_hooks = {}
|
|
1448
|
-
|
|
1449
1497
|
# Module dependencies are determined by scanning each function
|
|
1450
1498
|
# and kernel for references to external functions and structs.
|
|
1451
1499
|
#
|
|
@@ -1558,10 +1606,13 @@ class Module:
|
|
|
1558
1606
|
computed ``content_hash`` will be used.
|
|
1559
1607
|
"""
|
|
1560
1608
|
|
|
1561
|
-
def get_type_name(type_hint):
|
|
1609
|
+
def get_type_name(type_hint) -> str:
|
|
1562
1610
|
if isinstance(type_hint, warp.codegen.Struct):
|
|
1563
1611
|
return get_type_name(type_hint.cls)
|
|
1564
|
-
|
|
1612
|
+
elif isinstance(type_hint, warp.array) and isinstance(type_hint.dtype, warp.codegen.Struct):
|
|
1613
|
+
return f"array{get_type_name(type_hint.dtype)}"
|
|
1614
|
+
|
|
1615
|
+
return str(type_hint)
|
|
1565
1616
|
|
|
1566
1617
|
def hash_recursive(module, visited):
|
|
1567
1618
|
# Hash this module, including all referenced modules recursively.
|
|
@@ -1682,27 +1733,26 @@ class Module:
|
|
|
1682
1733
|
|
|
1683
1734
|
return hash_recursive(self, visited=set())
|
|
1684
1735
|
|
|
1685
|
-
def load(self, device) ->
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
device = get_device(device)
|
|
1736
|
+
def load(self, device) -> ModuleExec:
|
|
1737
|
+
device = runtime.get_device(device)
|
|
1689
1738
|
|
|
1690
1739
|
if device.is_cpu:
|
|
1691
1740
|
# check if already loaded
|
|
1692
|
-
if self.
|
|
1693
|
-
return
|
|
1741
|
+
if self.cpu_exec:
|
|
1742
|
+
return self.cpu_exec
|
|
1694
1743
|
# avoid repeated build attempts
|
|
1695
1744
|
if self.cpu_build_failed:
|
|
1696
|
-
return
|
|
1745
|
+
return None
|
|
1697
1746
|
if not warp.is_cpu_available():
|
|
1698
1747
|
raise RuntimeError("Failed to build CPU module because no CPU buildchain was found")
|
|
1699
1748
|
else:
|
|
1700
1749
|
# check if already loaded
|
|
1701
|
-
|
|
1702
|
-
|
|
1750
|
+
cuda_exec = self.cuda_execs.get(device.context)
|
|
1751
|
+
if cuda_exec is not None:
|
|
1752
|
+
return cuda_exec
|
|
1703
1753
|
# avoid repeated build attempts
|
|
1704
1754
|
if self.cuda_build_failed:
|
|
1705
|
-
return
|
|
1755
|
+
return None
|
|
1706
1756
|
if not warp.is_cuda_available():
|
|
1707
1757
|
raise RuntimeError("Failed to build CUDA module because CUDA is not available")
|
|
1708
1758
|
|
|
@@ -1712,7 +1762,7 @@ class Module:
|
|
|
1712
1762
|
# use a unique module path using the module short hash
|
|
1713
1763
|
module_dir = os.path.join(warp.config.kernel_cache_dir, f"{module_name}_{module_hash.hex()[:7]}")
|
|
1714
1764
|
|
|
1715
|
-
with ScopedTimer(
|
|
1765
|
+
with warp.ScopedTimer(
|
|
1716
1766
|
f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'", active=not warp.config.quiet
|
|
1717
1767
|
) as module_load_timer:
|
|
1718
1768
|
# -----------------------------------------------------------
|
|
@@ -1784,7 +1834,7 @@ class Module:
|
|
|
1784
1834
|
output_path = os.path.join(build_dir, output_name)
|
|
1785
1835
|
|
|
1786
1836
|
# build object code
|
|
1787
|
-
with ScopedTimer("Compile x86", active=warp.config.verbose):
|
|
1837
|
+
with warp.ScopedTimer("Compile x86", active=warp.config.verbose):
|
|
1788
1838
|
warp.build.build_cpu(
|
|
1789
1839
|
output_path,
|
|
1790
1840
|
source_code_path,
|
|
@@ -1812,7 +1862,7 @@ class Module:
|
|
|
1812
1862
|
output_path = os.path.join(build_dir, output_name)
|
|
1813
1863
|
|
|
1814
1864
|
# generate PTX or CUBIN
|
|
1815
|
-
with ScopedTimer("Compile CUDA", active=warp.config.verbose):
|
|
1865
|
+
with warp.ScopedTimer("Compile CUDA", active=warp.config.verbose):
|
|
1816
1866
|
warp.build.build_cuda(
|
|
1817
1867
|
source_code_path,
|
|
1818
1868
|
output_arch,
|
|
@@ -1865,12 +1915,14 @@ class Module:
|
|
|
1865
1915
|
# Load CPU or CUDA binary
|
|
1866
1916
|
if device.is_cpu:
|
|
1867
1917
|
runtime.llvm.load_obj(binary_path.encode("utf-8"), module_name.encode("utf-8"))
|
|
1868
|
-
|
|
1918
|
+
module_exec = ModuleExec(module_name, device)
|
|
1919
|
+
self.cpu_exec = module_exec
|
|
1869
1920
|
|
|
1870
1921
|
elif device.is_cuda:
|
|
1871
1922
|
cuda_module = warp.build.load_cuda(binary_path, device)
|
|
1872
1923
|
if cuda_module is not None:
|
|
1873
|
-
|
|
1924
|
+
module_exec = ModuleExec(cuda_module, device)
|
|
1925
|
+
self.cuda_execs[device.context] = module_exec
|
|
1874
1926
|
else:
|
|
1875
1927
|
module_load_timer.extra_msg = " (error)"
|
|
1876
1928
|
raise Exception(f"Failed to load CUDA module '{self.name}'")
|
|
@@ -1881,65 +1933,27 @@ class Module:
|
|
|
1881
1933
|
# clean up build_dir used for this process regardless
|
|
1882
1934
|
shutil.rmtree(build_dir, ignore_errors=True)
|
|
1883
1935
|
|
|
1884
|
-
return
|
|
1936
|
+
return module_exec
|
|
1885
1937
|
|
|
1886
1938
|
def unload(self):
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
# need to unload the CUDA module from all CUDA contexts where it is loaded
|
|
1892
|
-
# note: we ensure that this doesn't change the current CUDA context
|
|
1893
|
-
if self.cuda_modules:
|
|
1894
|
-
saved_context = runtime.core.cuda_context_get_current()
|
|
1895
|
-
for context, module in self.cuda_modules.items():
|
|
1896
|
-
device = runtime.context_map[context]
|
|
1897
|
-
if device.is_capturing:
|
|
1898
|
-
raise RuntimeError(f"Failed to unload CUDA module '{self.name}' because graph capture is active")
|
|
1899
|
-
runtime.core.cuda_unload_module(context, module)
|
|
1900
|
-
runtime.core.cuda_context_set_current(saved_context)
|
|
1901
|
-
self.cuda_modules = {}
|
|
1902
|
-
|
|
1903
|
-
# clear kernel hooks
|
|
1904
|
-
self.kernel_hooks = {}
|
|
1939
|
+
# clear loaded modules
|
|
1940
|
+
self.cpu_exec = None
|
|
1941
|
+
self.cuda_execs = {}
|
|
1905
1942
|
|
|
1906
1943
|
# clear content hash
|
|
1907
1944
|
self.content_hash = None
|
|
1908
1945
|
|
|
1909
|
-
# lookup
|
|
1946
|
+
# lookup kernel entry points based on name, called after compilation / module load
|
|
1910
1947
|
def get_kernel_hooks(self, kernel, device):
|
|
1911
|
-
|
|
1912
|
-
|
|
1913
|
-
if device_hooks is None:
|
|
1914
|
-
self.kernel_hooks[device.context] = device_hooks = {}
|
|
1915
|
-
|
|
1916
|
-
# look up this kernel
|
|
1917
|
-
hooks = device_hooks.get(kernel)
|
|
1918
|
-
if hooks is not None:
|
|
1919
|
-
return hooks
|
|
1920
|
-
|
|
1921
|
-
name = kernel.get_mangled_name()
|
|
1922
|
-
|
|
1923
|
-
if device.is_cpu:
|
|
1924
|
-
func = ctypes.CFUNCTYPE(None)
|
|
1925
|
-
forward = func(
|
|
1926
|
-
runtime.llvm.lookup(self.cpu_module.encode("utf-8"), (name + "_cpu_forward").encode("utf-8"))
|
|
1927
|
-
)
|
|
1928
|
-
backward = func(
|
|
1929
|
-
runtime.llvm.lookup(self.cpu_module.encode("utf-8"), (name + "_cpu_backward").encode("utf-8"))
|
|
1930
|
-
)
|
|
1948
|
+
if device.is_cuda:
|
|
1949
|
+
module_exec = self.cuda_execs.get(device.context)
|
|
1931
1950
|
else:
|
|
1932
|
-
|
|
1933
|
-
forward = runtime.core.cuda_get_kernel(
|
|
1934
|
-
device.context, cu_module, (name + "_cuda_kernel_forward").encode("utf-8")
|
|
1935
|
-
)
|
|
1936
|
-
backward = runtime.core.cuda_get_kernel(
|
|
1937
|
-
device.context, cu_module, (name + "_cuda_kernel_backward").encode("utf-8")
|
|
1938
|
-
)
|
|
1951
|
+
module_exec = self.cpu_exec
|
|
1939
1952
|
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1953
|
+
if module_exec is not None:
|
|
1954
|
+
return module_exec.get_kernel_hooks(kernel)
|
|
1955
|
+
else:
|
|
1956
|
+
raise RuntimeError(f"Module is not loaded on device {device}")
|
|
1943
1957
|
|
|
1944
1958
|
|
|
1945
1959
|
# -------------------------------------------
|
|
@@ -2196,8 +2210,8 @@ class Device:
|
|
|
2196
2210
|
self._stream = None
|
|
2197
2211
|
self.null_stream = None
|
|
2198
2212
|
|
|
2199
|
-
#
|
|
2200
|
-
self.captures =
|
|
2213
|
+
# maps streams to started graph captures
|
|
2214
|
+
self.captures = {}
|
|
2201
2215
|
|
|
2202
2216
|
self.context_guard = ContextGuard(self)
|
|
2203
2217
|
|
|
@@ -2434,20 +2448,25 @@ Devicelike = Union[Device, str, None]
|
|
|
2434
2448
|
class Graph:
|
|
2435
2449
|
def __new__(cls, *args, **kwargs):
|
|
2436
2450
|
instance = super(Graph, cls).__new__(cls)
|
|
2437
|
-
instance.
|
|
2451
|
+
instance.graph_exec = None
|
|
2438
2452
|
return instance
|
|
2439
2453
|
|
|
2440
|
-
def __init__(self, device: Device,
|
|
2454
|
+
def __init__(self, device: Device, capture_id: int):
|
|
2441
2455
|
self.device = device
|
|
2442
|
-
self.
|
|
2456
|
+
self.capture_id = capture_id
|
|
2457
|
+
self.module_execs = set()
|
|
2443
2458
|
|
|
2444
2459
|
def __del__(self):
|
|
2445
|
-
if not self.
|
|
2460
|
+
if not self.graph_exec:
|
|
2446
2461
|
return
|
|
2447
2462
|
|
|
2448
2463
|
# use CUDA context guard to avoid side effects during garbage collection
|
|
2449
2464
|
with self.device.context_guard:
|
|
2450
|
-
runtime.core.cuda_graph_destroy(self.device.context, self.
|
|
2465
|
+
runtime.core.cuda_graph_destroy(self.device.context, self.graph_exec)
|
|
2466
|
+
|
|
2467
|
+
# retain executable CUDA modules used by this graph, which prevents them from being unloaded
|
|
2468
|
+
def retain_module_exec(self, module_exec: ModuleExec):
|
|
2469
|
+
self.module_execs.add(module_exec)
|
|
2451
2470
|
|
|
2452
2471
|
|
|
2453
2472
|
class Runtime:
|
|
@@ -2488,6 +2507,9 @@ class Runtime:
|
|
|
2488
2507
|
else:
|
|
2489
2508
|
self.llvm = None
|
|
2490
2509
|
|
|
2510
|
+
# maps capture ids to graphs
|
|
2511
|
+
self.captures = {}
|
|
2512
|
+
|
|
2491
2513
|
# setup c-types for warp.dll
|
|
2492
2514
|
try:
|
|
2493
2515
|
self.core.get_error_string.argtypes = []
|
|
@@ -3023,6 +3045,8 @@ class Runtime:
|
|
|
3023
3045
|
self.core.cuda_stream_wait_stream.restype = None
|
|
3024
3046
|
self.core.cuda_stream_is_capturing.argtypes = [ctypes.c_void_p]
|
|
3025
3047
|
self.core.cuda_stream_is_capturing.restype = ctypes.c_int
|
|
3048
|
+
self.core.cuda_stream_get_capture_id.argtypes = [ctypes.c_void_p]
|
|
3049
|
+
self.core.cuda_stream_get_capture_id.restype = ctypes.c_uint64
|
|
3026
3050
|
|
|
3027
3051
|
self.core.cuda_event_create.argtypes = [ctypes.c_void_p, ctypes.c_uint]
|
|
3028
3052
|
self.core.cuda_event_create.restype = ctypes.c_void_p
|
|
@@ -4490,13 +4514,14 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
|
|
|
4490
4514
|
# so that launches can be replayed quickly, use `wp.launch(..., record_cmd=True)`
|
|
4491
4515
|
class Launch:
|
|
4492
4516
|
def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0):
|
|
4517
|
+
# retain the module executable so it doesn't get unloaded
|
|
4518
|
+
self.module_exec = kernel.module.load(device)
|
|
4519
|
+
if not self.module_exec:
|
|
4520
|
+
raise RuntimeError(f"Failed to load module {kernel.module.name} on device {device}")
|
|
4521
|
+
|
|
4493
4522
|
# if not specified look up hooks
|
|
4494
4523
|
if not hooks:
|
|
4495
|
-
|
|
4496
|
-
if not module.load(device):
|
|
4497
|
-
return
|
|
4498
|
-
|
|
4499
|
-
hooks = module.get_kernel_hooks(kernel, device)
|
|
4524
|
+
hooks = self.module_exec.get_kernel_hooks(kernel)
|
|
4500
4525
|
|
|
4501
4526
|
# if not specified set a zero bound
|
|
4502
4527
|
if not bounds:
|
|
@@ -4594,6 +4619,15 @@ class Launch:
|
|
|
4594
4619
|
else:
|
|
4595
4620
|
if stream is None:
|
|
4596
4621
|
stream = self.device.stream
|
|
4622
|
+
|
|
4623
|
+
# If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
|
|
4624
|
+
# before the captured graph is released.
|
|
4625
|
+
if runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
|
|
4626
|
+
capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
|
|
4627
|
+
graph = runtime.captures.get(capture_id)
|
|
4628
|
+
if graph is not None:
|
|
4629
|
+
graph.retain_module_exec(self.module_exec)
|
|
4630
|
+
|
|
4597
4631
|
runtime.core.cuda_launch_kernel(
|
|
4598
4632
|
self.device.context,
|
|
4599
4633
|
self.hooks.forward,
|
|
@@ -4689,12 +4723,12 @@ def launch(
|
|
|
4689
4723
|
kernel = kernel.add_overload(fwd_types)
|
|
4690
4724
|
|
|
4691
4725
|
# delay load modules, including new overload if needed
|
|
4692
|
-
|
|
4693
|
-
if not
|
|
4726
|
+
module_exec = kernel.module.load(device)
|
|
4727
|
+
if not module_exec:
|
|
4694
4728
|
return
|
|
4695
4729
|
|
|
4696
4730
|
# late bind
|
|
4697
|
-
hooks =
|
|
4731
|
+
hooks = module_exec.get_kernel_hooks(kernel)
|
|
4698
4732
|
|
|
4699
4733
|
pack_args(fwd_args, params)
|
|
4700
4734
|
pack_args(adj_args, params, adjoint=True)
|
|
@@ -4730,6 +4764,14 @@ def launch(
|
|
|
4730
4764
|
if stream is None:
|
|
4731
4765
|
stream = device.stream
|
|
4732
4766
|
|
|
4767
|
+
# If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
|
|
4768
|
+
# before the captured graph is released.
|
|
4769
|
+
if runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
|
|
4770
|
+
capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
|
|
4771
|
+
graph = runtime.captures.get(capture_id)
|
|
4772
|
+
if graph is not None:
|
|
4773
|
+
graph.retain_module_exec(module_exec)
|
|
4774
|
+
|
|
4733
4775
|
if adjoint:
|
|
4734
4776
|
if hooks.backward is None:
|
|
4735
4777
|
raise RuntimeError(
|
|
@@ -4778,7 +4820,7 @@ def launch(
|
|
|
4778
4820
|
|
|
4779
4821
|
# detect illegal inter-kernel read/write access patterns if verification flag is set
|
|
4780
4822
|
if warp.config.verify_autograd_array_access:
|
|
4781
|
-
runtime.tape.
|
|
4823
|
+
runtime.tape._check_kernel_array_access(kernel, fwd_args)
|
|
4782
4824
|
|
|
4783
4825
|
|
|
4784
4826
|
def synchronize():
|
|
@@ -5014,11 +5056,18 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=None
|
|
|
5014
5056
|
if force_module_load:
|
|
5015
5057
|
force_load(device)
|
|
5016
5058
|
|
|
5017
|
-
device.captures.add(stream)
|
|
5018
|
-
|
|
5019
5059
|
if not runtime.core.cuda_graph_begin_capture(device.context, stream.cuda_stream, int(external)):
|
|
5020
5060
|
raise RuntimeError(runtime.get_error_string())
|
|
5021
5061
|
|
|
5062
|
+
capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
|
|
5063
|
+
graph = Graph(device, capture_id)
|
|
5064
|
+
|
|
5065
|
+
# add to ongoing captures on the device
|
|
5066
|
+
device.captures[stream] = graph
|
|
5067
|
+
|
|
5068
|
+
# add to lookup table by globally unique capture id
|
|
5069
|
+
runtime.captures[capture_id] = graph
|
|
5070
|
+
|
|
5022
5071
|
|
|
5023
5072
|
def capture_end(device: Devicelike = None, stream: Stream = None) -> Graph:
|
|
5024
5073
|
"""Ends the capture of a CUDA graph
|
|
@@ -5040,21 +5089,27 @@ def capture_end(device: Devicelike = None, stream: Stream = None) -> Graph:
|
|
|
5040
5089
|
raise RuntimeError("Must be a CUDA device")
|
|
5041
5090
|
stream = device.stream
|
|
5042
5091
|
|
|
5043
|
-
|
|
5092
|
+
# get the graph being captured
|
|
5093
|
+
graph = device.captures.get(stream)
|
|
5094
|
+
|
|
5095
|
+
if graph is None:
|
|
5044
5096
|
raise RuntimeError("Graph capture is not active on this stream")
|
|
5045
5097
|
|
|
5046
|
-
device.captures
|
|
5098
|
+
del device.captures[stream]
|
|
5099
|
+
del runtime.captures[graph.capture_id]
|
|
5047
5100
|
|
|
5048
|
-
graph
|
|
5049
|
-
|
|
5101
|
+
# get the graph executable
|
|
5102
|
+
graph_exec = ctypes.c_void_p()
|
|
5103
|
+
result = runtime.core.cuda_graph_end_capture(device.context, stream.cuda_stream, ctypes.byref(graph_exec))
|
|
5050
5104
|
|
|
5051
5105
|
if not result:
|
|
5052
5106
|
# A concrete error should've already been reported, so we don't need to go into details here
|
|
5053
5107
|
raise RuntimeError(f"CUDA graph capture failed. {runtime.get_error_string()}")
|
|
5054
5108
|
|
|
5055
|
-
#
|
|
5056
|
-
|
|
5057
|
-
|
|
5109
|
+
# set the graph executable
|
|
5110
|
+
graph.graph_exec = graph_exec
|
|
5111
|
+
|
|
5112
|
+
return graph
|
|
5058
5113
|
|
|
5059
5114
|
|
|
5060
5115
|
def capture_launch(graph: Graph, stream: Stream = None):
|
|
@@ -5073,7 +5128,7 @@ def capture_launch(graph: Graph, stream: Stream = None):
|
|
|
5073
5128
|
device = graph.device
|
|
5074
5129
|
stream = device.stream
|
|
5075
5130
|
|
|
5076
|
-
if not runtime.core.cuda_graph_launch(graph.
|
|
5131
|
+
if not runtime.core.cuda_graph_launch(graph.graph_exec, stream.cuda_stream):
|
|
5077
5132
|
raise RuntimeError(f"Graph launch error: {runtime.get_error_string()}")
|
|
5078
5133
|
|
|
5079
5134
|
|
|
@@ -5522,15 +5577,9 @@ def export_stubs(file): # pragma: no cover
|
|
|
5522
5577
|
if not f.export or f.hidden: # or f.generic:
|
|
5523
5578
|
continue
|
|
5524
5579
|
|
|
5525
|
-
|
|
5526
|
-
|
|
5527
|
-
|
|
5528
|
-
return_type = f.value_func(None, None)
|
|
5529
|
-
if return_type:
|
|
5530
|
-
return_str = " -> " + type_str(return_type)
|
|
5531
|
-
|
|
5532
|
-
except Exception:
|
|
5533
|
-
pass
|
|
5580
|
+
return_type = f.value_func(None, None)
|
|
5581
|
+
if return_type:
|
|
5582
|
+
return_str = " -> " + type_str(return_type)
|
|
5534
5583
|
|
|
5535
5584
|
print("@over", file=file)
|
|
5536
5585
|
print(f"def {f.key}({args}){return_str}:", file=file)
|
|
@@ -12,8 +12,9 @@
|
|
|
12
12
|
#
|
|
13
13
|
# Div[ d/dF Psi(F(u)) ] = 0
|
|
14
14
|
#
|
|
15
|
-
# with Dirichlet boundary conditions on vertical sides
|
|
16
|
-
#
|
|
15
|
+
# with Dirichlet boundary conditions on vertical sides and Psi an elastic potential function of the deformation gradient.
|
|
16
|
+
# Here we choose Psi Neo-Hookean, as per Sec 3.2 of "Stable Neo-Hookean Flesh Simulation" (Smith et al. 2018),
|
|
17
|
+
# Psi(F) = mu ||F||^2 + lambda (det J - 1 - mu/lambda)^2
|
|
17
18
|
#
|
|
18
19
|
# which we write as a sequence of Newton iterations:
|
|
19
20
|
# int {sigma : grad v} = 0 for all displacement test functions v
|
|
@@ -37,19 +38,28 @@ def displacement_gradient_form(
|
|
|
37
38
|
return wp.ddot(tau(s), fem.grad(u, s))
|
|
38
39
|
|
|
39
40
|
|
|
41
|
+
@wp.func
|
|
42
|
+
def nh_parameters_from_lame(lame: wp.vec2):
|
|
43
|
+
"""Parameters such that for small strains model behaves according to Hooke's law"""
|
|
44
|
+
mu_nh = lame[1]
|
|
45
|
+
lambda_nh = lame[0] + lame[1]
|
|
46
|
+
|
|
47
|
+
return mu_nh, lambda_nh
|
|
48
|
+
|
|
49
|
+
|
|
40
50
|
@fem.integrand
|
|
41
51
|
def nh_stress_form(s: fem.Sample, tau: fem.Field, u_cur: fem.Field, lame: wp.vec2):
|
|
42
52
|
"""d Psi/dF : tau"""
|
|
43
53
|
|
|
54
|
+
# Deformation gradient
|
|
44
55
|
F = wp.identity(n=2, dtype=float) + fem.grad(u_cur, s)
|
|
45
56
|
|
|
57
|
+
# Area term and its derivative w.r.t F
|
|
46
58
|
J = wp.determinant(F)
|
|
47
|
-
|
|
48
|
-
lambda_nh = lame[0] + lame[1]
|
|
49
|
-
gamma = 1.0 + mu_nh / lambda_nh
|
|
59
|
+
dJ_dF = wp.mat22(F[1, 1], -F[1, 0], -F[0, 1], F[0, 0])
|
|
50
60
|
|
|
51
|
-
|
|
52
|
-
nh_stress = mu_nh * F + lambda_nh * (J -
|
|
61
|
+
mu_nh, lambda_nh = nh_parameters_from_lame(lame)
|
|
62
|
+
nh_stress = mu_nh * F + (lambda_nh * (J - 1.0) - mu_nh) * dJ_dF
|
|
53
63
|
|
|
54
64
|
return wp.ddot(tau(s), nh_stress)
|
|
55
65
|
|
|
@@ -62,23 +72,11 @@ def nh_stress_delta_form(s: fem.Sample, tau: fem.Field, u: fem.Field, u_cur: fem
|
|
|
62
72
|
sigma_s = fem.grad(u, s)
|
|
63
73
|
|
|
64
74
|
F = wp.identity(n=2, dtype=float) + fem.grad(u_cur, s)
|
|
65
|
-
|
|
66
75
|
dJ_dF = wp.mat22(F[1, 1], -F[1, 0], -F[0, 1], F[0, 0])
|
|
67
76
|
|
|
68
|
-
|
|
69
|
-
lambda_nh = lame
|
|
70
|
-
|
|
71
|
-
dpsi_dpsi = mu_nh * wp.ddot(tau_s, sigma_s) + lambda_nh * wp.ddot(dJ_dF * tau_s, dJ_dF * sigma_s)
|
|
72
|
-
|
|
73
|
-
# positive part of d2J_dS2
|
|
74
|
-
gamma = 1.0 + mu_nh / lambda_nh
|
|
75
|
-
J = wp.determinant(F)
|
|
76
|
-
if J >= gamma:
|
|
77
|
-
d2J_dF_sig = wp.mat22(sigma_s[1, 1], 0.0, 0.0, sigma_s[0, 0])
|
|
78
|
-
else:
|
|
79
|
-
d2J_dF_sig = wp.mat22(0.0, -sigma_s[1, 0], -sigma_s[0, 1], 0.0)
|
|
80
|
-
|
|
81
|
-
return dpsi_dpsi + lambda_nh * (J - gamma) * wp.ddot(d2J_dF_sig, tau_s)
|
|
77
|
+
# Gauss--Newton approximation; ignore d2J/dF2 term
|
|
78
|
+
mu_nh, lambda_nh = nh_parameters_from_lame(lame)
|
|
79
|
+
return mu_nh * wp.ddot(tau_s, sigma_s) + lambda_nh * wp.ddot(dJ_dF, tau_s) * wp.ddot(dJ_dF, sigma_s)
|
|
82
80
|
|
|
83
81
|
|
|
84
82
|
@fem.integrand
|
|
@@ -114,6 +112,12 @@ def tensor_mass_form(
|
|
|
114
112
|
return wp.ddot(tau(s), sig(s))
|
|
115
113
|
|
|
116
114
|
|
|
115
|
+
@fem.integrand
|
|
116
|
+
def area_form(s: fem.Sample, u_cur: fem.Field):
|
|
117
|
+
F = wp.identity(n=2, dtype=float) + fem.grad(u_cur, s)
|
|
118
|
+
return wp.determinant(F)
|
|
119
|
+
|
|
120
|
+
|
|
117
121
|
class Example:
|
|
118
122
|
def __init__(
|
|
119
123
|
self,
|
|
@@ -228,6 +232,12 @@ class Example:
|
|
|
228
232
|
wp.utils.array_cast(in_array=x, out_array=delta_u)
|
|
229
233
|
fem.utils.array_axpy(x=delta_u, y=self._u_field.dof_values)
|
|
230
234
|
|
|
235
|
+
# Evaluate area conservation, should converge to 1.0 as Poisson ratio approaches 1.0
|
|
236
|
+
final_area = fem.integrate(
|
|
237
|
+
area_form, quadrature=fem.RegularQuadrature(domain, order=4), fields={"u_cur": self._u_field}
|
|
238
|
+
)
|
|
239
|
+
print(f"Area gain: {final_area} (using Poisson ratio={self._lame[0] / (self._lame[0] + 2.0*self._lame[1])})")
|
|
240
|
+
|
|
231
241
|
def render(self):
|
|
232
242
|
self.renderer.add_field("solution", self._u_field)
|
|
233
243
|
|
|
@@ -242,7 +252,7 @@ if __name__ == "__main__":
|
|
|
242
252
|
parser.add_argument("--resolution", type=int, default=25, help="Grid resolution.")
|
|
243
253
|
parser.add_argument("--degree", type=int, default=2, help="Polynomial degree of shape functions.")
|
|
244
254
|
parser.add_argument("--displacement", type=float, default=-0.5)
|
|
245
|
-
parser.add_argument("--poisson_ratio", type=float, default=0.
|
|
255
|
+
parser.add_argument("--poisson_ratio", type=float, default=0.99)
|
|
246
256
|
parser.add_argument("--mesh", choices=("grid", "tri", "quad"), default="grid", help="Mesh type")
|
|
247
257
|
parser.add_argument(
|
|
248
258
|
"--nonconforming_stresses", action="store_true", help="For grid, use non-conforming stresses (Q_d/P_d)"
|
warp/fem/field/nodal_field.py
CHANGED
|
@@ -247,7 +247,7 @@ class NodalFieldBase(DiscreteField):
|
|
|
247
247
|
|
|
248
248
|
def _make_node_partition_index(self):
|
|
249
249
|
@cache.dynamic_func(suffix=self.name)
|
|
250
|
-
def node_partition_index(args: self.
|
|
250
|
+
def node_partition_index(args: self.ElementEvalArg, node_index: int):
|
|
251
251
|
return self.space_partition.partition_node_index(args.eval_arg.partition_arg, node_index)
|
|
252
252
|
|
|
253
253
|
return node_partition_index
|
|
@@ -336,6 +336,7 @@ class ExplicitQuadrature(Quadrature):
|
|
|
336
336
|
@cache.cached_arg_value
|
|
337
337
|
def arg_value(self, device):
|
|
338
338
|
arg = self.Arg()
|
|
339
|
+
arg.points_per_cell = self._points_per_cell
|
|
339
340
|
arg.points = self._points.to(device)
|
|
340
341
|
arg.weights = self._weights.to(device)
|
|
341
342
|
|
warp/native/builtin.h
CHANGED
|
@@ -748,7 +748,7 @@ inline CUDA_CALLABLE half floordiv(half a, half b)
|
|
|
748
748
|
#if FP_CHECK
|
|
749
749
|
if (!isfinite(a) || !isfinite(b) || float(b) == 0.0f)
|
|
750
750
|
{
|
|
751
|
-
printf("%s:%d
|
|
751
|
+
printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, float(a), float(b));
|
|
752
752
|
assert(0);
|
|
753
753
|
}
|
|
754
754
|
#endif
|
|
@@ -759,7 +759,7 @@ inline CUDA_CALLABLE float floordiv(float a, float b)
|
|
|
759
759
|
#if FP_CHECK
|
|
760
760
|
if (!isfinite(a) || !isfinite(b) || b == 0.0f)
|
|
761
761
|
{
|
|
762
|
-
printf("%s:%d
|
|
762
|
+
printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, a, b);
|
|
763
763
|
assert(0);
|
|
764
764
|
}
|
|
765
765
|
#endif
|
|
@@ -770,7 +770,7 @@ inline CUDA_CALLABLE double floordiv(double a, double b)
|
|
|
770
770
|
#if FP_CHECK
|
|
771
771
|
if (!isfinite(a) || !isfinite(b) || b == 0.0)
|
|
772
772
|
{
|
|
773
|
-
printf("%s:%d
|
|
773
|
+
printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, a, b);
|
|
774
774
|
assert(0);
|
|
775
775
|
}
|
|
776
776
|
#endif
|
warp/native/bvh.h
CHANGED
|
@@ -320,7 +320,7 @@ CUDA_CALLABLE inline bvh_query_t bvh_query_aabb(
|
|
|
320
320
|
CUDA_CALLABLE inline bvh_query_t bvh_query_ray(
|
|
321
321
|
uint64_t id, const vec3& start, const vec3& dir)
|
|
322
322
|
{
|
|
323
|
-
return bvh_query(id, true, start, dir);
|
|
323
|
+
return bvh_query(id, true, start, 1.0f / dir);
|
|
324
324
|
}
|
|
325
325
|
|
|
326
326
|
//Stub
|