warp-lang 1.3.1__py3-none-macosx_10_13_universal2.whl → 1.3.3__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

warp/context.py CHANGED
@@ -1411,12 +1411,65 @@ class ModuleBuilder:
1411
1411
  return source
1412
1412
 
1413
1413
 
1414
+ # ModuleExec holds the compiled executable code for a specific device.
1415
+ # It can be used to obtain kernel hooks on that device and serves
1416
+ # as a reference-counted wrapper of the loaded module.
1417
+ # Clients can keep a reference to a ModuleExec object to prevent the
1418
+ # executable code from being unloaded prematurely.
1419
+ # For example, the Graph class retains references to all the CUDA modules
1420
+ # needed by a graph. This ensures that graphs remain valid even if
1421
+ # the original Modules get reloaded.
1422
+ class ModuleExec:
1423
+ def __new__(cls, *args, **kwargs):
1424
+ instance = super(ModuleExec, cls).__new__(cls)
1425
+ instance.handle = None
1426
+ return instance
1427
+
1428
+ def __init__(self, handle, device):
1429
+ self.handle = handle
1430
+ self.device = device
1431
+ self.kernel_hooks = {}
1432
+
1433
+ # release the loaded module
1434
+ def __del__(self):
1435
+ if self.handle is not None:
1436
+ if self.device.is_cuda:
1437
+ # use CUDA context guard to avoid side effects during garbage collection
1438
+ with self.device.context_guard:
1439
+ runtime.core.cuda_unload_module(self.device.context, self.handle)
1440
+ else:
1441
+ runtime.llvm.unload_obj(self.handle.encode("utf-8"))
1442
+
1443
+ # lookup and cache kernel entry points
1444
+ def get_kernel_hooks(self, kernel):
1445
+ hooks = self.kernel_hooks.get(kernel)
1446
+ if hooks is not None:
1447
+ return hooks
1448
+
1449
+ name = kernel.get_mangled_name()
1450
+
1451
+ if self.device.is_cuda:
1452
+ forward = runtime.core.cuda_get_kernel(
1453
+ self.device.context, self.handle, (name + "_cuda_kernel_forward").encode("utf-8")
1454
+ )
1455
+ backward = runtime.core.cuda_get_kernel(
1456
+ self.device.context, self.handle, (name + "_cuda_kernel_backward").encode("utf-8")
1457
+ )
1458
+ else:
1459
+ func = ctypes.CFUNCTYPE(None)
1460
+ forward = func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8")))
1461
+ backward = func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8")))
1462
+
1463
+ hooks = KernelHooks(forward, backward)
1464
+ self.kernel_hooks[kernel] = hooks
1465
+
1466
+ return hooks
1467
+
1468
+
1414
1469
  # -----------------------------------------------------
1415
1470
  # stores all functions and kernels for a Python module
1416
1471
  # creates a hash of the function to use for checking
1417
1472
  # build cache
1418
-
1419
-
1420
1473
  class Module:
1421
1474
  def __init__(self, name, loader):
1422
1475
  self.name = name
@@ -1427,8 +1480,8 @@ class Module:
1427
1480
  self.constants = {} # Any constants referenced in this module including those defined in other modules
1428
1481
  self.structs = {}
1429
1482
 
1430
- self.cpu_module = None
1431
- self.cuda_modules = {} # module lookup by CUDA context
1483
+ self.cpu_exec = None # executable CPU module
1484
+ self.cuda_execs = {} # executable CUDA module lookup by CUDA context
1432
1485
 
1433
1486
  self.cpu_build_failed = False
1434
1487
  self.cuda_build_failed = False
@@ -1441,11 +1494,6 @@ class Module:
1441
1494
  "mode": warp.config.mode,
1442
1495
  }
1443
1496
 
1444
- # kernel hook lookup per device
1445
- # hooks are stored with the module so they can be easily cleared when the module is reloaded.
1446
- # -> See ``Module.get_kernel_hooks()``
1447
- self.kernel_hooks = {}
1448
-
1449
1497
  # Module dependencies are determined by scanning each function
1450
1498
  # and kernel for references to external functions and structs.
1451
1499
  #
@@ -1558,10 +1606,13 @@ class Module:
1558
1606
  computed ``content_hash`` will be used.
1559
1607
  """
1560
1608
 
1561
- def get_type_name(type_hint):
1609
+ def get_type_name(type_hint) -> str:
1562
1610
  if isinstance(type_hint, warp.codegen.Struct):
1563
1611
  return get_type_name(type_hint.cls)
1564
- return type_hint
1612
+ elif isinstance(type_hint, warp.array) and isinstance(type_hint.dtype, warp.codegen.Struct):
1613
+ return f"array{get_type_name(type_hint.dtype)}"
1614
+
1615
+ return str(type_hint)
1565
1616
 
1566
1617
  def hash_recursive(module, visited):
1567
1618
  # Hash this module, including all referenced modules recursively.
@@ -1682,27 +1733,26 @@ class Module:
1682
1733
 
1683
1734
  return hash_recursive(self, visited=set())
1684
1735
 
1685
- def load(self, device) -> bool:
1686
- from warp.utils import ScopedTimer
1687
-
1688
- device = get_device(device)
1736
+ def load(self, device) -> ModuleExec:
1737
+ device = runtime.get_device(device)
1689
1738
 
1690
1739
  if device.is_cpu:
1691
1740
  # check if already loaded
1692
- if self.cpu_module:
1693
- return True
1741
+ if self.cpu_exec:
1742
+ return self.cpu_exec
1694
1743
  # avoid repeated build attempts
1695
1744
  if self.cpu_build_failed:
1696
- return False
1745
+ return None
1697
1746
  if not warp.is_cpu_available():
1698
1747
  raise RuntimeError("Failed to build CPU module because no CPU buildchain was found")
1699
1748
  else:
1700
1749
  # check if already loaded
1701
- if device.context in self.cuda_modules:
1702
- return True
1750
+ cuda_exec = self.cuda_execs.get(device.context)
1751
+ if cuda_exec is not None:
1752
+ return cuda_exec
1703
1753
  # avoid repeated build attempts
1704
1754
  if self.cuda_build_failed:
1705
- return False
1755
+ return None
1706
1756
  if not warp.is_cuda_available():
1707
1757
  raise RuntimeError("Failed to build CUDA module because CUDA is not available")
1708
1758
 
@@ -1712,7 +1762,7 @@ class Module:
1712
1762
  # use a unique module path using the module short hash
1713
1763
  module_dir = os.path.join(warp.config.kernel_cache_dir, f"{module_name}_{module_hash.hex()[:7]}")
1714
1764
 
1715
- with ScopedTimer(
1765
+ with warp.ScopedTimer(
1716
1766
  f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'", active=not warp.config.quiet
1717
1767
  ) as module_load_timer:
1718
1768
  # -----------------------------------------------------------
@@ -1784,7 +1834,7 @@ class Module:
1784
1834
  output_path = os.path.join(build_dir, output_name)
1785
1835
 
1786
1836
  # build object code
1787
- with ScopedTimer("Compile x86", active=warp.config.verbose):
1837
+ with warp.ScopedTimer("Compile x86", active=warp.config.verbose):
1788
1838
  warp.build.build_cpu(
1789
1839
  output_path,
1790
1840
  source_code_path,
@@ -1812,7 +1862,7 @@ class Module:
1812
1862
  output_path = os.path.join(build_dir, output_name)
1813
1863
 
1814
1864
  # generate PTX or CUBIN
1815
- with ScopedTimer("Compile CUDA", active=warp.config.verbose):
1865
+ with warp.ScopedTimer("Compile CUDA", active=warp.config.verbose):
1816
1866
  warp.build.build_cuda(
1817
1867
  source_code_path,
1818
1868
  output_arch,
@@ -1865,12 +1915,14 @@ class Module:
1865
1915
  # Load CPU or CUDA binary
1866
1916
  if device.is_cpu:
1867
1917
  runtime.llvm.load_obj(binary_path.encode("utf-8"), module_name.encode("utf-8"))
1868
- self.cpu_module = module_name
1918
+ module_exec = ModuleExec(module_name, device)
1919
+ self.cpu_exec = module_exec
1869
1920
 
1870
1921
  elif device.is_cuda:
1871
1922
  cuda_module = warp.build.load_cuda(binary_path, device)
1872
1923
  if cuda_module is not None:
1873
- self.cuda_modules[device.context] = cuda_module
1924
+ module_exec = ModuleExec(cuda_module, device)
1925
+ self.cuda_execs[device.context] = module_exec
1874
1926
  else:
1875
1927
  module_load_timer.extra_msg = " (error)"
1876
1928
  raise Exception(f"Failed to load CUDA module '{self.name}'")
@@ -1881,65 +1933,27 @@ class Module:
1881
1933
  # clean up build_dir used for this process regardless
1882
1934
  shutil.rmtree(build_dir, ignore_errors=True)
1883
1935
 
1884
- return True
1936
+ return module_exec
1885
1937
 
1886
1938
  def unload(self):
1887
- if self.cpu_module:
1888
- runtime.llvm.unload_obj(self.cpu_module.encode("utf-8"))
1889
- self.cpu_module = None
1890
-
1891
- # need to unload the CUDA module from all CUDA contexts where it is loaded
1892
- # note: we ensure that this doesn't change the current CUDA context
1893
- if self.cuda_modules:
1894
- saved_context = runtime.core.cuda_context_get_current()
1895
- for context, module in self.cuda_modules.items():
1896
- device = runtime.context_map[context]
1897
- if device.is_capturing:
1898
- raise RuntimeError(f"Failed to unload CUDA module '{self.name}' because graph capture is active")
1899
- runtime.core.cuda_unload_module(context, module)
1900
- runtime.core.cuda_context_set_current(saved_context)
1901
- self.cuda_modules = {}
1902
-
1903
- # clear kernel hooks
1904
- self.kernel_hooks = {}
1939
+ # clear loaded modules
1940
+ self.cpu_exec = None
1941
+ self.cuda_execs = {}
1905
1942
 
1906
1943
  # clear content hash
1907
1944
  self.content_hash = None
1908
1945
 
1909
- # lookup and cache kernel entry points based on name, called after compilation / module load
1946
+ # lookup kernel entry points based on name, called after compilation / module load
1910
1947
  def get_kernel_hooks(self, kernel, device):
1911
- # get all hooks for this device
1912
- device_hooks = self.kernel_hooks.get(device.context)
1913
- if device_hooks is None:
1914
- self.kernel_hooks[device.context] = device_hooks = {}
1915
-
1916
- # look up this kernel
1917
- hooks = device_hooks.get(kernel)
1918
- if hooks is not None:
1919
- return hooks
1920
-
1921
- name = kernel.get_mangled_name()
1922
-
1923
- if device.is_cpu:
1924
- func = ctypes.CFUNCTYPE(None)
1925
- forward = func(
1926
- runtime.llvm.lookup(self.cpu_module.encode("utf-8"), (name + "_cpu_forward").encode("utf-8"))
1927
- )
1928
- backward = func(
1929
- runtime.llvm.lookup(self.cpu_module.encode("utf-8"), (name + "_cpu_backward").encode("utf-8"))
1930
- )
1948
+ if device.is_cuda:
1949
+ module_exec = self.cuda_execs.get(device.context)
1931
1950
  else:
1932
- cu_module = self.cuda_modules[device.context]
1933
- forward = runtime.core.cuda_get_kernel(
1934
- device.context, cu_module, (name + "_cuda_kernel_forward").encode("utf-8")
1935
- )
1936
- backward = runtime.core.cuda_get_kernel(
1937
- device.context, cu_module, (name + "_cuda_kernel_backward").encode("utf-8")
1938
- )
1951
+ module_exec = self.cpu_exec
1939
1952
 
1940
- hooks = KernelHooks(forward, backward)
1941
- device_hooks[kernel] = hooks
1942
- return hooks
1953
+ if module_exec is not None:
1954
+ return module_exec.get_kernel_hooks(kernel)
1955
+ else:
1956
+ raise RuntimeError(f"Module is not loaded on device {device}")
1943
1957
 
1944
1958
 
1945
1959
  # -------------------------------------------
@@ -2196,8 +2210,8 @@ class Device:
2196
2210
  self._stream = None
2197
2211
  self.null_stream = None
2198
2212
 
2199
- # set of streams where capture has started
2200
- self.captures = set()
2213
+ # maps streams to started graph captures
2214
+ self.captures = {}
2201
2215
 
2202
2216
  self.context_guard = ContextGuard(self)
2203
2217
 
@@ -2434,20 +2448,25 @@ Devicelike = Union[Device, str, None]
2434
2448
  class Graph:
2435
2449
  def __new__(cls, *args, **kwargs):
2436
2450
  instance = super(Graph, cls).__new__(cls)
2437
- instance.exec = None
2451
+ instance.graph_exec = None
2438
2452
  return instance
2439
2453
 
2440
- def __init__(self, device: Device, exec: ctypes.c_void_p):
2454
+ def __init__(self, device: Device, capture_id: int):
2441
2455
  self.device = device
2442
- self.exec = exec
2456
+ self.capture_id = capture_id
2457
+ self.module_execs = set()
2443
2458
 
2444
2459
  def __del__(self):
2445
- if not self.exec:
2460
+ if not self.graph_exec:
2446
2461
  return
2447
2462
 
2448
2463
  # use CUDA context guard to avoid side effects during garbage collection
2449
2464
  with self.device.context_guard:
2450
- runtime.core.cuda_graph_destroy(self.device.context, self.exec)
2465
+ runtime.core.cuda_graph_destroy(self.device.context, self.graph_exec)
2466
+
2467
+ # retain executable CUDA modules used by this graph, which prevents them from being unloaded
2468
+ def retain_module_exec(self, module_exec: ModuleExec):
2469
+ self.module_execs.add(module_exec)
2451
2470
 
2452
2471
 
2453
2472
  class Runtime:
@@ -2488,6 +2507,9 @@ class Runtime:
2488
2507
  else:
2489
2508
  self.llvm = None
2490
2509
 
2510
+ # maps capture ids to graphs
2511
+ self.captures = {}
2512
+
2491
2513
  # setup c-types for warp.dll
2492
2514
  try:
2493
2515
  self.core.get_error_string.argtypes = []
@@ -3023,6 +3045,8 @@ class Runtime:
3023
3045
  self.core.cuda_stream_wait_stream.restype = None
3024
3046
  self.core.cuda_stream_is_capturing.argtypes = [ctypes.c_void_p]
3025
3047
  self.core.cuda_stream_is_capturing.restype = ctypes.c_int
3048
+ self.core.cuda_stream_get_capture_id.argtypes = [ctypes.c_void_p]
3049
+ self.core.cuda_stream_get_capture_id.restype = ctypes.c_uint64
3026
3050
 
3027
3051
  self.core.cuda_event_create.argtypes = [ctypes.c_void_p, ctypes.c_uint]
3028
3052
  self.core.cuda_event_create.restype = ctypes.c_void_p
@@ -4490,13 +4514,14 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
4490
4514
  # so that launches can be replayed quickly, use `wp.launch(..., record_cmd=True)`
4491
4515
  class Launch:
4492
4516
  def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0):
4517
+ # retain the module executable so it doesn't get unloaded
4518
+ self.module_exec = kernel.module.load(device)
4519
+ if not self.module_exec:
4520
+ raise RuntimeError(f"Failed to load module {kernel.module.name} on device {device}")
4521
+
4493
4522
  # if not specified look up hooks
4494
4523
  if not hooks:
4495
- module = kernel.module
4496
- if not module.load(device):
4497
- return
4498
-
4499
- hooks = module.get_kernel_hooks(kernel, device)
4524
+ hooks = self.module_exec.get_kernel_hooks(kernel)
4500
4525
 
4501
4526
  # if not specified set a zero bound
4502
4527
  if not bounds:
@@ -4594,6 +4619,15 @@ class Launch:
4594
4619
  else:
4595
4620
  if stream is None:
4596
4621
  stream = self.device.stream
4622
+
4623
+ # If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
4624
+ # before the captured graph is released.
4625
+ if runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
4626
+ capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
4627
+ graph = runtime.captures.get(capture_id)
4628
+ if graph is not None:
4629
+ graph.retain_module_exec(self.module_exec)
4630
+
4597
4631
  runtime.core.cuda_launch_kernel(
4598
4632
  self.device.context,
4599
4633
  self.hooks.forward,
@@ -4689,12 +4723,12 @@ def launch(
4689
4723
  kernel = kernel.add_overload(fwd_types)
4690
4724
 
4691
4725
  # delay load modules, including new overload if needed
4692
- module = kernel.module
4693
- if not module.load(device):
4726
+ module_exec = kernel.module.load(device)
4727
+ if not module_exec:
4694
4728
  return
4695
4729
 
4696
4730
  # late bind
4697
- hooks = module.get_kernel_hooks(kernel, device)
4731
+ hooks = module_exec.get_kernel_hooks(kernel)
4698
4732
 
4699
4733
  pack_args(fwd_args, params)
4700
4734
  pack_args(adj_args, params, adjoint=True)
@@ -4730,6 +4764,14 @@ def launch(
4730
4764
  if stream is None:
4731
4765
  stream = device.stream
4732
4766
 
4767
+ # If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
4768
+ # before the captured graph is released.
4769
+ if runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
4770
+ capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
4771
+ graph = runtime.captures.get(capture_id)
4772
+ if graph is not None:
4773
+ graph.retain_module_exec(module_exec)
4774
+
4733
4775
  if adjoint:
4734
4776
  if hooks.backward is None:
4735
4777
  raise RuntimeError(
@@ -5014,11 +5056,18 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=None
5014
5056
  if force_module_load:
5015
5057
  force_load(device)
5016
5058
 
5017
- device.captures.add(stream)
5018
-
5019
5059
  if not runtime.core.cuda_graph_begin_capture(device.context, stream.cuda_stream, int(external)):
5020
5060
  raise RuntimeError(runtime.get_error_string())
5021
5061
 
5062
+ capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
5063
+ graph = Graph(device, capture_id)
5064
+
5065
+ # add to ongoing captures on the device
5066
+ device.captures[stream] = graph
5067
+
5068
+ # add to lookup table by globally unique capture id
5069
+ runtime.captures[capture_id] = graph
5070
+
5022
5071
 
5023
5072
  def capture_end(device: Devicelike = None, stream: Stream = None) -> Graph:
5024
5073
  """Ends the capture of a CUDA graph
@@ -5040,21 +5089,27 @@ def capture_end(device: Devicelike = None, stream: Stream = None) -> Graph:
5040
5089
  raise RuntimeError("Must be a CUDA device")
5041
5090
  stream = device.stream
5042
5091
 
5043
- if stream not in device.captures:
5092
+ # get the graph being captured
5093
+ graph = device.captures.get(stream)
5094
+
5095
+ if graph is None:
5044
5096
  raise RuntimeError("Graph capture is not active on this stream")
5045
5097
 
5046
- device.captures.remove(stream)
5098
+ del device.captures[stream]
5099
+ del runtime.captures[graph.capture_id]
5047
5100
 
5048
- graph = ctypes.c_void_p()
5049
- result = runtime.core.cuda_graph_end_capture(device.context, stream.cuda_stream, ctypes.byref(graph))
5101
+ # get the graph executable
5102
+ graph_exec = ctypes.c_void_p()
5103
+ result = runtime.core.cuda_graph_end_capture(device.context, stream.cuda_stream, ctypes.byref(graph_exec))
5050
5104
 
5051
5105
  if not result:
5052
5106
  # A concrete error should've already been reported, so we don't need to go into details here
5053
5107
  raise RuntimeError(f"CUDA graph capture failed. {runtime.get_error_string()}")
5054
5108
 
5055
- # note that for external captures, we do not return a graph, because we don't instantiate it ourselves
5056
- if graph:
5057
- return Graph(device, graph)
5109
+ # set the graph executable
5110
+ graph.graph_exec = graph_exec
5111
+
5112
+ return graph
5058
5113
 
5059
5114
 
5060
5115
  def capture_launch(graph: Graph, stream: Stream = None):
@@ -5073,7 +5128,7 @@ def capture_launch(graph: Graph, stream: Stream = None):
5073
5128
  device = graph.device
5074
5129
  stream = device.stream
5075
5130
 
5076
- if not runtime.core.cuda_graph_launch(graph.exec, stream.cuda_stream):
5131
+ if not runtime.core.cuda_graph_launch(graph.graph_exec, stream.cuda_stream):
5077
5132
  raise RuntimeError(f"Graph launch error: {runtime.get_error_string()}")
5078
5133
 
5079
5134
 
@@ -5522,15 +5577,9 @@ def export_stubs(file): # pragma: no cover
5522
5577
  if not f.export or f.hidden: # or f.generic:
5523
5578
  continue
5524
5579
 
5525
- try:
5526
- # todo: construct a default value for each of the functions args
5527
- # so we can generate the return type for overloaded functions
5528
- return_type = f.value_func(None, None)
5529
- if return_type:
5530
- return_str = " -> " + type_str(return_type)
5531
-
5532
- except Exception:
5533
- pass
5580
+ return_type = f.value_func(None, None)
5581
+ if return_type:
5582
+ return_str = " -> " + type_str(return_type)
5534
5583
 
5535
5584
  print("@over", file=file)
5536
5585
  print(f"def {f.key}({args}){return_str}:", file=file)
@@ -12,8 +12,9 @@
12
12
  #
13
13
  # Div[ d/dF Psi(F(u)) ] = 0
14
14
  #
15
- # with Dirichlet boundary conditions on vertical sides,
16
- # and Psi an elastic potential function of the deformation gradient (here Neo-Hookean)
15
+ # with Dirichlet boundary conditions on vertical sides and Psi an elastic potential function of the deformation gradient.
16
+ # Here we choose Psi Neo-Hookean, as per Sec 3.2 of "Stable Neo-Hookean Flesh Simulation" (Smith et al. 2018),
17
+ # Psi(F) = mu/2 ||F||^2 + lambda/2 (det F - 1 - mu/lambda)^2
17
18
  #
18
19
  # which we write as a sequence of Newton iterations:
19
20
  # int {sigma : grad v} = 0 for all displacement test functions v
@@ -37,19 +38,28 @@ def displacement_gradient_form(
37
38
  return wp.ddot(tau(s), fem.grad(u, s))
38
39
 
39
40
 
41
+ @wp.func
42
+ def nh_parameters_from_lame(lame: wp.vec2):
43
+ """Parameters such that for small strains model behaves according to Hooke's law"""
44
+ mu_nh = lame[1]
45
+ lambda_nh = lame[0] + lame[1]
46
+
47
+ return mu_nh, lambda_nh
48
+
49
+
40
50
  @fem.integrand
41
51
  def nh_stress_form(s: fem.Sample, tau: fem.Field, u_cur: fem.Field, lame: wp.vec2):
42
52
  """d Psi/dF : tau"""
43
53
 
54
+ # Deformation gradient
44
55
  F = wp.identity(n=2, dtype=float) + fem.grad(u_cur, s)
45
56
 
57
+ # Area term and its derivative w.r.t F
46
58
  J = wp.determinant(F)
47
- mu_nh = 2.0 * lame[1]
48
- lambda_nh = lame[0] + lame[1]
49
- gamma = 1.0 + mu_nh / lambda_nh
59
+ dJ_dF = wp.mat22(F[1, 1], -F[1, 0], -F[0, 1], F[0, 0])
50
60
 
51
- dJ_dS = wp.mat22(F[1, 1], -F[1, 0], -F[0, 1], F[0, 0])
52
- nh_stress = mu_nh * F + lambda_nh * (J - gamma) * dJ_dS
61
+ mu_nh, lambda_nh = nh_parameters_from_lame(lame)
62
+ nh_stress = mu_nh * F + (lambda_nh * (J - 1.0) - mu_nh) * dJ_dF
53
63
 
54
64
  return wp.ddot(tau(s), nh_stress)
55
65
 
@@ -62,23 +72,11 @@ def nh_stress_delta_form(s: fem.Sample, tau: fem.Field, u: fem.Field, u_cur: fem
62
72
  sigma_s = fem.grad(u, s)
63
73
 
64
74
  F = wp.identity(n=2, dtype=float) + fem.grad(u_cur, s)
65
-
66
75
  dJ_dF = wp.mat22(F[1, 1], -F[1, 0], -F[0, 1], F[0, 0])
67
76
 
68
- mu_nh = 2.0 * lame[1]
69
- lambda_nh = lame[0] + lame[1]
70
-
71
- dpsi_dpsi = mu_nh * wp.ddot(tau_s, sigma_s) + lambda_nh * wp.ddot(dJ_dF * tau_s, dJ_dF * sigma_s)
72
-
73
- # positive part of d2J_dS2
74
- gamma = 1.0 + mu_nh / lambda_nh
75
- J = wp.determinant(F)
76
- if J >= gamma:
77
- d2J_dF_sig = wp.mat22(sigma_s[1, 1], 0.0, 0.0, sigma_s[0, 0])
78
- else:
79
- d2J_dF_sig = wp.mat22(0.0, -sigma_s[1, 0], -sigma_s[0, 1], 0.0)
80
-
81
- return dpsi_dpsi + lambda_nh * (J - gamma) * wp.ddot(d2J_dF_sig, tau_s)
77
+ # Gauss--Newton approximation; ignore d2J/dF2 term
78
+ mu_nh, lambda_nh = nh_parameters_from_lame(lame)
79
+ return mu_nh * wp.ddot(tau_s, sigma_s) + lambda_nh * wp.ddot(dJ_dF, tau_s) * wp.ddot(dJ_dF, sigma_s)
82
80
 
83
81
 
84
82
  @fem.integrand
@@ -114,6 +112,12 @@ def tensor_mass_form(
114
112
  return wp.ddot(tau(s), sig(s))
115
113
 
116
114
 
115
+ @fem.integrand
116
+ def area_form(s: fem.Sample, u_cur: fem.Field):
117
+ F = wp.identity(n=2, dtype=float) + fem.grad(u_cur, s)
118
+ return wp.determinant(F)
119
+
120
+
117
121
  class Example:
118
122
  def __init__(
119
123
  self,
@@ -228,6 +232,12 @@ class Example:
228
232
  wp.utils.array_cast(in_array=x, out_array=delta_u)
229
233
  fem.utils.array_axpy(x=delta_u, y=self._u_field.dof_values)
230
234
 
235
+ # Evaluate area conservation, should converge to 1.0 as Poisson ratio approaches 1.0
236
+ final_area = fem.integrate(
237
+ area_form, quadrature=fem.RegularQuadrature(domain, order=4), fields={"u_cur": self._u_field}
238
+ )
239
+ print(f"Area gain: {final_area} (using Poisson ratio={self._lame[0] / (self._lame[0] + 2.0*self._lame[1])})")
240
+
231
241
  def render(self):
232
242
  self.renderer.add_field("solution", self._u_field)
233
243
 
@@ -242,7 +252,7 @@ if __name__ == "__main__":
242
252
  parser.add_argument("--resolution", type=int, default=25, help="Grid resolution.")
243
253
  parser.add_argument("--degree", type=int, default=2, help="Polynomial degree of shape functions.")
244
254
  parser.add_argument("--displacement", type=float, default=-0.5)
245
- parser.add_argument("--poisson_ratio", type=float, default=0.5)
255
+ parser.add_argument("--poisson_ratio", type=float, default=0.99)
246
256
  parser.add_argument("--mesh", choices=("grid", "tri", "quad"), default="grid", help="Mesh type")
247
257
  parser.add_argument(
248
258
  "--nonconforming_stresses", action="store_true", help="For grid, use non-conforming stresses (Q_d/P_d)"
@@ -247,7 +247,7 @@ class NodalFieldBase(DiscreteField):
247
247
 
248
248
  def _make_node_partition_index(self):
249
249
  @cache.dynamic_func(suffix=self.name)
250
- def node_partition_index(args: self.EvalArg, node_index: int):
250
+ def node_partition_index(args: self.ElementEvalArg, node_index: int):
251
251
  return self.space_partition.partition_node_index(args.eval_arg.partition_arg, node_index)
252
252
 
253
253
  return node_partition_index
@@ -336,6 +336,7 @@ class ExplicitQuadrature(Quadrature):
336
336
  @cache.cached_arg_value
337
337
  def arg_value(self, device):
338
338
  arg = self.Arg()
339
+ arg.points_per_cell = self._points_per_cell
339
340
  arg.points = self._points.to(device)
340
341
  arg.weights = self._weights.to(device)
341
342
 
warp/native/builtin.h CHANGED
@@ -748,7 +748,7 @@ inline CUDA_CALLABLE half floordiv(half a, half b)
748
748
  #if FP_CHECK
749
749
  if (!isfinite(a) || !isfinite(b) || float(b) == 0.0f)
750
750
  {
751
- printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, float(a), float(b));
751
+ printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, float(a), float(b));
752
752
  assert(0);
753
753
  }
754
754
  #endif
@@ -759,7 +759,7 @@ inline CUDA_CALLABLE float floordiv(float a, float b)
759
759
  #if FP_CHECK
760
760
  if (!isfinite(a) || !isfinite(b) || b == 0.0f)
761
761
  {
762
- printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b);
762
+ printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, a, b);
763
763
  assert(0);
764
764
  }
765
765
  #endif
@@ -770,7 +770,7 @@ inline CUDA_CALLABLE double floordiv(double a, double b)
770
770
  #if FP_CHECK
771
771
  if (!isfinite(a) || !isfinite(b) || b == 0.0)
772
772
  {
773
- printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b);
773
+ printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, a, b);
774
774
  assert(0);
775
775
  }
776
776
  #endif
warp/native/bvh.h CHANGED
@@ -320,7 +320,7 @@ CUDA_CALLABLE inline bvh_query_t bvh_query_aabb(
320
320
  CUDA_CALLABLE inline bvh_query_t bvh_query_ray(
321
321
  uint64_t id, const vec3& start, const vec3& dir)
322
322
  {
323
- return bvh_query(id, true, start, dir);
323
+ return bvh_query(id, true, start, 1.0f / dir);
324
324
  }
325
325
 
326
326
  //Stub