warp-lang 1.8.1-py3-none-macosx_10_13_universal2.whl → 1.9.0-py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.

This version of warp-lang has been flagged as potentially problematic.

Files changed (134)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/libwarp-clang.dylib +0 -0
  4. warp/bin/libwarp.dylib +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +47 -67
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +312 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1249 -784
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/fabric.py +1 -1
  18. warp/fem/cache.py +27 -19
  19. warp/fem/domain.py +2 -2
  20. warp/fem/field/nodal_field.py +2 -2
  21. warp/fem/field/virtual.py +264 -166
  22. warp/fem/geometry/geometry.py +5 -5
  23. warp/fem/integrate.py +129 -51
  24. warp/fem/space/restriction.py +4 -0
  25. warp/fem/space/shape/tet_shape_function.py +3 -10
  26. warp/jax_experimental/custom_call.py +1 -1
  27. warp/jax_experimental/ffi.py +2 -1
  28. warp/marching_cubes.py +708 -0
  29. warp/native/array.h +99 -4
  30. warp/native/builtin.h +82 -5
  31. warp/native/bvh.cpp +64 -28
  32. warp/native/bvh.cu +58 -58
  33. warp/native/bvh.h +2 -2
  34. warp/native/clang/clang.cpp +7 -7
  35. warp/native/coloring.cpp +8 -2
  36. warp/native/crt.cpp +2 -2
  37. warp/native/crt.h +3 -5
  38. warp/native/cuda_util.cpp +41 -10
  39. warp/native/cuda_util.h +10 -4
  40. warp/native/exports.h +1842 -1908
  41. warp/native/fabric.h +2 -1
  42. warp/native/hashgrid.cpp +37 -37
  43. warp/native/hashgrid.cu +2 -2
  44. warp/native/initializer_array.h +1 -1
  45. warp/native/intersect.h +2 -2
  46. warp/native/mat.h +1910 -116
  47. warp/native/mathdx.cpp +43 -43
  48. warp/native/mesh.cpp +24 -24
  49. warp/native/mesh.cu +26 -26
  50. warp/native/mesh.h +4 -2
  51. warp/native/nanovdb/GridHandle.h +179 -12
  52. warp/native/nanovdb/HostBuffer.h +8 -7
  53. warp/native/nanovdb/NanoVDB.h +517 -895
  54. warp/native/nanovdb/NodeManager.h +323 -0
  55. warp/native/nanovdb/PNanoVDB.h +2 -2
  56. warp/native/quat.h +331 -14
  57. warp/native/range.h +7 -1
  58. warp/native/reduce.cpp +10 -10
  59. warp/native/reduce.cu +13 -14
  60. warp/native/runlength_encode.cpp +2 -2
  61. warp/native/runlength_encode.cu +5 -5
  62. warp/native/scan.cpp +3 -3
  63. warp/native/scan.cu +4 -4
  64. warp/native/sort.cpp +10 -10
  65. warp/native/sort.cu +22 -22
  66. warp/native/sparse.cpp +8 -8
  67. warp/native/sparse.cu +13 -13
  68. warp/native/spatial.h +366 -17
  69. warp/native/temp_buffer.h +2 -2
  70. warp/native/tile.h +283 -69
  71. warp/native/vec.h +381 -14
  72. warp/native/volume.cpp +54 -54
  73. warp/native/volume.cu +1 -1
  74. warp/native/volume.h +2 -1
  75. warp/native/volume_builder.cu +30 -37
  76. warp/native/warp.cpp +150 -149
  77. warp/native/warp.cu +323 -192
  78. warp/native/warp.h +227 -226
  79. warp/optim/linear.py +736 -271
  80. warp/render/imgui_manager.py +289 -0
  81. warp/render/render_opengl.py +85 -6
  82. warp/sim/graph_coloring.py +2 -2
  83. warp/sparse.py +558 -175
  84. warp/tests/aux_test_module_aot.py +7 -0
  85. warp/tests/cuda/test_async.py +3 -3
  86. warp/tests/cuda/test_conditional_captures.py +101 -0
  87. warp/tests/geometry/test_marching_cubes.py +233 -12
  88. warp/tests/sim/test_coloring.py +6 -6
  89. warp/tests/test_array.py +56 -5
  90. warp/tests/test_codegen.py +3 -2
  91. warp/tests/test_context.py +8 -15
  92. warp/tests/test_enum.py +136 -0
  93. warp/tests/test_examples.py +2 -2
  94. warp/tests/test_fem.py +45 -2
  95. warp/tests/test_fixedarray.py +229 -0
  96. warp/tests/test_func.py +18 -15
  97. warp/tests/test_future_annotations.py +7 -5
  98. warp/tests/test_linear_solvers.py +30 -0
  99. warp/tests/test_map.py +1 -1
  100. warp/tests/test_mat.py +1518 -378
  101. warp/tests/test_mat_assign_copy.py +178 -0
  102. warp/tests/test_mat_constructors.py +574 -0
  103. warp/tests/test_module_aot.py +287 -0
  104. warp/tests/test_print.py +69 -0
  105. warp/tests/test_quat.py +140 -34
  106. warp/tests/test_quat_assign_copy.py +145 -0
  107. warp/tests/test_reload.py +2 -1
  108. warp/tests/test_sparse.py +71 -0
  109. warp/tests/test_spatial.py +140 -34
  110. warp/tests/test_spatial_assign_copy.py +160 -0
  111. warp/tests/test_struct.py +43 -3
  112. warp/tests/test_types.py +0 -20
  113. warp/tests/test_vec.py +179 -34
  114. warp/tests/test_vec_assign_copy.py +143 -0
  115. warp/tests/tile/test_tile.py +184 -18
  116. warp/tests/tile/test_tile_cholesky.py +605 -0
  117. warp/tests/tile/test_tile_load.py +169 -0
  118. warp/tests/tile/test_tile_mathdx.py +2 -558
  119. warp/tests/tile/test_tile_matmul.py +1 -1
  120. warp/tests/tile/test_tile_mlp.py +1 -1
  121. warp/tests/tile/test_tile_shared_memory.py +5 -5
  122. warp/tests/unittest_suites.py +6 -0
  123. warp/tests/walkthrough_debug.py +1 -1
  124. warp/thirdparty/unittest_parallel.py +108 -9
  125. warp/types.py +554 -264
  126. warp/utils.py +68 -86
  127. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  128. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/RECORD +131 -121
  129. warp/native/marching.cpp +0 -19
  130. warp/native/marching.cu +0 -514
  131. warp/native/marching.h +0 -19
  132. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  133. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
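
The warp/context.py changes below center on two things: a blanket `wp_` prefix on the native entry points (e.g. `runtime.core.cuda_get_kernel` → `runtime.core.wp_cuda_get_kernel`, `builtin_` → `wp_builtin_` mangling) and a new ahead-of-time compilation path (`Module.compile()`, a `strip_hash` module option, and the `get_module_identifier()` / `get_compile_output_name()` / `get_meta_name()` helpers, exercised by the new warp/tests/test_module_aot.py). The following is only a minimal sketch of how those new hooks fit together, based on the signatures visible in the diff; the `scale` kernel, the `./aot_cache` directory, and the `cuda:0` device choice are illustrative assumptions, not part of the package.

    import warp as wp

    wp.init()

    @wp.kernel
    def scale(x: wp.array(dtype=float), s: float):
        # hypothetical kernel so the module has something to compile
        i = wp.tid()
        x[i] = x[i] * s

    # New 1.9.0 module option (default False): drop the content-hash suffix from
    # mangled kernel names and cache artifacts so output filenames are predictable.
    wp.set_module_options({"strip_hash": True})

    module = wp.get_module(__name__)
    device = wp.get_device("cuda:0")

    # Generate code and compile the module binary without loading it into the runtime.
    module.compile(device=device, output_dir="./aot_cache")

    # Later (or in another process), load the precompiled binary and metadata directly.
    binary_name = module.get_compile_output_name(device)
    module.load(
        device,
        binary_path=f"./aot_cache/{binary_name}",
        output_arch=module.get_compile_arch(device),
        meta_path=f"./aot_cache/{module.get_meta_name()}",
    )

With `strip_hash` enabled, `get_module_identifier()` returns `wp_<module>` rather than `wp_<module>_<hash>`, which is what makes the precompiled filenames stable across runs and processes.
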
warp/context.py CHANGED
@@ -26,13 +26,28 @@ import json
26
26
  import operator
27
27
  import os
28
28
  import platform
29
+ import shutil
29
30
  import sys
30
31
  import types
31
32
  import typing
32
33
  import weakref
33
34
  from copy import copy as shallowcopy
34
35
  from pathlib import Path
35
- from typing import Any, Callable, Dict, List, Literal, Mapping, Sequence, Tuple, TypeVar, Union, get_args, get_origin
36
+ from typing import (
37
+ Any,
38
+ Callable,
39
+ Dict,
40
+ Iterable,
41
+ List,
42
+ Literal,
43
+ Mapping,
44
+ Sequence,
45
+ Tuple,
46
+ TypeVar,
47
+ Union,
48
+ get_args,
49
+ get_origin,
50
+ )
36
51
 
37
52
  import numpy as np
38
53
 
@@ -327,39 +342,25 @@ class Function:
327
342
  warp.codegen.apply_defaults(bound_args, self.defaults)
328
343
 
329
344
  arguments = tuple(bound_args.arguments.values())
330
-
331
- # Store the last runtime error we encountered from a function execution
332
- last_execution_error = None
345
+ arg_types = tuple(warp.codegen.get_arg_type(x) for x in arguments)
333
346
 
334
347
  # try and find a matching overload
335
348
  for overload in self.user_overloads.values():
336
349
  if len(overload.input_types) != len(arguments):
337
350
  continue
351
+
352
+ if not warp.codegen.func_match_args(overload, arg_types, {}):
353
+ continue
354
+
338
355
  template_types = list(overload.input_types.values())
339
356
  arg_names = list(overload.input_types.keys())
340
- try:
341
- # attempt to unify argument types with function template types
342
- warp.types.infer_argument_types(arguments, template_types, arg_names)
343
- return overload.func(*arguments)
344
- except Exception as e:
345
- # The function was callable but threw an error during its execution.
346
- # This might be the intended overload, but it failed, or it might be the wrong overload.
347
- # We save this specific error and continue, just in case another overload later in the
348
- # list is a better match and doesn't fail.
349
- last_execution_error = e
350
- continue
351
357
 
352
- if last_execution_error:
353
- # Raise a new, more contextual RuntimeError, but link it to the
354
- # original error that was caught. This preserves the original
355
- # traceback and error type for easier debugging.
356
- raise RuntimeError(
357
- f"Error calling function '{self.key}'. No version succeeded. "
358
- f"See above for the error from the last version that was tried."
359
- ) from last_execution_error
360
- else:
361
- # We got here without ever calling an overload.func
362
- raise RuntimeError(f"Error calling function '{self.key}', no overload found for arguments {args}")
358
+ # attempt to unify argument types with function template types
359
+ warp.types.infer_argument_types(arguments, template_types, arg_names)
360
+ return overload.func(*arguments)
361
+
362
+ # We got here without ever calling an overload.func
363
+ raise RuntimeError(f"Error calling function '{self.key}', no overload found for arguments {args}")
363
364
 
364
365
  # user-defined function with no overloads
365
366
  if self.func is None:
@@ -385,7 +386,7 @@ class Function:
385
386
  def mangle(self) -> str:
386
387
  """Build a mangled name for the C-exported function, e.g.: `builtin_normalize_vec3()`."""
387
388
 
388
- name = "builtin_" + self.key
389
+ name = "wp_builtin_" + self.key
389
390
 
390
391
  # Runtime arguments that are to be passed to the function, not its template signature.
391
392
  if self.export_func is not None:
@@ -475,6 +476,25 @@ class Function:
475
476
  # failed to find overload
476
477
  return None
477
478
 
479
+ def build(self, builder: ModuleBuilder | None):
480
+ self.adj.build(builder)
481
+
482
+ # complete the function return type after we have analyzed it (inferred from return statement in ast)
483
+ if not self.value_func:
484
+
485
+ def wrap(adj):
486
+ def value_type(arg_types, arg_values):
487
+ if adj.return_var is None or len(adj.return_var) == 0:
488
+ return None
489
+ if len(adj.return_var) == 1:
490
+ return adj.return_var[0].type
491
+ else:
492
+ return [v.type for v in adj.return_var]
493
+
494
+ return value_type
495
+
496
+ self.value_func = wrap(self.adj)
497
+
478
498
  def __repr__(self):
479
499
  inputs_str = ", ".join([f"{k}: {warp.types.type_repr(v)}" for k, v in self.input_types.items()])
480
500
  return f"<Function {self.key}({inputs_str})>"
@@ -807,14 +827,17 @@ class Kernel:
807
827
  sig = warp.types.get_signature(arg_types, func_name=self.key)
808
828
  return self.overloads.get(sig)
809
829
 
810
- def get_mangled_name(self):
811
- if self.hash is None:
812
- raise RuntimeError(f"Missing hash for kernel {self.key} in module {self.module.name}")
830
+ def get_mangled_name(self) -> str:
831
+ if self.module.options["strip_hash"]:
832
+ return self.key
833
+ else:
834
+ if self.hash is None:
835
+ raise RuntimeError(f"Missing hash for kernel {self.key} in module {self.module.name}")
813
836
 
814
- # TODO: allow customizing the number of hash characters used
815
- hash_suffix = self.hash.hex()[:8]
837
+ # TODO: allow customizing the number of hash characters used
838
+ hash_suffix = self.hash.hex()[:8]
816
839
 
817
- return f"{self.key}_{hash_suffix}"
840
+ return f"{self.key}_{hash_suffix}"
818
841
 
819
842
  def __call__(self, *args, **kwargs):
820
843
  # we implement this function only to ensure Kernel is a callable object
@@ -1597,6 +1620,9 @@ class ModuleHasher:
1597
1620
  # line directives, e.g. for Nsight Compute
1598
1621
  ch.update(bytes(ctypes.c_int(warp.config.line_directives)))
1599
1622
 
1623
+ # whether to use `assign_copy` instead of `assign_inplace`
1624
+ ch.update(bytes(ctypes.c_int(warp.config.enable_vector_component_overwrites)))
1625
+
1600
1626
  # build config
1601
1627
  ch.update(bytes(warp.config.mode, "utf-8"))
1602
1628
 
@@ -1784,6 +1810,9 @@ class ModuleBuilder:
1784
1810
  self.structs[struct] = None
1785
1811
 
1786
1812
  def build_kernel(self, kernel):
1813
+ if kernel.options.get("enable_backward", True):
1814
+ kernel.adj.used_by_backward_kernel = True
1815
+
1787
1816
  kernel.adj.build(self)
1788
1817
 
1789
1818
  if kernel.adj.return_var is not None:
@@ -1794,23 +1823,7 @@ class ModuleBuilder:
1794
1823
  if func in self.functions:
1795
1824
  return
1796
1825
  else:
1797
- func.adj.build(self)
1798
-
1799
- # complete the function return type after we have analyzed it (inferred from return statement in ast)
1800
- if not func.value_func:
1801
-
1802
- def wrap(adj):
1803
- def value_type(arg_types, arg_values):
1804
- if adj.return_var is None or len(adj.return_var) == 0:
1805
- return None
1806
- if len(adj.return_var) == 1:
1807
- return adj.return_var[0].type
1808
- else:
1809
- return [v.type for v in adj.return_var]
1810
-
1811
- return value_type
1812
-
1813
- func.value_func = wrap(func.adj)
1826
+ func.build(self)
1814
1827
 
1815
1828
  # use dict to preserve import order
1816
1829
  self.functions[func] = None
@@ -1830,10 +1843,11 @@ class ModuleBuilder:
1830
1843
  source = ""
1831
1844
 
1832
1845
  # code-gen LTO forward declarations
1833
- source += 'extern "C" {\n'
1834
- for fwd in self.ltoirs_decl.values():
1835
- source += fwd + "\n"
1836
- source += "}\n"
1846
+ if len(self.ltoirs_decl) > 0:
1847
+ source += 'extern "C" {\n'
1848
+ for fwd in self.ltoirs_decl.values():
1849
+ source += fwd + "\n"
1850
+ source += "}\n"
1837
1851
 
1838
1852
  # code-gen structs
1839
1853
  visited_structs = set()
@@ -1898,9 +1912,9 @@ class ModuleExec:
1898
1912
  if self.device.is_cuda:
1899
1913
  # use CUDA context guard to avoid side effects during garbage collection
1900
1914
  with self.device.context_guard:
1901
- runtime.core.cuda_unload_module(self.device.context, self.handle)
1915
+ runtime.core.wp_cuda_unload_module(self.device.context, self.handle)
1902
1916
  else:
1903
- runtime.llvm.unload_obj(self.handle.encode("utf-8"))
1917
+ runtime.llvm.wp_unload_obj(self.handle.encode("utf-8"))
1904
1918
 
1905
1919
  # lookup and cache kernel entry points
1906
1920
  def get_kernel_hooks(self, kernel) -> KernelHooks:
@@ -1918,13 +1932,13 @@ class ModuleExec:
1918
1932
 
1919
1933
  if self.device.is_cuda:
1920
1934
  forward_name = name + "_cuda_kernel_forward"
1921
- forward_kernel = runtime.core.cuda_get_kernel(
1935
+ forward_kernel = runtime.core.wp_cuda_get_kernel(
1922
1936
  self.device.context, self.handle, forward_name.encode("utf-8")
1923
1937
  )
1924
1938
 
1925
1939
  if options["enable_backward"]:
1926
1940
  backward_name = name + "_cuda_kernel_backward"
1927
- backward_kernel = runtime.core.cuda_get_kernel(
1941
+ backward_kernel = runtime.core.wp_cuda_get_kernel(
1928
1942
  self.device.context, self.handle, backward_name.encode("utf-8")
1929
1943
  )
1930
1944
  else:
@@ -1935,14 +1949,14 @@ class ModuleExec:
1935
1949
  backward_smem_bytes = self.meta[backward_name + "_smem_bytes"] if options["enable_backward"] else 0
1936
1950
 
1937
1951
  # configure kernels maximum shared memory size
1938
- max_smem_bytes = runtime.core.cuda_get_max_shared_memory(self.device.context)
1952
+ max_smem_bytes = runtime.core.wp_cuda_get_max_shared_memory(self.device.context)
1939
1953
 
1940
- if not runtime.core.cuda_configure_kernel_shared_memory(forward_kernel, forward_smem_bytes):
1954
+ if not runtime.core.wp_cuda_configure_kernel_shared_memory(forward_kernel, forward_smem_bytes):
1941
1955
  print(
1942
1956
  f"Warning: Failed to configure kernel dynamic shared memory for this device, tried to configure {forward_name} kernel for {forward_smem_bytes} bytes, but maximum available is {max_smem_bytes}"
1943
1957
  )
1944
1958
 
1945
- if options["enable_backward"] and not runtime.core.cuda_configure_kernel_shared_memory(
1959
+ if options["enable_backward"] and not runtime.core.wp_cuda_configure_kernel_shared_memory(
1946
1960
  backward_kernel, backward_smem_bytes
1947
1961
  ):
1948
1962
  print(
@@ -1954,12 +1968,13 @@ class ModuleExec:
1954
1968
  else:
1955
1969
  func = ctypes.CFUNCTYPE(None)
1956
1970
  forward = (
1957
- func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8"))) or None
1971
+ func(runtime.llvm.wp_lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8")))
1972
+ or None
1958
1973
  )
1959
1974
 
1960
1975
  if options["enable_backward"]:
1961
1976
  backward = (
1962
- func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8")))
1977
+ func(runtime.llvm.wp_lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8")))
1963
1978
  or None
1964
1979
  )
1965
1980
  else:
@@ -1971,6 +1986,25 @@ class ModuleExec:
1971
1986
  return hooks
1972
1987
 
1973
1988
 
1989
+ def _check_and_raise_long_path_error(e: FileNotFoundError):
1990
+ """Check if the error is due to a Windows long path and provide work-around instructions if it is.
1991
+
1992
+ ``FileNotFoundError.filename`` may legitimately be ``None`` when the originating
1993
+ API does not supply a path. Guard against that to avoid masking the original
1994
+ error with a ``TypeError``.
1995
+ """
1996
+ filename = getattr(e, "filename", None)
1997
+
1998
+ # Fast-exit when this is clearly not a legacy-path limitation:
1999
+ if filename is None or len(filename) < 260 or os.name != "nt" or filename.startswith("\\\\?\\"):
2000
+ raise e
2001
+
2002
+ raise RuntimeError(
2003
+ f"File path '{e.filename}' exceeds 259 characters, long-path support is required for this operation. "
2004
+ "See https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation for more information."
2005
+ ) from e
2006
+
2007
+
1974
2008
  # -----------------------------------------------------
1975
2009
  # stores all functions and kernels for a Python module
1976
2010
  # creates a hash of the function to use for checking
@@ -2024,6 +2058,7 @@ class Module:
2024
2058
  "mode": None,
2025
2059
  "block_dim": 256,
2026
2060
  "compile_time_trace": warp.config.compile_time_trace,
2061
+ "strip_hash": False,
2027
2062
  }
2028
2063
 
2029
2064
  # Module dependencies are determined by scanning each function
@@ -2170,20 +2205,23 @@ class Module:
2170
2205
  if isinstance(arg.type, warp.codegen.Struct) and arg.type.module is not None:
2171
2206
  add_ref(arg.type.module)
2172
2207
 
2173
- def hash_module(self):
2208
+ def hash_module(self) -> bytes:
2209
+ """Get the hash of the module for the current block_dim.
2210
+
2211
+ This function always creates a new `ModuleHasher` instance and computes the hash.
2212
+ """
2174
2213
  # compute latest hash
2175
2214
  block_dim = self.options["block_dim"]
2176
2215
  self.hashers[block_dim] = ModuleHasher(self)
2177
2216
  return self.hashers[block_dim].get_module_hash()
2178
2217
 
2179
- def load(self, device, block_dim=None) -> ModuleExec | None:
2180
- device = runtime.get_device(device)
2181
-
2182
- # update module options if launching with a new block dim
2183
- if block_dim is not None:
2184
- self.options["block_dim"] = block_dim
2218
+ def get_module_hash(self, block_dim: int | None = None) -> bytes:
2219
+ """Get the hash of the module for the current block_dim.
2185
2220
 
2186
- active_block_dim = self.options["block_dim"]
2221
+ If a hash has not been computed for the current block_dim, it will be computed and cached.
2222
+ """
2223
+ if block_dim is None:
2224
+ block_dim = self.options["block_dim"]
2187
2225
 
2188
2226
  if self.has_unresolved_static_expressions:
2189
2227
  # The module hash currently does not account for unresolved static expressions
@@ -2200,210 +2238,386 @@ class Module:
2200
2238
  self.has_unresolved_static_expressions = False
2201
2239
 
2202
2240
  # compute the hash if needed
2203
- if active_block_dim not in self.hashers:
2204
- self.hashers[active_block_dim] = ModuleHasher(self)
2241
+ if block_dim not in self.hashers:
2242
+ self.hashers[block_dim] = ModuleHasher(self)
2205
2243
 
2206
- # check if executable module is already loaded and not stale
2207
- exec = self.execs.get((device.context, active_block_dim))
2208
- if exec is not None:
2209
- if exec.module_hash == self.hashers[active_block_dim].get_module_hash():
2210
- return exec
2244
+ return self.hashers[block_dim].get_module_hash()
2211
2245
 
2212
- # quietly avoid repeated build attempts to reduce error spew
2213
- if device.context in self.failed_builds:
2246
+ def _use_ptx(self, device) -> bool:
2247
+ # determine whether to use PTX or CUBIN
2248
+ if device.is_cubin_supported:
2249
+ # get user preference specified either per module or globally
2250
+ preferred_cuda_output = self.options.get("cuda_output") or warp.config.cuda_output
2251
+ if preferred_cuda_output is not None:
2252
+ use_ptx = preferred_cuda_output == "ptx"
2253
+ else:
2254
+ # determine automatically: older drivers may not be able to handle PTX generated using newer
2255
+ # CUDA Toolkits, in which case we fall back on generating CUBIN modules
2256
+ use_ptx = runtime.driver_version >= runtime.toolkit_version
2257
+ else:
2258
+ # CUBIN not an option, must use PTX (e.g. CUDA Toolkit too old)
2259
+ use_ptx = True
2260
+
2261
+ return use_ptx
2262
+
2263
+ def get_module_identifier(self) -> str:
2264
+ """Get an abbreviated module name to use for directories and files in the cache.
2265
+
2266
+ Depending on the setting of the ``"strip_hash"`` option for this module,
2267
+ the module identifier might include a content-dependent hash as a suffix.
2268
+ """
2269
+ if self.options["strip_hash"]:
2270
+ module_name_short = f"wp_{self.name}"
2271
+ else:
2272
+ module_hash = self.get_module_hash()
2273
+ module_name_short = f"wp_{self.name}_{module_hash.hex()[:7]}"
2274
+
2275
+ return module_name_short
2276
+
2277
+ def get_compile_arch(self, device: Device | None = None) -> int | None:
2278
+ if device is None:
2279
+ device = runtime.get_device()
2280
+
2281
+ if device.is_cpu:
2214
2282
  return None
2215
2283
 
2216
- module_name = "wp_" + self.name
2217
- module_hash = self.hashers[active_block_dim].get_module_hash()
2284
+ if self._use_ptx(device):
2285
+ # use the default PTX arch if the device supports it
2286
+ if warp.config.ptx_target_arch is not None:
2287
+ output_arch = min(device.arch, warp.config.ptx_target_arch)
2288
+ else:
2289
+ output_arch = min(device.arch, runtime.default_ptx_arch)
2290
+ else:
2291
+ output_arch = device.arch
2218
2292
 
2219
- # use a unique module path using the module short hash
2220
- module_name_short = f"{module_name}_{module_hash.hex()[:7]}"
2221
- module_dir = os.path.join(warp.config.kernel_cache_dir, module_name_short)
2293
+ return output_arch
2222
2294
 
2223
- with warp.ScopedTimer(
2224
- f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'", active=not warp.config.quiet
2225
- ) as module_load_timer:
2226
- # -----------------------------------------------------------
2227
- # determine output paths
2228
- if device.is_cpu:
2229
- output_name = f"{module_name_short}.o"
2230
- output_arch = None
2295
+ def get_compile_output_name(
2296
+ self, device: Device | None, output_arch: int | None = None, use_ptx: bool | None = None
2297
+ ) -> str:
2298
+ """Get the filename to use for the compiled module binary.
2231
2299
 
2232
- elif device.is_cuda:
2233
- # determine whether to use PTX or CUBIN
2234
- if device.is_cubin_supported:
2235
- # get user preference specified either per module or globally
2236
- preferred_cuda_output = self.options.get("cuda_output") or warp.config.cuda_output
2237
- if preferred_cuda_output is not None:
2238
- use_ptx = preferred_cuda_output == "ptx"
2239
- else:
2240
- # determine automatically: older drivers may not be able to handle PTX generated using newer
2241
- # CUDA Toolkits, in which case we fall back on generating CUBIN modules
2242
- use_ptx = runtime.driver_version >= runtime.toolkit_version
2243
- else:
2244
- # CUBIN not an option, must use PTX (e.g. CUDA Toolkit too old)
2245
- use_ptx = True
2300
+ This is only the filename, e.g. ``wp___main___0340cd1.sm86.ptx``.
2301
+ It should be used to form a path.
2302
+ """
2303
+ module_name_short = self.get_module_identifier()
2246
2304
 
2247
- if use_ptx:
2248
- # use the default PTX arch if the device supports it
2249
- if warp.config.ptx_target_arch is not None:
2250
- output_arch = min(device.arch, warp.config.ptx_target_arch)
2251
- else:
2252
- output_arch = min(device.arch, runtime.default_ptx_arch)
2253
- output_name = f"{module_name_short}.sm{output_arch}.ptx"
2254
- else:
2255
- output_arch = device.arch
2256
- output_name = f"{module_name_short}.sm{output_arch}.cubin"
2305
+ if device and device.is_cpu:
2306
+ return f"{module_name_short}.o"
2307
+
2308
+ # For CUDA compilation, we must have an architecture.
2309
+ final_arch = output_arch
2310
+ if final_arch is None:
2311
+ if device:
2312
+ # Infer the architecture from the device
2313
+ final_arch = self.get_compile_arch(device)
2314
+ else:
2315
+ raise ValueError(
2316
+ "Either 'device' or 'output_arch' must be provided to determine compilation architecture"
2317
+ )
2318
+
2319
+ # Determine if we should compile to PTX or CUBIN
2320
+ if use_ptx is None:
2321
+ if device:
2322
+ use_ptx = self._use_ptx(device)
2323
+ else:
2324
+ init()
2325
+ use_ptx = final_arch not in runtime.nvrtc_supported_archs
2326
+
2327
+ if use_ptx:
2328
+ output_name = f"{module_name_short}.sm{final_arch}.ptx"
2329
+ else:
2330
+ output_name = f"{module_name_short}.sm{final_arch}.cubin"
2331
+
2332
+ return output_name
2333
+
2334
+ def get_meta_name(self) -> str:
2335
+ """Get the filename to use for the module metadata file.
2336
+
2337
+ This is only the filename. It should be used to form a path.
2338
+ """
2339
+ return f"{self.get_module_identifier()}.meta"
2340
+
2341
+ def compile(
2342
+ self,
2343
+ device: Device | None = None,
2344
+ output_dir: str | os.PathLike | None = None,
2345
+ output_name: str | None = None,
2346
+ output_arch: int | None = None,
2347
+ use_ptx: bool | None = None,
2348
+ ) -> None:
2349
+ """Compile this module for a specific device.
2350
+
2351
+ Note that this function only generates and compiles code. The resulting
2352
+ binary is not loaded into the runtime.
2353
+
2354
+ Args:
2355
+ device: The device to compile the module for.
2356
+ output_dir: The directory to write the compiled module to.
2357
+ output_name: The name of the compiled module binary file.
2358
+ output_arch: The architecture to compile the module for.
2359
+ """
2360
+ if output_arch is None:
2361
+ output_arch = self.get_compile_arch(device) # Will remain at None if device is CPU
2362
+
2363
+ if output_name is None:
2364
+ output_name = self.get_compile_output_name(device, output_arch, use_ptx)
2365
+
2366
+ builder_options = {
2367
+ **self.options,
2368
+ # Some of the tile codegen, such as cuFFTDx and cuBLASDx, requires knowledge of the target arch
2369
+ "output_arch": output_arch,
2370
+ }
2371
+ builder = ModuleBuilder(
2372
+ self,
2373
+ builder_options,
2374
+ hasher=self.hashers.get(self.options["block_dim"], None),
2375
+ )
2376
+
2377
+ # create a temporary (process unique) dir for build outputs before moving to the binary dir
2378
+ module_name_short = self.get_module_identifier()
2379
+
2380
+ if output_dir is None:
2381
+ output_dir = os.path.join(warp.config.kernel_cache_dir, f"{module_name_short}")
2382
+ else:
2383
+ output_dir = os.fspath(output_dir)
2384
+
2385
+ meta_path = os.path.join(output_dir, self.get_meta_name())
2386
+
2387
+ build_dir = os.path.normpath(output_dir) + f"_p{os.getpid()}"
2388
+
2389
+ # dir may exist from previous attempts / runs / archs
2390
+ Path(build_dir).mkdir(parents=True, exist_ok=True)
2391
+
2392
+ mode = self.options["mode"] if self.options["mode"] is not None else warp.config.mode
2393
+
2394
+ # build CPU
2395
+ if output_arch is None:
2396
+ # build
2397
+ try:
2398
+ source_code_path = os.path.join(build_dir, f"{module_name_short}.cpp")
2399
+
2400
+ # write cpp sources
2401
+ cpp_source = builder.codegen("cpu")
2402
+
2403
+ with open(source_code_path, "w") as cpp_file:
2404
+ cpp_file.write(cpp_source)
2405
+
2406
+ output_path = os.path.join(build_dir, output_name)
2407
+
2408
+ # build object code
2409
+ with warp.ScopedTimer("Compile x86", active=warp.config.verbose):
2410
+ warp.build.build_cpu(
2411
+ output_path,
2412
+ source_code_path,
2413
+ mode=mode,
2414
+ fast_math=self.options["fast_math"],
2415
+ verify_fp=warp.config.verify_fp,
2416
+ fuse_fp=self.options["fuse_fp"],
2417
+ )
2418
+
2419
+ except Exception as e:
2420
+ if isinstance(e, FileNotFoundError):
2421
+ _check_and_raise_long_path_error(e)
2422
+
2423
+ self.failed_builds.add(None)
2424
+
2425
+ raise (e)
2426
+
2427
+ else:
2428
+ # build
2429
+ try:
2430
+ source_code_path = os.path.join(build_dir, f"{module_name_short}.cu")
2431
+
2432
+ # write cuda sources
2433
+ cu_source = builder.codegen("cuda")
2434
+
2435
+ with open(source_code_path, "w") as cu_file:
2436
+ cu_file.write(cu_source)
2437
+
2438
+ output_path = os.path.join(build_dir, output_name)
2439
+
2440
+ # generate PTX or CUBIN
2441
+ with warp.ScopedTimer(
2442
+ f"Compile CUDA (arch={builder_options['output_arch']}, mode={mode}, block_dim={self.options['block_dim']})",
2443
+ active=warp.config.verbose,
2444
+ ):
2445
+ warp.build.build_cuda(
2446
+ source_code_path,
2447
+ builder_options["output_arch"],
2448
+ output_path,
2449
+ config=mode,
2450
+ verify_fp=warp.config.verify_fp,
2451
+ fast_math=self.options["fast_math"],
2452
+ fuse_fp=self.options["fuse_fp"],
2453
+ lineinfo=self.options["lineinfo"],
2454
+ compile_time_trace=self.options["compile_time_trace"],
2455
+ ltoirs=builder.ltoirs.values(),
2456
+ fatbins=builder.fatbins.values(),
2457
+ )
2458
+
2459
+ except Exception as e:
2460
+ if isinstance(e, FileNotFoundError):
2461
+ _check_and_raise_long_path_error(e)
2462
+
2463
+ if device:
2464
+ self.failed_builds.add(device.context)
2465
+
2466
+ raise (e)
2467
+
2468
+ # ------------------------------------------------------------
2469
+ # build meta data
2470
+
2471
+ meta = builder.build_meta()
2472
+ output_meta_path = os.path.join(build_dir, self.get_meta_name())
2473
+
2474
+ with open(output_meta_path, "w") as meta_file:
2475
+ json.dump(meta, meta_file)
2476
+
2477
+ # -----------------------------------------------------------
2478
+ # update cache
2257
2479
 
2480
+ # try to move process outputs to cache
2481
+ warp.build.safe_rename(build_dir, output_dir)
2482
+
2483
+ if os.path.exists(output_dir):
2258
2484
  # final object binary path
2259
- binary_path = os.path.join(module_dir, output_name)
2485
+ binary_path = os.path.join(output_dir, output_name)
2260
2486
 
2261
- # -----------------------------------------------------------
2262
- # check cache and build if necessary
2487
+ if not os.path.exists(binary_path) or self.options["strip_hash"]:
2488
+ # copy our output file to the destination module
2489
+ # this is necessary in case different processes
2490
+ # have different GPU architectures / devices
2491
+ try:
2492
+ os.rename(output_path, binary_path)
2493
+ except (OSError, FileExistsError):
2494
+ # another process likely updated the module dir first
2495
+ pass
2263
2496
 
2264
- build_dir = None
2497
+ if not os.path.exists(meta_path) or self.options["strip_hash"]:
2498
+ # copy our output file to the destination module
2499
+ # this is necessary in case different processes
2500
+ # have different GPU architectures / devices
2501
+ try:
2502
+ os.rename(output_meta_path, meta_path)
2503
+ except (OSError, FileExistsError):
2504
+ # another process likely updated the module dir first
2505
+ pass
2265
2506
 
2266
- # we always want to build if binary doesn't exist yet
2267
- # and we want to rebuild if we are not caching kernels or if we are tracking array access
2268
- if (
2269
- not os.path.exists(binary_path)
2270
- or not warp.config.cache_kernels
2271
- or warp.config.verify_autograd_array_access
2272
- ):
2273
- builder_options = {
2274
- **self.options,
2275
- # Some of the tile codegen, such as cuFFTDx and cuBLASDx, requires knowledge of the target arch
2276
- "output_arch": output_arch,
2277
- }
2278
- builder = ModuleBuilder(self, builder_options, hasher=self.hashers[active_block_dim])
2279
-
2280
- # create a temporary (process unique) dir for build outputs before moving to the binary dir
2281
- build_dir = os.path.join(
2282
- warp.config.kernel_cache_dir, f"{module_name}_{module_hash.hex()[:7]}_p{os.getpid()}"
2283
- )
2507
+ try:
2508
+ final_source_path = os.path.join(output_dir, os.path.basename(source_code_path))
2509
+ if not os.path.exists(final_source_path) or self.options["strip_hash"]:
2510
+ os.rename(source_code_path, final_source_path)
2511
+ except (OSError, FileExistsError):
2512
+ # another process likely updated the module dir first
2513
+ pass
2514
+ except Exception as e:
2515
+ # We don't need source_code_path to be copied successfully to proceed, so warn and keep running
2516
+ warp.utils.warn(f"Exception when renaming {source_code_path}: {e}")
2284
2517
 
2285
- # dir may exist from previous attempts / runs / archs
2286
- Path(build_dir).mkdir(parents=True, exist_ok=True)
2518
+ # clean up build_dir used for this process regardless
2519
+ shutil.rmtree(build_dir, ignore_errors=True)
2287
2520
 
2288
- module_load_timer.extra_msg = " (compiled)" # For wp.ScopedTimer informational purposes
2521
+ def load(
2522
+ self,
2523
+ device,
2524
+ block_dim: int | None = None,
2525
+ binary_path: os.PathLike | None = None,
2526
+ output_arch: int | None = None,
2527
+ meta_path: os.PathLike | None = None,
2528
+ ) -> ModuleExec | None:
2529
+ device = runtime.get_device(device)
2289
2530
 
2290
- mode = self.options["mode"] if self.options["mode"] is not None else warp.config.mode
2531
+ # update module options if launching with a new block dim
2532
+ if block_dim is not None:
2533
+ self.options["block_dim"] = block_dim
2291
2534
 
2292
- # build CPU
2293
- if device.is_cpu:
2294
- # build
2295
- try:
2296
- source_code_path = os.path.join(build_dir, f"{module_name_short}.cpp")
2535
+ active_block_dim = self.options["block_dim"]
2297
2536
 
2298
- # write cpp sources
2299
- cpp_source = builder.codegen("cpu")
2537
+ # check if executable module is already loaded and not stale
2538
+ exec = self.execs.get((device.context, active_block_dim))
2539
+ if exec is not None:
2540
+ if self.options["strip_hash"] or (exec.module_hash == self.get_module_hash(active_block_dim)):
2541
+ return exec
2300
2542
 
2301
- with open(source_code_path, "w") as cpp_file:
2302
- cpp_file.write(cpp_source)
2543
+ # quietly avoid repeated build attempts to reduce error spew
2544
+ if device.context in self.failed_builds:
2545
+ return None
2303
2546
 
2304
- output_path = os.path.join(build_dir, output_name)
2547
+ module_hash = self.get_module_hash(active_block_dim)
2305
2548
 
2306
- # build object code
2307
- with warp.ScopedTimer("Compile x86", active=warp.config.verbose):
2308
- warp.build.build_cpu(
2309
- output_path,
2310
- source_code_path,
2311
- mode=mode,
2312
- fast_math=self.options["fast_math"],
2313
- verify_fp=warp.config.verify_fp,
2314
- fuse_fp=self.options["fuse_fp"],
2315
- )
2549
+ # use a unique module path using the module short hash
2550
+ module_name_short = self.get_module_identifier()
2316
2551
 
2317
- except Exception as e:
2318
- self.failed_builds.add(None)
2319
- module_load_timer.extra_msg = " (error)"
2320
- raise (e)
2552
+ module_load_timer_name = (
2553
+ f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'"
2554
+ if self.options["strip_hash"] is False
2555
+ else f"Module {self.name} load on device '{device}'"
2556
+ )
2321
2557
 
2322
- elif device.is_cuda:
2323
- # build
2324
- try:
2325
- source_code_path = os.path.join(build_dir, f"{module_name_short}.cu")
2326
-
2327
- # write cuda sources
2328
- cu_source = builder.codegen("cuda")
2329
-
2330
- with open(source_code_path, "w") as cu_file:
2331
- cu_file.write(cu_source)
2332
-
2333
- output_path = os.path.join(build_dir, output_name)
2334
-
2335
- # generate PTX or CUBIN
2336
- with warp.ScopedTimer("Compile CUDA", active=warp.config.verbose):
2337
- warp.build.build_cuda(
2338
- source_code_path,
2339
- output_arch,
2340
- output_path,
2341
- config=mode,
2342
- verify_fp=warp.config.verify_fp,
2343
- fast_math=self.options["fast_math"],
2344
- fuse_fp=self.options["fuse_fp"],
2345
- lineinfo=self.options["lineinfo"],
2346
- compile_time_trace=self.options["compile_time_trace"],
2347
- ltoirs=builder.ltoirs.values(),
2348
- fatbins=builder.fatbins.values(),
2349
- )
2558
+ if warp.config.verbose:
2559
+ module_load_timer_name += f" (block_dim={active_block_dim})"
2350
2560
 
2351
- except Exception as e:
2352
- self.failed_builds.add(device.context)
2353
- module_load_timer.extra_msg = " (error)"
2354
- raise (e)
2561
+ with warp.ScopedTimer(module_load_timer_name, active=not warp.config.quiet) as module_load_timer:
2562
+ # -----------------------------------------------------------
2563
+ # Determine binary path and build if necessary
2355
2564
 
2356
- # ------------------------------------------------------------
2357
- # build meta data
2565
+ if binary_path:
2566
+ # We will never re-codegen or re-compile in this situation
2567
+ # The expected files must already exist
2358
2568
 
2359
- meta = builder.build_meta()
2360
- meta_path = os.path.join(build_dir, f"{module_name_short}.meta")
2569
+ if device.is_cuda and output_arch is None:
2570
+ raise ValueError("'output_arch' must be provided if a 'binary_path' is provided")
2361
2571
 
2362
- with open(meta_path, "w") as meta_file:
2363
- json.dump(meta, meta_file)
2572
+ if meta_path is None:
2573
+ raise ValueError("'meta_path' must be provided if a 'binary_path' is provided")
2364
2574
 
2365
- # -----------------------------------------------------------
2366
- # update cache
2575
+ if not os.path.exists(binary_path):
2576
+ module_load_timer.extra_msg = " (error)"
2577
+ raise FileNotFoundError(f"Binary file {binary_path} does not exist")
2578
+ else:
2579
+ module_load_timer.extra_msg = " (cached)"
2580
+ else:
2581
+ # we will build if binary doesn't exist yet
2582
+ # we will rebuild if we are not caching kernels or if we are tracking array access
2367
2583
 
2368
- # try to move process outputs to cache
2369
- warp.build.safe_rename(build_dir, module_dir)
2584
+ output_name = self.get_compile_output_name(device)
2585
+ output_arch = self.get_compile_arch(device)
2370
2586
 
2371
- if os.path.exists(module_dir):
2372
- if not os.path.exists(binary_path):
2373
- # copy our output file to the destination module
2374
- # this is necessary in case different processes
2375
- # have different GPU architectures / devices
2376
- try:
2377
- os.rename(output_path, binary_path)
2378
- except (OSError, FileExistsError):
2379
- # another process likely updated the module dir first
2380
- pass
2587
+ module_dir = os.path.join(warp.config.kernel_cache_dir, module_name_short)
2588
+ meta_path = os.path.join(module_dir, self.get_meta_name())
2589
+ # final object binary path
2590
+ binary_path = os.path.join(module_dir, output_name)
2381
2591
 
2592
+ if (
2593
+ not os.path.exists(binary_path)
2594
+ or not warp.config.cache_kernels
2595
+ or warp.config.verify_autograd_array_access
2596
+ ):
2382
2597
  try:
2383
- final_source_path = os.path.join(module_dir, os.path.basename(source_code_path))
2384
- if not os.path.exists(final_source_path):
2385
- os.rename(source_code_path, final_source_path)
2386
- except (OSError, FileExistsError):
2387
- # another process likely updated the module dir first
2388
- pass
2598
+ self.compile(device, module_dir, output_name, output_arch)
2389
2599
  except Exception as e:
2390
- # We don't need source_code_path to be copied successfully to proceed, so warn and keep running
2391
- warp.utils.warn(f"Exception when renaming {source_code_path}: {e}")
2392
- else:
2393
- module_load_timer.extra_msg = " (cached)" # For wp.ScopedTimer informational purposes
2600
+ module_load_timer.extra_msg = " (error)"
2601
+ raise (e)
2602
+
2603
+ module_load_timer.extra_msg = " (compiled)"
2604
+ else:
2605
+ module_load_timer.extra_msg = " (cached)"
2394
2606
 
2395
2607
  # -----------------------------------------------------------
2396
2608
  # Load CPU or CUDA binary
2397
2609
 
2398
- meta_path = os.path.join(module_dir, f"{module_name_short}.meta")
2399
- with open(meta_path) as meta_file:
2400
- meta = json.load(meta_file)
2610
+ if os.path.exists(meta_path):
2611
+ with open(meta_path) as meta_file:
2612
+ meta = json.load(meta_file)
2613
+ else:
2614
+ raise FileNotFoundError(f"Module metadata file {meta_path} was not found in the cache")
2401
2615
 
2402
2616
  if device.is_cpu:
2403
2617
  # LLVM modules are identified using strings, so we need to ensure uniqueness
2404
- module_handle = f"{module_name}_{self.cpu_exec_id}"
2618
+ module_handle = f"wp_{self.name}_{self.cpu_exec_id}"
2405
2619
  self.cpu_exec_id += 1
2406
- runtime.llvm.load_obj(binary_path.encode("utf-8"), module_handle.encode("utf-8"))
2620
+ runtime.llvm.wp_load_obj(binary_path.encode("utf-8"), module_handle.encode("utf-8"))
2407
2621
  module_exec = ModuleExec(module_handle, module_hash, device, meta)
2408
2622
  self.execs[(None, active_block_dim)] = module_exec
2409
2623
 
@@ -2416,12 +2630,6 @@ class Module:
2416
2630
  module_load_timer.extra_msg = " (error)"
2417
2631
  raise Exception(f"Failed to load CUDA module '{self.name}'")
2418
2632
 
2419
- if build_dir:
2420
- import shutil
2421
-
2422
- # clean up build_dir used for this process regardless
2423
- shutil.rmtree(build_dir, ignore_errors=True)
2424
-
2425
2633
  return module_exec
2426
2634
 
2427
2635
  def unload(self):
@@ -2457,13 +2665,13 @@ class CpuDefaultAllocator:
2457
2665
  self.deleter = lambda ptr, size: self.free(ptr, size)
2458
2666
 
2459
2667
  def alloc(self, size_in_bytes):
2460
- ptr = runtime.core.alloc_host(size_in_bytes)
2668
+ ptr = runtime.core.wp_alloc_host(size_in_bytes)
2461
2669
  if not ptr:
2462
2670
  raise RuntimeError(f"Failed to allocate {size_in_bytes} bytes on device 'cpu'")
2463
2671
  return ptr
2464
2672
 
2465
2673
  def free(self, ptr, size_in_bytes):
2466
- runtime.core.free_host(ptr)
2674
+ runtime.core.wp_free_host(ptr)
2467
2675
 
2468
2676
 
2469
2677
  class CpuPinnedAllocator:
@@ -2472,13 +2680,13 @@ class CpuPinnedAllocator:
2472
2680
  self.deleter = lambda ptr, size: self.free(ptr, size)
2473
2681
 
2474
2682
  def alloc(self, size_in_bytes):
2475
- ptr = runtime.core.alloc_pinned(size_in_bytes)
2683
+ ptr = runtime.core.wp_alloc_pinned(size_in_bytes)
2476
2684
  if not ptr:
2477
2685
  raise RuntimeError(f"Failed to allocate {size_in_bytes} bytes on device '{self.device}'")
2478
2686
  return ptr
2479
2687
 
2480
2688
  def free(self, ptr, size_in_bytes):
2481
- runtime.core.free_pinned(ptr)
2689
+ runtime.core.wp_free_pinned(ptr)
2482
2690
 
2483
2691
 
2484
2692
  class CudaDefaultAllocator:
@@ -2488,7 +2696,7 @@ class CudaDefaultAllocator:
2488
2696
  self.deleter = lambda ptr, size: self.free(ptr, size)
2489
2697
 
2490
2698
  def alloc(self, size_in_bytes):
2491
- ptr = runtime.core.alloc_device_default(self.device.context, size_in_bytes)
2699
+ ptr = runtime.core.wp_alloc_device_default(self.device.context, size_in_bytes)
2492
2700
  # If the allocation fails, check if graph capture is active to raise an informative error.
2493
2701
  # We delay the capture check to avoid overhead.
2494
2702
  if not ptr:
@@ -2510,7 +2718,7 @@ class CudaDefaultAllocator:
2510
2718
  return ptr
2511
2719
 
2512
2720
  def free(self, ptr, size_in_bytes):
2513
- runtime.core.free_device_default(self.device.context, ptr)
2721
+ runtime.core.wp_free_device_default(self.device.context, ptr)
2514
2722
 
2515
2723
 
2516
2724
  class CudaMempoolAllocator:
@@ -2521,13 +2729,13 @@ class CudaMempoolAllocator:
2521
2729
  self.deleter = lambda ptr, size: self.free(ptr, size)
2522
2730
 
2523
2731
  def alloc(self, size_in_bytes):
2524
- ptr = runtime.core.alloc_device_async(self.device.context, size_in_bytes)
2732
+ ptr = runtime.core.wp_alloc_device_async(self.device.context, size_in_bytes)
2525
2733
  if not ptr:
2526
2734
  raise RuntimeError(f"Failed to allocate {size_in_bytes} bytes on device '{self.device}'")
2527
2735
  return ptr
2528
2736
 
2529
2737
  def free(self, ptr, size_in_bytes):
2530
- runtime.core.free_device_async(self.device.context, ptr)
2738
+ runtime.core.wp_free_device_async(self.device.context, ptr)
2531
2739
 
2532
2740
 
2533
2741
  class ContextGuard:
@@ -2536,15 +2744,15 @@ class ContextGuard:
2536
2744
 
2537
2745
  def __enter__(self):
2538
2746
  if self.device.is_cuda:
2539
- runtime.core.cuda_context_push_current(self.device.context)
2747
+ runtime.core.wp_cuda_context_push_current(self.device.context)
2540
2748
  elif is_cuda_driver_initialized():
2541
- self.saved_context = runtime.core.cuda_context_get_current()
2749
+ self.saved_context = runtime.core.wp_cuda_context_get_current()
2542
2750
 
2543
2751
  def __exit__(self, exc_type, exc_value, traceback):
2544
2752
  if self.device.is_cuda:
2545
- runtime.core.cuda_context_pop_current()
2753
+ runtime.core.wp_cuda_context_pop_current()
2546
2754
  elif is_cuda_driver_initialized():
2547
- runtime.core.cuda_context_set_current(self.saved_context)
2755
+ runtime.core.wp_cuda_context_set_current(self.saved_context)
2548
2756
 
2549
2757
 
2550
2758
  class Event:
@@ -2607,7 +2815,7 @@ class Event:
2607
2815
  raise ValueError("The combination of 'enable_timing=True' and 'interprocess=True' is not allowed.")
2608
2816
  flags |= Event.Flags.INTERPROCESS
2609
2817
 
2610
- self.cuda_event = runtime.core.cuda_event_create(device.context, flags)
2818
+ self.cuda_event = runtime.core.wp_cuda_event_create(device.context, flags)
2611
2819
  if not self.cuda_event:
2612
2820
  raise RuntimeError(f"Failed to create event on device {device}")
2613
2821
  self.owner = True
@@ -2634,7 +2842,9 @@ class Event:
2634
2842
  # Allocate a buffer for the data (64-element char array)
2635
2843
  ipc_handle_buffer = (ctypes.c_char * 64)()
2636
2844
 
2637
- warp.context.runtime.core.cuda_ipc_get_event_handle(self.device.context, self.cuda_event, ipc_handle_buffer)
2845
+ warp.context.runtime.core.wp_cuda_ipc_get_event_handle(
2846
+ self.device.context, self.cuda_event, ipc_handle_buffer
2847
+ )
2638
2848
 
2639
2849
  if ipc_handle_buffer.raw == bytes(64):
2640
2850
  warp.utils.warn("IPC event handle appears to be invalid. Was interprocess=True used?")
@@ -2651,7 +2861,7 @@ class Event:
2651
2861
  This property may not be accessed during a graph capture on any stream.
2652
2862
  """
2653
2863
 
2654
- result_code = runtime.core.cuda_event_query(self.cuda_event)
2864
+ result_code = runtime.core.wp_cuda_event_query(self.cuda_event)
2655
2865
 
2656
2866
  return result_code == 0
2657
2867
 
@@ -2659,7 +2869,7 @@ class Event:
2659
2869
  if not self.owner:
2660
2870
  return
2661
2871
 
2662
- runtime.core.cuda_event_destroy(self.cuda_event)
2872
+ runtime.core.wp_cuda_event_destroy(self.cuda_event)
2663
2873
 
2664
2874
 
2665
2875
  class Stream:
@@ -2709,12 +2919,12 @@ class Stream:
2709
2919
  # we pass cuda_stream through kwargs because cuda_stream=None is actually a valid value (CUDA default stream)
2710
2920
  if "cuda_stream" in kwargs:
2711
2921
  self.cuda_stream = kwargs["cuda_stream"]
2712
- device.runtime.core.cuda_stream_register(device.context, self.cuda_stream)
2922
+ device.runtime.core.wp_cuda_stream_register(device.context, self.cuda_stream)
2713
2923
  else:
2714
2924
  if not isinstance(priority, int):
2715
2925
  raise TypeError("Stream priority must be an integer.")
2716
2926
  clamped_priority = max(-1, min(priority, 0)) # Only support two priority levels
2717
- self.cuda_stream = device.runtime.core.cuda_stream_create(device.context, clamped_priority)
2927
+ self.cuda_stream = device.runtime.core.wp_cuda_stream_create(device.context, clamped_priority)
2718
2928
 
2719
2929
  if not self.cuda_stream:
2720
2930
  raise RuntimeError(f"Failed to create stream on device {device}")
@@ -2725,9 +2935,9 @@ class Stream:
2725
2935
  return
2726
2936
 
2727
2937
  if self.owner:
2728
- runtime.core.cuda_stream_destroy(self.device.context, self.cuda_stream)
2938
+ runtime.core.wp_cuda_stream_destroy(self.device.context, self.cuda_stream)
2729
2939
  else:
2730
- runtime.core.cuda_stream_unregister(self.device.context, self.cuda_stream)
2940
+ runtime.core.wp_cuda_stream_unregister(self.device.context, self.cuda_stream)
2731
2941
 
2732
2942
  @property
2733
2943
  def cached_event(self) -> Event:
@@ -2753,7 +2963,7 @@ class Stream:
2753
2963
  f"Event from device {event.device} cannot be recorded on stream from device {self.device}"
2754
2964
  )
2755
2965
 
2756
- runtime.core.cuda_event_record(event.cuda_event, self.cuda_stream, event.enable_timing)
2966
+ runtime.core.wp_cuda_event_record(event.cuda_event, self.cuda_stream, event.enable_timing)
2757
2967
 
2758
2968
  return event
2759
2969
 
@@ -2762,7 +2972,7 @@ class Stream:
2762
2972
 
2763
2973
  This function does not block the host thread.
2764
2974
  """
2765
- runtime.core.cuda_stream_wait_event(self.cuda_stream, event.cuda_event)
2975
+ runtime.core.wp_cuda_stream_wait_event(self.cuda_stream, event.cuda_event)
2766
2976
 
2767
2977
  def wait_stream(self, other_stream: Stream, event: Event | None = None):
2768
2978
  """Records an event on `other_stream` and makes this stream wait on it.
@@ -2785,7 +2995,7 @@ class Stream:
2785
2995
  if event is None:
2786
2996
  event = other_stream.cached_event
2787
2997
 
2788
- runtime.core.cuda_stream_wait_stream(self.cuda_stream, other_stream.cuda_stream, event.cuda_event)
2998
+ runtime.core.wp_cuda_stream_wait_stream(self.cuda_stream, other_stream.cuda_stream, event.cuda_event)
2789
2999
 
2790
3000
  @property
2791
3001
  def is_complete(self) -> bool:
@@ -2794,19 +3004,19 @@ class Stream:
2794
3004
  This property may not be accessed during a graph capture on any stream.
2795
3005
  """
2796
3006
 
2797
- result_code = runtime.core.cuda_stream_query(self.cuda_stream)
3007
+ result_code = runtime.core.wp_cuda_stream_query(self.cuda_stream)
2798
3008
 
2799
3009
  return result_code == 0
2800
3010
 
2801
3011
  @property
2802
3012
  def is_capturing(self) -> bool:
2803
3013
  """A boolean indicating whether a graph capture is currently ongoing on this stream."""
2804
- return bool(runtime.core.cuda_stream_is_capturing(self.cuda_stream))
3014
+ return bool(runtime.core.wp_cuda_stream_is_capturing(self.cuda_stream))
2805
3015
 
2806
3016
  @property
2807
3017
  def priority(self) -> int:
2808
3018
  """An integer representing the priority of the stream."""
2809
- return runtime.core.cuda_stream_get_priority(self.cuda_stream)
3019
+ return runtime.core.wp_cuda_stream_get_priority(self.cuda_stream)
2810
3020
 
2811
3021
 
2812
3022
  class Device:
@@ -2875,22 +3085,22 @@ class Device:
2875
3085
  self.pci_bus_id = None
2876
3086
 
2877
3087
  # TODO: add more device-specific dispatch functions
2878
- self.memset = runtime.core.memset_host
2879
- self.memtile = runtime.core.memtile_host
3088
+ self.memset = runtime.core.wp_memset_host
3089
+ self.memtile = runtime.core.wp_memtile_host
2880
3090
 
2881
3091
  self.default_allocator = CpuDefaultAllocator(self)
2882
3092
  self.pinned_allocator = CpuPinnedAllocator(self)
2883
3093
 
2884
- elif ordinal >= 0 and ordinal < runtime.core.cuda_device_get_count():
3094
+ elif ordinal >= 0 and ordinal < runtime.core.wp_cuda_device_get_count():
2885
3095
  # CUDA device
2886
- self.name = runtime.core.cuda_device_get_name(ordinal).decode()
2887
- self.arch = runtime.core.cuda_device_get_arch(ordinal)
2888
- self.sm_count = runtime.core.cuda_device_get_sm_count(ordinal)
2889
- self.is_uva = runtime.core.cuda_device_is_uva(ordinal) > 0
2890
- self.is_mempool_supported = runtime.core.cuda_device_is_mempool_supported(ordinal) > 0
3096
+ self.name = runtime.core.wp_cuda_device_get_name(ordinal).decode()
3097
+ self.arch = runtime.core.wp_cuda_device_get_arch(ordinal)
3098
+ self.sm_count = runtime.core.wp_cuda_device_get_sm_count(ordinal)
3099
+ self.is_uva = runtime.core.wp_cuda_device_is_uva(ordinal) > 0
3100
+ self.is_mempool_supported = runtime.core.wp_cuda_device_is_mempool_supported(ordinal) > 0
2891
3101
  if platform.system() == "Linux":
2892
3102
  # Use None when IPC support cannot be determined
2893
- ipc_support_api_query = runtime.core.cuda_device_is_ipc_supported(ordinal)
3103
+ ipc_support_api_query = runtime.core.wp_cuda_device_is_ipc_supported(ordinal)
2894
3104
  self.is_ipc_supported = bool(ipc_support_api_query) if ipc_support_api_query >= 0 else None
2895
3105
  else:
2896
3106
  self.is_ipc_supported = False
@@ -2902,13 +3112,13 @@ class Device:
2902
3112
  self.is_mempool_enabled = False
2903
3113
 
2904
3114
  uuid_buffer = (ctypes.c_char * 16)()
2905
- runtime.core.cuda_device_get_uuid(ordinal, uuid_buffer)
3115
+ runtime.core.wp_cuda_device_get_uuid(ordinal, uuid_buffer)
2906
3116
  uuid_byte_str = bytes(uuid_buffer).hex()
2907
3117
  self.uuid = f"GPU-{uuid_byte_str[0:8]}-{uuid_byte_str[8:12]}-{uuid_byte_str[12:16]}-{uuid_byte_str[16:20]}-{uuid_byte_str[20:]}"
2908
3118
 
2909
- pci_domain_id = runtime.core.cuda_device_get_pci_domain_id(ordinal)
2910
- pci_bus_id = runtime.core.cuda_device_get_pci_bus_id(ordinal)
2911
- pci_device_id = runtime.core.cuda_device_get_pci_device_id(ordinal)
3119
+ pci_domain_id = runtime.core.wp_cuda_device_get_pci_domain_id(ordinal)
3120
+ pci_bus_id = runtime.core.wp_cuda_device_get_pci_bus_id(ordinal)
3121
+ pci_device_id = runtime.core.wp_cuda_device_get_pci_device_id(ordinal)
2912
3122
  # This is (mis)named to correspond to the naming of cudaDeviceGetPCIBusId
2913
3123
  self.pci_bus_id = f"{pci_domain_id:08X}:{pci_bus_id:02X}:{pci_device_id:02X}"
2914
3124
 
@@ -2932,8 +3142,8 @@ class Device:
2932
3142
  self._init_streams()
2933
3143
 
2934
3144
  # TODO: add more device-specific dispatch functions
2935
- self.memset = lambda ptr, value, size: runtime.core.memset_device(self.context, ptr, value, size)
2936
- self.memtile = lambda ptr, src, srcsize, reps: runtime.core.memtile_device(
3145
+ self.memset = lambda ptr, value, size: runtime.core.wp_memset_device(self.context, ptr, value, size)
3146
+ self.memtile = lambda ptr, src, srcsize, reps: runtime.core.wp_memtile_device(
2937
3147
  self.context, ptr, src, srcsize, reps
2938
3148
  )
2939
3149
 
@@ -2992,15 +3202,15 @@ class Device:
2992
3202
  return self._context
2993
3203
  elif self.is_primary:
2994
3204
  # acquire primary context on demand
2995
- prev_context = runtime.core.cuda_context_get_current()
2996
- self._context = self.runtime.core.cuda_device_get_primary_context(self.ordinal)
3205
+ prev_context = runtime.core.wp_cuda_context_get_current()
3206
+ self._context = self.runtime.core.wp_cuda_device_get_primary_context(self.ordinal)
2997
3207
  if self._context is None:
2998
- runtime.core.cuda_context_set_current(prev_context)
3208
+ runtime.core.wp_cuda_context_set_current(prev_context)
2999
3209
  raise RuntimeError(f"Failed to acquire primary context for device {self}")
3000
3210
  self.runtime.context_map[self._context] = self
3001
3211
  # initialize streams
3002
3212
  self._init_streams()
3003
- runtime.core.cuda_context_set_current(prev_context)
3213
+ runtime.core.wp_cuda_context_set_current(prev_context)
3004
3214
  return self._context
3005
3215
 
3006
3216
  @property
@@ -3044,7 +3254,7 @@ class Device:
3044
3254
  if stream.device != self:
3045
3255
  raise RuntimeError(f"Stream from device {stream.device} cannot be used on device {self}")
3046
3256
 
3047
- self.runtime.core.cuda_context_set_stream(self.context, stream.cuda_stream, int(sync))
3257
+ self.runtime.core.wp_cuda_context_set_stream(self.context, stream.cuda_stream, int(sync))
3048
3258
  self._stream = stream
3049
3259
  else:
3050
3260
  raise RuntimeError(f"Device {self} is not a CUDA device")
@@ -3062,7 +3272,7 @@ class Device:
3062
3272
  """
3063
3273
  if self.is_cuda:
3064
3274
  total_mem = ctypes.c_size_t()
3065
- self.runtime.core.cuda_device_get_memory_info(self.ordinal, None, ctypes.byref(total_mem))
3275
+ self.runtime.core.wp_cuda_device_get_memory_info(self.ordinal, None, ctypes.byref(total_mem))
3066
3276
  return total_mem.value
3067
3277
  else:
3068
3278
  # TODO: cpu
@@ -3076,7 +3286,7 @@ class Device:
  """
  if self.is_cuda:
  free_mem = ctypes.c_size_t()
- self.runtime.core.cuda_device_get_memory_info(self.ordinal, ctypes.byref(free_mem), None)
+ self.runtime.core.wp_cuda_device_get_memory_info(self.ordinal, ctypes.byref(free_mem), None)
  return free_mem.value
  else:
  # TODO: cpu
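Both memory queries use the ctypes out-parameter idiom: allocate a c_size_t, pass its address, and read .value afterwards. A self-contained sketch of that idiom with a pure-Python stand-in for the native call:

    import ctypes

    def fake_get_memory_info(free_out, total_out):
        # Stand-in for wp_cuda_device_get_memory_info: writes through the pointers it receives.
        if free_out:
            free_out.contents.value = 6 * 1024**3
        if total_out:
            total_out.contents.value = 8 * 1024**3

    free_mem, total_mem = ctypes.c_size_t(), ctypes.c_size_t()
    fake_get_memory_info(ctypes.pointer(free_mem), ctypes.pointer(total_mem))
    print(free_mem.value, total_mem.value)  # values written by the callee are visible here

When calling a real foreign function, ctypes.byref(free_mem) is the cheaper way to pass the same address, which is what the code in the diff uses.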
@@ -3103,7 +3313,7 @@ class Device:
 
  def make_current(self):
  if self.context is not None:
- self.runtime.core.cuda_context_set_current(self.context)
+ self.runtime.core.wp_cuda_context_set_current(self.context)
 
  def can_access(self, other):
  # TODO: this function should be redesigned in terms of (device, resource).
@@ -3129,11 +3339,7 @@ class Graph:
  self.capture_id = capture_id
  self.module_execs: set[ModuleExec] = set()
  self.graph_exec: ctypes.c_void_p | None = None
-
  self.graph: ctypes.c_void_p | None = None
- self.has_conditional = (
- False # Track if there are conditional nodes in the graph since they are not allowed in child graphs
- )
 
  def __del__(self):
  if not hasattr(self, "graph") or not hasattr(self, "device") or not self.graph:
@@ -3141,9 +3347,9 @@ class Graph:
 
  # use CUDA context guard to avoid side effects during garbage collection
  with self.device.context_guard:
- runtime.core.cuda_graph_destroy(self.device.context, self.graph)
+ runtime.core.wp_cuda_graph_destroy(self.device.context, self.graph)
  if hasattr(self, "graph_exec") and self.graph_exec is not None:
- runtime.core.cuda_graph_exec_destroy(self.device.context, self.graph_exec)
+ runtime.core.wp_cuda_graph_exec_destroy(self.device.context, self.graph_exec)
 
  # retain executable CUDA modules used by this graph, which prevents them from being unloaded
  def retain_module_exec(self, module_exec: ModuleExec):
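The __del__ above is written defensively: it bails out if the object was only partially constructed (the hasattr checks) and releases native handles inside a context guard. A minimal sketch of the same finalizer pattern with a stand-in release function:

    def release_native_graph(handle):
        # stand-in for the wp_cuda_graph_destroy call
        print(f"released graph handle {handle}")

    class GraphLike:
        def __init__(self, handle):
            self.graph = handle

        def __del__(self):
            # A failed __init__ or interpreter shutdown can leave attributes missing,
            # so never assume they exist before releasing native resources.
            if not hasattr(self, "graph") or not self.graph:
                return
            release_native_graph(self.graph)

    GraphLike(42)  # finalizer runs when the temporary is collected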
@@ -3155,6 +3361,14 @@ class Runtime:
  if sys.version_info < (3, 9):
  warp.utils.warn(f"Python 3.9 or newer is recommended for running Warp, detected {sys.version_info}")
 
+ if platform.system() == "Darwin" and platform.machine() == "x86_64":
+ warp.utils.warn(
+ "Support for Warp on Intel-based macOS is deprecated and will be removed in the near future. "
+ "Apple Silicon-based Macs will continue to be supported.",
+ DeprecationWarning,
+ stacklevel=3,
+ )
+
  bin_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "bin")
 
  if os.name == "nt":
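The new Intel-macOS check warns through Warp's warp.utils.warn wrapper. The same gating expressed with only the standard library, as a runnable sketch:

    import platform
    import warnings

    def warn_if_intel_mac():
        if platform.system() == "Darwin" and platform.machine() == "x86_64":
            warnings.warn(
                "Intel-based macOS support is deprecated.",
                DeprecationWarning,
                stacklevel=2,
            )

    warn_if_intel_mac()  # silent everywhere except x86_64 macOS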
@@ -3177,7 +3391,7 @@ class Runtime:
  if os.path.exists(llvm_lib):
  self.llvm = self.load_dll(llvm_lib)
  # setup c-types for warp-clang.dll
- self.llvm.lookup.restype = ctypes.c_uint64
+ self.llvm.wp_lookup.restype = ctypes.c_uint64
  else:
  self.llvm = None
 
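The clang helper library is loaded with ctypes and its exported symbols are given explicit return types; without a restype, ctypes assumes a C int and would truncate the 64-bit value returned by wp_lookup. The same setup against CPython's own C API, used here only as a stand-in library so the snippet runs anywhere:

    import ctypes

    libpython = ctypes.pythonapi                         # analogous to self.load_dll(llvm_lib)
    libpython.Py_GetVersion.restype = ctypes.c_char_p    # declare the return type explicitly
    print(libpython.Py_GetVersion().decode())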
@@ -3186,83 +3400,83 @@ class Runtime:
3186
3400
 
3187
3401
  # setup c-types for warp.dll
3188
3402
  try:
3189
- self.core.get_error_string.argtypes = []
3190
- self.core.get_error_string.restype = ctypes.c_char_p
3191
- self.core.set_error_output_enabled.argtypes = [ctypes.c_int]
3192
- self.core.set_error_output_enabled.restype = None
3193
- self.core.is_error_output_enabled.argtypes = []
3194
- self.core.is_error_output_enabled.restype = ctypes.c_int
3195
-
3196
- self.core.alloc_host.argtypes = [ctypes.c_size_t]
3197
- self.core.alloc_host.restype = ctypes.c_void_p
3198
- self.core.alloc_pinned.argtypes = [ctypes.c_size_t]
3199
- self.core.alloc_pinned.restype = ctypes.c_void_p
3200
- self.core.alloc_device.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
3201
- self.core.alloc_device.restype = ctypes.c_void_p
3202
- self.core.alloc_device_default.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
3203
- self.core.alloc_device_default.restype = ctypes.c_void_p
3204
- self.core.alloc_device_async.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
3205
- self.core.alloc_device_async.restype = ctypes.c_void_p
3206
-
3207
- self.core.float_to_half_bits.argtypes = [ctypes.c_float]
3208
- self.core.float_to_half_bits.restype = ctypes.c_uint16
3209
- self.core.half_bits_to_float.argtypes = [ctypes.c_uint16]
3210
- self.core.half_bits_to_float.restype = ctypes.c_float
3211
-
3212
- self.core.free_host.argtypes = [ctypes.c_void_p]
3213
- self.core.free_host.restype = None
3214
- self.core.free_pinned.argtypes = [ctypes.c_void_p]
3215
- self.core.free_pinned.restype = None
3216
- self.core.free_device.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3217
- self.core.free_device.restype = None
3218
- self.core.free_device_default.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3219
- self.core.free_device_default.restype = None
3220
- self.core.free_device_async.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3221
- self.core.free_device_async.restype = None
3222
-
3223
- self.core.memset_host.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
3224
- self.core.memset_host.restype = None
3225
- self.core.memset_device.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
3226
- self.core.memset_device.restype = None
3227
-
3228
- self.core.memtile_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t]
3229
- self.core.memtile_host.restype = None
3230
- self.core.memtile_device.argtypes = [
3403
+ self.core.wp_get_error_string.argtypes = []
3404
+ self.core.wp_get_error_string.restype = ctypes.c_char_p
3405
+ self.core.wp_set_error_output_enabled.argtypes = [ctypes.c_int]
3406
+ self.core.wp_set_error_output_enabled.restype = None
3407
+ self.core.wp_is_error_output_enabled.argtypes = []
3408
+ self.core.wp_is_error_output_enabled.restype = ctypes.c_int
3409
+
3410
+ self.core.wp_alloc_host.argtypes = [ctypes.c_size_t]
3411
+ self.core.wp_alloc_host.restype = ctypes.c_void_p
3412
+ self.core.wp_alloc_pinned.argtypes = [ctypes.c_size_t]
3413
+ self.core.wp_alloc_pinned.restype = ctypes.c_void_p
3414
+ self.core.wp_alloc_device.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
3415
+ self.core.wp_alloc_device.restype = ctypes.c_void_p
3416
+ self.core.wp_alloc_device_default.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
3417
+ self.core.wp_alloc_device_default.restype = ctypes.c_void_p
3418
+ self.core.wp_alloc_device_async.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
3419
+ self.core.wp_alloc_device_async.restype = ctypes.c_void_p
3420
+
3421
+ self.core.wp_float_to_half_bits.argtypes = [ctypes.c_float]
3422
+ self.core.wp_float_to_half_bits.restype = ctypes.c_uint16
3423
+ self.core.wp_half_bits_to_float.argtypes = [ctypes.c_uint16]
3424
+ self.core.wp_half_bits_to_float.restype = ctypes.c_float
3425
+
3426
+ self.core.wp_free_host.argtypes = [ctypes.c_void_p]
3427
+ self.core.wp_free_host.restype = None
3428
+ self.core.wp_free_pinned.argtypes = [ctypes.c_void_p]
3429
+ self.core.wp_free_pinned.restype = None
3430
+ self.core.wp_free_device.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3431
+ self.core.wp_free_device.restype = None
3432
+ self.core.wp_free_device_default.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3433
+ self.core.wp_free_device_default.restype = None
3434
+ self.core.wp_free_device_async.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3435
+ self.core.wp_free_device_async.restype = None
3436
+
3437
+ self.core.wp_memset_host.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
3438
+ self.core.wp_memset_host.restype = None
3439
+ self.core.wp_memset_device.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
3440
+ self.core.wp_memset_device.restype = None
3441
+
3442
+ self.core.wp_memtile_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t]
3443
+ self.core.wp_memtile_host.restype = None
3444
+ self.core.wp_memtile_device.argtypes = [
3231
3445
  ctypes.c_void_p,
3232
3446
  ctypes.c_void_p,
3233
3447
  ctypes.c_void_p,
3234
3448
  ctypes.c_size_t,
3235
3449
  ctypes.c_size_t,
3236
3450
  ]
3237
- self.core.memtile_device.restype = None
3451
+ self.core.wp_memtile_device.restype = None
3238
3452
 
3239
- self.core.memcpy_h2h.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
3240
- self.core.memcpy_h2h.restype = ctypes.c_bool
3241
- self.core.memcpy_h2d.argtypes = [
3453
+ self.core.wp_memcpy_h2h.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
3454
+ self.core.wp_memcpy_h2h.restype = ctypes.c_bool
3455
+ self.core.wp_memcpy_h2d.argtypes = [
3242
3456
  ctypes.c_void_p,
3243
3457
  ctypes.c_void_p,
3244
3458
  ctypes.c_void_p,
3245
3459
  ctypes.c_size_t,
3246
3460
  ctypes.c_void_p,
3247
3461
  ]
3248
- self.core.memcpy_h2d.restype = ctypes.c_bool
3249
- self.core.memcpy_d2h.argtypes = [
3462
+ self.core.wp_memcpy_h2d.restype = ctypes.c_bool
3463
+ self.core.wp_memcpy_d2h.argtypes = [
3250
3464
  ctypes.c_void_p,
3251
3465
  ctypes.c_void_p,
3252
3466
  ctypes.c_void_p,
3253
3467
  ctypes.c_size_t,
3254
3468
  ctypes.c_void_p,
3255
3469
  ]
3256
- self.core.memcpy_d2h.restype = ctypes.c_bool
3257
- self.core.memcpy_d2d.argtypes = [
3470
+ self.core.wp_memcpy_d2h.restype = ctypes.c_bool
3471
+ self.core.wp_memcpy_d2d.argtypes = [
3258
3472
  ctypes.c_void_p,
3259
3473
  ctypes.c_void_p,
3260
3474
  ctypes.c_void_p,
3261
3475
  ctypes.c_size_t,
3262
3476
  ctypes.c_void_p,
3263
3477
  ]
3264
- self.core.memcpy_d2d.restype = ctypes.c_bool
3265
- self.core.memcpy_p2p.argtypes = [
3478
+ self.core.wp_memcpy_d2d.restype = ctypes.c_bool
3479
+ self.core.wp_memcpy_p2p.argtypes = [
3266
3480
  ctypes.c_void_p,
3267
3481
  ctypes.c_void_p,
3268
3482
  ctypes.c_void_p,
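Everything in the block above is prototype declaration: argtypes tells ctypes how to convert each Python argument, restype how to interpret the return value, and leaving restype at its default c_int is what truncates returned pointers on 64-bit platforms. The same declaration pattern against CPython's allocator functions, used as runnable stand-ins for the wp_alloc_host / wp_free_host / wp_memset_host bindings:

    import ctypes

    api = ctypes.pythonapi
    api.PyMem_Malloc.argtypes = [ctypes.c_size_t]
    api.PyMem_Malloc.restype = ctypes.c_void_p   # full-width pointer, not a truncated int
    api.PyMem_Free.argtypes = [ctypes.c_void_p]
    api.PyMem_Free.restype = None

    ptr = api.PyMem_Malloc(256)
    ctypes.memset(ptr, 0, 256)                   # same role as the memset_host binding
    api.PyMem_Free(ptr)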
@@ -3270,17 +3484,17 @@ class Runtime:
3270
3484
  ctypes.c_size_t,
3271
3485
  ctypes.c_void_p,
3272
3486
  ]
3273
- self.core.memcpy_p2p.restype = ctypes.c_bool
3487
+ self.core.wp_memcpy_p2p.restype = ctypes.c_bool
3274
3488
 
3275
- self.core.array_copy_host.argtypes = [
3489
+ self.core.wp_array_copy_host.argtypes = [
3276
3490
  ctypes.c_void_p,
3277
3491
  ctypes.c_void_p,
3278
3492
  ctypes.c_int,
3279
3493
  ctypes.c_int,
3280
3494
  ctypes.c_int,
3281
3495
  ]
3282
- self.core.array_copy_host.restype = ctypes.c_bool
3283
- self.core.array_copy_device.argtypes = [
3496
+ self.core.wp_array_copy_host.restype = ctypes.c_bool
3497
+ self.core.wp_array_copy_device.argtypes = [
3284
3498
  ctypes.c_void_p,
3285
3499
  ctypes.c_void_p,
3286
3500
  ctypes.c_void_p,
@@ -3288,41 +3502,41 @@ class Runtime:
3288
3502
  ctypes.c_int,
3289
3503
  ctypes.c_int,
3290
3504
  ]
3291
- self.core.array_copy_device.restype = ctypes.c_bool
3505
+ self.core.wp_array_copy_device.restype = ctypes.c_bool
3292
3506
 
3293
- self.core.array_fill_host.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int]
3294
- self.core.array_fill_host.restype = None
3295
- self.core.array_fill_device.argtypes = [
3507
+ self.core.wp_array_fill_host.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int]
3508
+ self.core.wp_array_fill_host.restype = None
3509
+ self.core.wp_array_fill_device.argtypes = [
3296
3510
  ctypes.c_void_p,
3297
3511
  ctypes.c_void_p,
3298
3512
  ctypes.c_int,
3299
3513
  ctypes.c_void_p,
3300
3514
  ctypes.c_int,
3301
3515
  ]
3302
- self.core.array_fill_device.restype = None
3516
+ self.core.wp_array_fill_device.restype = None
3303
3517
 
3304
- self.core.array_sum_double_host.argtypes = [
3518
+ self.core.wp_array_sum_double_host.argtypes = [
3305
3519
  ctypes.c_uint64,
3306
3520
  ctypes.c_uint64,
3307
3521
  ctypes.c_int,
3308
3522
  ctypes.c_int,
3309
3523
  ctypes.c_int,
3310
3524
  ]
3311
- self.core.array_sum_float_host.argtypes = [
3525
+ self.core.wp_array_sum_float_host.argtypes = [
3312
3526
  ctypes.c_uint64,
3313
3527
  ctypes.c_uint64,
3314
3528
  ctypes.c_int,
3315
3529
  ctypes.c_int,
3316
3530
  ctypes.c_int,
3317
3531
  ]
3318
- self.core.array_sum_double_device.argtypes = [
3532
+ self.core.wp_array_sum_double_device.argtypes = [
3319
3533
  ctypes.c_uint64,
3320
3534
  ctypes.c_uint64,
3321
3535
  ctypes.c_int,
3322
3536
  ctypes.c_int,
3323
3537
  ctypes.c_int,
3324
3538
  ]
3325
- self.core.array_sum_float_device.argtypes = [
3539
+ self.core.wp_array_sum_float_device.argtypes = [
3326
3540
  ctypes.c_uint64,
3327
3541
  ctypes.c_uint64,
3328
3542
  ctypes.c_int,
@@ -3330,7 +3544,7 @@ class Runtime:
3330
3544
  ctypes.c_int,
3331
3545
  ]
3332
3546
 
3333
- self.core.array_inner_double_host.argtypes = [
3547
+ self.core.wp_array_inner_double_host.argtypes = [
3334
3548
  ctypes.c_uint64,
3335
3549
  ctypes.c_uint64,
3336
3550
  ctypes.c_uint64,
@@ -3339,7 +3553,7 @@ class Runtime:
3339
3553
  ctypes.c_int,
3340
3554
  ctypes.c_int,
3341
3555
  ]
3342
- self.core.array_inner_float_host.argtypes = [
3556
+ self.core.wp_array_inner_float_host.argtypes = [
3343
3557
  ctypes.c_uint64,
3344
3558
  ctypes.c_uint64,
3345
3559
  ctypes.c_uint64,
@@ -3348,7 +3562,7 @@ class Runtime:
3348
3562
  ctypes.c_int,
3349
3563
  ctypes.c_int,
3350
3564
  ]
3351
- self.core.array_inner_double_device.argtypes = [
3565
+ self.core.wp_array_inner_double_device.argtypes = [
3352
3566
  ctypes.c_uint64,
3353
3567
  ctypes.c_uint64,
3354
3568
  ctypes.c_uint64,
@@ -3357,7 +3571,7 @@ class Runtime:
3357
3571
  ctypes.c_int,
3358
3572
  ctypes.c_int,
3359
3573
  ]
3360
- self.core.array_inner_float_device.argtypes = [
3574
+ self.core.wp_array_inner_float_device.argtypes = [
3361
3575
  ctypes.c_uint64,
3362
3576
  ctypes.c_uint64,
3363
3577
  ctypes.c_uint64,
@@ -3367,21 +3581,36 @@ class Runtime:
3367
3581
  ctypes.c_int,
3368
3582
  ]
3369
3583
 
3370
- self.core.array_scan_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
3371
- self.core.array_scan_float_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
3372
- self.core.array_scan_int_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
3373
- self.core.array_scan_float_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
3584
+ self.core.wp_array_scan_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
3585
+ self.core.wp_array_scan_float_host.argtypes = [
3586
+ ctypes.c_uint64,
3587
+ ctypes.c_uint64,
3588
+ ctypes.c_int,
3589
+ ctypes.c_bool,
3590
+ ]
3591
+ self.core.wp_array_scan_int_device.argtypes = [
3592
+ ctypes.c_uint64,
3593
+ ctypes.c_uint64,
3594
+ ctypes.c_int,
3595
+ ctypes.c_bool,
3596
+ ]
3597
+ self.core.wp_array_scan_float_device.argtypes = [
3598
+ ctypes.c_uint64,
3599
+ ctypes.c_uint64,
3600
+ ctypes.c_int,
3601
+ ctypes.c_bool,
3602
+ ]
3374
3603
 
3375
- self.core.radix_sort_pairs_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3376
- self.core.radix_sort_pairs_int_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3604
+ self.core.wp_radix_sort_pairs_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3605
+ self.core.wp_radix_sort_pairs_int_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3377
3606
 
3378
- self.core.radix_sort_pairs_float_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3379
- self.core.radix_sort_pairs_float_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3607
+ self.core.wp_radix_sort_pairs_float_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3608
+ self.core.wp_radix_sort_pairs_float_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3380
3609
 
3381
- self.core.radix_sort_pairs_int64_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3382
- self.core.radix_sort_pairs_int64_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3610
+ self.core.wp_radix_sort_pairs_int64_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3611
+ self.core.wp_radix_sort_pairs_int64_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
3383
3612
 
3384
- self.core.segmented_sort_pairs_int_host.argtypes = [
3613
+ self.core.wp_segmented_sort_pairs_int_host.argtypes = [
3385
3614
  ctypes.c_uint64,
3386
3615
  ctypes.c_uint64,
3387
3616
  ctypes.c_int,
@@ -3389,7 +3618,7 @@ class Runtime:
3389
3618
  ctypes.c_uint64,
3390
3619
  ctypes.c_int,
3391
3620
  ]
3392
- self.core.segmented_sort_pairs_int_device.argtypes = [
3621
+ self.core.wp_segmented_sort_pairs_int_device.argtypes = [
3393
3622
  ctypes.c_uint64,
3394
3623
  ctypes.c_uint64,
3395
3624
  ctypes.c_int,
@@ -3398,7 +3627,7 @@ class Runtime:
3398
3627
  ctypes.c_int,
3399
3628
  ]
3400
3629
 
3401
- self.core.segmented_sort_pairs_float_host.argtypes = [
3630
+ self.core.wp_segmented_sort_pairs_float_host.argtypes = [
3402
3631
  ctypes.c_uint64,
3403
3632
  ctypes.c_uint64,
3404
3633
  ctypes.c_int,
@@ -3406,7 +3635,7 @@ class Runtime:
3406
3635
  ctypes.c_uint64,
3407
3636
  ctypes.c_int,
3408
3637
  ]
3409
- self.core.segmented_sort_pairs_float_device.argtypes = [
3638
+ self.core.wp_segmented_sort_pairs_float_device.argtypes = [
3410
3639
  ctypes.c_uint64,
3411
3640
  ctypes.c_uint64,
3412
3641
  ctypes.c_int,
@@ -3415,14 +3644,14 @@ class Runtime:
3415
3644
  ctypes.c_int,
3416
3645
  ]
3417
3646
 
3418
- self.core.runlength_encode_int_host.argtypes = [
3647
+ self.core.wp_runlength_encode_int_host.argtypes = [
3419
3648
  ctypes.c_uint64,
3420
3649
  ctypes.c_uint64,
3421
3650
  ctypes.c_uint64,
3422
3651
  ctypes.c_uint64,
3423
3652
  ctypes.c_int,
3424
3653
  ]
3425
- self.core.runlength_encode_int_device.argtypes = [
3654
+ self.core.wp_runlength_encode_int_device.argtypes = [
3426
3655
  ctypes.c_uint64,
3427
3656
  ctypes.c_uint64,
3428
3657
  ctypes.c_uint64,
@@ -3430,11 +3659,11 @@ class Runtime:
3430
3659
  ctypes.c_int,
3431
3660
  ]
3432
3661
 
3433
- self.core.bvh_create_host.restype = ctypes.c_uint64
3434
- self.core.bvh_create_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
3662
+ self.core.wp_bvh_create_host.restype = ctypes.c_uint64
3663
+ self.core.wp_bvh_create_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
3435
3664
 
3436
- self.core.bvh_create_device.restype = ctypes.c_uint64
3437
- self.core.bvh_create_device.argtypes = [
3665
+ self.core.wp_bvh_create_device.restype = ctypes.c_uint64
3666
+ self.core.wp_bvh_create_device.argtypes = [
3438
3667
  ctypes.c_void_p,
3439
3668
  ctypes.c_void_p,
3440
3669
  ctypes.c_void_p,
@@ -3442,14 +3671,14 @@ class Runtime:
3442
3671
  ctypes.c_int,
3443
3672
  ]
3444
3673
 
3445
- self.core.bvh_destroy_host.argtypes = [ctypes.c_uint64]
3446
- self.core.bvh_destroy_device.argtypes = [ctypes.c_uint64]
3674
+ self.core.wp_bvh_destroy_host.argtypes = [ctypes.c_uint64]
3675
+ self.core.wp_bvh_destroy_device.argtypes = [ctypes.c_uint64]
3447
3676
 
3448
- self.core.bvh_refit_host.argtypes = [ctypes.c_uint64]
3449
- self.core.bvh_refit_device.argtypes = [ctypes.c_uint64]
3677
+ self.core.wp_bvh_refit_host.argtypes = [ctypes.c_uint64]
3678
+ self.core.wp_bvh_refit_device.argtypes = [ctypes.c_uint64]
3450
3679
 
3451
- self.core.mesh_create_host.restype = ctypes.c_uint64
3452
- self.core.mesh_create_host.argtypes = [
3680
+ self.core.wp_mesh_create_host.restype = ctypes.c_uint64
3681
+ self.core.wp_mesh_create_host.argtypes = [
3453
3682
  warp.types.array_t,
3454
3683
  warp.types.array_t,
3455
3684
  warp.types.array_t,
@@ -3459,8 +3688,8 @@ class Runtime:
3459
3688
  ctypes.c_int,
3460
3689
  ]
3461
3690
 
3462
- self.core.mesh_create_device.restype = ctypes.c_uint64
3463
- self.core.mesh_create_device.argtypes = [
3691
+ self.core.wp_mesh_create_device.restype = ctypes.c_uint64
3692
+ self.core.wp_mesh_create_device.argtypes = [
3464
3693
  ctypes.c_void_p,
3465
3694
  warp.types.array_t,
3466
3695
  warp.types.array_t,
@@ -3471,61 +3700,61 @@ class Runtime:
3471
3700
  ctypes.c_int,
3472
3701
  ]
3473
3702
 
3474
- self.core.mesh_destroy_host.argtypes = [ctypes.c_uint64]
3475
- self.core.mesh_destroy_device.argtypes = [ctypes.c_uint64]
3703
+ self.core.wp_mesh_destroy_host.argtypes = [ctypes.c_uint64]
3704
+ self.core.wp_mesh_destroy_device.argtypes = [ctypes.c_uint64]
3476
3705
 
3477
- self.core.mesh_refit_host.argtypes = [ctypes.c_uint64]
3478
- self.core.mesh_refit_device.argtypes = [ctypes.c_uint64]
3706
+ self.core.wp_mesh_refit_host.argtypes = [ctypes.c_uint64]
3707
+ self.core.wp_mesh_refit_device.argtypes = [ctypes.c_uint64]
3479
3708
 
3480
- self.core.mesh_set_points_host.argtypes = [ctypes.c_uint64, warp.types.array_t]
3481
- self.core.mesh_set_points_device.argtypes = [ctypes.c_uint64, warp.types.array_t]
3709
+ self.core.wp_mesh_set_points_host.argtypes = [ctypes.c_uint64, warp.types.array_t]
3710
+ self.core.wp_mesh_set_points_device.argtypes = [ctypes.c_uint64, warp.types.array_t]
3482
3711
 
3483
- self.core.mesh_set_velocities_host.argtypes = [ctypes.c_uint64, warp.types.array_t]
3484
- self.core.mesh_set_velocities_device.argtypes = [ctypes.c_uint64, warp.types.array_t]
3712
+ self.core.wp_mesh_set_velocities_host.argtypes = [ctypes.c_uint64, warp.types.array_t]
3713
+ self.core.wp_mesh_set_velocities_device.argtypes = [ctypes.c_uint64, warp.types.array_t]
3485
3714
 
3486
- self.core.hash_grid_create_host.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
3487
- self.core.hash_grid_create_host.restype = ctypes.c_uint64
3488
- self.core.hash_grid_destroy_host.argtypes = [ctypes.c_uint64]
3489
- self.core.hash_grid_update_host.argtypes = [ctypes.c_uint64, ctypes.c_float, ctypes.c_void_p]
3490
- self.core.hash_grid_reserve_host.argtypes = [ctypes.c_uint64, ctypes.c_int]
3715
+ self.core.wp_hash_grid_create_host.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
3716
+ self.core.wp_hash_grid_create_host.restype = ctypes.c_uint64
3717
+ self.core.wp_hash_grid_destroy_host.argtypes = [ctypes.c_uint64]
3718
+ self.core.wp_hash_grid_update_host.argtypes = [ctypes.c_uint64, ctypes.c_float, ctypes.c_void_p]
3719
+ self.core.wp_hash_grid_reserve_host.argtypes = [ctypes.c_uint64, ctypes.c_int]
3491
3720
 
3492
- self.core.hash_grid_create_device.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int]
3493
- self.core.hash_grid_create_device.restype = ctypes.c_uint64
3494
- self.core.hash_grid_destroy_device.argtypes = [ctypes.c_uint64]
3495
- self.core.hash_grid_update_device.argtypes = [ctypes.c_uint64, ctypes.c_float, ctypes.c_void_p]
3496
- self.core.hash_grid_reserve_device.argtypes = [ctypes.c_uint64, ctypes.c_int]
3721
+ self.core.wp_hash_grid_create_device.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int]
3722
+ self.core.wp_hash_grid_create_device.restype = ctypes.c_uint64
3723
+ self.core.wp_hash_grid_destroy_device.argtypes = [ctypes.c_uint64]
3724
+ self.core.wp_hash_grid_update_device.argtypes = [ctypes.c_uint64, ctypes.c_float, ctypes.c_void_p]
3725
+ self.core.wp_hash_grid_reserve_device.argtypes = [ctypes.c_uint64, ctypes.c_int]
3497
3726
 
3498
- self.core.volume_create_host.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_bool, ctypes.c_bool]
3499
- self.core.volume_create_host.restype = ctypes.c_uint64
3500
- self.core.volume_get_tiles_host.argtypes = [
3727
+ self.core.wp_volume_create_host.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_bool, ctypes.c_bool]
3728
+ self.core.wp_volume_create_host.restype = ctypes.c_uint64
3729
+ self.core.wp_volume_get_tiles_host.argtypes = [
3501
3730
  ctypes.c_uint64,
3502
3731
  ctypes.c_void_p,
3503
3732
  ]
3504
- self.core.volume_get_voxels_host.argtypes = [
3733
+ self.core.wp_volume_get_voxels_host.argtypes = [
3505
3734
  ctypes.c_uint64,
3506
3735
  ctypes.c_void_p,
3507
3736
  ]
3508
- self.core.volume_destroy_host.argtypes = [ctypes.c_uint64]
3737
+ self.core.wp_volume_destroy_host.argtypes = [ctypes.c_uint64]
3509
3738
 
3510
- self.core.volume_create_device.argtypes = [
3739
+ self.core.wp_volume_create_device.argtypes = [
3511
3740
  ctypes.c_void_p,
3512
3741
  ctypes.c_void_p,
3513
3742
  ctypes.c_uint64,
3514
3743
  ctypes.c_bool,
3515
3744
  ctypes.c_bool,
3516
3745
  ]
3517
- self.core.volume_create_device.restype = ctypes.c_uint64
3518
- self.core.volume_get_tiles_device.argtypes = [
3746
+ self.core.wp_volume_create_device.restype = ctypes.c_uint64
3747
+ self.core.wp_volume_get_tiles_device.argtypes = [
3519
3748
  ctypes.c_uint64,
3520
3749
  ctypes.c_void_p,
3521
3750
  ]
3522
- self.core.volume_get_voxels_device.argtypes = [
3751
+ self.core.wp_volume_get_voxels_device.argtypes = [
3523
3752
  ctypes.c_uint64,
3524
3753
  ctypes.c_void_p,
3525
3754
  ]
3526
- self.core.volume_destroy_device.argtypes = [ctypes.c_uint64]
3755
+ self.core.wp_volume_destroy_device.argtypes = [ctypes.c_uint64]
3527
3756
 
3528
- self.core.volume_from_tiles_device.argtypes = [
3757
+ self.core.wp_volume_from_tiles_device.argtypes = [
3529
3758
  ctypes.c_void_p,
3530
3759
  ctypes.c_void_p,
3531
3760
  ctypes.c_int,
@@ -3536,8 +3765,8 @@ class Runtime:
3536
3765
  ctypes.c_uint32,
3537
3766
  ctypes.c_char_p,
3538
3767
  ]
3539
- self.core.volume_from_tiles_device.restype = ctypes.c_uint64
3540
- self.core.volume_index_from_tiles_device.argtypes = [
3768
+ self.core.wp_volume_from_tiles_device.restype = ctypes.c_uint64
3769
+ self.core.wp_volume_index_from_tiles_device.argtypes = [
3541
3770
  ctypes.c_void_p,
3542
3771
  ctypes.c_void_p,
3543
3772
  ctypes.c_int,
@@ -3545,8 +3774,8 @@ class Runtime:
3545
3774
  ctypes.c_float * 3,
3546
3775
  ctypes.c_bool,
3547
3776
  ]
3548
- self.core.volume_index_from_tiles_device.restype = ctypes.c_uint64
3549
- self.core.volume_from_active_voxels_device.argtypes = [
3777
+ self.core.wp_volume_index_from_tiles_device.restype = ctypes.c_uint64
3778
+ self.core.wp_volume_from_active_voxels_device.argtypes = [
3550
3779
  ctypes.c_void_p,
3551
3780
  ctypes.c_void_p,
3552
3781
  ctypes.c_int,
@@ -3554,25 +3783,25 @@ class Runtime:
3554
3783
  ctypes.c_float * 3,
3555
3784
  ctypes.c_bool,
3556
3785
  ]
3557
- self.core.volume_from_active_voxels_device.restype = ctypes.c_uint64
3786
+ self.core.wp_volume_from_active_voxels_device.restype = ctypes.c_uint64
3558
3787
 
3559
- self.core.volume_get_buffer_info.argtypes = [
3788
+ self.core.wp_volume_get_buffer_info.argtypes = [
3560
3789
  ctypes.c_uint64,
3561
3790
  ctypes.POINTER(ctypes.c_void_p),
3562
3791
  ctypes.POINTER(ctypes.c_uint64),
3563
3792
  ]
3564
- self.core.volume_get_voxel_size.argtypes = [
3793
+ self.core.wp_volume_get_voxel_size.argtypes = [
3565
3794
  ctypes.c_uint64,
3566
3795
  ctypes.POINTER(ctypes.c_float),
3567
3796
  ctypes.POINTER(ctypes.c_float),
3568
3797
  ctypes.POINTER(ctypes.c_float),
3569
3798
  ]
3570
- self.core.volume_get_tile_and_voxel_count.argtypes = [
3799
+ self.core.wp_volume_get_tile_and_voxel_count.argtypes = [
3571
3800
  ctypes.c_uint64,
3572
3801
  ctypes.POINTER(ctypes.c_uint32),
3573
3802
  ctypes.POINTER(ctypes.c_uint64),
3574
3803
  ]
3575
- self.core.volume_get_grid_info.argtypes = [
3804
+ self.core.wp_volume_get_grid_info.argtypes = [
3576
3805
  ctypes.c_uint64,
3577
3806
  ctypes.POINTER(ctypes.c_uint64),
3578
3807
  ctypes.POINTER(ctypes.c_uint32),
@@ -3581,12 +3810,12 @@ class Runtime:
3581
3810
  ctypes.c_float * 9,
3582
3811
  ctypes.c_char * 16,
3583
3812
  ]
3584
- self.core.volume_get_grid_info.restype = ctypes.c_char_p
3585
- self.core.volume_get_blind_data_count.argtypes = [
3813
+ self.core.wp_volume_get_grid_info.restype = ctypes.c_char_p
3814
+ self.core.wp_volume_get_blind_data_count.argtypes = [
3586
3815
  ctypes.c_uint64,
3587
3816
  ]
3588
- self.core.volume_get_blind_data_count.restype = ctypes.c_uint64
3589
- self.core.volume_get_blind_data_info.argtypes = [
3817
+ self.core.wp_volume_get_blind_data_count.restype = ctypes.c_uint64
3818
+ self.core.wp_volume_get_blind_data_info.argtypes = [
3590
3819
  ctypes.c_uint64,
3591
3820
  ctypes.c_uint32,
3592
3821
  ctypes.POINTER(ctypes.c_void_p),
@@ -3594,7 +3823,7 @@ class Runtime:
3594
3823
  ctypes.POINTER(ctypes.c_uint32),
3595
3824
  ctypes.c_char * 16,
3596
3825
  ]
3597
- self.core.volume_get_blind_data_info.restype = ctypes.c_char_p
3826
+ self.core.wp_volume_get_blind_data_info.restype = ctypes.c_char_p
3598
3827
 
3599
3828
  bsr_matrix_from_triplets_argtypes = [
3600
3829
  ctypes.c_int, # block_size
@@ -3616,8 +3845,8 @@ class Runtime:
3616
3845
  ctypes.c_void_p, # bsr_nnz_event
3617
3846
  ]
3618
3847
 
3619
- self.core.bsr_matrix_from_triplets_host.argtypes = bsr_matrix_from_triplets_argtypes
3620
- self.core.bsr_matrix_from_triplets_device.argtypes = bsr_matrix_from_triplets_argtypes
3848
+ self.core.wp_bsr_matrix_from_triplets_host.argtypes = bsr_matrix_from_triplets_argtypes
3849
+ self.core.wp_bsr_matrix_from_triplets_device.argtypes = bsr_matrix_from_triplets_argtypes
3621
3850
 
3622
3851
  bsr_transpose_argtypes = [
3623
3852
  ctypes.c_int, # row_count
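bsr_matrix_from_triplets_argtypes (and the bsr_transpose_argtypes list started above) are built once and assigned to both the _host and _device entry points, so the two prototypes cannot drift apart. A sketch of that shared-prototype pattern with stand-in namespaces instead of the real native library handle:

    import ctypes
    from types import SimpleNamespace

    core = SimpleNamespace(
        wp_bsr_matrix_from_triplets_host=SimpleNamespace(),
        wp_bsr_matrix_from_triplets_device=SimpleNamespace(),
    )

    # abbreviated argument list; the real one carries block sizes, pointers, and an event handle
    bsr_matrix_from_triplets_argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p]
    for fn in (core.wp_bsr_matrix_from_triplets_host, core.wp_bsr_matrix_from_triplets_device):
        fn.argtypes = bsr_matrix_from_triplets_argtypes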
@@ -3629,229 +3858,232 @@ class Runtime:
3629
3858
  ctypes.POINTER(ctypes.c_int), # transposed_bsr_columns
3630
3859
  ctypes.POINTER(ctypes.c_int), # src to dest block map
3631
3860
  ]
3632
- self.core.bsr_transpose_host.argtypes = bsr_transpose_argtypes
3633
- self.core.bsr_transpose_device.argtypes = bsr_transpose_argtypes
3634
-
3635
- self.core.is_cuda_enabled.argtypes = None
3636
- self.core.is_cuda_enabled.restype = ctypes.c_int
3637
- self.core.is_cuda_compatibility_enabled.argtypes = None
3638
- self.core.is_cuda_compatibility_enabled.restype = ctypes.c_int
3639
- self.core.is_mathdx_enabled.argtypes = None
3640
- self.core.is_mathdx_enabled.restype = ctypes.c_int
3641
-
3642
- self.core.cuda_driver_version.argtypes = None
3643
- self.core.cuda_driver_version.restype = ctypes.c_int
3644
- self.core.cuda_toolkit_version.argtypes = None
3645
- self.core.cuda_toolkit_version.restype = ctypes.c_int
3646
- self.core.cuda_driver_is_initialized.argtypes = None
3647
- self.core.cuda_driver_is_initialized.restype = ctypes.c_bool
3648
-
3649
- self.core.nvrtc_supported_arch_count.argtypes = None
3650
- self.core.nvrtc_supported_arch_count.restype = ctypes.c_int
3651
- self.core.nvrtc_supported_archs.argtypes = [ctypes.POINTER(ctypes.c_int)]
3652
- self.core.nvrtc_supported_archs.restype = None
3653
-
3654
- self.core.cuda_device_get_count.argtypes = None
3655
- self.core.cuda_device_get_count.restype = ctypes.c_int
3656
- self.core.cuda_device_get_primary_context.argtypes = [ctypes.c_int]
3657
- self.core.cuda_device_get_primary_context.restype = ctypes.c_void_p
3658
- self.core.cuda_device_get_name.argtypes = [ctypes.c_int]
3659
- self.core.cuda_device_get_name.restype = ctypes.c_char_p
3660
- self.core.cuda_device_get_arch.argtypes = [ctypes.c_int]
3661
- self.core.cuda_device_get_arch.restype = ctypes.c_int
3662
- self.core.cuda_device_get_sm_count.argtypes = [ctypes.c_int]
3663
- self.core.cuda_device_get_sm_count.restype = ctypes.c_int
3664
- self.core.cuda_device_is_uva.argtypes = [ctypes.c_int]
3665
- self.core.cuda_device_is_uva.restype = ctypes.c_int
3666
- self.core.cuda_device_is_mempool_supported.argtypes = [ctypes.c_int]
3667
- self.core.cuda_device_is_mempool_supported.restype = ctypes.c_int
3668
- self.core.cuda_device_is_ipc_supported.argtypes = [ctypes.c_int]
3669
- self.core.cuda_device_is_ipc_supported.restype = ctypes.c_int
3670
- self.core.cuda_device_set_mempool_release_threshold.argtypes = [ctypes.c_int, ctypes.c_uint64]
3671
- self.core.cuda_device_set_mempool_release_threshold.restype = ctypes.c_int
3672
- self.core.cuda_device_get_mempool_release_threshold.argtypes = [ctypes.c_int]
3673
- self.core.cuda_device_get_mempool_release_threshold.restype = ctypes.c_uint64
3674
- self.core.cuda_device_get_mempool_used_mem_current.argtypes = [ctypes.c_int]
3675
- self.core.cuda_device_get_mempool_used_mem_current.restype = ctypes.c_uint64
3676
- self.core.cuda_device_get_mempool_used_mem_high.argtypes = [ctypes.c_int]
3677
- self.core.cuda_device_get_mempool_used_mem_high.restype = ctypes.c_uint64
3678
- self.core.cuda_device_get_memory_info.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
3679
- self.core.cuda_device_get_memory_info.restype = None
3680
- self.core.cuda_device_get_uuid.argtypes = [ctypes.c_int, ctypes.c_char * 16]
3681
- self.core.cuda_device_get_uuid.restype = None
3682
- self.core.cuda_device_get_pci_domain_id.argtypes = [ctypes.c_int]
3683
- self.core.cuda_device_get_pci_domain_id.restype = ctypes.c_int
3684
- self.core.cuda_device_get_pci_bus_id.argtypes = [ctypes.c_int]
3685
- self.core.cuda_device_get_pci_bus_id.restype = ctypes.c_int
3686
- self.core.cuda_device_get_pci_device_id.argtypes = [ctypes.c_int]
3687
- self.core.cuda_device_get_pci_device_id.restype = ctypes.c_int
3688
-
3689
- self.core.cuda_context_get_current.argtypes = None
3690
- self.core.cuda_context_get_current.restype = ctypes.c_void_p
3691
- self.core.cuda_context_set_current.argtypes = [ctypes.c_void_p]
3692
- self.core.cuda_context_set_current.restype = None
3693
- self.core.cuda_context_push_current.argtypes = [ctypes.c_void_p]
3694
- self.core.cuda_context_push_current.restype = None
3695
- self.core.cuda_context_pop_current.argtypes = None
3696
- self.core.cuda_context_pop_current.restype = None
3697
- self.core.cuda_context_create.argtypes = [ctypes.c_int]
3698
- self.core.cuda_context_create.restype = ctypes.c_void_p
3699
- self.core.cuda_context_destroy.argtypes = [ctypes.c_void_p]
3700
- self.core.cuda_context_destroy.restype = None
3701
- self.core.cuda_context_synchronize.argtypes = [ctypes.c_void_p]
3702
- self.core.cuda_context_synchronize.restype = None
3703
- self.core.cuda_context_check.argtypes = [ctypes.c_void_p]
3704
- self.core.cuda_context_check.restype = ctypes.c_uint64
3705
-
3706
- self.core.cuda_context_get_device_ordinal.argtypes = [ctypes.c_void_p]
3707
- self.core.cuda_context_get_device_ordinal.restype = ctypes.c_int
3708
- self.core.cuda_context_is_primary.argtypes = [ctypes.c_void_p]
3709
- self.core.cuda_context_is_primary.restype = ctypes.c_int
3710
- self.core.cuda_context_get_stream.argtypes = [ctypes.c_void_p]
3711
- self.core.cuda_context_get_stream.restype = ctypes.c_void_p
3712
- self.core.cuda_context_set_stream.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
3713
- self.core.cuda_context_set_stream.restype = None
3861
+ self.core.wp_bsr_transpose_host.argtypes = bsr_transpose_argtypes
3862
+ self.core.wp_bsr_transpose_device.argtypes = bsr_transpose_argtypes
3863
+
3864
+ self.core.wp_is_cuda_enabled.argtypes = None
3865
+ self.core.wp_is_cuda_enabled.restype = ctypes.c_int
3866
+ self.core.wp_is_cuda_compatibility_enabled.argtypes = None
3867
+ self.core.wp_is_cuda_compatibility_enabled.restype = ctypes.c_int
3868
+ self.core.wp_is_mathdx_enabled.argtypes = None
3869
+ self.core.wp_is_mathdx_enabled.restype = ctypes.c_int
3870
+
3871
+ self.core.wp_cuda_driver_version.argtypes = None
3872
+ self.core.wp_cuda_driver_version.restype = ctypes.c_int
3873
+ self.core.wp_cuda_toolkit_version.argtypes = None
3874
+ self.core.wp_cuda_toolkit_version.restype = ctypes.c_int
3875
+ self.core.wp_cuda_driver_is_initialized.argtypes = None
3876
+ self.core.wp_cuda_driver_is_initialized.restype = ctypes.c_bool
3877
+
3878
+ self.core.wp_nvrtc_supported_arch_count.argtypes = None
3879
+ self.core.wp_nvrtc_supported_arch_count.restype = ctypes.c_int
3880
+ self.core.wp_nvrtc_supported_archs.argtypes = [ctypes.POINTER(ctypes.c_int)]
3881
+ self.core.wp_nvrtc_supported_archs.restype = None
3882
+
3883
+ self.core.wp_cuda_device_get_count.argtypes = None
3884
+ self.core.wp_cuda_device_get_count.restype = ctypes.c_int
3885
+ self.core.wp_cuda_device_get_primary_context.argtypes = [ctypes.c_int]
3886
+ self.core.wp_cuda_device_get_primary_context.restype = ctypes.c_void_p
3887
+ self.core.wp_cuda_device_get_name.argtypes = [ctypes.c_int]
3888
+ self.core.wp_cuda_device_get_name.restype = ctypes.c_char_p
3889
+ self.core.wp_cuda_device_get_arch.argtypes = [ctypes.c_int]
3890
+ self.core.wp_cuda_device_get_arch.restype = ctypes.c_int
3891
+ self.core.wp_cuda_device_get_sm_count.argtypes = [ctypes.c_int]
3892
+ self.core.wp_cuda_device_get_sm_count.restype = ctypes.c_int
3893
+ self.core.wp_cuda_device_is_uva.argtypes = [ctypes.c_int]
3894
+ self.core.wp_cuda_device_is_uva.restype = ctypes.c_int
3895
+ self.core.wp_cuda_device_is_mempool_supported.argtypes = [ctypes.c_int]
3896
+ self.core.wp_cuda_device_is_mempool_supported.restype = ctypes.c_int
3897
+ self.core.wp_cuda_device_is_ipc_supported.argtypes = [ctypes.c_int]
3898
+ self.core.wp_cuda_device_is_ipc_supported.restype = ctypes.c_int
3899
+ self.core.wp_cuda_device_set_mempool_release_threshold.argtypes = [ctypes.c_int, ctypes.c_uint64]
3900
+ self.core.wp_cuda_device_set_mempool_release_threshold.restype = ctypes.c_int
3901
+ self.core.wp_cuda_device_get_mempool_release_threshold.argtypes = [ctypes.c_int]
3902
+ self.core.wp_cuda_device_get_mempool_release_threshold.restype = ctypes.c_uint64
3903
+ self.core.wp_cuda_device_get_mempool_used_mem_current.argtypes = [ctypes.c_int]
3904
+ self.core.wp_cuda_device_get_mempool_used_mem_current.restype = ctypes.c_uint64
3905
+ self.core.wp_cuda_device_get_mempool_used_mem_high.argtypes = [ctypes.c_int]
3906
+ self.core.wp_cuda_device_get_mempool_used_mem_high.restype = ctypes.c_uint64
3907
+ self.core.wp_cuda_device_get_memory_info.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
3908
+ self.core.wp_cuda_device_get_memory_info.restype = None
3909
+ self.core.wp_cuda_device_get_uuid.argtypes = [ctypes.c_int, ctypes.c_char * 16]
3910
+ self.core.wp_cuda_device_get_uuid.restype = None
3911
+ self.core.wp_cuda_device_get_pci_domain_id.argtypes = [ctypes.c_int]
3912
+ self.core.wp_cuda_device_get_pci_domain_id.restype = ctypes.c_int
3913
+ self.core.wp_cuda_device_get_pci_bus_id.argtypes = [ctypes.c_int]
3914
+ self.core.wp_cuda_device_get_pci_bus_id.restype = ctypes.c_int
3915
+ self.core.wp_cuda_device_get_pci_device_id.argtypes = [ctypes.c_int]
3916
+ self.core.wp_cuda_device_get_pci_device_id.restype = ctypes.c_int
3917
+
3918
+ self.core.wp_cuda_context_get_current.argtypes = None
3919
+ self.core.wp_cuda_context_get_current.restype = ctypes.c_void_p
3920
+ self.core.wp_cuda_context_set_current.argtypes = [ctypes.c_void_p]
3921
+ self.core.wp_cuda_context_set_current.restype = None
3922
+ self.core.wp_cuda_context_push_current.argtypes = [ctypes.c_void_p]
3923
+ self.core.wp_cuda_context_push_current.restype = None
3924
+ self.core.wp_cuda_context_pop_current.argtypes = None
3925
+ self.core.wp_cuda_context_pop_current.restype = None
3926
+ self.core.wp_cuda_context_create.argtypes = [ctypes.c_int]
3927
+ self.core.wp_cuda_context_create.restype = ctypes.c_void_p
3928
+ self.core.wp_cuda_context_destroy.argtypes = [ctypes.c_void_p]
3929
+ self.core.wp_cuda_context_destroy.restype = None
3930
+ self.core.wp_cuda_context_synchronize.argtypes = [ctypes.c_void_p]
3931
+ self.core.wp_cuda_context_synchronize.restype = None
3932
+ self.core.wp_cuda_context_check.argtypes = [ctypes.c_void_p]
3933
+ self.core.wp_cuda_context_check.restype = ctypes.c_uint64
3934
+
3935
+ self.core.wp_cuda_context_get_device_ordinal.argtypes = [ctypes.c_void_p]
3936
+ self.core.wp_cuda_context_get_device_ordinal.restype = ctypes.c_int
3937
+ self.core.wp_cuda_context_is_primary.argtypes = [ctypes.c_void_p]
3938
+ self.core.wp_cuda_context_is_primary.restype = ctypes.c_int
3939
+ self.core.wp_cuda_context_get_stream.argtypes = [ctypes.c_void_p]
3940
+ self.core.wp_cuda_context_get_stream.restype = ctypes.c_void_p
3941
+ self.core.wp_cuda_context_set_stream.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
3942
+ self.core.wp_cuda_context_set_stream.restype = None
3714
3943
 
3715
3944
  # peer access
3716
- self.core.cuda_is_peer_access_supported.argtypes = [ctypes.c_int, ctypes.c_int]
3717
- self.core.cuda_is_peer_access_supported.restype = ctypes.c_int
3718
- self.core.cuda_is_peer_access_enabled.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3719
- self.core.cuda_is_peer_access_enabled.restype = ctypes.c_int
3720
- self.core.cuda_set_peer_access_enabled.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
3721
- self.core.cuda_set_peer_access_enabled.restype = ctypes.c_int
3722
- self.core.cuda_is_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int]
3723
- self.core.cuda_is_mempool_access_enabled.restype = ctypes.c_int
3724
- self.core.cuda_set_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
3725
- self.core.cuda_set_mempool_access_enabled.restype = ctypes.c_int
3945
+ self.core.wp_cuda_is_peer_access_supported.argtypes = [ctypes.c_int, ctypes.c_int]
3946
+ self.core.wp_cuda_is_peer_access_supported.restype = ctypes.c_int
3947
+ self.core.wp_cuda_is_peer_access_enabled.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3948
+ self.core.wp_cuda_is_peer_access_enabled.restype = ctypes.c_int
3949
+ self.core.wp_cuda_set_peer_access_enabled.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
3950
+ self.core.wp_cuda_set_peer_access_enabled.restype = ctypes.c_int
3951
+ self.core.wp_cuda_is_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int]
3952
+ self.core.wp_cuda_is_mempool_access_enabled.restype = ctypes.c_int
3953
+ self.core.wp_cuda_set_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
3954
+ self.core.wp_cuda_set_mempool_access_enabled.restype = ctypes.c_int
3726
3955
 
3727
3956
  # inter-process communication
3728
- self.core.cuda_ipc_get_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
3729
- self.core.cuda_ipc_get_mem_handle.restype = None
3730
- self.core.cuda_ipc_open_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
3731
- self.core.cuda_ipc_open_mem_handle.restype = ctypes.c_void_p
3732
- self.core.cuda_ipc_close_mem_handle.argtypes = [ctypes.c_void_p]
3733
- self.core.cuda_ipc_close_mem_handle.restype = None
3734
- self.core.cuda_ipc_get_event_handle.argtypes = [
3957
+ self.core.wp_cuda_ipc_get_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
3958
+ self.core.wp_cuda_ipc_get_mem_handle.restype = None
3959
+ self.core.wp_cuda_ipc_open_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
3960
+ self.core.wp_cuda_ipc_open_mem_handle.restype = ctypes.c_void_p
3961
+ self.core.wp_cuda_ipc_close_mem_handle.argtypes = [ctypes.c_void_p]
3962
+ self.core.wp_cuda_ipc_close_mem_handle.restype = None
3963
+ self.core.wp_cuda_ipc_get_event_handle.argtypes = [
3735
3964
  ctypes.c_void_p,
3736
3965
  ctypes.c_void_p,
3737
3966
  ctypes.POINTER(ctypes.c_char),
3738
3967
  ]
3739
- self.core.cuda_ipc_get_event_handle.restype = None
3740
- self.core.cuda_ipc_open_event_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
3741
- self.core.cuda_ipc_open_event_handle.restype = ctypes.c_void_p
3742
-
3743
- self.core.cuda_stream_create.argtypes = [ctypes.c_void_p, ctypes.c_int]
3744
- self.core.cuda_stream_create.restype = ctypes.c_void_p
3745
- self.core.cuda_stream_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3746
- self.core.cuda_stream_destroy.restype = None
3747
- self.core.cuda_stream_query.argtypes = [ctypes.c_void_p]
3748
- self.core.cuda_stream_query.restype = ctypes.c_int
3749
- self.core.cuda_stream_register.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3750
- self.core.cuda_stream_register.restype = None
3751
- self.core.cuda_stream_unregister.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3752
- self.core.cuda_stream_unregister.restype = None
3753
- self.core.cuda_stream_synchronize.argtypes = [ctypes.c_void_p]
3754
- self.core.cuda_stream_synchronize.restype = None
3755
- self.core.cuda_stream_wait_event.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3756
- self.core.cuda_stream_wait_event.restype = None
3757
- self.core.cuda_stream_wait_stream.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
3758
- self.core.cuda_stream_wait_stream.restype = None
3759
- self.core.cuda_stream_is_capturing.argtypes = [ctypes.c_void_p]
3760
- self.core.cuda_stream_is_capturing.restype = ctypes.c_int
3761
- self.core.cuda_stream_get_capture_id.argtypes = [ctypes.c_void_p]
3762
- self.core.cuda_stream_get_capture_id.restype = ctypes.c_uint64
3763
- self.core.cuda_stream_get_priority.argtypes = [ctypes.c_void_p]
3764
- self.core.cuda_stream_get_priority.restype = ctypes.c_int
3765
-
3766
- self.core.cuda_event_create.argtypes = [ctypes.c_void_p, ctypes.c_uint]
3767
- self.core.cuda_event_create.restype = ctypes.c_void_p
3768
- self.core.cuda_event_destroy.argtypes = [ctypes.c_void_p]
3769
- self.core.cuda_event_destroy.restype = None
3770
- self.core.cuda_event_query.argtypes = [ctypes.c_void_p]
3771
- self.core.cuda_event_query.restype = ctypes.c_int
3772
- self.core.cuda_event_record.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_bool]
3773
- self.core.cuda_event_record.restype = None
3774
- self.core.cuda_event_synchronize.argtypes = [ctypes.c_void_p]
3775
- self.core.cuda_event_synchronize.restype = None
3776
- self.core.cuda_event_elapsed_time.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3777
- self.core.cuda_event_elapsed_time.restype = ctypes.c_float
3778
-
3779
- self.core.cuda_graph_begin_capture.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
3780
- self.core.cuda_graph_begin_capture.restype = ctypes.c_bool
3781
- self.core.cuda_graph_end_capture.argtypes = [
3968
+ self.core.wp_cuda_ipc_get_event_handle.restype = None
3969
+ self.core.wp_cuda_ipc_open_event_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
3970
+ self.core.wp_cuda_ipc_open_event_handle.restype = ctypes.c_void_p
3971
+
3972
+ self.core.wp_cuda_stream_create.argtypes = [ctypes.c_void_p, ctypes.c_int]
3973
+ self.core.wp_cuda_stream_create.restype = ctypes.c_void_p
3974
+ self.core.wp_cuda_stream_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3975
+ self.core.wp_cuda_stream_destroy.restype = None
3976
+ self.core.wp_cuda_stream_query.argtypes = [ctypes.c_void_p]
3977
+ self.core.wp_cuda_stream_query.restype = ctypes.c_int
3978
+ self.core.wp_cuda_stream_register.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3979
+ self.core.wp_cuda_stream_register.restype = None
3980
+ self.core.wp_cuda_stream_unregister.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3981
+ self.core.wp_cuda_stream_unregister.restype = None
3982
+ self.core.wp_cuda_stream_synchronize.argtypes = [ctypes.c_void_p]
3983
+ self.core.wp_cuda_stream_synchronize.restype = None
3984
+ self.core.wp_cuda_stream_wait_event.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3985
+ self.core.wp_cuda_stream_wait_event.restype = None
3986
+ self.core.wp_cuda_stream_wait_stream.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
3987
+ self.core.wp_cuda_stream_wait_stream.restype = None
3988
+ self.core.wp_cuda_stream_is_capturing.argtypes = [ctypes.c_void_p]
3989
+ self.core.wp_cuda_stream_is_capturing.restype = ctypes.c_int
3990
+ self.core.wp_cuda_stream_get_capture_id.argtypes = [ctypes.c_void_p]
3991
+ self.core.wp_cuda_stream_get_capture_id.restype = ctypes.c_uint64
3992
+ self.core.wp_cuda_stream_get_priority.argtypes = [ctypes.c_void_p]
3993
+ self.core.wp_cuda_stream_get_priority.restype = ctypes.c_int
3994
+
3995
+ self.core.wp_cuda_event_create.argtypes = [ctypes.c_void_p, ctypes.c_uint]
3996
+ self.core.wp_cuda_event_create.restype = ctypes.c_void_p
3997
+ self.core.wp_cuda_event_destroy.argtypes = [ctypes.c_void_p]
3998
+ self.core.wp_cuda_event_destroy.restype = None
3999
+ self.core.wp_cuda_event_query.argtypes = [ctypes.c_void_p]
4000
+ self.core.wp_cuda_event_query.restype = ctypes.c_int
4001
+ self.core.wp_cuda_event_record.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_bool]
4002
+ self.core.wp_cuda_event_record.restype = None
4003
+ self.core.wp_cuda_event_synchronize.argtypes = [ctypes.c_void_p]
4004
+ self.core.wp_cuda_event_synchronize.restype = None
4005
+ self.core.wp_cuda_event_elapsed_time.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
4006
+ self.core.wp_cuda_event_elapsed_time.restype = ctypes.c_float
4007
+
4008
+ self.core.wp_cuda_graph_begin_capture.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
4009
+ self.core.wp_cuda_graph_begin_capture.restype = ctypes.c_bool
4010
+ self.core.wp_cuda_graph_end_capture.argtypes = [
3782
4011
  ctypes.c_void_p,
3783
4012
  ctypes.c_void_p,
3784
4013
  ctypes.POINTER(ctypes.c_void_p),
3785
4014
  ]
3786
- self.core.cuda_graph_end_capture.restype = ctypes.c_bool
4015
+ self.core.wp_cuda_graph_end_capture.restype = ctypes.c_bool
3787
4016
 
3788
- self.core.cuda_graph_create_exec.argtypes = [
4017
+ self.core.wp_cuda_graph_create_exec.argtypes = [
3789
4018
  ctypes.c_void_p,
3790
4019
  ctypes.c_void_p,
3791
4020
  ctypes.c_void_p,
3792
4021
  ctypes.POINTER(ctypes.c_void_p),
3793
4022
  ]
3794
- self.core.cuda_graph_create_exec.restype = ctypes.c_bool
4023
+ self.core.wp_cuda_graph_create_exec.restype = ctypes.c_bool
3795
4024
 
3796
- self.core.capture_debug_dot_print.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_uint32]
3797
- self.core.capture_debug_dot_print.restype = ctypes.c_bool
4025
+ self.core.wp_capture_debug_dot_print.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_uint32]
4026
+ self.core.wp_capture_debug_dot_print.restype = ctypes.c_bool
3798
4027
 
3799
- self.core.cuda_graph_launch.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3800
- self.core.cuda_graph_launch.restype = ctypes.c_bool
3801
- self.core.cuda_graph_exec_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3802
- self.core.cuda_graph_exec_destroy.restype = ctypes.c_bool
4028
+ self.core.wp_cuda_graph_launch.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
4029
+ self.core.wp_cuda_graph_launch.restype = ctypes.c_bool
4030
+ self.core.wp_cuda_graph_exec_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
4031
+ self.core.wp_cuda_graph_exec_destroy.restype = ctypes.c_bool
3803
4032
 
3804
- self.core.cuda_graph_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3805
- self.core.cuda_graph_destroy.restype = ctypes.c_bool
4033
+ self.core.wp_cuda_graph_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
4034
+ self.core.wp_cuda_graph_destroy.restype = ctypes.c_bool
3806
4035
 
3807
- self.core.cuda_graph_insert_if_else.argtypes = [
4036
+ self.core.wp_cuda_graph_insert_if_else.argtypes = [
3808
4037
  ctypes.c_void_p,
3809
4038
  ctypes.c_void_p,
3810
4039
  ctypes.POINTER(ctypes.c_int),
3811
4040
  ctypes.POINTER(ctypes.c_void_p),
3812
4041
  ctypes.POINTER(ctypes.c_void_p),
3813
4042
  ]
3814
- self.core.cuda_graph_insert_if_else.restype = ctypes.c_bool
4043
+ self.core.wp_cuda_graph_insert_if_else.restype = ctypes.c_bool
3815
4044
 
3816
- self.core.cuda_graph_insert_while.argtypes = [
4045
+ self.core.wp_cuda_graph_insert_while.argtypes = [
3817
4046
  ctypes.c_void_p,
3818
4047
  ctypes.c_void_p,
3819
4048
  ctypes.POINTER(ctypes.c_int),
3820
4049
  ctypes.POINTER(ctypes.c_void_p),
3821
4050
  ctypes.POINTER(ctypes.c_uint64),
3822
4051
  ]
3823
- self.core.cuda_graph_insert_while.restype = ctypes.c_bool
4052
+ self.core.wp_cuda_graph_insert_while.restype = ctypes.c_bool
3824
4053
 
3825
- self.core.cuda_graph_set_condition.argtypes = [
4054
+ self.core.wp_cuda_graph_set_condition.argtypes = [
3826
4055
  ctypes.c_void_p,
3827
4056
  ctypes.c_void_p,
3828
4057
  ctypes.POINTER(ctypes.c_int),
3829
4058
  ctypes.c_uint64,
3830
4059
  ]
3831
- self.core.cuda_graph_set_condition.restype = ctypes.c_bool
4060
+ self.core.wp_cuda_graph_set_condition.restype = ctypes.c_bool
3832
4061
 
3833
- self.core.cuda_graph_pause_capture.argtypes = [
4062
+ self.core.wp_cuda_graph_pause_capture.argtypes = [
3834
4063
  ctypes.c_void_p,
3835
4064
  ctypes.c_void_p,
3836
4065
  ctypes.POINTER(ctypes.c_void_p),
3837
4066
  ]
3838
- self.core.cuda_graph_pause_capture.restype = ctypes.c_bool
4067
+ self.core.wp_cuda_graph_pause_capture.restype = ctypes.c_bool
3839
4068
 
3840
- self.core.cuda_graph_resume_capture.argtypes = [
4069
+ self.core.wp_cuda_graph_resume_capture.argtypes = [
3841
4070
  ctypes.c_void_p,
3842
4071
  ctypes.c_void_p,
3843
4072
  ctypes.c_void_p,
3844
4073
  ]
3845
- self.core.cuda_graph_resume_capture.restype = ctypes.c_bool
4074
+ self.core.wp_cuda_graph_resume_capture.restype = ctypes.c_bool
3846
4075
 
3847
- self.core.cuda_graph_insert_child_graph.argtypes = [
4076
+ self.core.wp_cuda_graph_insert_child_graph.argtypes = [
3848
4077
  ctypes.c_void_p,
3849
4078
  ctypes.c_void_p,
3850
4079
  ctypes.c_void_p,
3851
4080
  ]
3852
- self.core.cuda_graph_insert_child_graph.restype = ctypes.c_bool
4081
+ self.core.wp_cuda_graph_insert_child_graph.restype = ctypes.c_bool
3853
4082
 
3854
- self.core.cuda_compile_program.argtypes = [
4083
+ self.core.wp_cuda_graph_check_conditional_body.argtypes = [ctypes.c_void_p]
4084
+ self.core.wp_cuda_graph_check_conditional_body.restype = ctypes.c_bool
4085
+
4086
+ self.core.wp_cuda_compile_program.argtypes = [
3855
4087
  ctypes.c_char_p, # cuda_src
3856
4088
  ctypes.c_char_p, # program name
3857
4089
  ctypes.c_int, # arch
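Almost every binding in the hunk above changes only by gaining a wp_ prefix on the native symbol name (plus the new wp_cuda_graph_check_conditional_body entry point). External code that resolved the old names dynamically would need a small fallback; a hypothetical sketch, not part of Warp:

    def resolve_core_symbol(core, name):
        # Hypothetical helper: prefer the 1.9-style "wp_" export, fall back to the
        # pre-1.9 unprefixed name if an older library is loaded.
        for candidate in (f"wp_{name}", name):
            fn = getattr(core, candidate, None)
            if fn is not None:
                return fn
        raise AttributeError(f"native symbol not found for {name!r}")

    # usage sketch: resolve_core_symbol(runtime.core, "cuda_device_get_count")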
@@ -3871,9 +4103,9 @@ class Runtime:
3871
4103
  ctypes.POINTER(ctypes.c_size_t), # ltoir_sizes
3872
4104
  ctypes.POINTER(ctypes.c_int), # ltoir_input_types, each of type nvJitLinkInputType
3873
4105
  ]
3874
- self.core.cuda_compile_program.restype = ctypes.c_size_t
4106
+ self.core.wp_cuda_compile_program.restype = ctypes.c_size_t
3875
4107
 
3876
- self.core.cuda_compile_fft.argtypes = [
4108
+ self.core.wp_cuda_compile_fft.argtypes = [
3877
4109
  ctypes.c_char_p, # lto
3878
4110
  ctypes.c_char_p, # function name
3879
4111
  ctypes.c_int, # num include dirs
@@ -3886,9 +4118,9 @@ class Runtime:
3886
4118
  ctypes.c_int, # precision
3887
4119
  ctypes.POINTER(ctypes.c_int), # smem (out)
3888
4120
  ]
3889
- self.core.cuda_compile_fft.restype = ctypes.c_bool
4121
+ self.core.wp_cuda_compile_fft.restype = ctypes.c_bool
3890
4122
 
3891
- self.core.cuda_compile_dot.argtypes = [
4123
+ self.core.wp_cuda_compile_dot.argtypes = [
3892
4124
  ctypes.c_char_p, # lto
3893
4125
  ctypes.c_char_p, # function name
3894
4126
  ctypes.c_int, # num include dirs
@@ -3907,9 +4139,9 @@ class Runtime:
3907
4139
  ctypes.c_int, # c_arrangement
3908
4140
  ctypes.c_int, # num threads
3909
4141
  ]
3910
- self.core.cuda_compile_dot.restype = ctypes.c_bool
4142
+ self.core.wp_cuda_compile_dot.restype = ctypes.c_bool
3911
4143
 
3912
- self.core.cuda_compile_solver.argtypes = [
4144
+ self.core.wp_cuda_compile_solver.argtypes = [
3913
4145
  ctypes.c_char_p, # universal fatbin
3914
4146
  ctypes.c_char_p, # lto
3915
4147
  ctypes.c_char_p, # function name
@@ -3929,24 +4161,24 @@ class Runtime:
3929
4161
  ctypes.c_int, # fill_mode
3930
4162
  ctypes.c_int, # num threads
3931
4163
  ]
3932
- self.core.cuda_compile_solver.restype = ctypes.c_bool
4164
+ self.core.wp_cuda_compile_solver.restype = ctypes.c_bool
3933
4165
 
3934
- self.core.cuda_load_module.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
3935
- self.core.cuda_load_module.restype = ctypes.c_void_p
4166
+ self.core.wp_cuda_load_module.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
4167
+ self.core.wp_cuda_load_module.restype = ctypes.c_void_p
3936
4168
 
3937
- self.core.cuda_unload_module.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3938
- self.core.cuda_unload_module.restype = None
4169
+ self.core.wp_cuda_unload_module.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
4170
+ self.core.wp_cuda_unload_module.restype = None
3939
4171
 
3940
- self.core.cuda_get_kernel.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_char_p]
3941
- self.core.cuda_get_kernel.restype = ctypes.c_void_p
4172
+ self.core.wp_cuda_get_kernel.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_char_p]
4173
+ self.core.wp_cuda_get_kernel.restype = ctypes.c_void_p
3942
4174
 
3943
- self.core.cuda_get_max_shared_memory.argtypes = [ctypes.c_void_p]
3944
- self.core.cuda_get_max_shared_memory.restype = ctypes.c_int
4175
+ self.core.wp_cuda_get_max_shared_memory.argtypes = [ctypes.c_void_p]
4176
+ self.core.wp_cuda_get_max_shared_memory.restype = ctypes.c_int
3945
4177
 
3946
- self.core.cuda_configure_kernel_shared_memory.argtypes = [ctypes.c_void_p, ctypes.c_int]
3947
- self.core.cuda_configure_kernel_shared_memory.restype = ctypes.c_bool
4178
+ self.core.wp_cuda_configure_kernel_shared_memory.argtypes = [ctypes.c_void_p, ctypes.c_int]
4179
+ self.core.wp_cuda_configure_kernel_shared_memory.restype = ctypes.c_bool
3948
4180
 
3949
- self.core.cuda_launch_kernel.argtypes = [
4181
+ self.core.wp_cuda_launch_kernel.argtypes = [
3950
4182
  ctypes.c_void_p,
3951
4183
  ctypes.c_void_p,
3952
4184
  ctypes.c_size_t,
@@ -3956,54 +4188,54 @@ class Runtime:
3956
4188
  ctypes.POINTER(ctypes.c_void_p),
3957
4189
  ctypes.c_void_p,
3958
4190
  ]
3959
- self.core.cuda_launch_kernel.restype = ctypes.c_size_t
4191
+ self.core.wp_cuda_launch_kernel.restype = ctypes.c_size_t
3960
4192
 
3961
- self.core.cuda_graphics_map.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3962
- self.core.cuda_graphics_map.restype = None
3963
- self.core.cuda_graphics_unmap.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3964
- self.core.cuda_graphics_unmap.restype = None
3965
- self.core.cuda_graphics_device_ptr_and_size.argtypes = [
4193
+ self.core.wp_cuda_graphics_map.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
4194
+ self.core.wp_cuda_graphics_map.restype = None
4195
+ self.core.wp_cuda_graphics_unmap.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
4196
+ self.core.wp_cuda_graphics_unmap.restype = None
4197
+ self.core.wp_cuda_graphics_device_ptr_and_size.argtypes = [
3966
4198
  ctypes.c_void_p,
3967
4199
  ctypes.c_void_p,
3968
4200
  ctypes.POINTER(ctypes.c_uint64),
3969
4201
  ctypes.POINTER(ctypes.c_size_t),
3970
4202
  ]
3971
- self.core.cuda_graphics_device_ptr_and_size.restype = None
3972
- self.core.cuda_graphics_register_gl_buffer.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint]
3973
- self.core.cuda_graphics_register_gl_buffer.restype = ctypes.c_void_p
3974
- self.core.cuda_graphics_unregister_resource.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3975
- self.core.cuda_graphics_unregister_resource.restype = None
3976
-
3977
- self.core.cuda_timing_begin.argtypes = [ctypes.c_int]
3978
- self.core.cuda_timing_begin.restype = None
3979
- self.core.cuda_timing_get_result_count.argtypes = []
3980
- self.core.cuda_timing_get_result_count.restype = int
3981
- self.core.cuda_timing_end.argtypes = []
3982
- self.core.cuda_timing_end.restype = None
3983
-
3984
- self.core.graph_coloring.argtypes = [
4203
+ self.core.wp_cuda_graphics_device_ptr_and_size.restype = None
4204
+ self.core.wp_cuda_graphics_register_gl_buffer.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint]
4205
+ self.core.wp_cuda_graphics_register_gl_buffer.restype = ctypes.c_void_p
4206
+ self.core.wp_cuda_graphics_unregister_resource.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
4207
+ self.core.wp_cuda_graphics_unregister_resource.restype = None
4208
+
4209
+ self.core.wp_cuda_timing_begin.argtypes = [ctypes.c_int]
4210
+ self.core.wp_cuda_timing_begin.restype = None
4211
+ self.core.wp_cuda_timing_get_result_count.argtypes = []
4212
+ self.core.wp_cuda_timing_get_result_count.restype = int
4213
+ self.core.wp_cuda_timing_end.argtypes = []
4214
+ self.core.wp_cuda_timing_end.restype = None
4215
+
4216
+ self.core.wp_graph_coloring.argtypes = [
3985
4217
  ctypes.c_int,
3986
4218
  warp.types.array_t,
3987
4219
  ctypes.c_int,
3988
4220
  warp.types.array_t,
3989
4221
  ]
3990
- self.core.graph_coloring.restype = ctypes.c_int
4222
+ self.core.wp_graph_coloring.restype = ctypes.c_int
3991
4223
 
3992
- self.core.balance_coloring.argtypes = [
4224
+ self.core.wp_balance_coloring.argtypes = [
3993
4225
  ctypes.c_int,
3994
4226
  warp.types.array_t,
3995
4227
  ctypes.c_int,
3996
4228
  ctypes.c_float,
3997
4229
  warp.types.array_t,
3998
4230
  ]
3999
- self.core.balance_coloring.restype = ctypes.c_float
4231
+ self.core.wp_balance_coloring.restype = ctypes.c_float
4000
4232
 
4001
- self.core.init.restype = ctypes.c_int
4233
+ self.core.wp_init.restype = ctypes.c_int
4002
4234
 
4003
4235
  except AttributeError as e:
4004
4236
  raise RuntimeError(f"Setting C-types for {warp_lib} failed. It may need rebuilding.") from e
4005
4237
 
4006
- error = self.core.init()
4238
+ error = self.core.wp_init()
4007
4239
 
4008
4240
  if error != 0:
4009
4241
  raise Exception("Warp initialization failed")
@@ -4019,8 +4251,8 @@ class Runtime:
4019
4251
  self.device_map["cpu"] = self.cpu_device
4020
4252
  self.context_map[None] = self.cpu_device
4021
4253
 
4022
- self.is_cuda_enabled = bool(self.core.is_cuda_enabled())
4023
- self.is_cuda_compatibility_enabled = bool(self.core.is_cuda_compatibility_enabled())
4254
+ self.is_cuda_enabled = bool(self.core.wp_is_cuda_enabled())
4255
+ self.is_cuda_compatibility_enabled = bool(self.core.wp_is_cuda_compatibility_enabled())
4024
4256
 
4025
4257
  self.toolkit_version = None # CTK version used to build the core lib
4026
4258
  self.driver_version = None # installed driver version
@@ -4033,12 +4265,15 @@ class Runtime:
 
         if self.is_cuda_enabled:
             # get CUDA Toolkit and driver versions
-            toolkit_version = self.core.cuda_toolkit_version()
-            driver_version = self.core.cuda_driver_version()
-
-            # save versions as tuples, e.g., (12, 4)
+            toolkit_version = self.core.wp_cuda_toolkit_version()
             self.toolkit_version = (toolkit_version // 1000, (toolkit_version % 1000) // 10)
-            self.driver_version = (driver_version // 1000, (driver_version % 1000) // 10)
+
+            if self.core.wp_cuda_driver_is_initialized():
+                # save versions as tuples, e.g., (12, 4)
+                driver_version = self.core.wp_cuda_driver_version()
+                self.driver_version = (driver_version // 1000, (driver_version % 1000) // 10)
+            else:
+                self.driver_version = None
 
             # determine minimum required driver version
             if self.is_cuda_compatibility_enabled:
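Note: the hunk above stores the integer versions returned by the core library as (major, minor) tuples. A minimal standalone sketch of the same decoding arithmetic; the specific version values are illustrative, not taken from the diff:

    # decode an encoded CUDA version integer into a (major, minor) tuple,
    # mirroring the expression used above: 12040 -> (12, 4)
    def decode_cuda_version(version: int) -> tuple[int, int]:
        return (version // 1000, (version % 1000) // 10)

    assert decode_cuda_version(12040) == (12, 4)  # e.g. CUDA 12.4
    assert decode_cuda_version(11080) == (11, 8)  # e.g. CUDA 11.8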
@@ -4052,18 +4287,18 @@ class Runtime:
4052
4287
  self.min_driver_version = self.toolkit_version
4053
4288
 
4054
4289
  # determine if the installed driver is sufficient
4055
- if self.driver_version >= self.min_driver_version:
4290
+ if self.driver_version is not None and self.driver_version >= self.min_driver_version:
4056
4291
  # get all architectures supported by NVRTC
4057
- num_archs = self.core.nvrtc_supported_arch_count()
4292
+ num_archs = self.core.wp_nvrtc_supported_arch_count()
4058
4293
  if num_archs > 0:
4059
4294
  archs = (ctypes.c_int * num_archs)()
4060
- self.core.nvrtc_supported_archs(archs)
4295
+ self.core.wp_nvrtc_supported_archs(archs)
4061
4296
  self.nvrtc_supported_archs = set(archs)
4062
4297
  else:
4063
4298
  self.nvrtc_supported_archs = set()
4064
4299
 
4065
4300
  # get CUDA device count
4066
- cuda_device_count = self.core.cuda_device_get_count()
4301
+ cuda_device_count = self.core.wp_cuda_device_get_count()
4067
4302
 
4068
4303
  # register primary CUDA devices
4069
4304
  for i in range(cuda_device_count):
@@ -4080,7 +4315,7 @@ class Runtime:
4080
4315
  # set default device
4081
4316
  if cuda_device_count > 0:
4082
4317
  # stick with the current cuda context, if one is bound
4083
- initial_context = self.core.cuda_context_get_current()
4318
+ initial_context = self.core.wp_cuda_context_get_current()
4084
4319
  if initial_context is not None:
4085
4320
  self.set_default_device("cuda")
4086
4321
  # if this is a non-primary context that was just registered, update the device count
@@ -4133,6 +4368,8 @@ class Runtime:
         if not self.is_cuda_enabled:
             # Warp was compiled without CUDA support
             greeting.append(" CUDA not enabled in this build")
+        elif self.driver_version is None:
+            greeting.append(" CUDA driver not found or failed to initialize")
         elif self.driver_version < self.min_driver_version:
            # insufficient CUDA driver version
            greeting.append(
@@ -4176,7 +4413,7 @@ class Runtime:
4176
4413
  access_vector.append(1)
4177
4414
  else:
4178
4415
  peer_device = self.cuda_devices[j]
4179
- can_access = self.core.cuda_is_peer_access_supported(
4416
+ can_access = self.core.wp_cuda_is_peer_access_supported(
4180
4417
  target_device.ordinal, peer_device.ordinal
4181
4418
  )
4182
4419
  access_vector.append(can_access)
@@ -4201,7 +4438,7 @@ class Runtime:
4201
4438
 
4202
4439
  if cuda_device_count > 0:
4203
4440
  # ensure initialization did not change the initial context (e.g. querying available memory)
4204
- self.core.cuda_context_set_current(initial_context)
4441
+ self.core.wp_cuda_context_set_current(initial_context)
4205
4442
 
4206
4443
  # detect possible misconfiguration of the system
4207
4444
  devices_without_uva = []
@@ -4229,7 +4466,7 @@ class Runtime:
4229
4466
  elif self.is_cuda_enabled:
4230
4467
  # Report a warning about insufficient driver version. The warning should appear even in quiet mode
4231
4468
  # when the greeting message is suppressed. Also try to provide guidance for resolving the situation.
4232
- if self.driver_version < self.min_driver_version:
4469
+ if self.driver_version is not None and self.driver_version < self.min_driver_version:
4233
4470
  msg = []
4234
4471
  msg.append("\n Insufficient CUDA driver version.")
4235
4472
  msg.append(
@@ -4240,7 +4477,7 @@ class Runtime:
4240
4477
  warp.utils.warn("\n ".join(msg))
4241
4478
 
4242
4479
  def get_error_string(self):
4243
- return self.core.get_error_string().decode("utf-8")
4480
+ return self.core.wp_get_error_string().decode("utf-8")
4244
4481
 
4245
4482
  def load_dll(self, dll_path):
4246
4483
  try:
@@ -4276,21 +4513,21 @@ class Runtime:
4276
4513
  self.default_device = self.get_device(ident)
4277
4514
 
4278
4515
  def get_current_cuda_device(self) -> Device:
4279
- current_context = self.core.cuda_context_get_current()
4516
+ current_context = self.core.wp_cuda_context_get_current()
4280
4517
  if current_context is not None:
4281
4518
  current_device = self.context_map.get(current_context)
4282
4519
  if current_device is not None:
4283
4520
  # this is a known device
4284
4521
  return current_device
4285
- elif self.core.cuda_context_is_primary(current_context):
4522
+ elif self.core.wp_cuda_context_is_primary(current_context):
4286
4523
  # this is a primary context that we haven't used yet
4287
- ordinal = self.core.cuda_context_get_device_ordinal(current_context)
4524
+ ordinal = self.core.wp_cuda_context_get_device_ordinal(current_context)
4288
4525
  device = self.cuda_devices[ordinal]
4289
4526
  self.context_map[current_context] = device
4290
4527
  return device
4291
4528
  else:
4292
4529
  # this is an unseen non-primary context, register it as a new device with a unique alias
4293
- ordinal = self.core.cuda_context_get_device_ordinal(current_context)
4530
+ ordinal = self.core.wp_cuda_context_get_device_ordinal(current_context)
4294
4531
  alias = f"cuda:{ordinal}.{self.cuda_custom_context_count[ordinal]}"
4295
4532
  self.cuda_custom_context_count[ordinal] += 1
4296
4533
  return self.map_cuda_device(alias, current_context)
@@ -4313,7 +4550,7 @@ class Runtime:
4313
4550
 
4314
4551
  def map_cuda_device(self, alias, context=None) -> Device:
4315
4552
  if context is None:
4316
- context = self.core.cuda_context_get_current()
4553
+ context = self.core.wp_cuda_context_get_current()
4317
4554
  if context is None:
4318
4555
  raise RuntimeError(f"Unable to determine CUDA context for device alias '{alias}'")
4319
4556
 
@@ -4335,10 +4572,10 @@ class Runtime:
4335
4572
  # it's an unmapped context
4336
4573
 
4337
4574
  # get the device ordinal
4338
- ordinal = self.core.cuda_context_get_device_ordinal(context)
4575
+ ordinal = self.core.wp_cuda_context_get_device_ordinal(context)
4339
4576
 
4340
4577
  # check if this is a primary context (we could get here if it's a device that hasn't been used yet)
4341
- if self.core.cuda_context_is_primary(context):
4578
+ if self.core.wp_cuda_context_is_primary(context):
4342
4579
  # rename the device
4343
4580
  device = self.cuda_primary_devices[ordinal]
4344
4581
  return self.rename_device(device, alias)
@@ -4369,7 +4606,7 @@ class Runtime:
4369
4606
  if not device.is_cuda:
4370
4607
  return
4371
4608
 
4372
- err = self.core.cuda_context_check(device.context)
4609
+ err = self.core.wp_cuda_context_check(device.context)
4373
4610
  if err != 0:
4374
4611
  raise RuntimeError(f"CUDA error detected: {err}")
4375
4612
 
@@ -4401,7 +4638,7 @@ def is_cuda_driver_initialized() -> bool:
     """
     init()
 
-    return runtime.core.cuda_driver_is_initialized()
+    return runtime.core.wp_cuda_driver_is_initialized()
 
 
 def get_devices() -> list[Device]:
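Note: the renamed binding above still backs the public is_cuda_driver_initialized() helper. A hedged usage sketch, assuming the usual `import warp as wp` entry point; the printed strings are illustrative:

    import warp as wp

    wp.init()
    if wp.is_cuda_driver_initialized():
        print("CUDA driver available:", wp.get_cuda_device_count(), "device(s)")
    else:
        # Warp can still run CPU-only when no usable driver is present
        print("running CPU-only")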
@@ -4609,7 +4846,7 @@ def set_mempool_release_threshold(device: Devicelike, threshold: int | float) ->
     elif threshold > 0 and threshold <= 1:
         threshold = int(threshold * device.total_memory)
 
-    if not runtime.core.cuda_device_set_mempool_release_threshold(device.ordinal, threshold):
+    if not runtime.core.wp_cuda_device_set_mempool_release_threshold(device.ordinal, threshold):
         raise RuntimeError(f"Failed to set memory pool release threshold for device {device}")
 
 
@@ -4639,7 +4876,7 @@ def get_mempool_release_threshold(device: Devicelike = None) -> int:
4639
4876
  if not device.is_mempool_supported:
4640
4877
  raise RuntimeError(f"Device {device} does not support memory pools")
4641
4878
 
4642
- return runtime.core.cuda_device_get_mempool_release_threshold(device.ordinal)
4879
+ return runtime.core.wp_cuda_device_get_mempool_release_threshold(device.ordinal)
4643
4880
 
4644
4881
 
4645
4882
  def get_mempool_used_mem_current(device: Devicelike = None) -> int:
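Note: the release-threshold setter shown above accepts either an absolute byte count or a fraction of total device memory, and the getters in the neighbouring hunks report the current threshold and pool usage. A hedged usage sketch; the device name and fraction are illustrative:

    import warp as wp

    wp.init()
    device = wp.get_device("cuda:0")
    if device.is_mempool_supported:
        # keep up to 50% of total device memory cached in the pool between allocations
        wp.set_mempool_release_threshold(device, 0.5)
        print("threshold (bytes):", wp.get_mempool_release_threshold(device))
        print("pool bytes in use:", wp.get_mempool_used_mem_current(device))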
@@ -4668,7 +4905,7 @@ def get_mempool_used_mem_current(device: Devicelike = None) -> int:
4668
4905
  if not device.is_mempool_supported:
4669
4906
  raise RuntimeError(f"Device {device} does not support memory pools")
4670
4907
 
4671
- return runtime.core.cuda_device_get_mempool_used_mem_current(device.ordinal)
4908
+ return runtime.core.wp_cuda_device_get_mempool_used_mem_current(device.ordinal)
4672
4909
 
4673
4910
 
4674
4911
  def get_mempool_used_mem_high(device: Devicelike = None) -> int:
@@ -4697,7 +4934,7 @@ def get_mempool_used_mem_high(device: Devicelike = None) -> int:
4697
4934
  if not device.is_mempool_supported:
4698
4935
  raise RuntimeError(f"Device {device} does not support memory pools")
4699
4936
 
4700
- return runtime.core.cuda_device_get_mempool_used_mem_high(device.ordinal)
4937
+ return runtime.core.wp_cuda_device_get_mempool_used_mem_high(device.ordinal)
4701
4938
 
4702
4939
 
4703
4940
  def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike) -> bool:
@@ -4718,7 +4955,7 @@ def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike)
     if not target_device.is_cuda or not peer_device.is_cuda:
         return False
 
-    return bool(runtime.core.cuda_is_peer_access_supported(target_device.ordinal, peer_device.ordinal))
+    return bool(runtime.core.wp_cuda_is_peer_access_supported(target_device.ordinal, peer_device.ordinal))
 
 
 def is_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike) -> bool:
@@ -4739,7 +4976,7 @@ def is_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike) -
     if not target_device.is_cuda or not peer_device.is_cuda:
         return False
 
-    return bool(runtime.core.cuda_is_peer_access_enabled(target_device.context, peer_device.context))
+    return bool(runtime.core.wp_cuda_is_peer_access_enabled(target_device.context, peer_device.context))
 
 
 def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike, enable: bool) -> None:
@@ -4769,7 +5006,7 @@ def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike,
     else:
         return
 
-    if not runtime.core.cuda_set_peer_access_enabled(target_device.context, peer_device.context, int(enable)):
+    if not runtime.core.wp_cuda_set_peer_access_enabled(target_device.context, peer_device.context, int(enable)):
         action = "enable" if enable else "disable"
         raise RuntimeError(f"Failed to {action} peer access from device {peer_device} to device {target_device}")
 
@@ -4810,7 +5047,7 @@ def is_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike
4810
5047
  if not peer_device.is_cuda or not target_device.is_cuda or not target_device.is_mempool_supported:
4811
5048
  return False
4812
5049
 
4813
- return bool(runtime.core.cuda_is_mempool_access_enabled(target_device.ordinal, peer_device.ordinal))
5050
+ return bool(runtime.core.wp_cuda_is_mempool_access_enabled(target_device.ordinal, peer_device.ordinal))
4814
5051
 
4815
5052
 
4816
5053
  def set_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike, enable: bool) -> None:
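Note: the peer-access and memory-pool-access helpers in the hunks above follow the same check-then-enable pattern. A hedged sketch for a two-GPU machine; the device names are illustrative and enabling may legitimately fail on hardware without peer support:

    import warp as wp

    wp.init()
    a, b = wp.get_device("cuda:0"), wp.get_device("cuda:1")

    # direct peer access between the default allocators of the two devices
    if wp.is_peer_access_supported(a, b) and not wp.is_peer_access_enabled(a, b):
        wp.set_peer_access_enabled(a, b, True)

    # access to device a's memory pool (pooled allocations) from device b
    if a.is_mempool_supported and not wp.is_mempool_access_enabled(a, b):
        wp.set_mempool_access_enabled(a, b, True)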
@@ -4843,7 +5080,7 @@ def set_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelik
4843
5080
  else:
4844
5081
  return
4845
5082
 
4846
- if not runtime.core.cuda_set_mempool_access_enabled(target_device.ordinal, peer_device.ordinal, int(enable)):
5083
+ if not runtime.core.wp_cuda_set_mempool_access_enabled(target_device.ordinal, peer_device.ordinal, int(enable)):
4847
5084
  action = "enable" if enable else "disable"
4848
5085
  raise RuntimeError(f"Failed to {action} memory pool access from device {peer_device} to device {target_device}")
4849
5086
 
@@ -4924,7 +5161,7 @@ def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: bo
     if synchronize:
         synchronize_event(end_event)
 
-    return runtime.core.cuda_event_elapsed_time(start_event.cuda_event, end_event.cuda_event)
+    return runtime.core.wp_cuda_event_elapsed_time(start_event.cuda_event, end_event.cuda_event)
 
 
 def wait_stream(other_stream: Stream, event: Event | None = None):
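Note: get_event_elapsed_time() above takes an optional synchronize flag that waits on the end event before reading the timer. A hedged timing sketch; the array size and the work being timed are placeholders:

    import warp as wp

    wp.init()
    with wp.ScopedDevice("cuda:0"):
        start = wp.Event(enable_timing=True)
        end = wp.Event(enable_timing=True)

        a = wp.zeros(1_000_000, dtype=float)
        wp.record_event(start)
        a.zero_()  # stand-in for real GPU work
        wp.record_event(end)

        # synchronize=True blocks until the end event has completed
        ms = wp.get_event_elapsed_time(start, end, synchronize=True)
        print(f"elapsed: {ms:.3f} ms")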
@@ -5018,7 +5255,7 @@ class RegisteredGLBuffer:
5018
5255
  self.context = self.device.context
5019
5256
  self.flags = flags
5020
5257
  self.fallback_to_copy = fallback_to_copy
5021
- self.resource = runtime.core.cuda_graphics_register_gl_buffer(self.context, gl_buffer_id, flags)
5258
+ self.resource = runtime.core.wp_cuda_graphics_register_gl_buffer(self.context, gl_buffer_id, flags)
5022
5259
  if self.resource is None:
5023
5260
  if self.fallback_to_copy:
5024
5261
  self.warp_buffer = None
@@ -5037,7 +5274,7 @@ class RegisteredGLBuffer:
5037
5274
 
5038
5275
  # use CUDA context guard to avoid side effects during garbage collection
5039
5276
  with self.device.context_guard:
5040
- runtime.core.cuda_graphics_unregister_resource(self.context, self.resource)
5277
+ runtime.core.wp_cuda_graphics_unregister_resource(self.context, self.resource)
5041
5278
 
5042
5279
  def map(self, dtype, shape) -> warp.array:
5043
5280
  """Map the OpenGL buffer to a Warp array.
@@ -5050,10 +5287,10 @@ class RegisteredGLBuffer:
5050
5287
  A Warp array object representing the mapped OpenGL buffer.
5051
5288
  """
5052
5289
  if self.resource is not None:
5053
- runtime.core.cuda_graphics_map(self.context, self.resource)
5290
+ runtime.core.wp_cuda_graphics_map(self.context, self.resource)
5054
5291
  ptr = ctypes.c_uint64(0)
5055
5292
  size = ctypes.c_size_t(0)
5056
- runtime.core.cuda_graphics_device_ptr_and_size(
5293
+ runtime.core.wp_cuda_graphics_device_ptr_and_size(
5057
5294
  self.context, self.resource, ctypes.byref(ptr), ctypes.byref(size)
5058
5295
  )
5059
5296
  return warp.array(ptr=ptr.value, dtype=dtype, shape=shape, device=self.device)
@@ -5078,7 +5315,7 @@ class RegisteredGLBuffer:
5078
5315
  def unmap(self):
5079
5316
  """Unmap the OpenGL buffer."""
5080
5317
  if self.resource is not None:
5081
- runtime.core.cuda_graphics_unmap(self.context, self.resource)
5318
+ runtime.core.wp_cuda_graphics_unmap(self.context, self.resource)
5082
5319
  elif self.fallback_to_copy:
5083
5320
  if self.warp_buffer is None:
5084
5321
  raise RuntimeError("RegisteredGLBuffer first has to be mapped")
@@ -5434,7 +5671,7 @@ def event_from_ipc_handle(handle, device: Devicelike = None) -> Event:
5434
5671
  raise RuntimeError(f"IPC is not supported on device {device}.")
5435
5672
 
5436
5673
  event = Event(
5437
- device=device, cuda_event=warp.context.runtime.core.cuda_ipc_open_event_handle(device.context, handle)
5674
+ device=device, cuda_event=warp.context.runtime.core.wp_cuda_ipc_open_event_handle(device.context, handle)
5438
5675
  )
5439
5676
  # Events created from IPC handles must be freed with cuEventDestroy
5440
5677
  event.owner = True
@@ -5566,6 +5803,44 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
         ) from e
 
 
+# invoke a CPU kernel by passing the parameters as a ctypes structure
+def invoke(kernel, hooks, params: Sequence[Any], adjoint: bool):
+    fields = []
+
+    for i in range(0, len(kernel.adj.args)):
+        arg_name = kernel.adj.args[i].label
+        field = (arg_name, type(params[1 + i]))  # skip the first argument, which is the launch bounds
+        fields.append(field)
+
+    ArgsStruct = type("ArgsStruct", (ctypes.Structure,), {"_fields_": fields})
+
+    args = ArgsStruct()
+    for i, field in enumerate(fields):
+        name = field[0]
+        setattr(args, name, params[1 + i])
+
+    if not adjoint:
+        hooks.forward(params[0], ctypes.byref(args))
+
+    # for adjoint kernels the adjoint arguments are passed through a second struct
+    else:
+        adj_fields = []
+
+        for i in range(0, len(kernel.adj.args)):
+            arg_name = kernel.adj.args[i].label
+            field = (arg_name, type(params[1 + len(fields) + i]))  # skip the first argument, which is the launch bounds
+            adj_fields.append(field)
+
+        AdjArgsStruct = type("AdjArgsStruct", (ctypes.Structure,), {"_fields_": adj_fields})
+
+        adj_args = AdjArgsStruct()
+        for i, field in enumerate(adj_fields):
+            name = field[0]
+            setattr(adj_args, name, params[1 + len(fields) + i])
+
+        hooks.backward(params[0], ctypes.byref(args), ctypes.byref(adj_args))
+
+
 class Launch:
     """Represents all data required for a kernel launch so that launches can be replayed quickly.
 
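Note: the new invoke() helper above marshals CPU kernel arguments through a ctypes.Structure whose fields are built at runtime. A self-contained sketch of that pattern in plain ctypes; the field names and values are made up for illustration:

    import ctypes

    # build a Structure type on the fly from (name, ctype) pairs, as invoke() does
    fields = [("n", ctypes.c_int), ("scale", ctypes.c_float)]
    ArgsStruct = type("ArgsStruct", (ctypes.Structure,), {"_fields_": fields})

    args = ArgsStruct()
    args.n = 16
    args.scale = 0.5

    # a C entry point would then receive a pointer to the packed struct,
    # e.g. forward(bounds, ctypes.byref(args))
    print(ctypes.sizeof(args), args.n, args.scale)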
@@ -5758,24 +6033,21 @@ class Launch:
             stream: The stream to launch on.
         """
         if self.device.is_cpu:
-            if self.adjoint:
-                self.hooks.backward(*self.params)
-            else:
-                self.hooks.forward(*self.params)
+            invoke(self.kernel, self.hooks, self.params, self.adjoint)
         else:
             if stream is None:
                 stream = self.device.stream
 
             # If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
             # before the captured graph is released.
-            if len(runtime.captures) > 0 and runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
-                capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
+            if len(runtime.captures) > 0 and runtime.core.wp_cuda_stream_is_capturing(stream.cuda_stream):
+                capture_id = runtime.core.wp_cuda_stream_get_capture_id(stream.cuda_stream)
                 graph = runtime.captures.get(capture_id)
                 if graph is not None:
                     graph.retain_module_exec(self.module_exec)
 
             if self.adjoint:
-                runtime.core.cuda_launch_kernel(
+                runtime.core.wp_cuda_launch_kernel(
                     self.device.context,
                     self.hooks.backward,
                     self.bounds.size,
@@ -5786,7 +6058,7 @@ class Launch:
5786
6058
  stream.cuda_stream,
5787
6059
  )
5788
6060
  else:
5789
- runtime.core.cuda_launch_kernel(
6061
+ runtime.core.wp_cuda_launch_kernel(
5790
6062
  self.device.context,
5791
6063
  self.hooks.forward,
5792
6064
  self.bounds.size,
@@ -5905,7 +6177,7 @@ def launch(
5905
6177
  # late bind
5906
6178
  hooks = module_exec.get_kernel_hooks(kernel)
5907
6179
 
5908
- pack_args(fwd_args, params)
6180
+ pack_args(fwd_args, params, adjoint=False)
5909
6181
  pack_args(adj_args, params, adjoint=True)
5910
6182
 
5911
6183
  # run kernel
@@ -5916,38 +6188,25 @@ def launch(
5916
6188
  f"Failed to find backward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
5917
6189
  )
5918
6190
 
5919
- if record_cmd:
5920
- launch = Launch(
5921
- kernel=kernel,
5922
- hooks=hooks,
5923
- params=params,
5924
- params_addr=None,
5925
- bounds=bounds,
5926
- device=device,
5927
- adjoint=adjoint,
5928
- )
5929
- return launch
5930
- hooks.backward(*params)
5931
-
5932
6191
  else:
5933
6192
  if hooks.forward is None:
5934
6193
  raise RuntimeError(
5935
6194
  f"Failed to find forward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
5936
6195
  )
5937
6196
 
5938
- if record_cmd:
5939
- launch = Launch(
5940
- kernel=kernel,
5941
- hooks=hooks,
5942
- params=params,
5943
- params_addr=None,
5944
- bounds=bounds,
5945
- device=device,
5946
- adjoint=adjoint,
5947
- )
5948
- return launch
5949
- else:
5950
- hooks.forward(*params)
6197
+ if record_cmd:
6198
+ launch = Launch(
6199
+ kernel=kernel,
6200
+ hooks=hooks,
6201
+ params=params,
6202
+ params_addr=None,
6203
+ bounds=bounds,
6204
+ device=device,
6205
+ adjoint=adjoint,
6206
+ )
6207
+ return launch
6208
+
6209
+ invoke(kernel, hooks, params, adjoint)
5951
6210
 
5952
6211
  else:
5953
6212
  kernel_args = [ctypes.c_void_p(ctypes.addressof(x)) for x in params]
@@ -5958,8 +6217,8 @@ def launch(
5958
6217
 
5959
6218
  # If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
5960
6219
  # before the captured graph is released.
5961
- if len(runtime.captures) > 0 and runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
5962
- capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
6220
+ if len(runtime.captures) > 0 and runtime.core.wp_cuda_stream_is_capturing(stream.cuda_stream):
6221
+ capture_id = runtime.core.wp_cuda_stream_get_capture_id(stream.cuda_stream)
5963
6222
  graph = runtime.captures.get(capture_id)
5964
6223
  if graph is not None:
5965
6224
  graph.retain_module_exec(module_exec)
@@ -5984,7 +6243,7 @@ def launch(
5984
6243
  )
5985
6244
  return launch
5986
6245
  else:
5987
- runtime.core.cuda_launch_kernel(
6246
+ runtime.core.wp_cuda_launch_kernel(
5988
6247
  device.context,
5989
6248
  hooks.backward,
5990
6249
  bounds.size,
@@ -6015,7 +6274,7 @@ def launch(
6015
6274
  return launch
6016
6275
  else:
6017
6276
  # launch
6018
- runtime.core.cuda_launch_kernel(
6277
+ runtime.core.wp_cuda_launch_kernel(
6019
6278
  device.context,
6020
6279
  hooks.forward,
6021
6280
  bounds.size,
@@ -6117,7 +6376,7 @@ def synchronize():
6117
6376
 
6118
6377
  if is_cuda_driver_initialized():
6119
6378
  # save the original context to avoid side effects
6120
- saved_context = runtime.core.cuda_context_get_current()
6379
+ saved_context = runtime.core.wp_cuda_context_get_current()
6121
6380
 
6122
6381
  # TODO: only synchronize devices that have outstanding work
6123
6382
  for device in runtime.cuda_devices:
@@ -6126,10 +6385,10 @@ def synchronize():
6126
6385
  if device.is_capturing:
6127
6386
  raise RuntimeError(f"Cannot synchronize device {device} while graph capture is active")
6128
6387
 
6129
- runtime.core.cuda_context_synchronize(device.context)
6388
+ runtime.core.wp_cuda_context_synchronize(device.context)
6130
6389
 
6131
6390
  # restore the original context to avoid side effects
6132
- runtime.core.cuda_context_set_current(saved_context)
6391
+ runtime.core.wp_cuda_context_set_current(saved_context)
6133
6392
 
6134
6393
 
6135
6394
  def synchronize_device(device: Devicelike = None):
@@ -6147,7 +6406,7 @@ def synchronize_device(device: Devicelike = None):
6147
6406
  if device.is_capturing:
6148
6407
  raise RuntimeError(f"Cannot synchronize device {device} while graph capture is active")
6149
6408
 
6150
- runtime.core.cuda_context_synchronize(device.context)
6409
+ runtime.core.wp_cuda_context_synchronize(device.context)
6151
6410
 
6152
6411
 
6153
6412
  def synchronize_stream(stream_or_device: Stream | Devicelike | None = None):
@@ -6165,7 +6424,7 @@ def synchronize_stream(stream_or_device: Stream | Devicelike | None = None):
6165
6424
  else:
6166
6425
  stream = runtime.get_device(stream_or_device).stream
6167
6426
 
6168
- runtime.core.cuda_stream_synchronize(stream.cuda_stream)
6427
+ runtime.core.wp_cuda_stream_synchronize(stream.cuda_stream)
6169
6428
 
6170
6429
 
6171
6430
  def synchronize_event(event: Event):
@@ -6177,20 +6436,25 @@ def synchronize_event(event: Event):
6177
6436
  event: Event to wait for.
6178
6437
  """
6179
6438
 
6180
- runtime.core.cuda_event_synchronize(event.cuda_event)
6439
+ runtime.core.wp_cuda_event_synchronize(event.cuda_event)
6181
6440
 
6182
6441
 
6183
- def force_load(device: Device | str | list[Device] | list[str] | None = None, modules: list[Module] | None = None):
6442
+ def force_load(
6443
+ device: Device | str | list[Device] | list[str] | None = None,
6444
+ modules: list[Module] | None = None,
6445
+ block_dim: int | None = None,
6446
+ ):
6184
6447
  """Force user-defined kernels to be compiled and loaded
6185
6448
 
6186
6449
  Args:
6187
6450
  device: The device or list of devices to load the modules on. If None, load on all devices.
6188
6451
  modules: List of modules to load. If None, load all imported modules.
6452
+ block_dim: The number of threads per block (always 1 for "cpu" devices).
6189
6453
  """
6190
6454
 
6191
6455
  if is_cuda_driver_initialized():
6192
6456
  # save original context to avoid side effects
6193
- saved_context = runtime.core.cuda_context_get_current()
6457
+ saved_context = runtime.core.wp_cuda_context_get_current()
6194
6458
 
6195
6459
  if device is None:
6196
6460
  devices = get_devices()
@@ -6204,22 +6468,26 @@ def force_load(device: Device | str | list[Device] | list[str] | None = None, mo
6204
6468
 
6205
6469
  for d in devices:
6206
6470
  for m in modules:
6207
- m.load(d)
6471
+ m.load(d, block_dim=block_dim)
6208
6472
 
6209
6473
  if is_cuda_available():
6210
6474
  # restore original context to avoid side effects
6211
- runtime.core.cuda_context_set_current(saved_context)
6475
+ runtime.core.wp_cuda_context_set_current(saved_context)
6212
6476
 
6213
6477
 
6214
6478
  def load_module(
6215
- module: Module | types.ModuleType | str | None = None, device: Device | str | None = None, recursive: bool = False
6479
+ module: Module | types.ModuleType | str | None = None,
6480
+ device: Device | str | None = None,
6481
+ recursive: bool = False,
6482
+ block_dim: int | None = None,
6216
6483
  ):
6217
- """Force user-defined module to be compiled and loaded
6484
+ """Force a user-defined module to be compiled and loaded
6218
6485
 
6219
6486
  Args:
6220
6487
  module: The module to load. If None, load the current module.
6221
6488
  device: The device to load the modules on. If None, load on all devices.
6222
6489
  recursive: Whether to load submodules. E.g., if the given module is `warp.sim`, this will also load `warp.sim.model`, `warp.sim.articulation`, etc.
6490
+ block_dim: The number of threads per block (always 1 for "cpu" devices).
6223
6491
 
6224
6492
  Note: A module must be imported before it can be loaded by this function.
6225
6493
  """
@@ -6240,9 +6508,13 @@ def load_module(
6240
6508
  modules = []
6241
6509
 
6242
6510
  # add the given module, if found
6243
- m = user_modules.get(module_name)
6244
- if m is not None:
6245
- modules.append(m)
6511
+ if isinstance(module, Module):
6512
+ # this ensures that we can load "unique" or procedural modules, which aren't added to `user_modules` by name
6513
+ modules.append(module)
6514
+ else:
6515
+ m = user_modules.get(module_name)
6516
+ if m is not None:
6517
+ modules.append(m)
6246
6518
 
6247
6519
  # add submodules, if recursive
6248
6520
  if recursive:
@@ -6251,7 +6523,203 @@ def load_module(
6251
6523
  if name.startswith(prefix):
6252
6524
  modules.append(mod)
6253
6525
 
6254
- force_load(device=device, modules=modules)
6526
+ force_load(device=device, modules=modules, block_dim=block_dim)
6527
+
6528
+
6529
+ def _resolve_module(module: Module | types.ModuleType | str) -> Module:
6530
+ """Resolve a module from a string, Module, or types.ModuleType.
6531
+
6532
+ Args:
6533
+ module: The module to resolve.
6534
+
6535
+ Returns:
6536
+ The resolved module.
6537
+
6538
+ Raises:
6539
+ TypeError: If the module argument is not a Module, a types.ModuleType, or a string.
6540
+ """
6541
+
6542
+ if isinstance(module, str):
6543
+ module_object = get_module(module)
6544
+ elif isinstance(module, Module):
6545
+ module_object = module
6546
+ elif isinstance(module, types.ModuleType):
6547
+ module_object = get_module(module.__name__)
6548
+ else:
6549
+ raise TypeError(f"Argument 'module' must be a Module or a string, got {type(module)}")
6550
+
6551
+ return module_object
6552
+
6553
+
6554
+ def compile_aot_module(
6555
+ module: Module | types.ModuleType | str,
6556
+ device: Device | str | list[Device] | list[str] | None = None,
6557
+ arch: int | Iterable[int] | None = None,
6558
+ module_dir: str | os.PathLike | None = None,
6559
+ use_ptx: bool | None = None,
6560
+ strip_hash: bool | None = None,
6561
+ ) -> None:
6562
+ """Compile a module (ahead of time) for a given device.
6563
+
6564
+ Args:
6565
+ module: The module to compile.
6566
+ device: The device or devices to compile the module for. If ``None``,
6567
+ and ``arch`` is not specified, compile the module for the current device.
6568
+ arch: The architecture or architectures to compile the module for. If ``None``,
6569
+ the architecture to compile for will be inferred from the current device.
6570
+ module_dir: The directory to save the source, meta, and compiled files to.
6571
+ If not specified, the module will be compiled to the default cache directory.
6572
+ use_ptx: Whether to compile the module to PTX. This setting is only used
6573
+ when compiling modules for the GPU. If ``None``, Warp will decide an
6574
+ appropriate setting based on the runtime environment.
6575
+ strip_hash: Whether to strip the hash from the module and kernel names.
6576
+ Setting this value to ``True`` or ``False`` will update the module's
6577
+ ``"strip_hash"`` option. If left at ``None``, the current value will
6578
+ be used.
6579
+
6580
+ Warning: Do not enable ``strip_hash`` for modules that contain generic
6581
+ kernels. Generic kernels compile to multiple overloads, and the
6582
+ per-overload hash is required to distinguish them. Stripping the hash
6583
+ in this case will cause the module to fail to compile.
6584
+
6585
+ Raises:
6586
+ TypeError: If the module argument is not a Module, a types.ModuleType, or a string.
6587
+ """
6588
+
6589
+ if is_cuda_driver_initialized():
6590
+ # save original context to avoid side effects
6591
+ saved_context = runtime.core.wp_cuda_context_get_current()
6592
+
6593
+ module_object = _resolve_module(module)
6594
+
6595
+ if strip_hash is not None:
6596
+ module_object.options["strip_hash"] = strip_hash
6597
+
6598
+ if device is None and arch:
6599
+ # User provided no device, but an arch, so we will not compile for the default device
6600
+ devices = []
6601
+ elif isinstance(device, list):
6602
+ devices = [get_device(device_item) for device_item in device]
6603
+ else:
6604
+ devices = [get_device(device)]
6605
+
6606
+ for d in devices:
6607
+ module_object.compile(d, module_dir, use_ptx=use_ptx)
6608
+
6609
+ if arch:
6610
+ if isinstance(arch, str) or not hasattr(arch, "__iter__"):
6611
+ arch = [arch]
6612
+
6613
+ for arch_value in arch:
6614
+ module_object.compile(None, module_dir, output_arch=arch_value, use_ptx=use_ptx)
6615
+
6616
+ if is_cuda_available():
6617
+ # restore original context to avoid side effects
6618
+ runtime.core.wp_cuda_context_set_current(saved_context)
6619
+
6620
+
6621
+ def load_aot_module(
6622
+ module: Module | types.ModuleType | str,
6623
+ device: Device | str | list[Device] | list[str] | None = None,
6624
+ arch: int | None = None,
6625
+ module_dir: str | os.PathLike | None = None,
6626
+ use_ptx: bool | None = None,
6627
+ strip_hash: bool = False,
6628
+ ) -> None:
6629
+ """Load a previously compiled module (ahead of time).
6630
+
6631
+ Args:
6632
+ module: The module to load.
6633
+ device: The device or devices to load the module on. If ``None``,
6634
+ load the module for the current device.
6635
+ arch: The architecture to load the module for on all devices.
6636
+ If ``None``, the architecture to load for will be inferred from the
6637
+ current device.
6638
+ module_dir: The directory to load the module from.
6639
+ If not specified, the module will be loaded from the default cache directory.
6640
+ use_ptx: Whether to load the module from PTX. This setting is only used
6641
+ when loading modules for the GPU. If ``None`` on a CUDA device, Warp will
6642
+ try both PTX and CUBIN (PTX first) and load the first that exists.
6643
+ If neither exists, a ``FileNotFoundError`` is raised listing all
6644
+ attempted paths.
6645
+ strip_hash: Whether to strip the hash from the module and kernel names.
6646
+ Setting this value to ``True`` or ``False`` will update the module's
6647
+ ``"strip_hash"`` option. If left at ``None``, the current value will
6648
+ be used.
6649
+
6650
+ Warning: Do not enable ``strip_hash`` for modules that contain generic
6651
+ kernels. Generic kernels compile to multiple overloads, and the
6652
+ per-overload hash is required to distinguish them. Stripping the hash
6653
+ in this case will cause the module to fail to compile.
6654
+
6655
+ Raises:
6656
+ FileNotFoundError: If no matching binary is found. When ``use_ptx`` is
6657
+ ``None`` on a CUDA device, both PTX and CUBIN candidates are tried
6658
+ before raising.
6659
+ TypeError: If the module argument is not a Module, a types.ModuleType, or a string.
6660
+ """
6661
+
6662
+ if is_cuda_driver_initialized():
6663
+ # save original context to avoid side effects
6664
+ saved_context = runtime.core.wp_cuda_context_get_current()
6665
+
6666
+ if device is None:
6667
+ devices = [runtime.get_device()]
6668
+ elif isinstance(device, list):
6669
+ devices = [get_device(device_item) for device_item in device]
6670
+ else:
6671
+ devices = [get_device(device)]
6672
+
6673
+ module_object = _resolve_module(module)
6674
+
6675
+ if strip_hash is not None:
6676
+ module_object.options["strip_hash"] = strip_hash
6677
+
6678
+ if module_dir is None:
6679
+ module_dir = os.path.join(warp.config.kernel_cache_dir, module_object.get_module_identifier())
6680
+ else:
6681
+ module_dir = os.fspath(module_dir)
6682
+
6683
+ for d in devices:
6684
+ # Identify the files in the cache to load
6685
+ if arch is None:
6686
+ output_arch = module_object.get_compile_arch(d)
6687
+ else:
6688
+ output_arch = arch
6689
+
6690
+ meta_path = os.path.join(module_dir, module_object.get_meta_name())
6691
+
6692
+ # Determine candidate binaries to try
6693
+ tried_paths = []
6694
+ binary_path = None
6695
+ if d.is_cuda and use_ptx is None:
6696
+ candidate_flags = (True, False) # try PTX first, then CUBIN
6697
+ else:
6698
+ candidate_flags = (use_ptx,)
6699
+
6700
+ for candidate_use_ptx in candidate_flags:
6701
+ candidate_path = os.path.join(
6702
+ module_dir, module_object.get_compile_output_name(d, output_arch, candidate_use_ptx)
6703
+ )
6704
+ tried_paths.append(candidate_path)
6705
+ if os.path.exists(candidate_path):
6706
+ binary_path = candidate_path
6707
+ break
6708
+
6709
+ if binary_path is None:
6710
+ raise FileNotFoundError(f"Binary file not found. Tried: {', '.join(tried_paths)}")
6711
+
6712
+ module_object.load(
6713
+ d,
6714
+ block_dim=module_object.options["block_dim"],
6715
+ binary_path=binary_path,
6716
+ output_arch=output_arch,
6717
+ meta_path=meta_path,
6718
+ )
6719
+
6720
+ if is_cuda_available():
6721
+ # restore original context to avoid side effects
6722
+ runtime.core.wp_cuda_context_set_current(saved_context)
6255
6723
 
6256
6724
 
6257
6725
  def set_module_options(options: dict[str, Any], module: Any = None):
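Note: the new ahead-of-time helpers added above (compile_aot_module / load_aot_module) pair a compile step with a later load from the same directory. A hedged sketch, assuming they are exposed at the package level like the other context helpers; the kernel, directory, and arch value are illustrative:

    import warp as wp

    @wp.kernel
    def saxpy(x: wp.array(dtype=float), y: wp.array(dtype=float), a: float):
        i = wp.tid()
        y[i] = a * x[i] + y[i]

    # build step: compile this module's kernels for SM 8.6 into a chosen directory
    wp.compile_aot_module(__name__, arch=86, module_dir="aot_cache", use_ptx=False)

    # deploy step: later, load the prebuilt binary instead of recompiling
    wp.load_aot_module(__name__, device="cuda:0", arch=86, module_dir="aot_cache", use_ptx=False)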
@@ -6381,10 +6849,10 @@ def capture_begin(
6381
6849
  if force_module_load:
6382
6850
  force_load(device)
6383
6851
 
6384
- if not runtime.core.cuda_graph_begin_capture(device.context, stream.cuda_stream, int(external)):
6852
+ if not runtime.core.wp_cuda_graph_begin_capture(device.context, stream.cuda_stream, int(external)):
6385
6853
  raise RuntimeError(runtime.get_error_string())
6386
6854
 
6387
- capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
6855
+ capture_id = runtime.core.wp_cuda_stream_get_capture_id(stream.cuda_stream)
6388
6856
  graph = Graph(device, capture_id)
6389
6857
 
6390
6858
  _register_capture(device, stream, graph, capture_id)
@@ -6419,7 +6887,7 @@ def capture_end(device: Devicelike = None, stream: Stream | None = None) -> Grap
6419
6887
 
6420
6888
  # get the graph executable
6421
6889
  g = ctypes.c_void_p()
6422
- result = runtime.core.cuda_graph_end_capture(device.context, stream.cuda_stream, ctypes.byref(g))
6890
+ result = runtime.core.wp_cuda_graph_end_capture(device.context, stream.cuda_stream, ctypes.byref(g))
6423
6891
 
6424
6892
  if not result:
6425
6893
  # A concrete error should've already been reported, so we don't need to go into details here
@@ -6440,7 +6908,7 @@ def capture_debug_dot_print(graph: Graph, path: str, verbose: bool = False):
6440
6908
  path: Path to save the DOT file
6441
6909
  verbose: Whether to include additional debug information in the output
6442
6910
  """
6443
- if not runtime.core.capture_debug_dot_print(graph.graph, path.encode(), 0 if verbose else 1):
6911
+ if not runtime.core.wp_capture_debug_dot_print(graph.graph, path.encode(), 0 if verbose else 1):
6444
6912
  raise RuntimeError(f"Graph debug dot print error: {runtime.get_error_string()}")
6445
6913
 
6446
6914
 
@@ -6473,7 +6941,7 @@ def capture_pause(device: Devicelike = None, stream: Stream | None = None) -> Gr
6473
6941
  _unregister_capture(device, stream, graph)
6474
6942
 
6475
6943
  g = ctypes.c_void_p()
6476
- if not runtime.core.cuda_graph_pause_capture(device.context, stream.cuda_stream, ctypes.byref(g)):
6944
+ if not runtime.core.wp_cuda_graph_pause_capture(device.context, stream.cuda_stream, ctypes.byref(g)):
6477
6945
  raise RuntimeError(runtime.get_error_string())
6478
6946
 
6479
6947
  graph.graph = g
@@ -6490,10 +6958,10 @@ def capture_resume(graph: Graph, device: Devicelike = None, stream: Stream | Non
6490
6958
  raise RuntimeError("Must be a CUDA device")
6491
6959
  stream = device.stream
6492
6960
 
6493
- if not runtime.core.cuda_graph_resume_capture(device.context, stream.cuda_stream, graph.graph):
6961
+ if not runtime.core.wp_cuda_graph_resume_capture(device.context, stream.cuda_stream, graph.graph):
6494
6962
  raise RuntimeError(runtime.get_error_string())
6495
6963
 
6496
- capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
6964
+ capture_id = runtime.core.wp_cuda_stream_get_capture_id(stream.cuda_stream)
6497
6965
  graph.capture_id = capture_id
6498
6966
 
6499
6967
  _register_capture(device, stream, graph, capture_id)
@@ -6576,15 +7044,13 @@ def capture_if(
6576
7044
 
6577
7045
  return
6578
7046
 
6579
- graph.has_conditional = True
6580
-
6581
7047
  # ensure conditional graph nodes are supported
6582
7048
  assert_conditional_graph_support()
6583
7049
 
6584
7050
  # insert conditional node
6585
7051
  graph_on_true = ctypes.c_void_p()
6586
7052
  graph_on_false = ctypes.c_void_p()
6587
- if not runtime.core.cuda_graph_insert_if_else(
7053
+ if not runtime.core.wp_cuda_graph_insert_if_else(
6588
7054
  device.context,
6589
7055
  stream.cuda_stream,
6590
7056
  ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
@@ -6607,11 +7073,7 @@ def capture_if(
6607
7073
  if isinstance(on_true, Callable):
6608
7074
  on_true(**kwargs)
6609
7075
  elif isinstance(on_true, Graph):
6610
- if on_true.has_conditional:
6611
- raise RuntimeError(
6612
- "The on_true graph contains conditional nodes, which are not allowed in child graphs"
6613
- )
6614
- if not runtime.core.cuda_graph_insert_child_graph(
7076
+ if not runtime.core.wp_cuda_graph_insert_child_graph(
6615
7077
  device.context,
6616
7078
  stream.cuda_stream,
6617
7079
  on_true.graph,
@@ -6621,6 +7083,10 @@ def capture_if(
6621
7083
  raise TypeError("on_true must be a Callable or a Graph")
6622
7084
  capture_pause(stream=stream)
6623
7085
 
7086
+ # check the if-body graph
7087
+ if not runtime.core.wp_cuda_graph_check_conditional_body(graph_on_true):
7088
+ raise RuntimeError(runtime.get_error_string())
7089
+
6624
7090
  # capture else-graph
6625
7091
  if on_false is not None:
6626
7092
  # temporarily repurpose the main_graph python object such that all dependencies
@@ -6630,11 +7096,7 @@ def capture_if(
6630
7096
  if isinstance(on_false, Callable):
6631
7097
  on_false(**kwargs)
6632
7098
  elif isinstance(on_false, Graph):
6633
- if on_false.has_conditional:
6634
- raise RuntimeError(
6635
- "The on_false graph contains conditional nodes, which are not allowed in child graphs"
6636
- )
6637
- if not runtime.core.cuda_graph_insert_child_graph(
7099
+ if not runtime.core.wp_cuda_graph_insert_child_graph(
6638
7100
  device.context,
6639
7101
  stream.cuda_stream,
6640
7102
  on_false.graph,
@@ -6644,6 +7106,10 @@ def capture_if(
6644
7106
  raise TypeError("on_false must be a Callable or a Graph")
6645
7107
  capture_pause(stream=stream)
6646
7108
 
7109
+ # check the else-body graph
7110
+ if not runtime.core.wp_cuda_graph_check_conditional_body(graph_on_false):
7111
+ raise RuntimeError(runtime.get_error_string())
7112
+
6647
7113
  # restore the main graph to its original state
6648
7114
  main_graph.graph = main_graph_ptr
6649
7115
 
@@ -6710,15 +7176,13 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph
6710
7176
 
6711
7177
  return
6712
7178
 
6713
- graph.has_conditional = True
6714
-
6715
7179
  # ensure conditional graph nodes are supported
6716
7180
  assert_conditional_graph_support()
6717
7181
 
6718
7182
  # insert conditional while-node
6719
7183
  body_graph = ctypes.c_void_p()
6720
7184
  cond_handle = ctypes.c_uint64()
6721
- if not runtime.core.cuda_graph_insert_while(
7185
+ if not runtime.core.wp_cuda_graph_insert_while(
6722
7186
  device.context,
6723
7187
  stream.cuda_stream,
6724
7188
  ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
@@ -6741,20 +7205,17 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph
     if isinstance(while_body, Callable):
         while_body(**kwargs)
     elif isinstance(while_body, Graph):
-        if while_body.has_conditional:
-            raise RuntimeError("The body graph contains conditional nodes, which are not allowed in child graphs")
-
-        if not runtime.core.cuda_graph_insert_child_graph(
+        if not runtime.core.wp_cuda_graph_insert_child_graph(
             device.context,
             stream.cuda_stream,
             while_body.graph,
         ):
             raise RuntimeError(runtime.get_error_string())
     else:
-        raise RuntimeError(runtime.get_error_string())
+        raise TypeError("while_body must be a callable or a graph")
 
     # update condition
-    if not runtime.core.cuda_graph_set_condition(
+    if not runtime.core.wp_cuda_graph_set_condition(
         device.context,
         stream.cuda_stream,
         ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
@@ -6762,8 +7223,13 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph
     ):
         raise RuntimeError(runtime.get_error_string())
 
-    # stop capturing child graph and resume capturing parent graph
+    # stop capturing while-body
     capture_pause(stream=stream)
+
+    # check the while-body graph
+    if not runtime.core.wp_cuda_graph_check_conditional_body(body_graph):
+        raise RuntimeError(runtime.get_error_string())
+
     # restore the main graph to its original state
     main_graph.graph = main_graph_ptr
     capture_resume(main_graph, stream=stream)
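Note: capture_while() above now validates the captured body via wp_cuda_graph_check_conditional_body and raises a TypeError for unsupported body types. A hedged sketch of capturing a conditional loop (requires a driver with conditional graph node support); the kernel, sizes, and stopping rule are illustrative:

    import warp as wp

    @wp.kernel
    def step(x: wp.array(dtype=float), cond: wp.array(dtype=int)):
        i = wp.tid()
        x[i] = x[i] * 0.5
        if i == 0:
            if x[0] < 1.0:
                cond[0] = 0  # stop looping once x[0] drops below 1.0

    with wp.ScopedDevice("cuda:0"):
        x = wp.full(16, 64.0, dtype=float)
        cond = wp.ones(1, dtype=int)

        def body():
            wp.launch(step, dim=16, inputs=[x, cond])

        with wp.ScopedCapture() as capture:
            wp.capture_while(cond, body)

        wp.capture_launch(capture.graph)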
@@ -6787,14 +7253,14 @@ def capture_launch(graph: Graph, stream: Stream | None = None):
6787
7253
 
6788
7254
  if graph.graph_exec is None:
6789
7255
  g = ctypes.c_void_p()
6790
- result = runtime.core.cuda_graph_create_exec(
7256
+ result = runtime.core.wp_cuda_graph_create_exec(
6791
7257
  graph.device.context, stream.cuda_stream, graph.graph, ctypes.byref(g)
6792
7258
  )
6793
7259
  if not result:
6794
7260
  raise RuntimeError(f"Graph creation error: {runtime.get_error_string()}")
6795
7261
  graph.graph_exec = g
6796
7262
 
6797
- if not runtime.core.cuda_graph_launch(graph.graph_exec, stream.cuda_stream):
7263
+ if not runtime.core.wp_cuda_graph_launch(graph.graph_exec, stream.cuda_stream):
6798
7264
  raise RuntimeError(f"Graph launch error: {runtime.get_error_string()}")
6799
7265
 
6800
7266
 
@@ -6905,24 +7371,24 @@ def copy(
6905
7371
  if dest.device.is_cuda:
6906
7372
  if src.device.is_cuda:
6907
7373
  if src.device == dest.device:
6908
- result = runtime.core.memcpy_d2d(
7374
+ result = runtime.core.wp_memcpy_d2d(
6909
7375
  dest.device.context, dst_ptr, src_ptr, bytes_to_copy, stream.cuda_stream
6910
7376
  )
6911
7377
  else:
6912
- result = runtime.core.memcpy_p2p(
7378
+ result = runtime.core.wp_memcpy_p2p(
6913
7379
  dest.device.context, dst_ptr, src.device.context, src_ptr, bytes_to_copy, stream.cuda_stream
6914
7380
  )
6915
7381
  else:
6916
- result = runtime.core.memcpy_h2d(
7382
+ result = runtime.core.wp_memcpy_h2d(
6917
7383
  dest.device.context, dst_ptr, src_ptr, bytes_to_copy, stream.cuda_stream
6918
7384
  )
6919
7385
  else:
6920
7386
  if src.device.is_cuda:
6921
- result = runtime.core.memcpy_d2h(
7387
+ result = runtime.core.wp_memcpy_d2h(
6922
7388
  src.device.context, dst_ptr, src_ptr, bytes_to_copy, stream.cuda_stream
6923
7389
  )
6924
7390
  else:
6925
- result = runtime.core.memcpy_h2h(dst_ptr, src_ptr, bytes_to_copy)
7391
+ result = runtime.core.wp_memcpy_h2h(dst_ptr, src_ptr, bytes_to_copy)
6926
7392
 
6927
7393
  if not result:
6928
7394
  raise RuntimeError(f"Warp copy error: {runtime.get_error_string()}")
@@ -6957,17 +7423,17 @@ def copy(
6957
7423
  # This work involves a kernel launch, so it must run on the destination device.
6958
7424
  # If the copy stream is different, we need to synchronize it.
6959
7425
  if stream == dest.device.stream:
6960
- result = runtime.core.array_copy_device(
7426
+ result = runtime.core.wp_array_copy_device(
6961
7427
  dest.device.context, dst_ptr, src_ptr, dst_type, src_type, src_elem_size
6962
7428
  )
6963
7429
  else:
6964
7430
  dest.device.stream.wait_stream(stream)
6965
- result = runtime.core.array_copy_device(
7431
+ result = runtime.core.wp_array_copy_device(
6966
7432
  dest.device.context, dst_ptr, src_ptr, dst_type, src_type, src_elem_size
6967
7433
  )
6968
7434
  stream.wait_stream(dest.device.stream)
6969
7435
  else:
6970
- result = runtime.core.array_copy_host(dst_ptr, src_ptr, dst_type, src_type, src_elem_size)
7436
+ result = runtime.core.wp_array_copy_host(dst_ptr, src_ptr, dst_type, src_type, src_elem_size)
6971
7437
 
6972
7438
  if not result:
6973
7439
  raise RuntimeError(f"Warp copy error: {runtime.get_error_string()}")
@@ -7272,7 +7738,6 @@ def export_stubs(file): # pragma: no cover
7272
7738
  """,
7273
7739
  file=file,
7274
7740
  )
7275
-
7276
7741
  print(
7277
7742
  "# Autogenerated file, do not edit, this file provides stubs for builtins autocomplete in VSCode, PyCharm, etc",
7278
7743
  file=file,