warp-lang 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.0__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +282 -103
- warp/__init__.pyi +482 -110
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +47 -67
- warp/builtins.py +955 -137
- warp/codegen.py +312 -206
- warp/config.py +1 -1
- warp/context.py +1249 -784
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +264 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +129 -51
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +1 -1
- warp/jax_experimental/ffi.py +2 -1
- warp/marching_cubes.py +708 -0
- warp/native/array.h +99 -4
- warp/native/builtin.h +82 -5
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +8 -2
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +41 -10
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +1910 -116
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +4 -2
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +331 -14
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +22 -22
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +13 -13
- warp/native/spatial.h +366 -17
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +283 -69
- warp/native/vec.h +381 -14
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +323 -192
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +85 -6
- warp/sim/graph_coloring.py +2 -2
- warp/sparse.py +558 -175
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/sim/test_coloring.py +6 -6
- warp/tests/test_array.py +56 -5
- warp/tests/test_codegen.py +3 -2
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +45 -2
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +1 -1
- warp/tests/test_mat.py +1518 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +140 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +71 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_types.py +0 -20
- warp/tests/test_vec.py +179 -34
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/tile/test_tile.py +184 -18
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_shared_memory.py +5 -5
- warp/tests/unittest_suites.py +6 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +554 -264
- warp/utils.py +68 -86
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/RECORD +131 -121
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/context.py
CHANGED
@@ -26,13 +26,28 @@ import json
 import operator
 import os
 import platform
+import shutil
 import sys
 import types
 import typing
 import weakref
 from copy import copy as shallowcopy
 from pathlib import Path
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Mapping,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+    get_args,
+    get_origin,
+)

 import numpy as np

@@ -327,39 +342,25 @@ class Function:
         warp.codegen.apply_defaults(bound_args, self.defaults)

         arguments = tuple(bound_args.arguments.values())
-
-        # Store the last runtime error we encountered from a function execution
-        last_execution_error = None
+        arg_types = tuple(warp.codegen.get_arg_type(x) for x in arguments)

         # try and find a matching overload
         for overload in self.user_overloads.values():
             if len(overload.input_types) != len(arguments):
                 continue
+
+            if not warp.codegen.func_match_args(overload, arg_types, {}):
+                continue
+
             template_types = list(overload.input_types.values())
             arg_names = list(overload.input_types.keys())
-            try:
-                # attempt to unify argument types with function template types
-                warp.types.infer_argument_types(arguments, template_types, arg_names)
-                return overload.func(*arguments)
-            except Exception as e:
-                # The function was callable but threw an error during its execution.
-                # This might be the intended overload, but it failed, or it might be the wrong overload.
-                # We save this specific error and continue, just in case another overload later in the
-                # list is a better match and doesn't fail.
-                last_execution_error = e
-                continue

-
-
-
-
-
-
-                    f"See above for the error from the last version that was tried."
-                ) from last_execution_error
-        else:
-            # We got here without ever calling an overload.func
-            raise RuntimeError(f"Error calling function '{self.key}', no overload found for arguments {args}")
+            # attempt to unify argument types with function template types
+            warp.types.infer_argument_types(arguments, template_types, arg_names)
+            return overload.func(*arguments)
+
+        # We got here without ever calling an overload.func
+        raise RuntimeError(f"Error calling function '{self.key}', no overload found for arguments {args}")

         # user-defined function with no overloads
         if self.func is None:
@@ -385,7 +386,7 @@ class Function:
     def mangle(self) -> str:
         """Build a mangled name for the C-exported function, e.g.: `builtin_normalize_vec3()`."""

-        name = "
+        name = "wp_builtin_" + self.key

         # Runtime arguments that are to be passed to the function, not its template signature.
         if self.export_func is not None:
@@ -475,6 +476,25 @@ class Function:
         # failed to find overload
         return None

+    def build(self, builder: ModuleBuilder | None):
+        self.adj.build(builder)
+
+        # complete the function return type after we have analyzed it (inferred from return statement in ast)
+        if not self.value_func:
+
+            def wrap(adj):
+                def value_type(arg_types, arg_values):
+                    if adj.return_var is None or len(adj.return_var) == 0:
+                        return None
+                    if len(adj.return_var) == 1:
+                        return adj.return_var[0].type
+                    else:
+                        return [v.type for v in adj.return_var]
+
+                return value_type
+
+            self.value_func = wrap(self.adj)
+
     def __repr__(self):
         inputs_str = ", ".join([f"{k}: {warp.types.type_repr(v)}" for k, v in self.input_types.items()])
         return f"<Function {self.key}({inputs_str})>"
@@ -807,14 +827,17 @@ class Kernel:
         sig = warp.types.get_signature(arg_types, func_name=self.key)
         return self.overloads.get(sig)

-    def get_mangled_name(self):
-        if self.
-
+    def get_mangled_name(self) -> str:
+        if self.module.options["strip_hash"]:
+            return self.key
+        else:
+            if self.hash is None:
+                raise RuntimeError(f"Missing hash for kernel {self.key} in module {self.module.name}")

-
-
+            # TODO: allow customizing the number of hash characters used
+            hash_suffix = self.hash.hex()[:8]

-
+            return f"{self.key}_{hash_suffix}"

     def __call__(self, *args, **kwargs):
         # we implement this function only to ensure Kernel is a callable object
@@ -1597,6 +1620,9 @@ class ModuleHasher:
         # line directives, e.g. for Nsight Compute
         ch.update(bytes(ctypes.c_int(warp.config.line_directives)))

+        # whether to use `assign_copy` instead of `assign_inplace`
+        ch.update(bytes(ctypes.c_int(warp.config.enable_vector_component_overwrites)))
+
         # build config
         ch.update(bytes(warp.config.mode, "utf-8"))

@@ -1784,6 +1810,9 @@ class ModuleBuilder:
             self.structs[struct] = None

     def build_kernel(self, kernel):
+        if kernel.options.get("enable_backward", True):
+            kernel.adj.used_by_backward_kernel = True
+
         kernel.adj.build(self)

         if kernel.adj.return_var is not None:
@@ -1794,23 +1823,7 @@ class ModuleBuilder:
         if func in self.functions:
             return
         else:
-            func.
-
-            # complete the function return type after we have analyzed it (inferred from return statement in ast)
-            if not func.value_func:
-
-                def wrap(adj):
-                    def value_type(arg_types, arg_values):
-                        if adj.return_var is None or len(adj.return_var) == 0:
-                            return None
-                        if len(adj.return_var) == 1:
-                            return adj.return_var[0].type
-                        else:
-                            return [v.type for v in adj.return_var]
-
-                    return value_type
-
-                func.value_func = wrap(func.adj)
+            func.build(self)

             # use dict to preserve import order
             self.functions[func] = None
@@ -1830,10 +1843,11 @@ class ModuleBuilder:
         source = ""

         # code-gen LTO forward declarations
-
-
-
-
+        if len(self.ltoirs_decl) > 0:
+            source += 'extern "C" {\n'
+            for fwd in self.ltoirs_decl.values():
+                source += fwd + "\n"
+            source += "}\n"

         # code-gen structs
         visited_structs = set()
@@ -1898,9 +1912,9 @@ class ModuleExec:
         if self.device.is_cuda:
             # use CUDA context guard to avoid side effects during garbage collection
             with self.device.context_guard:
-                runtime.core.
+                runtime.core.wp_cuda_unload_module(self.device.context, self.handle)
         else:
-            runtime.llvm.
+            runtime.llvm.wp_unload_obj(self.handle.encode("utf-8"))

     # lookup and cache kernel entry points
     def get_kernel_hooks(self, kernel) -> KernelHooks:
@@ -1918,13 +1932,13 @@ class ModuleExec:

         if self.device.is_cuda:
             forward_name = name + "_cuda_kernel_forward"
-            forward_kernel = runtime.core.
+            forward_kernel = runtime.core.wp_cuda_get_kernel(
                 self.device.context, self.handle, forward_name.encode("utf-8")
             )

             if options["enable_backward"]:
                 backward_name = name + "_cuda_kernel_backward"
-                backward_kernel = runtime.core.
+                backward_kernel = runtime.core.wp_cuda_get_kernel(
                     self.device.context, self.handle, backward_name.encode("utf-8")
                 )
             else:
@@ -1935,14 +1949,14 @@ class ModuleExec:
             backward_smem_bytes = self.meta[backward_name + "_smem_bytes"] if options["enable_backward"] else 0

             # configure kernels maximum shared memory size
-            max_smem_bytes = runtime.core.
+            max_smem_bytes = runtime.core.wp_cuda_get_max_shared_memory(self.device.context)

-            if not runtime.core.
+            if not runtime.core.wp_cuda_configure_kernel_shared_memory(forward_kernel, forward_smem_bytes):
                 print(
                     f"Warning: Failed to configure kernel dynamic shared memory for this device, tried to configure {forward_name} kernel for {forward_smem_bytes} bytes, but maximum available is {max_smem_bytes}"
                 )

-            if options["enable_backward"] and not runtime.core.
+            if options["enable_backward"] and not runtime.core.wp_cuda_configure_kernel_shared_memory(
                 backward_kernel, backward_smem_bytes
             ):
                 print(
@@ -1954,12 +1968,13 @@ class ModuleExec:
         else:
             func = ctypes.CFUNCTYPE(None)
             forward = (
-                func(runtime.llvm.
+                func(runtime.llvm.wp_lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8")))
+                or None
             )

             if options["enable_backward"]:
                 backward = (
-                    func(runtime.llvm.
+                    func(runtime.llvm.wp_lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8")))
                     or None
                 )
             else:
@@ -1971,6 +1986,25 @@ class ModuleExec:
         return hooks


+def _check_and_raise_long_path_error(e: FileNotFoundError):
+    """Check if the error is due to a Windows long path and provide work-around instructions if it is.
+
+    ``FileNotFoundError.filename`` may legitimately be ``None`` when the originating
+    API does not supply a path. Guard against that to avoid masking the original
+    error with a ``TypeError``.
+    """
+    filename = getattr(e, "filename", None)
+
+    # Fast-exit when this is clearly not a legacy-path limitation:
+    if filename is None or len(filename) < 260 or os.name != "nt" or filename.startswith("\\\\?\\"):
+        raise e
+
+    raise RuntimeError(
+        f"File path '{e.filename}' exceeds 259 characters, long-path support is required for this operation. "
+        "See https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation for more information."
+    ) from e
+
+
 # -----------------------------------------------------
 # stores all functions and kernels for a Python module
 # creates a hash of the function to use for checking
@@ -2024,6 +2058,7 @@ class Module:
             "mode": None,
             "block_dim": 256,
             "compile_time_trace": warp.config.compile_time_trace,
+            "strip_hash": False,
         }

         # Module dependencies are determined by scanning each function
@@ -2170,20 +2205,23 @@ class Module:
             if isinstance(arg.type, warp.codegen.Struct) and arg.type.module is not None:
                 add_ref(arg.type.module)

-    def hash_module(self):
+    def hash_module(self) -> bytes:
+        """Get the hash of the module for the current block_dim.
+
+        This function always creates a new `ModuleHasher` instance and computes the hash.
+        """
        # compute latest hash
        block_dim = self.options["block_dim"]
        self.hashers[block_dim] = ModuleHasher(self)
        return self.hashers[block_dim].get_module_hash()

-    def
-
-
-        # update module options if launching with a new block dim
-        if block_dim is not None:
-            self.options["block_dim"] = block_dim
+    def get_module_hash(self, block_dim: int | None = None) -> bytes:
+        """Get the hash of the module for the current block_dim.

-
+        If a hash has not been computed for the current block_dim, it will be computed and cached.
+        """
+        if block_dim is None:
+            block_dim = self.options["block_dim"]

         if self.has_unresolved_static_expressions:
             # The module hash currently does not account for unresolved static expressions
@@ -2200,210 +2238,386 @@ class Module:
             self.has_unresolved_static_expressions = False

         # compute the hash if needed
-        if
-            self.hashers[
+        if block_dim not in self.hashers:
+            self.hashers[block_dim] = ModuleHasher(self)

-
-        exec = self.execs.get((device.context, active_block_dim))
-        if exec is not None:
-            if exec.module_hash == self.hashers[active_block_dim].get_module_hash():
-                return exec
+        return self.hashers[block_dim].get_module_hash()

-
-
+    def _use_ptx(self, device) -> bool:
+        # determine whether to use PTX or CUBIN
+        if device.is_cubin_supported:
+            # get user preference specified either per module or globally
+            preferred_cuda_output = self.options.get("cuda_output") or warp.config.cuda_output
+            if preferred_cuda_output is not None:
+                use_ptx = preferred_cuda_output == "ptx"
+            else:
+                # determine automatically: older drivers may not be able to handle PTX generated using newer
+                # CUDA Toolkits, in which case we fall back on generating CUBIN modules
+                use_ptx = runtime.driver_version >= runtime.toolkit_version
+        else:
+            # CUBIN not an option, must use PTX (e.g. CUDA Toolkit too old)
+            use_ptx = True
+
+        return use_ptx
+
+    def get_module_identifier(self) -> str:
+        """Get an abbreviated module name to use for directories and files in the cache.
+
+        Depending on the setting of the ``"strip_hash"`` option for this module,
+        the module identifier might include a content-dependent hash as a suffix.
+        """
+        if self.options["strip_hash"]:
+            module_name_short = f"wp_{self.name}"
+        else:
+            module_hash = self.get_module_hash()
+            module_name_short = f"wp_{self.name}_{module_hash.hex()[:7]}"
+
+        return module_name_short
+
+    def get_compile_arch(self, device: Device | None = None) -> int | None:
+        if device is None:
+            device = runtime.get_device()
+
+        if device.is_cpu:
             return None

-
-
+        if self._use_ptx(device):
+            # use the default PTX arch if the device supports it
+            if warp.config.ptx_target_arch is not None:
+                output_arch = min(device.arch, warp.config.ptx_target_arch)
+            else:
+                output_arch = min(device.arch, runtime.default_ptx_arch)
+        else:
+            output_arch = device.arch

-
-        module_name_short = f"{module_name}_{module_hash.hex()[:7]}"
-        module_dir = os.path.join(warp.config.kernel_cache_dir, module_name_short)
+        return output_arch

-
-
-
-
-        # determine output paths
-        if device.is_cpu:
-            output_name = f"{module_name_short}.o"
-            output_arch = None
+    def get_compile_output_name(
+        self, device: Device | None, output_arch: int | None = None, use_ptx: bool | None = None
+    ) -> str:
+        """Get the filename to use for the compiled module binary.

-
-
-
-
-            preferred_cuda_output = self.options.get("cuda_output") or warp.config.cuda_output
-            if preferred_cuda_output is not None:
-                use_ptx = preferred_cuda_output == "ptx"
-            else:
-                # determine automatically: older drivers may not be able to handle PTX generated using newer
-                # CUDA Toolkits, in which case we fall back on generating CUBIN modules
-                use_ptx = runtime.driver_version >= runtime.toolkit_version
-        else:
-            # CUBIN not an option, must use PTX (e.g. CUDA Toolkit too old)
-            use_ptx = True
+        This is only the filename, e.g. ``wp___main___0340cd1.sm86.ptx``.
+        It should be used to form a path.
+        """
+        module_name_short = self.get_module_identifier()

-
-
-
-
-
-
-
-
-
-
+        if device and device.is_cpu:
+            return f"{module_name_short}.o"
+
+        # For CUDA compilation, we must have an architecture.
+        final_arch = output_arch
+        if final_arch is None:
+            if device:
+                # Infer the architecture from the device
+                final_arch = self.get_compile_arch(device)
+            else:
+                raise ValueError(
+                    "Either 'device' or 'output_arch' must be provided to determine compilation architecture"
+                )
+
+        # Determine if we should compile to PTX or CUBIN
+        if use_ptx is None:
+            if device:
+                use_ptx = self._use_ptx(device)
+            else:
+                init()
+                use_ptx = final_arch not in runtime.nvrtc_supported_archs
+
+        if use_ptx:
+            output_name = f"{module_name_short}.sm{final_arch}.ptx"
+        else:
+            output_name = f"{module_name_short}.sm{final_arch}.cubin"
+
+        return output_name
+
+    def get_meta_name(self) -> str:
+        """Get the filename to use for the module metadata file.
+
+        This is only the filename. It should be used to form a path.
+        """
+        return f"{self.get_module_identifier()}.meta"
+
+    def compile(
+        self,
+        device: Device | None = None,
+        output_dir: str | os.PathLike | None = None,
+        output_name: str | None = None,
+        output_arch: int | None = None,
+        use_ptx: bool | None = None,
+    ) -> None:
+        """Compile this module for a specific device.
+
+        Note that this function only generates and compiles code. The resulting
+        binary is not loaded into the runtime.
+
+        Args:
+            device: The device to compile the module for.
+            output_dir: The directory to write the compiled module to.
+            output_name: The name of the compiled module binary file.
+            output_arch: The architecture to compile the module for.
+        """
+        if output_arch is None:
+            output_arch = self.get_compile_arch(device)  # Will remain at None if device is CPU
+
+        if output_name is None:
+            output_name = self.get_compile_output_name(device, output_arch, use_ptx)
+
+        builder_options = {
+            **self.options,
+            # Some of the tile codegen, such as cuFFTDx and cuBLASDx, requires knowledge of the target arch
+            "output_arch": output_arch,
+        }
+        builder = ModuleBuilder(
+            self,
+            builder_options,
+            hasher=self.hashers.get(self.options["block_dim"], None),
+        )
+
+        # create a temporary (process unique) dir for build outputs before moving to the binary dir
+        module_name_short = self.get_module_identifier()
+
+        if output_dir is None:
+            output_dir = os.path.join(warp.config.kernel_cache_dir, f"{module_name_short}")
+        else:
+            output_dir = os.fspath(output_dir)
+
+        meta_path = os.path.join(output_dir, self.get_meta_name())
+
+        build_dir = os.path.normpath(output_dir) + f"_p{os.getpid()}"
+
+        # dir may exist from previous attempts / runs / archs
+        Path(build_dir).mkdir(parents=True, exist_ok=True)
+
+        mode = self.options["mode"] if self.options["mode"] is not None else warp.config.mode
+
+        # build CPU
+        if output_arch is None:
+            # build
+            try:
+                source_code_path = os.path.join(build_dir, f"{module_name_short}.cpp")
+
+                # write cpp sources
+                cpp_source = builder.codegen("cpu")
+
+                with open(source_code_path, "w") as cpp_file:
+                    cpp_file.write(cpp_source)
+
+                output_path = os.path.join(build_dir, output_name)
+
+                # build object code
+                with warp.ScopedTimer("Compile x86", active=warp.config.verbose):
+                    warp.build.build_cpu(
+                        output_path,
+                        source_code_path,
+                        mode=mode,
+                        fast_math=self.options["fast_math"],
+                        verify_fp=warp.config.verify_fp,
+                        fuse_fp=self.options["fuse_fp"],
+                    )
+
+            except Exception as e:
+                if isinstance(e, FileNotFoundError):
+                    _check_and_raise_long_path_error(e)
+
+                self.failed_builds.add(None)
+
+                raise (e)
+
+        else:
+            # build
+            try:
+                source_code_path = os.path.join(build_dir, f"{module_name_short}.cu")
+
+                # write cuda sources
+                cu_source = builder.codegen("cuda")
+
+                with open(source_code_path, "w") as cu_file:
+                    cu_file.write(cu_source)
+
+                output_path = os.path.join(build_dir, output_name)
+
+                # generate PTX or CUBIN
+                with warp.ScopedTimer(
+                    f"Compile CUDA (arch={builder_options['output_arch']}, mode={mode}, block_dim={self.options['block_dim']})",
+                    active=warp.config.verbose,
+                ):
+                    warp.build.build_cuda(
+                        source_code_path,
+                        builder_options["output_arch"],
+                        output_path,
+                        config=mode,
+                        verify_fp=warp.config.verify_fp,
+                        fast_math=self.options["fast_math"],
+                        fuse_fp=self.options["fuse_fp"],
+                        lineinfo=self.options["lineinfo"],
+                        compile_time_trace=self.options["compile_time_trace"],
+                        ltoirs=builder.ltoirs.values(),
+                        fatbins=builder.fatbins.values(),
+                    )
+
+            except Exception as e:
+                if isinstance(e, FileNotFoundError):
+                    _check_and_raise_long_path_error(e)
+
+                if device:
+                    self.failed_builds.add(device.context)
+
+                raise (e)
+
+        # ------------------------------------------------------------
+        # build meta data
+
+        meta = builder.build_meta()
+        output_meta_path = os.path.join(build_dir, self.get_meta_name())
+
+        with open(output_meta_path, "w") as meta_file:
+            json.dump(meta, meta_file)
+
+        # -----------------------------------------------------------
+        # update cache

+        # try to move process outputs to cache
+        warp.build.safe_rename(build_dir, output_dir)
+
+        if os.path.exists(output_dir):
             # final object binary path
-            binary_path = os.path.join(
+            binary_path = os.path.join(output_dir, output_name)

-
-
+            if not os.path.exists(binary_path) or self.options["strip_hash"]:
+                # copy our output file to the destination module
+                # this is necessary in case different processes
+                # have different GPU architectures / devices
+                try:
+                    os.rename(output_path, binary_path)
+                except (OSError, FileExistsError):
+                    # another process likely updated the module dir first
+                    pass

-
+            if not os.path.exists(meta_path) or self.options["strip_hash"]:
+                # copy our output file to the destination module
+                # this is necessary in case different processes
+                # have different GPU architectures / devices
+                try:
+                    os.rename(output_meta_path, meta_path)
+                except (OSError, FileExistsError):
+                    # another process likely updated the module dir first
+                    pass

-
-
-
-
-
-
-
-
-
-
-            "output_arch": output_arch,
-        }
-        builder = ModuleBuilder(self, builder_options, hasher=self.hashers[active_block_dim])
-
-        # create a temporary (process unique) dir for build outputs before moving to the binary dir
-        build_dir = os.path.join(
-            warp.config.kernel_cache_dir, f"{module_name}_{module_hash.hex()[:7]}_p{os.getpid()}"
-        )
+            try:
+                final_source_path = os.path.join(output_dir, os.path.basename(source_code_path))
+                if not os.path.exists(final_source_path) or self.options["strip_hash"]:
+                    os.rename(source_code_path, final_source_path)
+            except (OSError, FileExistsError):
+                # another process likely updated the module dir first
+                pass
+            except Exception as e:
+                # We don't need source_code_path to be copied successfully to proceed, so warn and keep running
+                warp.utils.warn(f"Exception when renaming {source_code_path}: {e}")

-
-
+        # clean up build_dir used for this process regardless
+        shutil.rmtree(build_dir, ignore_errors=True)

-
+    def load(
+        self,
+        device,
+        block_dim: int | None = None,
+        binary_path: os.PathLike | None = None,
+        output_arch: int | None = None,
+        meta_path: os.PathLike | None = None,
+    ) -> ModuleExec | None:
+        device = runtime.get_device(device)

-
+        # update module options if launching with a new block dim
+        if block_dim is not None:
+            self.options["block_dim"] = block_dim

-
-        if device.is_cpu:
-            # build
-            try:
-                source_code_path = os.path.join(build_dir, f"{module_name_short}.cpp")
+        active_block_dim = self.options["block_dim"]

-
-
+        # check if executable module is already loaded and not stale
+        exec = self.execs.get((device.context, active_block_dim))
+        if exec is not None:
+            if self.options["strip_hash"] or (exec.module_hash == self.get_module_hash(active_block_dim)):
+                return exec

-
-
+        # quietly avoid repeated build attempts to reduce error spew
+        if device.context in self.failed_builds:
+            return None

-
+        module_hash = self.get_module_hash(active_block_dim)

-
-
-                    warp.build.build_cpu(
-                        output_path,
-                        source_code_path,
-                        mode=mode,
-                        fast_math=self.options["fast_math"],
-                        verify_fp=warp.config.verify_fp,
-                        fuse_fp=self.options["fuse_fp"],
-                    )
+        # use a unique module path using the module short hash
+        module_name_short = self.get_module_identifier()

-
-
-
-
+        module_load_timer_name = (
+            f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'"
+            if self.options["strip_hash"] is False
+            else f"Module {self.name} load on device '{device}'"
+        )

-
-
-            try:
-                source_code_path = os.path.join(build_dir, f"{module_name_short}.cu")
-
-                # write cuda sources
-                cu_source = builder.codegen("cuda")
-
-                with open(source_code_path, "w") as cu_file:
-                    cu_file.write(cu_source)
-
-                output_path = os.path.join(build_dir, output_name)
-
-                # generate PTX or CUBIN
-                with warp.ScopedTimer("Compile CUDA", active=warp.config.verbose):
-                    warp.build.build_cuda(
-                        source_code_path,
-                        output_arch,
-                        output_path,
-                        config=mode,
-                        verify_fp=warp.config.verify_fp,
-                        fast_math=self.options["fast_math"],
-                        fuse_fp=self.options["fuse_fp"],
-                        lineinfo=self.options["lineinfo"],
-                        compile_time_trace=self.options["compile_time_trace"],
-                        ltoirs=builder.ltoirs.values(),
-                        fatbins=builder.fatbins.values(),
-                    )
+        if warp.config.verbose:
+            module_load_timer_name += f" (block_dim={active_block_dim})"

-
-
-
-                raise (e)
+        with warp.ScopedTimer(module_load_timer_name, active=not warp.config.quiet) as module_load_timer:
+            # -----------------------------------------------------------
+            # Determine binary path and build if necessary

-
-            #
+            if binary_path:
+                # We will never re-codegen or re-compile in this situation
+                # The expected files must already exist

-
-
+                if device.is_cuda and output_arch is None:
+                    raise ValueError("'output_arch' must be provided if a 'binary_path' is provided")

-
-
+                if meta_path is None:
+                    raise ValueError("'meta_path' must be provided if a 'binary_path' is provided")

-
-
+                if not os.path.exists(binary_path):
+                    module_load_timer.extra_msg = " (error)"
+                    raise FileNotFoundError(f"Binary file {binary_path} does not exist")
+                else:
+                    module_load_timer.extra_msg = " (cached)"
+            else:
+                # we will build if binary doesn't exist yet
+                # we will rebuild if we are not caching kernels or if we are tracking array access

-
-
+                output_name = self.get_compile_output_name(device)
+                output_arch = self.get_compile_arch(device)

-
-
-
-
-                # have different GPU architectures / devices
-                try:
-                    os.rename(output_path, binary_path)
-                except (OSError, FileExistsError):
-                    # another process likely updated the module dir first
-                    pass
+                module_dir = os.path.join(warp.config.kernel_cache_dir, module_name_short)
+                meta_path = os.path.join(module_dir, self.get_meta_name())
+                # final object binary path
+                binary_path = os.path.join(module_dir, output_name)

+                if (
+                    not os.path.exists(binary_path)
+                    or not warp.config.cache_kernels
+                    or warp.config.verify_autograd_array_access
+                ):
                     try:
-
-                        if not os.path.exists(final_source_path):
-                            os.rename(source_code_path, final_source_path)
-                    except (OSError, FileExistsError):
-                        # another process likely updated the module dir first
-                        pass
+                        self.compile(device, module_dir, output_name, output_arch)
                     except Exception as e:
-
-
-
-
+                        module_load_timer.extra_msg = " (error)"
+                        raise (e)
+
+                    module_load_timer.extra_msg = " (compiled)"
+                else:
+                    module_load_timer.extra_msg = " (cached)"

             # -----------------------------------------------------------
             # Load CPU or CUDA binary

-
-
-
+            if os.path.exists(meta_path):
+                with open(meta_path) as meta_file:
+                    meta = json.load(meta_file)
+            else:
+                raise FileNotFoundError(f"Module metadata file {meta_path} was not found in the cache")

             if device.is_cpu:
                 # LLVM modules are identified using strings, so we need to ensure uniqueness
-                module_handle = f"{
+                module_handle = f"wp_{self.name}_{self.cpu_exec_id}"
                 self.cpu_exec_id += 1
-                runtime.llvm.
+                runtime.llvm.wp_load_obj(binary_path.encode("utf-8"), module_handle.encode("utf-8"))
                 module_exec = ModuleExec(module_handle, module_hash, device, meta)
                 self.execs[(None, active_block_dim)] = module_exec

@@ -2416,12 +2630,6 @@ class Module:
                 module_load_timer.extra_msg = " (error)"
                 raise Exception(f"Failed to load CUDA module '{self.name}'")

-        if build_dir:
-            import shutil
-
-            # clean up build_dir used for this process regardless
-            shutil.rmtree(build_dir, ignore_errors=True)
-
         return module_exec

     def unload(self):
@@ -2457,13 +2665,13 @@ class CpuDefaultAllocator:
         self.deleter = lambda ptr, size: self.free(ptr, size)

     def alloc(self, size_in_bytes):
-        ptr = runtime.core.
+        ptr = runtime.core.wp_alloc_host(size_in_bytes)
        if not ptr:
            raise RuntimeError(f"Failed to allocate {size_in_bytes} bytes on device 'cpu'")
        return ptr

     def free(self, ptr, size_in_bytes):
-        runtime.core.
+        runtime.core.wp_free_host(ptr)


 class CpuPinnedAllocator:
@@ -2472,13 +2680,13 @@ class CpuPinnedAllocator:
         self.deleter = lambda ptr, size: self.free(ptr, size)

     def alloc(self, size_in_bytes):
-        ptr = runtime.core.
+        ptr = runtime.core.wp_alloc_pinned(size_in_bytes)
        if not ptr:
            raise RuntimeError(f"Failed to allocate {size_in_bytes} bytes on device '{self.device}'")
        return ptr

     def free(self, ptr, size_in_bytes):
-        runtime.core.
+        runtime.core.wp_free_pinned(ptr)


 class CudaDefaultAllocator:
@@ -2488,7 +2696,7 @@ class CudaDefaultAllocator:
         self.deleter = lambda ptr, size: self.free(ptr, size)

     def alloc(self, size_in_bytes):
-        ptr = runtime.core.
+        ptr = runtime.core.wp_alloc_device_default(self.device.context, size_in_bytes)
         # If the allocation fails, check if graph capture is active to raise an informative error.
         # We delay the capture check to avoid overhead.
         if not ptr:
@@ -2510,7 +2718,7 @@ class CudaDefaultAllocator:
         return ptr

     def free(self, ptr, size_in_bytes):
-        runtime.core.
+        runtime.core.wp_free_device_default(self.device.context, ptr)


 class CudaMempoolAllocator:
@@ -2521,13 +2729,13 @@ class CudaMempoolAllocator:
         self.deleter = lambda ptr, size: self.free(ptr, size)

     def alloc(self, size_in_bytes):
-        ptr = runtime.core.
+        ptr = runtime.core.wp_alloc_device_async(self.device.context, size_in_bytes)
        if not ptr:
            raise RuntimeError(f"Failed to allocate {size_in_bytes} bytes on device '{self.device}'")
        return ptr

     def free(self, ptr, size_in_bytes):
-        runtime.core.
+        runtime.core.wp_free_device_async(self.device.context, ptr)


 class ContextGuard:
@@ -2536,15 +2744,15 @@ class ContextGuard:

     def __enter__(self):
         if self.device.is_cuda:
-            runtime.core.
+            runtime.core.wp_cuda_context_push_current(self.device.context)
         elif is_cuda_driver_initialized():
-            self.saved_context = runtime.core.
+            self.saved_context = runtime.core.wp_cuda_context_get_current()

     def __exit__(self, exc_type, exc_value, traceback):
         if self.device.is_cuda:
-            runtime.core.
+            runtime.core.wp_cuda_context_pop_current()
         elif is_cuda_driver_initialized():
-            runtime.core.
+            runtime.core.wp_cuda_context_set_current(self.saved_context)


 class Event:
@@ -2607,7 +2815,7 @@ class Event:
                 raise ValueError("The combination of 'enable_timing=True' and 'interprocess=True' is not allowed.")
             flags |= Event.Flags.INTERPROCESS

-        self.cuda_event = runtime.core.
+        self.cuda_event = runtime.core.wp_cuda_event_create(device.context, flags)
         if not self.cuda_event:
             raise RuntimeError(f"Failed to create event on device {device}")
         self.owner = True
@@ -2634,7 +2842,9 @@ class Event:
         # Allocate a buffer for the data (64-element char array)
         ipc_handle_buffer = (ctypes.c_char * 64)()

-        warp.context.runtime.core.
+        warp.context.runtime.core.wp_cuda_ipc_get_event_handle(
+            self.device.context, self.cuda_event, ipc_handle_buffer
+        )

         if ipc_handle_buffer.raw == bytes(64):
             warp.utils.warn("IPC event handle appears to be invalid. Was interprocess=True used?")
@@ -2651,7 +2861,7 @@ class Event:
         This property may not be accessed during a graph capture on any stream.
         """

-        result_code = runtime.core.
+        result_code = runtime.core.wp_cuda_event_query(self.cuda_event)

         return result_code == 0

@@ -2659,7 +2869,7 @@ class Event:
         if not self.owner:
             return

-        runtime.core.
+        runtime.core.wp_cuda_event_destroy(self.cuda_event)


 class Stream:
@@ -2709,12 +2919,12 @@ class Stream:
         # we pass cuda_stream through kwargs because cuda_stream=None is actually a valid value (CUDA default stream)
         if "cuda_stream" in kwargs:
             self.cuda_stream = kwargs["cuda_stream"]
-            device.runtime.core.
+            device.runtime.core.wp_cuda_stream_register(device.context, self.cuda_stream)
         else:
             if not isinstance(priority, int):
                 raise TypeError("Stream priority must be an integer.")
             clamped_priority = max(-1, min(priority, 0))  # Only support two priority levels
-            self.cuda_stream = device.runtime.core.
+            self.cuda_stream = device.runtime.core.wp_cuda_stream_create(device.context, clamped_priority)

             if not self.cuda_stream:
                 raise RuntimeError(f"Failed to create stream on device {device}")
@@ -2725,9 +2935,9 @@ class Stream:
             return

         if self.owner:
-            runtime.core.
+            runtime.core.wp_cuda_stream_destroy(self.device.context, self.cuda_stream)
         else:
-            runtime.core.
+            runtime.core.wp_cuda_stream_unregister(self.device.context, self.cuda_stream)

     @property
     def cached_event(self) -> Event:
@@ -2753,7 +2963,7 @@ class Stream:
                 f"Event from device {event.device} cannot be recorded on stream from device {self.device}"
             )

-        runtime.core.
+        runtime.core.wp_cuda_event_record(event.cuda_event, self.cuda_stream, event.enable_timing)

         return event

@@ -2762,7 +2972,7 @@ class Stream:

         This function does not block the host thread.
         """
-        runtime.core.
+        runtime.core.wp_cuda_stream_wait_event(self.cuda_stream, event.cuda_event)

     def wait_stream(self, other_stream: Stream, event: Event | None = None):
         """Records an event on `other_stream` and makes this stream wait on it.
@@ -2785,7 +2995,7 @@ class Stream:
         if event is None:
             event = other_stream.cached_event

-        runtime.core.
+        runtime.core.wp_cuda_stream_wait_stream(self.cuda_stream, other_stream.cuda_stream, event.cuda_event)

     @property
     def is_complete(self) -> bool:
@@ -2794,19 +3004,19 @@ class Stream:
         This property may not be accessed during a graph capture on any stream.
         """

-        result_code = runtime.core.
+        result_code = runtime.core.wp_cuda_stream_query(self.cuda_stream)

         return result_code == 0

     @property
     def is_capturing(self) -> bool:
         """A boolean indicating whether a graph capture is currently ongoing on this stream."""
-        return bool(runtime.core.
+        return bool(runtime.core.wp_cuda_stream_is_capturing(self.cuda_stream))

     @property
     def priority(self) -> int:
         """An integer representing the priority of the stream."""
-        return runtime.core.
+        return runtime.core.wp_cuda_stream_get_priority(self.cuda_stream)


 class Device:
@@ -2875,22 +3085,22 @@ class Device:
             self.pci_bus_id = None

             # TODO: add more device-specific dispatch functions
-            self.memset = runtime.core.
-            self.memtile = runtime.core.
+            self.memset = runtime.core.wp_memset_host
+            self.memtile = runtime.core.wp_memtile_host

             self.default_allocator = CpuDefaultAllocator(self)
             self.pinned_allocator = CpuPinnedAllocator(self)

-        elif ordinal >= 0 and ordinal < runtime.core.
+        elif ordinal >= 0 and ordinal < runtime.core.wp_cuda_device_get_count():
             # CUDA device
-            self.name = runtime.core.
-            self.arch = runtime.core.
-            self.sm_count = runtime.core.
-            self.is_uva = runtime.core.
-            self.is_mempool_supported = runtime.core.
+            self.name = runtime.core.wp_cuda_device_get_name(ordinal).decode()
+            self.arch = runtime.core.wp_cuda_device_get_arch(ordinal)
+            self.sm_count = runtime.core.wp_cuda_device_get_sm_count(ordinal)
+            self.is_uva = runtime.core.wp_cuda_device_is_uva(ordinal) > 0
+            self.is_mempool_supported = runtime.core.wp_cuda_device_is_mempool_supported(ordinal) > 0
             if platform.system() == "Linux":
                 # Use None when IPC support cannot be determined
-                ipc_support_api_query = runtime.core.
+                ipc_support_api_query = runtime.core.wp_cuda_device_is_ipc_supported(ordinal)
                 self.is_ipc_supported = bool(ipc_support_api_query) if ipc_support_api_query >= 0 else None
             else:
                 self.is_ipc_supported = False
@@ -2902,13 +3112,13 @@ class Device:
                 self.is_mempool_enabled = False

             uuid_buffer = (ctypes.c_char * 16)()
-            runtime.core.
+            runtime.core.wp_cuda_device_get_uuid(ordinal, uuid_buffer)
             uuid_byte_str = bytes(uuid_buffer).hex()
             self.uuid = f"GPU-{uuid_byte_str[0:8]}-{uuid_byte_str[8:12]}-{uuid_byte_str[12:16]}-{uuid_byte_str[16:20]}-{uuid_byte_str[20:]}"

-            pci_domain_id = runtime.core.
-            pci_bus_id = runtime.core.
-            pci_device_id = runtime.core.
+            pci_domain_id = runtime.core.wp_cuda_device_get_pci_domain_id(ordinal)
+            pci_bus_id = runtime.core.wp_cuda_device_get_pci_bus_id(ordinal)
+            pci_device_id = runtime.core.wp_cuda_device_get_pci_device_id(ordinal)
             # This is (mis)named to correspond to the naming of cudaDeviceGetPCIBusId
             self.pci_bus_id = f"{pci_domain_id:08X}:{pci_bus_id:02X}:{pci_device_id:02X}"

@@ -2932,8 +3142,8 @@ class Device:
                 self._init_streams()

             # TODO: add more device-specific dispatch functions
-            self.memset = lambda ptr, value, size: runtime.core.
-            self.memtile = lambda ptr, src, srcsize, reps: runtime.core.
+            self.memset = lambda ptr, value, size: runtime.core.wp_memset_device(self.context, ptr, value, size)
+            self.memtile = lambda ptr, src, srcsize, reps: runtime.core.wp_memtile_device(
                 self.context, ptr, src, srcsize, reps
             )

@@ -2992,15 +3202,15 @@ class Device:
             return self._context
         elif self.is_primary:
             # acquire primary context on demand
-            prev_context = runtime.core.
-            self._context = self.runtime.core.
+            prev_context = runtime.core.wp_cuda_context_get_current()
+            self._context = self.runtime.core.wp_cuda_device_get_primary_context(self.ordinal)
             if self._context is None:
-                runtime.core.
+                runtime.core.wp_cuda_context_set_current(prev_context)
                 raise RuntimeError(f"Failed to acquire primary context for device {self}")
             self.runtime.context_map[self._context] = self
             # initialize streams
             self._init_streams()
-            runtime.core.
+            runtime.core.wp_cuda_context_set_current(prev_context)
             return self._context

     @property
@@ -3044,7 +3254,7 @@ class Device:
             if stream.device != self:
                 raise RuntimeError(f"Stream from device {stream.device} cannot be used on device {self}")

-            self.runtime.core.
+            self.runtime.core.wp_cuda_context_set_stream(self.context, stream.cuda_stream, int(sync))
             self._stream = stream
         else:
             raise RuntimeError(f"Device {self} is not a CUDA device")
@@ -3062,7 +3272,7 @@ class Device:
         """
         if self.is_cuda:
             total_mem = ctypes.c_size_t()
-            self.runtime.core.
+            self.runtime.core.wp_cuda_device_get_memory_info(self.ordinal, None, ctypes.byref(total_mem))
             return total_mem.value
         else:
             # TODO: cpu
@@ -3076,7 +3286,7 @@ class Device:
         """
         if self.is_cuda:
             free_mem = ctypes.c_size_t()
-            self.runtime.core.
+            self.runtime.core.wp_cuda_device_get_memory_info(self.ordinal, ctypes.byref(free_mem), None)
             return free_mem.value
         else:
             # TODO: cpu
@@ -3103,7 +3313,7 @@ class Device:

     def make_current(self):
         if self.context is not None:
-            self.runtime.core.
+            self.runtime.core.wp_cuda_context_set_current(self.context)

     def can_access(self, other):
         # TODO: this function should be redesigned in terms of (device, resource).
@@ -3129,11 +3339,7 @@ class Graph:
         self.capture_id = capture_id
         self.module_execs: set[ModuleExec] = set()
         self.graph_exec: ctypes.c_void_p | None = None
-
         self.graph: ctypes.c_void_p | None = None
-        self.has_conditional = (
-            False  # Track if there are conditional nodes in the graph since they are not allowed in child graphs
-        )

     def __del__(self):
         if not hasattr(self, "graph") or not hasattr(self, "device") or not self.graph:
@@ -3141,9 +3347,9 @@ class Graph:

         # use CUDA context guard to avoid side effects during garbage collection
         with self.device.context_guard:
-            runtime.core.
+            runtime.core.wp_cuda_graph_destroy(self.device.context, self.graph)
             if hasattr(self, "graph_exec") and self.graph_exec is not None:
-                runtime.core.
+                runtime.core.wp_cuda_graph_exec_destroy(self.device.context, self.graph_exec)

     # retain executable CUDA modules used by this graph, which prevents them from being unloaded
     def retain_module_exec(self, module_exec: ModuleExec):
@@ -3155,6 +3361,14 @@ class Runtime:
|
|
|
3155
3361
|
if sys.version_info < (3, 9):
|
|
3156
3362
|
warp.utils.warn(f"Python 3.9 or newer is recommended for running Warp, detected {sys.version_info}")
|
|
3157
3363
|
|
|
3364
|
+
if platform.system() == "Darwin" and platform.machine() == "x86_64":
|
|
3365
|
+
warp.utils.warn(
|
|
3366
|
+
"Support for Warp on Intel-based macOS is deprecated and will be removed in the near future. "
|
|
3367
|
+
"Apple Silicon-based Macs will continue to be supported.",
|
|
3368
|
+
DeprecationWarning,
|
|
3369
|
+
stacklevel=3,
|
|
3370
|
+
)
|
|
3371
|
+
|
|
3158
3372
|
bin_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "bin")
|
|
3159
3373
|
|
|
3160
3374
|
if os.name == "nt":
|
|
@@ -3177,7 +3391,7 @@ class Runtime:
|
|
|
3177
3391
|
if os.path.exists(llvm_lib):
|
|
3178
3392
|
self.llvm = self.load_dll(llvm_lib)
|
|
3179
3393
|
# setup c-types for warp-clang.dll
|
|
3180
|
-
self.llvm.
|
|
3394
|
+
self.llvm.wp_lookup.restype = ctypes.c_uint64
|
|
3181
3395
|
else:
|
|
3182
3396
|
self.llvm = None
|
|
3183
3397
|
|
|
@@ -3186,83 +3400,83 @@ class Runtime:
|
|
|
3186
3400
|
|
|
3187
3401
|
# setup c-types for warp.dll
|
|
3188
3402
|
try:
|
|
3189
|
-
self.core.
|
|
3190
|
-
self.core.
|
|
3191
|
-
self.core.
|
|
3192
|
-
self.core.
|
|
3193
|
-
self.core.
|
|
3194
|
-
self.core.
|
|
3195
|
-
|
|
3196
|
-
self.core.
|
|
3197
|
-
self.core.
|
|
3198
|
-
self.core.
|
|
3199
|
-
self.core.
|
|
3200
|
-
self.core.
|
|
3201
|
-
self.core.
|
|
3202
|
-
self.core.
|
|
3203
|
-
self.core.
|
|
3204
|
-
self.core.
|
|
3205
|
-
self.core.
|
|
3206
|
-
|
|
3207
|
-
self.core.
|
|
3208
|
-
self.core.
|
|
3209
|
-
self.core.
|
|
3210
|
-
self.core.
|
|
3211
|
-
|
|
3212
|
-
self.core.
|
|
3213
|
-
self.core.
|
|
3214
|
-
self.core.
|
|
3215
|
-
self.core.
|
|
3216
|
-
self.core.
|
|
3217
|
-
self.core.
|
|
3218
|
-
self.core.
|
|
3219
|
-
self.core.
|
|
3220
|
-
self.core.
|
|
3221
|
-
self.core.
|
|
3222
|
-
|
|
3223
|
-
self.core.
|
|
3224
|
-
self.core.
|
|
3225
|
-
self.core.
|
|
3226
|
-
self.core.
|
|
3227
|
-
|
|
3228
|
-
self.core.
|
|
3229
|
-
self.core.
|
|
3230
|
-
self.core.
|
|
3403
|
+
self.core.wp_get_error_string.argtypes = []
|
|
3404
|
+
self.core.wp_get_error_string.restype = ctypes.c_char_p
|
|
3405
|
+
self.core.wp_set_error_output_enabled.argtypes = [ctypes.c_int]
|
|
3406
|
+
self.core.wp_set_error_output_enabled.restype = None
|
|
3407
|
+
self.core.wp_is_error_output_enabled.argtypes = []
|
|
3408
|
+
self.core.wp_is_error_output_enabled.restype = ctypes.c_int
|
|
3409
|
+
|
|
3410
|
+
self.core.wp_alloc_host.argtypes = [ctypes.c_size_t]
|
|
3411
|
+
self.core.wp_alloc_host.restype = ctypes.c_void_p
|
|
3412
|
+
self.core.wp_alloc_pinned.argtypes = [ctypes.c_size_t]
|
|
3413
|
+
self.core.wp_alloc_pinned.restype = ctypes.c_void_p
|
|
3414
|
+
self.core.wp_alloc_device.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
|
|
3415
|
+
self.core.wp_alloc_device.restype = ctypes.c_void_p
|
|
3416
|
+
self.core.wp_alloc_device_default.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
|
|
3417
|
+
self.core.wp_alloc_device_default.restype = ctypes.c_void_p
|
|
3418
|
+
self.core.wp_alloc_device_async.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
|
|
3419
|
+
self.core.wp_alloc_device_async.restype = ctypes.c_void_p
|
|
3420
|
+
|
|
3421
|
+
self.core.wp_float_to_half_bits.argtypes = [ctypes.c_float]
|
|
3422
|
+
self.core.wp_float_to_half_bits.restype = ctypes.c_uint16
|
|
3423
|
+
self.core.wp_half_bits_to_float.argtypes = [ctypes.c_uint16]
|
|
3424
|
+
self.core.wp_half_bits_to_float.restype = ctypes.c_float
|
|
3425
|
+
|
|
3426
|
+
self.core.wp_free_host.argtypes = [ctypes.c_void_p]
|
|
3427
|
+
self.core.wp_free_host.restype = None
|
|
3428
|
+
self.core.wp_free_pinned.argtypes = [ctypes.c_void_p]
|
|
3429
|
+
self.core.wp_free_pinned.restype = None
|
|
3430
|
+
self.core.wp_free_device.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
3431
|
+
self.core.wp_free_device.restype = None
|
|
3432
|
+
self.core.wp_free_device_default.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
3433
|
+
self.core.wp_free_device_default.restype = None
|
|
3434
|
+
self.core.wp_free_device_async.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
3435
|
+
self.core.wp_free_device_async.restype = None
|
|
3436
|
+
|
|
3437
|
+
self.core.wp_memset_host.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
|
|
3438
|
+
self.core.wp_memset_host.restype = None
|
|
3439
|
+
self.core.wp_memset_device.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
|
|
3440
|
+
self.core.wp_memset_device.restype = None
|
|
3441
|
+
|
|
3442
|
+
self.core.wp_memtile_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t]
|
|
3443
|
+
self.core.wp_memtile_host.restype = None
|
|
3444
|
+
self.core.wp_memtile_device.argtypes = [
|
|
3231
3445
|
ctypes.c_void_p,
|
|
3232
3446
|
ctypes.c_void_p,
|
|
3233
3447
|
ctypes.c_void_p,
|
|
3234
3448
|
ctypes.c_size_t,
|
|
3235
3449
|
ctypes.c_size_t,
|
|
3236
3450
|
]
|
|
3237
|
-
self.core.
|
|
3451
|
+
self.core.wp_memtile_device.restype = None
|
|
3238
3452
|
|
|
3239
|
-
self.core.
|
|
3240
|
-
self.core.
|
|
3241
|
-
self.core.
|
|
3453
|
+
self.core.wp_memcpy_h2h.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
|
|
3454
|
+
self.core.wp_memcpy_h2h.restype = ctypes.c_bool
|
|
3455
|
+
self.core.wp_memcpy_h2d.argtypes = [
|
|
3242
3456
|
ctypes.c_void_p,
|
|
3243
3457
|
ctypes.c_void_p,
|
|
3244
3458
|
ctypes.c_void_p,
|
|
3245
3459
|
ctypes.c_size_t,
|
|
3246
3460
|
ctypes.c_void_p,
|
|
3247
3461
|
]
|
|
3248
|
-
self.core.
|
|
3249
|
-
self.core.
|
|
3462
|
+
self.core.wp_memcpy_h2d.restype = ctypes.c_bool
|
|
3463
|
+
self.core.wp_memcpy_d2h.argtypes = [
|
|
3250
3464
|
ctypes.c_void_p,
|
|
3251
3465
|
ctypes.c_void_p,
|
|
3252
3466
|
ctypes.c_void_p,
|
|
3253
3467
|
ctypes.c_size_t,
|
|
3254
3468
|
ctypes.c_void_p,
|
|
3255
3469
|
]
|
|
3256
|
-
self.core.
|
|
3257
|
-
self.core.
|
|
3470
|
+
self.core.wp_memcpy_d2h.restype = ctypes.c_bool
|
|
3471
|
+
self.core.wp_memcpy_d2d.argtypes = [
|
|
3258
3472
|
ctypes.c_void_p,
|
|
3259
3473
|
ctypes.c_void_p,
|
|
3260
3474
|
ctypes.c_void_p,
|
|
3261
3475
|
ctypes.c_size_t,
|
|
3262
3476
|
ctypes.c_void_p,
|
|
3263
3477
|
]
|
|
3264
|
-
self.core.
|
|
3265
|
-
self.core.
|
|
3478
|
+
self.core.wp_memcpy_d2d.restype = ctypes.c_bool
|
|
3479
|
+
self.core.wp_memcpy_p2p.argtypes = [
|
|
3266
3480
|
ctypes.c_void_p,
|
|
3267
3481
|
ctypes.c_void_p,
|
|
3268
3482
|
ctypes.c_void_p,
|
|
@@ -3270,17 +3484,17 @@ class Runtime:
|
|
|
3270
3484
|
ctypes.c_size_t,
|
|
3271
3485
|
ctypes.c_void_p,
|
|
3272
3486
|
]
|
|
3273
|
-
self.core.
|
|
3487
|
+
self.core.wp_memcpy_p2p.restype = ctypes.c_bool
|
|
3274
3488
|
|
|
3275
|
-
self.core.
|
|
3489
|
+
self.core.wp_array_copy_host.argtypes = [
|
|
3276
3490
|
ctypes.c_void_p,
|
|
3277
3491
|
ctypes.c_void_p,
|
|
3278
3492
|
ctypes.c_int,
|
|
3279
3493
|
ctypes.c_int,
|
|
3280
3494
|
ctypes.c_int,
|
|
3281
3495
|
]
|
|
3282
|
-
self.core.
|
|
3283
|
-
self.core.
|
|
3496
|
+
self.core.wp_array_copy_host.restype = ctypes.c_bool
|
|
3497
|
+
self.core.wp_array_copy_device.argtypes = [
|
|
3284
3498
|
ctypes.c_void_p,
|
|
3285
3499
|
ctypes.c_void_p,
|
|
3286
3500
|
ctypes.c_void_p,
|
|
@@ -3288,41 +3502,41 @@ class Runtime:
|
|
|
3288
3502
|
ctypes.c_int,
|
|
3289
3503
|
ctypes.c_int,
|
|
3290
3504
|
]
|
|
3291
|
-
self.core.
|
|
3505
|
+
self.core.wp_array_copy_device.restype = ctypes.c_bool
|
|
3292
3506
|
|
|
3293
|
-
self.core.
|
|
3294
|
-
self.core.
|
|
3295
|
-
self.core.
|
|
3507
|
+
self.core.wp_array_fill_host.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int]
|
|
3508
|
+
self.core.wp_array_fill_host.restype = None
|
|
3509
|
+
self.core.wp_array_fill_device.argtypes = [
|
|
3296
3510
|
ctypes.c_void_p,
|
|
3297
3511
|
ctypes.c_void_p,
|
|
3298
3512
|
ctypes.c_int,
|
|
3299
3513
|
ctypes.c_void_p,
|
|
3300
3514
|
ctypes.c_int,
|
|
3301
3515
|
]
|
|
3302
|
-
self.core.
|
|
3516
|
+
self.core.wp_array_fill_device.restype = None
|
|
3303
3517
|
|
|
3304
|
-
self.core.
|
|
3518
|
+
self.core.wp_array_sum_double_host.argtypes = [
|
|
3305
3519
|
ctypes.c_uint64,
|
|
3306
3520
|
ctypes.c_uint64,
|
|
3307
3521
|
ctypes.c_int,
|
|
3308
3522
|
ctypes.c_int,
|
|
3309
3523
|
ctypes.c_int,
|
|
3310
3524
|
]
|
|
3311
|
-
self.core.
|
|
3525
|
+
self.core.wp_array_sum_float_host.argtypes = [
|
|
3312
3526
|
ctypes.c_uint64,
|
|
3313
3527
|
ctypes.c_uint64,
|
|
3314
3528
|
ctypes.c_int,
|
|
3315
3529
|
ctypes.c_int,
|
|
3316
3530
|
ctypes.c_int,
|
|
3317
3531
|
]
|
|
3318
|
-
self.core.
|
|
3532
|
+
self.core.wp_array_sum_double_device.argtypes = [
|
|
3319
3533
|
ctypes.c_uint64,
|
|
3320
3534
|
ctypes.c_uint64,
|
|
3321
3535
|
ctypes.c_int,
|
|
3322
3536
|
ctypes.c_int,
|
|
3323
3537
|
ctypes.c_int,
|
|
3324
3538
|
]
|
|
3325
|
-
self.core.
|
|
3539
|
+
self.core.wp_array_sum_float_device.argtypes = [
|
|
3326
3540
|
ctypes.c_uint64,
|
|
3327
3541
|
ctypes.c_uint64,
|
|
3328
3542
|
ctypes.c_int,
|
|
@@ -3330,7 +3544,7 @@ class Runtime:
|
|
|
3330
3544
|
ctypes.c_int,
|
|
3331
3545
|
]
|
|
3332
3546
|
|
|
3333
|
-
self.core.
|
|
3547
|
+
self.core.wp_array_inner_double_host.argtypes = [
|
|
3334
3548
|
ctypes.c_uint64,
|
|
3335
3549
|
ctypes.c_uint64,
|
|
3336
3550
|
ctypes.c_uint64,
|
|
@@ -3339,7 +3553,7 @@ class Runtime:
|
|
|
3339
3553
|
ctypes.c_int,
|
|
3340
3554
|
ctypes.c_int,
|
|
3341
3555
|
]
|
|
3342
|
-
self.core.
|
|
3556
|
+
self.core.wp_array_inner_float_host.argtypes = [
|
|
3343
3557
|
ctypes.c_uint64,
|
|
3344
3558
|
ctypes.c_uint64,
|
|
3345
3559
|
ctypes.c_uint64,
|
|
@@ -3348,7 +3562,7 @@ class Runtime:
|
|
|
3348
3562
|
ctypes.c_int,
|
|
3349
3563
|
ctypes.c_int,
|
|
3350
3564
|
]
|
|
3351
|
-
self.core.
|
|
3565
|
+
self.core.wp_array_inner_double_device.argtypes = [
|
|
3352
3566
|
ctypes.c_uint64,
|
|
3353
3567
|
ctypes.c_uint64,
|
|
3354
3568
|
ctypes.c_uint64,
|
|
@@ -3357,7 +3571,7 @@ class Runtime:
|
|
|
3357
3571
|
ctypes.c_int,
|
|
3358
3572
|
ctypes.c_int,
|
|
3359
3573
|
]
|
|
3360
|
-
self.core.
|
|
3574
|
+
self.core.wp_array_inner_float_device.argtypes = [
|
|
3361
3575
|
ctypes.c_uint64,
|
|
3362
3576
|
ctypes.c_uint64,
|
|
3363
3577
|
ctypes.c_uint64,
|
|
@@ -3367,21 +3581,36 @@ class Runtime:
|
|
|
3367
3581
|
ctypes.c_int,
|
|
3368
3582
|
]
|
|
3369
3583
|
|
|
3370
|
-
self.core.
|
|
3371
|
-
self.core.
|
|
3372
|
-
|
|
3373
|
-
|
|
3584
|
+
self.core.wp_array_scan_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
|
|
3585
|
+
self.core.wp_array_scan_float_host.argtypes = [
|
|
3586
|
+
ctypes.c_uint64,
|
|
3587
|
+
ctypes.c_uint64,
|
|
3588
|
+
ctypes.c_int,
|
|
3589
|
+
ctypes.c_bool,
|
|
3590
|
+
]
|
|
3591
|
+
self.core.wp_array_scan_int_device.argtypes = [
|
|
3592
|
+
ctypes.c_uint64,
|
|
3593
|
+
ctypes.c_uint64,
|
|
3594
|
+
ctypes.c_int,
|
|
3595
|
+
ctypes.c_bool,
|
|
3596
|
+
]
|
|
3597
|
+
self.core.wp_array_scan_float_device.argtypes = [
|
|
3598
|
+
ctypes.c_uint64,
|
|
3599
|
+
ctypes.c_uint64,
|
|
3600
|
+
ctypes.c_int,
|
|
3601
|
+
ctypes.c_bool,
|
|
3602
|
+
]
|
|
3374
3603
|
|
|
3375
|
-
self.core.
|
|
3376
|
-
self.core.
|
|
3604
|
+
self.core.wp_radix_sort_pairs_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
|
|
3605
|
+
self.core.wp_radix_sort_pairs_int_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
|
|
3377
3606
|
|
|
3378
|
-
self.core.
|
|
3379
|
-
self.core.
|
|
3607
|
+
self.core.wp_radix_sort_pairs_float_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
|
|
3608
|
+
self.core.wp_radix_sort_pairs_float_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
|
|
3380
3609
|
|
|
3381
|
-
self.core.
|
|
3382
|
-
self.core.
|
|
3610
|
+
self.core.wp_radix_sort_pairs_int64_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
|
|
3611
|
+
self.core.wp_radix_sort_pairs_int64_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
|
|
3383
3612
|
|
|
3384
|
-
self.core.
|
|
3613
|
+
self.core.wp_segmented_sort_pairs_int_host.argtypes = [
|
|
3385
3614
|
ctypes.c_uint64,
|
|
3386
3615
|
ctypes.c_uint64,
|
|
3387
3616
|
ctypes.c_int,
|
|
@@ -3389,7 +3618,7 @@ class Runtime:
|
|
|
3389
3618
|
ctypes.c_uint64,
|
|
3390
3619
|
ctypes.c_int,
|
|
3391
3620
|
]
|
|
3392
|
-
self.core.
|
|
3621
|
+
self.core.wp_segmented_sort_pairs_int_device.argtypes = [
|
|
3393
3622
|
ctypes.c_uint64,
|
|
3394
3623
|
ctypes.c_uint64,
|
|
3395
3624
|
ctypes.c_int,
|
|
@@ -3398,7 +3627,7 @@ class Runtime:
|
|
|
3398
3627
|
ctypes.c_int,
|
|
3399
3628
|
]
|
|
3400
3629
|
|
|
3401
|
-
self.core.
|
|
3630
|
+
self.core.wp_segmented_sort_pairs_float_host.argtypes = [
|
|
3402
3631
|
ctypes.c_uint64,
|
|
3403
3632
|
ctypes.c_uint64,
|
|
3404
3633
|
ctypes.c_int,
|
|
@@ -3406,7 +3635,7 @@ class Runtime:
|
|
|
3406
3635
|
ctypes.c_uint64,
|
|
3407
3636
|
ctypes.c_int,
|
|
3408
3637
|
]
|
|
3409
|
-
self.core.
|
|
3638
|
+
self.core.wp_segmented_sort_pairs_float_device.argtypes = [
|
|
3410
3639
|
ctypes.c_uint64,
|
|
3411
3640
|
ctypes.c_uint64,
|
|
3412
3641
|
ctypes.c_int,
|
|
@@ -3415,14 +3644,14 @@ class Runtime:
|
|
|
3415
3644
|
ctypes.c_int,
|
|
3416
3645
|
]
|
|
3417
3646
|
|
|
3418
|
-
self.core.
|
|
3647
|
+
self.core.wp_runlength_encode_int_host.argtypes = [
|
|
3419
3648
|
ctypes.c_uint64,
|
|
3420
3649
|
ctypes.c_uint64,
|
|
3421
3650
|
ctypes.c_uint64,
|
|
3422
3651
|
ctypes.c_uint64,
|
|
3423
3652
|
ctypes.c_int,
|
|
3424
3653
|
]
|
|
3425
|
-
self.core.
|
|
3654
|
+
self.core.wp_runlength_encode_int_device.argtypes = [
|
|
3426
3655
|
ctypes.c_uint64,
|
|
3427
3656
|
ctypes.c_uint64,
|
|
3428
3657
|
ctypes.c_uint64,
|
|
@@ -3430,11 +3659,11 @@ class Runtime:
|
|
|
3430
3659
|
ctypes.c_int,
|
|
3431
3660
|
]
|
|
3432
3661
|
|
|
3433
|
-
self.core.
|
|
3434
|
-
self.core.
|
|
3662
|
+
self.core.wp_bvh_create_host.restype = ctypes.c_uint64
|
|
3663
|
+
self.core.wp_bvh_create_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
|
|
3435
3664
|
|
|
3436
|
-
self.core.
|
|
3437
|
-
self.core.
|
|
3665
|
+
self.core.wp_bvh_create_device.restype = ctypes.c_uint64
|
|
3666
|
+
self.core.wp_bvh_create_device.argtypes = [
|
|
3438
3667
|
ctypes.c_void_p,
|
|
3439
3668
|
ctypes.c_void_p,
|
|
3440
3669
|
ctypes.c_void_p,
|
|
@@ -3442,14 +3671,14 @@ class Runtime:
|
|
|
3442
3671
|
ctypes.c_int,
|
|
3443
3672
|
]
|
|
3444
3673
|
|
|
3445
|
-
self.core.
|
|
3446
|
-
self.core.
|
|
3674
|
+
self.core.wp_bvh_destroy_host.argtypes = [ctypes.c_uint64]
|
|
3675
|
+
self.core.wp_bvh_destroy_device.argtypes = [ctypes.c_uint64]
|
|
3447
3676
|
|
|
3448
|
-
self.core.
|
|
3449
|
-
self.core.
|
|
3677
|
+
self.core.wp_bvh_refit_host.argtypes = [ctypes.c_uint64]
|
|
3678
|
+
self.core.wp_bvh_refit_device.argtypes = [ctypes.c_uint64]
|
|
3450
3679
|
|
|
3451
|
-
self.core.
|
|
3452
|
-
self.core.
|
|
3680
|
+
self.core.wp_mesh_create_host.restype = ctypes.c_uint64
|
|
3681
|
+
self.core.wp_mesh_create_host.argtypes = [
|
|
3453
3682
|
warp.types.array_t,
|
|
3454
3683
|
warp.types.array_t,
|
|
3455
3684
|
warp.types.array_t,
|
|
@@ -3459,8 +3688,8 @@ class Runtime:
|
|
|
3459
3688
|
ctypes.c_int,
|
|
3460
3689
|
]
|
|
3461
3690
|
|
|
3462
|
-
self.core.
|
|
3463
|
-
self.core.
|
|
3691
|
+
self.core.wp_mesh_create_device.restype = ctypes.c_uint64
|
|
3692
|
+
self.core.wp_mesh_create_device.argtypes = [
|
|
3464
3693
|
ctypes.c_void_p,
|
|
3465
3694
|
warp.types.array_t,
|
|
3466
3695
|
warp.types.array_t,
|
|
@@ -3471,61 +3700,61 @@ class Runtime:
|
|
|
3471
3700
|
ctypes.c_int,
|
|
3472
3701
|
]
|
|
3473
3702
|
|
|
3474
|
-
self.core.
|
|
3475
|
-
self.core.
|
|
3703
|
+
self.core.wp_mesh_destroy_host.argtypes = [ctypes.c_uint64]
|
|
3704
|
+
self.core.wp_mesh_destroy_device.argtypes = [ctypes.c_uint64]
|
|
3476
3705
|
|
|
3477
|
-
self.core.
|
|
3478
|
-
self.core.
|
|
3706
|
+
self.core.wp_mesh_refit_host.argtypes = [ctypes.c_uint64]
|
|
3707
|
+
self.core.wp_mesh_refit_device.argtypes = [ctypes.c_uint64]
|
|
3479
3708
|
|
|
3480
|
-
self.core.
|
|
3481
|
-
self.core.
|
|
3709
|
+
self.core.wp_mesh_set_points_host.argtypes = [ctypes.c_uint64, warp.types.array_t]
|
|
3710
|
+
self.core.wp_mesh_set_points_device.argtypes = [ctypes.c_uint64, warp.types.array_t]
|
|
3482
3711
|
|
|
3483
|
-
self.core.
|
|
3484
|
-
self.core.
|
|
3712
|
+
self.core.wp_mesh_set_velocities_host.argtypes = [ctypes.c_uint64, warp.types.array_t]
|
|
3713
|
+
self.core.wp_mesh_set_velocities_device.argtypes = [ctypes.c_uint64, warp.types.array_t]
|
|
3485
3714
|
|
|
3486
|
-
self.core.
|
|
3487
|
-
self.core.
|
|
3488
|
-
self.core.
|
|
3489
|
-
self.core.
|
|
3490
|
-
self.core.
|
|
3715
|
+
self.core.wp_hash_grid_create_host.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
|
|
3716
|
+
self.core.wp_hash_grid_create_host.restype = ctypes.c_uint64
|
|
3717
|
+
self.core.wp_hash_grid_destroy_host.argtypes = [ctypes.c_uint64]
|
|
3718
|
+
self.core.wp_hash_grid_update_host.argtypes = [ctypes.c_uint64, ctypes.c_float, ctypes.c_void_p]
|
|
3719
|
+
self.core.wp_hash_grid_reserve_host.argtypes = [ctypes.c_uint64, ctypes.c_int]
|
|
3491
3720
|
|
|
3492
|
-
self.core.
|
|
3493
|
-
self.core.
|
|
3494
|
-
self.core.
|
|
3495
|
-
self.core.
|
|
3496
|
-
self.core.
|
|
3721
|
+
self.core.wp_hash_grid_create_device.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int]
|
|
3722
|
+
self.core.wp_hash_grid_create_device.restype = ctypes.c_uint64
|
|
3723
|
+
self.core.wp_hash_grid_destroy_device.argtypes = [ctypes.c_uint64]
|
|
3724
|
+
self.core.wp_hash_grid_update_device.argtypes = [ctypes.c_uint64, ctypes.c_float, ctypes.c_void_p]
|
|
3725
|
+
self.core.wp_hash_grid_reserve_device.argtypes = [ctypes.c_uint64, ctypes.c_int]
|
|
3497
3726
|
|
|
3498
|
-
self.core.
|
|
3499
|
-
self.core.
|
|
3500
|
-
self.core.
|
|
3727
|
+
self.core.wp_volume_create_host.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_bool, ctypes.c_bool]
|
|
3728
|
+
self.core.wp_volume_create_host.restype = ctypes.c_uint64
|
|
3729
|
+
self.core.wp_volume_get_tiles_host.argtypes = [
|
|
3501
3730
|
ctypes.c_uint64,
|
|
3502
3731
|
ctypes.c_void_p,
|
|
3503
3732
|
]
|
|
3504
|
-
self.core.
|
|
3733
|
+
self.core.wp_volume_get_voxels_host.argtypes = [
|
|
3505
3734
|
ctypes.c_uint64,
|
|
3506
3735
|
ctypes.c_void_p,
|
|
3507
3736
|
]
|
|
3508
|
-
self.core.
|
|
3737
|
+
self.core.wp_volume_destroy_host.argtypes = [ctypes.c_uint64]
|
|
3509
3738
|
|
|
3510
|
-
self.core.
|
|
3739
|
+
self.core.wp_volume_create_device.argtypes = [
|
|
3511
3740
|
ctypes.c_void_p,
|
|
3512
3741
|
ctypes.c_void_p,
|
|
3513
3742
|
ctypes.c_uint64,
|
|
3514
3743
|
ctypes.c_bool,
|
|
3515
3744
|
ctypes.c_bool,
|
|
3516
3745
|
]
|
|
3517
|
-
self.core.
|
|
3518
|
-
self.core.
|
|
3746
|
+
self.core.wp_volume_create_device.restype = ctypes.c_uint64
|
|
3747
|
+
self.core.wp_volume_get_tiles_device.argtypes = [
|
|
3519
3748
|
ctypes.c_uint64,
|
|
3520
3749
|
ctypes.c_void_p,
|
|
3521
3750
|
]
|
|
3522
|
-
self.core.
|
|
3751
|
+
self.core.wp_volume_get_voxels_device.argtypes = [
|
|
3523
3752
|
ctypes.c_uint64,
|
|
3524
3753
|
ctypes.c_void_p,
|
|
3525
3754
|
]
|
|
3526
|
-
self.core.
|
|
3755
|
+
self.core.wp_volume_destroy_device.argtypes = [ctypes.c_uint64]
|
|
3527
3756
|
|
|
3528
|
-
self.core.
|
|
3757
|
+
self.core.wp_volume_from_tiles_device.argtypes = [
|
|
3529
3758
|
ctypes.c_void_p,
|
|
3530
3759
|
ctypes.c_void_p,
|
|
3531
3760
|
ctypes.c_int,
|
|
@@ -3536,8 +3765,8 @@ class Runtime:
|
|
|
3536
3765
|
ctypes.c_uint32,
|
|
3537
3766
|
ctypes.c_char_p,
|
|
3538
3767
|
]
|
|
3539
|
-
self.core.
|
|
3540
|
-
self.core.
|
|
3768
|
+
self.core.wp_volume_from_tiles_device.restype = ctypes.c_uint64
|
|
3769
|
+
self.core.wp_volume_index_from_tiles_device.argtypes = [
|
|
3541
3770
|
ctypes.c_void_p,
|
|
3542
3771
|
ctypes.c_void_p,
|
|
3543
3772
|
ctypes.c_int,
|
|
@@ -3545,8 +3774,8 @@ class Runtime:
|
|
|
3545
3774
|
ctypes.c_float * 3,
|
|
3546
3775
|
ctypes.c_bool,
|
|
3547
3776
|
]
|
|
3548
|
-
self.core.
|
|
3549
|
-
self.core.
|
|
3777
|
+
self.core.wp_volume_index_from_tiles_device.restype = ctypes.c_uint64
|
|
3778
|
+
self.core.wp_volume_from_active_voxels_device.argtypes = [
|
|
3550
3779
|
ctypes.c_void_p,
|
|
3551
3780
|
ctypes.c_void_p,
|
|
3552
3781
|
ctypes.c_int,
|
|
@@ -3554,25 +3783,25 @@ class Runtime:
|
|
|
3554
3783
|
ctypes.c_float * 3,
|
|
3555
3784
|
ctypes.c_bool,
|
|
3556
3785
|
]
|
|
3557
|
-
self.core.
|
|
3786
|
+
self.core.wp_volume_from_active_voxels_device.restype = ctypes.c_uint64
|
|
3558
3787
|
|
|
3559
|
-
self.core.
|
|
3788
|
+
self.core.wp_volume_get_buffer_info.argtypes = [
|
|
3560
3789
|
ctypes.c_uint64,
|
|
3561
3790
|
ctypes.POINTER(ctypes.c_void_p),
|
|
3562
3791
|
ctypes.POINTER(ctypes.c_uint64),
|
|
3563
3792
|
]
|
|
3564
|
-
self.core.
|
|
3793
|
+
self.core.wp_volume_get_voxel_size.argtypes = [
|
|
3565
3794
|
ctypes.c_uint64,
|
|
3566
3795
|
ctypes.POINTER(ctypes.c_float),
|
|
3567
3796
|
ctypes.POINTER(ctypes.c_float),
|
|
3568
3797
|
ctypes.POINTER(ctypes.c_float),
|
|
3569
3798
|
]
|
|
3570
|
-
self.core.
|
|
3799
|
+
self.core.wp_volume_get_tile_and_voxel_count.argtypes = [
|
|
3571
3800
|
ctypes.c_uint64,
|
|
3572
3801
|
ctypes.POINTER(ctypes.c_uint32),
|
|
3573
3802
|
ctypes.POINTER(ctypes.c_uint64),
|
|
3574
3803
|
]
|
|
3575
|
-
self.core.
|
|
3804
|
+
self.core.wp_volume_get_grid_info.argtypes = [
|
|
3576
3805
|
ctypes.c_uint64,
|
|
3577
3806
|
ctypes.POINTER(ctypes.c_uint64),
|
|
3578
3807
|
ctypes.POINTER(ctypes.c_uint32),
|
|
@@ -3581,12 +3810,12 @@ class Runtime:
|
|
|
3581
3810
|
ctypes.c_float * 9,
|
|
3582
3811
|
ctypes.c_char * 16,
|
|
3583
3812
|
]
|
|
3584
|
-
self.core.
|
|
3585
|
-
self.core.
|
|
3813
|
+
self.core.wp_volume_get_grid_info.restype = ctypes.c_char_p
|
|
3814
|
+
self.core.wp_volume_get_blind_data_count.argtypes = [
|
|
3586
3815
|
ctypes.c_uint64,
|
|
3587
3816
|
]
|
|
3588
|
-
self.core.
|
|
3589
|
-
self.core.
|
|
3817
|
+
self.core.wp_volume_get_blind_data_count.restype = ctypes.c_uint64
|
|
3818
|
+
self.core.wp_volume_get_blind_data_info.argtypes = [
|
|
3590
3819
|
ctypes.c_uint64,
|
|
3591
3820
|
ctypes.c_uint32,
|
|
3592
3821
|
ctypes.POINTER(ctypes.c_void_p),
|
|
@@ -3594,7 +3823,7 @@ class Runtime:
|
|
|
3594
3823
|
ctypes.POINTER(ctypes.c_uint32),
|
|
3595
3824
|
ctypes.c_char * 16,
|
|
3596
3825
|
]
|
|
3597
|
-
self.core.
|
|
3826
|
+
self.core.wp_volume_get_blind_data_info.restype = ctypes.c_char_p
|
|
3598
3827
|
|
|
3599
3828
|
bsr_matrix_from_triplets_argtypes = [
|
|
3600
3829
|
ctypes.c_int, # block_size
|
|
@@ -3616,8 +3845,8 @@ class Runtime:
|
|
|
3616
3845
|
ctypes.c_void_p, # bsr_nnz_event
|
|
3617
3846
|
]
|
|
3618
3847
|
|
|
3619
|
-
self.core.
|
|
3620
|
-
self.core.
|
|
3848
|
+
self.core.wp_bsr_matrix_from_triplets_host.argtypes = bsr_matrix_from_triplets_argtypes
|
|
3849
|
+
self.core.wp_bsr_matrix_from_triplets_device.argtypes = bsr_matrix_from_triplets_argtypes
|
|
3621
3850
|
|
|
3622
3851
|
bsr_transpose_argtypes = [
|
|
3623
3852
|
ctypes.c_int, # row_count
|
|
@@ -3629,229 +3858,232 @@ class Runtime:
|
|
|
3629
3858
|
ctypes.POINTER(ctypes.c_int), # transposed_bsr_columns
|
|
3630
3859
|
ctypes.POINTER(ctypes.c_int), # src to dest block map
|
|
3631
3860
|
]
|
|
3632
|
-
self.core.
|
|
3633
|
-
self.core.
|
|
3634
|
-
|
|
3635
|
-
self.core.
|
|
3636
|
-
self.core.
|
|
3637
|
-
self.core.
|
|
3638
|
-
self.core.
|
|
3639
|
-
self.core.
|
|
3640
|
-
self.core.
|
|
3641
|
-
|
|
3642
|
-
self.core.
|
|
3643
|
-
self.core.
|
|
3644
|
-
self.core.
|
|
3645
|
-
self.core.
|
|
3646
|
-
self.core.
|
|
3647
|
-
self.core.
|
|
3648
|
-
|
|
3649
|
-
self.core.
|
|
3650
|
-
self.core.
|
|
3651
|
-
self.core.
|
|
3652
|
-
self.core.
|
|
3653
|
-
|
|
3654
|
-
self.core.
|
|
3655
|
-
self.core.
|
|
3656
|
-
self.core.
|
|
3657
|
-
self.core.
|
|
3658
|
-
self.core.
|
|
3659
|
-
self.core.
|
|
3660
|
-
self.core.
|
|
3661
|
-
self.core.
|
|
3662
|
-
self.core.
|
|
3663
|
-
self.core.
|
|
3664
|
-
self.core.
|
|
3665
|
-
self.core.
|
|
3666
|
-
self.core.
|
|
3667
|
-
self.core.
|
|
3668
|
-
self.core.
|
|
3669
|
-
self.core.
|
|
3670
|
-
self.core.
|
|
3671
|
-
self.core.
|
|
3672
|
-
self.core.
|
|
3673
|
-
self.core.
|
|
3674
|
-
self.core.
|
|
3675
|
-
self.core.
|
|
3676
|
-
self.core.
|
|
3677
|
-
self.core.
|
|
3678
|
-
self.core.
|
|
3679
|
-
self.core.
|
|
3680
|
-
self.core.
|
|
3681
|
-
self.core.
|
|
3682
|
-
self.core.
|
|
3683
|
-
self.core.
|
|
3684
|
-
self.core.
|
|
3685
|
-
self.core.
|
|
3686
|
-
self.core.
|
|
3687
|
-
self.core.
|
|
3688
|
-
|
|
3689
|
-
self.core.
|
|
3690
|
-
self.core.
|
|
3691
|
-
self.core.
|
|
3692
|
-
self.core.
|
|
3693
|
-
self.core.
|
|
3694
|
-
self.core.
|
|
3695
|
-
self.core.
|
|
3696
|
-
self.core.
|
|
3697
|
-
self.core.
|
|
3698
|
-
self.core.
|
|
3699
|
-
self.core.
|
|
3700
|
-
self.core.
|
|
3701
|
-
self.core.
|
|
3702
|
-
self.core.
|
|
3703
|
-
self.core.
|
|
3704
|
-
self.core.
|
|
3705
|
-
|
|
3706
|
-
self.core.
|
|
3707
|
-
self.core.
|
|
3708
|
-
self.core.
|
|
3709
|
-
self.core.
|
|
3710
|
-
self.core.
|
|
3711
|
-
self.core.
|
|
3712
|
-
self.core.
|
|
3713
|
-
self.core.
|
|
3861
|
+
self.core.wp_bsr_transpose_host.argtypes = bsr_transpose_argtypes
|
|
3862
|
+
self.core.wp_bsr_transpose_device.argtypes = bsr_transpose_argtypes
|
|
3863
|
+
|
|
3864
|
+
self.core.wp_is_cuda_enabled.argtypes = None
|
|
3865
|
+
self.core.wp_is_cuda_enabled.restype = ctypes.c_int
|
|
3866
|
+
self.core.wp_is_cuda_compatibility_enabled.argtypes = None
|
|
3867
|
+
self.core.wp_is_cuda_compatibility_enabled.restype = ctypes.c_int
|
|
3868
|
+
self.core.wp_is_mathdx_enabled.argtypes = None
|
|
3869
|
+
self.core.wp_is_mathdx_enabled.restype = ctypes.c_int
|
|
3870
|
+
|
|
3871
|
+
self.core.wp_cuda_driver_version.argtypes = None
|
|
3872
|
+
self.core.wp_cuda_driver_version.restype = ctypes.c_int
|
|
3873
|
+
self.core.wp_cuda_toolkit_version.argtypes = None
|
|
3874
|
+
self.core.wp_cuda_toolkit_version.restype = ctypes.c_int
|
|
3875
|
+
self.core.wp_cuda_driver_is_initialized.argtypes = None
|
|
3876
|
+
self.core.wp_cuda_driver_is_initialized.restype = ctypes.c_bool
|
|
3877
|
+
|
|
3878
|
+
self.core.wp_nvrtc_supported_arch_count.argtypes = None
|
|
3879
|
+
self.core.wp_nvrtc_supported_arch_count.restype = ctypes.c_int
|
|
3880
|
+
self.core.wp_nvrtc_supported_archs.argtypes = [ctypes.POINTER(ctypes.c_int)]
|
|
3881
|
+
self.core.wp_nvrtc_supported_archs.restype = None
|
|
3882
|
+
|
|
3883
|
+
self.core.wp_cuda_device_get_count.argtypes = None
|
|
3884
|
+
self.core.wp_cuda_device_get_count.restype = ctypes.c_int
|
|
3885
|
+
self.core.wp_cuda_device_get_primary_context.argtypes = [ctypes.c_int]
|
|
3886
|
+
self.core.wp_cuda_device_get_primary_context.restype = ctypes.c_void_p
|
|
3887
|
+
self.core.wp_cuda_device_get_name.argtypes = [ctypes.c_int]
|
|
3888
|
+
self.core.wp_cuda_device_get_name.restype = ctypes.c_char_p
|
|
3889
|
+
self.core.wp_cuda_device_get_arch.argtypes = [ctypes.c_int]
|
|
3890
|
+
self.core.wp_cuda_device_get_arch.restype = ctypes.c_int
|
|
3891
|
+
self.core.wp_cuda_device_get_sm_count.argtypes = [ctypes.c_int]
|
|
3892
|
+
self.core.wp_cuda_device_get_sm_count.restype = ctypes.c_int
|
|
3893
|
+
self.core.wp_cuda_device_is_uva.argtypes = [ctypes.c_int]
|
|
3894
|
+
self.core.wp_cuda_device_is_uva.restype = ctypes.c_int
|
|
3895
|
+
self.core.wp_cuda_device_is_mempool_supported.argtypes = [ctypes.c_int]
|
|
3896
|
+
self.core.wp_cuda_device_is_mempool_supported.restype = ctypes.c_int
|
|
3897
|
+
self.core.wp_cuda_device_is_ipc_supported.argtypes = [ctypes.c_int]
|
|
3898
|
+
self.core.wp_cuda_device_is_ipc_supported.restype = ctypes.c_int
|
|
3899
|
+
self.core.wp_cuda_device_set_mempool_release_threshold.argtypes = [ctypes.c_int, ctypes.c_uint64]
|
|
3900
|
+
self.core.wp_cuda_device_set_mempool_release_threshold.restype = ctypes.c_int
|
|
3901
|
+
self.core.wp_cuda_device_get_mempool_release_threshold.argtypes = [ctypes.c_int]
|
|
3902
|
+
self.core.wp_cuda_device_get_mempool_release_threshold.restype = ctypes.c_uint64
|
|
3903
|
+
self.core.wp_cuda_device_get_mempool_used_mem_current.argtypes = [ctypes.c_int]
|
|
3904
|
+
self.core.wp_cuda_device_get_mempool_used_mem_current.restype = ctypes.c_uint64
|
|
3905
|
+
self.core.wp_cuda_device_get_mempool_used_mem_high.argtypes = [ctypes.c_int]
|
|
3906
|
+
self.core.wp_cuda_device_get_mempool_used_mem_high.restype = ctypes.c_uint64
|
|
3907
|
+
self.core.wp_cuda_device_get_memory_info.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
|
|
3908
|
+
self.core.wp_cuda_device_get_memory_info.restype = None
|
|
3909
|
+
self.core.wp_cuda_device_get_uuid.argtypes = [ctypes.c_int, ctypes.c_char * 16]
|
|
3910
|
+
self.core.wp_cuda_device_get_uuid.restype = None
|
|
3911
|
+
self.core.wp_cuda_device_get_pci_domain_id.argtypes = [ctypes.c_int]
|
|
3912
|
+
self.core.wp_cuda_device_get_pci_domain_id.restype = ctypes.c_int
|
|
3913
|
+
self.core.wp_cuda_device_get_pci_bus_id.argtypes = [ctypes.c_int]
|
|
3914
|
+
self.core.wp_cuda_device_get_pci_bus_id.restype = ctypes.c_int
|
|
3915
|
+
self.core.wp_cuda_device_get_pci_device_id.argtypes = [ctypes.c_int]
|
|
3916
|
+
self.core.wp_cuda_device_get_pci_device_id.restype = ctypes.c_int
|
|
3917
|
+
|
|
3918
|
+
self.core.wp_cuda_context_get_current.argtypes = None
|
|
3919
|
+
self.core.wp_cuda_context_get_current.restype = ctypes.c_void_p
|
|
3920
|
+
self.core.wp_cuda_context_set_current.argtypes = [ctypes.c_void_p]
|
|
3921
|
+
self.core.wp_cuda_context_set_current.restype = None
|
|
3922
|
+
self.core.wp_cuda_context_push_current.argtypes = [ctypes.c_void_p]
|
|
3923
|
+
self.core.wp_cuda_context_push_current.restype = None
|
|
3924
|
+
self.core.wp_cuda_context_pop_current.argtypes = None
|
|
3925
|
+
self.core.wp_cuda_context_pop_current.restype = None
|
|
3926
|
+
self.core.wp_cuda_context_create.argtypes = [ctypes.c_int]
|
|
3927
|
+
self.core.wp_cuda_context_create.restype = ctypes.c_void_p
|
|
3928
|
+
self.core.wp_cuda_context_destroy.argtypes = [ctypes.c_void_p]
|
|
3929
|
+
self.core.wp_cuda_context_destroy.restype = None
|
|
3930
|
+
self.core.wp_cuda_context_synchronize.argtypes = [ctypes.c_void_p]
|
|
3931
|
+
self.core.wp_cuda_context_synchronize.restype = None
|
|
3932
|
+
self.core.wp_cuda_context_check.argtypes = [ctypes.c_void_p]
|
|
3933
|
+
self.core.wp_cuda_context_check.restype = ctypes.c_uint64
|
|
3934
|
+
|
|
3935
|
+
self.core.wp_cuda_context_get_device_ordinal.argtypes = [ctypes.c_void_p]
|
|
3936
|
+
self.core.wp_cuda_context_get_device_ordinal.restype = ctypes.c_int
|
|
3937
|
+
self.core.wp_cuda_context_is_primary.argtypes = [ctypes.c_void_p]
|
|
3938
|
+
self.core.wp_cuda_context_is_primary.restype = ctypes.c_int
|
|
3939
|
+
self.core.wp_cuda_context_get_stream.argtypes = [ctypes.c_void_p]
|
|
3940
|
+
self.core.wp_cuda_context_get_stream.restype = ctypes.c_void_p
|
|
3941
|
+
self.core.wp_cuda_context_set_stream.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
|
|
3942
|
+
self.core.wp_cuda_context_set_stream.restype = None
|
|
3714
3943
|
|
|
3715
3944
|
# peer access
|
|
3716
|
-
self.core.
|
|
3717
|
-
self.core.
|
|
3718
|
-
self.core.
|
|
3719
|
-
self.core.
|
|
3720
|
-
self.core.
|
|
3721
|
-
self.core.
|
|
3722
|
-
self.core.
|
|
3723
|
-
self.core.
|
|
3724
|
-
self.core.
|
|
3725
|
-
self.core.
|
|
3945
|
+
self.core.wp_cuda_is_peer_access_supported.argtypes = [ctypes.c_int, ctypes.c_int]
|
|
3946
|
+
self.core.wp_cuda_is_peer_access_supported.restype = ctypes.c_int
|
|
3947
|
+
self.core.wp_cuda_is_peer_access_enabled.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
3948
|
+
self.core.wp_cuda_is_peer_access_enabled.restype = ctypes.c_int
|
|
3949
|
+
self.core.wp_cuda_set_peer_access_enabled.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
|
|
3950
|
+
self.core.wp_cuda_set_peer_access_enabled.restype = ctypes.c_int
|
|
3951
|
+
self.core.wp_cuda_is_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int]
|
|
3952
|
+
self.core.wp_cuda_is_mempool_access_enabled.restype = ctypes.c_int
|
|
3953
|
+
self.core.wp_cuda_set_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
|
|
3954
|
+
self.core.wp_cuda_set_mempool_access_enabled.restype = ctypes.c_int
|
|
3726
3955
|
|
|
3727
3956
|
# inter-process communication
|
|
3728
|
-
self.core.
|
|
3729
|
-
self.core.
|
|
3730
|
-
self.core.
|
|
3731
|
-
self.core.
|
|
3732
|
-
self.core.
|
|
3733
|
-
self.core.
|
|
3734
|
-
self.core.
|
|
3957
|
+
self.core.wp_cuda_ipc_get_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
|
|
3958
|
+
self.core.wp_cuda_ipc_get_mem_handle.restype = None
|
|
3959
|
+
self.core.wp_cuda_ipc_open_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
|
|
3960
|
+
self.core.wp_cuda_ipc_open_mem_handle.restype = ctypes.c_void_p
|
|
3961
|
+
self.core.wp_cuda_ipc_close_mem_handle.argtypes = [ctypes.c_void_p]
|
|
3962
|
+
self.core.wp_cuda_ipc_close_mem_handle.restype = None
|
|
3963
|
+
self.core.wp_cuda_ipc_get_event_handle.argtypes = [
|
|
3735
3964
|
ctypes.c_void_p,
|
|
3736
3965
|
ctypes.c_void_p,
|
|
3737
3966
|
ctypes.POINTER(ctypes.c_char),
|
|
3738
3967
|
]
|
|
3739
|
-
self.core.
|
|
3740
|
-
self.core.
|
|
3741
|
-
self.core.
|
|
3742
|
-
|
|
3743
|
-
self.core.
|
|
3744
|
-
self.core.
|
|
3745
|
-
self.core.
|
|
3746
|
-
self.core.
|
|
3747
|
-
self.core.
|
|
3748
|
-
self.core.
|
|
3749
|
-
self.core.
|
|
3750
|
-
self.core.
|
|
3751
|
-
self.core.
|
|
3752
|
-
self.core.
|
|
3753
|
-
self.core.
|
|
3754
|
-
self.core.
|
|
3755
|
-
self.core.
|
|
3756
|
-
self.core.
|
|
3757
|
-
self.core.
|
|
3758
|
-
self.core.
|
|
3759
|
-
self.core.
|
|
3760
|
-
self.core.
|
|
3761
|
-
self.core.
|
|
3762
|
-
self.core.
|
|
3763
|
-
self.core.
|
|
3764
|
-
self.core.
|
|
3765
|
-
|
|
3766
|
-
self.core.
|
|
3767
|
-
self.core.
|
|
3768
|
-
self.core.
|
|
3769
|
-
self.core.
|
|
3770
|
-
self.core.
|
|
3771
|
-
self.core.
|
|
3772
|
-
self.core.
|
|
3773
|
-
self.core.
|
|
3774
|
-
self.core.
|
|
3775
|
-
self.core.
|
|
3776
|
-
self.core.
|
|
3777
|
-
self.core.
|
|
3778
|
-
|
|
3779
|
-
self.core.
|
|
3780
|
-
self.core.
|
|
3781
|
-
self.core.
|
|
3968
|
+
self.core.wp_cuda_ipc_get_event_handle.restype = None
|
|
3969
|
+
self.core.wp_cuda_ipc_open_event_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
|
|
3970
|
+
self.core.wp_cuda_ipc_open_event_handle.restype = ctypes.c_void_p
|
|
3971
|
+
|
|
3972
|
+
self.core.wp_cuda_stream_create.argtypes = [ctypes.c_void_p, ctypes.c_int]
|
|
3973
|
+
self.core.wp_cuda_stream_create.restype = ctypes.c_void_p
|
|
3974
|
+
self.core.wp_cuda_stream_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
3975
|
+
self.core.wp_cuda_stream_destroy.restype = None
|
|
3976
|
+
self.core.wp_cuda_stream_query.argtypes = [ctypes.c_void_p]
|
|
3977
|
+
self.core.wp_cuda_stream_query.restype = ctypes.c_int
|
|
3978
|
+
self.core.wp_cuda_stream_register.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
3979
|
+
self.core.wp_cuda_stream_register.restype = None
|
|
3980
|
+
self.core.wp_cuda_stream_unregister.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
3981
|
+
self.core.wp_cuda_stream_unregister.restype = None
|
|
3982
|
+
self.core.wp_cuda_stream_synchronize.argtypes = [ctypes.c_void_p]
|
|
3983
|
+
self.core.wp_cuda_stream_synchronize.restype = None
|
|
3984
|
+
self.core.wp_cuda_stream_wait_event.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
3985
|
+
self.core.wp_cuda_stream_wait_event.restype = None
|
|
3986
|
+
self.core.wp_cuda_stream_wait_stream.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
|
|
3987
|
+
self.core.wp_cuda_stream_wait_stream.restype = None
|
|
3988
|
+
self.core.wp_cuda_stream_is_capturing.argtypes = [ctypes.c_void_p]
|
|
3989
|
+
self.core.wp_cuda_stream_is_capturing.restype = ctypes.c_int
|
|
3990
|
+
self.core.wp_cuda_stream_get_capture_id.argtypes = [ctypes.c_void_p]
|
|
3991
|
+
self.core.wp_cuda_stream_get_capture_id.restype = ctypes.c_uint64
|
|
3992
|
+
self.core.wp_cuda_stream_get_priority.argtypes = [ctypes.c_void_p]
|
|
3993
|
+
self.core.wp_cuda_stream_get_priority.restype = ctypes.c_int
|
|
3994
|
+
|
|
3995
|
+
self.core.wp_cuda_event_create.argtypes = [ctypes.c_void_p, ctypes.c_uint]
|
|
3996
|
+
self.core.wp_cuda_event_create.restype = ctypes.c_void_p
|
|
3997
|
+
self.core.wp_cuda_event_destroy.argtypes = [ctypes.c_void_p]
|
|
3998
|
+
self.core.wp_cuda_event_destroy.restype = None
|
|
3999
|
+
self.core.wp_cuda_event_query.argtypes = [ctypes.c_void_p]
|
|
4000
|
+
self.core.wp_cuda_event_query.restype = ctypes.c_int
|
|
4001
|
+
self.core.wp_cuda_event_record.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_bool]
|
|
4002
|
+
self.core.wp_cuda_event_record.restype = None
|
|
4003
|
+
self.core.wp_cuda_event_synchronize.argtypes = [ctypes.c_void_p]
|
|
4004
|
+
self.core.wp_cuda_event_synchronize.restype = None
|
|
4005
|
+
self.core.wp_cuda_event_elapsed_time.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
4006
|
+
self.core.wp_cuda_event_elapsed_time.restype = ctypes.c_float
|
|
4007
|
+
|
|
4008
|
+
self.core.wp_cuda_graph_begin_capture.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
|
|
4009
|
+
self.core.wp_cuda_graph_begin_capture.restype = ctypes.c_bool
|
|
4010
|
+
self.core.wp_cuda_graph_end_capture.argtypes = [
|
|
3782
4011
|
ctypes.c_void_p,
|
|
3783
4012
|
ctypes.c_void_p,
|
|
3784
4013
|
ctypes.POINTER(ctypes.c_void_p),
|
|
3785
4014
|
]
|
|
3786
|
-
self.core.
|
|
4015
|
+
self.core.wp_cuda_graph_end_capture.restype = ctypes.c_bool
|
|
3787
4016
|
|
|
3788
|
-
self.core.
|
|
4017
|
+
self.core.wp_cuda_graph_create_exec.argtypes = [
|
|
3789
4018
|
ctypes.c_void_p,
|
|
3790
4019
|
ctypes.c_void_p,
|
|
3791
4020
|
ctypes.c_void_p,
|
|
3792
4021
|
ctypes.POINTER(ctypes.c_void_p),
|
|
3793
4022
|
]
|
|
3794
|
-
self.core.
|
|
4023
|
+
self.core.wp_cuda_graph_create_exec.restype = ctypes.c_bool
|
|
3795
4024
|
|
|
3796
|
-
self.core.
|
|
3797
|
-
self.core.
|
|
4025
|
+
self.core.wp_capture_debug_dot_print.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_uint32]
|
|
4026
|
+
self.core.wp_capture_debug_dot_print.restype = ctypes.c_bool
|
|
3798
4027
|
|
|
3799
|
-
self.core.
|
|
3800
|
-
self.core.
|
|
3801
|
-
self.core.
|
|
3802
|
-
self.core.
|
|
4028
|
+
self.core.wp_cuda_graph_launch.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
4029
|
+
self.core.wp_cuda_graph_launch.restype = ctypes.c_bool
|
|
4030
|
+
self.core.wp_cuda_graph_exec_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
4031
|
+
self.core.wp_cuda_graph_exec_destroy.restype = ctypes.c_bool
|
|
3803
4032
|
|
|
3804
|
-
self.core.
|
|
3805
|
-
self.core.
|
|
4033
|
+
self.core.wp_cuda_graph_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
4034
|
+
self.core.wp_cuda_graph_destroy.restype = ctypes.c_bool
|
|
3806
4035
|
|
|
3807
|
-
self.core.
|
|
4036
|
+
self.core.wp_cuda_graph_insert_if_else.argtypes = [
|
|
3808
4037
|
ctypes.c_void_p,
|
|
3809
4038
|
ctypes.c_void_p,
|
|
3810
4039
|
ctypes.POINTER(ctypes.c_int),
|
|
3811
4040
|
ctypes.POINTER(ctypes.c_void_p),
|
|
3812
4041
|
ctypes.POINTER(ctypes.c_void_p),
|
|
3813
4042
|
]
|
|
3814
|
-
self.core.
|
|
4043
|
+
self.core.wp_cuda_graph_insert_if_else.restype = ctypes.c_bool
|
|
3815
4044
|
|
|
3816
|
-
self.core.
|
|
4045
|
+
self.core.wp_cuda_graph_insert_while.argtypes = [
|
|
3817
4046
|
ctypes.c_void_p,
|
|
3818
4047
|
ctypes.c_void_p,
|
|
3819
4048
|
ctypes.POINTER(ctypes.c_int),
|
|
3820
4049
|
ctypes.POINTER(ctypes.c_void_p),
|
|
3821
4050
|
ctypes.POINTER(ctypes.c_uint64),
|
|
3822
4051
|
]
|
|
3823
|
-
self.core.
|
|
4052
|
+
self.core.wp_cuda_graph_insert_while.restype = ctypes.c_bool
|
|
3824
4053
|
|
|
3825
|
-
self.core.
|
|
4054
|
+
self.core.wp_cuda_graph_set_condition.argtypes = [
|
|
3826
4055
|
ctypes.c_void_p,
|
|
3827
4056
|
ctypes.c_void_p,
|
|
3828
4057
|
ctypes.POINTER(ctypes.c_int),
|
|
3829
4058
|
ctypes.c_uint64,
|
|
3830
4059
|
]
|
|
3831
|
-
self.core.
|
|
4060
|
+
self.core.wp_cuda_graph_set_condition.restype = ctypes.c_bool
|
|
3832
4061
|
|
|
3833
|
-
self.core.
|
|
4062
|
+
self.core.wp_cuda_graph_pause_capture.argtypes = [
|
|
3834
4063
|
ctypes.c_void_p,
|
|
3835
4064
|
ctypes.c_void_p,
|
|
3836
4065
|
ctypes.POINTER(ctypes.c_void_p),
|
|
3837
4066
|
]
|
|
3838
|
-
self.core.
|
|
4067
|
+
self.core.wp_cuda_graph_pause_capture.restype = ctypes.c_bool
|
|
3839
4068
|
|
|
3840
|
-
self.core.
|
|
4069
|
+
self.core.wp_cuda_graph_resume_capture.argtypes = [
|
|
3841
4070
|
ctypes.c_void_p,
|
|
3842
4071
|
ctypes.c_void_p,
|
|
3843
4072
|
ctypes.c_void_p,
|
|
3844
4073
|
]
|
|
3845
|
-
self.core.
|
|
4074
|
+
self.core.wp_cuda_graph_resume_capture.restype = ctypes.c_bool
|
|
3846
4075
|
|
|
3847
|
-
self.core.
|
|
4076
|
+
self.core.wp_cuda_graph_insert_child_graph.argtypes = [
|
|
3848
4077
|
ctypes.c_void_p,
|
|
3849
4078
|
ctypes.c_void_p,
|
|
3850
4079
|
ctypes.c_void_p,
|
|
3851
4080
|
]
|
|
3852
|
-
self.core.
|
|
4081
|
+
self.core.wp_cuda_graph_insert_child_graph.restype = ctypes.c_bool
|
|
3853
4082
|
|
|
3854
|
-
self.core.
|
|
4083
|
+
self.core.wp_cuda_graph_check_conditional_body.argtypes = [ctypes.c_void_p]
|
|
4084
|
+
self.core.wp_cuda_graph_check_conditional_body.restype = ctypes.c_bool
|
|
4085
|
+
|
|
4086
|
+
self.core.wp_cuda_compile_program.argtypes = [
|
|
3855
4087
|
ctypes.c_char_p, # cuda_src
|
|
3856
4088
|
ctypes.c_char_p, # program name
|
|
3857
4089
|
ctypes.c_int, # arch
|
|
@@ -3871,9 +4103,9 @@ class Runtime:
|
|
|
3871
4103
|
ctypes.POINTER(ctypes.c_size_t), # ltoir_sizes
|
|
3872
4104
|
ctypes.POINTER(ctypes.c_int), # ltoir_input_types, each of type nvJitLinkInputType
|
|
3873
4105
|
]
|
|
3874
|
-
self.core.
|
|
4106
|
+
self.core.wp_cuda_compile_program.restype = ctypes.c_size_t
|
|
3875
4107
|
|
|
3876
|
-
self.core.
|
|
4108
|
+
self.core.wp_cuda_compile_fft.argtypes = [
|
|
3877
4109
|
ctypes.c_char_p, # lto
|
|
3878
4110
|
ctypes.c_char_p, # function name
|
|
3879
4111
|
ctypes.c_int, # num include dirs
|
|
@@ -3886,9 +4118,9 @@ class Runtime:
|
|
|
3886
4118
|
ctypes.c_int, # precision
|
|
3887
4119
|
ctypes.POINTER(ctypes.c_int), # smem (out)
|
|
3888
4120
|
]
|
|
3889
|
-
self.core.
|
|
4121
|
+
self.core.wp_cuda_compile_fft.restype = ctypes.c_bool
|
|
3890
4122
|
|
|
3891
|
-
self.core.
|
|
4123
|
+
self.core.wp_cuda_compile_dot.argtypes = [
|
|
3892
4124
|
ctypes.c_char_p, # lto
|
|
3893
4125
|
ctypes.c_char_p, # function name
|
|
3894
4126
|
ctypes.c_int, # num include dirs
|
|
@@ -3907,9 +4139,9 @@ class Runtime:
|
|
|
3907
4139
|
ctypes.c_int, # c_arrangement
|
|
3908
4140
|
ctypes.c_int, # num threads
|
|
3909
4141
|
]
|
|
3910
|
-
self.core.
|
|
4142
|
+
self.core.wp_cuda_compile_dot.restype = ctypes.c_bool
|
|
3911
4143
|
|
|
3912
|
-
self.core.
|
|
4144
|
+
self.core.wp_cuda_compile_solver.argtypes = [
|
|
3913
4145
|
ctypes.c_char_p, # universal fatbin
|
|
3914
4146
|
ctypes.c_char_p, # lto
|
|
3915
4147
|
ctypes.c_char_p, # function name
|
|
@@ -3929,24 +4161,24 @@ class Runtime:
|
|
|
3929
4161
|
ctypes.c_int, # fill_mode
|
|
3930
4162
|
ctypes.c_int, # num threads
|
|
3931
4163
|
]
|
|
3932
|
-
self.core.
|
|
4164
|
+
self.core.wp_cuda_compile_solver.restype = ctypes.c_bool
|
|
3933
4165
|
|
|
3934
|
-
self.core.
|
|
3935
|
-
self.core.
|
|
4166
|
+
self.core.wp_cuda_load_module.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
|
|
4167
|
+
self.core.wp_cuda_load_module.restype = ctypes.c_void_p
|
|
3936
4168
|
|
|
3937
|
-
self.core.
|
|
3938
|
-
self.core.
|
|
4169
|
+
self.core.wp_cuda_unload_module.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
4170
|
+
self.core.wp_cuda_unload_module.restype = None
|
|
3939
4171
|
|
|
3940
|
-
self.core.
|
|
3941
|
-
self.core.
|
|
4172
|
+
self.core.wp_cuda_get_kernel.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_char_p]
|
|
4173
|
+
self.core.wp_cuda_get_kernel.restype = ctypes.c_void_p
|
|
3942
4174
|
|
|
3943
|
-
self.core.
|
|
3944
|
-
self.core.
|
|
4175
|
+
self.core.wp_cuda_get_max_shared_memory.argtypes = [ctypes.c_void_p]
|
|
4176
|
+
self.core.wp_cuda_get_max_shared_memory.restype = ctypes.c_int
|
|
3945
4177
|
|
|
3946
|
-
self.core.
|
|
3947
|
-
self.core.
|
|
4178
|
+
self.core.wp_cuda_configure_kernel_shared_memory.argtypes = [ctypes.c_void_p, ctypes.c_int]
|
|
4179
|
+
self.core.wp_cuda_configure_kernel_shared_memory.restype = ctypes.c_bool
|
|
3948
4180
|
|
|
3949
|
-
self.core.
|
|
4181
|
+
self.core.wp_cuda_launch_kernel.argtypes = [
|
|
3950
4182
|
ctypes.c_void_p,
|
|
3951
4183
|
ctypes.c_void_p,
|
|
3952
4184
|
ctypes.c_size_t,
|
|
@@ -3956,54 +4188,54 @@ class Runtime:
|
|
|
3956
4188
|
ctypes.POINTER(ctypes.c_void_p),
|
|
3957
4189
|
ctypes.c_void_p,
|
|
3958
4190
|
]
|
|
3959
|
-
self.core.
|
|
4191
|
+
self.core.wp_cuda_launch_kernel.restype = ctypes.c_size_t
|
|
3960
4192
|
|
|
3961
|
-
self.core.
|
|
3962
|
-
self.core.
|
|
3963
|
-
self.core.
|
|
3964
|
-
self.core.
|
|
3965
|
-
self.core.
|
|
4193
|
+
self.core.wp_cuda_graphics_map.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
4194
|
+
self.core.wp_cuda_graphics_map.restype = None
|
|
4195
|
+
self.core.wp_cuda_graphics_unmap.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
4196
|
+
self.core.wp_cuda_graphics_unmap.restype = None
|
|
4197
|
+
self.core.wp_cuda_graphics_device_ptr_and_size.argtypes = [
|
|
3966
4198
|
ctypes.c_void_p,
|
|
3967
4199
|
ctypes.c_void_p,
|
|
3968
4200
|
ctypes.POINTER(ctypes.c_uint64),
|
|
3969
4201
|
ctypes.POINTER(ctypes.c_size_t),
|
|
3970
4202
|
]
|
|
3971
|
-
self.core.
|
|
3972
|
-
self.core.
|
|
3973
|
-
self.core.
|
|
3974
|
-
self.core.
|
|
3975
|
-
self.core.
|
|
3976
|
-
|
|
3977
|
-
self.core.
|
|
3978
|
-
self.core.
|
|
3979
|
-
self.core.
|
|
3980
|
-
self.core.
|
|
3981
|
-
self.core.
|
|
3982
|
-
self.core.
|
|
3983
|
-
|
|
3984
|
-
self.core.
|
|
4203
|
+
self.core.wp_cuda_graphics_device_ptr_and_size.restype = None
|
|
4204
|
+
self.core.wp_cuda_graphics_register_gl_buffer.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint]
|
|
4205
|
+
self.core.wp_cuda_graphics_register_gl_buffer.restype = ctypes.c_void_p
|
|
4206
|
+
self.core.wp_cuda_graphics_unregister_resource.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
|
|
4207
|
+
self.core.wp_cuda_graphics_unregister_resource.restype = None
|
|
4208
|
+
|
|
4209
|
+
self.core.wp_cuda_timing_begin.argtypes = [ctypes.c_int]
|
|
4210
|
+
self.core.wp_cuda_timing_begin.restype = None
|
|
4211
|
+
self.core.wp_cuda_timing_get_result_count.argtypes = []
|
|
4212
|
+
self.core.wp_cuda_timing_get_result_count.restype = int
|
|
4213
|
+
self.core.wp_cuda_timing_end.argtypes = []
|
|
4214
|
+
self.core.wp_cuda_timing_end.restype = None
|
|
4215
|
+
|
|
4216
|
+
self.core.wp_graph_coloring.argtypes = [
|
|
3985
4217
|
ctypes.c_int,
|
|
3986
4218
|
warp.types.array_t,
|
|
3987
4219
|
ctypes.c_int,
|
|
3988
4220
|
warp.types.array_t,
|
|
3989
4221
|
]
|
|
3990
|
-
self.core.
|
|
4222
|
+
self.core.wp_graph_coloring.restype = ctypes.c_int
|
|
3991
4223
|
|
|
3992
|
-
self.core.
|
|
4224
|
+
self.core.wp_balance_coloring.argtypes = [
|
|
3993
4225
|
ctypes.c_int,
|
|
3994
4226
|
warp.types.array_t,
|
|
3995
4227
|
ctypes.c_int,
|
|
3996
4228
|
ctypes.c_float,
|
|
3997
4229
|
warp.types.array_t,
|
|
3998
4230
|
]
|
|
3999
|
-
self.core.
|
|
4231
|
+
self.core.wp_balance_coloring.restype = ctypes.c_float
|
|
4000
4232
|
|
|
4001
|
-
self.core.
|
|
4233
|
+
self.core.wp_init.restype = ctypes.c_int
|
|
4002
4234
|
|
|
4003
4235
|
except AttributeError as e:
|
|
4004
4236
|
raise RuntimeError(f"Setting C-types for {warp_lib} failed. It may need rebuilding.") from e
|
|
4005
4237
|
|
|
4006
|
-
error = self.core.
|
|
4238
|
+
error = self.core.wp_init()
|
|
4007
4239
|
|
|
4008
4240
|
if error != 0:
|
|
4009
4241
|
raise Exception("Warp initialization failed")
|
|
@@ -4019,8 +4251,8 @@ class Runtime:
|
|
|
4019
4251
|
self.device_map["cpu"] = self.cpu_device
|
|
4020
4252
|
self.context_map[None] = self.cpu_device
|
|
4021
4253
|
|
|
4022
|
-
self.is_cuda_enabled = bool(self.core.
|
|
4023
|
-
self.is_cuda_compatibility_enabled = bool(self.core.
|
|
4254
|
+
self.is_cuda_enabled = bool(self.core.wp_is_cuda_enabled())
|
|
4255
|
+
self.is_cuda_compatibility_enabled = bool(self.core.wp_is_cuda_compatibility_enabled())
|
|
4024
4256
|
|
|
4025
4257
|
self.toolkit_version = None # CTK version used to build the core lib
|
|
4026
4258
|
self.driver_version = None # installed driver version
|
|
@@ -4033,12 +4265,15 @@ class Runtime:
|
|
|
4033
4265
|
|
|
4034
4266
|
if self.is_cuda_enabled:
|
|
4035
4267
|
# get CUDA Toolkit and driver versions
|
|
4036
|
-
toolkit_version = self.core.
|
|
4037
|
-
driver_version = self.core.cuda_driver_version()
|
|
4038
|
-
|
|
4039
|
-
# save versions as tuples, e.g., (12, 4)
|
|
4268
|
+
toolkit_version = self.core.wp_cuda_toolkit_version()
|
|
4040
4269
|
self.toolkit_version = (toolkit_version // 1000, (toolkit_version % 1000) // 10)
|
|
4041
|
-
|
|
4270
|
+
|
|
4271
|
+
if self.core.wp_cuda_driver_is_initialized():
|
|
4272
|
+
# save versions as tuples, e.g., (12, 4)
|
|
4273
|
+
driver_version = self.core.wp_cuda_driver_version()
|
|
4274
|
+
self.driver_version = (driver_version // 1000, (driver_version % 1000) // 10)
|
|
4275
|
+
else:
|
|
4276
|
+
self.driver_version = None
|
|
4042
4277
|
|
|
4043
4278
|
# determine minimum required driver version
|
|
4044
4279
|
if self.is_cuda_compatibility_enabled:
|
|
@@ -4052,18 +4287,18 @@ class Runtime:
|
|
|
4052
4287
|
self.min_driver_version = self.toolkit_version
|
|
4053
4288
|
|
|
4054
4289
|
# determine if the installed driver is sufficient
|
|
4055
|
-
if self.driver_version >= self.min_driver_version:
|
|
4290
|
+
if self.driver_version is not None and self.driver_version >= self.min_driver_version:
|
|
4056
4291
|
# get all architectures supported by NVRTC
|
|
4057
|
-
num_archs = self.core.
|
|
4292
|
+
num_archs = self.core.wp_nvrtc_supported_arch_count()
|
|
4058
4293
|
if num_archs > 0:
|
|
4059
4294
|
archs = (ctypes.c_int * num_archs)()
|
|
4060
|
-
self.core.
|
|
4295
|
+
self.core.wp_nvrtc_supported_archs(archs)
|
|
4061
4296
|
self.nvrtc_supported_archs = set(archs)
|
|
4062
4297
|
else:
|
|
4063
4298
|
self.nvrtc_supported_archs = set()
|
|
4064
4299
|
|
|
4065
4300
|
# get CUDA device count
|
|
4066
|
-
cuda_device_count = self.core.
|
|
4301
|
+
cuda_device_count = self.core.wp_cuda_device_get_count()
|
|
4067
4302
|
|
|
4068
4303
|
# register primary CUDA devices
|
|
4069
4304
|
for i in range(cuda_device_count):
|
|
@@ -4080,7 +4315,7 @@ class Runtime:
|
|
|
4080
4315
|
# set default device
|
|
4081
4316
|
if cuda_device_count > 0:
|
|
4082
4317
|
# stick with the current cuda context, if one is bound
|
|
4083
|
-
initial_context = self.core.
|
|
4318
|
+
initial_context = self.core.wp_cuda_context_get_current()
|
|
4084
4319
|
if initial_context is not None:
|
|
4085
4320
|
self.set_default_device("cuda")
|
|
4086
4321
|
# if this is a non-primary context that was just registered, update the device count
|
|
@@ -4133,6 +4368,8 @@ class Runtime:
|
|
|
4133
4368
|
if not self.is_cuda_enabled:
|
|
4134
4369
|
# Warp was compiled without CUDA support
|
|
4135
4370
|
greeting.append(" CUDA not enabled in this build")
|
|
4371
|
+
elif self.driver_version is None:
|
|
4372
|
+
greeting.append(" CUDA driver not found or failed to initialize")
|
|
4136
4373
|
elif self.driver_version < self.min_driver_version:
|
|
4137
4374
|
# insufficient CUDA driver version
|
|
4138
4375
|
greeting.append(
|
|
@@ -4176,7 +4413,7 @@ class Runtime:
|
|
|
4176
4413
|
access_vector.append(1)
|
|
4177
4414
|
else:
|
|
4178
4415
|
peer_device = self.cuda_devices[j]
|
|
4179
|
-
can_access = self.core.
|
|
4416
|
+
can_access = self.core.wp_cuda_is_peer_access_supported(
|
|
4180
4417
|
target_device.ordinal, peer_device.ordinal
|
|
4181
4418
|
)
|
|
4182
4419
|
access_vector.append(can_access)
|
|
@@ -4201,7 +4438,7 @@ class Runtime:
|
|
|
4201
4438
|
|
|
4202
4439
|
if cuda_device_count > 0:
|
|
4203
4440
|
# ensure initialization did not change the initial context (e.g. querying available memory)
|
|
4204
|
-
self.core.
|
|
4441
|
+
self.core.wp_cuda_context_set_current(initial_context)
|
|
4205
4442
|
|
|
4206
4443
|
# detect possible misconfiguration of the system
|
|
4207
4444
|
devices_without_uva = []
|
|
@@ -4229,7 +4466,7 @@ class Runtime:
|
|
|
4229
4466
|
elif self.is_cuda_enabled:
|
|
4230
4467
|
# Report a warning about insufficient driver version. The warning should appear even in quiet mode
|
|
4231
4468
|
# when the greeting message is suppressed. Also try to provide guidance for resolving the situation.
|
|
4232
|
-
if self.driver_version < self.min_driver_version:
|
|
4469
|
+
if self.driver_version is not None and self.driver_version < self.min_driver_version:
|
|
4233
4470
|
msg = []
|
|
4234
4471
|
msg.append("\n Insufficient CUDA driver version.")
|
|
4235
4472
|
msg.append(
|
|
@@ -4240,7 +4477,7 @@ class Runtime:
|
|
|
4240
4477
|
warp.utils.warn("\n ".join(msg))
|
|
4241
4478
|
|
|
4242
4479
|
def get_error_string(self):
|
|
4243
|
-
return self.core.
|
|
4480
|
+
return self.core.wp_get_error_string().decode("utf-8")
|
|
4244
4481
|
|
|
4245
4482
|
def load_dll(self, dll_path):
|
|
4246
4483
|
try:
|
|
@@ -4276,21 +4513,21 @@ class Runtime:
         self.default_device = self.get_device(ident)

     def get_current_cuda_device(self) -> Device:
-        current_context = self.core.
+        current_context = self.core.wp_cuda_context_get_current()
         if current_context is not None:
             current_device = self.context_map.get(current_context)
             if current_device is not None:
                 # this is a known device
                 return current_device
-            elif self.core.
+            elif self.core.wp_cuda_context_is_primary(current_context):
                 # this is a primary context that we haven't used yet
-                ordinal = self.core.
+                ordinal = self.core.wp_cuda_context_get_device_ordinal(current_context)
                 device = self.cuda_devices[ordinal]
                 self.context_map[current_context] = device
                 return device
             else:
                 # this is an unseen non-primary context, register it as a new device with a unique alias
-                ordinal = self.core.
+                ordinal = self.core.wp_cuda_context_get_device_ordinal(current_context)
                 alias = f"cuda:{ordinal}.{self.cuda_custom_context_count[ordinal]}"
                 self.cuda_custom_context_count[ordinal] += 1
                 return self.map_cuda_device(alias, current_context)
@@ -4313,7 +4550,7 @@ class Runtime:

     def map_cuda_device(self, alias, context=None) -> Device:
         if context is None:
-            context = self.core.
+            context = self.core.wp_cuda_context_get_current()
             if context is None:
                 raise RuntimeError(f"Unable to determine CUDA context for device alias '{alias}'")

@@ -4335,10 +4572,10 @@ class Runtime:
             # it's an unmapped context

             # get the device ordinal
-            ordinal = self.core.
+            ordinal = self.core.wp_cuda_context_get_device_ordinal(context)

             # check if this is a primary context (we could get here if it's a device that hasn't been used yet)
-            if self.core.
+            if self.core.wp_cuda_context_is_primary(context):
                 # rename the device
                 device = self.cuda_primary_devices[ordinal]
                 return self.rename_device(device, alias)
@@ -4369,7 +4606,7 @@ class Runtime:
         if not device.is_cuda:
             return

-        err = self.core.
+        err = self.core.wp_cuda_context_check(device.context)
         if err != 0:
             raise RuntimeError(f"CUDA error detected: {err}")

@@ -4401,7 +4638,7 @@ def is_cuda_driver_initialized() -> bool:
     """
     init()

-    return runtime.core.
+    return runtime.core.wp_cuda_driver_is_initialized()


 def get_devices() -> list[Device]:
@@ -4609,7 +4846,7 @@ def set_mempool_release_threshold(device: Devicelike, threshold: int | float) ->
     elif threshold > 0 and threshold <= 1:
         threshold = int(threshold * device.total_memory)

-    if not runtime.core.
+    if not runtime.core.wp_cuda_device_set_mempool_release_threshold(device.ordinal, threshold):
         raise RuntimeError(f"Failed to set memory pool release threshold for device {device}")


@@ -4639,7 +4876,7 @@ def get_mempool_release_threshold(device: Devicelike = None) -> int:
     if not device.is_mempool_supported:
         raise RuntimeError(f"Device {device} does not support memory pools")

-    return runtime.core.
+    return runtime.core.wp_cuda_device_get_mempool_release_threshold(device.ordinal)


 def get_mempool_used_mem_current(device: Devicelike = None) -> int:
@@ -4668,7 +4905,7 @@ def get_mempool_used_mem_current(device: Devicelike = None) -> int:
     if not device.is_mempool_supported:
         raise RuntimeError(f"Device {device} does not support memory pools")

-    return runtime.core.
+    return runtime.core.wp_cuda_device_get_mempool_used_mem_current(device.ordinal)


 def get_mempool_used_mem_high(device: Devicelike = None) -> int:
@@ -4697,7 +4934,7 @@ def get_mempool_used_mem_high(device: Devicelike = None) -> int:
     if not device.is_mempool_supported:
         raise RuntimeError(f"Device {device} does not support memory pools")

-    return runtime.core.
+    return runtime.core.wp_cuda_device_get_mempool_used_mem_high(device.ordinal)


 def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike) -> bool:
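For reference, a minimal usage sketch of the memory-pool API that these renamed wp_cuda_device_* bindings sit behind; the device alias "cuda:0" and the 0.5 fraction are illustrative values, not taken from the diff:

    import warp as wp

    wp.init()

    # the threshold can be an absolute byte count or a fraction of total device memory
    wp.set_mempool_release_threshold("cuda:0", 0.5)

    print(wp.get_mempool_release_threshold("cuda:0"))
    print(wp.get_mempool_used_mem_current("cuda:0"))
    print(wp.get_mempool_used_mem_high("cuda:0"))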
@@ -4718,7 +4955,7 @@ def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike)
     if not target_device.is_cuda or not peer_device.is_cuda:
         return False

-    return bool(runtime.core.
+    return bool(runtime.core.wp_cuda_is_peer_access_supported(target_device.ordinal, peer_device.ordinal))


 def is_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike) -> bool:
@@ -4739,7 +4976,7 @@ def is_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike) -
     if not target_device.is_cuda or not peer_device.is_cuda:
         return False

-    return bool(runtime.core.
+    return bool(runtime.core.wp_cuda_is_peer_access_enabled(target_device.context, peer_device.context))


 def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike, enable: bool) -> None:
@@ -4769,7 +5006,7 @@ def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike,
     else:
         return

-    if not runtime.core.
+    if not runtime.core.wp_cuda_set_peer_access_enabled(target_device.context, peer_device.context, int(enable)):
         action = "enable" if enable else "disable"
         raise RuntimeError(f"Failed to {action} peer access from device {peer_device} to device {target_device}")

@@ -4810,7 +5047,7 @@ def is_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike
     if not peer_device.is_cuda or not target_device.is_cuda or not target_device.is_mempool_supported:
         return False

-    return bool(runtime.core.
+    return bool(runtime.core.wp_cuda_is_mempool_access_enabled(target_device.ordinal, peer_device.ordinal))


 def set_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike, enable: bool) -> None:
@@ -4843,7 +5080,7 @@ def set_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelik
     else:
         return

-    if not runtime.core.
+    if not runtime.core.wp_cuda_set_mempool_access_enabled(target_device.ordinal, peer_device.ordinal, int(enable)):
         action = "enable" if enable else "disable"
         raise RuntimeError(f"Failed to {action} memory pool access from device {peer_device} to device {target_device}")

@@ -4924,7 +5161,7 @@ def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: bo
     if synchronize:
         synchronize_event(end_event)

-    return runtime.core.
+    return runtime.core.wp_cuda_event_elapsed_time(start_event.cuda_event, end_event.cuda_event)


 def wait_stream(other_stream: Stream, event: Event | None = None):
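For reference, a sketch of the peer-access and event-timing helpers touched above. It assumes a machine with two CUDA devices; the enable_timing flag and Stream.record_event calls are assumptions about the surrounding API rather than something shown in this diff:

    import warp as wp

    wp.init()

    # peer-to-peer access between two GPUs (skipped if unsupported)
    if wp.is_peer_access_supported("cuda:0", "cuda:1"):
        wp.set_peer_access_enabled("cuda:0", "cuda:1", True)

    # timing a stretch of work with CUDA events
    device = wp.get_device("cuda:0")
    start = wp.Event(device, enable_timing=True)
    end = wp.Event(device, enable_timing=True)

    stream = wp.get_stream(device)
    stream.record_event(start)
    # ... enqueue kernels on `stream` here ...
    stream.record_event(end)

    print(wp.get_event_elapsed_time(start, end))  # elapsed time in milliseconds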
@@ -5018,7 +5255,7 @@ class RegisteredGLBuffer:
         self.context = self.device.context
         self.flags = flags
         self.fallback_to_copy = fallback_to_copy
-        self.resource = runtime.core.
+        self.resource = runtime.core.wp_cuda_graphics_register_gl_buffer(self.context, gl_buffer_id, flags)
         if self.resource is None:
             if self.fallback_to_copy:
                 self.warp_buffer = None
@@ -5037,7 +5274,7 @@ class RegisteredGLBuffer:

         # use CUDA context guard to avoid side effects during garbage collection
         with self.device.context_guard:
-            runtime.core.
+            runtime.core.wp_cuda_graphics_unregister_resource(self.context, self.resource)

     def map(self, dtype, shape) -> warp.array:
         """Map the OpenGL buffer to a Warp array.
@@ -5050,10 +5287,10 @@ class RegisteredGLBuffer:
             A Warp array object representing the mapped OpenGL buffer.
         """
         if self.resource is not None:
-            runtime.core.
+            runtime.core.wp_cuda_graphics_map(self.context, self.resource)
             ptr = ctypes.c_uint64(0)
             size = ctypes.c_size_t(0)
-            runtime.core.
+            runtime.core.wp_cuda_graphics_device_ptr_and_size(
                 self.context, self.resource, ctypes.byref(ptr), ctypes.byref(size)
             )
             return warp.array(ptr=ptr.value, dtype=dtype, shape=shape, device=self.device)
@@ -5078,7 +5315,7 @@ class RegisteredGLBuffer:
     def unmap(self):
         """Unmap the OpenGL buffer."""
         if self.resource is not None:
-            runtime.core.
+            runtime.core.wp_cuda_graphics_unmap(self.context, self.resource)
         elif self.fallback_to_copy:
             if self.warp_buffer is None:
                 raise RuntimeError("RegisteredGLBuffer first has to be mapped")
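For reference, a sketch of how RegisteredGLBuffer is typically used around these map/unmap bindings. It assumes an OpenGL context already exists and that gl_buffer_id is a buffer handle created elsewhere (e.g. via glGenBuffers), so it is not runnable on its own, and the constructor arguments beyond gl_buffer_id are assumptions:

    import warp as wp

    wp.init()

    gl_buffer_id = ...  # placeholder: an existing OpenGL buffer object id

    buf = wp.RegisteredGLBuffer(gl_buffer_id, device="cuda:0")

    # expose the GL buffer as a Warp array, write to it, then hand it back to GL
    points = buf.map(dtype=wp.vec3, shape=(1024,))
    points.zero_()
    buf.unmap()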
@@ -5434,7 +5671,7 @@ def event_from_ipc_handle(handle, device: Devicelike = None) -> Event:
         raise RuntimeError(f"IPC is not supported on device {device}.")

     event = Event(
-        device=device, cuda_event=warp.context.runtime.core.
+        device=device, cuda_event=warp.context.runtime.core.wp_cuda_ipc_open_event_handle(device.context, handle)
     )
     # Events created from IPC handles must be freed with cuEventDestroy
     event.owner = True
@@ -5566,6 +5803,44 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
         ) from e


+# invoke a CPU kernel by passing the parameters as a ctypes structure
+def invoke(kernel, hooks, params: Sequence[Any], adjoint: bool):
+    fields = []
+
+    for i in range(0, len(kernel.adj.args)):
+        arg_name = kernel.adj.args[i].label
+        field = (arg_name, type(params[1 + i]))  # skip the first argument, which is the launch bounds
+        fields.append(field)
+
+    ArgsStruct = type("ArgsStruct", (ctypes.Structure,), {"_fields_": fields})
+
+    args = ArgsStruct()
+    for i, field in enumerate(fields):
+        name = field[0]
+        setattr(args, name, params[1 + i])
+
+    if not adjoint:
+        hooks.forward(params[0], ctypes.byref(args))
+
+    # for adjoint kernels the adjoint arguments are passed through a second struct
+    else:
+        adj_fields = []
+
+        for i in range(0, len(kernel.adj.args)):
+            arg_name = kernel.adj.args[i].label
+            field = (arg_name, type(params[1 + len(fields) + i]))  # skip the first argument, which is the launch bounds
+            adj_fields.append(field)
+
+        AdjArgsStruct = type("AdjArgsStruct", (ctypes.Structure,), {"_fields_": adj_fields})
+
+        adj_args = AdjArgsStruct()
+        for i, field in enumerate(adj_fields):
+            name = field[0]
+            setattr(adj_args, name, params[1 + len(fields) + i])
+
+        hooks.backward(params[0], ctypes.byref(args), ctypes.byref(adj_args))
+
+
 class Launch:
     """Represents all data required for a kernel launch so that launches can be replayed quickly.

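For reference, the new invoke() helper above is what a CPU launch ultimately calls into; a minimal end-to-end sketch that exercises that path (the kernel and array here are illustrative, not taken from the diff):

    import warp as wp


    @wp.kernel
    def scale(x: wp.array(dtype=float), s: float):
        i = wp.tid()
        x[i] = x[i] * s


    wp.init()

    x = wp.ones(16, dtype=float, device="cpu")
    wp.launch(scale, dim=16, inputs=[x, 2.0], device="cpu")
    print(x.numpy())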
@@ -5758,24 +6033,21 @@ class Launch:
             stream: The stream to launch on.
         """
         if self.device.is_cpu:
-
-            self.hooks.backward(*self.params)
-        else:
-            self.hooks.forward(*self.params)
+            invoke(self.kernel, self.hooks, self.params, self.adjoint)
         else:
             if stream is None:
                 stream = self.device.stream

             # If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
             # before the captured graph is released.
-            if len(runtime.captures) > 0 and runtime.core.
-                capture_id = runtime.core.
+            if len(runtime.captures) > 0 and runtime.core.wp_cuda_stream_is_capturing(stream.cuda_stream):
+                capture_id = runtime.core.wp_cuda_stream_get_capture_id(stream.cuda_stream)
                 graph = runtime.captures.get(capture_id)
                 if graph is not None:
                     graph.retain_module_exec(self.module_exec)

             if self.adjoint:
-                runtime.core.
+                runtime.core.wp_cuda_launch_kernel(
                     self.device.context,
                     self.hooks.backward,
                     self.bounds.size,
@@ -5786,7 +6058,7 @@ class Launch:
                     stream.cuda_stream,
                 )
             else:
-                runtime.core.
+                runtime.core.wp_cuda_launch_kernel(
                     self.device.context,
                     self.hooks.forward,
                     self.bounds.size,
@@ -5905,7 +6177,7 @@ def launch(
         # late bind
         hooks = module_exec.get_kernel_hooks(kernel)

-        pack_args(fwd_args, params)
+        pack_args(fwd_args, params, adjoint=False)
         pack_args(adj_args, params, adjoint=True)

         # run kernel
@@ -5916,38 +6188,25 @@ def launch(
                     f"Failed to find backward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
                 )

-            if record_cmd:
-                launch = Launch(
-                    kernel=kernel,
-                    hooks=hooks,
-                    params=params,
-                    params_addr=None,
-                    bounds=bounds,
-                    device=device,
-                    adjoint=adjoint,
-                )
-                return launch
-            hooks.backward(*params)
-
         else:
             if hooks.forward is None:
                 raise RuntimeError(
                     f"Failed to find forward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
                 )

-
-
-
-
-
-
-
-
-
-
-
-
-
+        if record_cmd:
+            launch = Launch(
+                kernel=kernel,
+                hooks=hooks,
+                params=params,
+                params_addr=None,
+                bounds=bounds,
+                device=device,
+                adjoint=adjoint,
+            )
+            return launch
+
+        invoke(kernel, hooks, params, adjoint)

     else:
         kernel_args = [ctypes.c_void_p(ctypes.addressof(x)) for x in params]
@@ -5958,8 +6217,8 @@ def launch(

         # If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
         # before the captured graph is released.
-        if len(runtime.captures) > 0 and runtime.core.
-            capture_id = runtime.core.
+        if len(runtime.captures) > 0 and runtime.core.wp_cuda_stream_is_capturing(stream.cuda_stream):
+            capture_id = runtime.core.wp_cuda_stream_get_capture_id(stream.cuda_stream)
             graph = runtime.captures.get(capture_id)
             if graph is not None:
                 graph.retain_module_exec(module_exec)
@@ -5984,7 +6243,7 @@ def launch(
             )
             return launch
         else:
-            runtime.core.
+            runtime.core.wp_cuda_launch_kernel(
                 device.context,
                 hooks.backward,
                 bounds.size,
@@ -6015,7 +6274,7 @@ def launch(
             return launch
         else:
             # launch
-            runtime.core.
+            runtime.core.wp_cuda_launch_kernel(
                 device.context,
                 hooks.forward,
                 bounds.size,
@@ -6117,7 +6376,7 @@ def synchronize():

     if is_cuda_driver_initialized():
         # save the original context to avoid side effects
-        saved_context = runtime.core.
+        saved_context = runtime.core.wp_cuda_context_get_current()

         # TODO: only synchronize devices that have outstanding work
         for device in runtime.cuda_devices:
@@ -6126,10 +6385,10 @@ def synchronize():
             if device.is_capturing:
                 raise RuntimeError(f"Cannot synchronize device {device} while graph capture is active")

-            runtime.core.
+            runtime.core.wp_cuda_context_synchronize(device.context)

         # restore the original context to avoid side effects
-        runtime.core.
+        runtime.core.wp_cuda_context_set_current(saved_context)


 def synchronize_device(device: Devicelike = None):
@@ -6147,7 +6406,7 @@ def synchronize_device(device: Devicelike = None):
         if device.is_capturing:
             raise RuntimeError(f"Cannot synchronize device {device} while graph capture is active")

-        runtime.core.
+        runtime.core.wp_cuda_context_synchronize(device.context)


 def synchronize_stream(stream_or_device: Stream | Devicelike | None = None):
@@ -6165,7 +6424,7 @@ def synchronize_stream(stream_or_device: Stream | Devicelike | None = None):
     else:
         stream = runtime.get_device(stream_or_device).stream

-    runtime.core.
+    runtime.core.wp_cuda_stream_synchronize(stream.cuda_stream)


 def synchronize_event(event: Event):
@@ -6177,20 +6436,25 @@ def synchronize_event(event: Event):
         event: Event to wait for.
     """

-    runtime.core.
+    runtime.core.wp_cuda_event_synchronize(event.cuda_event)


-def force_load(
+def force_load(
+    device: Device | str | list[Device] | list[str] | None = None,
+    modules: list[Module] | None = None,
+    block_dim: int | None = None,
+):
     """Force user-defined kernels to be compiled and loaded

     Args:
         device: The device or list of devices to load the modules on. If None, load on all devices.
         modules: List of modules to load. If None, load all imported modules.
+        block_dim: The number of threads per block (always 1 for "cpu" devices).
     """

     if is_cuda_driver_initialized():
         # save original context to avoid side effects
-        saved_context = runtime.core.
+        saved_context = runtime.core.wp_cuda_context_get_current()

     if device is None:
         devices = get_devices()
@@ -6204,22 +6468,26 @@ def force_load(device: Device | str | list[Device] | list[str] | None = None, mo

     for d in devices:
         for m in modules:
-            m.load(d)
+            m.load(d, block_dim=block_dim)

     if is_cuda_available():
         # restore original context to avoid side effects
-        runtime.core.
+        runtime.core.wp_cuda_context_set_current(saved_context)


 def load_module(
-    module: Module | types.ModuleType | str | None = None,
+    module: Module | types.ModuleType | str | None = None,
+    device: Device | str | None = None,
+    recursive: bool = False,
+    block_dim: int | None = None,
 ):
-    """Force user-defined module to be compiled and loaded
+    """Force a user-defined module to be compiled and loaded

     Args:
         module: The module to load. If None, load the current module.
         device: The device to load the modules on. If None, load on all devices.
         recursive: Whether to load submodules. E.g., if the given module is `warp.sim`, this will also load `warp.sim.model`, `warp.sim.articulation`, etc.
+        block_dim: The number of threads per block (always 1 for "cpu" devices).

     Note: A module must be imported before it can be loaded by this function.
     """
@@ -6240,9 +6508,13 @@ def load_module(
     modules = []

     # add the given module, if found
-
-
-    modules.append(
+    if isinstance(module, Module):
+        # this ensures that we can load "unique" or procedural modules, which aren't added to `user_modules` by name
+        modules.append(module)
+    else:
+        m = user_modules.get(module_name)
+        if m is not None:
+            modules.append(m)

     # add submodules, if recursive
     if recursive:
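For reference, the new block_dim argument threads through to Module.load(); a short sketch of the public entry points changed above (the value 256 is illustrative):

    import warp as wp

    wp.init()

    # compile and load the calling module for one device with a custom block size
    wp.load_module(device="cuda:0", block_dim=256)

    # or preload every imported module on all devices
    wp.force_load(block_dim=256)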
@@ -6251,7 +6523,203 @@ def load_module(
             if name.startswith(prefix):
                 modules.append(mod)

-    force_load(device=device, modules=modules)
+    force_load(device=device, modules=modules, block_dim=block_dim)
+
+
+def _resolve_module(module: Module | types.ModuleType | str) -> Module:
+    """Resolve a module from a string, Module, or types.ModuleType.
+
+    Args:
+        module: The module to resolve.
+
+    Returns:
+        The resolved module.
+
+    Raises:
+        TypeError: If the module argument is not a Module, a types.ModuleType, or a string.
+    """
+
+    if isinstance(module, str):
+        module_object = get_module(module)
+    elif isinstance(module, Module):
+        module_object = module
+    elif isinstance(module, types.ModuleType):
+        module_object = get_module(module.__name__)
+    else:
+        raise TypeError(f"Argument 'module' must be a Module or a string, got {type(module)}")
+
+    return module_object
+
+
+def compile_aot_module(
+    module: Module | types.ModuleType | str,
+    device: Device | str | list[Device] | list[str] | None = None,
+    arch: int | Iterable[int] | None = None,
+    module_dir: str | os.PathLike | None = None,
+    use_ptx: bool | None = None,
+    strip_hash: bool | None = None,
+) -> None:
+    """Compile a module (ahead of time) for a given device.
+
+    Args:
+        module: The module to compile.
+        device: The device or devices to compile the module for. If ``None``,
+            and ``arch`` is not specified, compile the module for the current device.
+        arch: The architecture or architectures to compile the module for. If ``None``,
+            the architecture to compile for will be inferred from the current device.
+        module_dir: The directory to save the source, meta, and compiled files to.
+            If not specified, the module will be compiled to the default cache directory.
+        use_ptx: Whether to compile the module to PTX. This setting is only used
+            when compiling modules for the GPU. If ``None``, Warp will decide an
+            appropriate setting based on the runtime environment.
+        strip_hash: Whether to strip the hash from the module and kernel names.
+            Setting this value to ``True`` or ``False`` will update the module's
+            ``"strip_hash"`` option. If left at ``None``, the current value will
+            be used.
+
+    Warning: Do not enable ``strip_hash`` for modules that contain generic
+        kernels. Generic kernels compile to multiple overloads, and the
+        per-overload hash is required to distinguish them. Stripping the hash
+        in this case will cause the module to fail to compile.
+
+    Raises:
+        TypeError: If the module argument is not a Module, a types.ModuleType, or a string.
+    """
+
+    if is_cuda_driver_initialized():
+        # save original context to avoid side effects
+        saved_context = runtime.core.wp_cuda_context_get_current()
+
+    module_object = _resolve_module(module)
+
+    if strip_hash is not None:
+        module_object.options["strip_hash"] = strip_hash
+
+    if device is None and arch:
+        # User provided no device, but an arch, so we will not compile for the default device
+        devices = []
+    elif isinstance(device, list):
+        devices = [get_device(device_item) for device_item in device]
+    else:
+        devices = [get_device(device)]
+
+    for d in devices:
+        module_object.compile(d, module_dir, use_ptx=use_ptx)
+
+    if arch:
+        if isinstance(arch, str) or not hasattr(arch, "__iter__"):
+            arch = [arch]
+
+        for arch_value in arch:
+            module_object.compile(None, module_dir, output_arch=arch_value, use_ptx=use_ptx)
+
+    if is_cuda_available():
+        # restore original context to avoid side effects
+        runtime.core.wp_cuda_context_set_current(saved_context)
+
+
+def load_aot_module(
+    module: Module | types.ModuleType | str,
+    device: Device | str | list[Device] | list[str] | None = None,
+    arch: int | None = None,
+    module_dir: str | os.PathLike | None = None,
+    use_ptx: bool | None = None,
+    strip_hash: bool = False,
+) -> None:
+    """Load a previously compiled module (ahead of time).
+
+    Args:
+        module: The module to load.
+        device: The device or devices to load the module on. If ``None``,
+            load the module for the current device.
+        arch: The architecture to load the module for on all devices.
+            If ``None``, the architecture to load for will be inferred from the
+            current device.
+        module_dir: The directory to load the module from.
+            If not specified, the module will be loaded from the default cache directory.
+        use_ptx: Whether to load the module from PTX. This setting is only used
+            when loading modules for the GPU. If ``None`` on a CUDA device, Warp will
+            try both PTX and CUBIN (PTX first) and load the first that exists.
+            If neither exists, a ``FileNotFoundError`` is raised listing all
+            attempted paths.
+        strip_hash: Whether to strip the hash from the module and kernel names.
+            Setting this value to ``True`` or ``False`` will update the module's
+            ``"strip_hash"`` option. If left at ``None``, the current value will
+            be used.
+
+    Warning: Do not enable ``strip_hash`` for modules that contain generic
+        kernels. Generic kernels compile to multiple overloads, and the
+        per-overload hash is required to distinguish them. Stripping the hash
+        in this case will cause the module to fail to compile.
+
+    Raises:
+        FileNotFoundError: If no matching binary is found. When ``use_ptx`` is
+            ``None`` on a CUDA device, both PTX and CUBIN candidates are tried
+            before raising.
+        TypeError: If the module argument is not a Module, a types.ModuleType, or a string.
+    """
+
+    if is_cuda_driver_initialized():
+        # save original context to avoid side effects
+        saved_context = runtime.core.wp_cuda_context_get_current()
+
+    if device is None:
+        devices = [runtime.get_device()]
+    elif isinstance(device, list):
+        devices = [get_device(device_item) for device_item in device]
+    else:
+        devices = [get_device(device)]
+
+    module_object = _resolve_module(module)
+
+    if strip_hash is not None:
+        module_object.options["strip_hash"] = strip_hash
+
+    if module_dir is None:
+        module_dir = os.path.join(warp.config.kernel_cache_dir, module_object.get_module_identifier())
+    else:
+        module_dir = os.fspath(module_dir)
+
+    for d in devices:
+        # Identify the files in the cache to load
+        if arch is None:
+            output_arch = module_object.get_compile_arch(d)
+        else:
+            output_arch = arch
+
+        meta_path = os.path.join(module_dir, module_object.get_meta_name())
+
+        # Determine candidate binaries to try
+        tried_paths = []
+        binary_path = None
+        if d.is_cuda and use_ptx is None:
+            candidate_flags = (True, False)  # try PTX first, then CUBIN
+        else:
+            candidate_flags = (use_ptx,)
+
+        for candidate_use_ptx in candidate_flags:
+            candidate_path = os.path.join(
+                module_dir, module_object.get_compile_output_name(d, output_arch, candidate_use_ptx)
+            )
+            tried_paths.append(candidate_path)
+            if os.path.exists(candidate_path):
+                binary_path = candidate_path
+                break
+
+        if binary_path is None:
+            raise FileNotFoundError(f"Binary file not found. Tried: {', '.join(tried_paths)}")
+
+        module_object.load(
+            d,
+            block_dim=module_object.options["block_dim"],
+            binary_path=binary_path,
+            output_arch=output_arch,
+            meta_path=meta_path,
+        )
+
+    if is_cuda_available():
+        # restore original context to avoid side effects
+        runtime.core.wp_cuda_context_set_current(saved_context)


 def set_module_options(options: dict[str, Any], module: Any = None):
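For reference, a sketch of the ahead-of-time workflow these new functions enable, assuming they are re-exported at the package level as wp.compile_aot_module / wp.load_aot_module; the architecture value, directory, and module choice are illustrative:

    import warp as wp

    wp.init()

    # build step: compile the current module's kernels for SM 8.6 into a chosen directory
    wp.compile_aot_module(__name__, arch=86, module_dir="./aot_cache", use_ptx=False)

    # deploy step (possibly on another machine with a matching GPU): load the prebuilt binaries
    wp.load_aot_module(__name__, arch=86, module_dir="./aot_cache", use_ptx=False)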
@@ -6381,10 +6849,10 @@ def capture_begin(
     if force_module_load:
         force_load(device)

-    if not runtime.core.
+    if not runtime.core.wp_cuda_graph_begin_capture(device.context, stream.cuda_stream, int(external)):
         raise RuntimeError(runtime.get_error_string())

-    capture_id = runtime.core.
+    capture_id = runtime.core.wp_cuda_stream_get_capture_id(stream.cuda_stream)
     graph = Graph(device, capture_id)

     _register_capture(device, stream, graph, capture_id)
@@ -6419,7 +6887,7 @@ def capture_end(device: Devicelike = None, stream: Stream | None = None) -> Grap

     # get the graph executable
     g = ctypes.c_void_p()
-    result = runtime.core.
+    result = runtime.core.wp_cuda_graph_end_capture(device.context, stream.cuda_stream, ctypes.byref(g))

     if not result:
         # A concrete error should've already been reported, so we don't need to go into details here
@@ -6440,7 +6908,7 @@ def capture_debug_dot_print(graph: Graph, path: str, verbose: bool = False):
         path: Path to save the DOT file
         verbose: Whether to include additional debug information in the output
     """
-    if not runtime.core.
+    if not runtime.core.wp_capture_debug_dot_print(graph.graph, path.encode(), 0 if verbose else 1):
         raise RuntimeError(f"Graph debug dot print error: {runtime.get_error_string()}")


@@ -6473,7 +6941,7 @@ def capture_pause(device: Devicelike = None, stream: Stream | None = None) -> Gr
     _unregister_capture(device, stream, graph)

     g = ctypes.c_void_p()
-    if not runtime.core.
+    if not runtime.core.wp_cuda_graph_pause_capture(device.context, stream.cuda_stream, ctypes.byref(g)):
         raise RuntimeError(runtime.get_error_string())

     graph.graph = g
@@ -6490,10 +6958,10 @@ def capture_resume(graph: Graph, device: Devicelike = None, stream: Stream | Non
             raise RuntimeError("Must be a CUDA device")
         stream = device.stream

-    if not runtime.core.
+    if not runtime.core.wp_cuda_graph_resume_capture(device.context, stream.cuda_stream, graph.graph):
         raise RuntimeError(runtime.get_error_string())

-    capture_id = runtime.core.
+    capture_id = runtime.core.wp_cuda_stream_get_capture_id(stream.cuda_stream)
     graph.capture_id = capture_id

     _register_capture(device, stream, graph, capture_id)
@@ -6576,15 +7044,13 @@ def capture_if(

         return

-    graph.has_conditional = True
-
     # ensure conditional graph nodes are supported
     assert_conditional_graph_support()

     # insert conditional node
     graph_on_true = ctypes.c_void_p()
     graph_on_false = ctypes.c_void_p()
-    if not runtime.core.
+    if not runtime.core.wp_cuda_graph_insert_if_else(
         device.context,
         stream.cuda_stream,
         ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
@@ -6607,11 +7073,7 @@ def capture_if(
     if isinstance(on_true, Callable):
         on_true(**kwargs)
     elif isinstance(on_true, Graph):
-        if
-            raise RuntimeError(
-                "The on_true graph contains conditional nodes, which are not allowed in child graphs"
-            )
-        if not runtime.core.cuda_graph_insert_child_graph(
+        if not runtime.core.wp_cuda_graph_insert_child_graph(
             device.context,
             stream.cuda_stream,
             on_true.graph,
@@ -6621,6 +7083,10 @@ def capture_if(
         raise TypeError("on_true must be a Callable or a Graph")
     capture_pause(stream=stream)

+    # check the if-body graph
+    if not runtime.core.wp_cuda_graph_check_conditional_body(graph_on_true):
+        raise RuntimeError(runtime.get_error_string())
+
     # capture else-graph
     if on_false is not None:
         # temporarily repurpose the main_graph python object such that all dependencies
@@ -6630,11 +7096,7 @@ def capture_if(
     if isinstance(on_false, Callable):
         on_false(**kwargs)
     elif isinstance(on_false, Graph):
-        if
-            raise RuntimeError(
-                "The on_false graph contains conditional nodes, which are not allowed in child graphs"
-            )
-        if not runtime.core.cuda_graph_insert_child_graph(
+        if not runtime.core.wp_cuda_graph_insert_child_graph(
            device.context,
            stream.cuda_stream,
            on_false.graph,
@@ -6644,6 +7106,10 @@ def capture_if(
         raise TypeError("on_false must be a Callable or a Graph")
     capture_pause(stream=stream)

+    # check the else-body graph
+    if not runtime.core.wp_cuda_graph_check_conditional_body(graph_on_false):
+        raise RuntimeError(runtime.get_error_string())
+
     # restore the main graph to its original state
     main_graph.graph = main_graph_ptr

@@ -6710,15 +7176,13 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph

         return

-    graph.has_conditional = True
-
     # ensure conditional graph nodes are supported
     assert_conditional_graph_support()

     # insert conditional while-node
     body_graph = ctypes.c_void_p()
     cond_handle = ctypes.c_uint64()
-    if not runtime.core.
+    if not runtime.core.wp_cuda_graph_insert_while(
         device.context,
         stream.cuda_stream,
         ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
@@ -6741,20 +7205,17 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph
     if isinstance(while_body, Callable):
         while_body(**kwargs)
     elif isinstance(while_body, Graph):
-        if
-            raise RuntimeError("The body graph contains conditional nodes, which are not allowed in child graphs")
-
-        if not runtime.core.cuda_graph_insert_child_graph(
+        if not runtime.core.wp_cuda_graph_insert_child_graph(
             device.context,
             stream.cuda_stream,
             while_body.graph,
         ):
             raise RuntimeError(runtime.get_error_string())
     else:
-        raise
+        raise TypeError("while_body must be a callable or a graph")

     # update condition
-    if not runtime.core.
+    if not runtime.core.wp_cuda_graph_set_condition(
         device.context,
         stream.cuda_stream,
         ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
@@ -6762,8 +7223,13 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph
     ):
         raise RuntimeError(runtime.get_error_string())

-    # stop capturing
+    # stop capturing while-body
     capture_pause(stream=stream)
+
+    # check the while-body graph
+    if not runtime.core.wp_cuda_graph_check_conditional_body(body_graph):
+        raise RuntimeError(runtime.get_error_string())
+
     # restore the main graph to its original state
     main_graph.graph = main_graph_ptr
     capture_resume(main_graph, stream=stream)
@@ -6787,14 +7253,14 @@ def capture_launch(graph: Graph, stream: Stream | None = None):

     if graph.graph_exec is None:
         g = ctypes.c_void_p()
-        result = runtime.core.
+        result = runtime.core.wp_cuda_graph_create_exec(
             graph.device.context, stream.cuda_stream, graph.graph, ctypes.byref(g)
         )
         if not result:
             raise RuntimeError(f"Graph creation error: {runtime.get_error_string()}")
         graph.graph_exec = g

-    if not runtime.core.
+    if not runtime.core.wp_cuda_graph_launch(graph.graph_exec, stream.cuda_stream):
         raise RuntimeError(f"Graph launch error: {runtime.get_error_string()}")


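For reference, the new wp_cuda_graph_check_conditional_body validation runs inside capture_if/capture_while; a sketch of capturing a conditional loop, assuming a CUDA driver recent enough for conditional graph nodes (the kernel and values are illustrative):

    import warp as wp


    @wp.kernel
    def decrement(c: wp.array(dtype=int)):
        c[0] = c[0] - 1


    wp.init()

    device = "cuda:0"
    cond = wp.array([3], dtype=int, device=device)

    def body():
        wp.launch(decrement, dim=1, inputs=[cond], device=device)

    wp.capture_begin(device)
    wp.capture_while(cond, while_body=body)   # loops until cond[0] reaches zero
    graph = wp.capture_end(device)

    wp.capture_launch(graph)
    wp.synchronize_device(device)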
@@ -6905,24 +7371,24 @@ def copy(
         if dest.device.is_cuda:
             if src.device.is_cuda:
                 if src.device == dest.device:
-                    result = runtime.core.
+                    result = runtime.core.wp_memcpy_d2d(
                         dest.device.context, dst_ptr, src_ptr, bytes_to_copy, stream.cuda_stream
                     )
                 else:
-                    result = runtime.core.
+                    result = runtime.core.wp_memcpy_p2p(
                         dest.device.context, dst_ptr, src.device.context, src_ptr, bytes_to_copy, stream.cuda_stream
                     )
             else:
-                result = runtime.core.
+                result = runtime.core.wp_memcpy_h2d(
                     dest.device.context, dst_ptr, src_ptr, bytes_to_copy, stream.cuda_stream
                 )
         else:
             if src.device.is_cuda:
-                result = runtime.core.
+                result = runtime.core.wp_memcpy_d2h(
                     src.device.context, dst_ptr, src_ptr, bytes_to_copy, stream.cuda_stream
                 )
             else:
-                result = runtime.core.
+                result = runtime.core.wp_memcpy_h2h(dst_ptr, src_ptr, bytes_to_copy)

         if not result:
             raise RuntimeError(f"Warp copy error: {runtime.get_error_string()}")
@@ -6957,17 +7423,17 @@ def copy(
             # This work involves a kernel launch, so it must run on the destination device.
             # If the copy stream is different, we need to synchronize it.
             if stream == dest.device.stream:
-                result = runtime.core.
+                result = runtime.core.wp_array_copy_device(
                     dest.device.context, dst_ptr, src_ptr, dst_type, src_type, src_elem_size
                 )
             else:
                 dest.device.stream.wait_stream(stream)
-                result = runtime.core.
+                result = runtime.core.wp_array_copy_device(
                     dest.device.context, dst_ptr, src_ptr, dst_type, src_type, src_elem_size
                 )
                 stream.wait_stream(dest.device.stream)
         else:
-            result = runtime.core.
+            result = runtime.core.wp_array_copy_host(dst_ptr, src_ptr, dst_type, src_type, src_elem_size)

         if not result:
             raise RuntimeError(f"Warp copy error: {runtime.get_error_string()}")
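For reference, wp.copy() dispatches to the renamed wp_memcpy_*/wp_array_copy_* bindings depending on where source and destination live; a minimal host-to-device sketch (device alias and sizes are illustrative):

    import warp as wp

    wp.init()

    src = wp.full(8, 1.0, dtype=float, device="cpu")
    dst = wp.zeros(8, dtype=float, device="cuda:0")

    wp.copy(dst, src)               # host-to-device path (wp_memcpy_h2d)
    wp.synchronize_device("cuda:0")
    print(dst.numpy())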
@@ -7272,7 +7738,6 @@ def export_stubs(file): # pragma: no cover
         """,
         file=file,
     )
-
     print(
         "# Autogenerated file, do not edit, this file provides stubs for builtins autocomplete in VSCode, PyCharm, etc",
         file=file,