PyPI - warp-lang - Versions diffs - 0.11.0__py3-none-manylinux2014_x86_64.whl → 1.0.0__py3-none-manylinux2014_x86_64.whl - Mend

warp-lang 0.11.0__py3-none-manylinux2014_x86_64.whl → 1.0.0__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (170) hide show

warp/__init__.py +8 -0
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +7 -6
warp/build_dll.py +70 -79
warp/builtins.py +10 -6
warp/codegen.py +51 -19
warp/config.py +7 -8
warp/constants.py +3 -0
warp/context.py +948 -245
warp/dlpack.py +198 -113
warp/examples/assets/bunny.usd +0 -0
warp/examples/assets/cartpole.urdf +110 -0
warp/examples/assets/crazyflie.usd +0 -0
warp/examples/assets/cube.usda +42 -0
warp/examples/assets/nv_ant.xml +92 -0
warp/examples/assets/nv_humanoid.xml +183 -0
warp/examples/assets/quadruped.urdf +268 -0
warp/examples/assets/rocks.nvdb +0 -0
warp/examples/assets/rocks.usd +0 -0
warp/examples/assets/sphere.usda +56 -0
warp/examples/assets/torus.usda +105 -0
warp/examples/benchmarks/benchmark_api.py +383 -0
warp/examples/benchmarks/benchmark_cloth.py +279 -0
warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -0
warp/examples/benchmarks/benchmark_cloth_jax.py +100 -0
warp/examples/benchmarks/benchmark_cloth_numba.py +142 -0
warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -0
warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -0
warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -0
warp/examples/benchmarks/benchmark_cloth_warp.py +146 -0
warp/examples/benchmarks/benchmark_launches.py +295 -0
warp/examples/core/example_dem.py +221 -0
warp/examples/core/example_fluid.py +267 -0
warp/examples/core/example_graph_capture.py +129 -0
warp/examples/core/example_marching_cubes.py +177 -0
warp/examples/core/example_mesh.py +154 -0
warp/examples/core/example_mesh_intersect.py +193 -0
warp/examples/core/example_nvdb.py +169 -0
warp/examples/core/example_raycast.py +89 -0
warp/examples/core/example_raymarch.py +178 -0
warp/examples/core/example_render_opengl.py +141 -0
warp/examples/core/example_sph.py +389 -0
warp/examples/core/example_torch.py +181 -0
warp/examples/core/example_wave.py +249 -0
warp/examples/fem/bsr_utils.py +380 -0
warp/examples/fem/example_apic_fluid.py +391 -0
warp/examples/fem/example_convection_diffusion.py +168 -0
warp/examples/fem/example_convection_diffusion_dg.py +209 -0
warp/examples/fem/example_convection_diffusion_dg0.py +194 -0
warp/examples/fem/example_deformed_geometry.py +159 -0
warp/examples/fem/example_diffusion.py +173 -0
warp/examples/fem/example_diffusion_3d.py +152 -0
warp/examples/fem/example_diffusion_mgpu.py +214 -0
warp/examples/fem/example_mixed_elasticity.py +222 -0
warp/examples/fem/example_navier_stokes.py +243 -0
warp/examples/fem/example_stokes.py +192 -0
warp/examples/fem/example_stokes_transfer.py +249 -0
warp/examples/fem/mesh_utils.py +109 -0
warp/examples/fem/plot_utils.py +287 -0
warp/examples/optim/example_bounce.py +248 -0
warp/examples/optim/example_cloth_throw.py +210 -0
warp/examples/optim/example_diffray.py +535 -0
warp/examples/optim/example_drone.py +850 -0
warp/examples/optim/example_inverse_kinematics.py +169 -0
warp/examples/optim/example_inverse_kinematics_torch.py +170 -0
warp/examples/optim/example_spring_cage.py +234 -0
warp/examples/optim/example_trajectory.py +201 -0
warp/examples/sim/example_cartpole.py +128 -0
warp/examples/sim/example_cloth.py +184 -0
warp/examples/sim/example_granular.py +113 -0
warp/examples/sim/example_granular_collision_sdf.py +185 -0
warp/examples/sim/example_jacobian_ik.py +213 -0
warp/examples/sim/example_particle_chain.py +106 -0
warp/examples/sim/example_quadruped.py +179 -0
warp/examples/sim/example_rigid_chain.py +191 -0
warp/examples/sim/example_rigid_contact.py +176 -0
warp/examples/sim/example_rigid_force.py +126 -0
warp/examples/sim/example_rigid_gyroscopic.py +97 -0
warp/examples/sim/example_rigid_soft_contact.py +124 -0
warp/examples/sim/example_soft_body.py +178 -0
warp/fabric.py +29 -20
warp/fem/cache.py +0 -1
warp/fem/dirichlet.py +0 -2
warp/fem/integrate.py +0 -1
warp/jax.py +45 -0
warp/jax_experimental.py +339 -0
warp/native/builtin.h +12 -0
warp/native/bvh.cu +18 -18
warp/native/clang/clang.cpp +8 -3
warp/native/cuda_util.cpp +94 -5
warp/native/cuda_util.h +35 -6
warp/native/cutlass_gemm.cpp +1 -1
warp/native/cutlass_gemm.cu +4 -1
warp/native/error.cpp +66 -0
warp/native/error.h +27 -0
warp/native/mesh.cu +2 -2
warp/native/reduce.cu +4 -4
warp/native/runlength_encode.cu +2 -2
warp/native/scan.cu +2 -2
warp/native/sparse.cu +0 -1
warp/native/temp_buffer.h +2 -2
warp/native/warp.cpp +95 -60
warp/native/warp.cu +1053 -218
warp/native/warp.h +49 -32
warp/optim/linear.py +33 -16
warp/render/render_opengl.py +202 -101
warp/render/render_usd.py +82 -40
warp/sim/__init__.py +13 -4
warp/sim/articulation.py +4 -5
warp/sim/collide.py +320 -175
warp/sim/import_mjcf.py +25 -30
warp/sim/import_urdf.py +94 -63
warp/sim/import_usd.py +51 -36
warp/sim/inertia.py +3 -2
warp/sim/integrator.py +233 -0
warp/sim/integrator_euler.py +447 -469
warp/sim/integrator_featherstone.py +1991 -0
warp/sim/integrator_xpbd.py +1420 -640
warp/sim/model.py +765 -487
warp/sim/particles.py +2 -1
warp/sim/render.py +35 -13
warp/sim/utils.py +222 -11
warp/stubs.py +8 -0
warp/tape.py +16 -1
warp/tests/aux_test_grad_customs.py +23 -0
warp/tests/test_array.py +190 -1
warp/tests/test_async.py +656 -0
warp/tests/test_bool.py +50 -0
warp/tests/test_dlpack.py +164 -11
warp/tests/test_examples.py +166 -74
warp/tests/test_fem.py +8 -1
warp/tests/test_generics.py +15 -5
warp/tests/test_grad.py +1 -1
warp/tests/test_grad_customs.py +172 -12
warp/tests/test_jax.py +254 -0
warp/tests/test_large.py +29 -6
warp/tests/test_launch.py +25 -0
warp/tests/test_linear_solvers.py +20 -3
warp/tests/test_matmul.py +61 -16
warp/tests/test_matmul_lite.py +13 -13
warp/tests/test_mempool.py +186 -0
warp/tests/test_multigpu.py +3 -0
warp/tests/test_options.py +16 -2
warp/tests/test_peer.py +137 -0
warp/tests/test_print.py +3 -1
warp/tests/test_quat.py +23 -0
warp/tests/test_sim_kinematics.py +97 -0
warp/tests/test_snippet.py +126 -3
warp/tests/test_streams.py +108 -79
warp/tests/test_torch.py +16 -8
warp/tests/test_utils.py +32 -27
warp/tests/test_verify_fp.py +65 -0
warp/tests/test_volume.py +1 -1
warp/tests/unittest_serial.py +2 -0
warp/tests/unittest_suites.py +12 -0
warp/tests/unittest_utils.py +14 -7
warp/thirdparty/unittest_parallel.py +15 -3
warp/torch.py +10 -8
warp/types.py +363 -246
warp/utils.py +143 -19
warp_lang-1.0.0.dist-info/LICENSE.md +126 -0
warp_lang-1.0.0.dist-info/METADATA +394 -0
{warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/RECORD +167 -86
warp/sim/optimizer.py +0 -138
warp_lang-0.11.0.dist-info/LICENSE.md +0 -36
warp_lang-0.11.0.dist-info/METADATA +0 -238
/warp/tests/{walkthough_debug.py → walkthrough_debug.py} +0 -0
{warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/WHEEL +0 -0
{warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/top_level.txt +0 -0

warp/__init__.py CHANGED Viewed

@@ -45,6 +45,8 @@ from warp.context import get_device, set_device, synchronize_device
 from warp.context import (
     zeros,
     zeros_like,
+    ones,
+    ones_like,
     full,
     full_like,
     clone,
@@ -63,9 +65,15 @@ from warp.context import Kernel, Function, Launch
 from warp.context import Stream, get_stream, set_stream, synchronize_stream
 from warp.context import Event, record_event, wait_event, wait_stream
 from warp.context import RegisteredGLBuffer
+from warp.context import is_mempool_supported, is_mempool_enabled, set_mempool_enabled
+from warp.context import set_mempool_release_threshold, get_mempool_release_threshold
+from warp.context import is_mempool_access_supported, is_mempool_access_enabled, set_mempool_access_enabled
+from warp.context import is_peer_access_supported, is_peer_access_enabled, set_peer_access_enabled
 from warp.tape import Tape
 from warp.utils import ScopedTimer, ScopedDevice, ScopedStream
+from warp.utils import ScopedMempool, ScopedMempoolAccess, ScopedPeerAccess
+from warp.utils import ScopedCapture
 from warp.utils import transform_expand, quat_between_vectors
 from warp.torch import from_torch, to_torch

warp/bin/warp-clang.so CHANGED Viewed

Binary file

warp/bin/warp.so CHANGED Viewed

Binary file

warp/build.py CHANGED Viewed

@@ -45,7 +45,7 @@ def build_cpu(obj_path, cpp_path, mode="release", verify_fp=False, fast_math=Fal
         inc_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "native").encode("utf-8")
         obj_path = obj_path.encode("utf-8")
-        err = warp.context.runtime.llvm.compile_cpp(src, cpp_path, inc_path, obj_path, mode == "debug")
+        err = warp.context.runtime.llvm.compile_cpp(src, cpp_path, inc_path, obj_path, mode == "debug", verify_fp)
         if err != 0:
             raise Exception(f"CPU kernel build failed with error code {err}")
@@ -66,9 +66,7 @@ def init_kernel_cache(path=None):
     if path is not None:
         cache_root_dir = os.path.realpath(path)
     else:
-        cache_root_dir = appdirs.user_cache_dir(
-            appname="warp", appauthor="NVIDIA Corporation", version=warp.config.version
-        )
+        cache_root_dir = appdirs.user_cache_dir(appname="warp", appauthor="NVIDIA", version=warp.config.version)
     cache_bin_dir = os.path.join(cache_root_dir, "bin")
     cache_gen_dir = os.path.join(cache_root_dir, "gen")
@@ -95,15 +93,18 @@ def init_kernel_cache(path=None):
 def clear_kernel_cache():
     """Clear the kernel cache."""
+    is_intialized = kernel_bin_dir is not None and kernel_gen_dir is not None
+    assert is_intialized, "The kernel cache directory is not configured; wp.init() has not been called yet or failed."
     import glob
     paths = []
-    if kernel_bin_dir is not None and os.path.isdir(kernel_bin_dir):
+    if os.path.isdir(kernel_bin_dir):
         pattern = os.path.join(kernel_bin_dir, "wp_*")
         paths += glob.glob(pattern)
-    if kernel_gen_dir is not None and os.path.isdir(kernel_gen_dir):
+    if os.path.isdir(kernel_gen_dir):
         pattern = os.path.join(kernel_gen_dir, "wp_*")
         paths += glob.glob(pattern)

warp/build_dll.py CHANGED Viewed

@@ -10,9 +10,10 @@ import os
 import subprocess
 import platform
-import warp.config
 from warp.utils import ScopedTimer
+verbose_cmd = True  # print command lines before executing them
 # returns a canonical machine architecture string
 # - "x86_64" for x86-64, aka. AMD64, aka. x64
@@ -26,8 +27,8 @@ def machine_architecture() -> str:
     raise RuntimeError(f"Unrecognized machine architecture {machine}")
-def run_cmd(cmd, capture=False):
-    if warp.config.verbose:
+def run_cmd(cmd):
+    if verbose_cmd:
         print(cmd)
     try:
@@ -41,8 +42,8 @@ def run_cmd(cmd, capture=False):
 # cut-down version of vcvars64.bat that allows using
-# custom toolchain locations
-def set_msvc_compiler(msvc_path, sdk_path):
+# custom toolchain locations, returns the compiler program path
+def set_msvc_env(msvc_path, sdk_path):
     if "INCLUDE" not in os.environ:
         os.environ["INCLUDE"] = ""
@@ -65,58 +66,51 @@ def set_msvc_compiler(msvc_path, sdk_path):
     os.environ["PATH"] += os.pathsep + os.path.join(msvc_path, "bin/HostX64/x64")
     os.environ["PATH"] += os.pathsep + os.path.join(sdk_path, "bin/x64")
-    warp.config.host_compiler = os.path.join(msvc_path, "bin", "HostX64", "x64", "cl.exe")
+    return os.path.join(msvc_path, "bin", "HostX64", "x64", "cl.exe")
 def find_host_compiler():
     if os.name == "nt":
-        try:
-            # try and find an installed host compiler (msvc)
-            # runs vcvars and copies back the build environment
-            vswhere_path = r"%ProgramFiles(x86)%/Microsoft Visual Studio/Installer/vswhere.exe"
-            vswhere_path = os.path.expandvars(vswhere_path)
-            if not os.path.exists(vswhere_path):
-                return ""
-            vs_path = run_cmd(f'"{vswhere_path}" -latest -property installationPath').decode().rstrip()
-            vsvars_path = os.path.join(vs_path, "VC\\Auxiliary\\Build\\vcvars64.bat")
-            output = run_cmd(f'"{vsvars_path}" && set').decode()
-            for line in output.splitlines():
-                pair = line.split("=", 1)
-                if len(pair) >= 2:
-                    os.environ[pair[0]] = pair[1]
-            cl_path = run_cmd("where cl.exe").decode("utf-8").rstrip()
-            cl_version = os.environ["VCToolsVersion"].split(".")
-            # ensure at least VS2019 version, see list of MSVC versions here https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B
-            cl_required_major = 14
-            cl_required_minor = 29
-            if (
-                (int(cl_version[0]) < cl_required_major)
-                or (int(cl_version[0]) == cl_required_major)
-                and int(cl_version[1]) < cl_required_minor
-            ):
-                print(
-                    f"Warp: MSVC found but compiler version too old, found {cl_version[0]}.{cl_version[1]}, but must be {cl_required_major}.{cl_required_minor} or higher, kernel host compilation will be disabled."
-                )
-                return ""
+        # try and find an installed host compiler (msvc)
+        # runs vcvars and copies back the build environment
+        vswhere_path = r"%ProgramFiles(x86)%/Microsoft Visual Studio/Installer/vswhere.exe"
+        vswhere_path = os.path.expandvars(vswhere_path)
+        if not os.path.exists(vswhere_path):
+            return ""
+        vs_path = run_cmd(f'"{vswhere_path}" -latest -property installationPath').decode().rstrip()
+        vsvars_path = os.path.join(vs_path, "VC\\Auxiliary\\Build\\vcvars64.bat")
+        output = run_cmd(f'"{vsvars_path}" && set').decode()
-            return cl_path
+        for line in output.splitlines():
+            pair = line.split("=", 1)
+            if len(pair) >= 2:
+                os.environ[pair[0]] = pair[1]
-        except Exception as e:
-            # couldn't find host compiler
+        cl_path = run_cmd("where cl.exe").decode("utf-8").rstrip()
+        cl_version = os.environ["VCToolsVersion"].split(".")
+        # ensure at least VS2019 version, see list of MSVC versions here https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B
+        cl_required_major = 14
+        cl_required_minor = 29
+        if (
+            (int(cl_version[0]) < cl_required_major)
+            or (int(cl_version[0]) == cl_required_major)
+            and int(cl_version[1]) < cl_required_minor
+        ):
+            print(
+                f"Warp: MSVC found but compiler version too old, found {cl_version[0]}.{cl_version[1]}, but must be {cl_required_major}.{cl_required_minor} or higher, kernel host compilation will be disabled."
+            )
             return ""
+        return cl_path
     else:
         # try and find g++
-        try:
-            return run_cmd("which g++").decode()
-        except:
-            return ""
+        return run_cmd("which g++").decode()
 def get_cuda_toolkit_version(cuda_home):
@@ -141,11 +135,12 @@ def quote(path):
     return '"' + path + '"'
-def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp=False, fast_math=False, quick=False):
-    cuda_home = warp.config.cuda_path
+def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None):
+    mode = args.mode if (mode is None) else mode
+    cuda_home = args.cuda_path
     cuda_cmd = None
-    if quick:
+    if args.quick:
         cutlass_includes = ""
         cutlass_enabled = "WP_ENABLE_CUTLASS=0"
     else:
@@ -153,7 +148,7 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
         cutlass_includes = f'-I"{cutlass_home}/include" -I"{cutlass_home}/tools/util/include"'
         cutlass_enabled = "WP_ENABLE_CUTLASS=1"
-    if quick or cu_path is None:
+    if args.quick or cu_path is None:
         cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
     else:
         cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"
@@ -165,7 +160,7 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
     nanovdb_home = warp_home_path.parent / "_build/host-deps/nanovdb/include"
     # output stale, rebuild
-    if warp.config.verbose:
+    if args.verbose:
         print(f"Building {dll_path}")
     native_dir = os.path.join(warp_home, "native")
@@ -181,7 +176,7 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
         gencode_opts = []
-        if quick:
+        if args.quick:
             # minimum supported architectures (PTX)
             gencode_opts += ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
         else:
@@ -224,15 +219,15 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
             "--extended-lambda",
         ]
-        if fast_math:
+        if args.fast_math:
             nvcc_opts.append("--use_fast_math")
     # is the library being built with CUDA enabled?
     cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"
     if os.name == "nt":
-        if warp.config.host_compiler:
-            host_linker = os.path.join(os.path.dirname(warp.config.host_compiler), "link.exe")
+        if args.host_compiler:
+            host_linker = os.path.join(os.path.dirname(args.host_compiler), "link.exe")
         else:
             raise RuntimeError("Warp build error: No host compiler was found")
@@ -251,27 +246,27 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
             iter_dbg = "_ITERATOR_DEBUG_LEVEL=2"
             debug = "_DEBUG"
-        if warp.config.mode == "debug":
+        if args.mode == "debug":
             cpp_flags = f'/nologo {runtime} /Zi /Od /D "{debug}" /D WP_ENABLE_DEBUG=1 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
             linkopts = ["/DLL", "/DEBUG"]
-        elif warp.config.mode == "release":
+        elif args.mode == "release":
             cpp_flags = f'/nologo {runtime} /Ox /D "{debug}" /D WP_ENABLE_DEBUG=0 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
             linkopts = ["/DLL"]
         else:
-            raise RuntimeError(f"Unrecognized build configuration (debug, release), got: {mode}")
+            raise RuntimeError(f"Unrecognized build configuration (debug, release), got: {args.mode}")
-        if verify_fp:
+        if args.verify_fp:
             cpp_flags += ' /D "WP_VERIFY_FP"'
-        if fast_math:
+        if args.fast_math:
             cpp_flags += " /fp:fast"
-        with ScopedTimer("build", active=warp.config.verbose):
+        with ScopedTimer("build", active=args.verbose):
             for cpp_path in cpp_paths:
                 cpp_out = cpp_path + ".obj"
                 linkopts.append(quote(cpp_out))
-                cpp_cmd = f'"{warp.config.host_compiler}" {cpp_flags} -c "{cpp_path}" /Fo"{cpp_out}"'
+                cpp_cmd = f'"{args.host_compiler}" {cpp_flags} -c "{cpp_path}" /Fo"{cpp_out}"'
                 run_cmd(cpp_cmd)
         if cu_path:
@@ -283,14 +278,14 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
             elif mode == "release":
                 cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -I"{nanovdb_home}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
-            with ScopedTimer("build_cuda", active=warp.config.verbose):
+            with ScopedTimer("build_cuda", active=args.verbose):
                 run_cmd(cuda_cmd)
                 linkopts.append(quote(cu_out))
                 linkopts.append(
                     f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
                 )
-        with ScopedTimer("link", active=warp.config.verbose):
+        with ScopedTimer("link", active=args.verbose):
             link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
             run_cmd(link_cmd)
@@ -311,15 +306,15 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
         if mode == "release":
             cpp_flags = f'{target} -O3 -DNDEBUG -DWP_ENABLE_DEBUG=0 -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden --std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes}'
-        if verify_fp:
+        if args.verify_fp:
             cpp_flags += " -DWP_VERIFY_FP"
-        if fast_math:
+        if args.fast_math:
             cpp_flags += " -ffast-math"
         ld_inputs = []
-        with ScopedTimer("build", active=warp.config.verbose):
+        with ScopedTimer("build", active=args.verbose):
             for cpp_path in cpp_paths:
                 cpp_out = cpp_path + ".o"
                 ld_inputs.append(quote(cpp_out))
@@ -336,7 +331,7 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
             elif mode == "release":
                 cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
-            with ScopedTimer("build_cuda", active=warp.config.verbose):
+            with ScopedTimer("build_cuda", active=args.verbose):
                 run_cmd(cuda_cmd)
                 ld_inputs.append(quote(cu_out))
@@ -351,7 +346,7 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
             opt_no_undefined = "-Wl,--no-undefined"
             opt_exclude_libs = "-Wl,--exclude-libs,ALL"
-        with ScopedTimer("link", active=warp.config.verbose):
+        with ScopedTimer("link", active=args.verbose):
             origin = "@loader_path" if (sys.platform == "darwin") else "$ORIGIN"
             link_cmd = f"g++ {target} -shared -Wl,-rpath,'{origin}' {opt_no_undefined} {opt_exclude_libs} -o '{dll_path}' {' '.join(ld_inputs + libs)}"
             run_cmd(link_cmd)
@@ -366,19 +361,15 @@ def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp
                 )
-def build_dll(dll_path, cpp_paths, cu_path, libs=[], mode="release", verify_fp=False, fast_math=False, quick=False):
+def build_dll(args, dll_path, cpp_paths, cu_path, libs=[]):
     if sys.platform == "darwin":
         # create a universal binary by combining x86-64 and AArch64 builds
-        build_dll_for_arch(dll_path + "-x86_64", cpp_paths, cu_path, libs, mode, "x86_64", verify_fp, fast_math, quick)
-        build_dll_for_arch(
-            dll_path + "-aarch64", cpp_paths, cu_path, libs, mode, "aarch64", verify_fp, fast_math, quick
-        )
+        build_dll_for_arch(args, dll_path + "-x86_64", cpp_paths, cu_path, libs, "x86_64")
+        build_dll_for_arch(args, dll_path + "-aarch64", cpp_paths, cu_path, libs, "aarch64")
         run_cmd(f"lipo -create -output {dll_path} {dll_path}-x86_64 {dll_path}-aarch64")
         os.remove(f"{dll_path}-x86_64")
         os.remove(f"{dll_path}-aarch64")
     else:
-        build_dll_for_arch(
-            dll_path, cpp_paths, cu_path, libs, mode, machine_architecture(), verify_fp, fast_math, quick
-        )
+        build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, machine_architecture())

warp/builtins.py CHANGED Viewed

@@ -612,16 +612,20 @@ add_builtin(
 # scalar type constructors between all storage / compute types
-scalar_types_all = [*scalar_types, int, float]
+scalar_types_all = [*scalar_types, bool, int, float]
 for t in scalar_types_all:
     for u in scalar_types_all:
         add_builtin(
-            t.__name__, input_types={"u": u}, value_type=t, doc="", hidden=True, group="Scalar Math", export=False
+            t.__name__,
+            input_types={"u": u},
+            value_type=t,
+            doc="",
+            hidden=True,
+            group="Scalar Math",
+            export=False,
+            namespace="wp::" if t is not bool else "",
         )
-for u in [bool, builtins.bool]:
-    add_builtin(bool.__name__, input_types={"u": u}, value_type=bool, doc="", hidden=True, export=False, namespace="")
 def vector_constructor_func(arg_types, kwds, templates):
     if arg_types is None:
@@ -2852,7 +2856,7 @@ add_builtin(
     skip_replay=True,
 )
-for t in scalar_types + vector_types + [builtins.bool]:
+for t in scalar_types + vector_types + [bool, builtins.bool]:
     if "vec" in t.__name__ or "mat" in t.__name__:
         continue
     add_builtin(

warp/codegen.py CHANGED Viewed

@@ -418,7 +418,10 @@ def compute_type_str(base_name, template_params):
         if isinstance(p, int):
             return str(p)
         elif hasattr(p, "_type_"):
-            return f"wp::{p.__name__}"
+            if p.__name__ == "bool":
+                return "bool"
+            else:
+                return f"wp::{p.__name__}"
         return p.__name__
     return f"{base_name}<{','.join(map(param2str, template_params))}>"
@@ -595,12 +598,17 @@ class Adjoint:
         adj.skip_build = False
     # generate function ssa form and adjoint
-    def build(adj, builder):
+    def build(adj, builder, default_builder_options={}):
         if adj.skip_build:
             return
         adj.builder = builder
+        if adj.builder:
+            adj.builder_options = adj.builder.options
+        else:
+            adj.builder_options = default_builder_options
         adj.symbols = {}  # map from symbols to adjoint variables
         adj.variables = []  # list of local variables (in order)
@@ -911,8 +919,16 @@ class Adjoint:
                     break
         # if it is a user-function then build it recursively
-        if not func.is_builtin():
+        if not func.is_builtin() and func not in adj.builder.functions:
             adj.builder.build_function(func)
+            # add custom grad, replay functions to the list of functions
+            # to be built later (invalid code could be generated if we built them now)
+            # so that they are not missed when only the forward function is imported
+            # from another module
+            if func.custom_grad_func:
+                adj.builder.deferred_functions.append(func.custom_grad_func)
+            if func.custom_replay_func:
+                adj.builder.deferred_functions.append(func.custom_replay_func)
         # evaluate the function type based on inputs
         arg_types = [strip_reference(a.type) for a in args if not isinstance(a, warp.context.Function)]
@@ -924,9 +940,11 @@ class Adjoint:
         use_initializer_list = func.initializer_list_func(args, templates)
         args_var = [
-            adj.load(a)
-            if not ((param_types[i] == Reference or param_types[i] == Callable) if i < len(param_types) else False)
-            else a
+            (
+                adj.load(a)
+                if not ((param_types[i] == Reference or param_types[i] == Callable) if i < len(param_types) else False)
+                else a
+            )
             for i, a in enumerate(args)
         ]
@@ -940,7 +958,7 @@ class Adjoint:
                 f"{func.namespace}{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
             )
             replay_call = forward_call
-            if func.custom_replay_func is not None:
+            if func.custom_replay_func is not None or func.replay_snippet is not None:
                 replay_call = f"{func.namespace}replay_{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
         elif not isinstance(return_type, list) or len(return_type) == 1:
@@ -1539,7 +1557,11 @@ class Adjoint:
             # test if we're above max unroll count
             max_iters = abs(end - start) // abs(step)
-            max_unroll = adj.builder.options["max_unroll"]
+            if "max_unroll" in adj.builder_options:
+                max_unroll = adj.builder_options["max_unroll"]
+            else:
+                max_unroll = warp.config.max_unroll
             ok_to_unroll = True
@@ -1722,9 +1744,7 @@ class Adjoint:
         target = adj.eval(node.value)
         if not is_local_value(target):
-            raise RuntimeError(
-                "Cannot reference a global variable from a kernel unless `wp.constant()` is being used"
-            )
+            raise RuntimeError("Cannot reference a global variable from a kernel unless `wp.constant()` is being used")
         indices = []
@@ -2008,11 +2028,9 @@ class Adjoint:
         # Look up the closure info and append it to adj.func.__globals__
         # in case you want to define a kernel inside a function and refer
         # to variables you've declared inside that function:
-        extract_contents = (
-            lambda contents: contents
-            if isinstance(contents, warp.context.Function) or not callable(contents)
-            else contents
-        )
+        def extract_contents(contents):
+            return contents if isinstance(contents, warp.context.Function) or not callable(contents) else contents
         capturedvars = dict(
             zip(
                 adj.func.__code__.co_freevars,
@@ -2343,9 +2361,12 @@ def constant_str(value):
         initlist = []
         for i in range(value._length_):
             x = ctypes.Array.__getitem__(value, i)
-            initlist.append(str(scalar_value(x)))
+            initlist.append(str(scalar_value(x)).lower())
-        dtypestr = f"wp::initializer_array<{value._length_},wp::{value._wp_scalar_type_.__name__}>"
+        if value._wp_scalar_type_ is bool:
+            dtypestr = f"wp::initializer_array<{value._length_},{value._wp_scalar_type_.__name__}>"
+        else:
+            dtypestr = f"wp::initializer_array<{value._length_},wp::{value._wp_scalar_type_.__name__}>"
         # construct value from initializer array, e.g. wp::initializer_array<4,wp::float32>{1.0, 2.0, 3.0, 4.0}
         return f"{dtypestr}{{{', '.join(initlist)}}}"
@@ -2614,7 +2635,7 @@ def codegen_func(adj, c_func_name: str, device="cpu", options={}):
     return s
-def codegen_snippet(adj, name, snippet, adj_snippet):
+def codegen_snippet(adj, name, snippet, adj_snippet, replay_snippet):
     forward_args = []
     reverse_args = []
@@ -2633,6 +2654,7 @@ def codegen_snippet(adj, name, snippet, adj_snippet):
             reverse_args.append(arg.ctype() + " & adj_" + arg.label)
     forward_template = cuda_forward_function_template
+    replay_template = cuda_forward_function_template
     reverse_template = cuda_reverse_function_template
     s = ""
@@ -2645,6 +2667,16 @@ def codegen_snippet(adj, name, snippet, adj_snippet):
         lineno=adj.fun_lineno,
     )
+    if replay_snippet is not None:
+        s += replay_template.format(
+            name="replay_" + name,
+            return_type="void",
+            forward_args=indent(forward_args),
+            forward_body=replay_snippet,
+            filename=adj.filename,
+            lineno=adj.fun_lineno,
+        )
     if adj_snippet:
         reverse_body = adj_snippet
     else:

warp/config.py CHANGED Viewed

@@ -5,11 +5,7 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
-version = "0.11.0"
-cuda_path = (
-    None  # path to local CUDA toolchain, if None at init time warp will attempt to find the SDK using CUDA_PATH env var
-)
+version = "1.0.0"
 verify_fp = False  # verify inputs and outputs are finite after each launch
 verify_cuda = False  # if true will check CUDA errors after each kernel launch / memory operation
@@ -17,10 +13,9 @@ print_launches = False  # if true will print out launch information
 mode = "release"
 verbose = False  # print extra informative messages
+verbose_warnings = False  # whether file and line info gets included in Warp warnings
 quiet = False  # suppress all output except errors and warnings
-host_compiler = None  # user can specify host compiler here, otherwise will attempt to find one automatically
 cache_kernels = True
 kernel_cache_dir = None  # path to kernel cache directory, if None a default path will be used
@@ -34,4 +29,8 @@ enable_backward = True  # whether to compiler the backward passes of the kernels
 llvm_cuda = False  # use Clang/LLVM instead of NVRTC to compile CUDA
-graph_capture_module_load_default = True  # Default value of force_module_load for capture_begin()
+enable_graph_capture_module_load_by_default = True  # Default value of force_module_load for capture_begin()
+enable_mempools_at_init = True  # Whether CUDA devices will be initialized with mempools enabled (if supported)
+max_unroll = 16

warp/constants.py CHANGED Viewed

@@ -26,6 +26,8 @@ __all__ = [
     "phi",
     "PI",
     "pi",
+    "HALF_PI",
+    "half_pi",
     "TAU",
     "tau",
 ]
@@ -37,6 +39,7 @@ LN2 = ln2 = constant(0.69314718055994530942)  # ln(2)
 LN10 = ln10 = constant(2.30258509299404568402)  # ln(10)
 PHI = phi = constant(1.61803398874989484820)  # golden constant
 PI = pi = constant(3.14159265358979323846)  # pi
+HALF_PI = half_pi = constant(1.57079632679489661923)  # half pi
 TAU = tau = constant(6.28318530717958647692)  # 2 * pi
 INF = inf = constant(math.inf)