PyPI - warp-lang - Versions diffs - 1.7.2rc1__py3-none-macosx_10_13_universal2.whl → 1.8.0__py3-none-macosx_10_13_universal2.whl - Mend

warp-lang 1.7.2rc1__py3-none-macosx_10_13_universal2.whl → 1.8.0__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (180)

warp/__init__.py +3 -1
warp/__init__.pyi +3489 -1
warp/autograd.py +45 -122
warp/bin/libwarp.dylib +0 -0
warp/build.py +241 -252
warp/build_dll.py +125 -26
warp/builtins.py +1907 -384
warp/codegen.py +257 -101
warp/config.py +12 -1
warp/constants.py +1 -1
warp/context.py +657 -223
warp/dlpack.py +1 -1
warp/examples/benchmarks/benchmark_cloth.py +2 -2
warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
warp/examples/core/example_sample_mesh.py +1 -1
warp/examples/core/example_spin_lock.py +93 -0
warp/examples/core/example_work_queue.py +118 -0
warp/examples/fem/example_adaptive_grid.py +5 -5
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +1 -1
warp/examples/fem/example_convection_diffusion.py +9 -6
warp/examples/fem/example_darcy_ls_optimization.py +489 -0
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_diffusion.py +2 -2
warp/examples/fem/example_diffusion_3d.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_elastic_shape_optimization.py +387 -0
warp/examples/fem/example_magnetostatics.py +5 -3
warp/examples/fem/example_mixed_elasticity.py +5 -3
warp/examples/fem/example_navier_stokes.py +11 -9
warp/examples/fem/example_nonconforming_contact.py +5 -3
warp/examples/fem/example_streamlines.py +8 -3
warp/examples/fem/utils.py +9 -8
warp/examples/interop/example_jax_ffi_callback.py +2 -2
warp/examples/optim/example_drone.py +1 -1
warp/examples/sim/example_cloth.py +1 -1
warp/examples/sim/example_cloth_self_contact.py +48 -54
warp/examples/tile/example_tile_block_cholesky.py +502 -0
warp/examples/tile/example_tile_cholesky.py +2 -1
warp/examples/tile/example_tile_convolution.py +1 -1
warp/examples/tile/example_tile_filtering.py +1 -1
warp/examples/tile/example_tile_matmul.py +1 -1
warp/examples/tile/example_tile_mlp.py +2 -0
warp/fabric.py +7 -7
warp/fem/__init__.py +5 -0
warp/fem/adaptivity.py +1 -1
warp/fem/cache.py +152 -63
warp/fem/dirichlet.py +2 -2
warp/fem/domain.py +136 -6
warp/fem/field/field.py +141 -99
warp/fem/field/nodal_field.py +85 -39
warp/fem/field/virtual.py +97 -52
warp/fem/geometry/adaptive_nanogrid.py +91 -86
warp/fem/geometry/closest_point.py +13 -0
warp/fem/geometry/deformed_geometry.py +102 -40
warp/fem/geometry/element.py +56 -2
warp/fem/geometry/geometry.py +323 -22
warp/fem/geometry/grid_2d.py +157 -62
warp/fem/geometry/grid_3d.py +116 -20
warp/fem/geometry/hexmesh.py +86 -20
warp/fem/geometry/nanogrid.py +166 -86
warp/fem/geometry/partition.py +59 -25
warp/fem/geometry/quadmesh.py +86 -135
warp/fem/geometry/tetmesh.py +47 -119
warp/fem/geometry/trimesh.py +77 -270
warp/fem/integrate.py +107 -52
warp/fem/linalg.py +25 -58
warp/fem/operator.py +124 -27
warp/fem/quadrature/pic_quadrature.py +36 -14
warp/fem/quadrature/quadrature.py +40 -16
warp/fem/space/__init__.py +1 -1
warp/fem/space/basis_function_space.py +66 -46
warp/fem/space/basis_space.py +17 -4
warp/fem/space/dof_mapper.py +1 -1
warp/fem/space/function_space.py +2 -2
warp/fem/space/grid_2d_function_space.py +4 -1
warp/fem/space/hexmesh_function_space.py +4 -2
warp/fem/space/nanogrid_function_space.py +3 -1
warp/fem/space/partition.py +11 -2
warp/fem/space/quadmesh_function_space.py +4 -1
warp/fem/space/restriction.py +5 -2
warp/fem/space/shape/__init__.py +10 -8
warp/fem/space/tetmesh_function_space.py +4 -1
warp/fem/space/topology.py +52 -21
warp/fem/space/trimesh_function_space.py +4 -1
warp/fem/utils.py +53 -8
warp/jax.py +1 -2
warp/jax_experimental/ffi.py +12 -17
warp/jax_experimental/xla_ffi.py +37 -24
warp/math.py +171 -1
warp/native/array.h +99 -0
warp/native/builtin.h +174 -31
warp/native/coloring.cpp +1 -1
warp/native/exports.h +118 -63
warp/native/intersect.h +3 -3
warp/native/mat.h +5 -10
warp/native/mathdx.cpp +11 -5
warp/native/matnn.h +1 -123
warp/native/quat.h +28 -4
warp/native/sparse.cpp +121 -258
warp/native/sparse.cu +181 -274
warp/native/spatial.h +305 -17
warp/native/tile.h +583 -72
warp/native/tile_radix_sort.h +1108 -0
warp/native/tile_reduce.h +237 -2
warp/native/tile_scan.h +240 -0
warp/native/tuple.h +189 -0
warp/native/vec.h +6 -16
warp/native/warp.cpp +36 -4
warp/native/warp.cu +574 -51
warp/native/warp.h +47 -74
warp/optim/linear.py +5 -1
warp/paddle.py +7 -8
warp/py.typed +0 -0
warp/render/render_opengl.py +58 -29
warp/render/render_usd.py +124 -61
warp/sim/__init__.py +9 -0
warp/sim/collide.py +252 -78
warp/sim/graph_coloring.py +8 -1
warp/sim/import_mjcf.py +4 -3
warp/sim/import_usd.py +11 -7
warp/sim/integrator.py +5 -2
warp/sim/integrator_euler.py +1 -1
warp/sim/integrator_featherstone.py +1 -1
warp/sim/integrator_vbd.py +751 -320
warp/sim/integrator_xpbd.py +1 -1
warp/sim/model.py +265 -260
warp/sim/utils.py +10 -7
warp/sparse.py +303 -166
warp/tape.py +52 -51
warp/tests/cuda/test_conditional_captures.py +1046 -0
warp/tests/cuda/test_streams.py +1 -1
warp/tests/geometry/test_volume.py +2 -2
warp/tests/interop/test_dlpack.py +9 -9
warp/tests/interop/test_jax.py +0 -1
warp/tests/run_coverage_serial.py +1 -1
warp/tests/sim/disabled_kinematics.py +2 -2
warp/tests/sim/{test_vbd.py → test_cloth.py} +296 -113
warp/tests/sim/test_collision.py +159 -51
warp/tests/sim/test_coloring.py +15 -1
warp/tests/test_array.py +254 -2
warp/tests/test_array_reduce.py +2 -2
warp/tests/test_atomic_cas.py +299 -0
warp/tests/test_codegen.py +142 -19
warp/tests/test_conditional.py +47 -1
warp/tests/test_ctypes.py +0 -20
warp/tests/test_devices.py +8 -0
warp/tests/test_fabricarray.py +4 -2
warp/tests/test_fem.py +58 -25
warp/tests/test_func.py +42 -1
warp/tests/test_grad.py +1 -1
warp/tests/test_lerp.py +1 -3
warp/tests/test_map.py +481 -0
warp/tests/test_mat.py +1 -24
warp/tests/test_quat.py +6 -15
warp/tests/test_rounding.py +10 -38
warp/tests/test_runlength_encode.py +7 -7
warp/tests/test_smoothstep.py +1 -1
warp/tests/test_sparse.py +51 -2
warp/tests/test_spatial.py +507 -1
warp/tests/test_struct.py +2 -2
warp/tests/test_tuple.py +265 -0
warp/tests/test_types.py +2 -2
warp/tests/test_utils.py +24 -18
warp/tests/tile/test_tile.py +420 -1
warp/tests/tile/test_tile_mathdx.py +518 -14
warp/tests/tile/test_tile_reduce.py +213 -0
warp/tests/tile/test_tile_shared_memory.py +130 -1
warp/tests/tile/test_tile_sort.py +117 -0
warp/tests/unittest_suites.py +4 -6
warp/types.py +462 -308
warp/utils.py +647 -86
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/METADATA +20 -6
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/RECORD +177 -165
warp/stubs.py +0 -3381
warp/tests/sim/test_xpbd.py +0 -399
warp/tests/test_mlp.py +0 -282
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/WHEEL +0 -0
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/top_level.txt +0 -0

warp/build_dll.py CHANGED

@@ -17,16 +17,18 @@ import os
 import platform
 import subprocess
 import sys
+from typing import List, Optional
 from warp.utils import ScopedTimer
 verbose_cmd = True  # print command lines before executing them
-# returns a canonical machine architecture string
-# - "x86_64" for x86-64, aka. AMD64, aka. x64
-# - "aarch64" for AArch64, aka. ARM64
 def machine_architecture() -> str:
+    """Return a canonical machine architecture string.
+    - "x86_64" for x86-64, aka. AMD64, aka. x64
+    - "aarch64" for AArch64, aka. ARM64
+    """
     machine = platform.machine()
     if machine == "x86_64" or machine == "AMD64":
         return "x86_64"
@@ -103,10 +105,8 @@ def find_host_compiler():
         cl_required_major = 14
         cl_required_minor = 29
-        if (
-            (int(cl_version[0]) < cl_required_major)
-            or (int(cl_version[0]) == cl_required_major)
-            and int(cl_version[1]) < cl_required_minor
+        if int(cl_version[0]) < cl_required_major or (
+            (int(cl_version[0]) == cl_required_major) and (int(cl_version[1]) < cl_required_minor)
         ):
             print(
                 f"Warp: MSVC found but compiler version too old, found {cl_version[0]}.{cl_version[1]}, but must be {cl_required_major}.{cl_required_minor} or higher, kernel host compilation will be disabled."
@@ -142,22 +142,54 @@ def quote(path):
     return '"' + path + '"'
-def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None):
+def add_llvm_bin_to_path(args):
+    """Add the LLVM bin directory to the PATH environment variable if it's set.
+    Args:
+        args: The argument namespace containing llvm_path.
+    Returns:
+        ``True`` if the PATH was updated, ``False`` otherwise.
+    """
+    if not hasattr(args, "llvm_path") or not args.llvm_path:
+        return False
+    # Construct the bin directory path
+    llvm_bin_path = os.path.join(args.llvm_path, "bin")
+    # Check if the directory exists
+    if not os.path.isdir(llvm_bin_path):
+        print(f"Warning: LLVM bin directory not found at {llvm_bin_path}")
+        return False
+    # Add to PATH environment variable
+    os.environ["PATH"] = llvm_bin_path + os.pathsep + os.environ.get("PATH", "")
+    print(f"Added {llvm_bin_path} to PATH")
+    return True
+def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: Optional[List[str]] = None, mode=None):
     mode = args.mode if (mode is None) else mode
     cuda_home = args.cuda_path
     cuda_cmd = None
+    # Add LLVM bin directory to PATH
+    add_llvm_bin_to_path(args)
     if args.quick or cu_path is None:
         cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
     else:
         cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"
+    if libs is None:
+        libs = []
     import pathlib
     warp_home_path = pathlib.Path(__file__).parent
     warp_home = warp_home_path.resolve()
-    # output stale, rebuild
     if args.verbose:
         print(f"Building {dll_path}")
@@ -176,11 +208,16 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
             print("MathDx support requires at least CUDA 12, skipping")
             args.libmathdx_path = None
+        # NVCC gencode options
         gencode_opts = []
+        # Clang architecture flags
+        clang_arch_flags = []
         if args.quick:
             # minimum supported architectures (PTX)
             gencode_opts += ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
+            clang_arch_flags += ["--cuda-gpu-arch=sm_52", "--cuda-gpu-arch=sm_75"]
         else:
             # generate code for all supported architectures
             gencode_opts += [
@@ -193,6 +230,19 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
                 "-gencode=arch=compute_80,code=sm_80",  # Ampere
                 "-gencode=arch=compute_86,code=sm_86",
             ]
+            # TODO: Get this working with sm_52, sm_60, sm_61
+            clang_arch_flags += [
+                # SASS for supported desktop/datacenter architectures
+                "--cuda-gpu-arch=sm_52",
+                "--cuda-gpu-arch=sm_60",
+                "--cuda-gpu-arch=sm_61",
+                "--cuda-gpu-arch=sm_70",  # Volta
+                "--cuda-gpu-arch=sm_75",  # Turing
+                "--cuda-gpu-arch=sm_80",  # Ampere
+                "--cuda-gpu-arch=sm_86",
+            ]
             if arch == "aarch64" and sys.platform == "linux":
                 gencode_opts += [
                     # SASS for supported mobile architectures (e.g. Tegra/Jetson)
@@ -202,6 +252,14 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
                     "-gencode=arch=compute_87,code=sm_87",  # Orin
                 ]
+                clang_arch_flags += [
+                    # SASS for supported mobile architectures
+                    "--cuda-gpu-arch=sm_53",  # X1
+                    "--cuda-gpu-arch=sm_62",  # X2
+                    "--cuda-gpu-arch=sm_72",  # Xavier
+                    "--cuda-gpu-arch=sm_87",  # Orin
+                ]
             if ctk_version >= (12, 8):
                 # Support for Blackwell is available with CUDA Toolkit 12.8+
                 gencode_opts += [
@@ -211,6 +269,13 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
                     "-gencode=arch=compute_120,code=sm_120",  # Blackwell
                     "-gencode=arch=compute_120,code=compute_120",  # PTX for future hardware
                 ]
+                clang_arch_flags += [
+                    "--cuda-gpu-arch=sm_89",  # Ada
+                    "--cuda-gpu-arch=sm_90",  # Hopper
+                    "--cuda-gpu-arch=sm_100",  # Blackwell
+                    "--cuda-gpu-arch=sm_120",  # Blackwell
+                ]
             elif ctk_version >= (11, 8):
                 # Support for Ada and Hopper is available with CUDA Toolkit 11.8+
                 gencode_opts += [
@@ -218,16 +283,40 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
                     "-gencode=arch=compute_90,code=sm_90",  # Hopper
                     "-gencode=arch=compute_90,code=compute_90",  # PTX for future hardware
                 ]
+                clang_arch_flags += [
+                    "--cuda-gpu-arch=sm_89",  # Ada
+                    "--cuda-gpu-arch=sm_90",  # Hopper
+                ]
             else:
                 gencode_opts += [
                     "-gencode=arch=compute_86,code=compute_86",  # PTX for future hardware
                 ]
-        nvcc_opts = gencode_opts + [
+                clang_arch_flags += [
+                    "--cuda-gpu-arch=sm_86",  # PTX for future hardware
+                ]
+        nvcc_opts = [
+            *gencode_opts,
             "-t0",  # multithreaded compilation
             "--extended-lambda",
         ]
+        # Clang options
+        clang_opts = [
+            *clang_arch_flags,
+            "-std=c++17",
+            "-xcuda",
+            f'--cuda-path="{cuda_home}"',
+        ]
+        if args.compile_time_trace:
+            if ctk_version >= (12, 8):
+                nvcc_opts.append("--fdevice-time-trace=build_lib_compile-time-trace")
+            else:
+                print("Warp warning: CUDA version is less than 12.8, compile_time_trace is not supported")
         if args.fast_math:
             nvcc_opts.append("--use_fast_math")
@@ -304,13 +393,17 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
                 )
                 if args.libmathdx_path:
-                    linkopts.append(f'nvJitLink_static.lib /LIBPATH:"{args.libmathdx_path}/lib" mathdx_static.lib')
+                    linkopts.append(f'nvJitLink_static.lib /LIBPATH:"{args.libmathdx_path}/lib/x64" mathdx_static.lib')
         with ScopedTimer("link", active=args.verbose):
             link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
             run_cmd(link_cmd)
     else:
+        # Unix compilation
+        cuda_compiler = "clang++" if getattr(args, "clang_build_toolchain", False) else "nvcc"
+        cpp_compiler = "clang++" if getattr(args, "clang_build_toolchain", False) else "g++"
         cpp_includes = f' -I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
         cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
         cuda_includes = f' -I"{cuda_home}/include"' if cu_path else ""
@@ -319,9 +412,12 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
         if sys.platform == "darwin":
             version = f"--target={arch}-apple-macos11"
         else:
-            version = "-fabi-version=13"  # GCC 8.2+
+            if cpp_compiler == "g++":
+                version = "-fabi-version=13"  # GCC 8.2+
+            else:
+                version = ""
-        cpp_flags = f'{version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{mathdx_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} '
+        cpp_flags = f'-Werror -Wuninitialized {version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{mathdx_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} '
         if mode == "debug":
             cpp_flags += "-O0 -g -D_DEBUG -DWP_ENABLE_DEBUG=1 -fkeep-inline-functions"
@@ -342,17 +438,23 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
                 cpp_out = cpp_path + ".o"
                 ld_inputs.append(quote(cpp_out))
-                build_cmd = f'g++ {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
+                build_cmd = f'{cpp_compiler} {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
                 run_cmd(build_cmd)
         if cu_path:
             cu_out = cu_path + ".o"
-            if mode == "debug":
-                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
-            elif mode == "release":
-                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+            if cuda_compiler == "nvcc":
+                if mode == "debug":
+                    cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+                elif mode == "release":
+                    cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+            else:
+                # Use Clang compiler
+                if mode == "debug":
+                    cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -g -O0 -fPIC -fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+                elif mode == "release":
+                    cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -O3 -fPIC -fvisibility=hidden -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
             with ScopedTimer("build_cuda", active=args.verbose):
                 run_cmd(cuda_cmd)
@@ -374,7 +476,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
         with ScopedTimer("link", active=args.verbose):
             origin = "@loader_path" if (sys.platform == "darwin") else "$ORIGIN"
-            link_cmd = f"g++ {version} -shared -Wl,-rpath,'{origin}' {opt_no_undefined} {opt_exclude_libs} -o '{dll_path}' {' '.join(ld_inputs + libs)}"
+            link_cmd = f"{cpp_compiler} {version} -shared -Wl,-rpath,'{origin}' {opt_no_undefined} {opt_exclude_libs} -o '{dll_path}' {' '.join(ld_inputs + libs)}"
             run_cmd(link_cmd)
             # Strip symbols to reduce the binary size
@@ -389,17 +491,14 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
 def build_dll(args, dll_path, cpp_paths, cu_path, libs=None):
-    if libs is None:
-        libs = []
     if sys.platform == "darwin":
         # create a universal binary by combining x86-64 and AArch64 builds
-        build_dll_for_arch(args, dll_path + "-x86_64", cpp_paths, cu_path, libs, "x86_64")
-        build_dll_for_arch(args, dll_path + "-aarch64", cpp_paths, cu_path, libs, "aarch64")
+        build_dll_for_arch(args, dll_path + "-x86_64", cpp_paths, cu_path, "x86_64", libs)
+        build_dll_for_arch(args, dll_path + "-aarch64", cpp_paths, cu_path, "aarch64", libs)
         run_cmd(f"lipo -create -output {dll_path} {dll_path}-x86_64 {dll_path}-aarch64")
         os.remove(f"{dll_path}-x86_64")
         os.remove(f"{dll_path}-aarch64")
     else:
-        build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, machine_architecture())
+        build_dll_for_arch(args, dll_path, cpp_paths, cu_path, machine_architecture(), libs)