PyPI - warp-lang - Versions diffs - 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.1__py3-none-manylinux_2_34_aarch64.whl - Mend

warp-lang 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.1__py3-none-manylinux_2_34_aarch64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (141) hide show

warp/__init__.py +282 -103
warp/__init__.pyi +1904 -114
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +93 -30
warp/build_dll.py +331 -101
warp/builtins.py +1244 -160
warp/codegen.py +317 -206
warp/config.py +1 -1
warp/context.py +1465 -789
warp/examples/core/example_marching_cubes.py +1 -0
warp/examples/core/example_render_opengl.py +100 -3
warp/examples/fem/example_apic_fluid.py +98 -52
warp/examples/fem/example_convection_diffusion_dg.py +25 -4
warp/examples/fem/example_diffusion_mgpu.py +8 -3
warp/examples/fem/utils.py +68 -22
warp/examples/interop/example_jax_kernel.py +2 -1
warp/fabric.py +1 -1
warp/fem/cache.py +27 -19
warp/fem/domain.py +2 -2
warp/fem/field/nodal_field.py +2 -2
warp/fem/field/virtual.py +264 -166
warp/fem/geometry/geometry.py +5 -5
warp/fem/integrate.py +129 -51
warp/fem/space/restriction.py +4 -0
warp/fem/space/shape/tet_shape_function.py +3 -10
warp/jax_experimental/custom_call.py +25 -2
warp/jax_experimental/ffi.py +22 -1
warp/jax_experimental/xla_ffi.py +16 -7
warp/marching_cubes.py +708 -0
warp/native/array.h +99 -4
warp/native/builtin.h +86 -9
warp/native/bvh.cpp +64 -28
warp/native/bvh.cu +58 -58
warp/native/bvh.h +2 -2
warp/native/clang/clang.cpp +7 -7
warp/native/coloring.cpp +8 -2
warp/native/crt.cpp +2 -2
warp/native/crt.h +3 -5
warp/native/cuda_util.cpp +41 -10
warp/native/cuda_util.h +10 -4
warp/native/exports.h +1842 -1908
warp/native/fabric.h +2 -1
warp/native/hashgrid.cpp +37 -37
warp/native/hashgrid.cu +2 -2
warp/native/initializer_array.h +1 -1
warp/native/intersect.h +2 -2
warp/native/mat.h +1910 -116
warp/native/mathdx.cpp +43 -43
warp/native/mesh.cpp +24 -24
warp/native/mesh.cu +26 -26
warp/native/mesh.h +4 -2
warp/native/nanovdb/GridHandle.h +179 -12
warp/native/nanovdb/HostBuffer.h +8 -7
warp/native/nanovdb/NanoVDB.h +517 -895
warp/native/nanovdb/NodeManager.h +323 -0
warp/native/nanovdb/PNanoVDB.h +2 -2
warp/native/quat.h +331 -14
warp/native/range.h +7 -1
warp/native/reduce.cpp +10 -10
warp/native/reduce.cu +13 -14
warp/native/runlength_encode.cpp +2 -2
warp/native/runlength_encode.cu +5 -5
warp/native/scan.cpp +3 -3
warp/native/scan.cu +4 -4
warp/native/sort.cpp +10 -10
warp/native/sort.cu +40 -31
warp/native/sort.h +2 -0
warp/native/sparse.cpp +8 -8
warp/native/sparse.cu +13 -13
warp/native/spatial.h +366 -17
warp/native/temp_buffer.h +2 -2
warp/native/tile.h +471 -82
warp/native/vec.h +328 -14
warp/native/volume.cpp +54 -54
warp/native/volume.cu +1 -1
warp/native/volume.h +2 -1
warp/native/volume_builder.cu +30 -37
warp/native/warp.cpp +150 -149
warp/native/warp.cu +377 -216
warp/native/warp.h +227 -226
warp/optim/linear.py +736 -271
warp/render/imgui_manager.py +289 -0
warp/render/render_opengl.py +99 -18
warp/render/render_usd.py +1 -0
warp/sim/graph_coloring.py +2 -2
warp/sparse.py +558 -175
warp/tests/aux_test_module_aot.py +7 -0
warp/tests/cuda/test_async.py +3 -3
warp/tests/cuda/test_conditional_captures.py +101 -0
warp/tests/geometry/test_hash_grid.py +38 -0
warp/tests/geometry/test_marching_cubes.py +233 -12
warp/tests/interop/test_jax.py +608 -28
warp/tests/sim/test_coloring.py +6 -6
warp/tests/test_array.py +58 -5
warp/tests/test_codegen.py +4 -3
warp/tests/test_context.py +8 -15
warp/tests/test_enum.py +136 -0
warp/tests/test_examples.py +2 -2
warp/tests/test_fem.py +49 -6
warp/tests/test_fixedarray.py +229 -0
warp/tests/test_func.py +18 -15
warp/tests/test_future_annotations.py +7 -5
warp/tests/test_linear_solvers.py +30 -0
warp/tests/test_map.py +15 -1
warp/tests/test_mat.py +1518 -378
warp/tests/test_mat_assign_copy.py +178 -0
warp/tests/test_mat_constructors.py +574 -0
warp/tests/test_module_aot.py +287 -0
warp/tests/test_print.py +69 -0
warp/tests/test_quat.py +140 -34
warp/tests/test_quat_assign_copy.py +145 -0
warp/tests/test_reload.py +2 -1
warp/tests/test_sparse.py +71 -0
warp/tests/test_spatial.py +140 -34
warp/tests/test_spatial_assign_copy.py +160 -0
warp/tests/test_struct.py +43 -3
warp/tests/test_tuple.py +96 -0
warp/tests/test_types.py +61 -20
warp/tests/test_vec.py +179 -34
warp/tests/test_vec_assign_copy.py +143 -0
warp/tests/tile/test_tile.py +245 -18
warp/tests/tile/test_tile_cholesky.py +605 -0
warp/tests/tile/test_tile_load.py +169 -0
warp/tests/tile/test_tile_mathdx.py +2 -558
warp/tests/tile/test_tile_matmul.py +1 -1
warp/tests/tile/test_tile_mlp.py +1 -1
warp/tests/tile/test_tile_shared_memory.py +5 -5
warp/tests/unittest_suites.py +6 -0
warp/tests/walkthrough_debug.py +1 -1
warp/thirdparty/unittest_parallel.py +108 -9
warp/types.py +571 -267
warp/utils.py +68 -86
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
warp/native/marching.cpp +0 -19
warp/native/marching.cu +0 -514
warp/native/marching.h +0 -19
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0

warp/bin/warp-clang.so CHANGED Viewed

Binary file

warp/bin/warp.so CHANGED Viewed

Binary file

warp/build.py CHANGED Viewed

@@ -51,7 +51,7 @@ def build_cuda(
         output_path = output_path.encode("utf-8")
         if warp.config.llvm_cuda:
-            warp.context.runtime.llvm.compile_cuda(src, cu_path_bytes, inc_path, output_path, False)
+            warp.context.runtime.llvm.wp_compile_cuda(src, cu_path_bytes, inc_path, output_path, False)
         else:
             if ltoirs is None:
@@ -67,7 +67,7 @@ def build_cuda(
                 fatbins
             )
             arr_link_input_types = (ctypes.c_int * num_link)(*link_input_types)
-            err = warp.context.runtime.core.cuda_compile_program(
+            err = warp.context.runtime.core.wp_cuda_compile_program(
                 src,
                 program_name_bytes,
                 arch,
@@ -96,7 +96,7 @@ def load_cuda(input_path, device):
     if not device.is_cuda:
         raise RuntimeError("Not a CUDA device")
-    return warp.context.runtime.core.cuda_load_module(device.context, input_path.encode("utf-8"))
+    return warp.context.runtime.core.wp_cuda_load_module(device.context, input_path.encode("utf-8"))
 def build_cpu(obj_path, cpp_path, mode="release", verify_fp=False, fast_math=False, fuse_fp=True):
@@ -106,7 +106,7 @@ def build_cpu(obj_path, cpp_path, mode="release", verify_fp=False, fast_math=Fal
         inc_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "native").encode("utf-8")
         obj_path = obj_path.encode("utf-8")
-        err = warp.context.runtime.llvm.compile_cpp(
+        err = warp.context.runtime.llvm.wp_compile_cpp(
             src, cpp_path, inc_path, obj_path, mode == "debug", verify_fp, fuse_fp
         )
         if err != 0:
@@ -129,6 +129,15 @@ def init_kernel_cache(path=None):
     else:
         cache_root_dir = appdirs.user_cache_dir(appname="warp", appauthor="NVIDIA", version=warp.config.version)
+        if os.name == "nt" and os.path.isabs(cache_root_dir) and not cache_root_dir.startswith("\\\\?\\"):
+            # Add Windows long-path prefix, accounting for UNC shares.
+            if cache_root_dir.startswith("\\\\"):
+                # UNC path  \\server\share\…  →  \\?\UNC\server\share\…
+                cache_root_dir = "\\\\?\\UNC\\" + cache_root_dir.lstrip("\\")
+            else:
+                # Drive-letter path  C:\…  →  \\?\C:\…
+                cache_root_dir = "\\\\?\\" + cache_root_dir
     warp.config.kernel_cache_dir = cache_root_dir
     os.makedirs(warp.config.kernel_cache_dir, exist_ok=True)
@@ -246,7 +255,12 @@ def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
             the cached file data.
     Returns:
-        Tuple containing lto_code_data followed by any extra data from extra_files
+        Tuple where the first element is a success flag (``bool``). The second
+        element is the LTO code as bytes (or ``None`` on failure).
+        If ``extra_files`` is provided, additional elements follow in the same
+        order as the keys in ``extra_files``:
+          - ``".meta"``: int (shared memory bytes).
+          - ``"_fatbin.lto"``: bytes (universal fatbin).
     """
     if extra_files is None:
         extra_files = {}
@@ -283,9 +297,9 @@ def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
         if all_files_cached:
             if not extra_files:
-                return (lto_code_data,)
+                return (True, lto_code_data)
             else:
-                return (lto_code_data, *[extra_files[ext] for ext in extra_files.keys()])
+                return (True, lto_code_data, *[extra_files[ext] for ext in extra_files.keys()])
     # Create process-dependent temporary build directory
     build_dir = f"{lto_dir}_p{os.getpid()}"
@@ -303,21 +317,24 @@ def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
         for path in temp_file_paths.values():
             if Path(path).exists():
                 Path(path).unlink()
-        raise RuntimeError(f"Failed to compile {lto_symbol}")
-    # Move outputs to cache
-    safe_rename(build_dir, lto_dir)
-    # If build_dir couldn't be moved by a rename, move the outputs one-by-one to lto_dir
-    if os.path.exists(lto_dir):
-        for ext, path in file_paths.items():
-            if not os.path.exists(path):
-                try:
-                    # copy output file to the destination lto dir
-                    os.rename(temp_file_paths[ext], path)
-                except (OSError, FileExistsError):
-                    # another process likely updated the lto dir first
-                    pass
+        outputs[".lto"] = None
+        for ext in extra_files.keys():
+            outputs[ext] = None
+    else:
+        # Move outputs to cache
+        safe_rename(build_dir, lto_dir)
+        # If build_dir couldn't be moved by a rename, move the outputs one-by-one to lto_dir
+        if os.path.exists(lto_dir):
+            for ext, path in file_paths.items():
+                if not os.path.exists(path):
+                    try:
+                        # copy output file to the destination lto dir
+                        os.rename(temp_file_paths[ext], path)
+                    except (OSError, FileExistsError):
+                        # another process likely updated the lto dir first
+                        pass
     # Clean up the temporary build directory
     if build_dir:
@@ -326,9 +343,9 @@ def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
         shutil.rmtree(build_dir, ignore_errors=True)
     if not extra_files:
-        return (outputs[".lto"],)
+        return (result, outputs[".lto"])
     else:
-        return (outputs[".lto"], *[outputs[ext] for ext in extra_files.keys()])
+        return (result, outputs[".lto"], *[outputs[ext] for ext in extra_files.keys()])
 def build_lto_dot(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout, arch, num_threads, builder):
@@ -372,7 +389,7 @@ def build_lto_dot(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout, ar
     lto_symbol = f"dot_{M}_{N}_{K}_{arch}_{num_threads}_{a_arrangement}_{b_arrangement}_{c_arrangement}_{a_prec}_{b_prec}_{c_prec}_{element_type}"
     def compile_lto_dot(temp_paths):
-        result = warp.context.runtime.core.cuda_compile_dot(
+        result = warp.context.runtime.core.wp_cuda_compile_dot(
             temp_paths[".lto"].encode("utf-8"),
             lto_symbol.encode("utf-8"),
             0,
@@ -402,7 +419,13 @@ def build_lto_dot(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout, ar
     if lto_symbol in builder.ltoirs:
         lto_code_data = builder.ltoirs[lto_symbol]
     else:
-        (lto_code_data,) = _build_lto_base(lto_symbol, compile_lto_dot, builder, {})
+        (result, lto_code_data) = _build_lto_base(lto_symbol, compile_lto_dot, builder, {})
+        if not result:
+            raise RuntimeError(
+                f"Failed to compile LTO '{lto_symbol}'. "
+                "Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
+            )
         # Update builder
         builder.ltoirs[lto_symbol] = lto_code_data
@@ -429,6 +452,7 @@ def build_lto_solver(
     num_threads,
     parameter_list,
     builder,
+    smem_estimate_bytes=None,
 ):
     arch = 120 if arch > 121 else arch
@@ -446,7 +470,7 @@ def build_lto_solver(
     def compile_lto_solver(temp_paths):
         # compile LTO
-        result = warp.context.runtime.core.cuda_compile_solver(
+        result = warp.context.runtime.core.wp_cuda_compile_solver(
             temp_paths["_fatbin.lto"].encode("utf-8"),
             temp_paths[".lto"].encode("utf-8"),
             lto_symbol.encode("utf-8"),
@@ -479,10 +503,43 @@ def build_lto_solver(
     if lto_symbol in builder.ltoirs:
         lto_code_data = builder.ltoirs[lto_symbol]
     else:
-        lto_code_data, universal_fatbin_code_data = _build_lto_base(
+        (result, lto_code_data, universal_fatbin_code_data) = _build_lto_base(
             lto_symbol, compile_lto_solver, builder, {"_fatbin.lto": get_cached_lto}
         )
+        if not result:
+            hint = ""
+            if smem_estimate_bytes:
+                max_smem_bytes = 232448
+                max_smem_is_estimate = True
+                for d in warp.get_cuda_devices():
+                    if d.arch == arch:
+                        # We can directly query the max shared memory for this device
+                        queried_bytes = warp.context.runtime.core.wp_cuda_get_max_shared_memory(d.context)
+                        if queried_bytes > 0:
+                            max_smem_bytes = queried_bytes
+                            max_smem_is_estimate = False
+                            break
+                if smem_estimate_bytes > max_smem_bytes:
+                    source = "estimated limit" if max_smem_is_estimate else "device-reported limit"
+                    hint = (
+                        f"Estimated shared memory requirement is {smem_estimate_bytes}B, "
+                        f"but the {source} is {max_smem_bytes}B. "
+                        "The tile size(s) may be too large for this device."
+                    )
+            if warp.context.runtime.toolkit_version < (12, 6):
+                raise RuntimeError(
+                    "cuSolverDx requires CUDA Toolkit 12.6.3 or later. This version of Warp was built against CUDA Toolkit "
+                    f"{warp.context.runtime.toolkit_version[0]}.{warp.context.runtime.toolkit_version[1]}. "
+                    "Upgrade your CUDA Toolkit and rebuild Warp, or install a Warp wheel built with CUDA >= 12.6.3."
+                )
+            else:
+                raise RuntimeError(
+                    f"Failed to compile LTO '{lto_symbol}'. {hint}"
+                    " Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
+                )
         # Update builder
         builder.ltoirs[lto_symbol] = lto_code_data
         builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}{parameter_list};"
@@ -499,7 +556,7 @@ def build_lto_fft(arch, size, ept, direction, dir, precision, builder):
     def compile_lto_fft(temp_paths):
         shared_memory_size = ctypes.c_int(0)
-        result = warp.context.runtime.core.cuda_compile_fft(
+        result = warp.context.runtime.core.wp_cuda_compile_fft(
             temp_paths[".lto"].encode("utf-8"),
             lto_symbol.encode("utf-8"),
             0,
@@ -535,10 +592,16 @@ def build_lto_fft(arch, size, ept, direction, dir, precision, builder):
         lto_code_data = builder.ltoirs[lto_symbol]
         shared_memory_bytes = builder.shared_memory_bytes[lto_symbol]
     else:
-        lto_code_data, shared_memory_bytes = _build_lto_base(
+        (result, lto_code_data, shared_memory_bytes) = _build_lto_base(
             lto_symbol, compile_lto_fft, builder, {".meta": lambda path: get_cached_lto_meta(path, lto_symbol)}
         )
+        if not result:
+            raise RuntimeError(
+                f"Failed to compile LTO '{lto_symbol}'."
+                "Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
+            )
         # Update builder
         builder.ltoirs[lto_symbol] = lto_code_data
         builder.shared_memory_bytes[lto_symbol] = shared_memory_bytes