warp-lang 1.4.2-py3-none-manylinux2014_aarch64.whl → 1.5.0-py3-none-manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of warp-lang might be problematic.
- warp/__init__.py +4 -0
- warp/autograd.py +43 -8
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +21 -2
- warp/build_dll.py +23 -6
- warp/builtins.py +1783 -2
- warp/codegen.py +177 -45
- warp/config.py +2 -2
- warp/context.py +321 -73
- warp/examples/assets/pixel.jpg +0 -0
- warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
- warp/examples/benchmarks/benchmark_gemm.py +121 -0
- warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
- warp/examples/benchmarks/benchmark_tile.py +179 -0
- warp/examples/fem/example_adaptive_grid.py +37 -10
- warp/examples/fem/example_apic_fluid.py +3 -2
- warp/examples/fem/example_convection_diffusion_dg.py +4 -5
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion_3d.py +47 -4
- warp/examples/fem/example_distortion_energy.py +220 -0
- warp/examples/fem/example_magnetostatics.py +127 -85
- warp/examples/fem/example_nonconforming_contact.py +5 -5
- warp/examples/fem/example_stokes.py +3 -1
- warp/examples/fem/example_streamlines.py +12 -19
- warp/examples/fem/utils.py +38 -15
- warp/examples/sim/example_cloth.py +2 -25
- warp/examples/sim/example_quadruped.py +2 -1
- warp/examples/tile/example_tile_convolution.py +58 -0
- warp/examples/tile/example_tile_fft.py +47 -0
- warp/examples/tile/example_tile_filtering.py +105 -0
- warp/examples/tile/example_tile_matmul.py +79 -0
- warp/examples/tile/example_tile_mlp.py +375 -0
- warp/fem/__init__.py +8 -0
- warp/fem/cache.py +16 -12
- warp/fem/dirichlet.py +1 -1
- warp/fem/domain.py +44 -1
- warp/fem/field/__init__.py +1 -2
- warp/fem/field/field.py +31 -19
- warp/fem/field/nodal_field.py +101 -49
- warp/fem/field/virtual.py +794 -0
- warp/fem/geometry/__init__.py +2 -2
- warp/fem/geometry/deformed_geometry.py +3 -105
- warp/fem/geometry/element.py +13 -0
- warp/fem/geometry/geometry.py +165 -5
- warp/fem/geometry/grid_2d.py +3 -6
- warp/fem/geometry/grid_3d.py +31 -28
- warp/fem/geometry/hexmesh.py +3 -46
- warp/fem/geometry/nanogrid.py +3 -2
- warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
- warp/fem/geometry/tetmesh.py +2 -43
- warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
- warp/fem/integrate.py +683 -261
- warp/fem/linalg.py +404 -0
- warp/fem/operator.py +101 -18
- warp/fem/polynomial.py +5 -5
- warp/fem/quadrature/quadrature.py +45 -21
- warp/fem/space/__init__.py +45 -11
- warp/fem/space/basis_function_space.py +451 -0
- warp/fem/space/basis_space.py +58 -11
- warp/fem/space/function_space.py +146 -5
- warp/fem/space/grid_2d_function_space.py +80 -66
- warp/fem/space/grid_3d_function_space.py +113 -68
- warp/fem/space/hexmesh_function_space.py +96 -108
- warp/fem/space/nanogrid_function_space.py +62 -110
- warp/fem/space/quadmesh_function_space.py +208 -0
- warp/fem/space/shape/__init__.py +45 -7
- warp/fem/space/shape/cube_shape_function.py +328 -54
- warp/fem/space/shape/shape_function.py +10 -1
- warp/fem/space/shape/square_shape_function.py +328 -60
- warp/fem/space/shape/tet_shape_function.py +269 -19
- warp/fem/space/shape/triangle_shape_function.py +238 -19
- warp/fem/space/tetmesh_function_space.py +69 -37
- warp/fem/space/topology.py +38 -0
- warp/fem/space/trimesh_function_space.py +179 -0
- warp/fem/utils.py +6 -331
- warp/jax_experimental.py +3 -1
- warp/native/array.h +15 -0
- warp/native/builtin.h +66 -26
- warp/native/bvh.h +4 -0
- warp/native/coloring.cpp +600 -0
- warp/native/cuda_util.cpp +14 -0
- warp/native/cuda_util.h +2 -1
- warp/native/fabric.h +8 -0
- warp/native/hashgrid.h +4 -0
- warp/native/marching.cu +8 -0
- warp/native/mat.h +14 -3
- warp/native/mathdx.cpp +59 -0
- warp/native/mesh.h +4 -0
- warp/native/range.h +13 -1
- warp/native/reduce.cpp +9 -1
- warp/native/reduce.cu +7 -0
- warp/native/runlength_encode.cpp +9 -1
- warp/native/runlength_encode.cu +7 -1
- warp/native/scan.cpp +8 -0
- warp/native/scan.cu +8 -0
- warp/native/scan.h +8 -1
- warp/native/sparse.cpp +8 -0
- warp/native/sparse.cu +8 -0
- warp/native/temp_buffer.h +7 -0
- warp/native/tile.h +1857 -0
- warp/native/tile_gemm.h +341 -0
- warp/native/tile_reduce.h +210 -0
- warp/native/volume_builder.cu +8 -0
- warp/native/volume_builder.h +8 -0
- warp/native/warp.cpp +10 -2
- warp/native/warp.cu +369 -15
- warp/native/warp.h +12 -2
- warp/optim/adam.py +39 -4
- warp/paddle.py +29 -12
- warp/render/render_opengl.py +137 -65
- warp/sim/graph_coloring.py +292 -0
- warp/sim/integrator_euler.py +4 -2
- warp/sim/integrator_featherstone.py +115 -44
- warp/sim/integrator_vbd.py +6 -0
- warp/sim/model.py +88 -15
- warp/stubs.py +569 -4
- warp/tape.py +12 -7
- warp/tests/assets/pixel.npy +0 -0
- warp/tests/aux_test_instancing_gc.py +18 -0
- warp/tests/test_array.py +39 -0
- warp/tests/test_codegen.py +81 -1
- warp/tests/test_codegen_instancing.py +30 -0
- warp/tests/test_collision.py +110 -0
- warp/tests/test_coloring.py +241 -0
- warp/tests/test_context.py +34 -0
- warp/tests/test_examples.py +18 -4
- warp/tests/test_fem.py +453 -113
- warp/tests/test_func.py +13 -0
- warp/tests/test_generics.py +52 -0
- warp/tests/test_iter.py +68 -0
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_mesh_query_point.py +1 -1
- warp/tests/test_module_hashing.py +23 -0
- warp/tests/test_paddle.py +27 -87
- warp/tests/test_print.py +56 -1
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_tile.py +700 -0
- warp/tests/test_tile_mathdx.py +144 -0
- warp/tests/test_tile_mlp.py +383 -0
- warp/tests/test_tile_reduce.py +374 -0
- warp/tests/test_tile_shared_memory.py +190 -0
- warp/tests/test_vbd.py +12 -20
- warp/tests/test_volume.py +43 -0
- warp/tests/unittest_suites.py +19 -2
- warp/tests/unittest_utils.py +4 -0
- warp/types.py +338 -72
- warp/utils.py +22 -1
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/RECORD +153 -126
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
- warp/fem/field/test.py +0 -180
- warp/fem/field/trial.py +0 -183
- warp/fem/space/collocated_function_space.py +0 -102
- warp/fem/space/quadmesh_2d_function_space.py +0 -261
- warp/fem/space/trimesh_2d_function_space.py +0 -153
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
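Most of the new files above belong to the tile programming support introduced in 1.5.0 (warp/native/tile.h, the warp/examples/tile/ samples, the test_tile*.py suites), plus MathDx-backed FFT/GEMM code generation and graph-coloring utilities. For orientation, here is a minimal sketch of a tile-style GEMM kernel in the spirit of the added example_tile_matmul.py; the builtins and signatures used (wp.tile_zeros, wp.tile_load, wp.tile_matmul, wp.tile_store, wp.launch_tiled) are assumptions inferred from the file names above, not copied from the shipped examples:

    import numpy as np
    import warp as wp

    wp.init()

    # illustrative tile sizes; one thread block cooperates on each output tile
    TILE_M = wp.constant(8)
    TILE_N = wp.constant(8)
    TILE_K = wp.constant(8)
    TILE_THREADS = 64

    @wp.kernel
    def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
        # each launch index (i, j) produces one TILE_M x TILE_N tile of C
        i, j = wp.tid()
        acc = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=float)
        count = int(A.shape[1] / TILE_K)
        for k in range(count):
            a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
            b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
            wp.tile_matmul(a, b, acc)  # acc += a @ b
        wp.tile_store(C, i, j, acc)

    M = N = K = 64
    A = wp.array(np.random.rand(M, K), dtype=float)
    B = wp.array(np.random.rand(K, N), dtype=float)
    C = wp.zeros(shape=(M, N), dtype=float)

    # tile kernels are launched over the grid of output tiles, with block_dim
    # threads cooperating on each tile
    wp.launch_tiled(tile_gemm, dim=[int(M / TILE_M), int(N / TILE_N)], inputs=[A, B, C], block_dim=TILE_THREADS)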
warp/native/warp.cu
CHANGED

@@ -11,9 +11,16 @@
 #include "cuda_util.h"
 #include "error.h"
 
+#include <cstdlib>
+#include <fstream>
 #include <nvrtc.h>
 #include <nvPTXCompiler.h>
+#if WP_ENABLE_MATHDX
+#include <nvJitLink.h>
+#include <libmathdx.h>
+#endif
 
+#include <array>
 #include <algorithm>
 #include <iterator>
 #include <list>
@@ -23,8 +30,39 @@
 #include <unordered_set>
 #include <vector>
 
+#define check_any(result) (check_generic(result, __FILE__, __LINE__))
 #define check_nvrtc(code) (check_nvrtc_result(code, __FILE__, __LINE__))
 #define check_nvptx(code) (check_nvptx_result(code, __FILE__, __LINE__))
+#define check_nvjitlink(handle, code) (check_nvjitlink_result(handle, code, __FILE__, __LINE__))
+#define check_cufftdx(code) (check_cufftdx_result(code, __FILE__, __LINE__))
+#define check_cublasdx(code) (check_cublasdx_result(code, __FILE__, __LINE__))
+#define CHECK_ANY(code) \
+    { \
+        do { \
+            bool out = (check_any(code)); \
+            if(!out) { \
+                return out; \
+            } \
+        } while(0); \
+    }
+#define CHECK_CUFFTDX(code) \
+    { \
+        do { \
+            bool out = (check_cufftdx(code)); \
+            if(!out) { \
+                return out; \
+            } \
+        } while(0); \
+    }
+#define CHECK_CUBLASDX(code) \
+    { \
+        do { \
+            bool out = (check_cufftdx(code)); \
+            if(!out) { \
+                return out; \
+            } \
+        } while(0); \
+    }
 
 bool check_nvrtc_result(nvrtcResult result, const char* file, int line)
 {
@@ -74,6 +112,15 @@ bool check_nvptx_result(nvPTXCompileResult result, const char* file, int line)
     return false;
 }
 
+bool check_generic(int result, const char* file, int line)
+{
+    if (!result) {
+        fprintf(stderr, "Error %d on %s:%d\n", (int)result, file, line);
+        return false;
+    } else {
+        return true;
+    }
+}
 
 struct DeviceInfo
 {
@@ -89,6 +136,7 @@ struct DeviceInfo
     int arch = 0;
     int is_uva = 0;
     int is_mempool_supported = 0;
+    int max_smem_bytes = 0;
     CUcontext primary_context = NULL;
 };
 
@@ -202,6 +250,7 @@ int cuda_init()
         check_cu(cuDeviceGetAttribute_f(&g_devices[i].pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device));
         check_cu(cuDeviceGetAttribute_f(&g_devices[i].is_uva, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, device));
         check_cu(cuDeviceGetAttribute_f(&g_devices[i].is_mempool_supported, CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, device));
+        check_cu(cuDeviceGetAttribute_f(&g_devices[i].max_smem_bytes, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device));
         int major = 0;
         int minor = 0;
         check_cu(cuDeviceGetAttribute_f(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
@@ -2520,11 +2569,57 @@ bool cuda_graph_destroy(void* context, void* graph_exec)
     return check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_exec));
 }
 
-size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path)
+bool write_file(const char* data, size_t size, std::string filename, const char* mode)
+{
+    const bool print_debug = (std::getenv("WARP_DEBUG") != nullptr);
+    if (print_debug)
+    {
+        printf("Writing %zu B to %s (%s)\n", size, filename.c_str(), mode);
+    }
+    FILE* file = fopen(filename.c_str(), mode);
+    if (file)
+    {
+        if (fwrite(data, 1, size, file) != size) {
+            fprintf(stderr, "Warp error: Failed to write to output file '%s'\n", filename.c_str());
+            return false;
+        }
+        fclose(file);
+        return true;
+    }
+    else
+    {
+        fprintf(stderr, "Warp error: Failed to open file '%s'\n", filename.c_str());
+        return false;
+    }
+}
+
+#if WP_ENABLE_MATHDX
+bool check_nvjitlink_result(nvJitLinkHandle handle, nvJitLinkResult result, const char* file, int line)
+{
+    if (result != NVJITLINK_SUCCESS) {
+        fprintf(stderr, "nvJitLink error: %d on %s:%d\n", (int)result, file, line);
+        size_t lsize;
+        result = nvJitLinkGetErrorLogSize(handle, &lsize);
+        if (result == NVJITLINK_SUCCESS && lsize > 0) {
+            std::vector<char> log(lsize);
+            result = nvJitLinkGetErrorLog(handle, log.data());
+            if (result == NVJITLINK_SUCCESS) {
+                fprintf(stderr, "%s\n", log.data());
+            }
+        }
+        return false;
+    } else {
+        return true;
+    }
+}
+#endif
+
+size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes)
 {
     // use file extension to determine whether to output PTX or CUBIN
     const char* output_ext = strrchr(output_path, '.');
     bool use_ptx = output_ext && strcmp(output_ext + 1, "ptx") == 0;
+    const bool print_debug = (std::getenv("WARP_DEBUG") != nullptr);
 
     // check include dir path len (path + option)
     const int max_path = 4096 + 16;
@@ -2534,17 +2629,37 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_
         return size_t(-1);
     }
 
+    if (print_debug)
+    {
+        // Not available in all nvJitLink versions
+        // unsigned major = 0;
+        // unsigned minor = 0;
+        // nvJitLinkVersion(&major, &minor);
+        // printf("nvJitLink version %d.%d\n", major, minor);
+        int major = 0;
+        int minor = 0;
+        nvrtcVersion(&major, &minor);
+        printf("NVRTC version %d.%d\n", major, minor);
+    }
+
     char include_opt[max_path];
     strcpy(include_opt, "--include-path=");
     strcat(include_opt, include_dir);
 
     const int max_arch = 128;
     char arch_opt[max_arch];
+    char arch_opt_lto[max_arch];
 
     if (use_ptx)
+    {
         snprintf(arch_opt, max_arch, "--gpu-architecture=compute_%d", arch);
+        snprintf(arch_opt_lto, max_arch, "-arch=compute_%d", arch);
+    }
     else
+    {
         snprintf(arch_opt, max_arch, "--gpu-architecture=sm_%d", arch);
+        snprintf(arch_opt_lto, max_arch, "-arch=sm_%d", arch);
+    }
 
     std::vector<const char*> opts;
     opts.push_back(arch_opt);
@@ -2555,6 +2670,7 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_
     {
         opts.push_back("--define-macro=_DEBUG");
         opts.push_back("--generate-line-info");
+
         // disabling since it causes issues with `Unresolved extern function 'cudaGetParameterBufferV2'
         //opts.push_back("--device-debug");
     }
@@ -2569,6 +2685,26 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_
     if (fast_math)
         opts.push_back("--use_fast_math");
 
+    char include_cutlass[max_path];
+    sprintf(include_cutlass, "--include-path=%s/cutlass/include", include_dir);
+    opts.push_back(include_cutlass);
+
+    std::vector<std::string> cuda_include_opt;
+    for(int i = 0; i < num_cuda_include_dirs; i++)
+    {
+        cuda_include_opt.push_back(std::string("--include-path=") + cuda_include_dirs[i]);
+        opts.push_back(cuda_include_opt.back().c_str());
+    }
+
+    opts.push_back("--device-as-default-execution-space");
+    opts.push_back("--extra-device-vectorization");
+    opts.push_back("--restrict");
+
+    if (num_ltoirs > 0)
+    {
+        opts.push_back("-dlto");
+        opts.push_back("--relocatable-device-code=true");
+    }
 
     nvrtcProgram prog;
     nvrtcResult res;
@@ -2584,6 +2720,13 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_
     if (!check_nvrtc(res))
         return size_t(res);
 
+    if (print_debug)
+    {
+        printf("NVRTC options:\n");
+        for(auto o: opts) {
+            printf("%s\n", o);
+        }
+    }
     res = nvrtcCompileProgram(prog, int(opts.size()), opts.data());
 
     if (!check_nvrtc(res) || verbose)
@@ -2613,7 +2756,17 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_
     nvrtcResult (*get_output_size)(nvrtcProgram, size_t*);
     nvrtcResult (*get_output_data)(nvrtcProgram, char*);
     const char* output_mode;
-    if (use_ptx)
+    if(num_ltoirs > 0) {
+#if WP_ENABLE_MATHDX
+        get_output_size = nvrtcGetLTOIRSize;
+        get_output_data = nvrtcGetLTOIR;
+        output_mode = "wb";
+#else
+        fprintf(stderr, "Warp error: num_ltoirs > 0 but Warp was not built with MathDx support\n");
+        return size_t(-1);
+#endif
+    }
+    else if (use_ptx)
     {
         get_output_size = nvrtcGetPTXSize;
         get_output_data = nvrtcGetPTX;
@@ -2635,19 +2788,78 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_
     res = get_output_data(prog, output.data());
     if (check_nvrtc(res))
     {
-
-
+
+        // LTOIR case - need an extra step
+        if (num_ltoirs > 0)
         {
-
+#if WP_ENABLE_MATHDX
+            nvJitLinkHandle handle;
+            std::vector<const char *> lopts = {"-dlto", arch_opt_lto};
+            if (use_ptx) {
+                lopts.push_back("-ptx");
+            }
+            if (print_debug)
+            {
+                printf("nvJitLink options:\n");
+                for(auto o: lopts) {
+                    printf("%s\n", o);
+                }
+            }
+            if(!check_nvjitlink(handle, nvJitLinkCreate(&handle, lopts.size(), lopts.data())))
+            {
+                res = nvrtcResult(-1);
+            }
+            // Links
+            if(std::getenv("WARP_DUMP_LTOIR"))
+            {
+                write_file(output.data(), output.size(), "nvrtc_output.ltoir", "wb");
+            }
+            if(!check_nvjitlink(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, output.data(), output.size(), "nvrtc_output"))) // NVRTC business
             {
-                fprintf(stderr, "Warp error: Failed to write output file '%s'\n", output_path);
                 res = nvrtcResult(-1);
             }
-
+            for(size_t ltoidx = 0; ltoidx < num_ltoirs; ltoidx++)
+            {
+                if(std::getenv("WARP_DUMP_LTOIR"))
+                {
+                    write_file(ltoirs[ltoidx], ltoir_sizes[ltoidx], std::string("lto_online_") + std::to_string(ltoidx) + ".ltoir", "wb");
+                }
+                if(!check_nvjitlink(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, ltoirs[ltoidx], ltoir_sizes[ltoidx], "lto_online"))) // External LTOIR
+                {
+                    res = nvrtcResult(-1);
+                }
+            }
+            if(!check_nvjitlink(handle, nvJitLinkComplete(handle)))
+            {
+                res = nvrtcResult(-1);
+            }
+            else
+            {
+                if(use_ptx)
+                {
+                    size_t ptx_size = 0;
+                    check_nvjitlink(handle, nvJitLinkGetLinkedPtxSize(handle, &ptx_size));
+                    std::vector<char> ptx(ptx_size);
+                    check_nvjitlink(handle, nvJitLinkGetLinkedPtx(handle, ptx.data()));
+                    output = ptx;
+                }
+                else
+                {
+                    size_t cubin_size = 0;
+                    check_nvjitlink(handle, nvJitLinkGetLinkedCubinSize(handle, &cubin_size));
+                    std::vector<char> cubin(cubin_size);
+                    check_nvjitlink(handle, nvJitLinkGetLinkedCubin(handle, cubin.data()));
+                    output = cubin;
+                }
+            }
+            check_nvjitlink(handle, nvJitLinkDestroy(&handle));
+#else
+            fprintf(stderr, "Warp error: num_ltoirs > 0 but Warp was not built with MathDx support\n");
+            return size_t(-1);
+#endif
        }
-
-        {
-            fprintf(stderr, "Warp error: Failed to open output file '%s'\n", output_path);
+
+        if(!write_file(output.data(), output.size(), output_path, output_mode)) {
            res = nvrtcResult(-1);
        }
     }
@@ -2658,6 +2870,119 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_
     return res;
 }
 
+#if WP_ENABLE_MATHDX
+bool check_cufftdx_result(commonDxStatusType result, const char* file, int line)
+{
+    if (result != commonDxStatusType::COMMONDX_SUCCESS) {
+        fprintf(stderr, "libmathdx cuFFTDx error: %d on %s:%d\n", (int)result, file, line);
+        return false;
+    } else {
+        return true;
+    }
+}
+
+bool check_cublasdx_result(commonDxStatusType result, const char* file, int line)
+{
+    if (result != commonDxStatusType::COMMONDX_SUCCESS) {
+        fprintf(stderr, "libmathdx cuBLASDx error: %d on %s:%d\n", (int)result, file, line);
+        return false;
+    } else {
+        return true;
+    }
+}
+
+bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size)
+{
+
+    CHECK_ANY(ltoir_output_path != nullptr);
+    CHECK_ANY(symbol_name != nullptr);
+    CHECK_ANY(shared_memory_size != nullptr);
+    // Includes currently unused
+    CHECK_ANY(include_dirs == nullptr);
+    CHECK_ANY(mathdx_include_dir == nullptr);
+    CHECK_ANY(num_include_dirs == 0);
+
+    bool res = true;
+    cufftdxHandle h;
+    CHECK_CUFFTDX(cufftDxCreate(&h));
+
+    // CUFFTDX_API_BLOCK_LMEM means each thread starts with a subset of the data
+    CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_API, cufftDxApi::CUFFTDX_API_BLOCK_LMEM));
+    CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_EXECUTION, commonDxExecution::COMMONDX_EXECUTION_BLOCK));
+    CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_SIZE, (long long)size));
+    CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_DIRECTION, (cufftDxDirection)direction));
+    CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_PRECISION, (commonDxPrecision)precision));
+    CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_SM, (long long)(arch * 10)));
+    CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_ELEMENTS_PER_THREAD, (long long)(elements_per_thread)));
+    CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_FFTS_PER_BLOCK, 1));
+
+    CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name));
+
+    size_t lto_size = 0;
+    CHECK_CUFFTDX(cufftDxGetLTOIRSize(h, &lto_size));
+
+    std::vector<char> lto(lto_size);
+    CHECK_CUFFTDX(cufftDxGetLTOIR(h, lto.size(), lto.data()));
+
+    long long int smem = 0;
+    CHECK_CUFFTDX(cufftDxGetTraitInt64(h, cufftDxTraitType::CUFFTDX_TRAIT_SHARED_MEMORY_SIZE, &smem));
+    *shared_memory_size = (int)smem;
+
+    if(!write_file(lto.data(), lto.size(), ltoir_output_path, "wb")) {
+        res = false;
+    }
+
+    CHECK_CUFFTDX(cufftDxDestroy(h));
+
+    return res;
+}
+
+bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads)
+{
+
+    CHECK_ANY(ltoir_output_path != nullptr);
+    CHECK_ANY(symbol_name != nullptr);
+    // Includes currently unused
+    CHECK_ANY(include_dirs == nullptr);
+    CHECK_ANY(mathdx_include_dir == nullptr);
+    CHECK_ANY(num_include_dirs == 0);
+
+    bool res = true;
+    cublasdxHandle h;
+    CHECK_CUBLASDX(cublasDxCreate(&h));
+
+    CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_FUNCTION, cublasDxFunction::CUBLASDX_FUNCTION_MM));
+    CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_EXECUTION, commonDxExecution::COMMONDX_EXECUTION_BLOCK));
+    CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_API, cublasDxApi::CUBLASDX_API_BLOCK_SMEM));
+    std::array<long long int, 3> precisions = {precision_A, precision_B, precision_C};
+    CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_PRECISION, 3, precisions.data()));
+    CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_SM, (long long)(arch * 10)));
+    CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_TYPE, (cublasDxType)type));
+    std::array<long long int, 3> block_dim = {num_threads, 1, 1};
+    CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_BLOCK_DIM, block_dim.size(), block_dim.data()));
+    std::array<long long int, 3> size = {M, N, K};
+    CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_SIZE, size.size(), size.data()));
+    std::array<long long int, 3> arrangement = {arrangement_A, arrangement_B, arrangement_C};
+    CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_ARRANGEMENT, arrangement.size(), arrangement.data()));
+
+    CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name));
+
+    size_t lto_size = 0;
+    CHECK_CUBLASDX(cublasDxGetLTOIRSize(h, &lto_size));
+
+    std::vector<char> lto(lto_size);
+    CHECK_CUBLASDX(cublasDxGetLTOIR(h, lto.size(), lto.data()));
+
+    if(!write_file(lto.data(), lto.size(), ltoir_output_path, "wb")) {
+        res = false;
+    }
+
+    CHECK_CUBLASDX(cublasDxDestroy(h));
+
+    return res;
+}
+#endif
+
 void* cuda_load_module(void* context, const char* path)
 {
     ContextGuard guard(context);
@@ -2784,6 +3109,29 @@ void cuda_unload_module(void* context, void* module)
     check_cu(cuModuleUnload_f((CUmodule)module));
 }
 
+
+int cuda_get_max_shared_memory(void* context)
+{
+    ContextInfo* info = get_context_info(context);
+    if (!info)
+        return -1;
+
+    int max_smem_bytes = info->device_info->max_smem_bytes;
+    return max_smem_bytes;
+}
+
+bool cuda_configure_kernel_shared_memory(void* kernel, int size)
+{
+    int requested_smem_bytes = size;
+
+    // configure shared memory
+    CUresult res = cuFuncSetAttribute_f((CUfunction)kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, requested_smem_bytes);
+    if (res != CUDA_SUCCESS)
+        return false;
+
+    return true;
+}
+
 void* cuda_get_kernel(void* context, void* module, const char* name)
 {
     ContextGuard guard(context);
@@ -2796,15 +3144,21 @@ void* cuda_get_kernel(void* context, void* module, const char* name)
     }
 
     g_kernel_names[kernel] = name;
-
     return kernel;
 }
 
-size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args, void* stream)
+size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream)
 {
     ContextGuard guard(context);
 
-
+    if (block_dim <= 0)
+    {
+#if defined(_DEBUG)
+        fprintf(stderr, "Warp warning: Launch got block_dim %d. Setting to 256.\n", dim, block_dim);
+#endif
+        block_dim = 256;
+    }
+
     // CUDA specs up to compute capability 9.0 says the max x-dim grid is 2**31-1, so
     // grid_dim is fine as an int for the near future
     int grid_dim = (dim + block_dim - 1)/block_dim;
@@ -2835,7 +3189,8 @@ size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_block
         (CUfunction)kernel,
         grid_dim, 1, 1,
         block_dim, 1, 1,
-
+        shared_memory_bytes,
+        static_cast<CUstream>(stream),
         args,
         0);
 
@@ -2940,7 +3295,6 @@ void cuda_timing_end(timing_result_t* results, int size)
     g_cuda_timing_state = parent_state;
 }
 
-
 // impl. files
 #include "bvh.cu"
 #include "mesh.cu"
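Note that cuda_launch_kernel() now takes the block size and dynamic shared memory explicitly instead of assuming a fixed block size. A small Python mirror of the grid-sizing rule in the hunk above, for reference only (this is not a Warp API):

    def launch_dims(dim, block_dim):
        # non-positive block_dim falls back to 256, then the grid is rounded up
        # so that grid_dim * block_dim >= dim
        if block_dim <= 0:
            block_dim = 256
        grid_dim = (dim + block_dim - 1) // block_dim
        return grid_dim, block_dim

    assert launch_dims(1_000_000, 256) == (3907, 256)
    assert launch_dims(10, 0) == (1, 256)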
warp/native/warp.h
CHANGED

@@ -34,6 +34,8 @@ extern "C"
     WP_API int is_cuda_compatibility_enabled();
     // whether Warp was compiled with CUTLASS support
     WP_API int is_cutlass_enabled();
+    // whether Warp was compiled with MathDx support
+    WP_API int is_mathdx_enabled();
     // whether Warp was compiled with debug support
     WP_API int is_debug_enabled();
 
@@ -315,12 +317,16 @@ extern "C"
     WP_API bool cuda_graph_launch(void* graph, void* stream);
     WP_API bool cuda_graph_destroy(void* context, void* graph);
 
-    WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path);
+    WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes);
+    WP_API bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size);
+    WP_API bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads);
 
     WP_API void* cuda_load_module(void* context, const char* ptx);
     WP_API void cuda_unload_module(void* context, void* module);
     WP_API void* cuda_get_kernel(void* context, void* module, const char* name);
-    WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args, void* stream);
+    WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream);
+    WP_API int cuda_get_max_shared_memory(void* context);
+    WP_API bool cuda_configure_kernel_shared_memory(void* kernel, int size);
 
     WP_API void cuda_set_context_restore_policy(bool always_restore);
     WP_API int cuda_get_context_restore_policy();
@@ -336,4 +342,8 @@ extern "C"
     WP_API int cuda_timing_get_result_count();
     WP_API void cuda_timing_end(timing_result_t* results, int size);
 
+    // graph coloring
+    WP_API int graph_coloring(int num_nodes, wp::array_t<int> edges, int algorithm, wp::array_t<int> node_colors);
+    WP_API float balance_coloring(int num_nodes, wp::array_t<int> edges, int num_colors, float target_max_min_ratio, wp::array_t<int> node_colors);
+
 } // extern "C"
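The new graph_coloring()/balance_coloring() exports take an edge list and fill a per-node color array; they appear to back the coloring additions listed above (warp/native/coloring.cpp, warp/sim/graph_coloring.py). As a plain illustration of the invariant a valid coloring satisfies (nodes joined by an edge never share a color), here is a greedy sketch; it is not Warp's algorithm and the edge-list layout is assumed:

    from collections import defaultdict

    def greedy_coloring(num_nodes, edges):
        # edges: iterable of (a, b) node-index pairs; returns one color per node
        adjacency = defaultdict(set)
        for a, b in edges:
            adjacency[a].add(b)
            adjacency[b].add(a)

        colors = [-1] * num_nodes
        for node in range(num_nodes):
            used = {colors[n] for n in adjacency[node] if colors[n] >= 0}
            c = 0
            while c in used:
                c += 1
            colors[node] = c
        return colors

    edges = [(0, 1), (1, 2), (2, 3), (3, 0)]
    colors = greedy_coloring(4, edges)
    assert all(colors[a] != colors[b] for a, b in edges)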
warp/optim/adam.py
CHANGED

@@ -50,6 +50,26 @@ def adam_step_kernel_float(
     params[i] = params[i] - lr * mhat / (wp.sqrt(vhat) + eps)
 
 
+@wp.kernel
+def adam_step_kernel_half(
+    g: wp.array(dtype=wp.float16),
+    m: wp.array(dtype=float),
+    v: wp.array(dtype=float),
+    lr: float,
+    beta1: float,
+    beta2: float,
+    t: float,
+    eps: float,
+    params: wp.array(dtype=wp.float16),
+):
+    i = wp.tid()
+    m[i] = beta1 * m[i] + (1.0 - beta1) * float(g[i])
+    v[i] = beta2 * v[i] + (1.0 - beta2) * float(g[i]) * float(g[i])
+    mhat = m[i] / (1.0 - wp.pow(beta1, (t + 1.0)))
+    vhat = v[i] / (1.0 - wp.pow(beta2, (t + 1.0)))
+    params[i] = params[i] - wp.float16(lr * mhat / (wp.sqrt(vhat) + eps))
+
+
 class Adam:
     """An implementation of the Adam Optimizer
     It is designed to mimic Pytorch's version.
@@ -75,10 +95,20 @@ class Adam:
         self.v = [None] * len(params)  # reset second moment
         for i in range(len(params)):
             param = params[i]
+
+            if param.dtype == wp.vec3:
+                dtype = wp.vec3
+            elif param.dtype == wp.float32:
+                dtype = wp.float32
+            elif param.dtype == wp.float16:
+                dtype = wp.float32  # we always use fp32 for moments, even if params are fp16
+            else:
+                raise RuntimeError(f"Unsupported dtype for Warp Adam optimizer: {param.dtype}")
+
             if self.m[i] is None or self.m[i].shape != param.shape or self.m[i].dtype != param.dtype:
-                self.m[i] = wp.
+                self.m[i] = wp.zeros(shape=param.shape, dtype=dtype, device=param.device)
             if self.v[i] is None or self.v[i].shape != param.shape or self.v[i].dtype != param.dtype:
-                self.v[i] = wp.
+                self.v[i] = wp.zeros(shape=param.shape, dtype=dtype, device=param.device)
 
     def reset_internal_state(self):
         for m_i in self.m:
@@ -98,8 +128,6 @@ class Adam:
     @staticmethod
     def step_detail(g, m, v, lr, beta1, beta2, t, eps, params):
         assert params.dtype == g.dtype
-        assert params.dtype == m.dtype
-        assert params.dtype == v.dtype
         assert params.shape == g.shape
         kernel_inputs = [g, m, v, lr, beta1, beta2, t, eps, params]
         if params.dtype == wp.types.float32:
@@ -109,6 +137,13 @@ class Adam:
                 inputs=kernel_inputs,
                 device=params.device,
             )
+        elif params.dtype == wp.types.float16:
+            wp.launch(
+                kernel=adam_step_kernel_half,
+                dim=len(params),
+                inputs=kernel_inputs,
+                device=params.device,
+            )
         elif params.dtype == wp.types.vec3:
             wp.launch(
                 kernel=adam_step_kernel_vec3,