warp-lang 1.4.1-py3-none-manylinux2014_aarch64.whl → 1.5.0-py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang has been flagged as possibly problematic.

Files changed (164)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1920 -111
  8. warp/codegen.py +186 -62
  9. warp/config.py +2 -2
  10. warp/context.py +322 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/core/example_dem.py +2 -1
  17. warp/examples/core/example_mesh_intersect.py +3 -3
  18. warp/examples/fem/example_adaptive_grid.py +37 -10
  19. warp/examples/fem/example_apic_fluid.py +3 -2
  20. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  21. warp/examples/fem/example_deformed_geometry.py +1 -1
  22. warp/examples/fem/example_diffusion_3d.py +47 -4
  23. warp/examples/fem/example_distortion_energy.py +220 -0
  24. warp/examples/fem/example_magnetostatics.py +127 -85
  25. warp/examples/fem/example_nonconforming_contact.py +5 -5
  26. warp/examples/fem/example_stokes.py +3 -1
  27. warp/examples/fem/example_streamlines.py +12 -19
  28. warp/examples/fem/utils.py +38 -15
  29. warp/examples/optim/example_walker.py +2 -2
  30. warp/examples/sim/example_cloth.py +2 -25
  31. warp/examples/sim/example_jacobian_ik.py +6 -2
  32. warp/examples/sim/example_quadruped.py +2 -1
  33. warp/examples/tile/example_tile_convolution.py +58 -0
  34. warp/examples/tile/example_tile_fft.py +47 -0
  35. warp/examples/tile/example_tile_filtering.py +105 -0
  36. warp/examples/tile/example_tile_matmul.py +79 -0
  37. warp/examples/tile/example_tile_mlp.py +375 -0
  38. warp/fem/__init__.py +8 -0
  39. warp/fem/cache.py +16 -12
  40. warp/fem/dirichlet.py +1 -1
  41. warp/fem/domain.py +44 -1
  42. warp/fem/field/__init__.py +1 -2
  43. warp/fem/field/field.py +31 -19
  44. warp/fem/field/nodal_field.py +101 -49
  45. warp/fem/field/virtual.py +794 -0
  46. warp/fem/geometry/__init__.py +2 -2
  47. warp/fem/geometry/deformed_geometry.py +3 -105
  48. warp/fem/geometry/element.py +13 -0
  49. warp/fem/geometry/geometry.py +165 -5
  50. warp/fem/geometry/grid_2d.py +3 -6
  51. warp/fem/geometry/grid_3d.py +31 -28
  52. warp/fem/geometry/hexmesh.py +3 -46
  53. warp/fem/geometry/nanogrid.py +3 -2
  54. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  55. warp/fem/geometry/tetmesh.py +2 -43
  56. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  57. warp/fem/integrate.py +683 -261
  58. warp/fem/linalg.py +404 -0
  59. warp/fem/operator.py +101 -18
  60. warp/fem/polynomial.py +5 -5
  61. warp/fem/quadrature/quadrature.py +45 -21
  62. warp/fem/space/__init__.py +45 -11
  63. warp/fem/space/basis_function_space.py +451 -0
  64. warp/fem/space/basis_space.py +58 -11
  65. warp/fem/space/function_space.py +146 -5
  66. warp/fem/space/grid_2d_function_space.py +80 -66
  67. warp/fem/space/grid_3d_function_space.py +113 -68
  68. warp/fem/space/hexmesh_function_space.py +96 -108
  69. warp/fem/space/nanogrid_function_space.py +62 -110
  70. warp/fem/space/quadmesh_function_space.py +208 -0
  71. warp/fem/space/shape/__init__.py +45 -7
  72. warp/fem/space/shape/cube_shape_function.py +328 -54
  73. warp/fem/space/shape/shape_function.py +10 -1
  74. warp/fem/space/shape/square_shape_function.py +328 -60
  75. warp/fem/space/shape/tet_shape_function.py +269 -19
  76. warp/fem/space/shape/triangle_shape_function.py +238 -19
  77. warp/fem/space/tetmesh_function_space.py +69 -37
  78. warp/fem/space/topology.py +38 -0
  79. warp/fem/space/trimesh_function_space.py +179 -0
  80. warp/fem/utils.py +6 -331
  81. warp/jax_experimental.py +3 -1
  82. warp/native/array.h +55 -40
  83. warp/native/builtin.h +124 -43
  84. warp/native/bvh.h +4 -0
  85. warp/native/coloring.cpp +600 -0
  86. warp/native/cuda_util.cpp +14 -0
  87. warp/native/cuda_util.h +2 -1
  88. warp/native/fabric.h +8 -0
  89. warp/native/hashgrid.h +4 -0
  90. warp/native/marching.cu +8 -0
  91. warp/native/mat.h +14 -3
  92. warp/native/mathdx.cpp +59 -0
  93. warp/native/mesh.h +4 -0
  94. warp/native/range.h +13 -1
  95. warp/native/reduce.cpp +9 -1
  96. warp/native/reduce.cu +7 -0
  97. warp/native/runlength_encode.cpp +9 -1
  98. warp/native/runlength_encode.cu +7 -1
  99. warp/native/scan.cpp +8 -0
  100. warp/native/scan.cu +8 -0
  101. warp/native/scan.h +8 -1
  102. warp/native/sparse.cpp +8 -0
  103. warp/native/sparse.cu +8 -0
  104. warp/native/temp_buffer.h +7 -0
  105. warp/native/tile.h +1857 -0
  106. warp/native/tile_gemm.h +341 -0
  107. warp/native/tile_reduce.h +210 -0
  108. warp/native/volume_builder.cu +8 -0
  109. warp/native/volume_builder.h +8 -0
  110. warp/native/warp.cpp +10 -2
  111. warp/native/warp.cu +369 -15
  112. warp/native/warp.h +12 -2
  113. warp/optim/adam.py +39 -4
  114. warp/paddle.py +29 -12
  115. warp/render/render_opengl.py +137 -65
  116. warp/sim/graph_coloring.py +292 -0
  117. warp/sim/integrator_euler.py +4 -2
  118. warp/sim/integrator_featherstone.py +115 -44
  119. warp/sim/integrator_vbd.py +6 -0
  120. warp/sim/model.py +90 -17
  121. warp/stubs.py +651 -85
  122. warp/tape.py +12 -7
  123. warp/tests/assets/pixel.npy +0 -0
  124. warp/tests/aux_test_instancing_gc.py +18 -0
  125. warp/tests/test_array.py +207 -48
  126. warp/tests/test_closest_point_edge_edge.py +8 -8
  127. warp/tests/test_codegen.py +120 -1
  128. warp/tests/test_codegen_instancing.py +30 -0
  129. warp/tests/test_collision.py +110 -0
  130. warp/tests/test_coloring.py +241 -0
  131. warp/tests/test_context.py +34 -0
  132. warp/tests/test_examples.py +18 -4
  133. warp/tests/test_fabricarray.py +33 -0
  134. warp/tests/test_fem.py +453 -113
  135. warp/tests/test_func.py +48 -1
  136. warp/tests/test_generics.py +52 -0
  137. warp/tests/test_iter.py +68 -0
  138. warp/tests/test_mat_scalar_ops.py +1 -1
  139. warp/tests/test_mesh_query_point.py +5 -4
  140. warp/tests/test_module_hashing.py +23 -0
  141. warp/tests/test_paddle.py +27 -87
  142. warp/tests/test_print.py +191 -1
  143. warp/tests/test_spatial.py +1 -1
  144. warp/tests/test_tile.py +700 -0
  145. warp/tests/test_tile_mathdx.py +144 -0
  146. warp/tests/test_tile_mlp.py +383 -0
  147. warp/tests/test_tile_reduce.py +374 -0
  148. warp/tests/test_tile_shared_memory.py +190 -0
  149. warp/tests/test_vbd.py +12 -20
  150. warp/tests/test_volume.py +43 -0
  151. warp/tests/unittest_suites.py +23 -2
  152. warp/tests/unittest_utils.py +4 -0
  153. warp/types.py +339 -73
  154. warp/utils.py +22 -1
  155. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  156. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/RECORD +159 -132
  157. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  158. warp/fem/field/test.py +0 -180
  159. warp/fem/field/trial.py +0 -183
  160. warp/fem/space/collocated_function_space.py +0 -102
  161. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  162. warp/fem/space/trimesh_2d_function_space.py +0 -153
  163. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  164. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
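
The dominant theme of this release is the new tile programming model: the warp/native/tile.h, tile_gemm.h, and tile_reduce.h headers below back a set of cooperative, block-wide primitives, with Python-side demos under warp/examples/tile/. As a minimal sketch of how the model surfaces in Python — the wp.tile_load/wp.tile_store/wp.launch_tiled signatures are assumed from the new example files and may differ in detail:

import warp as wp

TILE_M, TILE_N = 8, 4    # tile shape (compile-time constants)
TILE_THREADS = 64        # threads cooperating on each tile

@wp.kernel
def tile_copy(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
    # one block of TILE_THREADS threads per output tile
    i, j = wp.tid()
    t = wp.tile_load(a, i, j, m=TILE_M, n=TILE_N)  # cooperative load
    wp.tile_store(b, i, j, t)                      # cooperative store

wp.init()
a = wp.ones((32, 32), dtype=float)
b = wp.zeros((32, 32), dtype=float)
wp.launch_tiled(tile_copy, dim=[32 // TILE_M, 32 // TILE_N], inputs=[a, b], block_dim=TILE_THREADS)

The first of the new native headers is shown below.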
warp/native/tile_gemm.h ADDED
@@ -0,0 +1,341 @@
+ /** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+  * NVIDIA CORPORATION and its licensors retain all intellectual property
+  * and proprietary rights in and to this software, related documentation
+  * and any modifications thereto. Any use, reproduction, disclosure or
+  * distribution of this software and related documentation without an express
+  * license agreement from NVIDIA CORPORATION is strictly prohibited.
+  */
+
+ #pragma once
+
+ #include "builtin.h"
+
+ #define USE_CUTE 0
+
+ #if USE_CUTE
+ #include "cutlass/include/cute/tensor.hpp"
+ #include "cutlass/include/cute/algorithm/cooperative_gemm.hpp"
+ #endif // USE_CUTE
+
+ namespace wp
+ {
+
+ /*
+ // 2D tile zero
+ template <typename T, int M, int N, int Index>
+ inline CUDA_CALLABLE array_t<T> tile_zeros()
+ {
+     const int length = M*N;
+
+     WP_TILE_SHARED __align__(16) T data[length];
+
+     WP_PRAGMA_UNROLL
+     for (int t=threadIdx.x; t < length; t += blockDim.x)
+     {
+         data[t] = T(0.0);
+     }
+
+     return array_t<T>(data, M, N, nullptr);
+ }
+
+ // 2D tile load
+ template <typename T, int M, int N, int Index>
+ inline CUDA_CALLABLE array_t<T> tile_load(const array_t<T>& src, int i, int j)
+ {
+     const int length = M*N;
+
+     WP_TILE_SHARED __align__(16) T data[length];
+
+     //---------------
+     // naive-synchronous load
+     //
+     // WP_PRAGMA_UNROLL
+     // for (int t=threadIdx.x; t < length; t += blockDim.x)
+     // {
+     //     data[t] = index(src, i*M + t/N, j*N + t%N);
+     // }
+
+     //---------------
+     // async 128 bit loads (assumes row-major i.e.: stride 1 on y axis and 4-element alignment on dimension)
+     const int s = 4;
+
+     WP_PRAGMA_UNROLL
+     for (int t=threadIdx.x*s; t < length; t += blockDim.x*s)
+     {
+         __pipeline_memcpy_async(&data[t],
+                                 &index(src, i*M + t/N, j*N + t%N),
+                                 sizeof(T)*s);
+     }
+
+     __pipeline_commit();
+
+     return array_t<T>(data, M, N, nullptr);
+ }
+
+ // 2D tile store
+ template <typename T>
+ inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int i, int j, const array_t<T>& src)
+ {
+     const int M = src.shape[0];
+     const int N = src.shape[1];
+
+     const int length = M*N;
+
+     // cooperatively store the tile, using a block-stride iterator
+     WP_PRAGMA_UNROLL
+     for (int t=threadIdx.x; t < length; t += blockDim.x)
+     {
+         index(dest, i*M + t/N, j*N + t%N) = src.data[t];
+     }
+ }
+ */
+
+ template <typename T>
+ inline CUDA_CALLABLE const T& index(const T* __restrict__ p, int i, int j, int stride)
+ {
+     return p[i*stride + j];
+ }
+
+ template <typename T>
+ inline CUDA_CALLABLE T& index(T* __restrict__ p, int i, int j, int stride)
+ {
+     return p[i*stride + j];
+ }
+
+ template <unsigned M, unsigned N, typename T>
+ struct partition_t
+ {
+     inline partition_t(array_t<T> A)
+     {
+         data = A;
+
+         // todo: do ceil div for non-multiples of M,N
+         shape[0] = A.shape[0]/M;
+         shape[1] = A.shape[1]/N;
+     }
+
+     // underlying data
+     array_t<T> data;
+
+     // partition dimensions
+     int shape[2];
+ };
+
+ template <unsigned M, unsigned N, typename T>
+ inline int partition_size(const partition_t<M, N, T>& tile)
+ {
+     return tile.shape[0]*tile.shape[1];
+ }
+
+ // returns the x, y coordinates of a tile given a linear index
+ template <unsigned M, unsigned N, typename T>
+ inline void partition_coord(const partition_t<M, N, T>& tile, const int t, int& i, int& j)
+ {
+     i = t/tile.shape[1];
+     j = t%tile.shape[1];
+ }
+
+ template <unsigned M, unsigned N, typename T>
+ inline mat_t<M, N, T> partition_load(const partition_t<M, N, T>& tile, int i, int j)
+ {
+     mat_t<M, N, T> out;
+
+     const int tile_i = i*M;
+     const int tile_j = j*N;
+
+     WP_PRAGMA_UNROLL
+     for (int i=0; i < M; ++i)
+     {
+         WP_PRAGMA_UNROLL
+         for (int j=0; j < N; ++j)
+         {
+             out.data[i][j] = index(tile.data, tile_i + i, tile_j + j);
+         }
+     }
+
+     return out;
+ }
+
+ template <unsigned M, unsigned N, typename T>
+ inline void partition_store(const partition_t<M, N, T>& tile, int i, int j, const mat_t<M, N, T>& value)
+ {
+     mat_t<M, N, T> out;
+
+     const int tile_i = M*i;
+     const int tile_j = N*j;
+
+     WP_PRAGMA_UNROLL
+     for (int i=0; i < M; ++i)
+     {
+         WP_PRAGMA_UNROLL
+         for (int j=0; j < N; ++j)
+         {
+             index(tile.data, tile_i + i, tile_j + j) = value.data[i][j];
+         }
+     }
+ }
+
+ #if !USE_CUTE
+
+ template <typename T>
+ inline CUDA_CALLABLE void gemm(const array_t<T>& A, const array_t<T>& B, const array_t<T>& out)
+ {
+     const int TILE_M = 4;
+     const int TILE_N = 4;
+     const int TILE_K = 4;
+
+     partition_t A_tile = partition_t<TILE_M, TILE_K, T>(A);
+     partition_t B_tile = partition_t<TILE_K, TILE_N, T>(B);
+     partition_t C_tile = partition_t<TILE_M, TILE_N, T>(out);
+
+     const int length = partition_size(C_tile);
+
+     __pipeline_wait_prior(0);
+
+     WP_TILE_SYNC();
+
+     for (int t=threadIdx.x; t < length; t += blockDim.x)
+     {
+         int i, j;
+         partition_coord(C_tile, t, i, j);
+
+         // accumulator
+         mat_t<TILE_M, TILE_N, T> sum = partition_load(C_tile, i, j);
+
+         WP_PRAGMA_UNROLL
+         for (int k=0; k < A_tile.shape[1]; k++)
+         {
+             const mat_t<TILE_M, TILE_K, T> a = partition_load(A_tile, i, k);
+             const mat_t<TILE_K, TILE_N, T> b = partition_load(B_tile, k, j);
+
+             sum += mul(a, b);
+         }
+
+         partition_store(C_tile, i, j, sum);
+     }
+
+     WP_TILE_SYNC();
+ }
+
+ // 2D gemm accumulate out += A*B
+ template <typename TileA, typename TileB, typename TileC>
+ inline CUDA_CALLABLE void tile_matmul_scalar(const TileA& A,
+                                              const TileB& B,
+                                              TileC& out)
+ {
+     const int length = tile_size(out);
+
+     WP_TILE_SYNC();
+
+     using T = typename TileA::Type;
+
+     WP_PRAGMA_UNROLL
+     for (int t=threadIdx.x; t < length; t += WP_TILE_BLOCK_DIM)
+     {
+         // compute output index
+         const int i = t/out.N;
+         const int j = t%out.N;
+
+         T sum(0.0);
+
+         WP_PRAGMA_UNROLL
+         for (int k=0; k < A.N; ++k)
+         {
+             T a = A(i,k);
+             T b = B(k,j);
+
+             sum += a*b; // todo: use fmaf()
+         }
+
+         out(i,j) += sum;
+     }
+
+     WP_TILE_SYNC();
+ }
+
+ #else
+
+ template <typename T>
+ inline CUDA_CALLABLE void tile_matmul(const array_t<T>& A, const array_t<T>& B, const array_t<T>& out)
+ {
+     using namespace cute;
+
+     __pipeline_wait_prior(0);
+
+     // ensure smem tile is ready
+     WP_TILE_SYNC();
+
+     // Define CTA matrix size (static)
+     auto bM = Int<64>{};
+     auto bN = Int<64>{};
+     auto bK = Int<8>{};
+
+     // Define the smem layouts (static)
+     auto sA = make_layout(make_shape(bM, bK), LayoutRight{});
+     auto sB = make_layout(make_shape(bN, bK));
+     auto sC = make_layout(make_shape(bM, bN), LayoutRight{});
+
+     Tensor s_a_tensor = make_tensor(make_smem_ptr<float>(A.data), sA);
+     Tensor s_b_tensor = make_tensor(make_smem_ptr<float>(B.data), sB);
+     Tensor s_c_tensor = make_tensor(make_smem_ptr<float>(out.data), sC);
+
+     // TiledMMA tiled_mma = make_tiled_mma(UniversalFMA<float,float,float>{},
+     //                                     Layout<Shape<_16,_8,_1>>{}); // 16x8x1 UniversalFMA, assumes blockDim=128
+
+     // TiledMMA tiled_mma = make_tiled_mma(UniversalFMA<float,float,float>{},
+     //                                     Layout<Shape<_8,_16>,Stride<_16,_1>>{}); // 8x16x1 UniversalFMA, assumes blockDim=128
+
+     TiledMMA tiled_mma = make_tiled_mma(UniversalFMA<float,float,float>{},
+                                         Layout<Shape<_2,_64>,Stride<_64,_1>>{}); // 8x16x1 UniversalFMA, assumes blockDim=128
+
+     cooperative_gemm<AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<float>>,
+                      AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<float>>,
+                      AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<float>>
+     >(
+         threadIdx.x, tiled_mma,
+         1.0f, s_a_tensor, s_b_tensor, 1.0f, s_c_tensor,
+         cute::identity(), cute::identity(), cute::identity(), cute::identity()
+     );
+
+     WP_TILE_SYNC();
+ }
+
+ #endif // USE_CUTE
+
+ #if 0
+
+ template <typename TileA, typename TileB, typename TileC>
+ void tile_matmul(TileA& a, TileB& b, TileC& c)
+ {
+     static_assert(wp::is_same<typename TileA::Type, typename TileB::Type>::value, "Error, tile datatypes must match");
+     static_assert(TileA::N == TileB::M, "Error, inner dimensions must match");
+     static_assert(TileC::M == TileA::M, "Error, first output dimension must match");
+     static_assert(TileC::N == TileB::N, "Error, second output dimension must match");
+
+     tile_matmul_scalar(a, b, c);
+ }
+
+ template <typename TileA, typename TileB, typename TileC,
+           typename AdjTileA, typename AdjTileB, typename AdjTileC>
+ void adj_tile_matmul(TileA& a, TileB& b, TileC& c,
+                      AdjTileA& adj_a, AdjTileB& adj_b, AdjTileC& adj_c)
+ {
+     tile_matmul_scalar(adj_c, wp::tile_transpose(b), adj_a);
+     tile_matmul_scalar(wp::tile_transpose(a), adj_c, adj_b);
+ }
+
+ #endif // 0
+
+ } // namespace wp
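
tile_gemm.h implements the cooperative GEMM that backs the new wp.tile_matmul() builtin: each block owns one output tile and iterates over K-tiles, multiplying per-thread sub-matrices and accumulating into the output (tile_matmul_scalar above). A hedged Python-level sketch, adapted from the new warp/examples/tile/example_tile_matmul.py — exact signatures may differ from this release:

import warp as wp

TILE_M, TILE_N, TILE_K = 8, 4, 8
TILE_THREADS = 64

@wp.kernel
def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
    i, j = wp.tid()                                       # output tile coordinates
    sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
    count = int(A.shape[1] / TILE_K)                      # number of K-tiles
    for k in range(count):
        a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
        b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
        wp.tile_matmul(a, b, sum)                         # sum += a*b, block-cooperative
    wp.tile_store(C, i, j, sum)

wp.init()
M, N, K = 64, 64, 64
A = wp.ones((M, K), dtype=float)
B = wp.ones((K, N), dtype=float)
C = wp.zeros((M, N), dtype=float)
wp.launch_tiled(tile_gemm, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, C], block_dim=TILE_THREADS)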
warp/native/tile_reduce.h ADDED
@@ -0,0 +1,210 @@
+ /** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+  * NVIDIA CORPORATION and its licensors retain all intellectual property
+  * and proprietary rights in and to this software, related documentation
+  * and any modifications thereto. Any use, reproduction, disclosure or
+  * distribution of this software and related documentation without an express
+  * license agreement from NVIDIA CORPORATION is strictly prohibited.
+  */
+
+ #pragma once
+
+ #include "tile.h"
+
+ #define WP_TILE_WARP_SIZE 32
+
+ namespace wp
+ {
+
+ template <typename T>
+ inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset, int mask)
+ {
+     typedef unsigned int Word;
+
+     union
+     {
+         T output;
+         Word output_storage;
+     };
+
+     union
+     {
+         T input;
+         Word input_storage;
+     };
+
+     input = val;
+
+     Word* dest = reinterpret_cast<Word*>(&output);
+     Word* src = reinterpret_cast<Word*>(&input);
+
+     unsigned int shuffle_word;
+
+     constexpr int word_count = (sizeof(T) + sizeof(Word) - 1) / sizeof(Word);
+
+     WP_PRAGMA_UNROLL
+     for (int i=0; i < word_count; ++i)
+     {
+         shuffle_word = __shfl_down_sync(mask, src[i], offset, WP_TILE_WARP_SIZE);
+         dest[i] = shuffle_word;
+     }
+
+     return output;
+ }
+
+ template <typename T, typename Op>
+ inline CUDA_CALLABLE T warp_reduce(T val, Op f, unsigned int mask)
+ {
+     T sum = val;
+
+     if (mask == 0xFFFFFFFF)
+     {
+         // handle case where entire warp is active
+         for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2)
+         {
+             sum = f(sum, warp_shuffle_down(sum, offset, mask));
+         }
+     }
+     else
+     {
+         // handle partial warp case
+         for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2)
+         {
+             T shfl_val = warp_shuffle_down(sum, offset, mask);
+             if ((mask & (1 << ((threadIdx.x + offset)%WP_TILE_WARP_SIZE))) != 0)
+                 sum = f(sum, shfl_val);
+         }
+     }
+
+     return sum;
+ }
+
+ // non-axis version which computes sum
+ // across the entire tile using the whole block
+ template <typename Tile, typename Op>
+ auto tile_reduce_impl(Op f, Tile& t)
+ {
+     using T = typename Tile::Type;
+
+     auto input = t.copy_to_register();
+     auto output = tile_register_t<T, 1, 1>();
+
+     const int warp_count = (WP_TILE_BLOCK_DIM + WP_TILE_WARP_SIZE - 1)/WP_TILE_WARP_SIZE;
+     const int warp_index = threadIdx.x/WP_TILE_WARP_SIZE;
+     const int lane_index = threadIdx.x%WP_TILE_WARP_SIZE;
+
+     T thread_sum = input.data[0];
+
+     // thread reduction
+     WP_PRAGMA_UNROLL
+     for (int i=1; i < input.NumRegs; ++i)
+     {
+         int linear = t.index(i);
+         if (!Tile::Aligned && linear >= Tile::Size)
+             break;
+
+         thread_sum = f(thread_sum, input.data[i]);
+     }
+
+     // ensure that only threads with at least one valid item participate in the reduction
+     unsigned int mask = __ballot_sync(__activemask(), t.index(0) < Tile::Size);
+
+     // warp reduction
+     T warp_sum = warp_reduce(thread_sum, f, mask);
+
+     // fixed size scratch pad for partial results in shared memory
+     WP_TILE_SHARED T partials[warp_count];
+
+     // count of active warps
+     WP_TILE_SHARED int active_warps;
+     if (threadIdx.x == 0)
+         active_warps = 0;
+
+     // ensure active_warps is initialized
+     WP_TILE_SYNC();
+
+     if (lane_index == 0)
+     {
+         partials[warp_index] = warp_sum;
+         atomicAdd(&active_warps, 1);
+     }
+
+     // ensure partials are ready
+     WP_TILE_SYNC();
+
+     // reduce across block, todo: use warp_reduce() here
+     if (threadIdx.x == 0)
+     {
+         T block_sum = partials[0];
+
+         WP_PRAGMA_UNROLL
+         for (int i=1; i < active_warps; ++i)
+             block_sum = f(block_sum, partials[i]);
+
+         output.data[0] = block_sum;
+     }
+
+     return output;
+ }
+
+ void adj_tile_reduce_impl()
+ {
+     // todo: general purpose reduction gradients not implemented
+ }
+
+ // entry point for Python code-gen, wraps op in a lambda to perform overload resolution
+ #define tile_reduce(op, t) tile_reduce_impl([](auto x, auto y) { return op(x, y);}, t)
+ #define adj_tile_reduce(op, a, adj_op, adj_a, adj_ret) adj_tile_reduce_impl()
+
+ // convenience methods for specific reductions
+
+ template <typename Tile>
+ auto tile_sum(Tile& t)
+ {
+     return tile_reduce(add, t);
+ }
+
+ // special case adjoint for summation
+ template <typename Tile, typename AdjTile>
+ void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret)
+ {
+     using T = typename Tile::Type;
+
+     // broadcast incoming adjoint to block
+     WP_TILE_SHARED T scratch;
+     if (threadIdx.x == 0)
+         scratch = adj_ret.data[0];
+
+     WP_TILE_SYNC();
+
+     // broadcast scalar across input dimensions (note zero strides)
+     auto adj_ret_reg = tile_shared_t<T, Tile::M, Tile::N, 0, 0>(&scratch, NULL).copy_to_register();
+     adj_t.grad_add(adj_ret_reg);
+ }
+
+ template <typename Tile>
+ auto tile_max(Tile& t)
+ {
+     return tile_reduce(max, t);
+ }
+
+ template <typename Tile, typename AdjTile>
+ void adj_tile_max(Tile& t, Tile& adj_t, AdjTile& adj_ret)
+ {
+     // todo: not implemented
+ }
+
+ template <typename Tile>
+ auto tile_min(Tile& t)
+ {
+     return tile_reduce(min, t);
+ }
+
+ template <typename Tile, typename AdjTile>
+ void adj_tile_min(Tile& t, Tile& adj_t, AdjTile& adj_ret)
+ {
+     // todo: not implemented
+ }
+
+ } // namespace wp
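
tile_reduce.h implements a three-level reduction — per-thread accumulation over register fragments, a warp-level __shfl_down_sync() combine, then a shared-memory pass over per-warp partials — surfaced in Python as wp.tile_sum(), wp.tile_max(), and wp.tile_min(). A hedged sketch of the Python side, with signatures assumed from the new tile examples in this release:

import warp as wp

TILE_SIZE = 256     # row length, loaded as one tile
TILE_THREADS = 64   # threads cooperating on each reduction

@wp.kernel
def row_sum(a: wp.array2d(dtype=float), out: wp.array(dtype=float)):
    i = wp.tid()                            # one block per row
    t = wp.tile_load(a[i], 0, TILE_SIZE)    # cooperative load of row i
    s = wp.tile_sum(t)                      # block-wide reduction to a 1x1 tile
    wp.tile_store(out, i, s)

wp.init()
a = wp.ones((128, TILE_SIZE), dtype=float)
out = wp.zeros(128, dtype=float)
wp.launch_tiled(row_sum, dim=[128], inputs=[a, out], block_dim=TILE_THREADS)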
warp/native/volume_builder.cu CHANGED
@@ -1,3 +1,11 @@
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+  * NVIDIA CORPORATION and its licensors retain all intellectual property
+  * and proprietary rights in and to this software, related documentation
+  * and any modifications thereto. Any use, reproduction, disclosure or
+  * distribution of this software and related documentation without an express
+  * license agreement from NVIDIA CORPORATION is strictly prohibited.
+  */
+
  #include "volume_builder.h"

  #include <nanovdb/tools/cuda/PointsToGrid.cuh>
warp/native/volume_builder.h CHANGED
@@ -1,3 +1,11 @@
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+  * NVIDIA CORPORATION and its licensors retain all intellectual property
+  * and proprietary rights in and to this software, related documentation
+  * and any modifications thereto. Any use, reproduction, disclosure or
+  * distribution of this software and related documentation without an express
+  * license agreement from NVIDIA CORPORATION is strictly prohibited.
+  */
+
  #pragma once

  #include <nanovdb/NanoVDB.h>
warp/native/warp.cpp CHANGED
@@ -147,6 +147,11 @@ int is_cutlass_enabled()
      return int(WP_ENABLE_CUTLASS);
  }

+ int is_mathdx_enabled()
+ {
+     return int(WP_ENABLE_MATHDX);
+ }
+
  int is_debug_enabled()
  {
      return int(WP_ENABLE_DEBUG);
@@ -1033,12 +1038,15 @@ WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret
  WP_API bool cuda_graph_launch(void* graph, void* stream) { return false; }
  WP_API bool cuda_graph_destroy(void* context, void* graph) { return false; }

- WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_file) { return 0; }
+ WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { return 0; }

  WP_API void* cuda_load_module(void* context, const char* ptx) { return NULL; }
  WP_API void cuda_unload_module(void* context, void* module) {}
  WP_API void* cuda_get_kernel(void* context, void* module, const char* name) { return NULL; }
- WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args, void* stream) { return 0; }
+ WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream) { return 0; }
+
+ WP_API int cuda_get_max_shared_memory(void* context) { return 0; }
+ WP_API bool cuda_configure_kernel_shared_memory(void* kernel, int size) { return false; }

  WP_API void cuda_set_context_restore_policy(bool always_restore) {}
  WP_API int cuda_get_context_restore_policy() { return false; }
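
Note the extended launch ABI in these CPU stubs: cuda_launch_kernel() now takes an explicit block_dim and shared_memory_bytes, and the new cuda_get_max_shared_memory()/cuda_configure_kernel_shared_memory() entry points let tile kernels size their shared-memory scratch. On the Python side this appears to surface as a per-launch block dimension; a minimal sketch, assuming wp.launch() accepts block_dim in this release:

import warp as wp

@wp.kernel
def saxpy(x: wp.array(dtype=float), y: wp.array(dtype=float), alpha: float):
    i = wp.tid()
    y[i] = alpha * x[i] + y[i]

wp.init()
x = wp.ones(1024, dtype=float)
y = wp.zeros(1024, dtype=float)

# block_dim is forwarded to the new cuda_launch_kernel() parameter
wp.launch(saxpy, dim=1024, inputs=[x, y, 2.0], block_dim=128)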