warp-lang 1.1.0-py3-none-manylinux2014_aarch64.whl → 1.2.1-py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (218)
  1. warp/bin/warp-clang.so +0 -0
  2. warp/bin/warp.so +0 -0
  3. warp/build.py +10 -37
  4. warp/build_dll.py +2 -2
  5. warp/builtins.py +274 -6
  6. warp/codegen.py +51 -4
  7. warp/config.py +2 -2
  8. warp/constants.py +4 -0
  9. warp/context.py +422 -203
  10. warp/examples/benchmarks/benchmark_api.py +0 -2
  11. warp/examples/benchmarks/benchmark_cloth_warp.py +0 -1
  12. warp/examples/benchmarks/benchmark_launches.py +0 -2
  13. warp/examples/core/example_dem.py +0 -2
  14. warp/examples/core/example_fluid.py +0 -2
  15. warp/examples/core/example_graph_capture.py +0 -2
  16. warp/examples/core/example_marching_cubes.py +0 -2
  17. warp/examples/core/example_mesh.py +0 -2
  18. warp/examples/core/example_mesh_intersect.py +0 -2
  19. warp/examples/core/example_nvdb.py +0 -2
  20. warp/examples/core/example_raycast.py +0 -2
  21. warp/examples/core/example_raymarch.py +0 -2
  22. warp/examples/core/example_render_opengl.py +0 -2
  23. warp/examples/core/example_sph.py +0 -2
  24. warp/examples/core/example_torch.py +0 -3
  25. warp/examples/core/example_wave.py +0 -2
  26. warp/examples/fem/example_apic_fluid.py +140 -115
  27. warp/examples/fem/example_burgers.py +262 -0
  28. warp/examples/fem/example_convection_diffusion.py +0 -2
  29. warp/examples/fem/example_convection_diffusion_dg.py +0 -2
  30. warp/examples/fem/example_deformed_geometry.py +0 -2
  31. warp/examples/fem/example_diffusion.py +0 -2
  32. warp/examples/fem/example_diffusion_3d.py +5 -4
  33. warp/examples/fem/example_diffusion_mgpu.py +0 -2
  34. warp/examples/fem/example_mixed_elasticity.py +0 -2
  35. warp/examples/fem/example_navier_stokes.py +0 -2
  36. warp/examples/fem/example_stokes.py +0 -2
  37. warp/examples/fem/example_stokes_transfer.py +0 -2
  38. warp/examples/optim/example_bounce.py +0 -2
  39. warp/examples/optim/example_cloth_throw.py +0 -2
  40. warp/examples/optim/example_diffray.py +0 -2
  41. warp/examples/optim/example_drone.py +0 -2
  42. warp/examples/optim/example_inverse_kinematics.py +0 -2
  43. warp/examples/optim/example_inverse_kinematics_torch.py +0 -2
  44. warp/examples/optim/example_spring_cage.py +0 -2
  45. warp/examples/optim/example_trajectory.py +0 -2
  46. warp/examples/optim/example_walker.py +0 -2
  47. warp/examples/sim/example_cartpole.py +0 -2
  48. warp/examples/sim/example_cloth.py +0 -2
  49. warp/examples/sim/example_granular.py +0 -2
  50. warp/examples/sim/example_granular_collision_sdf.py +0 -2
  51. warp/examples/sim/example_jacobian_ik.py +0 -2
  52. warp/examples/sim/example_particle_chain.py +0 -2
  53. warp/examples/sim/example_quadruped.py +0 -2
  54. warp/examples/sim/example_rigid_chain.py +0 -2
  55. warp/examples/sim/example_rigid_contact.py +0 -2
  56. warp/examples/sim/example_rigid_force.py +0 -2
  57. warp/examples/sim/example_rigid_gyroscopic.py +0 -2
  58. warp/examples/sim/example_rigid_soft_contact.py +0 -2
  59. warp/examples/sim/example_soft_body.py +0 -2
  60. warp/fem/__init__.py +1 -0
  61. warp/fem/cache.py +3 -1
  62. warp/fem/geometry/__init__.py +1 -0
  63. warp/fem/geometry/element.py +4 -0
  64. warp/fem/geometry/grid_3d.py +0 -4
  65. warp/fem/geometry/nanogrid.py +455 -0
  66. warp/fem/integrate.py +63 -9
  67. warp/fem/space/__init__.py +43 -158
  68. warp/fem/space/basis_space.py +34 -0
  69. warp/fem/space/collocated_function_space.py +1 -1
  70. warp/fem/space/grid_2d_function_space.py +13 -132
  71. warp/fem/space/grid_3d_function_space.py +16 -154
  72. warp/fem/space/hexmesh_function_space.py +37 -134
  73. warp/fem/space/nanogrid_function_space.py +202 -0
  74. warp/fem/space/quadmesh_2d_function_space.py +12 -119
  75. warp/fem/space/restriction.py +4 -1
  76. warp/fem/space/shape/__init__.py +77 -0
  77. warp/fem/space/shape/cube_shape_function.py +5 -15
  78. warp/fem/space/tetmesh_function_space.py +6 -76
  79. warp/fem/space/trimesh_2d_function_space.py +6 -76
  80. warp/native/array.h +12 -3
  81. warp/native/builtin.h +48 -5
  82. warp/native/bvh.cpp +14 -10
  83. warp/native/bvh.cu +23 -15
  84. warp/native/bvh.h +1 -0
  85. warp/native/clang/clang.cpp +2 -1
  86. warp/native/crt.cpp +11 -1
  87. warp/native/crt.h +18 -1
  88. warp/native/exports.h +187 -0
  89. warp/native/mat.h +47 -0
  90. warp/native/mesh.cpp +1 -1
  91. warp/native/mesh.cu +1 -2
  92. warp/native/nanovdb/GridHandle.h +366 -0
  93. warp/native/nanovdb/HostBuffer.h +590 -0
  94. warp/native/nanovdb/NanoVDB.h +3999 -2157
  95. warp/native/nanovdb/PNanoVDB.h +936 -99
  96. warp/native/quat.h +28 -1
  97. warp/native/rand.h +5 -1
  98. warp/native/vec.h +45 -1
  99. warp/native/volume.cpp +335 -103
  100. warp/native/volume.cu +39 -13
  101. warp/native/volume.h +725 -303
  102. warp/native/volume_builder.cu +381 -360
  103. warp/native/volume_builder.h +16 -1
  104. warp/native/volume_impl.h +61 -0
  105. warp/native/warp.cu +8 -2
  106. warp/native/warp.h +15 -7
  107. warp/render/render_opengl.py +191 -52
  108. warp/sim/integrator_featherstone.py +10 -3
  109. warp/sim/integrator_xpbd.py +16 -22
  110. warp/sparse.py +89 -27
  111. warp/stubs.py +83 -0
  112. warp/tests/assets/test_index_grid.nvdb +0 -0
  113. warp/tests/aux_test_dependent.py +0 -2
  114. warp/tests/aux_test_grad_customs.py +0 -2
  115. warp/tests/aux_test_reference.py +0 -2
  116. warp/tests/aux_test_reference_reference.py +0 -2
  117. warp/tests/aux_test_square.py +0 -2
  118. warp/tests/disabled_kinematics.py +0 -2
  119. warp/tests/test_adam.py +0 -2
  120. warp/tests/test_arithmetic.py +0 -36
  121. warp/tests/test_array.py +9 -11
  122. warp/tests/test_array_reduce.py +0 -2
  123. warp/tests/test_async.py +0 -2
  124. warp/tests/test_atomic.py +0 -2
  125. warp/tests/test_bool.py +58 -50
  126. warp/tests/test_builtins_resolution.py +0 -2
  127. warp/tests/test_bvh.py +0 -2
  128. warp/tests/test_closest_point_edge_edge.py +0 -1
  129. warp/tests/test_codegen.py +0 -4
  130. warp/tests/test_compile_consts.py +130 -10
  131. warp/tests/test_conditional.py +0 -2
  132. warp/tests/test_copy.py +0 -2
  133. warp/tests/test_ctypes.py +6 -8
  134. warp/tests/test_dense.py +0 -2
  135. warp/tests/test_devices.py +0 -2
  136. warp/tests/test_dlpack.py +9 -11
  137. warp/tests/test_examples.py +42 -39
  138. warp/tests/test_fabricarray.py +0 -3
  139. warp/tests/test_fast_math.py +0 -2
  140. warp/tests/test_fem.py +75 -54
  141. warp/tests/test_fp16.py +0 -2
  142. warp/tests/test_func.py +0 -2
  143. warp/tests/test_generics.py +27 -2
  144. warp/tests/test_grad.py +147 -8
  145. warp/tests/test_grad_customs.py +0 -2
  146. warp/tests/test_hash_grid.py +1 -3
  147. warp/tests/test_import.py +0 -2
  148. warp/tests/test_indexedarray.py +0 -2
  149. warp/tests/test_intersect.py +0 -2
  150. warp/tests/test_jax.py +0 -2
  151. warp/tests/test_large.py +11 -9
  152. warp/tests/test_launch.py +0 -2
  153. warp/tests/test_lerp.py +10 -54
  154. warp/tests/test_linear_solvers.py +3 -5
  155. warp/tests/test_lvalue.py +0 -2
  156. warp/tests/test_marching_cubes.py +0 -2
  157. warp/tests/test_mat.py +0 -2
  158. warp/tests/test_mat_lite.py +0 -2
  159. warp/tests/test_mat_scalar_ops.py +0 -2
  160. warp/tests/test_math.py +0 -2
  161. warp/tests/test_matmul.py +35 -37
  162. warp/tests/test_matmul_lite.py +29 -31
  163. warp/tests/test_mempool.py +0 -2
  164. warp/tests/test_mesh.py +0 -3
  165. warp/tests/test_mesh_query_aabb.py +0 -2
  166. warp/tests/test_mesh_query_point.py +0 -2
  167. warp/tests/test_mesh_query_ray.py +0 -2
  168. warp/tests/test_mlp.py +0 -2
  169. warp/tests/test_model.py +0 -2
  170. warp/tests/test_module_hashing.py +111 -0
  171. warp/tests/test_modules_lite.py +0 -3
  172. warp/tests/test_multigpu.py +0 -2
  173. warp/tests/test_noise.py +0 -4
  174. warp/tests/test_operators.py +0 -2
  175. warp/tests/test_options.py +0 -2
  176. warp/tests/test_peer.py +0 -2
  177. warp/tests/test_pinned.py +0 -2
  178. warp/tests/test_print.py +0 -2
  179. warp/tests/test_quat.py +0 -2
  180. warp/tests/test_rand.py +41 -5
  181. warp/tests/test_reload.py +0 -10
  182. warp/tests/test_rounding.py +0 -2
  183. warp/tests/test_runlength_encode.py +0 -2
  184. warp/tests/test_sim_grad.py +0 -2
  185. warp/tests/test_sim_kinematics.py +0 -2
  186. warp/tests/test_smoothstep.py +0 -2
  187. warp/tests/test_snippet.py +0 -2
  188. warp/tests/test_sparse.py +0 -2
  189. warp/tests/test_spatial.py +0 -2
  190. warp/tests/test_special_values.py +362 -0
  191. warp/tests/test_streams.py +0 -2
  192. warp/tests/test_struct.py +0 -2
  193. warp/tests/test_tape.py +0 -2
  194. warp/tests/test_torch.py +0 -2
  195. warp/tests/test_transient_module.py +0 -2
  196. warp/tests/test_types.py +0 -2
  197. warp/tests/test_utils.py +0 -2
  198. warp/tests/test_vec.py +0 -2
  199. warp/tests/test_vec_lite.py +0 -2
  200. warp/tests/test_vec_scalar_ops.py +0 -2
  201. warp/tests/test_verify_fp.py +0 -2
  202. warp/tests/test_volume.py +237 -13
  203. warp/tests/test_volume_write.py +86 -3
  204. warp/tests/unittest_serial.py +10 -9
  205. warp/tests/unittest_suites.py +6 -2
  206. warp/tests/unittest_utils.py +2 -171
  207. warp/tests/unused_test_misc.py +0 -2
  208. warp/tests/walkthrough_debug.py +1 -1
  209. warp/thirdparty/unittest_parallel.py +37 -40
  210. warp/types.py +526 -85
  211. {warp_lang-1.1.0.dist-info → warp_lang-1.2.1.dist-info}/METADATA +61 -31
  212. warp_lang-1.2.1.dist-info/RECORD +359 -0
  213. warp/examples/fem/example_convection_diffusion_dg0.py +0 -204
  214. warp/native/nanovdb/PNanoVDBWrite.h +0 -295
  215. warp_lang-1.1.0.dist-info/RECORD +0 -352
  216. {warp_lang-1.1.0.dist-info → warp_lang-1.2.1.dist-info}/LICENSE.md +0 -0
  217. {warp_lang-1.1.0.dist-info → warp_lang-1.2.1.dist-info}/WHEEL +0 -0
  218. {warp_lang-1.1.0.dist-info → warp_lang-1.2.1.dist-info}/top_level.txt +0 -0
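
Most of the churn in this release is the native sparse-volume overhaul: the bundled NanoVDB headers are updated (GridHandle.h and HostBuffer.h are new, NanoVDB.h and PNanoVDB.h are heavily revised), warp/native/volume_builder.cu is rewritten around NanoVDB's PointsToGrid tool, and index-grid support is added (see the test_volume.py and test_volume_write.py changes). For orientation before the full diff of warp/native/volume_builder.cu below, here is a minimal sketch of the Python-level entry point that ultimately drives that native builder. This is a hedged example: wp.Volume.allocate_by_tiles is the public API, but the exact 1.2.1 signature and defaults may differ.

import numpy as np
import warp as wp

wp.init()

# Index-space tile origins; each NanoVDB tile covers an 8x8x8 voxel leaf.
tile_points = wp.array(
    np.array([[0, 0, 0], [8, 0, 0], [0, 0, 8]], dtype=np.int32),
    dtype=wp.vec3i,
    device="cuda:0",
)

# Allocate a sparse float volume covering those tiles; this call routes into
# the grid-construction code diffed below.
volume = wp.Volume.allocate_by_tiles(
    tile_points, voxel_size=0.1, bg_value=0.0, device="cuda:0"
)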
warp/native/volume_builder.cu
@@ -1,425 +1,446 @@
  #include "volume_builder.h"
 
+ #include <nanovdb/tools/cuda/PointsToGrid.cuh>
+
  #include <cuda.h>
  #include <cuda_runtime_api.h>
 
  #include <cub/cub.cuh>
- #include <cub/util_allocator.cuh>
-
- // Explanation of key types
- // ------------------------
- //
- // leaf_key:
- // .__.__. .... .__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.
- // 63 62 .... 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
- // XX|< tile key >|< upper offset >|< lower offset >|
- //
- // tile key (36 bit):
- //     (uint32(ijk[2]) >> ChildT::TOTAL) |
- //     (uint64_t(uint32(ijk[1]) >> ChildT::TOTAL)) << 12 |
- //     (uint64_t(uint32(ijk[0]) >> ChildT::TOTAL)) << 24
- //
- // lower_key (51 bits) == leaf_key >> 12
- //
- // upper_key (36 bits) == lower_key >> 15 == leaf_key >> 27 == tile key
-
- CUDA_CALLABLE inline uint64_t coord_to_full_key(const nanovdb::Coord& ijk)
- {
-     using Tree = nanovdb::FloatTree; // any type is fine at this point
-     assert((abs(ijk[0]) >> 24) == 0);
-     assert((abs(ijk[1]) >> 24) == 0);
-     assert((abs(ijk[2]) >> 24) == 0);
-     constexpr uint32_t MASK_12BITS = (1u << 12) - 1u;
-     const uint64_t tile_key36 =
-         ((uint32_t(ijk[2]) >> 12) & MASK_12BITS) | // z is the lower 12 bits
-         (uint64_t((uint32_t(ijk[1]) >> 12) & MASK_12BITS) << 12) | // y is the middle 12 bits
-         (uint64_t((uint32_t(ijk[0]) >> 12) & MASK_12BITS) << 24); // x is the upper 12 bits
-     const uint32_t upper_offset = Tree::Node2::CoordToOffset(ijk);
-     const uint32_t lower_offset = Tree::Node1::CoordToOffset(ijk);
-     return (tile_key36 << 27) | (upper_offset << 12) | lower_offset;
- }
 
- __global__
- void generate_keys(size_t num_points, const nanovdb::Coord* points, uint64_t* all_leaf_keys)
+ #if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+ // dynamic initialization is not supported for a function-scope static __shared__ variable within a
+ // __device__/__global__ function
+ #pragma nv_diag_suppress 20054
+ #elif defined(__NVCC__)
+ #pragma diag_suppress 20054
+ #endif
+ namespace
+ {
+ /// Allocator class following interface of cub::cachingDeviceAllocator, as expected by naovdb::PointsToGrid
+ struct Allocator
  {
-     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-     if (tid >= num_points) return;
 
-     all_leaf_keys[tid] = coord_to_full_key(points[tid]);
- }
+     cudaError_t DeviceAllocate(void **d_ptr,               ///< [out] Reference to pointer to the allocation
+                                size_t bytes,               ///< [in] Minimum number of bytes for the allocation
+                                cudaStream_t active_stream) ///< [in] The stream to be associated with this allocation
+     {
+         // in PointsToGrid stream argument always coincide with current stream, ignore
+         *d_ptr = alloc_device(WP_CURRENT_CONTEXT, bytes);
+         return cudaSuccess;
+     }
+
+     cudaError_t DeviceFree(void *d_ptr)
+     {
+         free_device(WP_CURRENT_CONTEXT, d_ptr);
+         return cudaSuccess;
+     }
+
+     cudaError_t FreeAllCached()
+     {
+         return cudaSuccess;
+     }
+ };
 
- __global__
- void generate_keys(size_t num_points, const nanovdb::Vec3f* points, uint64_t* all_leaf_keys, float one_over_voxel_size, nanovdb::Vec3f translation)
+ /// @brief Implementation of NanoVDB's DeviceBuffer that uses warp allocators
+ class DeviceBuffer
  {
-     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-     if (tid >= num_points) return;
+     uint64_t mSize; // total number of bytes managed by this buffer (assumed to be identical for host and device)
+     void *mCpuData, *mGpuData; // raw pointers to the host and device buffers
+     bool mManaged;
+
+   public:
+     /// @brief Static factory method that return an instance of this buffer
+     /// @param size byte size of buffer to be initialized
+     /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer
+     /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
+     /// @param stream optional stream argument (defaults to stream NULL)
+     /// @return An instance of this class using move semantics
+     static DeviceBuffer create(uint64_t size, const DeviceBuffer *dummy = nullptr, bool host = true,
+                                void *stream = nullptr)
+     {
+         return DeviceBuffer(size, host, stream);
+     }
 
-     const nanovdb::Coord ijk = ((points[tid] - translation) * one_over_voxel_size).round();
-     all_leaf_keys[tid] = coord_to_full_key(ijk);
- }
+     /// @brief Static factory method that return an instance of this buffer that wraps externally managed memory
+     /// @param size byte size of buffer specified by external memory
+     /// @param cpuData pointer to externally managed host memory
+     /// @param gpuData pointer to externally managed device memory
+     /// @return An instance of this class using move semantics
+     static DeviceBuffer create(uint64_t size, void *cpuData, void *gpuData)
+     {
+         return DeviceBuffer(size, cpuData, gpuData);
+     }
 
- // Convert a 36 bit tile key to the ijk origin of the addressed tile
- CUDA_CALLABLE inline nanovdb::Coord tile_key36_to_coord(uint64_t tile_key36) {
-     auto extend_sign = [](uint32_t i) -> int32_t { return i | ((i>>11 & 1) * 0xFFFFF800);};
-     constexpr uint32_t MASK_12BITS = (1u << 12) - 1u;
-     const int32_t i = extend_sign(uint32_t(tile_key36 >> 24) & MASK_12BITS);
-     const int32_t j = extend_sign(uint32_t(tile_key36 >> 12) & MASK_12BITS);
-     const int32_t k = extend_sign(uint32_t(tile_key36) & MASK_12BITS);
-     return nanovdb::Coord(i, j, k) << 12;
- }
+     /// @brief Constructor
+     /// @param size byte size of buffer to be initialized
+     /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
+     /// @param stream optional stream argument (defaults to stream NULL)
+     DeviceBuffer(uint64_t size = 0, bool host = true, void *stream = nullptr)
+         : mSize(0), mCpuData(nullptr), mGpuData(nullptr), mManaged(false)
+     {
+         if (size > 0)
+             this->init(size, host, stream);
+     }
+
+     DeviceBuffer(uint64_t size, void *cpuData, void *gpuData)
+         : mSize(size), mCpuData(cpuData), mGpuData(gpuData), mManaged(false)
+     {
+     }
 
+     /// @brief Disallow copy-construction
+     DeviceBuffer(const DeviceBuffer &) = delete;
 
- // --- CUB helpers ---
- template<uint8_t bits, typename InType, typename OutType>
- struct ShiftRight {
-     CUDA_CALLABLE inline OutType operator()(const InType& v) const {
-         return static_cast<OutType>(v >> bits);
+     /// @brief Move copy-constructor
+     DeviceBuffer(DeviceBuffer &&other) noexcept
+         : mSize(other.mSize), mCpuData(other.mCpuData), mGpuData(other.mGpuData), mManaged(other.mManaged)
+     {
+         other.mSize = 0;
+         other.mCpuData = nullptr;
+         other.mGpuData = nullptr;
+         other.mManaged = false;
      }
- };
 
- template<uint8_t bits, typename InType = uint64_t, typename OutType = uint64_t>
- struct ShiftRightIterator : public cub::TransformInputIterator<OutType, ShiftRight<bits, InType, OutType>, InType*> {
-     using BASE = cub::TransformInputIterator<OutType, ShiftRight<bits, InType, OutType>, InType*>;
-     CUDA_CALLABLE inline ShiftRightIterator(uint64_t* input_itr)
-         : BASE(input_itr, ShiftRight<bits, InType, OutType>()) {}
- };
+     /// @brief Disallow copy assignment operation
+     DeviceBuffer &operator=(const DeviceBuffer &) = delete;
 
+     /// @brief Move copy assignment operation
+     DeviceBuffer &operator=(DeviceBuffer &&other) noexcept
+     {
+         this->clear();
+         mSize = other.mSize;
+         mCpuData = other.mCpuData;
+         mGpuData = other.mGpuData;
+         mManaged = other.mManaged;
+         other.mSize = 0;
+         other.mCpuData = nullptr;
+         other.mGpuData = nullptr;
+         other.mManaged = false;
+         return *this;
+     }
 
- // --- Atomic instructions for NanoVDB construction ---
- template<typename MaskT>
- CUDA_CALLABLE_DEVICE void set_mask_atomic(MaskT& mask, uint32_t n) {
-     unsigned long long int* words = reinterpret_cast<unsigned long long int*>(&mask);
-     atomicOr(words + (n / 64), 1ull << (n & 63));
- }
+     /// @brief Destructor frees memory on both the host and device
+     ~DeviceBuffer()
+     {
+         this->clear();
+     };
+
+     /// @brief Initialize buffer
+     /// @param size byte size of buffer to be initialized
+     /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
+     /// @note All existing buffers are first cleared
+     /// @warning size is expected to be non-zero. Use clear() clear buffer!
+     void init(uint64_t size, bool host = true, void *stream = nullptr)
+     {
+         if (mSize > 0)
+             this->clear(stream);
+         NANOVDB_ASSERT(size > 0);
+         if (host)
+         {
+             mCpuData =
+                 alloc_pinned(size); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
+         }
+         else
+         {
+             mGpuData = alloc_device(WP_CURRENT_CONTEXT, size);
+         }
+         mSize = size;
+         mManaged = true;
+     }
 
- template<typename Vec3T>
- CUDA_CALLABLE_DEVICE void expand_cwise_atomic(nanovdb::BBox<Vec3T>& bbox, const Vec3T& v) {
-     atomicMin(&bbox.mCoord[0][0], v[0]);
-     atomicMin(&bbox.mCoord[0][1], v[1]);
-     atomicMin(&bbox.mCoord[0][2], v[2]);
-     atomicMax(&bbox.mCoord[1][0], v[0]);
-     atomicMax(&bbox.mCoord[1][1], v[1]);
-     atomicMax(&bbox.mCoord[1][2], v[2]);
- }
+     /// @brief Returns a raw pointer to the host/CPU buffer managed by this allocator.
+     /// @warning Note that the pointer can be NULL!
+     void *data() const
+     {
+         return mCpuData;
+     }
+
+     /// @brief Returns a raw pointer to the device/GPU buffer managed by this allocator.
+     /// @warning Note that the pointer can be NULL!
+     void *deviceData() const
+     {
+         return mGpuData;
+     }
+
+     /// @brief Returns the size in bytes of the raw memory buffer managed by this allocator.
+     uint64_t size() const
+     {
+         return mSize;
+     }
 
- template<typename RootDataType>
- __hostdev__ const typename RootDataType::Tile* find_tile(const RootDataType* root_data, const nanovdb::Coord& ijk)
+     //@{
+     /// @brief Returns true if this allocator is empty, i.e. has no allocated memory
+     bool empty() const
+     {
+         return mSize == 0;
+     }
+     bool isEmpty() const
+     {
+         return mSize == 0;
+     }
+     //@}
+
+     /// @brief Detach device data so it is not dealloced when this buffer is destroyed
+     void detachDeviceData()
+     {
+         mGpuData = nullptr;
+         if (!mCpuData)
+         {
+             mSize = 0;
+         }
+     }
+
+     /// @brief De-allocate all memory managed by this allocator and set all pointers to NULL
+     void clear(void *stream = nullptr)
+     {
+         if (mManaged && mGpuData)
+             free_device(WP_CURRENT_CONTEXT, mGpuData);
+         if (mManaged && mCpuData)
+             free_pinned(mCpuData);
+         mCpuData = mGpuData = nullptr;
+         mSize = 0;
+         mManaged = false;
+     }
+
+ }; // DeviceBuffer class
+
+ template <typename Tree> __global__ void activateAllLeafVoxels(Tree *tree)
  {
-     using Tile = typename RootDataType::Tile;
-     const Tile *tiles = reinterpret_cast<const Tile *>(root_data + 1);
-     const auto key = RootDataType::CoordToKey(ijk);
+     const unsigned leaf_count = tree->mNodeCount[0];
 
-     for (uint32_t i = 0; i < root_data->mTableSize; ++i)
+     const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+     if (tid < leaf_count)
      {
-         if (tiles[i].key == key)
-             return &tiles[i];
+         // activate all leaf voxels
+         typename Tree::LeafNodeType *const leaf_nodes = tree->getFirstLeaf();
+         typename Tree::LeafNodeType &leaf = leaf_nodes[tid];
+         leaf.mValueMask.setOn();
+         leaf.updateBBox();
+     }
+
+     if (tid == 0)
+     {
+         tree->mVoxelCount = Tree::LeafNodeType::SIZE * leaf_count; // full leaves
      }
-     return nullptr;
  }
 
- // --- Wrapper for launching lambda kernels
- template<typename Func, typename... Args>
- __global__ void kernel(const size_t num_items, Func f, Args... args)
+ template <typename Node>
+ __device__ std::enable_if_t<!nanovdb::BuildTraits<typename Node::BuildType>::is_index> setBackgroundValue(
+     Node &node, unsigned tile_id, const typename Node::BuildType background_value)
  {
-     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-     if (tid >= num_items) return;
-     f(tid, args...);
+     node.setValue(tile_id, background_value);
  }
 
- template <typename BuildT>
- void build_grid_from_tiles(nanovdb::Grid<nanovdb::NanoTree<BuildT>> *&out_grid,
-                            size_t &out_grid_size,
-                            const void *points,
-                            size_t num_points,
-                            bool points_in_world_space,
-                            const BuildGridParams<BuildT> &params)
+ template <typename Node>
+ __device__ std::enable_if_t<nanovdb::BuildTraits<typename Node::BuildType>::is_index> setBackgroundValue(
+     Node &node, unsigned tile_id, const typename Node::BuildType background_value)
  {
-     using FloatT = typename nanovdb::FloatTraits<BuildT>::FloatType;
-     const BuildT ZERO_VAL{0};
-     const FloatT ZERO_SCALAR{0};
-
-     // Don't want to access "params" in kernels
-     const double dx = params.voxel_size;
-     const double Tx = params.translation[0], Ty = params.translation[1], Tz = params.translation[2];
-     const BuildT background_value = params.background_value;
+ }
 
-     const unsigned int num_threads = 256;
-     unsigned int num_blocks;
+ template <typename Node>
+ __device__ std::enable_if_t<!nanovdb::BuildTraits<typename Node::BuildType>::is_index> setBackgroundValue(
+     Node &node, const typename Node::BuildType background_value)
+ {
+     node.mBackground = background_value;
+ }
 
-     out_grid = nullptr;
-     out_grid_size = 0;
+ template <typename Node>
+ __device__ std::enable_if_t<nanovdb::BuildTraits<typename Node::BuildType>::is_index> setBackgroundValue(
+     Node &node, const typename Node::BuildType background_value)
+ {
+ }
 
-     cub::CachingDeviceAllocator allocator;
-
-     uint64_t* leaf_keys;
-     uint64_t* lower_keys;
-     uint64_t* upper_keys;
-     uint32_t* node_counts;
-     uint32_t leaf_count, lower_node_count, upper_node_count;
+ template <typename Tree, typename NodeT>
+ __global__ void setInternalBBoxAndBackgroundValue(Tree *tree, const typename Tree::BuildType background_value)
+ {
+     using BBox = nanovdb::math::BBox<typename NodeT::CoordT>;
+     __shared__ BBox bbox;
 
-     allocator.DeviceAllocate((void**)&leaf_keys, sizeof(uint64_t) * num_points);
-     allocator.DeviceAllocate((void**)&node_counts, sizeof(uint32_t) * 3);
+     const unsigned node_count = tree->mNodeCount[NodeT::LEVEL];
+     const unsigned node_id = blockIdx.x;
 
-     // Phase 1: counting the nodes
+     if (node_id < node_count)
      {
-         // Generating keys from coords
-         uint64_t* all_leaf_keys;
-         uint64_t* all_leaf_keys_sorted;
-         allocator.DeviceAllocate((void**)&all_leaf_keys, sizeof(uint64_t) * num_points);
-         allocator.DeviceAllocate((void**)&all_leaf_keys_sorted, sizeof(uint64_t) * num_points);
-
-         num_blocks = (static_cast<unsigned int>(num_points) + num_threads - 1) / num_threads;
-         if (points_in_world_space) {
-             generate_keys<<<num_blocks, num_threads>>>(num_points, static_cast<const nanovdb::Vec3f*>(points), all_leaf_keys, static_cast<float>(1.0 / dx), nanovdb::Vec3f(params.translation));
-         } else {
-             generate_keys<<<num_blocks, num_threads>>>(num_points, static_cast<const nanovdb::Coord*>(points), all_leaf_keys);
+
+         if (threadIdx.x == 0)
+         {
+             bbox = BBox();
          }
 
-         void* d_temp_storage = nullptr;
-         size_t temp_storage_bytes;
-
-         // Sort the keys, then get an array of unique keys
-         cub::DeviceRadixSort::SortKeys(nullptr, temp_storage_bytes, all_leaf_keys, all_leaf_keys_sorted, static_cast<int>(num_points), /* begin_bit = */ 0, /* end_bit = */ 63);
-         allocator.DeviceAllocate((void**)&d_temp_storage, temp_storage_bytes);
-         cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, all_leaf_keys, all_leaf_keys_sorted, static_cast<int>(num_points), /* begin_bit = */ 0, /* end_bit = */ 63);
-         allocator.DeviceFree(d_temp_storage);
-
-         cub::DeviceSelect::Unique(nullptr, temp_storage_bytes, all_leaf_keys_sorted, leaf_keys, node_counts, static_cast<int>(num_points));
-         allocator.DeviceAllocate((void**)&d_temp_storage, temp_storage_bytes);
-         cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, all_leaf_keys_sorted, leaf_keys, node_counts, static_cast<int>(num_points));
-         allocator.DeviceFree(d_temp_storage);
-         check_cuda(cudaMemcpy(&leaf_count, node_counts, sizeof(uint32_t), cudaMemcpyDeviceToHost));
-
-         allocator.DeviceFree(all_leaf_keys);
-         all_leaf_keys = nullptr;
-         allocator.DeviceFree(all_leaf_keys_sorted);
-         all_leaf_keys_sorted = nullptr;
-
-
-         // Get the keys unique to lower nodes and the number of them
-         allocator.DeviceAllocate((void**)&lower_keys, sizeof(uint64_t) * leaf_count);
-         cub::DeviceSelect::Unique(nullptr, temp_storage_bytes, ShiftRightIterator<12>(leaf_keys), lower_keys, node_counts + 1, leaf_count);
-         allocator.DeviceAllocate((void**)&d_temp_storage, temp_storage_bytes);
-         cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, ShiftRightIterator<12>(leaf_keys), lower_keys, node_counts + 1, leaf_count);
-         allocator.DeviceFree(d_temp_storage);
-         check_cuda(cudaMemcpy(&lower_node_count, node_counts + 1, sizeof(uint32_t), cudaMemcpyDeviceToHost));
-
-         // Get the keys unique to upper nodes and the number of them
-         allocator.DeviceAllocate((void**)&upper_keys, sizeof(uint64_t) * lower_node_count);
-         cub::DeviceSelect::Unique(nullptr, temp_storage_bytes, ShiftRightIterator<15>(lower_keys), upper_keys, node_counts + 2, lower_node_count);
-         allocator.DeviceAllocate((void**)&d_temp_storage, temp_storage_bytes);
-         cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, ShiftRightIterator<15>(lower_keys), upper_keys, node_counts + 2, lower_node_count);
-         allocator.DeviceFree(d_temp_storage);
-         check_cuda(cudaMemcpy(&upper_node_count, node_counts + 2, sizeof(uint32_t), cudaMemcpyDeviceToHost));
+         __syncthreads();
+
+         NodeT &node = tree->template getFirstNode<NodeT>()[node_id];
+         for (unsigned child_id = threadIdx.x; child_id < NodeT::SIZE; child_id += blockDim.x)
+         {
+             if (node.isChild(child_id))
+             {
+                 bbox.expandAtomic(node.getChild(child_id)->bbox());
+             }
+             else
+             {
+                 setBackgroundValue(node, child_id, background_value);
+             }
+         }
+
+         __syncthreads();
+
+         if (threadIdx.x == 0)
+         {
+             node.mBBox = bbox;
+         }
      }
+ }
 
-     using Tree = nanovdb::NanoTree<BuildT>;
-     using Grid = nanovdb::Grid<Tree>;
-
-     const size_t total_bytes =
-         sizeof(Grid) +
-         sizeof(Tree) +
-         sizeof(typename Tree::RootType) +
-         sizeof(typename Tree::RootType::Tile) * upper_node_count +
-         sizeof(typename Tree::Node2) * upper_node_count +
-         sizeof(typename Tree::Node1) * lower_node_count +
-         sizeof(typename Tree::Node0) * leaf_count;
-
-     const int64_t upper_mem_offset =
-         sizeof(nanovdb::GridData) + sizeof(Tree) + sizeof(typename Tree::RootType) +
-         sizeof(typename Tree::RootType::Tile) * upper_node_count;
-     const int64_t lower_mem_offset = upper_mem_offset + sizeof(typename Tree::Node2) * upper_node_count;
-     const int64_t leaf_mem_offset = lower_mem_offset + sizeof(typename Tree::Node1) * lower_node_count;
-
-     typename Grid::DataType* grid;
-     check_cuda(cudaMalloc(&grid, total_bytes));
-
-     typename Tree::DataType* const tree = reinterpret_cast<typename Tree::DataType*>(grid + 1); // The tree is immediately after the grid
-     typename Tree::RootType::DataType* const root = reinterpret_cast<typename Tree::RootType::DataType*>(tree + 1); // The root is immediately after the tree
-     typename Tree::RootType::Tile* const tiles = reinterpret_cast<typename Tree::RootType::Tile*>(root + 1);
-     typename Tree::Node2::DataType* const upper_nodes = nanovdb::PtrAdd<typename Tree::Node2::DataType>(grid, upper_mem_offset);
-     typename Tree::Node1::DataType* const lower_nodes = nanovdb::PtrAdd<typename Tree::Node1::DataType>(grid, lower_mem_offset);
-     typename Tree::Node0::DataType* const leaf_nodes = nanovdb::PtrAdd<typename Tree::Node0::DataType>(grid, leaf_mem_offset);
-
-     // Phase 2: building the tree
+ template <typename Tree>
+ __global__ void setRootBBoxAndBackgroundValue(nanovdb::Grid<Tree> *grid,
+                                               const typename Tree::BuildType background_value)
+ {
+     using BBox = typename Tree::RootNodeType::BBoxType;
+     __shared__ BBox bbox;
+
+     Tree &tree = grid->tree();
+     const unsigned upper_count = tree.mNodeCount[2];
+
+     if (threadIdx.x == 0)
      {
-         // Setting up the tree and root node
-         kernel<<<1, 1>>>(1, [=] __device__(size_t i) {
-             tree->mNodeOffset[3] = sizeof(Tree);
-             tree->mNodeOffset[2] = tree->mNodeOffset[3] + sizeof(typename Tree::RootType) + sizeof(typename Tree::RootType::Tile) * upper_node_count;
-             tree->mNodeOffset[1] = tree->mNodeOffset[2] + sizeof(typename Tree::Node2) * upper_node_count;
-             tree->mNodeOffset[0] = tree->mNodeOffset[1] + sizeof(typename Tree::Node1) * lower_node_count;
-             tree->mNodeCount[2] = tree->mTileCount[2] = upper_node_count;
-             tree->mNodeCount[1] = tree->mTileCount[1] = lower_node_count;
-             tree->mNodeCount[0] = tree->mTileCount[0] = leaf_count;
-             tree->mVoxelCount = Tree::Node0::SIZE * leaf_count; // assuming full leaves
-
-             root->mBBox = nanovdb::CoordBBox(); // init to empty
-             root->mTableSize = upper_node_count;
-             root->mBackground = background_value;
-             root->mMinimum = ZERO_VAL;
-             root->mMaximum = ZERO_VAL;
-             root->mAverage = ZERO_SCALAR;
-             root->mStdDevi = ZERO_SCALAR;
-         });
+         bbox = BBox();
      }
 
-     // Add tiles and upper nodes
-     // i : 0 .. upper_node_count-1
-     num_blocks = (upper_node_count + num_threads - 1) / num_threads;
+     __syncthreads();
+
+     for (unsigned upper_id = threadIdx.x; upper_id < upper_count; upper_id += blockDim.x)
      {
-         kernel<<<num_blocks, num_threads>>>(upper_node_count, [=] __device__(size_t i) {
-             tiles[i].key = root->CoordToKey(tile_key36_to_coord(upper_keys[i]));
-             tiles[i].child = sizeof(typename Tree::RootType) + sizeof(typename Tree::RootType::Tile) * upper_node_count + sizeof(typename Tree::Node2) * i;
-             tiles[i].state = 0;
-             tiles[i].value = background_value;
-
-             assert(reinterpret_cast<const char*>(root->getChild(tiles + i)) == reinterpret_cast<const char*>(upper_nodes + i));
-             auto& node = upper_nodes[i];
-             node.mBBox = nanovdb::CoordBBox();
-             node.mFlags = 0;
-             node.mValueMask.setOff();
-             node.mChildMask.setOff();
-             node.mMinimum = ZERO_VAL;
-             node.mMaximum = ZERO_VAL;
-             node.mAverage = ZERO_SCALAR;
-             node.mStdDevi = ZERO_SCALAR;
-             for (size_t n = 0; n < Tree::Node2::SIZE; ++n) {
-                 node.mTable[n].value = background_value;
-             }
-         });
+         typename Tree::UpperNodeType &upper = tree.getFirstUpper()[upper_id];
+         bbox.expandAtomic(upper.bbox());
      }
 
-     constexpr uint32_t MASK_15BITS = (1u << 15) - 1u;
-     constexpr uint32_t MASK_12BITS = (1u << 12) - 1u;
+     __syncthreads();
 
-     // Init lower nodes and register to parent
-     // i : 0 .. lower_node_count-1
-     num_blocks = (lower_node_count + num_threads - 1) / num_threads;
+     if (threadIdx.x == 0)
      {
-         kernel<<<num_blocks, num_threads>>>(lower_node_count, [=] __device__(size_t i) {
-             uint32_t upper_offset = lower_keys[i] & MASK_15BITS;
-             auto* upper_node = root->getChild(find_tile(root, tile_key36_to_coord(lower_keys[i] >> 15)))->data();
-             set_mask_atomic(upper_node->mChildMask, upper_offset);
-             upper_node->setChild(upper_offset, lower_nodes + i);
-
-             auto& node = lower_nodes[i];
-             node.mBBox = nanovdb::CoordBBox();
-             node.mFlags = 0;
-             node.mValueMask.setOff();
-             node.mChildMask.setOff();
-             node.mMinimum = ZERO_VAL;
-             node.mMaximum = ZERO_VAL;
-             node.mAverage = ZERO_SCALAR;
-             node.mStdDevi = ZERO_SCALAR;
-             for (size_t n = 0; n < Tree::Node1::SIZE; ++n) {
-                 node.mTable[n].value = background_value;
-             }
-         });
+         typename Tree::RootNodeType &root = tree.root();
+         setBackgroundValue(root, background_value);
+         root.mBBox = bbox;
+
+         grid->mWorldBBox = root.mBBox.transform(grid->map());
      }
+ }
+
+ template <typename BuildT>
+ void finalize_grid(nanovdb::Grid<nanovdb::NanoTree<BuildT>> &out_grid, const BuildGridParams<BuildT> &params)
+ {
+     // set background value, activate all voxels for allocated tiles and update bbox
+
+     using Tree = nanovdb::NanoTree<BuildT>;
+     Tree *tree = &out_grid.tree();
+
+     int node_counts[3];
+     memcpy_d2h(WP_CURRENT_CONTEXT, node_counts, tree->mNodeCount, sizeof(node_counts));
+     // synchronization below is unnecessary as node_counts is in pageable memory.
+     // keep it for clarity
+     cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
+     cuda_stream_synchronize(stream);
+
+     const unsigned int leaf_count = node_counts[0];
+     const unsigned int lower_count = node_counts[1];
+     const unsigned int upper_count = node_counts[2];
+
+     constexpr unsigned NUM_THREADS = 256;
+     const unsigned leaf_blocks = (leaf_count + NUM_THREADS - 1) / NUM_THREADS;
+     activateAllLeafVoxels<Tree><<<leaf_blocks, NUM_THREADS, 0, stream>>>(tree);
+
+     setInternalBBoxAndBackgroundValue<Tree, typename Tree::LowerNodeType>
+         <<<lower_count, NUM_THREADS, 0, stream>>>(tree, params.background_value);
+     setInternalBBoxAndBackgroundValue<Tree, typename Tree::UpperNodeType>
+         <<<upper_count, NUM_THREADS, 0, stream>>>(tree, params.background_value);
+     setRootBBoxAndBackgroundValue<Tree><<<1, NUM_THREADS, 0, stream>>>(&out_grid, params.background_value);
+
+     check_cuda(cuda_context_check(WP_CURRENT_CONTEXT));
+ }
+
+ template <>
+ void finalize_grid(nanovdb::Grid<nanovdb::NanoTree<nanovdb::ValueOnIndex>> &out_grid,
+                    const BuildGridParams<nanovdb::ValueOnIndex> &params)
+ {
+     // nothing to do for OnIndex grids
+ }
+
+ /// "fancy-pointer" that transforms from world to index coordinates
+ struct WorldSpacePointsPtr
+ {
+     const nanovdb::Vec3f *points;
+     const nanovdb::Map map;
 
-     // Init leaf nodes and register to parent
-     // i : 0 .. leaf_count-1
-     num_blocks = (leaf_count + num_threads - 1) / num_threads;
+     __device__ nanovdb::Vec3f operator[](int idx) const
      {
-         kernel<<<num_blocks, num_threads>>>(leaf_count, [=] __device__(size_t i) {
-             uint32_t lower_offset = leaf_keys[i] & MASK_12BITS;
-             uint32_t upper_offset = (leaf_keys[i] >> 12) & MASK_15BITS;
-             const nanovdb::Coord ijk = tile_key36_to_coord(leaf_keys[i] >> 27);
-
-             auto* upper_node = root->getChild(find_tile(root, ijk))->data();
-             auto* lower_node = upper_node->getChild(upper_offset)->data();
-             set_mask_atomic(lower_node->mChildMask, lower_offset);
-             lower_node->setChild(lower_offset, leaf_nodes + i);
-
-             const nanovdb::Coord localUpperIjk = Tree::Node2::OffsetToLocalCoord(upper_offset) << Tree::Node1::TOTAL;
-             const nanovdb::Coord localLowerIjk = Tree::Node1::OffsetToLocalCoord(lower_offset) << Tree::Node0::TOTAL;
-             const nanovdb::Coord leafOrigin = ijk + localUpperIjk + localLowerIjk;
-
-             auto& node = leaf_nodes[i];
-             node.mBBoxMin = leafOrigin;
-             node.mBBoxDif[0] = leaf_nodes[i].mBBoxDif[1] = leaf_nodes[i].mBBoxDif[2] = Tree::Node0::DIM;
-             node.mFlags = 0;
-             node.mValueMask.setOn();
-             node.mMinimum = ZERO_VAL;
-             node.mMaximum = ZERO_VAL;
-             node.mAverage = ZERO_SCALAR;
-             node.mStdDevi = ZERO_SCALAR;
-             // mValues is undefined
-
-             // propagating bbox up:
-             expand_cwise_atomic(lower_node->mBBox, leafOrigin);
-             expand_cwise_atomic(lower_node->mBBox, leafOrigin + nanovdb::Coord(Tree::Node0::DIM));
-         });
+         return map.applyInverseMapF(points[idx]);
      }
 
-     // Propagating bounding boxes from lower nodes to upper nodes
-     // i : 0 .. lower_node_count-1
-     num_blocks = (lower_node_count + num_threads - 1) / num_threads;
+     __device__ nanovdb::Vec3f operator*() const
      {
-         kernel<<<num_blocks, num_threads>>>(lower_node_count, [=] __device__(size_t i) {
-             auto* upper_node = root->getChild(find_tile(root, tile_key36_to_coord(lower_keys[i] >> 15)))->data();
-             expand_cwise_atomic(upper_node->mBBox, lower_nodes[i].mBBox.min());
-             expand_cwise_atomic(upper_node->mBBox, lower_nodes[i].mBBox.max());
-         });
+         return (*this)[0];
      }
+ };
 
-     // Setting up root bounding box and grid
-     {
-         kernel<<<1, 1>>>(1, [=] __device__(size_t i) {
-             for (int i = 0; i < upper_node_count; ++i) {
-                 root->mBBox.expand(upper_nodes[i].mBBox.min());
-                 root->mBBox.expand(upper_nodes[i].mBBox.max());
-             }
+ } // namespace
 
-             nanovdb::Map map;
-             {
-                 const double mat[4][4] = {
-                     {dx, 0.0, 0.0, 0.0}, // row 0
-                     {0.0, dx, 0.0, 0.0}, // row 1
-                     {0.0, 0.0, dx, 0.0}, // row 2
-                     {Tx, Ty, Tz, 1.0}, // row 3
-                 };
-                 const double invMat[4][4] = {
-                     {1 / dx, 0.0, 0.0, 0.0}, // row 0
-                     {0.0, 1 / dx, 0.0, 0.0}, // row 1
-                     {0.0, 0.0, 1 / dx, 0.0}, // row 2
-                     {0.0, 0.0, 0.0, 0.0}, // row 3, ignored by Map::set
-                 };
-                 map.set(mat, invMat, 1.0);
-             }
+ namespace nanovdb
+ {
+ template <> struct BufferTraits<DeviceBuffer>
+ {
+     static constexpr bool hasDeviceDual = true;
+ };
+
+ } // namespace nanovdb
+
+ template <typename BuildT>
+ void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<BuildT>> *&out_grid, size_t &out_grid_size,
+                             const void *points, size_t num_points, bool points_in_world_space,
+                             const BuildGridParams<BuildT> &params)
+ {
+
+     out_grid = nullptr;
+     out_grid_size = 0;
 
-             grid->mMagic = NANOVDB_MAGIC_NUMBER;
-             grid->mChecksum = 0xFFFFFFFFFFFFFFFFull;
-             grid->mVersion = nanovdb::Version();
-             grid->mFlags = static_cast<uint32_t>(nanovdb::GridFlags::HasBBox) |
-                            static_cast<uint32_t>(nanovdb::GridFlags::IsBreadthFirst);
-             grid->mGridIndex = 0;
-             grid->mGridCount = 1;
-             grid->mGridSize = total_bytes;
-             // mGridName is set below
-             grid->mWorldBBox.mCoord[0] = map.applyMap(nanovdb::Vec3R(root->mBBox.mCoord[0]));
-             grid->mWorldBBox.mCoord[1] = map.applyMap(nanovdb::Vec3R(root->mBBox.mCoord[1]));
-             grid->mVoxelSize = nanovdb::Vec3d(dx);
-             grid->mMap = map;
-             grid->mGridClass = nanovdb::GridClass::Unknown;
-             grid->mGridType = nanovdb::mapToGridType<BuildT>();
-             grid->mBlindMetadataOffset = total_bytes;
-             grid->mBlindMetadataCount = 0;
-         });
+     cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
+     nanovdb::Map map(params.voxel_size, params.translation);
+     nanovdb::tools::cuda::PointsToGrid<BuildT, Allocator> p2g(map, stream);
+
+     // p2g.setVerbose(2);
+     p2g.setGridName(params.name);
+     p2g.setChecksum(nanovdb::CheckMode::Disable);
+
+     // Only compute bbox for OnIndex grids. Otherwise bbox will be computed after activating all leaf voxels
+     p2g.includeBBox(nanovdb::BuildTraits<BuildT>::is_onindex);
+
+     nanovdb::GridHandle<DeviceBuffer> grid_handle;
+
+     if (points_in_world_space)
+     {
+         grid_handle = p2g.getHandle(WorldSpacePointsPtr{static_cast<const nanovdb::Vec3f *>(points), map}, num_points,
+                                     DeviceBuffer());
+     }
+     else
+     {
+         grid_handle = p2g.getHandle(static_cast<const nanovdb::Coord *>(points), num_points, DeviceBuffer());
      }
 
-     check_cuda(cudaMemcpy(grid->mGridName, params.name, 256, cudaMemcpyHostToDevice));
+     out_grid = grid_handle.deviceGrid<BuildT>();
+     out_grid_size = grid_handle.gridSize();
 
-     allocator.DeviceFree(lower_keys);
-     allocator.DeviceFree(upper_keys);
-     allocator.DeviceFree(leaf_keys);
-     allocator.DeviceFree(node_counts);
+     finalize_grid(*out_grid, params);
 
-     out_grid = reinterpret_cast<Grid*>(grid);
-     out_grid_size = total_bytes;
+     // So that buffer is not destroyed when handles goes out of scope
+     grid_handle.buffer().detachDeviceData();
  }
 
- template void build_grid_from_tiles(nanovdb::Grid<nanovdb::NanoTree<float>>*&, size_t&, const void*, size_t, bool, const BuildGridParams<float>&);
- template void build_grid_from_tiles(nanovdb::Grid<nanovdb::NanoTree<nanovdb::Vec3f>>*&, size_t&, const void*, size_t, bool, const BuildGridParams<nanovdb::Vec3f>&);
- template void build_grid_from_tiles(nanovdb::Grid<nanovdb::NanoTree<int32_t>>*&, size_t&, const void*, size_t, bool, const BuildGridParams<int32_t>&);
+ template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<float>> *&, size_t &, const void *, size_t, bool,
+                                      const BuildGridParams<float> &);
+ template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<nanovdb::Vec3f>> *&, size_t &, const void *,
+                                      size_t, bool, const BuildGridParams<nanovdb::Vec3f> &);
+ template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<int32_t>> *&, size_t &, const void *, size_t, bool,
+                                      const BuildGridParams<int32_t> &);
+ template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<nanovdb::ValueIndex>> *&, size_t &, const void *,
+                                      size_t, bool, const BuildGridParams<nanovdb::ValueIndex> &);
+ template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<nanovdb::ValueOnIndex>> *&, size_t &, const void *,
+                                      size_t, bool, const BuildGridParams<nanovdb::ValueOnIndex> &);
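
The rewrite splits construction in two: nanovdb::tools::cuda::PointsToGrid builds the tree topology, and finalize_grid then activates every voxel of each allocated leaf and writes the background value into unallocated tiles (index grids skip this step). A minimal, hedged sketch of the resulting contract from the Python side is shown below; the kernel itself is illustrative, while wp.volume_world_to_index, wp.volume_sample_f, and wp.Volume.LINEAR are existing Warp built-ins.

import warp as wp

wp.init()

@wp.kernel
def sample_world(volume: wp.uint64,
                 xyz: wp.array(dtype=wp.vec3),
                 values: wp.array(dtype=float)):
    tid = wp.tid()
    # Positions inside allocated tiles read stored voxel values; positions
    # outside fall back to the background value written by the finalize pass.
    uvw = wp.volume_world_to_index(volume, xyz[tid])
    values[tid] = wp.volume_sample_f(volume, uvw, wp.Volume.LINEAR)

# usage sketch: wp.launch(sample_world, dim=n, inputs=[volume.id, points, out])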