torch-chamfer-dist 0.1.1__cp39-cp39-macosx_13_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chamfer/__init__.py +240 -0
- chamfer/src/kd_tree.cpp +86 -0
- chamfer/src/kd_tree.hpp +22 -0
- chamfer/src/metal_bridge.mm +503 -0
- chamfer_ext.cpython-39-darwin.so +0 -0
- torch_chamfer_dist-0.1.1.dist-info/METADATA +87 -0
- torch_chamfer_dist-0.1.1.dist-info/RECORD +10 -0
- torch_chamfer_dist-0.1.1.dist-info/WHEEL +5 -0
- torch_chamfer_dist-0.1.1.dist-info/licenses/LICENSE +21 -0
- torch_chamfer_dist-0.1.1.dist-info/top_level.txt +2 -0
chamfer/__init__.py
ADDED
@@ -0,0 +1,240 @@
from __future__ import annotations

import os
import site
from pathlib import Path
from typing import Tuple

import torch

__all__ = ["closest_points", "chamfer_distance"]

_EXTENSION = None


def _extension() -> object:
    global _EXTENSION
    if _EXTENSION is not None:
        return _EXTENSION

    try:
        import chamfer_ext  # type: ignore
    except ImportError:
        src_dir = Path(__file__).resolve().parent / "src"
        if not src_dir.exists():
            raise RuntimeError(
                "chamfer_ext extension not built. Install from wheel or run setup.py."
            ) from None

        from torch.utils.cpp_extension import load
        import nanobind

        nanobind_root = Path(nanobind.__file__).resolve().parent
        nb_combined = nanobind_root / "src" / "nb_combined.cpp"

        sources = [
            src_dir / "metal_bridge.mm",
            src_dir / "kd_tree.cpp",
            nb_combined,
        ]
        include_dirs = [
            str(src_dir),
            str(nanobind_root / "include"),
            str(nanobind_root / "ext" / "robin_map" / "include"),
        ]

        os.environ.setdefault("MACOSX_DEPLOYMENT_TARGET", "13.0")
        user_bin = Path(site.getuserbase()) / "bin"
        if user_bin.exists():
            current_path = os.environ.get("PATH", "")
            if str(user_bin) not in current_path.split(os.pathsep):
                os.environ["PATH"] = os.pathsep.join(
                    [str(user_bin)] + ([current_path] if current_path else [])
                )

        extra_cflags = ["-std=c++20", "-fobjc-arc", "-fvisibility=hidden"]
        extra_ldflags = ["-framework", "Metal", "-framework", "Foundation"]

        chamfer_ext = load(
            name="chamfer_ext",
            sources=[str(path) for path in sources if path.exists()],
            extra_include_paths=include_dirs,
            extra_cflags=extra_cflags,
            extra_ldflags=extra_ldflags,
            verbose=False,
        )

    _EXTENSION = chamfer_ext
    return _EXTENSION


def _mps_available() -> bool:
    return bool(getattr(torch.backends, "mps", None) and torch.backends.mps.is_available())


def _validate_pair(query: torch.Tensor, reference: torch.Tensor) -> None:
    if query.dim() != 2:
        raise ValueError("query tensor must be 2D [N, K]")
    if reference.dim() != 2:
        raise ValueError("reference tensor must be 2D [M, K]")
    if query.size(1) != reference.size(1):
        raise ValueError("query and reference tensors must have matching feature dimensions")


def _require_device(tensor: torch.Tensor, device: str, name: str) -> None:
    if tensor.device.type != device:
        raise ValueError(f"{name} tensor must live on {device}, but found {tensor.device.type}")


def _require_float32(tensor: torch.Tensor, name: str) -> None:
    if tensor.dtype != torch.float32:
        raise ValueError(f"{name} tensor must be float32, but found {tensor.dtype}")


def _prepare_backend_tensors(
    query: torch.Tensor, reference: torch.Tensor, *, is_mps: bool
) -> Tuple[torch.Tensor, torch.Tensor]:
    device = "mps" if is_mps else "cpu"
    _require_device(query, device, "query")
    _require_device(reference, device, "reference")
    _require_float32(query, "query")
    _require_float32(reference, "reference")
    return query.contiguous(), reference.contiguous()


def _decide_backend(
    query: torch.Tensor, reference: torch.Tensor, use_mps: bool | None
) -> bool:
    mps_available = _mps_available()
    inputs_on_mps = query.device.type == "mps" and reference.device.type == "mps"
    inputs_on_cpu = query.device.type == "cpu" and reference.device.type == "cpu"

    if use_mps is True:
        if not mps_available:
            raise RuntimeError("MPS was requested, but torch.backends.mps.is_available() is False")
        if not inputs_on_mps:
            raise ValueError("MPS execution requires both tensors to be on the mps device")
        return True

    if use_mps is False:
        if not inputs_on_cpu:
            raise ValueError("CPU execution requires both tensors to be on the cpu device")
        return False

    if inputs_on_mps:
        if not mps_available:
            raise RuntimeError("Input tensors are on MPS, but the MPS backend is unavailable")
        return True

    if inputs_on_cpu:
        return False

    raise ValueError("query and reference must both reside on either CPU or MPS device")


def closest_points(
    query: torch.Tensor,
    reference: torch.Tensor,
    *,
    use_mps: bool | None = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return (indices, squared distances) of nearest neighbours in *reference* for each query point.

    The search uses a kd-tree built on the CPU; for MPS inputs the traversal runs on the GPU via
    Metal, for CPU inputs it runs on the CPU.
    """

    _validate_pair(query, reference)
    use_mps_flag = _decide_backend(query, reference, use_mps)
    query_prepped, reference_prepped = _prepare_backend_tensors(query, reference, is_mps=use_mps_flag)
    ext = _extension()
    if use_mps_flag:
        return ext.kd_query(query_prepped, reference_prepped)
    if not hasattr(ext, "kd_query_cpu"):
        raise RuntimeError("CPU kd-tree query is not available in the compiled extension")
    return ext.kd_query_cpu(query_prepped, reference_prepped)


class _ChamferDistanceFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, a: torch.Tensor, b: torch.Tensor, use_mps_flag: bool | None = None) -> torch.Tensor:
        if a.device != b.device:
            raise ValueError("points_a and points_b must be on the same device")
        assert a.device.type in {"cpu", "mps"}, "Unsupported device for chamfer_distance"

        _validate_pair(a, b)
        backend_is_mps = _decide_backend(a, b, use_mps_flag)
        a_prepped, b_prepped = _prepare_backend_tensors(a, b, is_mps=backend_is_mps)

        idx_ab_tensor, _ = closest_points(a_prepped, b_prepped, use_mps=backend_is_mps)
        idx_ba_tensor, _ = closest_points(b_prepped, a_prepped, use_mps=backend_is_mps)

        idx_ab = idx_ab_tensor.to(device=b_prepped.device, dtype=torch.long)
        idx_ba = idx_ba_tensor.to(device=a_prepped.device, dtype=torch.long)

        nn_ab = torch.index_select(b_prepped, 0, idx_ab)
        nn_ba = torch.index_select(a_prepped, 0, idx_ba)

        diff_ab = a_prepped - nn_ab
        diff_ba = b_prepped - nn_ba

        loss_ab = torch.sum(diff_ab * diff_ab, dim=1).mean()
        loss_ba = torch.sum(diff_ba * diff_ba, dim=1).mean()
        loss = loss_ab + loss_ba

        ctx.save_for_backward(
            a_prepped,
            b_prepped,
            idx_ab_tensor.to(torch.long),
            idx_ba_tensor.to(torch.long),
        )
        ctx.sizes = (a_prepped.shape[0], b_prepped.shape[0])

        return loss

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, None]:
        a, b, idx_ab_saved, idx_ba_saved = ctx.saved_tensors
        n_a, n_b = ctx.sizes

        grad_a = grad_b = None
        scalar_a = grad_output.to(device=a.device, dtype=a.dtype)
        scalar_b = grad_output.to(device=b.device, dtype=b.dtype)

        # All tensors are either on CPU or MPS; keep computations there.
        assert a.device == b.device == idx_ab_saved.device == idx_ba_saved.device

        if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
            idx_ab = idx_ab_saved.to(device=b.device)
            nn_ab = torch.index_select(b, 0, idx_ab)
            diff_ab = a - nn_ab

            coeff_ab = (2.0 / float(n_a)) * scalar_a

        if ctx.needs_input_grad[1] or ctx.needs_input_grad[0]:
            idx_ba = idx_ba_saved.to(device=a.device)
            nn_ba = torch.index_select(a, 0, idx_ba)
            diff_ba = b - nn_ba

            coeff_ba = (2.0 / float(n_b)) * scalar_b

        if ctx.needs_input_grad[0]:
            grad_a = coeff_ab * diff_ab
            grad_a = grad_a.contiguous()
            scatter_idx = idx_ba_saved
            grad_a.index_add_(0, scatter_idx, (-coeff_ba) * diff_ba)

        if ctx.needs_input_grad[1]:
            grad_b = coeff_ba * diff_ba
            grad_b = grad_b.contiguous()
            scatter_idx = idx_ab_saved
            grad_b.index_add_(0, scatter_idx, (-coeff_ab) * diff_ab)

        return grad_a, grad_b, None


def chamfer_distance(
    points_a: torch.Tensor,
    points_b: torch.Tensor,
    *,
    use_mps: bool | None = None,
) -> torch.Tensor:
    return _ChamferDistanceFunction.apply(points_a, points_b, use_mps)
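A minimal usage sketch of the module above (editorial, not part of the wheel). The MPS branch assumes an Apple-silicon machine where `torch.backends.mps.is_available()` returns True; on other machines only the CPU branch applies.

```python
import torch
import chamfer

# CPU path: float32 tensors on the CPU select the kd_query_cpu backend.
a = torch.rand(1_000, 3)
b = torch.rand(1_500, 3)
idx, sq_dist = chamfer.closest_points(a, b)   # indices into b, squared distances
loss = chamfer.chamfer_distance(a.requires_grad_(), b.requires_grad_())
loss.backward()                               # gradients stay on the CPU

# MPS path: both tensors must already live on "mps"; use_mps=True makes that explicit.
if torch.backends.mps.is_available():
    loss_mps = chamfer.chamfer_distance(a.detach().to("mps"), b.detach().to("mps"), use_mps=True)
```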
chamfer/src/kd_tree.cpp
ADDED
@@ -0,0 +1,86 @@
#include "kd_tree.hpp"

#include <algorithm>
#include <atomic>
#include <functional>
#include <future>
#include <numeric>
#include <stdexcept>

namespace chamfer {

std::vector<KDNodeGPU> build_kd_tree(const float* points, int64_t num_points, int64_t dims) {
    if (num_points <= 0) {
        throw std::invalid_argument("build_kd_tree: num_points must be positive");
    }
    if (dims <= 0) {
        throw std::invalid_argument("build_kd_tree: dims must be positive");
    }

    std::vector<int> order(num_points);
    std::iota(order.begin(), order.end(), 0);

    std::vector<KDNodeGPU> gpu_nodes(static_cast<size_t>(num_points));
    std::atomic<int> next_index{0};

    const int dims_int = static_cast<int>(dims);
    const int max_parallel_depth = 2;
    const int parallel_threshold = 2048;

    std::function<int(int, int, int)> build = [&](int start, int end, int depth) -> int {
        if (start >= end) {
            return -1;
        }

        int axis = depth % dims_int;
        int mid = (start + end) / 2;

        auto comparator = [points, dims_int, axis, &order](int lhs, int rhs) {
            float l = points[static_cast<int64_t>(lhs) * dims_int + axis];
            float r = points[static_cast<int64_t>(rhs) * dims_int + axis];
            if (l == r) {
                return lhs < rhs;
            }
            return l < r;
        };

        std::nth_element(order.begin() + start, order.begin() + mid, order.begin() + end, comparator);

        int current = next_index.fetch_add(1, std::memory_order_relaxed);
        KDNodeGPU& node = gpu_nodes[static_cast<size_t>(current)];
        node.point_index = order[mid];
        node.split_dim = axis;
        node.split_value = points[static_cast<int64_t>(node.point_index) * dims_int + axis];
        node.pad0 = 0.0f;
        node.pad1 = 0.0f;
        node.pad2 = 0.0f;

        const bool parallel = depth < max_parallel_depth && (end - start) > parallel_threshold;

        int left_index;
        int right_index;
        if (parallel) {
            auto future_left = std::async(std::launch::async, [&]() {
                return build(start, mid, depth + 1);
            });
            right_index = build(mid + 1, end, depth + 1);
            left_index = future_left.get();
        } else {
            left_index = build(start, mid, depth + 1);
            right_index = build(mid + 1, end, depth + 1);
        }

        node.left = left_index;
        node.right = right_index;
        return current;
    };

    int root_index = build(0, static_cast<int>(num_points), 0);
    (void)root_index;

    gpu_nodes.resize(static_cast<size_t>(next_index.load(std::memory_order_relaxed)));

    return gpu_nodes;
}

} // namespace chamfer
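For readers who prefer Python, here is a small pure-Python sketch of the same recursive median-split construction: nodes are appended in creation order (root at index 0) and child fields hold array indices or -1, matching the flat layout consumed by the Metal kernel. It is illustrative only, not shipped in the package, and uses a full sort where the C++ code uses `std::nth_element` plus limited parallelism.

```python
from dataclasses import dataclass

@dataclass
class Node:
    point_index: int
    split_dim: int
    split_value: float
    left: int = -1   # index into the node list, -1 means "no child"
    right: int = -1

def build_kd_tree(points):
    """points: list of equal-length tuples; returns a flat node list with the root at index 0."""
    dims = len(points[0])
    nodes = []

    def build(order, depth):
        if not order:
            return -1
        axis = depth % dims
        order = sorted(order, key=lambda i: points[i][axis])  # median split along the cycling axis
        mid = len(order) // 2
        current = len(nodes)
        nodes.append(Node(order[mid], axis, points[order[mid]][axis]))
        nodes[current].left = build(order[:mid], depth + 1)
        nodes[current].right = build(order[mid + 1:], depth + 1)
        return current

    build(list(range(len(points))), 0)
    return nodes
```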
chamfer/src/kd_tree.hpp
ADDED
@@ -0,0 +1,22 @@
#pragma once

#include <vector>
#include <cstddef>
#include <cstdint>

namespace chamfer {

struct KDNodeGPU {
    int left;
    int right;
    int point_index;
    int split_dim;
    float split_value;
    float pad0;
    float pad1;
    float pad2;
};

std::vector<KDNodeGPU> build_kd_tree(const float* points, int64_t num_points, int64_t dims);

} // namespace chamfer
chamfer/src/metal_bridge.mm
ADDED
@@ -0,0 +1,503 @@
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>

#include <nanobind/nanobind.h>
#include <torch/extension.h>
#include <torch/csrc/autograd/python_variable.h>
#include <ATen/mps/MPSStream.h>

#include <algorithm>
#include <cstring>
#include <limits>
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>
#include <mach/mach_time.h>

#include "kd_tree.hpp"

namespace nb = nanobind;

namespace {

inline id<MTLBuffer> tensor_to_mtl_buffer(const at::Tensor& tensor) {
    return (__bridge id<MTLBuffer>)(tensor.storage().data());
}

struct TimebaseInfo {
    uint64_t numer = 0;
    uint64_t denom = 0;
    TimebaseInfo() {
        mach_timebase_info_data_t info;
        mach_timebase_info(&info);
        numer = info.numer;
        denom = info.denom;
    }
    double to_millis(uint64_t delta) const {
        double nanoseconds = static_cast<double>(delta) * static_cast<double>(numer) / static_cast<double>(denom);
        return nanoseconds / 1e6;
    }
};

const TimebaseInfo& timebase() {
    static TimebaseInfo info;
    return info;
}

bool should_profile() {
    static bool initialized = false;
    static bool enabled = false;
    if (!initialized) {
        const char* env = std::getenv("CHAMFER_PROFILE");
        enabled = env && std::strlen(env) > 0;
        initialized = true;
    }
    return enabled;
}

struct ScopedTimer {
    const TimebaseInfo& info;
    uint64_t start;
    std::string label;
    bool enabled;
    ScopedTimer(const TimebaseInfo& info, std::string lbl, bool en)
        : info(info), start(en ? mach_absolute_time() : 0), label(std::move(lbl)), enabled(en) {}
    ~ScopedTimer() {
        if (enabled) {
            uint64_t end = mach_absolute_time();
            double ms = info.to_millis(end - start);
            fprintf(stderr, "[chamfer] %s: %.3f ms\n", label.c_str(), ms);
        }
    }
};

constexpr const char* kMetalSource = R"(using namespace metal;

struct KDNode {
    int left;
    int right;
    int point_index;
    int split_dim;
    float split_value;
    float pad0;
    float pad1;
    float pad2;
};

inline float distance_squared(const device float* a,
                              const device float* b,
                              int dims) {
    float acc = 0.0f;
    for (int i = 0; i < dims; ++i) {
        float diff = a[i] - b[i];
        acc += diff * diff;
    }
    return acc;
}

kernel void kd_query(device const float* ref_points [[buffer(0)]],
                     device const KDNode* nodes [[buffer(1)]],
                     constant int& num_nodes [[buffer(2)]],
                     constant int& dims [[buffer(3)]],
                     device const float* queries [[buffer(4)]],
                     constant int& num_queries [[buffer(5)]],
                     device int* out_indices [[buffer(6)]],
                     device float* out_distances [[buffer(7)]],
                     uint gid [[thread_position_in_grid]]) {
    if (gid >= static_cast<uint>(num_queries)) {
        return;
    }

    constexpr int STACK_CAP = 128;
    int stack[STACK_CAP];
    int stack_size = 0;

    if (num_nodes > 0) {
        stack[stack_size++] = 0;
    }

    device const float* query = queries + static_cast<size_t>(gid) * static_cast<size_t>(dims);

    float best_dist = INFINITY;
    int best_index = -1;

    while (stack_size > 0) {
        int node_idx = stack[--stack_size];
        if (node_idx < 0 || node_idx >= num_nodes) {
            continue;
        }

        KDNode node = nodes[node_idx];
        int point_idx = node.point_index;
        device const float* point = ref_points + static_cast<size_t>(point_idx) * static_cast<size_t>(dims);

        float dist = distance_squared(query, point, dims);
        if (dist < best_dist) {
            best_dist = dist;
            best_index = point_idx;
        }

        int left = node.left;
        int right = node.right;
        if (left < 0 && right < 0) {
            continue;
        }

        float diff = query[node.split_dim] - node.split_value;
        int near_child = diff <= 0.0f ? left : right;
        int far_child = diff <= 0.0f ? right : left;

        if (far_child >= 0 && stack_size < STACK_CAP && diff * diff < best_dist) {
            stack[stack_size++] = far_child;
        }
        if (near_child >= 0 && stack_size < STACK_CAP) {
            stack[stack_size++] = near_child;
        }
    }

    if (best_index < 0) {
        best_dist = 0.0f;
    }

    out_indices[gid] = best_index;
    out_distances[gid] = best_dist;
}
)";

struct MetalContext {
    id<MTLDevice> device = nil;
    id<MTLCommandQueue> queue = nil;
    id<MTLLibrary> library = nil;
    id<MTLComputePipelineState> pipeline = nil;
    bool initialized = false;
    bool attempted = false;
    std::string error_message;
};

MetalContext& get_context() {
    static MetalContext ctx;
    return ctx;
}

void initialize_metal_once() {
    auto& ctx = get_context();
    static std::once_flag once_flag;
    std::call_once(once_flag, [&ctx]() {
        ctx.attempted = true;
        ctx.device = MTLCreateSystemDefaultDevice();
        if (!ctx.device) {
            ctx.error_message = "No Metal-capable device available for MPS";
            return;
        }
        ctx.queue = [ctx.device newCommandQueue];
        if (!ctx.queue) {
            ctx.error_message = "Failed to create Metal command queue";
            return;
        }

        NSError* error = nil;
        NSString* source = [[NSString alloc] initWithUTF8String:kMetalSource];
        MTLCompileOptions* options = [[MTLCompileOptions alloc] init];
        options.fastMathEnabled = YES;

        ctx.library = [ctx.device newLibraryWithSource:source options:options error:&error];
        if (!ctx.library) {
            std::string message = "Failed to compile Metal library: ";
            if (error) {
                message += [[error localizedDescription] UTF8String];
            }
            ctx.error_message = message;
            return;
        }

        id<MTLFunction> function = [ctx.library newFunctionWithName:@"kd_query"];
        if (!function) {
            ctx.error_message = "Failed to load kd_query function from Metal library";
            return;
        }

        ctx.pipeline = [ctx.device newComputePipelineStateWithFunction:function error:&error];
        if (!ctx.pipeline) {
            std::string message = "Failed to create pipeline state: ";
            if (error) {
                message += [[error localizedDescription] UTF8String];
            }
            ctx.error_message = message;
            return;
        }

        ctx.initialized = true;
    });
}

void ensure_initialized() {
    initialize_metal_once();
    auto& ctx = get_context();

    if (!ctx.initialized) {
        if (!ctx.error_message.empty()) {
            throw std::runtime_error(ctx.error_message);
        }
        throw std::runtime_error("Metal context failed to initialize");
    }
}

const at::Tensor& tensor_from_nb(nb::handle h) {
    if (!THPVariable_Check(h.ptr())) {
        throw nb::type_error("expected a torch.Tensor");
    }
    return THPVariable_Unpack(h.ptr());
}

nb::tuple kd_tree_query(nb::handle query_handle, nb::handle reference_handle) {
    torch::NoGradGuard guard;

    const bool profile = should_profile();
    const TimebaseInfo& tinfo = timebase();
    ScopedTimer total_timer(tinfo, "kd_query_total", profile);

    const at::Tensor& query_in = tensor_from_nb(query_handle);
    const at::Tensor& reference_in = tensor_from_nb(reference_handle);

    if (query_in.dim() != 2) {
        throw std::invalid_argument("query tensor must be 2D [N, K]");
    }
    if (reference_in.dim() != 2) {
        throw std::invalid_argument("reference tensor must be 2D [M, K]");
    }
    if (query_in.size(1) != reference_in.size(1)) {
        throw std::invalid_argument("query and reference tensors must have the same dimensionality");
    }

    if (!query_in.device().is_mps() || !reference_in.device().is_mps()) {
        throw std::invalid_argument("kd_query expects query and reference tensors on MPS device");
    }
    if (query_in.scalar_type() != at::kFloat || reference_in.scalar_type() != at::kFloat) {
        throw std::invalid_argument("kd_query expects float32 tensors");
    }

    int64_t dims = query_in.size(1);
    int64_t num_query = query_in.size(0);
    int64_t num_reference = reference_in.size(0);

    if (num_reference == 0) {
        throw std::invalid_argument("reference set must contain at least one point");
    }

    at::Tensor query_mps = query_in.contiguous();
    at::Tensor reference_mps = reference_in.contiguous();

    at::mps::getCurrentMPSStream()->synchronize(at::mps::SyncType::COMMIT_AND_WAIT);

    ensure_initialized();
    auto& ctx = get_context();

    at::Tensor reference_cpu;
    {
        ScopedTimer cpu_copy_timer(tinfo, "kd_query_copy_to_cpu", profile);
        reference_cpu = reference_mps.to(at::kCPU).contiguous();
    }

    std::vector<chamfer::KDNodeGPU> nodes;
    {
        ScopedTimer build_timer(tinfo, "kd_tree_build", profile);
        nodes = chamfer::build_kd_tree(reference_cpu.data_ptr<float>(), num_reference, dims);
    }
    if (nodes.empty()) {
        throw std::runtime_error("Failed to build kd-tree");
    }

    NSUInteger node_bytes = static_cast<NSUInteger>(nodes.size() * sizeof(chamfer::KDNodeGPU));
    id<MTLBuffer> node_buffer = [ctx.device newBufferWithBytes:nodes.data()
                                                        length:node_bytes
                                                       options:MTLResourceStorageModeShared];
    if (!node_buffer) {
        throw std::runtime_error("Failed to allocate node buffers");
    }

    auto indices_tensor = torch::empty({num_query}, torch::TensorOptions().dtype(torch::kInt32).device(torch::kMPS));
    auto distances_tensor = torch::empty({num_query}, torch::TensorOptions().dtype(torch::kFloat).device(torch::kMPS));

    id<MTLBuffer> points_buffer = tensor_to_mtl_buffer(reference_mps);
    id<MTLBuffer> query_buffer = tensor_to_mtl_buffer(query_mps);
    id<MTLBuffer> indices_buffer = tensor_to_mtl_buffer(indices_tensor);
    id<MTLBuffer> distances_buffer = tensor_to_mtl_buffer(distances_tensor);

    if (!points_buffer || !query_buffer || !node_buffer || !indices_buffer || !distances_buffer) {
        throw std::runtime_error("Failed to allocate Metal buffers");
    }

    id<MTLCommandBuffer> command_buffer = [ctx.queue commandBuffer];
    if (!command_buffer) {
        throw std::runtime_error("Failed to create Metal command buffer");
    }
    id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
    [encoder setComputePipelineState:ctx.pipeline];

    int num_nodes = static_cast<int>(nodes.size());
    int dims_i = static_cast<int>(dims);
    int num_query_i = static_cast<int>(num_query);

    NSUInteger points_offset = static_cast<NSUInteger>(reference_mps.storage_offset() * reference_mps.element_size());
    NSUInteger query_offset = static_cast<NSUInteger>(query_mps.storage_offset() * query_mps.element_size());
    NSUInteger indices_offset = static_cast<NSUInteger>(indices_tensor.storage_offset() * indices_tensor.element_size());
    NSUInteger distances_offset = static_cast<NSUInteger>(distances_tensor.storage_offset() * distances_tensor.element_size());

    [encoder setBuffer:points_buffer offset:points_offset atIndex:0];
    [encoder setBuffer:node_buffer offset:0 atIndex:1];
    [encoder setBytes:&num_nodes length:sizeof(int) atIndex:2];
    [encoder setBytes:&dims_i length:sizeof(int) atIndex:3];
    [encoder setBuffer:query_buffer offset:query_offset atIndex:4];
    [encoder setBytes:&num_query_i length:sizeof(int) atIndex:5];
    [encoder setBuffer:indices_buffer offset:indices_offset atIndex:6];
    [encoder setBuffer:distances_buffer offset:distances_offset atIndex:7];

    NSUInteger max_threads = ctx.pipeline.maxTotalThreadsPerThreadgroup;
    if (max_threads == 0) {
        max_threads = 64;
    }
    NSUInteger threadgroup_size = std::min<NSUInteger>(max_threads, 256);
    MTLSize threads_per_threadgroup = MTLSizeMake(threadgroup_size, 1, 1);
    NSUInteger grid_threads = static_cast<NSUInteger>(num_query);
    NSUInteger groups = (grid_threads + threadgroup_size - 1) / threadgroup_size;
    MTLSize threads_per_grid = MTLSizeMake(groups * threadgroup_size, 1, 1);
    {
        ScopedTimer dispatch_timer(tinfo, "kd_query_dispatch", profile);
        [encoder dispatchThreads:threads_per_grid threadsPerThreadgroup:threads_per_threadgroup];
        [encoder endEncoding];
        [command_buffer commit];
    }

    {
        ScopedTimer wait_timer(tinfo, "kd_query_wait", profile);
        [command_buffer waitUntilCompleted];
    }

    PyObject* indices_obj = THPVariable_Wrap(indices_tensor);
    PyObject* distances_obj = THPVariable_Wrap(distances_tensor);

    return nb::make_tuple(nb::steal<nb::object>(indices_obj), nb::steal<nb::object>(distances_obj));
}

nb::tuple kd_tree_query_cpu(nb::handle query_handle, nb::handle reference_handle) {
    torch::NoGradGuard guard;

    const at::Tensor& query_in = tensor_from_nb(query_handle);
    const at::Tensor& reference_in = tensor_from_nb(reference_handle);

    if (query_in.dim() != 2) {
        throw std::invalid_argument("query tensor must be 2D [N, K]");
    }
    if (reference_in.dim() != 2) {
        throw std::invalid_argument("reference tensor must be 2D [M, K]");
    }
    if (query_in.size(1) != reference_in.size(1)) {
        throw std::invalid_argument("query and reference tensors must have the same dimensionality");
    }

    int64_t dims = query_in.size(1);
    int64_t num_query = query_in.size(0);
    int64_t num_reference = reference_in.size(0);

    if (num_reference == 0) {
        throw std::invalid_argument("reference set must contain at least one point");
    }

    at::Tensor query_cpu = query_in;
    if (!query_cpu.device().is_cpu() || query_cpu.scalar_type() != at::kFloat || !query_cpu.is_contiguous()) {
        query_cpu = query_in.to(at::kCPU, at::kFloat).contiguous();
    }

    at::Tensor reference_cpu = reference_in;
    if (!reference_cpu.device().is_cpu() || reference_cpu.scalar_type() != at::kFloat || !reference_cpu.is_contiguous()) {
        reference_cpu = reference_in.to(at::kCPU, at::kFloat).contiguous();
    }

    auto nodes = chamfer::build_kd_tree(reference_cpu.data_ptr<float>(), num_reference, dims);
    if (nodes.empty()) {
        throw std::runtime_error("Failed to build kd-tree");
    }

    auto indices_tensor = torch::empty({num_query}, torch::dtype(torch::kInt32).device(torch::kCPU));
    auto distances_tensor = torch::empty({num_query}, torch::dtype(torch::kFloat).device(torch::kCPU));

    const float* query_ptr = query_cpu.data_ptr<float>();
    const float* reference_ptr = reference_cpu.data_ptr<float>();
    int32_t* index_ptr = indices_tensor.data_ptr<int32_t>();
    float* distance_ptr = distances_tensor.data_ptr<float>();

    std::vector<int> stack;
    stack.reserve(64);

    for (int64_t qi = 0; qi < num_query; ++qi) {
        const float* query = query_ptr + qi * dims;
        float best_dist = std::numeric_limits<float>::infinity();
        int best_index = -1;

        stack.clear();
        if (!nodes.empty()) {
            stack.push_back(0);
        }

        while (!stack.empty()) {
            int node_idx = stack.back();
            stack.pop_back();
            if (node_idx < 0 || node_idx >= static_cast<int>(nodes.size())) {
                continue;
            }

            const auto& node = nodes[node_idx];
            int point_idx = node.point_index;
            const float* point = reference_ptr + static_cast<int64_t>(point_idx) * dims;

            float dist = 0.0f;
            for (int64_t d = 0; d < dims; ++d) {
                float diff = query[d] - point[d];
                dist += diff * diff;
            }

            if (dist < best_dist) {
                best_dist = dist;
                best_index = point_idx;
            }

            int left = node.left;
            int right = node.right;
            if (left < 0 && right < 0) {
                continue;
            }

            float diff = query[node.split_dim] - node.split_value;
            int near_child = diff <= 0.0f ? left : right;
            int far_child = diff <= 0.0f ? right : left;

            if (far_child >= 0 && diff * diff < best_dist) {
                stack.push_back(far_child);
            }
            if (near_child >= 0) {
                stack.push_back(near_child);
            }
        }

        if (best_index < 0) {
            best_dist = 0.0f;
            best_index = 0;
        }

        index_ptr[qi] = best_index;
        distance_ptr[qi] = best_dist;
    }

    PyObject* indices_obj = THPVariable_Wrap(indices_tensor);
    PyObject* distances_obj = THPVariable_Wrap(distances_tensor);

    return nb::make_tuple(nb::steal<nb::object>(indices_obj), nb::steal<nb::object>(distances_obj));
}

} // namespace

NB_MODULE(chamfer_ext, m) {
    m.def("kd_query", &kd_tree_query, "KD-tree nearest neighbour query using Metal");
    m.def("kd_query_cpu", &kd_tree_query_cpu, "KD-tree nearest neighbour query on CPU");
}
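Since the Metal kernel and the CPU fallback implement the same traversal, one quick sanity check of the bridge above is to compare the two paths on random data. This is an editorial sketch, not part of the package, and it assumes an MPS-capable machine:

```python
import torch
import chamfer

torch.manual_seed(0)
a_cpu = torch.rand(4_096, 3)
b_cpu = torch.rand(4_096, 3)

idx_cpu, d_cpu = chamfer.closest_points(a_cpu, b_cpu)                      # CPU kd-tree
idx_mps, d_mps = chamfer.closest_points(a_cpu.to("mps"), b_cpu.to("mps"))  # Metal kernel

# Squared distances should agree to float32 precision; indices may differ only
# when two reference points are equidistant from a query.
assert torch.allclose(d_cpu, d_mps.cpu(), atol=1e-5)
```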
chamfer_ext.cpython-39-darwin.so
Binary file
torch_chamfer_dist-0.1.1.dist-info/METADATA
ADDED
@@ -0,0 +1,87 @@
Metadata-Version: 2.4
Name: torch-chamfer-dist
Version: 0.1.1
Summary: Chamfer distance with Metal/MPS acceleration (macOS)
Author: Janos
License-Expression: MIT
Project-URL: Homepage, https://github.com/Janos95/chamfer
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.1
Requires-Dist: nanobind>=2.0
Dynamic: license-file

# torch-chamfer-dist

`torch-chamfer-dist` provides a fast Chamfer distance implementation for PyTorch. On macOS with Metal/MPS it runs kd-tree nearest-neighbour queries directly on the GPU; elsewhere it falls back to an optimized CPU kd-tree. Autograd support is built in.

## Installation

```bash
pip install torch-chamfer-dist
```

The provided wheel targets macOS 13+ (arm64 and x86_64). On other platforms the CPU backend is selected automatically.

## Quick start

```python
import torch
import chamfer

# Create two point clouds on the desired device ("mps" for Metal, "cpu" otherwise)
a = torch.rand(5_000, 3, device="mps", requires_grad=True)
b = torch.rand(5_000, 3, device="mps", requires_grad=True)

# Nearest neighbours via kd-tree
dist_idx, dist_sq = chamfer.closest_points(a, b)

# Chamfer distance with gradients
loss = chamfer.chamfer_distance(a, b)
loss.backward()
```

The device of the inputs determines the backend. When both tensors live on MPS the Metal kernel is used; otherwise the CPU kd-tree path runs. Gradients are computed on the same device without host roundtrips.
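The `use_mps` keyword pins the backend explicitly instead of inferring it from the input device; mismatches raise a `ValueError` rather than copying data behind your back. A short sketch continuing the example above:

```python
# Explicit backend selection (continuing the quick start above).
loss_mps = chamfer.chamfer_distance(a, b, use_mps=True)                          # both tensors already on "mps"
loss_cpu = chamfer.chamfer_distance(a.detach().cpu(), b.detach().cpu(), use_mps=False)
```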

## Benchmarks

The repository ships a benchmark script comparing brute-force, CPU kd-tree, and Metal kd-tree implementations. Example (20k points per cloud on an M2 Pro):

```
Method      | Forward           | Backward
------------+-------------------+------------------
Brute force | 0.885 s           | 1.829 s
KD-tree CPU | 0.139 s (6.39x)   | 0.269 s (6.79x)
KD-tree MPS | 0.008 s (115.31x) | 0.012 s (147.63x)
```

Run the benchmark locally:

```bash
PYTHONPATH=. python benchmarks/benchmark_chamfer.py --n 20000 --chunk 4096 --repeat 3
```

Set `CHAMFER_PROFILE=1` to emit per-stage timings (tree build, kernel wait, etc.).
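The flag is read once, the first time a query runs, and then cached, so it must be in the environment before the first call. A small sketch, assuming an MPS machine:

```python
import os
os.environ["CHAMFER_PROFILE"] = "1"  # set before the first chamfer call; the value is cached

import torch
import chamfer

a = torch.rand(10_000, 3, device="mps")
b = torch.rand(10_000, 3, device="mps")
chamfer.chamfer_distance(a, b)       # per-stage timings are printed to stderr
```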

## Development

- Install dependencies: `pip install torch nanobind pytest build`.
- Run tests: `python -m pytest`.
- Build wheel: `python -m build`.

### Publishing to PyPI

```bash
python -m pip install --upgrade build twine
python -m build
python -m twine upload dist/*
```

Remember to bump the version in `pyproject.toml` before tagging and uploading a release.
torch_chamfer_dist-0.1.1.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
chamfer_ext.cpython-39-darwin.so,sha256=kH2IIIcBNhuokaAR2IiXe--DzYFHn1NJiZN1Fg5-SKE,825440
chamfer/__init__.py,sha256=frr68NMP-eQcfRqvrjJqG26NQUQgNU-FnSr36gU58YA,8528
chamfer/src/kd_tree.cpp,sha256=bAIazy-Co3yls9iRFvhHThHrPpXUxruFVuklU5CYBLU,2733
chamfer/src/kd_tree.hpp,sha256=YoO5dGfWkwfjyjk6k-O5TNCmvWu2Wqz7Z-Gq6dSzcO4,333
chamfer/src/metal_bridge.mm,sha256=jqqV-hPqy9xQlsiCL1fUwSVvQlkTry9zDuVZeTUrC_0,17275
torch_chamfer_dist-0.1.1.dist-info/licenses/LICENSE,sha256=5FFaSGkWnSDsyq8Q_X3pU32jUSmmpUyaTP0UHsMKYuA,1062
torch_chamfer_dist-0.1.1.dist-info/METADATA,sha256=9Q5gmXoK-s5iq31OlSXd7kRrCIFIuyOnigKwuz3c_As,2438
torch_chamfer_dist-0.1.1.dist-info/WHEEL,sha256=MvK8_Pa_hl4o_UrfNOLF60Dnvg5dKyq8ck5oRFEOLq8,112
torch_chamfer_dist-0.1.1.dist-info/top_level.txt,sha256=utVLD13Vx_2MOE1JHvK379NJMKdA8B3uTjmXIe3OiJ4,20
torch_chamfer_dist-0.1.1.dist-info/RECORD,,
torch_chamfer_dist-0.1.1.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Janos

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.