RubyGems - faiss - Versions diffs - 0.2.5 → 0.2.7 - Mend

faiss 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (191) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +9 -0
data/LICENSE.txt +1 -1
data/ext/faiss/extconf.rb +1 -1
data/ext/faiss/index.cpp +13 -0
data/lib/faiss/version.rb +1 -1
data/lib/faiss.rb +2 -2
data/vendor/faiss/faiss/AutoTune.cpp +15 -4
data/vendor/faiss/faiss/AutoTune.h +0 -1
data/vendor/faiss/faiss/Clustering.cpp +1 -5
data/vendor/faiss/faiss/Clustering.h +0 -2
data/vendor/faiss/faiss/IVFlib.h +0 -2
data/vendor/faiss/faiss/Index.h +1 -2
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +17 -3
data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +10 -1
data/vendor/faiss/faiss/IndexBinary.h +0 -1
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +2 -1
data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -0
data/vendor/faiss/faiss/IndexBinaryHash.cpp +1 -3
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +273 -48
data/vendor/faiss/faiss/IndexBinaryIVF.h +18 -11
data/vendor/faiss/faiss/IndexFastScan.cpp +13 -10
data/vendor/faiss/faiss/IndexFastScan.h +5 -1
data/vendor/faiss/faiss/IndexFlat.cpp +16 -3
data/vendor/faiss/faiss/IndexFlat.h +1 -1
data/vendor/faiss/faiss/IndexFlatCodes.cpp +5 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +7 -2
data/vendor/faiss/faiss/IndexHNSW.cpp +3 -6
data/vendor/faiss/faiss/IndexHNSW.h +0 -1
data/vendor/faiss/faiss/IndexIDMap.cpp +4 -4
data/vendor/faiss/faiss/IndexIDMap.h +0 -2
data/vendor/faiss/faiss/IndexIVF.cpp +155 -129
data/vendor/faiss/faiss/IndexIVF.h +121 -61
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFFastScan.cpp +12 -11
data/vendor/faiss/faiss/IndexIVFFastScan.h +6 -1
data/vendor/faiss/faiss/IndexIVFPQ.cpp +221 -165
data/vendor/faiss/faiss/IndexIVFPQ.h +1 -0
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +6 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +0 -2
data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -2
data/vendor/faiss/faiss/IndexNNDescent.h +0 -1
data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
data/vendor/faiss/faiss/IndexPQ.cpp +7 -9
data/vendor/faiss/faiss/IndexRefine.cpp +1 -1
data/vendor/faiss/faiss/IndexReplicas.cpp +3 -4
data/vendor/faiss/faiss/IndexReplicas.h +0 -1
data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +8 -1
data/vendor/faiss/faiss/IndexRowwiseMinMax.h +7 -0
data/vendor/faiss/faiss/IndexShards.cpp +26 -109
data/vendor/faiss/faiss/IndexShards.h +2 -3
data/vendor/faiss/faiss/IndexShardsIVF.cpp +246 -0
data/vendor/faiss/faiss/IndexShardsIVF.h +42 -0
data/vendor/faiss/faiss/MetaIndexes.cpp +86 -0
data/vendor/faiss/faiss/MetaIndexes.h +29 -0
data/vendor/faiss/faiss/MetricType.h +14 -0
data/vendor/faiss/faiss/VectorTransform.cpp +8 -10
data/vendor/faiss/faiss/VectorTransform.h +1 -3
data/vendor/faiss/faiss/clone_index.cpp +232 -18
data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +25 -3
data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +7 -0
data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +78 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +20 -6
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +7 -1
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +21 -7
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +7 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +7 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +10 -3
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +7 -1
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +11 -3
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +25 -2
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +76 -29
data/vendor/faiss/faiss/gpu/GpuCloner.h +2 -2
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +14 -13
data/vendor/faiss/faiss/gpu/GpuDistance.h +18 -6
data/vendor/faiss/faiss/gpu/GpuIndex.h +23 -21
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +10 -10
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -12
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +29 -50
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +3 -3
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +8 -8
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +4 -4
data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +2 -5
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +9 -7
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +4 -4
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +2 -2
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +1 -1
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +55 -6
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +20 -6
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +95 -25
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +67 -16
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +4 -4
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +7 -7
data/vendor/faiss/faiss/gpu/test/TestUtils.h +4 -4
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +0 -7
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +9 -9
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +2 -7
data/vendor/faiss/faiss/impl/CodePacker.cpp +67 -0
data/vendor/faiss/faiss/impl/CodePacker.h +71 -0
data/vendor/faiss/faiss/impl/DistanceComputer.h +0 -2
data/vendor/faiss/faiss/impl/HNSW.cpp +3 -7
data/vendor/faiss/faiss/impl/HNSW.h +6 -9
data/vendor/faiss/faiss/impl/IDSelector.cpp +1 -1
data/vendor/faiss/faiss/impl/IDSelector.h +39 -1
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +62 -51
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +11 -12
data/vendor/faiss/faiss/impl/NNDescent.cpp +3 -9
data/vendor/faiss/faiss/impl/NNDescent.h +10 -10
data/vendor/faiss/faiss/impl/NSG.cpp +1 -6
data/vendor/faiss/faiss/impl/NSG.h +4 -7
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +1 -15
data/vendor/faiss/faiss/impl/PolysemousTraining.h +11 -10
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +0 -7
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -12
data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -4
data/vendor/faiss/faiss/impl/Quantizer.h +6 -3
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +796 -174
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +16 -8
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +3 -5
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +4 -4
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +3 -3
data/vendor/faiss/faiss/impl/ThreadedIndex.h +4 -4
data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +291 -0
data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +74 -0
data/vendor/faiss/faiss/impl/code_distance/code_distance.h +123 -0
data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +102 -0
data/vendor/faiss/faiss/impl/index_read.cpp +13 -10
data/vendor/faiss/faiss/impl/index_write.cpp +3 -4
data/vendor/faiss/faiss/impl/kmeans1d.cpp +0 -1
data/vendor/faiss/faiss/impl/kmeans1d.h +3 -3
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
data/vendor/faiss/faiss/impl/platform_macros.h +61 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +48 -4
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +18 -4
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +2 -2
data/vendor/faiss/faiss/index_factory.cpp +8 -10
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +29 -12
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +8 -2
data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
data/vendor/faiss/faiss/invlists/DirectMap.h +2 -4
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +118 -18
data/vendor/faiss/faiss/invlists/InvertedLists.h +44 -4
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
data/vendor/faiss/faiss/python/python_callbacks.h +1 -1
data/vendor/faiss/faiss/utils/AlignedTable.h +3 -1
data/vendor/faiss/faiss/utils/Heap.cpp +139 -3
data/vendor/faiss/faiss/utils/Heap.h +35 -1
data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +84 -0
data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +196 -0
data/vendor/faiss/faiss/utils/approx_topk/generic.h +138 -0
data/vendor/faiss/faiss/utils/approx_topk/mode.h +34 -0
data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +367 -0
data/vendor/faiss/faiss/utils/distances.cpp +61 -7
data/vendor/faiss/faiss/utils/distances.h +11 -0
data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +346 -0
data/vendor/faiss/faiss/utils/distances_fused/avx512.h +36 -0
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +42 -0
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +40 -0
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +352 -0
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +32 -0
data/vendor/faiss/faiss/utils/distances_simd.cpp +515 -327
data/vendor/faiss/faiss/utils/extra_distances-inl.h +17 -1
data/vendor/faiss/faiss/utils/extra_distances.cpp +37 -8
data/vendor/faiss/faiss/utils/extra_distances.h +2 -1
data/vendor/faiss/faiss/utils/fp16-fp16c.h +7 -0
data/vendor/faiss/faiss/utils/fp16-inl.h +7 -0
data/vendor/faiss/faiss/utils/fp16.h +7 -0
data/vendor/faiss/faiss/utils/hamming-inl.h +0 -456
data/vendor/faiss/faiss/utils/hamming.cpp +104 -120
data/vendor/faiss/faiss/utils/hamming.h +21 -10
data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +535 -0
data/vendor/faiss/faiss/utils/hamming_distance/common.h +48 -0
data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +519 -0
data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +26 -0
data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +614 -0
data/vendor/faiss/faiss/utils/partitioning.cpp +21 -25
data/vendor/faiss/faiss/utils/simdlib_avx2.h +344 -3
data/vendor/faiss/faiss/utils/simdlib_emulated.h +390 -0
data/vendor/faiss/faiss/utils/simdlib_neon.h +655 -130
data/vendor/faiss/faiss/utils/sorting.cpp +692 -0
data/vendor/faiss/faiss/utils/sorting.h +71 -0
data/vendor/faiss/faiss/utils/transpose/transpose-avx2-inl.h +165 -0
data/vendor/faiss/faiss/utils/utils.cpp +4 -176
data/vendor/faiss/faiss/utils/utils.h +2 -9
metadata +30 -4
data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +0 -26

data/vendor/faiss/faiss/utils/sorting.cpp ADDED Viewed

@@ -0,0 +1,692 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#include <faiss/utils/sorting.h>
+#include <omp.h>
+#include <algorithm>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/utils/utils.h>
+namespace faiss {
+/*****************************************************************************
+ * Argsort
+ ****************************************************************************/
+namespace {
+// Comparison functor for argsort: compares two indices through the float
+// values they designate in `vals`.
+struct ArgsortComparator {
+    const float* vals; // table the compared indices point into
+    // true when the value at index lhs is smaller than the value at index rhs
+    bool operator()(const size_t lhs, const size_t rhs) const {
+        const float left = vals[lhs];
+        const float right = vals[rhs];
+        return left < right;
+    }
+};
+// Range of consecutive entries [i0, i1) of a permutation array.
+struct SegmentS {
+    size_t i0; // index of the first entry of the range
+    size_t i1; // index one past the last entry of the range
+    // number of entries covered by the range
+    size_t len() const {
+        const size_t count = i1 - i0;
+        return count;
+    }
+};
+// see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
+// extended to > 1 merge thread
+// merges 2 ranges that should be consecutive on the source into
+// the union of the two on the destination
+/**
+ * Merge two sorted segments of src into a single sorted segment of dst,
+ * splitting the work over nt OpenMP threads.
+ *
+ * The larger segment is cut in nt equal parts. The first element of each
+ * following part is used as a pivot to cut the smaller segment at the
+ * matching position, so that thread t can merge its own pair of sub-ranges
+ * without looking at the others.
+ *
+ * @param src   input table that holds the two sorted segments
+ * @param dst   output table, written over the union of the two segments
+ * @param s1    first segment; on return it describes the merged range
+ * @param s2    second segment; on return it is a copy of s1
+ * @param nt    number of threads and of sub-ranges, must be >= 1
+ * @param comp  ordering used for the merge
+ */
+template <typename T>
+void parallel_merge(
+        const T* src,
+        T* dst,
+        SegmentS& s1,
+        SegmentS& s2,
+        int nt,
+        const ArgsortComparator& comp) {
+    if (s2.len() > s1.len()) { // make sure that s1 larger than s2
+        std::swap(s1, s2);
+    }
+    // compute sub-ranges for each thread
+    std::vector<SegmentS> s1s(nt), s2s(nt), sws(nt);
+    s2s[0].i0 = s2.i0;
+    s2s[nt - 1].i1 = s2.i1;
+    // not sure parallel actually helps here
+#pragma omp parallel for num_threads(nt)
+    for (int t = 0; t < nt; t++) {
+        s1s[t].i0 = s1.i0 + s1.len() * t / nt;
+        s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
+        if (t + 1 < nt) {
+            // boundary in s2 between what thread t and thread t + 1 handle
+            size_t i1 = s2.i0;
+            // the pivot only exists when s1 is not empty (an empty s1
+            // implies an empty s2, for which the boundary is s2.i0)
+            if (s1s[t].i1 < s1.i1) {
+                T pivot = src[s1s[t].i1];
+                // first entry of s2 that sorts strictly after the pivot.
+                // std::upper_bound also covers a pivot that sorts before
+                // the whole of s2: a bisection that never tests the first
+                // entry of s2 would always hand that entry to thread t
+                // and break the ordering of the output.
+                i1 = std::upper_bound(
+                             src + s2.i0, src + s2.i1, pivot, comp) -
+                        src;
+            }
+            s2s[t].i1 = s2s[t + 1].i0 = i1;
+        }
+    }
+    // from now on s1 and s2 both describe the merged range
+    s1.i0 = std::min(s1.i0, s2.i0);
+    s1.i1 = std::max(s1.i1, s2.i1);
+    s2 = s1;
+    // output sub-ranges: consecutive, each as long as its two inputs
+    sws[0].i0 = s1.i0;
+    for (int t = 0; t < nt; t++) {
+        sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
+        if (t + 1 < nt) {
+            sws[t + 1].i0 = sws[t].i1;
+        }
+    }
+    assert(sws[nt - 1].i1 == s1.i1);
+    // do the actual merging
+#pragma omp parallel for num_threads(nt)
+    for (int t = 0; t < nt; t++) {
+        SegmentS sw = sws[t];
+        SegmentS s1t = s1s[t];
+        SegmentS s2t = s2s[t];
+        if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
+            // classical two-way merge, stops as soon as one input is empty
+            for (;;) {
+                // assert (sw.len() == s1t.len() + s2t.len());
+                if (comp(src[s1t.i0], src[s2t.i0])) {
+                    dst[sw.i0++] = src[s1t.i0++];
+                    if (s1t.i0 == s1t.i1) {
+                        break;
+                    }
+                } else {
+                    dst[sw.i0++] = src[s2t.i0++];
+                    if (s2t.i0 == s2t.i1) {
+                        break;
+                    }
+                }
+            }
+        }
+        // copy what is left of the input that is not exhausted
+        if (s1t.len() > 0) {
+            assert(s1t.len() == sw.len());
+            memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
+        } else if (s2t.len() > 0) {
+            assert(s2t.len() == sw.len());
+            memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
+        }
+    }
+}
+}; // namespace
+// Single-threaded argsort: perm receives the indices 0..n-1, ordered with
+// std::sort by increasing vals[index].
+void fvec_argsort(size_t n, const float* vals, size_t* perm) {
+    size_t* const perm_end = perm + n;
+    // start from the identity permutation
+    size_t next_index = 0;
+    for (size_t* slot = perm; slot != perm_end; ++slot) {
+        *slot = next_index++;
+    }
+    const ArgsortComparator by_value = {vals};
+    std::sort(perm, perm_end, by_value);
+}
+/**
+ * Multi-threaded variant of fvec_argsort.
+ *
+ * The index range is cut in one slice per OpenMP thread and each slice is
+ * sorted independently. Neighbouring slices are then merged pairwise with
+ * parallel_merge, flipping between two tables, until one sorted run is left.
+ *
+ * @param n     number of values
+ * @param vals  values to order, size n
+ * @param perm  output, size n: the indices 0..n-1 by increasing vals[index]
+ */
+void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
+    // scratch table, owned by a vector so that it is released on every exit
+    // path (including an exception thrown by an allocation below)
+    std::vector<size_t> perm2_storage(n);
+    size_t* perm2 = perm2_storage.data();
+    // 2 result tables, during merging, flip between them
+    size_t *permB = perm2, *permA = perm;
+    int nt = omp_get_max_threads();
+    { // prepare correct permutation so that the result ends in perm
+      // at final iteration
+        int nseg = nt;
+        while (nseg > 1) {
+            nseg = (nseg + 1) / 2;
+            std::swap(permA, permB);
+        }
+    }
+    // identity permutation; "parallel for" shares the iterations between the
+    // threads (a bare "parallel" makes every thread run the whole loop)
+#pragma omp parallel for
+    for (int64_t i = 0; i < (int64_t)n; i++) {
+        permA[i] = i;
+    }
+    ArgsortComparator comp = {vals};
+    std::vector<SegmentS> segs(nt);
+    // independent sorts
+#pragma omp parallel for
+    for (int t = 0; t < nt; t++) {
+        size_t i0 = t * n / nt;
+        size_t i1 = (t + 1) * n / nt;
+        SegmentS seg = {i0, i1};
+        std::sort(permA + seg.i0, permA + seg.i1, comp);
+        segs[t] = seg;
+    }
+    // parallel_merge opens its own parallel regions inside the loop below
+    int prev_nested = omp_get_nested();
+    omp_set_nested(1);
+    int nseg = nt;
+    while (nseg > 1) {
+        int nseg1 = (nseg + 1) / 2; // number of segments after this round
+        // threads to share between the merges of this round (an isolated
+        // last segment keeps one thread for its copy)
+        int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
+        int sub_nseg1 = nseg / 2; // number of merges in this round
+#pragma omp parallel for num_threads(nseg1)
+        for (int s = 0; s < nseg; s += 2) {
+            if (s + 1 == nseg) { // otherwise isolated segment
+                memcpy(permB + segs[s].i0,
+                       permA + segs[s].i0,
+                       segs[s].len() * sizeof(size_t));
+            } else {
+                int t0 = s * sub_nt / sub_nseg1;
+                int t1 = (s + 1) * sub_nt / sub_nseg1;
+                parallel_merge(
+                        permA, permB, segs[s], segs[s + 1], t1 - t0, comp);
+            }
+        }
+        // parallel_merge left the merged range in segs[s]: compact the table
+        for (int s = 0; s < nseg; s += 2) {
+            segs[s / 2] = segs[s];
+        }
+        nseg = nseg1;
+        std::swap(permA, permB);
+    }
+    assert(permA == perm);
+    omp_set_nested(prev_nested);
+}
+/*****************************************************************************
+ * Bucket sort
+ ****************************************************************************/
+// extern symbol in the .h
+int bucket_sort_verbose = 0; // when nonzero, the bucket sort routines of this file printf timings / progress
+namespace {
+// Reference (single-threaded) bucket sort.
+// On return perm[lims[b]:lims[b + 1]] lists, in increasing order, the
+// indices i for which vals[i] == b. lims receives vmax + 1 entries and
+// perm receives nval entries. Throws when a value is >= vmax.
+void bucket_sort_ref(
+        size_t nval,
+        const uint64_t* vals,
+        uint64_t vmax,
+        int64_t* lims,
+        int64_t* perm) {
+    const double t_begin = getmillisecs();
+    // histogram, stored one slot to the right to prepare the prefix sum
+    memset(lims, 0, (vmax + 1) * sizeof(*lims));
+    for (size_t i = 0; i < nval; i++) {
+        FAISS_THROW_IF_NOT(vals[i] < vmax);
+        lims[vals[i] + 1] += 1;
+    }
+    const double t_counted = getmillisecs();
+    // prefix sum: lims[b] becomes the first output slot of bucket b
+    for (uint64_t b = 0; b != vmax; ++b) {
+        lims[b + 1] += lims[b];
+    }
+    FAISS_THROW_IF_NOT(lims[vmax] == nval);
+    const double t_summed = getmillisecs();
+    // scatter the indices, lims[b] serves as the write cursor of bucket b
+    for (size_t src = 0; src < nval; ++src) {
+        const int64_t slot = lims[vals[src]]++;
+        perm[slot] = src;
+    }
+    const double t_filled = getmillisecs();
+    // each cursor now sits on the start of the next bucket: shift the table
+    // by one entry to get the bucket starts back
+    memmove(lims + 1, lims, vmax * sizeof(*lims));
+    lims[0] = 0;
+    const double t_end = getmillisecs();
+    if (bucket_sort_verbose) {
+        printf("times %.3f %.3f %.3f %.3f\n",
+               t_counted - t_begin,
+               t_summed - t_counted,
+               t_filled - t_summed,
+               t_end - t_filled);
+    }
+}
+void bucket_sort_parallel( // OpenMP counterpart of bucket_sort_ref (same arguments plus a thread count)
+        size_t nval, // number of entries in vals
+        const uint64_t* vals, // bucket number of each entry
+        uint64_t vmax, // number of buckets, lims holds vmax + 1 entries
+        int64_t* lims, // out: bucket b ends up in perm[lims[b]:lims[b + 1]]
+        int64_t* perm, // out: entry indices grouped by bucket
+        int nt_in) { // number of threads requested for the parallel region
+    memset(lims, 0, sizeof(*lims) * (vmax + 1));
+#pragma omp parallel num_threads(nt_in)
+    {
+        int nt = omp_get_num_threads(); // might be different from nt_in
+        int rank = omp_get_thread_num();
+        std::vector<int64_t> local_lims(vmax + 1); // per-thread histogram, later per-thread write cursors
+        // range of indices handled by this thread
+        size_t i0 = nval * rank / nt;
+        size_t i1 = nval * (rank + 1) / nt;
+        // build histogram in local lims
+        double t0 = getmillisecs();
+        for (size_t i = i0; i < i1; i++) {
+            local_lims[vals[i]]++; // NOTE(review): no vals[i] < vmax check here, unlike bucket_sort_ref
+        }
+#pragma omp critical
+        { // accumulate histograms (not shifted indices to prepare cumsum)
+            for (size_t i = 0; i < vmax; i++) {
+                lims[i + 1] += local_lims[i];
+            }
+        }
+#pragma omp barrier
+        double t1 = getmillisecs();
+#pragma omp master
+        {
+            // compute cumulative sum
+            for (size_t i = 0; i < vmax; i++) {
+                lims[i + 1] += lims[i];
+            }
+            FAISS_THROW_IF_NOT(lims[vmax] == nval);
+        }
+#pragma omp barrier
+#pragma omp critical
+        { // current thread grabs a slot in the buckets
+            for (size_t i = 0; i < vmax; i++) {
+                size_t nv = local_lims[i]; // number of entries this thread owns in bucket i
+                local_lims[i] = lims[i]; // where we should start writing
+                lims[i] += nv; // reserve the slots, the next thread starts after them
+            }
+        }
+        double t2 = getmillisecs();
+#pragma omp barrier
+        { // populate buckets, this is the slowest operation
+            for (size_t i = i0; i < i1; i++) {
+                perm[local_lims[vals[i]]++] = i;
+            }
+        }
+#pragma omp barrier
+        double t3 = getmillisecs();
+#pragma omp master
+        { // shift back lims
+            for (size_t i = vmax; i > 0; i--) {
+                lims[i] = lims[i - 1]; // lims[i - 1] holds the end of bucket i - 1, i.e. the start of bucket i
+            }
+            lims[0] = 0;
+            double t4 = getmillisecs();
+            if (bucket_sort_verbose) {
+                printf("times %.3f %.3f %.3f %.3f\n",
+                       t1 - t0,
+                       t2 - t1,
+                       t3 - t2,
+                       t4 - t3);
+            }
+        }
+    }
+}
+/***********************************************
+ * in-place bucket sort
+ */
+template <class TI> // NOTE(review): relies on TI being signed (-1 marker, row >= 0 test)
+void bucket_sort_inplace_ref( // single-threaded in-place bucket sort of a matrix of bucket numbers
+        size_t nrow, // only used to compute nval = nrow * ncol
+        size_t ncol, // the row of flat index idx is idx / ncol
+        TI* vals, // in: nrow * ncol bucket numbers; out: row numbers grouped by bucket
+        TI nbucket, // number of buckets, every input value must be < nbucket
+        int64_t* lims) { // out: nbucket + 1 offsets, bucket b occupies vals[lims[b]:lims[b + 1]]
+    double t0 = getmillisecs();
+    size_t nval = nrow * ncol;
+    FAISS_THROW_IF_NOT(
+            nbucket < nval); // unclear what would happen in this case...
+    memset(lims, 0, sizeof(*lims) * (nbucket + 1));
+    for (size_t i = 0; i < nval; i++) {
+        FAISS_THROW_IF_NOT(vals[i] < nbucket);
+        lims[vals[i] + 1]++; // histogram shifted by one slot to prepare the cumulative sum
+    }
+    double t1 = getmillisecs();
+    // compute cumulative sum
+    for (size_t i = 0; i < nbucket; i++) {
+        lims[i + 1] += lims[i];
+    }
+    FAISS_THROW_IF_NOT(lims[nbucket] == nval);
+    double t2 = getmillisecs();
+    std::vector<size_t> ptrs(nbucket); // next slot to write in each bucket
+    for (size_t i = 0; i < nbucket; i++) {
+        ptrs[i] = lims[i];
+    }
+    // find loops in the permutation and follow them
+    TI row = -1; // row number to store next; -1 when a new loop is being started
+    TI init_bucket_no = 0, bucket_no = 0;
+    for (;;) {
+        size_t idx = ptrs[bucket_no];
+        if (row >= 0) {
+            ptrs[bucket_no] += 1; // the slot receives a real row number: consume it
+        }
+        assert(idx < lims[bucket_no + 1]);
+        TI next_bucket_no = vals[idx]; // bucket of the entry about to be overwritten
+        vals[idx] = row; // stores -1 when starting a loop: the slot stays open until the loop closes
+        if (next_bucket_no != -1) {
+            row = idx / ncol; // the displaced entry came from this row
+            bucket_no = next_bucket_no;
+        } else {
+            // start new loop
+            for (; init_bucket_no < nbucket; init_bucket_no++) {
+                if (ptrs[init_bucket_no] < lims[init_bucket_no + 1]) {
+                    break; // this bucket still has unwritten slots
+                }
+            }
+            if (init_bucket_no == nbucket) { // we're done
+                break;
+            }
+            bucket_no = init_bucket_no;
+            row = -1;
+        }
+    }
+    for (size_t i = 0; i < nbucket; i++) {
+        assert(ptrs[i] == lims[i + 1]); // every bucket must be completely filled
+    }
+    double t3 = getmillisecs();
+    if (bucket_sort_verbose) {
+        printf("times %.3f %.3f %.3f\n", t1 - t0, t2 - t1, t3 - t2);
+    }
+}
+// collects row numbers to write into buckets
+// Buffer of pending writes: (row number, destination bucket) pairs are
+// collected with add(), then grouped per bucket by bucket_sort().
+template <class TI>
+struct ToWrite {
+    TI nbucket;               // number of destination buckets
+    std::vector<TI> buckets;  // destination bucket of each pending entry
+    std::vector<TI> rows;     // row number of each pending entry
+    std::vector<size_t> lims; // after bucket_sort(): bucket b owns
+                              // rows[lims[b]:lims[b + 1]]
+    explicit ToWrite(TI nbucket) : nbucket(nbucket), lims(nbucket + 1) {}
+    /// register row as an element to write in bucket b
+    void add(TI row, TI b) {
+        assert(b >= 0 && b < nbucket);
+        rows.push_back(row);
+        buckets.push_back(b);
+    }
+    /// reorder rows so that they are grouped by bucket, fill lims with the
+    /// group boundaries and empty the buckets table
+    void bucket_sort() {
+        FAISS_THROW_IF_NOT(buckets.size() == rows.size());
+        // histogram of the destination buckets, shifted by one slot
+        const size_t nlims = nbucket + 1;
+        lims.assign(nlims, size_t(0));
+        for (size_t i = 0; i < buckets.size(); i++) {
+            assert(buckets[i] >= 0 && buckets[i] < nbucket);
+            lims[buckets[i] + 1] += 1;
+        }
+        // prefix sum turns the counts into group boundaries
+        for (size_t k = 0; k < nbucket; k++) {
+            lims[k + 1] += lims[k];
+        }
+        FAISS_THROW_IF_NOT(lims[nbucket] == buckets.size());
+        // could also do a circular perm...
+        std::vector<size_t> ptrs(lims); // write cursor of each group
+        std::vector<TI> new_rows(rows.size());
+        for (size_t i = 0; i < buckets.size(); i++) {
+            TI b = buckets[i];
+            assert(ptrs[b] < lims[b + 1]);
+            const size_t dst = ptrs[b]++;
+            new_rows[dst] = rows[i];
+        }
+        buckets.clear();
+        rows.swap(new_rows);
+    }
+    /// exchange the contents with another buffer for the same bucket count
+    void swap(ToWrite& other) {
+        assert(nbucket == other.nbucket);
+        std::swap(buckets, other.buckets);
+        std::swap(rows, other.rows);
+        std::swap(lims, other.lims);
+    }
+};
+template <class TI> // NOTE(review): relies on TI being signed (-1 marker, vals[idx] >= 0 test)
+void bucket_sort_inplace_parallel( // multi-threaded counterpart of bucket_sort_inplace_ref
+        size_t nrow, // only used to compute nval = nrow * ncol
+        size_t ncol, // the row of flat index idx is idx / ncol
+        TI* vals, // in: nrow * ncol bucket numbers; overwritten in place with row numbers
+        TI nbucket, // number of buckets
+        int64_t* lims, // out: nbucket + 1 offsets, bucket b occupies vals[lims[b]:lims[b + 1]]
+        int nt_in) { // number of threads requested for the parallel region
+    int verbose = bucket_sort_verbose;
+    memset(lims, 0, sizeof(*lims) * (nbucket + 1));
+    std::vector<ToWrite<TI>> all_to_write; // one pending-write buffer per thread, indexed by rank
+    size_t nval = nrow * ncol;
+    FAISS_THROW_IF_NOT(
+            nbucket < nval); // unclear what would happen in this case...
+    // try to keep size of all_to_write < 5GiB
+    // but we need at least one element per bucket
+    size_t init_to_write = std::max(
+            size_t(nbucket),
+            std::min(nval / 10, ((size_t)5 << 30) / (sizeof(TI) * 3 * nt_in)));
+    if (verbose > 0) {
+        printf("init_to_write=%zd\n", init_to_write);
+    }
+    std::vector<size_t> ptrs(nbucket); // ptrs is shared across all threads
+    std::vector<char> did_wrap(
+            nbucket); // DON'T use std::vector<bool> that cannot be accessed
+                      // safely from multiple threads!!!
+#pragma omp parallel num_threads(nt_in)
+    {
+        int nt = omp_get_num_threads(); // might be different from nt_in (?)
+        int rank = omp_get_thread_num();
+        std::vector<int64_t> local_lims(nbucket + 1); // per-thread histogram
+        // range of indices handled by this thread
+        size_t i0 = nval * rank / nt;
+        size_t i1 = nval * (rank + 1) / nt;
+        // build histogram in local lims
+        for (size_t i = i0; i < i1; i++) {
+            local_lims[vals[i]]++; // NOTE(review): no vals[i] < nbucket check here, unlike bucket_sort_inplace_ref
+        }
+#pragma omp critical
+        { // accumulate histograms (not shifted indices to prepare cumsum)
+            for (size_t i = 0; i < nbucket; i++) {
+                lims[i + 1] += local_lims[i];
+            }
+            all_to_write.push_back(ToWrite<TI>(nbucket)); // each thread contributes one (still empty) buffer
+        }
+#pragma omp barrier
+        // this thread's things to write
+        ToWrite<TI>& to_write = all_to_write[rank];
+#pragma omp master
+        {
+            // compute cumulative sum
+            for (size_t i = 0; i < nbucket; i++) {
+                lims[i + 1] += lims[i];
+            }
+            FAISS_THROW_IF_NOT(lims[nbucket] == nval);
+            // at this point lims is final (read only!)
+            memcpy(ptrs.data(), lims, sizeof(lims[0]) * nbucket);
+            // initial values to write (we write -1s to get the process running)
+            // make sure at least one element per bucket
+            size_t written = 0;
+            for (TI b = 0; b < nbucket; b++) {
+                size_t l0 = lims[b], l1 = lims[b + 1];
+                size_t target_to_write = l1 * init_to_write / nval; // cumulative number of -1s wanted once bucket b is handled
+                do {
+                    if (l0 == l1) {
+                        break; // bucket exhausted, nothing more to seed in it
+                    }
+                    to_write.add(-1, b);
+                    l0++;
+                    written++;
+                } while (written < target_to_write);
+            }
+            to_write.bucket_sort();
+        }
+        // this thread writes only buckets b0:b1
+        size_t b0 = (rank * nbucket + nt - 1) / nt;
+        size_t b1 = ((rank + 1) * nbucket + nt - 1) / nt;
+        // in this loop, we write elements collected in the previous round
+        // and collect the elements that are overwritten for the next round
+        size_t tot_written = 0;
+        int round = 0;
+        for (;;) {
+#pragma omp barrier
+            size_t n_to_write = 0;
+            for (const ToWrite<TI>& to_write_2 : all_to_write) {
+                n_to_write += to_write_2.lims.back(); // last boundary = number of pending entries of that buffer
+            }
+            tot_written += n_to_write;
+            // assert(tot_written <= nval);
+#pragma omp master
+            {
+                if (verbose >= 1) {
+                    printf("ROUND %d n_to_write=%zd\n", round, n_to_write);
+                }
+                if (verbose > 2) {
+                    for (size_t b = 0; b < nbucket; b++) {
+                        printf("   b=%zd [", b);
+                        for (size_t i = lims[b]; i < lims[b + 1]; i++) {
+                            printf(" %s%d",
+                                   ptrs[b] == i ? ">" : "",
+                                   int(vals[i]));
+                        }
+                        printf(" %s] %s\n",
+                               ptrs[b] == lims[b + 1] ? ">" : "",
+                               did_wrap[b] ? "w" : "");
+                    }
+                    printf("To write\n");
+                    for (size_t b = 0; b < nbucket; b++) {
+                        printf("   b=%zd ", b);
+                        const char* sep = "[";
+                        for (const ToWrite<TI>& to_write_2 : all_to_write) {
+                            printf("%s", sep);
+                            sep = " |";
+                            size_t l0 = to_write_2.lims[b];
+                            size_t l1 = to_write_2.lims[b + 1];
+                            for (size_t i = l0; i < l1; i++) {
+                                printf(" %d", int(to_write_2.rows[i]));
+                            }
+                        }
+                        printf(" ]\n");
+                    }
+                }
+            }
+            if (n_to_write == 0) {
+                break; // every thread computed the same sum, so they all leave in the same round
+            }
+            round++;
+#pragma omp barrier
+            ToWrite<TI> next_to_write(nbucket); // entries displaced by this thread during this round
+            for (size_t b = b0; b < b1; b++) {
+                for (const ToWrite<TI>& to_write_2 : all_to_write) {
+                    size_t l0 = to_write_2.lims[b];
+                    size_t l1 = to_write_2.lims[b + 1];
+                    for (size_t i = l0; i < l1; i++) {
+                        TI row = to_write_2.rows[i];
+                        size_t idx = ptrs[b]; // only this thread handles bucket b, see b0:b1 above
+                        if (verbose > 2) {
+                            printf("    bucket %d (rank %d) idx %zd\n",
+                                   int(row),
+                                   rank,
+                                   idx);
+                        }
+                        if (idx < lims[b + 1]) {
+                            ptrs[b]++;
+                        } else {
+                            // wrapping around
+                            assert(!did_wrap[b]);
+                            did_wrap[b] = true;
+                            idx = lims[b];
+                            ptrs[b] = idx + 1;
+                        }
+                        // check if we need to remember the overwritten number
+                        if (vals[idx] >= 0) {
+                            TI new_row = idx / ncol; // row the displaced entry came from
+                            next_to_write.add(new_row, vals[idx]);
+                            if (verbose > 2) {
+                                printf("       new_row=%d\n", int(new_row));
+                            }
+                        } else {
+                            assert(did_wrap[b]);
+                        }
+                        vals[idx] = row;
+                    }
+                }
+            }
+            next_to_write.bucket_sort();
+#pragma omp barrier
+            all_to_write[rank].swap(next_to_write); // becomes this thread's input buffer for the next round
+        }
+    }
+}
+} // anonymous namespace
+// Public entry point of the bucket sort: nt == 0 selects the
+// single-threaded reference code, any other value is forwarded as the
+// thread count of the OpenMP implementation.
+void bucket_sort(
+        size_t nval,
+        const uint64_t* vals,
+        uint64_t vmax,
+        int64_t* lims,
+        int64_t* perm,
+        int nt) {
+    if (nt != 0) {
+        bucket_sort_parallel(nval, vals, vmax, lims, perm, nt);
+        return;
+    }
+    bucket_sort_ref(nval, vals, vmax, lims, perm);
+}
+// In-place bucket sort of a matrix of 32-bit bucket numbers: nt == 0 runs
+// the single-threaded reference code, any other value is forwarded as the
+// thread count of the OpenMP implementation.
+void matrix_bucket_sort_inplace(
+        size_t nrow,
+        size_t ncol,
+        int32_t* vals,
+        int32_t vmax,
+        int64_t* lims,
+        int nt) {
+    if (nt != 0) {
+        bucket_sort_inplace_parallel(nrow, ncol, vals, vmax, lims, nt);
+        return;
+    }
+    bucket_sort_inplace_ref(nrow, ncol, vals, vmax, lims);
+}
+// In-place bucket sort of a matrix of 64-bit bucket numbers: nt == 0 runs
+// the single-threaded reference code, any other value is forwarded as the
+// thread count of the OpenMP implementation.
+void matrix_bucket_sort_inplace(
+        size_t nrow,
+        size_t ncol,
+        int64_t* vals,
+        int64_t vmax,
+        int64_t* lims,
+        int nt) {
+    if (nt != 0) {
+        bucket_sort_inplace_parallel(nrow, ncol, vals, vmax, lims, nt);
+        return;
+    }
+    bucket_sort_inplace_ref(nrow, ncol, vals, vmax, lims);
+}
+} // namespace faiss