RubyGems - faiss - Versions diffs - 0.1.0 → 0.1.1 - Mend

faiss 0.1.0 → 0.1.1

Files changed (226) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +103 -3
data/ext/faiss/ext.cpp +99 -32
data/ext/faiss/extconf.rb +12 -2
data/lib/faiss/ext.bundle +0 -0
data/lib/faiss/index.rb +3 -3
data/lib/faiss/index_binary.rb +3 -3
data/lib/faiss/kmeans.rb +1 -1
data/lib/faiss/pca_matrix.rb +2 -2
data/lib/faiss/product_quantizer.rb +3 -3
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/AutoTune.cpp +719 -0
data/vendor/faiss/AutoTune.h +212 -0
data/vendor/faiss/Clustering.cpp +261 -0
data/vendor/faiss/Clustering.h +101 -0
data/vendor/faiss/IVFlib.cpp +339 -0
data/vendor/faiss/IVFlib.h +132 -0
data/vendor/faiss/Index.cpp +171 -0
data/vendor/faiss/Index.h +261 -0
data/vendor/faiss/Index2Layer.cpp +437 -0
data/vendor/faiss/Index2Layer.h +85 -0
data/vendor/faiss/IndexBinary.cpp +77 -0
data/vendor/faiss/IndexBinary.h +163 -0
data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
data/vendor/faiss/IndexBinaryFlat.h +54 -0
data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
data/vendor/faiss/IndexBinaryHNSW.h +56 -0
data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
data/vendor/faiss/IndexBinaryIVF.h +211 -0
data/vendor/faiss/IndexFlat.cpp +508 -0
data/vendor/faiss/IndexFlat.h +175 -0
data/vendor/faiss/IndexHNSW.cpp +1090 -0
data/vendor/faiss/IndexHNSW.h +170 -0
data/vendor/faiss/IndexIVF.cpp +909 -0
data/vendor/faiss/IndexIVF.h +353 -0
data/vendor/faiss/IndexIVFFlat.cpp +502 -0
data/vendor/faiss/IndexIVFFlat.h +118 -0
data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
data/vendor/faiss/IndexIVFPQ.h +161 -0
data/vendor/faiss/IndexIVFPQR.cpp +219 -0
data/vendor/faiss/IndexIVFPQR.h +65 -0
data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
data/vendor/faiss/IndexLSH.cpp +225 -0
data/vendor/faiss/IndexLSH.h +87 -0
data/vendor/faiss/IndexLattice.cpp +143 -0
data/vendor/faiss/IndexLattice.h +68 -0
data/vendor/faiss/IndexPQ.cpp +1188 -0
data/vendor/faiss/IndexPQ.h +199 -0
data/vendor/faiss/IndexPreTransform.cpp +288 -0
data/vendor/faiss/IndexPreTransform.h +91 -0
data/vendor/faiss/IndexReplicas.cpp +123 -0
data/vendor/faiss/IndexReplicas.h +76 -0
data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
data/vendor/faiss/IndexScalarQuantizer.h +127 -0
data/vendor/faiss/IndexShards.cpp +317 -0
data/vendor/faiss/IndexShards.h +100 -0
data/vendor/faiss/InvertedLists.cpp +623 -0
data/vendor/faiss/InvertedLists.h +334 -0
data/vendor/faiss/LICENSE +21 -0
data/vendor/faiss/MatrixStats.cpp +252 -0
data/vendor/faiss/MatrixStats.h +62 -0
data/vendor/faiss/MetaIndexes.cpp +351 -0
data/vendor/faiss/MetaIndexes.h +126 -0
data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
data/vendor/faiss/OnDiskInvertedLists.h +127 -0
data/vendor/faiss/VectorTransform.cpp +1157 -0
data/vendor/faiss/VectorTransform.h +322 -0
data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
data/vendor/faiss/c_api/AutoTune_c.h +64 -0
data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
data/vendor/faiss/c_api/Clustering_c.h +117 -0
data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
data/vendor/faiss/c_api/IndexShards_c.h +42 -0
data/vendor/faiss/c_api/Index_c.cpp +105 -0
data/vendor/faiss/c_api/Index_c.h +183 -0
data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
data/vendor/faiss/c_api/clone_index_c.h +32 -0
data/vendor/faiss/c_api/error_c.h +42 -0
data/vendor/faiss/c_api/error_impl.cpp +27 -0
data/vendor/faiss/c_api/error_impl.h +16 -0
data/vendor/faiss/c_api/faiss_c.h +58 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
data/vendor/faiss/c_api/index_factory_c.h +30 -0
data/vendor/faiss/c_api/index_io_c.cpp +42 -0
data/vendor/faiss/c_api/index_io_c.h +50 -0
data/vendor/faiss/c_api/macros_impl.h +110 -0
data/vendor/faiss/clone_index.cpp +147 -0
data/vendor/faiss/clone_index.h +38 -0
data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
data/vendor/faiss/gpu/GpuCloner.h +82 -0
data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
data/vendor/faiss/gpu/GpuDistance.h +52 -0
data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
data/vendor/faiss/gpu/GpuIndex.h +148 -0
data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
data/vendor/faiss/gpu/GpuResources.cpp +52 -0
data/vendor/faiss/gpu/GpuResources.h +73 -0
data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
data/vendor/faiss/gpu/test/TestUtils.h +93 -0
data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
data/vendor/faiss/gpu/utils/Timer.h +52 -0
data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
data/vendor/faiss/impl/FaissAssert.h +95 -0
data/vendor/faiss/impl/FaissException.cpp +66 -0
data/vendor/faiss/impl/FaissException.h +71 -0
data/vendor/faiss/impl/HNSW.cpp +818 -0
data/vendor/faiss/impl/HNSW.h +275 -0
data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
data/vendor/faiss/impl/PolysemousTraining.h +158 -0
data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
data/vendor/faiss/impl/ProductQuantizer.h +242 -0
data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
data/vendor/faiss/impl/ThreadedIndex.h +80 -0
data/vendor/faiss/impl/index_read.cpp +793 -0
data/vendor/faiss/impl/index_write.cpp +558 -0
data/vendor/faiss/impl/io.cpp +142 -0
data/vendor/faiss/impl/io.h +98 -0
data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
data/vendor/faiss/impl/lattice_Zn.h +199 -0
data/vendor/faiss/index_factory.cpp +392 -0
data/vendor/faiss/index_factory.h +25 -0
data/vendor/faiss/index_io.h +75 -0
data/vendor/faiss/misc/test_blas.cpp +84 -0
data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
data/vendor/faiss/tests/test_merge.cpp +258 -0
data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
data/vendor/faiss/tests/test_params_override.cpp +231 -0
data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
data/vendor/faiss/utils/Heap.cpp +122 -0
data/vendor/faiss/utils/Heap.h +495 -0
data/vendor/faiss/utils/WorkerThread.cpp +126 -0
data/vendor/faiss/utils/WorkerThread.h +61 -0
data/vendor/faiss/utils/distances.cpp +765 -0
data/vendor/faiss/utils/distances.h +243 -0
data/vendor/faiss/utils/distances_simd.cpp +809 -0
data/vendor/faiss/utils/extra_distances.cpp +336 -0
data/vendor/faiss/utils/extra_distances.h +54 -0
data/vendor/faiss/utils/hamming-inl.h +472 -0
data/vendor/faiss/utils/hamming.cpp +792 -0
data/vendor/faiss/utils/hamming.h +220 -0
data/vendor/faiss/utils/random.cpp +192 -0
data/vendor/faiss/utils/random.h +60 -0
data/vendor/faiss/utils/utils.cpp +783 -0
data/vendor/faiss/utils/utils.h +181 -0
metadata +216 -2

data/vendor/faiss/utils/utils.cpp ADDED Viewed

@@ -0,0 +1,783 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#include <faiss/utils/utils.h>
+#include <cstdio>
+#include <cassert>
+#include <cstring>
+#include <cmath>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <omp.h>
+#include <algorithm>
+#include <vector>
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/utils/random.h>
+#ifndef FINTEGER
+#define FINTEGER long
+#endif
+extern "C" {
+/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
+int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
+            n, FINTEGER *k, const float *alpha, const float *a,
+            FINTEGER *lda, const float *b, FINTEGER *
+            ldb, float *beta, float *c, FINTEGER *ldc);
+/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
+int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
+                 float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
+int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
+            FINTEGER *lda, float *tau, float *work,
+            FINTEGER *lwork, FINTEGER *info);
+int sgemv_(const char *trans, FINTEGER *m, FINTEGER *n, float *alpha,
+           const float *a, FINTEGER *lda, const float *x, FINTEGER *incx,
+           float *beta, float *y, FINTEGER *incy);
+}
+/**************************************************
+ * Get some stats about the system
+ **************************************************/
+namespace faiss {
+double getmillisecs () {
+    struct timeval tv;
+    gettimeofday (&tv, nullptr);
+    return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
+}
+uint64_t get_cycles () {
+#ifdef  __x86_64__
+    uint32_t high, low;
+    asm volatile("rdtsc \n\t"
+                 : "=a" (low),
+                   "=d" (high));
+    return ((uint64_t)high << 32) | (low);
+#else
+    return 0;
+#endif
+}
+#ifdef __linux__
+size_t get_mem_usage_kb ()
+{
+    int pid = getpid ();
+    char fname[256];
+    snprintf (fname, 256, "/proc/%d/status", pid);
+    FILE * f = fopen (fname, "r");
+    FAISS_THROW_IF_NOT_MSG (f, "cannot open proc status file");
+    size_t sz = 0;
+    for (;;) {
+        char buf [256];
+        if (!fgets (buf, 256, f)) break;
+        if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
+    }
+    fclose (f);
+    return sz;
+}
+#elif __APPLE__
+size_t get_mem_usage_kb ()
+{
+    fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
+    return 0;
+}
+#endif
+void reflection (const float * __restrict u,
+                 float * __restrict x,
+                 size_t n, size_t d, size_t nu)
+{
+    size_t i, j, l;
+    for (i = 0; i < n; i++) {
+        const float * up = u;
+        for (l = 0; l < nu; l++) {
+            float ip1 = 0, ip2 = 0;
+            for (j = 0; j < d; j+=2) {
+                ip1 += up[j] * x[j];
+                ip2 += up[j+1] * x[j+1];
+            }
+            float ip = 2 * (ip1 + ip2);
+            for (j = 0; j < d; j++)
+                x[j] -= ip * up[j];
+            up += d;
+        }
+        x += d;
+    }
+}
+/* Reference implementation (slower) */
+void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
+{
+    size_t i, j, l;
+    for (i = 0; i < n; i++) {
+        const float * up = u;
+        for (l = 0; l < nu; l++) {
+            double ip = 0;
+            for (j = 0; j < d; j++)
+                ip += up[j] * x[j];
+            ip *= 2;
+            for (j = 0; j < d; j++)
+                x[j] -= ip * up[j];
+            up += d;
+        }
+        x += d;
+    }
+}
+/***************************************************************************
+ * Some matrix manipulation functions
+ ***************************************************************************/
+/* This function exists because the Torch counterpart is extremly slow
+   (not multi-threaded + unexpected overhead even in single thread).
+   It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y>  */
+void inner_product_to_L2sqr (float * __restrict dis,
+                             const float * nr1,
+                             const float * nr2,
+                             size_t n1, size_t n2)
+{
+#pragma omp parallel for
+    for (size_t j = 0 ; j < n1 ; j++) {
+        float * disj = dis + j * n2;
+        for (size_t i = 0 ; i < n2 ; i++)
+            disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
+    }
+}
+void matrix_qr (int m, int n, float *a)
+{
+    FAISS_THROW_IF_NOT (m >= n);
+    FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
+    std::vector<float> tau (ki);
+    FINTEGER lwork = -1, info;
+    float work_size;
+    sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
+             &work_size, &lwork, &info);
+    lwork = size_t(work_size);
+    std::vector<float> work (lwork);
+    sgeqrf_ (&mi, &ni, a, &mi,
+             tau.data(), work.data(), &lwork, &info);
+    sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
+             work.data(), &lwork, &info);
+}
+/***************************************************************************
+ * Kmeans subroutine
+ ***************************************************************************/
+// a bit above machine epsilon for float16
+#define EPS (1 / 1024.)
+/* For k-means, compute centroids given assignment of vectors to centroids */
+int km_update_centroids (const float * x,
+                         float * centroids,
+                         int64_t * assign,
+                         size_t d, size_t k, size_t n,
+                         size_t k_frozen)
+{
+    k -= k_frozen;
+    centroids += k_frozen * d;
+    std::vector<size_t> hassign(k);
+    memset (centroids, 0, sizeof(*centroids) * d * k);
+#pragma omp parallel
+    {
+        int nt = omp_get_num_threads();
+        int rank = omp_get_thread_num();
+        // this thread is taking care of centroids c0:c1
+        size_t c0 = (k * rank) / nt;
+        size_t c1 = (k * (rank + 1)) / nt;
+        const float *xi = x;
+        size_t nacc = 0;
+        for (size_t i = 0; i < n; i++) {
+            int64_t ci = assign[i];
+            assert (ci >= 0 && ci < k + k_frozen);
+            ci -= k_frozen;
+            if (ci >= c0 && ci < c1)  {
+                float * c = centroids + ci * d;
+                hassign[ci]++;
+                for (size_t j = 0; j < d; j++)
+                    c[j] += xi[j];
+                nacc++;
+            }
+            xi += d;
+        }
+    }
+#pragma omp parallel for
+    for (size_t ci = 0; ci < k; ci++) {
+        float * c = centroids + ci * d;
+        float ni = (float) hassign[ci];
+        if (ni != 0) {
+            for (size_t j = 0; j < d; j++)
+                c[j] /= ni;
+        }
+    }
+    /* Take care of void clusters */
+    size_t nsplit = 0;
+    RandomGenerator rng (1234);
+    for (size_t ci = 0; ci < k; ci++) {
+        if (hassign[ci] == 0) { /* need to redefine a centroid */
+            size_t cj;
+            for (cj = 0; 1; cj = (cj + 1) % k) {
+                /* probability to pick this cluster for split */
+                float p = (hassign[cj] - 1.0) / (float) (n - k);
+                float r = rng.rand_float ();
+                if (r < p) {
+                    break; /* found our cluster to be split */
+                }
+            }
+            memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);
+            /* small symmetric pertubation. Much better than  */
+            for (size_t j = 0; j < d; j++) {
+                if (j % 2 == 0) {
+                    centroids[ci * d + j] *= 1 + EPS;
+                    centroids[cj * d + j] *= 1 - EPS;
+                } else {
+                    centroids[ci * d + j] *= 1 - EPS;
+                    centroids[cj * d + j] *= 1 + EPS;
+                }
+            }
+            /* assume even split of the cluster */
+            hassign[ci] = hassign[cj] / 2;
+            hassign[cj] -= hassign[ci];
+            nsplit++;
+        }
+    }
+    return nsplit;
+}
+#undef EPS
+/***************************************************************************
+ * Result list routines
+ ***************************************************************************/
+void ranklist_handle_ties (int k, int64_t *idx, const float *dis)
+{
+    float prev_dis = -1e38;
+    int prev_i = -1;
+    for (int i = 0; i < k; i++) {
+        if (dis[i] != prev_dis) {
+            if (i > prev_i + 1) {
+                // sort between prev_i and i - 1
+                std::sort (idx + prev_i, idx + i);
+            }
+            prev_i = i;
+            prev_dis = dis[i];
+        }
+    }
+}
+size_t merge_result_table_with (size_t n, size_t k,
+                                int64_t *I0, float *D0,
+                                const int64_t *I1, const float *D1,
+                                bool keep_min,
+                                int64_t translation)
+{
+    size_t n1 = 0;
+#pragma omp parallel reduction(+:n1)
+    {
+        std::vector<int64_t> tmpI (k);
+        std::vector<float> tmpD (k);
+#pragma omp for
+        for (size_t i = 0; i < n; i++) {
+            int64_t *lI0 = I0 + i * k;
+            float *lD0 = D0 + i * k;
+            const int64_t *lI1 = I1 + i * k;
+            const float *lD1 = D1 + i * k;
+            size_t r0 = 0;
+            size_t r1 = 0;
+            if (keep_min) {
+                for (size_t j = 0; j < k; j++) {
+                    if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
+                        tmpD[j] = lD0[r0];
+                        tmpI[j] = lI0[r0];
+                        r0++;
+                    } else if (lD1[r1] >= 0) {
+                        tmpD[j] = lD1[r1];
+                        tmpI[j] = lI1[r1] + translation;
+                        r1++;
+                    } else { // both are NaNs
+                        tmpD[j] = NAN;
+                        tmpI[j] = -1;
+                    }
+                }
+            } else {
+                for (size_t j = 0; j < k; j++) {
+                    if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
+                        tmpD[j] = lD0[r0];
+                        tmpI[j] = lI0[r0];
+                        r0++;
+                    } else if (lD1[r1] >= 0) {
+                        tmpD[j] = lD1[r1];
+                        tmpI[j] = lI1[r1] + translation;
+                        r1++;
+                    } else { // both are NaNs
+                        tmpD[j] = NAN;
+                        tmpI[j] = -1;
+                    }
+                }
+            }
+            n1 += r1;
+            memcpy (lD0, tmpD.data(), sizeof (lD0[0]) * k);
+            memcpy (lI0, tmpI.data(), sizeof (lI0[0]) * k);
+        }
+    }
+    return n1;
+}
+size_t ranklist_intersection_size (size_t k1, const int64_t *v1,
+                                   size_t k2, const int64_t *v2_in)
+{
+    if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
+    int64_t *v2 = new int64_t [k2];
+    memcpy (v2, v2_in, sizeof (int64_t) * k2);
+    std::sort (v2, v2 + k2);
+    { // de-dup v2
+        int64_t prev = -1;
+        size_t wp = 0;
+        for (size_t i = 0; i < k2; i++) {
+            if (v2 [i] != prev) {
+                v2[wp++] = prev = v2 [i];
+            }
+        }
+        k2 = wp;
+    }
+    const int64_t seen_flag = 1L << 60;
+    size_t count = 0;
+    for (size_t i = 0; i < k1; i++) {
+        int64_t q = v1 [i];
+        size_t i0 = 0, i1 = k2;
+        while (i0 + 1 < i1) {
+            size_t imed = (i1 + i0) / 2;
+            int64_t piv = v2 [imed] & ~seen_flag;
+            if (piv <= q) i0 = imed;
+            else          i1 = imed;
+        }
+        if (v2 [i0] == q) {
+            count++;
+            v2 [i0] |= seen_flag;
+        }
+    }
+    delete [] v2;
+    return count;
+}
+double imbalance_factor (int k, const int *hist) {
+    double tot = 0, uf = 0;
+    for (int i = 0 ; i < k ; i++) {
+        tot += hist[i];
+        uf += hist[i] * (double) hist[i];
+    }
+    uf = uf * k / (tot * tot);
+    return uf;
+}
+double imbalance_factor (int n, int k, const int64_t *assign) {
+    std::vector<int> hist(k, 0);
+    for (int i = 0; i < n; i++) {
+        hist[assign[i]]++;
+    }
+    return imbalance_factor (k, hist.data());
+}
+int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
+    memset (hist, 0, sizeof(hist[0]) * vmax);
+    int nout = 0;
+    while (n--) {
+        if (v[n] < 0 || v[n] >= vmax) nout++;
+        else hist[v[n]]++;
+    }
+    return nout;
+}
+void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
+{
+    FAISS_THROW_IF_NOT (nbits % 8 == 0);
+    size_t d = nbits / 8;
+    std::vector<int> accu(d * 256);
+    const uint8_t *c = codes;
+    for (size_t i = 0; i < n; i++)
+        for(int j = 0; j < d; j++)
+            accu[j * 256 + *c++]++;
+    memset (hist, 0, sizeof(*hist) * nbits);
+    for (int i = 0; i < d; i++) {
+        const int *ai = accu.data() + i * 256;
+        int * hi = hist + i * 8;
+        for (int j = 0; j < 256; j++)
+            for (int k = 0; k < 8; k++)
+                if ((j >> k) & 1)
+                    hi[k] += ai[j];
+    }
+}
+size_t ivec_checksum (size_t n, const int *a)
+{
+    size_t cs = 112909;
+    while (n--) cs = cs * 65713 + a[n] * 1686049;
+    return cs;
+}
+namespace {
+    struct ArgsortComparator {
+        const float *vals;
+        bool operator() (const size_t a, const size_t b) const {
+            return vals[a] < vals[b];
+        }
+    };
+    struct SegmentS {
+        size_t i0; // begin pointer in the permutation array
+        size_t i1; // end
+        size_t len() const {
+            return i1 - i0;
+        }
+    };
+    // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
+    // extended to > 1 merge thread
+    // merges 2 ranges that should be consecutive on the source into
+    // the union of the two on the destination
+    template<typename T>
+    void parallel_merge (const T *src, T *dst,
+                         SegmentS &s1, SegmentS & s2, int nt,
+                         const ArgsortComparator & comp) {
+        if (s2.len() > s1.len()) { // make sure that s1 larger than s2
+            std::swap(s1, s2);
+        }
+        // compute sub-ranges for each thread
+        SegmentS s1s[nt], s2s[nt], sws[nt];
+        s2s[0].i0 = s2.i0;
+        s2s[nt - 1].i1 = s2.i1;
+        // not sure parallel actually helps here
+#pragma omp parallel for num_threads(nt)
+        for (int t = 0; t < nt; t++) {
+            s1s[t].i0 = s1.i0 + s1.len() * t / nt;
+            s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
+            if (t + 1 < nt) {
+                T pivot = src[s1s[t].i1];
+                size_t i0 = s2.i0, i1 = s2.i1;
+                while (i0 + 1 < i1) {
+                    size_t imed = (i1 + i0) / 2;
+                    if (comp (pivot, src[imed])) {i1 = imed; }
+                    else                         {i0 = imed; }
+                }
+                s2s[t].i1 = s2s[t + 1].i0 = i1;
+            }
+        }
+        s1.i0 = std::min(s1.i0, s2.i0);
+        s1.i1 = std::max(s1.i1, s2.i1);
+        s2 = s1;
+        sws[0].i0 = s1.i0;
+        for (int t = 0; t < nt; t++) {
+            sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
+            if (t + 1 < nt) {
+                sws[t + 1].i0 = sws[t].i1;
+            }
+        }
+        assert(sws[nt - 1].i1 == s1.i1);
+        // do the actual merging
+#pragma omp parallel for num_threads(nt)
+        for (int t = 0; t < nt; t++) {
+            SegmentS sw = sws[t];
+            SegmentS s1t = s1s[t];
+            SegmentS s2t = s2s[t];
+            if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
+                for (;;) {
+                    // assert (sw.len() == s1t.len() + s2t.len());
+                    if (comp(src[s1t.i0], src[s2t.i0])) {
+                        dst[sw.i0++] = src[s1t.i0++];
+                        if (s1t.i0 == s1t.i1) break;
+                    } else {
+                        dst[sw.i0++] = src[s2t.i0++];
+                        if (s2t.i0 == s2t.i1) break;
+                    }
+                }
+            }
+            if (s1t.len() > 0) {
+                assert(s1t.len() == sw.len());
+                memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
+            } else if (s2t.len() > 0) {
+                assert(s2t.len() == sw.len());
+                memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
+            }
+        }
+    }
+};
+void fvec_argsort (size_t n, const float *vals,
+                    size_t *perm)
+{
+    for (size_t i = 0; i < n; i++) perm[i] = i;
+    ArgsortComparator comp = {vals};
+    std::sort (perm, perm + n, comp);
+}
+void fvec_argsort_parallel (size_t n, const float *vals,
+                            size_t *perm)
+{
+    size_t * perm2 = new size_t[n];
+    // 2 result tables, during merging, flip between them
+    size_t *permB = perm2, *permA = perm;
+    int nt = omp_get_max_threads();
+    { // prepare correct permutation so that the result ends in perm
+      // at final iteration
+        int nseg = nt;
+        while (nseg > 1) {
+            nseg = (nseg + 1) / 2;
+            std::swap (permA, permB);
+        }
+    }
+#pragma omp parallel
+    for (size_t i = 0; i < n; i++) permA[i] = i;
+    ArgsortComparator comp = {vals};
+    SegmentS segs[nt];
+    // independent sorts
+#pragma omp parallel for
+    for (int t = 0; t < nt; t++) {
+        size_t i0 = t * n / nt;
+        size_t i1 = (t + 1) * n / nt;
+        SegmentS seg = {i0, i1};
+        std::sort (permA + seg.i0, permA + seg.i1, comp);
+        segs[t] = seg;
+    }
+    int prev_nested = omp_get_nested();
+    omp_set_nested(1);
+    int nseg = nt;
+    while (nseg > 1) {
+        int nseg1 = (nseg + 1) / 2;
+        int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
+        int sub_nseg1 = nseg / 2;
+#pragma omp parallel for num_threads(nseg1)
+        for (int s = 0; s < nseg; s += 2) {
+            if (s + 1 == nseg) { // otherwise isolated segment
+                memcpy(permB + segs[s].i0, permA + segs[s].i0,
+                       segs[s].len() * sizeof(size_t));
+            } else {
+                int t0 = s * sub_nt / sub_nseg1;
+                int t1 = (s + 1) * sub_nt / sub_nseg1;
+                printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
+                parallel_merge(permA, permB, segs[s], segs[s + 1],
+                               t1 - t0, comp);
+            }
+        }
+        for (int s = 0; s < nseg; s += 2)
+            segs[s / 2] = segs[s];
+        nseg = nseg1;
+        std::swap (permA, permB);
+    }
+    assert (permA == perm);
+    omp_set_nested(prev_nested);
+    delete [] perm2;
+}
+const float *fvecs_maybe_subsample (
+          size_t d, size_t *n, size_t nmax, const float *x,
+          bool verbose, int64_t seed)
+{
+    if (*n <= nmax) return x; // nothing to do
+    size_t n2 = nmax;
+    if (verbose) {
+        printf ("  Input training set too big (max size is %ld), sampling "
+                "%ld / %ld vectors\n", nmax, n2, *n);
+    }
+    std::vector<int> subset (*n);
+    rand_perm (subset.data (), *n, seed);
+    float *x_subset = new float[n2 * d];
+    for (int64_t i = 0; i < n2; i++)
+        memcpy (&x_subset[i * d],
+                &x[subset[i] * size_t(d)],
+                sizeof (x[0]) * d);
+    *n = n2;
+    return x_subset;
+}
+void binary_to_real(size_t d, const uint8_t *x_in, float *x_out) {
+    for (size_t i = 0; i < d; ++i) {
+        x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1;
+    }
+}
+void real_to_binary(size_t d, const float *x_in, uint8_t *x_out) {
+  for (size_t i = 0; i < d / 8; ++i) {
+    uint8_t b = 0;
+    for (int j = 0; j < 8; ++j) {
+      if (x_in[8 * i + j] > 0) {
+        b |= (1 << j);
+      }
+    }
+    x_out[i] = b;
+  }
+}
+// from Python's stringobject.c
+uint64_t hash_bytes (const uint8_t *bytes, int64_t n) {
+    const uint8_t *p = bytes;
+    uint64_t x = (uint64_t)(*p) << 7;
+    int64_t len = n;
+    while (--len >= 0) {
+        x = (1000003*x) ^ *p++;
+    }
+    x ^= n;
+    return x;
+}
+bool check_openmp() {
+    omp_set_num_threads(10);
+    if (omp_get_max_threads() != 10) {
+        return false;
+    }
+    std::vector<int> nt_per_thread(10);
+    size_t sum = 0;
+    bool in_parallel = true;
+#pragma omp parallel reduction(+: sum)
+    {
+        if (!omp_in_parallel()) {
+            in_parallel = false;
+        }
+        int nt = omp_get_num_threads();
+        int rank = omp_get_thread_num();
+        nt_per_thread[rank] = nt;
+#pragma omp for
+        for(int i = 0; i < 1000 * 1000 * 10; i++) {
+            sum += i;
+        }
+    }
+    if (!in_parallel) {
+        return false;
+    }
+    if (nt_per_thread[0] != 10) {
+        return false;
+    }
+    if (sum == 0) {
+        return false;
+    }
+    return true;
+}
+} // namespace faiss