RubyGems - faiss - Versions diffs - 0.1.0 → 0.1.1 - Mend

faiss 0.1.0 → 0.1.1

Files changed (226) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +103 -3
data/ext/faiss/ext.cpp +99 -32
data/ext/faiss/extconf.rb +12 -2
data/lib/faiss/ext.bundle +0 -0
data/lib/faiss/index.rb +3 -3
data/lib/faiss/index_binary.rb +3 -3
data/lib/faiss/kmeans.rb +1 -1
data/lib/faiss/pca_matrix.rb +2 -2
data/lib/faiss/product_quantizer.rb +3 -3
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/AutoTune.cpp +719 -0
data/vendor/faiss/AutoTune.h +212 -0
data/vendor/faiss/Clustering.cpp +261 -0
data/vendor/faiss/Clustering.h +101 -0
data/vendor/faiss/IVFlib.cpp +339 -0
data/vendor/faiss/IVFlib.h +132 -0
data/vendor/faiss/Index.cpp +171 -0
data/vendor/faiss/Index.h +261 -0
data/vendor/faiss/Index2Layer.cpp +437 -0
data/vendor/faiss/Index2Layer.h +85 -0
data/vendor/faiss/IndexBinary.cpp +77 -0
data/vendor/faiss/IndexBinary.h +163 -0
data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
data/vendor/faiss/IndexBinaryFlat.h +54 -0
data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
data/vendor/faiss/IndexBinaryHNSW.h +56 -0
data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
data/vendor/faiss/IndexBinaryIVF.h +211 -0
data/vendor/faiss/IndexFlat.cpp +508 -0
data/vendor/faiss/IndexFlat.h +175 -0
data/vendor/faiss/IndexHNSW.cpp +1090 -0
data/vendor/faiss/IndexHNSW.h +170 -0
data/vendor/faiss/IndexIVF.cpp +909 -0
data/vendor/faiss/IndexIVF.h +353 -0
data/vendor/faiss/IndexIVFFlat.cpp +502 -0
data/vendor/faiss/IndexIVFFlat.h +118 -0
data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
data/vendor/faiss/IndexIVFPQ.h +161 -0
data/vendor/faiss/IndexIVFPQR.cpp +219 -0
data/vendor/faiss/IndexIVFPQR.h +65 -0
data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
data/vendor/faiss/IndexLSH.cpp +225 -0
data/vendor/faiss/IndexLSH.h +87 -0
data/vendor/faiss/IndexLattice.cpp +143 -0
data/vendor/faiss/IndexLattice.h +68 -0
data/vendor/faiss/IndexPQ.cpp +1188 -0
data/vendor/faiss/IndexPQ.h +199 -0
data/vendor/faiss/IndexPreTransform.cpp +288 -0
data/vendor/faiss/IndexPreTransform.h +91 -0
data/vendor/faiss/IndexReplicas.cpp +123 -0
data/vendor/faiss/IndexReplicas.h +76 -0
data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
data/vendor/faiss/IndexScalarQuantizer.h +127 -0
data/vendor/faiss/IndexShards.cpp +317 -0
data/vendor/faiss/IndexShards.h +100 -0
data/vendor/faiss/InvertedLists.cpp +623 -0
data/vendor/faiss/InvertedLists.h +334 -0
data/vendor/faiss/LICENSE +21 -0
data/vendor/faiss/MatrixStats.cpp +252 -0
data/vendor/faiss/MatrixStats.h +62 -0
data/vendor/faiss/MetaIndexes.cpp +351 -0
data/vendor/faiss/MetaIndexes.h +126 -0
data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
data/vendor/faiss/OnDiskInvertedLists.h +127 -0
data/vendor/faiss/VectorTransform.cpp +1157 -0
data/vendor/faiss/VectorTransform.h +322 -0
data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
data/vendor/faiss/c_api/AutoTune_c.h +64 -0
data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
data/vendor/faiss/c_api/Clustering_c.h +117 -0
data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
data/vendor/faiss/c_api/IndexShards_c.h +42 -0
data/vendor/faiss/c_api/Index_c.cpp +105 -0
data/vendor/faiss/c_api/Index_c.h +183 -0
data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
data/vendor/faiss/c_api/clone_index_c.h +32 -0
data/vendor/faiss/c_api/error_c.h +42 -0
data/vendor/faiss/c_api/error_impl.cpp +27 -0
data/vendor/faiss/c_api/error_impl.h +16 -0
data/vendor/faiss/c_api/faiss_c.h +58 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
data/vendor/faiss/c_api/index_factory_c.h +30 -0
data/vendor/faiss/c_api/index_io_c.cpp +42 -0
data/vendor/faiss/c_api/index_io_c.h +50 -0
data/vendor/faiss/c_api/macros_impl.h +110 -0
data/vendor/faiss/clone_index.cpp +147 -0
data/vendor/faiss/clone_index.h +38 -0
data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
data/vendor/faiss/gpu/GpuCloner.h +82 -0
data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
data/vendor/faiss/gpu/GpuDistance.h +52 -0
data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
data/vendor/faiss/gpu/GpuIndex.h +148 -0
data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
data/vendor/faiss/gpu/GpuResources.cpp +52 -0
data/vendor/faiss/gpu/GpuResources.h +73 -0
data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
data/vendor/faiss/gpu/test/TestUtils.h +93 -0
data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
data/vendor/faiss/gpu/utils/Timer.h +52 -0
data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
data/vendor/faiss/impl/FaissAssert.h +95 -0
data/vendor/faiss/impl/FaissException.cpp +66 -0
data/vendor/faiss/impl/FaissException.h +71 -0
data/vendor/faiss/impl/HNSW.cpp +818 -0
data/vendor/faiss/impl/HNSW.h +275 -0
data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
data/vendor/faiss/impl/PolysemousTraining.h +158 -0
data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
data/vendor/faiss/impl/ProductQuantizer.h +242 -0
data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
data/vendor/faiss/impl/ThreadedIndex.h +80 -0
data/vendor/faiss/impl/index_read.cpp +793 -0
data/vendor/faiss/impl/index_write.cpp +558 -0
data/vendor/faiss/impl/io.cpp +142 -0
data/vendor/faiss/impl/io.h +98 -0
data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
data/vendor/faiss/impl/lattice_Zn.h +199 -0
data/vendor/faiss/index_factory.cpp +392 -0
data/vendor/faiss/index_factory.h +25 -0
data/vendor/faiss/index_io.h +75 -0
data/vendor/faiss/misc/test_blas.cpp +84 -0
data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
data/vendor/faiss/tests/test_merge.cpp +258 -0
data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
data/vendor/faiss/tests/test_params_override.cpp +231 -0
data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
data/vendor/faiss/utils/Heap.cpp +122 -0
data/vendor/faiss/utils/Heap.h +495 -0
data/vendor/faiss/utils/WorkerThread.cpp +126 -0
data/vendor/faiss/utils/WorkerThread.h +61 -0
data/vendor/faiss/utils/distances.cpp +765 -0
data/vendor/faiss/utils/distances.h +243 -0
data/vendor/faiss/utils/distances_simd.cpp +809 -0
data/vendor/faiss/utils/extra_distances.cpp +336 -0
data/vendor/faiss/utils/extra_distances.h +54 -0
data/vendor/faiss/utils/hamming-inl.h +472 -0
data/vendor/faiss/utils/hamming.cpp +792 -0
data/vendor/faiss/utils/hamming.h +220 -0
data/vendor/faiss/utils/random.cpp +192 -0
data/vendor/faiss/utils/random.h +60 -0
data/vendor/faiss/utils/utils.cpp +783 -0
data/vendor/faiss/utils/utils.h +181 -0
metadata +216 -2

data/vendor/faiss/utils/WorkerThread.cpp ADDED Viewed

@@ -0,0 +1,126 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <faiss/utils/WorkerThread.h>
+#include <faiss/impl/FaissAssert.h>
+#include <exception>
+namespace faiss {
+namespace {
+// Invokes `task` and reports the outcome through `result`: `true` once the
+// task has returned normally, otherwise the exception that escaped is
+// forwarded to whoever holds the matching future.
+void runCallback(std::function<void()>& task,
+                 std::promise<bool>& result) {
+  try {
+    task();
+    result.set_value(true);
+  } catch (...) {
+    auto failure = std::current_exception();
+    result.set_exception(failure);
+  }
+}
+} // namespace
+/// Launches the worker thread and blocks until it has executed a first,
+/// empty task.
+WorkerThread::WorkerThread()
+    : wantStop_(false) {
+  startThread();
+
+  // Round-trip a no-op through the queue: once its future resolves, the
+  // worker thread is known to be up and servicing requests.
+  std::future<bool> started = add([]() {});
+  started.get();
+}
+/// Requests the worker to stop, then blocks until its thread has exited.
+WorkerThread::~WorkerThread() {
+  stop();
+  waitForThreadExit();
+}
+/// Spawns the worker thread, which runs threadMain() on this object.
+void WorkerThread::startThread() {
+  thread_ = std::thread(&WorkerThread::threadMain, this);
+}
+/// Flags the worker for shutdown and wakes it up in case it is waiting
+/// for work.
+void WorkerThread::stop() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  wantStop_ = true;
+  // Signalled while the mutex is still held
+  monitor_.notify_one();
+}
+/// Queues `f` for execution on the worker thread.
+/// When a stop has already been requested the task is not queued and the
+/// returned future immediately yields `false`; otherwise the future is
+/// completed once the worker has run the task.
+std::future<bool> WorkerThread::add(std::function<void()> f) {
+  std::lock_guard<std::mutex> guard(mutex_);
+
+  std::promise<bool> outcome;
+  std::future<bool> pending = outcome.get_future();
+
+  if (!wantStop_) {
+    queue_.emplace_back(std::move(f), std::move(outcome));
+    // Let the worker know there is something to do
+    monitor_.notify_one();
+    return pending;
+  }
+
+  // A stop was requested: nothing more can be scheduled, so report
+  // "did not execute" right away.
+  outcome.set_value(false);
+  return pending;
+}
+/// Worker entry point: serves the queue until a stop is requested, then
+/// runs whatever tasks are still queued.
+void WorkerThread::threadMain() {
+  threadLoop();
+
+  // threadLoop() only returns once a stop has been requested
+  FAISS_ASSERT(wantStop_);
+
+  // Drain the backlog so that every outstanding future gets completed
+  for (auto it = queue_.begin(); it != queue_.end(); ++it) {
+    runCallback(it->first, it->second);
+  }
+}
+/// Pops queued tasks one at a time and runs them; returns as soon as a
+/// stop request is observed, leaving any remaining tasks in the queue.
+void WorkerThread::threadLoop() {
+  for (;;) {
+    std::pair<std::function<void()>, std::promise<bool>> job;
+
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      // Sleep until there is either work to do or a stop request
+      monitor_.wait(lock, [this]() { return wantStop_ || !queue_.empty(); });
+
+      if (wantStop_) {
+        return;
+      }
+
+      job = std::move(queue_.front());
+      queue_.pop_front();
+    }
+
+    // The mutex is released here, so the task runs without holding it
+    runCallback(job.first, job.second);
+  }
+}
+/// Blocks until the worker thread has exited. May be called more than
+/// once: calls made when the thread is no longer joinable return
+/// immediately.
+void WorkerThread::waitForThreadExit() {
+  // join() throws std::system_error on a thread that is not joinable (for
+  // instance one that has already been joined), so test for that up front
+  // instead of relying on the exception being swallowed below.
+  if (!thread_.joinable()) {
+    return;
+  }
+  try {
+    thread_.join();
+  } catch (...) {
+    // Best effort: this is also called from the destructor, which must
+    // not let an exception escape.
+  }
+}
+} // namespace

data/vendor/faiss/utils/WorkerThread.h ADDED Viewed

@@ -0,0 +1,61 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <condition_variable>
+#include <future>
+#include <deque>
+#include <thread>
+namespace faiss {
+/// Owns a single background thread that executes queued lambdas one at a
+/// time, in the order in which they were added.
+class WorkerThread {
+ public:
+  /// Starts the worker thread and waits until it has processed a first,
+  /// empty lambda.
+  WorkerThread();
+  /// Stops and waits for the worker thread to exit, flushing all
+  /// pending lambdas
+  ~WorkerThread();
+  /// Request that the worker thread stop itself
+  void stop();
+  /// Blocking waits in the current thread for the worker thread to
+  /// stop
+  void waitForThreadExit();
+  /// Adds a lambda to run on the worker thread; returns a future that
+  /// can be used to block on its completion.
+  /// Future status is `true` if the lambda was run in the worker
+  /// thread; `false` if it was not run, because the worker thread is
+  /// exiting or has exited.
+  /// An exception thrown by the lambda is delivered through the future.
+  std::future<bool> add(std::function<void()> f);
+ private:
+  /// Creates the thread object; the new thread runs threadMain()
+  void startThread();
+  /// Body of the worker thread: runs threadLoop(), then the lambdas that
+  /// are still queued once it returns
+  void threadMain();
+  /// Waits for queued lambdas and runs them until a stop is requested
+  void threadLoop();
+  /// Thread that all queued lambdas are run on
+  std::thread thread_;
+  /// Mutex for the queue and exit status
+  std::mutex mutex_;
+  /// Monitor for the exit status and the queue
+  std::condition_variable monitor_;
+  /// Whether or not we want the thread to exit
+  bool wantStop_;
+  /// Queue of pending lambdas to call, each paired with the promise used
+  /// to report its outcome
+  std::deque<std::pair<std::function<void()>, std::promise<bool>>> queue_;
+};
+} // namespace

data/vendor/faiss/utils/distances.cpp ADDED Viewed

@@ -0,0 +1,765 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#include <faiss/utils/distances.h>
+#include <cstdio>
+#include <cassert>
+#include <cstring>
+#include <cmath>
+#include <omp.h>
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissAssert.h>
+#ifndef FINTEGER
+#define FINTEGER long
+#endif
+// Fortran-style BLAS/LAPACK entry points: C linkage, trailing underscore,
+// and every argument (including scalars and dimensions) passed by pointer.
+// Integer arguments use FINTEGER, which defaults to long when not defined
+// beforehand.
+extern "C" {
+/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
+/* sgemm: single-precision general matrix-matrix product */
+int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
+            n, FINTEGER *k, const float *alpha, const float *a,
+            FINTEGER *lda, const float *b, FINTEGER *
+            ldb, float *beta, float *c, FINTEGER *ldc);
+/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
+/* sgeqrf: single-precision QR factorization */
+int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
+                 float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
+/* sgemv: single-precision general matrix-vector product */
+int sgemv_(const char *trans, FINTEGER *m, FINTEGER *n, float *alpha,
+           const float *a, FINTEGER *lda, const float *x, FINTEGER *incx,
+           float *beta, float *y, FINTEGER *incy);
+}
+namespace faiss {
+/***************************************************************************
+ * Matrix/vector ops
+ ***************************************************************************/
+/* Inner products between a single vector x and each of the ny vectors
+   stored back to back in y (d floats per vector); ip[i] receives the
+   product with the i-th vector.
+   Not meant as a replacement for a BLAS matrix-matrix product, which is
+   significantly more efficient for that use case. */
+void fvec_inner_products_ny (float * ip,
+                             const float * x,
+                             const float * y,
+                             size_t d, size_t ny)
+{
+    // Disabled alternative: the same computation as a single BLAS sgemv
+    // call. Not sure which one is fastest.
+#if 0
+    {
+        FINTEGER di = d;
+        FINTEGER nyi = ny;
+        float one = 1.0, zero = 0.0;
+        FINTEGER onei = 1;
+        sgemv_ ("T", &di, &nyi, &one, y, &di, x, &onei, &zero, ip, &onei);
+    }
+#endif
+    const float * y_i = y;
+    for (size_t i = 0; i < ny; i++, y_i += d) {
+        ip[i] = fvec_inner_product (x, y_i, d);
+    }
+}
+/* L2 norm of each of the nx vectors of dimension d stored in x;
+   nr[i] receives the norm of the i-th vector. */
+void fvec_norms_L2 (float * __restrict nr,
+                    const float * __restrict x,
+                    size_t d, size_t nx)
+{
+#pragma omp parallel for
+    for (size_t i = 0; i < nx; i++) {
+        const float * xi = x + i * d;
+        nr[i] = sqrtf (fvec_norm_L2sqr (xi, d));
+    }
+}
+/* Squared L2 norm of each of the nx vectors of dimension d stored in x;
+   nr[i] receives the squared norm of the i-th vector. */
+void fvec_norms_L2sqr (float * __restrict nr,
+                       const float * __restrict x,
+                       size_t d, size_t nx)
+{
+#pragma omp parallel for
+    for (size_t i = 0; i < nx; i++) {
+        const float * xi = x + i * d;
+        nr[i] = fvec_norm_L2sqr (xi, d);
+    }
+}
+/* Rescales in place each of the nx vectors of dimension d stored in x so
+   that it has unit L2 norm. Vectors whose squared norm is not strictly
+   positive are left untouched. */
+void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
+{
+#pragma omp parallel for
+    for (size_t i = 0; i < nx; i++) {
+        float * __restrict row = x + i * d;
+        const float sq_norm = fvec_norm_L2sqr (row, d);
+        if (!(sq_norm > 0)) {
+            continue;
+        }
+        const float scale = 1.0 / sqrtf (sq_norm);
+        for (size_t j = 0; j < d; j++) {
+            row[j] *= scale;
+        }
+    }
+}
+/***************************************************************************
+ * KNN functions
+ ***************************************************************************/
+/* For each of the nx queries in x, scans the ny vectors of y and keeps the
+   k = res->k largest inner products (with their indices in y) in the
+   query's min-heap. Queries are processed in batches so that
+   InterruptCallback::check() runs between two batches. */
+static void knn_inner_product_sse (const float * x,
+                        const float * y,
+                        size_t d, size_t nx, size_t ny,
+                        float_minheap_array_t * res)
+{
+    const size_t k = res->k;
+    // number of queries handled between two interrupt checks
+    size_t batch = InterruptCallback::get_period_hint (ny * d);
+    batch *= omp_get_max_threads();
+    for (size_t begin = 0; begin < nx; begin += batch) {
+        const size_t end = std::min(begin + batch, nx);
+#pragma omp parallel for
+        for (size_t i = begin; i < end; i++) {
+            const float * query = x + i * d;
+            float * __restrict heap_val = res->get_val(i);
+            int64_t * __restrict heap_ids = res->get_ids (i);
+            minheap_heapify (k, heap_val, heap_ids);
+            for (size_t j = 0; j < ny; j++) {
+                const float ip = fvec_inner_product (query, y + j * d, d);
+                // heap_val[0] is the smallest value currently kept
+                if (ip > heap_val[0]) {
+                    minheap_pop (k, heap_val, heap_ids);
+                    minheap_push (k, heap_val, heap_ids, ip, j);
+                }
+            }
+            minheap_reorder (k, heap_val, heap_ids);
+        }
+        InterruptCallback::check ();
+    }
+}
+/* For each of the nx queries in x, scans the ny vectors of y and keeps the
+   k = res->k smallest squared L2 distances (with their indices in y) in
+   the query's max-heap. Queries are processed in batches so that
+   InterruptCallback::check() runs between two batches. */
+static void knn_L2sqr_sse (
+                const float * x,
+                const float * y,
+                size_t d, size_t nx, size_t ny,
+                float_maxheap_array_t * res)
+{
+    const size_t k = res->k;
+    // number of queries handled between two interrupt checks
+    size_t batch = InterruptCallback::get_period_hint (ny * d);
+    batch *= omp_get_max_threads();
+    for (size_t begin = 0; begin < nx; begin += batch) {
+        const size_t end = std::min(begin + batch, nx);
+#pragma omp parallel for
+        for (size_t i = begin; i < end; i++) {
+            const float * query = x + i * d;
+            float * heap_val = res->get_val(i);
+            int64_t * heap_ids = res->get_ids (i);
+            maxheap_heapify (k, heap_val, heap_ids);
+            for (size_t j = 0; j < ny; j++) {
+                const float dis = fvec_L2sqr (query, y + j * d, d);
+                // heap_val[0] is the largest distance currently kept
+                if (dis < heap_val[0]) {
+                    maxheap_pop (k, heap_val, heap_ids);
+                    maxheap_push (k, heap_val, heap_ids, dis, j);
+                }
+            }
+            maxheap_reorder (k, heap_val, heap_ids);
+        }
+        InterruptCallback::check ();
+    }
+}
+/** Same search as the direct inner-product variant, but the inner products
+ * are computed block by block with sgemm_ and pushed into the result heaps
+ * with res->addn. When nx or ny is zero the heaps are initialized and the
+ * function returns without calling BLAS.
+ */
+static void knn_inner_product_blas (
+        const float * x,
+        const float * y,
+        size_t d, size_t nx, size_t ny,
+        float_minheap_array_t * res)
+{
+    res->heapify ();
+    // BLAS does not like empty matrices
+    if (nx == 0 || ny == 0) return;
+    /* block sizes (queries x database vectors) */
+    const size_t bs_x = 4096, bs_y = 1024;
+    // const size_t bs_x = 16, bs_y = 16;
+    std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]);
+    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
+        const size_t i1 = std::min(i0 + bs_x, nx);
+        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
+            const size_t j1 = std::min(j0 + bs_y, ny);
+            const size_t n_queries = i1 - i0;
+            const size_t n_base = j1 - j0;
+            /* inner products of the current block */
+            {
+                float one = 1, zero = 0;
+                FINTEGER nyi = n_base, nxi = n_queries, di = d;
+                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
+                        y + j0 * d, &di,
+                        x + i0 * d, &di, &zero,
+                        ip_block.get(), &nyi);
+            }
+            /* collect maxima */
+            res->addn (n_base, ip_block.get(), j0, i0, n_queries);
+        }
+        InterruptCallback::check ();
+    }
+    res->reorder ();
+}
+// Blocked k-nearest-neighbor search on squared L2 distances using BLAS.
+// Distances are obtained as ||x||^2 + ||y||^2 - 2 <x, y>, with the inner
+// products of each block computed by sgemm_.
+//
+// `corr` is a distance correction: an operator called as corr(dis, i, j)
+// on the (clamped) squared distance between query i and database vector j;
+// its return value is what gets compared against and stored in the heap.
+//
+// When nx or ny is zero the heaps are initialized and the function returns
+// without calling BLAS.
+template<class DistanceCorrection>
+static void knn_L2sqr_blas (const float * x,
+        const float * y,
+        size_t d, size_t nx, size_t ny,
+        float_maxheap_array_t * res,
+        const DistanceCorrection &corr)
+{
+    res->heapify ();
+    // BLAS does not like empty matrices
+    if (nx == 0 || ny == 0) return;
+    size_t k = res->k;
+    /* block sizes */
+    const size_t bs_x = 4096, bs_y = 1024;
+    // const size_t bs_x = 16, bs_y = 16;
+    // Each buffer is owned as soon as it is allocated, so a failing later
+    // allocation (or any other exception) cannot leak the earlier ones.
+    std::unique_ptr<float[]> ip_block_owner(new float[bs_x * bs_y]);
+    std::unique_ptr<float[]> x_norms_owner(new float[nx]);
+    std::unique_ptr<float[]> y_norms_owner(new float[ny]);
+    float *ip_block = ip_block_owner.get();
+    float *x_norms = x_norms_owner.get();
+    float *y_norms = y_norms_owner.get();
+    fvec_norms_L2sqr (x_norms, x, d, nx);
+    fvec_norms_L2sqr (y_norms, y, d, ny);
+    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
+        size_t i1 = i0 + bs_x;
+        if(i1 > nx) i1 = nx;
+        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
+            size_t j1 = j0 + bs_y;
+            if (j1 > ny) j1 = ny;
+            /* compute the actual dot products */
+            {
+                float one = 1, zero = 0;
+                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
+                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
+                        y + j0 * d, &di,
+                        x + i0 * d, &di, &zero,
+                        ip_block, &nyi);
+            }
+            /* collect minima */
+#pragma omp parallel for
+            for (size_t i = i0; i < i1; i++) {
+                float * __restrict simi = res->get_val(i);
+                int64_t * __restrict idxi = res->get_ids (i);
+                // row of the block holding the products for query i
+                const float *ip_line = ip_block + (i - i0) * (j1 - j0);
+                for (size_t j = j0; j < j1; j++) {
+                    float ip = *ip_line++;
+                    float dis = x_norms[i] + y_norms[j] - 2 * ip;
+                    // negative values can occur for identical vectors
+                    // due to roundoff errors
+                    if (dis < 0) dis = 0;
+                    dis = corr (dis, i, j);
+                    if (dis < simi[0]) {
+                        maxheap_pop (k, simi, idxi);
+                        maxheap_push (k, simi, idxi, dis, j);
+                    }
+                }
+            }
+        }
+        InterruptCallback::check ();
+    }
+    res->reorder ();
+}
+/*******************************************************
+ * KNN driver functions
+ *******************************************************/
+// Query count below which the knn drivers may use the direct
+// implementation instead of the BLAS one.
+int distance_compute_blas_threshold = 20;
+/* Inner-product k-nearest-neighbor search for nx queries against ny
+   vectors. Dispatches to the direct implementation when d is a multiple
+   of 4 and nx is below distance_compute_blas_threshold, and to the BLAS
+   implementation otherwise. */
+void knn_inner_product (const float * x,
+        const float * y,
+        size_t d, size_t nx, size_t ny,
+        float_minheap_array_t * res)
+{
+    const bool use_direct =
+        d % 4 == 0 && nx < distance_compute_blas_threshold;
+    if (!use_direct) {
+        knn_inner_product_blas (x, y, d, nx, ny, res);
+        return;
+    }
+    knn_inner_product_sse (x, y, d, nx, ny, res);
+}
+/// Distance correction that leaves the distance unchanged.
+struct NopDistanceCorrection {
+    float operator()(float dis, size_t /*qno*/, size_t /*bno*/) const {
+        return dis;
+    }
+};
+/* Squared-L2 k-nearest-neighbor search for nx queries against ny vectors.
+   Dispatches to the direct implementation when d is a multiple of 4 and
+   nx is below distance_compute_blas_threshold, and to the BLAS
+   implementation (without distance correction) otherwise. */
+void knn_L2sqr (const float * x,
+                const float * y,
+                size_t d, size_t nx, size_t ny,
+                float_maxheap_array_t * res)
+{
+    const bool use_direct =
+        d % 4 == 0 && nx < distance_compute_blas_threshold;
+    if (use_direct) {
+        knn_L2sqr_sse (x, y, d, nx, ny, res);
+        return;
+    }
+    NopDistanceCorrection no_correction;
+    knn_L2sqr_blas (x, y, d, nx, ny, res, no_correction);
+}
+/// Distance correction that subtracts a per-database-vector offset:
+/// the result is dis - base_shift[bno].
+struct BaseShiftDistanceCorrection {
+    /// Offsets, indexed by database vector number
+    const float *base_shift;
+    float operator()(float dis, size_t /*qno*/, size_t bno) const {
+        const float shift = base_shift[bno];
+        return dis - shift;
+    }
+};
+/* Squared-L2 k-nearest-neighbor search, always through the BLAS
+   implementation, where base_shift[j] is subtracted from every distance
+   to database vector j before it is considered for the result heap. */
+void knn_L2sqr_base_shift (
+         const float * x,
+         const float * y,
+         size_t d, size_t nx, size_t ny,
+         float_maxheap_array_t * res,
+         const float *base_shift)
+{
+    BaseShiftDistanceCorrection shift_correction = {base_shift};
+    knn_L2sqr_blas (x, y, d, nx, ny, res, shift_correction);
+}
+/***************************************************************************
+ * compute a subset of  distances
+ ***************************************************************************/
+/* For each of the nx vectors x_j, computes the inner product with the ny
+   vectors of y selected by ids[j * ny .. j * ny + ny - 1] and stores them
+   in ip[j * ny + i]. Entries with a negative id are skipped and the
+   corresponding output is left untouched. */
+void fvec_inner_products_by_idx (float * __restrict ip,
+                                 const float * x,
+                                 const float * y,
+                                 const int64_t * __restrict ids, /* for y vecs */
+                                 size_t d, size_t nx, size_t ny)
+{
+#pragma omp parallel for
+    for (size_t j = 0; j < nx; j++) {
+        const float * xj = x + j * d;
+        const int64_t * __restrict idsj = ids + j * ny;
+        float * __restrict ipj = ip + j * ny;
+        for (size_t i = 0; i < ny; i++) {
+            const int64_t id = idsj[i];
+            if (id >= 0) {
+                ipj[i] = fvec_inner_product (xj, y + d * id, d);
+            }
+        }
+    }
+}
+/* For each of the nx vectors x_j, computes the squared L2 distance to the
+   ny vectors of y selected by ids[j * ny .. j * ny + ny - 1] and stores
+   them in dis[j * ny + i]. Entries with a negative id are skipped and the
+   corresponding output is left untouched. */
+void fvec_L2sqr_by_idx (float * __restrict dis,
+                        const float * x,
+                        const float * y,
+                        const int64_t * __restrict ids, /* ids of y vecs */
+                        size_t d, size_t nx, size_t ny)
+{
+#pragma omp parallel for
+    for (size_t j = 0; j < nx; j++) {
+        const float * xj = x + j * d;
+        const int64_t * __restrict idsj = ids + j * ny;
+        float * __restrict disj = dis + j * ny;
+        for (size_t i = 0; i < ny; i++) {
+            const int64_t id = idsj[i];
+            if (id >= 0) {
+                disj[i] = fvec_L2sqr (xj, y + d * id, d);
+            }
+        }
+    }
+}
+/* For each j < n, stores in dis[j] the squared L2 distance between vector
+   ix[j] of x and vector iy[j] of y (both of dimension d). Pairs where
+   either index is negative are skipped and dis[j] is left untouched. */
+void pairwise_indexed_L2sqr (
+        size_t d, size_t n,
+        const float * x, const int64_t *ix,
+        const float * y, const int64_t *iy,
+        float *dis)
+{
+#pragma omp parallel for
+    for (size_t j = 0; j < n; j++) {
+        if (ix[j] < 0 || iy[j] < 0) {
+            continue;
+        }
+        const float * xj = x + d * ix[j];
+        const float * yj = y + d * iy[j];
+        dis[j] = fvec_L2sqr (xj, yj, d);
+    }
+}
+/* For each j < n, stores in dis[j] the inner product between vector ix[j]
+   of x and vector iy[j] of y (both of dimension d). Pairs where either
+   index is negative are skipped and dis[j] is left untouched. */
+void pairwise_indexed_inner_product (
+        size_t d, size_t n,
+        const float * x, const int64_t *ix,
+        const float * y, const int64_t *iy,
+        float *dis)
+{
+#pragma omp parallel for
+    for (size_t j = 0; j < n; j++) {
+        if (ix[j] < 0 || iy[j] < 0) {
+            continue;
+        }
+        const float * xj = x + d * ix[j];
+        const float * yj = y + d * iy[j];
+        dis[j] = fvec_inner_product (xj, yj, d);
+    }
+}
+/* For each of the nx queries, keeps the k = res->k largest inner products
+   among the vectors of y listed in ids[i * ny .. i * ny + ny - 1]; the
+   heap stores the listed ids. The scan of a list stops at its first
+   negative id. May be useful for re-ranking a pre-selected vector list. */
+void knn_inner_products_by_idx (const float * x,
+                                const float * y,
+                                const int64_t * ids,
+                                size_t d, size_t nx, size_t ny,
+                                float_minheap_array_t * res)
+{
+    const size_t k = res->k;
+#pragma omp parallel for
+    for (size_t i = 0; i < nx; i++) {
+        const float * query = x + i * d;
+        const int64_t * candidates = ids + i * ny;
+        float * __restrict heap_val = res->get_val(i);
+        int64_t * __restrict heap_ids = res->get_ids (i);
+        minheap_heapify (k, heap_val, heap_ids);
+        for (size_t j = 0; j < ny && candidates[j] >= 0; j++) {
+            const int64_t id = candidates[j];
+            const float ip = fvec_inner_product (query, y + d * id, d);
+            if (ip > heap_val[0]) {
+                minheap_pop (k, heap_val, heap_ids);
+                minheap_push (k, heap_val, heap_ids, ip, id);
+            }
+        }
+        minheap_reorder (k, heap_val, heap_ids);
+    }
+}
+/* For each of the nx queries, keeps the k = res->k smallest squared L2
+   distances among the vectors of y listed in
+   ids[i * ny .. i * ny + ny - 1]; the heap stores the listed ids.
+   The scan of a list stops at its first negative id, as in
+   knn_inner_products_by_idx, so that such an entry is never used to
+   index y. */
+void knn_L2sqr_by_idx (const float * x,
+                       const float * y,
+                       const int64_t * __restrict ids,
+                       size_t d, size_t nx, size_t ny,
+                       float_maxheap_array_t * res)
+{
+    size_t k = res->k;
+#pragma omp parallel for
+    for (size_t i = 0; i < nx; i++) {
+        const float * x_ = x + i * d;
+        const int64_t * __restrict idsi = ids + i * ny;
+        float * __restrict simi = res->get_val(i);
+        int64_t * __restrict idxi = res->get_ids (i);
+        maxheap_heapify (k, simi, idxi);
+        for (size_t j = 0; j < ny; j++) {
+            // a negative id would address memory before the start of y
+            if (idsi[j] < 0) break;
+            float disij = fvec_L2sqr (x_, y + d * idsi[j], d);
+            // simi[0] is the largest distance currently kept
+            if (disij < simi[0]) {
+                maxheap_pop (k, simi, idxi);
+                maxheap_push (k, simi, idxi, disij, idsi[j]);
+            }
+        }
+        maxheap_reorder (k, simi, idxi);
+    }
+}
+/***************************************************************************
+ * Range search
+ ***************************************************************************/
+/** Range search for nx queries in a set of ny vectors, using BLAS to
+ * compute the inner products block by block.
+ *
+ * compute_l2 = compute pairwise squared L2 distance rather than inner prod.
+ * With compute_l2, a database vector is reported when its squared distance
+ * is strictly below radius; otherwise when its inner product is strictly
+ * above radius. Returns without touching result when nx or ny is zero.
+ */
+template <bool compute_l2>
+static void range_search_blas (
+        const float * x,
+        const float * y,
+        size_t d, size_t nx, size_t ny,
+        float radius,
+        RangeSearchResult *result)
+{
+    // BLAS does not like empty matrices
+    if (nx == 0 || ny == 0) return;
+    /* block sizes */
+    const size_t bs_x = 4096, bs_y = 1024;
+    // const size_t bs_x = 16, bs_y = 16;
+    float *ip_block = new float[bs_x * bs_y];
+    ScopeDeleter<float> del0(ip_block);
+    // squared norms are only needed to turn inner products into distances
+    float *x_norms = nullptr, *y_norms = nullptr;
+    ScopeDeleter<float> del1, del2;
+    if (compute_l2) {
+        x_norms = new float[nx];
+        del1.set (x_norms);
+        fvec_norms_L2sqr (x_norms, x, d, nx);
+        y_norms = new float[ny];
+        del2.set (y_norms);
+        fvec_norms_L2sqr (y_norms, y, d, ny);
+    }
+    // one partial result per block of database vectors
+    // NOTE(review): these are allocated with new and never deleted here;
+    // presumably RangeSearchPartialResult::merge releases them -- confirm.
+    std::vector <RangeSearchPartialResult *> partial_results;
+    for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
+        size_t j1 = j0 + bs_y;
+        if (j1 > ny) j1 = ny;
+        RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
+        partial_results.push_back (pres);
+        for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
+            size_t i1 = i0 + bs_x;
+            if(i1 > nx) i1 = nx;
+            /* compute the actual dot products */
+            {
+                float one = 1, zero = 0;
+                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
+                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
+                        y + j0 * d, &di,
+                        x + i0 * d, &di, &zero,
+                        ip_block, &nyi);
+            }
+            for (size_t i = i0; i < i1; i++) {
+                // row of the block holding the products for query i
+                const float *ip_line = ip_block + (i - i0) * (j1 - j0);
+                RangeQueryResult & qres = pres->new_result (i);
+                for (size_t j = j0; j < j1; j++) {
+                    float ip = *ip_line++;
+                    if (compute_l2) {
+                        // no clamping of negative values is applied here
+                        float dis =  x_norms[i] + y_norms[j] - 2 * ip;
+                        if (dis < radius) {
+                            qres.add (dis, j);
+                        }
+                    } else {
+                        if (ip > radius) {
+                            qres.add (ip, j);
+                        }
+                    }
+                }
+            }
+        }
+        InterruptCallback::check ();
+    }
+    RangeSearchPartialResult::merge (partial_results);
+}
+/** Range search by exhaustive comparison: every query in x is compared
+ * against every vector in y with fvec_L2sqr (compute_l2) or
+ * fvec_inner_product (otherwise).
+ *
+ * A vector matches when the squared L2 distance is < radius, or when the
+ * inner product is > radius. Requires d to be a multiple of 4.
+ * Each OpenMP thread collects matches in its own RangeSearchPartialResult
+ * and finalizes it into res.
+ */
+template <bool compute_l2>
+static void range_search_sse (const float * x,
+                const float * y,
+                size_t d, size_t nx, size_t ny,
+                float radius,
+                RangeSearchResult *res)
+{
+    FAISS_THROW_IF_NOT (d % 4 == 0);
+#pragma omp parallel
+    {
+        RangeSearchPartialResult pres (res);
+#pragma omp for
+        for (size_t i = 0; i < nx; i++) {
+            const float * query = x + i * d;
+            RangeQueryResult & qres = pres.new_result (i);
+            for (size_t j = 0; j < ny; j++) {
+                const float * candidate = y + j * d;
+                const float score = compute_l2 ?
+                    fvec_L2sqr (query, candidate, d) :
+                    fvec_inner_product (query, candidate, d);
+                // L2: keep what is closer than radius; IP: keep what
+                // scores higher than radius
+                const bool in_range = compute_l2 ?
+                    (score < radius) : (score > radius);
+                if (in_range) {
+                    qres.add (score, j);
+                }
+            }
+        }
+        pres.finalize ();
+    }
+    // check just at the end because the use case is typically just
+    // when the nb of queries is low.
+    InterruptCallback::check();
+}
+/** Range search on squared L2 distance.
+ *
+ * Dispatches to range_search_sse when d is a multiple of 4 and nx is below
+ * distance_compute_blas_threshold, and to range_search_blas otherwise.
+ */
+void range_search_L2sqr (
+        const float * x,
+        const float * y,
+        size_t d, size_t nx, size_t ny,
+        float radius,
+        RangeSearchResult *res)
+{
+    const bool few_aligned_queries =
+        d % 4 == 0 && nx < distance_compute_blas_threshold;
+    if (!few_aligned_queries) {
+        range_search_blas<true> (x, y, d, nx, ny, radius, res);
+        return;
+    }
+    range_search_sse<true> (x, y, d, nx, ny, radius, res);
+}
+/** Range search on inner product.
+ *
+ * Dispatches to range_search_sse when d is a multiple of 4 and nx is below
+ * distance_compute_blas_threshold, and to range_search_blas otherwise.
+ */
+void range_search_inner_product (
+        const float * x,
+        const float * y,
+        size_t d, size_t nx, size_t ny,
+        float radius,
+        RangeSearchResult *res)
+{
+    const bool few_aligned_queries =
+        d % 4 == 0 && nx < distance_compute_blas_threshold;
+    if (!few_aligned_queries) {
+        range_search_blas<false> (x, y, d, nx, ny, radius, res);
+        return;
+    }
+    range_search_sse<false> (x, y, d, nx, ny, radius, res);
+}
+/** Fill dis with the squared L2 distances between every query vector in xq
+ * and every base vector in xb.
+ *
+ * Entry dis[i * ldd + j] is first set to norm(xq_i) + norm(xb_j); the
+ * final sgemm_ call then accumulates -2 times the xb/xq product into dis.
+ *
+ * @param d    vector dimension
+ * @param nq   number of query vectors
+ * @param xq   query vectors, row i starts at xq + i * ldq
+ * @param nb   number of base vectors
+ * @param xb   base vectors, row j starts at xb + j * ldb
+ * @param dis  output matrix, row i starts at dis + i * ldd
+ * @param ldq  row stride of xq, -1 means d
+ * @param ldb  row stride of xb, -1 means d
+ * @param ldd  row stride of dis, -1 means nb
+ *
+ * Returns immediately, without writing to dis, when nq or nb is 0.
+ */
+void pairwise_L2sqr (int64_t d,
+                     int64_t nq, const float *xq,
+                     int64_t nb, const float *xb,
+                     float *dis,
+                     int64_t ldq, int64_t ldb, int64_t ldd)
+{
+    if (nq == 0 || nb == 0) {
+        return;
+    }
+    // -1 selects the densely packed default for each stride
+    ldq = (ldq == -1) ? d : ldq;
+    ldb = (ldb == -1) ? d : ldb;
+    ldd = (ldd == -1) ? nb : ldd;
+    // store in beginning of distance matrix to avoid malloc:
+    // row 0 of dis temporarily holds the base vector norms
+    float *base_norms = dis;
+#pragma omp parallel for
+    for (int64_t j = 0; j < nb; j++) {
+        base_norms [j] = fvec_norm_L2sqr (xb + j * ldb, d);
+    }
+    // rows 1 .. nq - 1 read the norms parked in row 0
+#pragma omp parallel for
+    for (int64_t i = 1; i < nq; i++) {
+        const float query_norm = fvec_norm_L2sqr (xq + i * ldq, d);
+        float *row = dis + i * ldd;
+        for (int64_t j = 0; j < nb; j++) {
+            row [j] = query_norm + base_norms [j];
+        }
+    }
+    // row 0 goes last: it is updated in place on top of the base norms
+    const float first_query_norm = fvec_norm_L2sqr (xq, d);
+    for (int64_t j = 0; j < nb; j++) {
+        dis[j] += first_query_norm;
+    }
+    // accumulate -2 * (xb/xq product) on top of the norm sums
+    FINTEGER nbi = nb, nqi = nq, di = d;
+    FINTEGER ldqi = ldq, ldbi = ldb, lddi = ldd;
+    float one = 1.0, minus_2 = -2.0;
+    sgemm_ ("Transposed", "Not transposed",
+            &nbi, &nqi, &di,
+            &minus_2,
+            xb, &ldbi,
+            xq, &ldqi,
+            &one, dis, &lddi);
+}
+} // namespace faiss