faiss 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +103 -3
  4. data/ext/faiss/ext.cpp +99 -32
  5. data/ext/faiss/extconf.rb +12 -2
  6. data/lib/faiss/ext.bundle +0 -0
  7. data/lib/faiss/index.rb +3 -3
  8. data/lib/faiss/index_binary.rb +3 -3
  9. data/lib/faiss/kmeans.rb +1 -1
  10. data/lib/faiss/pca_matrix.rb +2 -2
  11. data/lib/faiss/product_quantizer.rb +3 -3
  12. data/lib/faiss/version.rb +1 -1
  13. data/vendor/faiss/AutoTune.cpp +719 -0
  14. data/vendor/faiss/AutoTune.h +212 -0
  15. data/vendor/faiss/Clustering.cpp +261 -0
  16. data/vendor/faiss/Clustering.h +101 -0
  17. data/vendor/faiss/IVFlib.cpp +339 -0
  18. data/vendor/faiss/IVFlib.h +132 -0
  19. data/vendor/faiss/Index.cpp +171 -0
  20. data/vendor/faiss/Index.h +261 -0
  21. data/vendor/faiss/Index2Layer.cpp +437 -0
  22. data/vendor/faiss/Index2Layer.h +85 -0
  23. data/vendor/faiss/IndexBinary.cpp +77 -0
  24. data/vendor/faiss/IndexBinary.h +163 -0
  25. data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
  26. data/vendor/faiss/IndexBinaryFlat.h +54 -0
  27. data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
  28. data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
  29. data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
  30. data/vendor/faiss/IndexBinaryHNSW.h +56 -0
  31. data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
  32. data/vendor/faiss/IndexBinaryIVF.h +211 -0
  33. data/vendor/faiss/IndexFlat.cpp +508 -0
  34. data/vendor/faiss/IndexFlat.h +175 -0
  35. data/vendor/faiss/IndexHNSW.cpp +1090 -0
  36. data/vendor/faiss/IndexHNSW.h +170 -0
  37. data/vendor/faiss/IndexIVF.cpp +909 -0
  38. data/vendor/faiss/IndexIVF.h +353 -0
  39. data/vendor/faiss/IndexIVFFlat.cpp +502 -0
  40. data/vendor/faiss/IndexIVFFlat.h +118 -0
  41. data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
  42. data/vendor/faiss/IndexIVFPQ.h +161 -0
  43. data/vendor/faiss/IndexIVFPQR.cpp +219 -0
  44. data/vendor/faiss/IndexIVFPQR.h +65 -0
  45. data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
  46. data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
  47. data/vendor/faiss/IndexLSH.cpp +225 -0
  48. data/vendor/faiss/IndexLSH.h +87 -0
  49. data/vendor/faiss/IndexLattice.cpp +143 -0
  50. data/vendor/faiss/IndexLattice.h +68 -0
  51. data/vendor/faiss/IndexPQ.cpp +1188 -0
  52. data/vendor/faiss/IndexPQ.h +199 -0
  53. data/vendor/faiss/IndexPreTransform.cpp +288 -0
  54. data/vendor/faiss/IndexPreTransform.h +91 -0
  55. data/vendor/faiss/IndexReplicas.cpp +123 -0
  56. data/vendor/faiss/IndexReplicas.h +76 -0
  57. data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
  58. data/vendor/faiss/IndexScalarQuantizer.h +127 -0
  59. data/vendor/faiss/IndexShards.cpp +317 -0
  60. data/vendor/faiss/IndexShards.h +100 -0
  61. data/vendor/faiss/InvertedLists.cpp +623 -0
  62. data/vendor/faiss/InvertedLists.h +334 -0
  63. data/vendor/faiss/LICENSE +21 -0
  64. data/vendor/faiss/MatrixStats.cpp +252 -0
  65. data/vendor/faiss/MatrixStats.h +62 -0
  66. data/vendor/faiss/MetaIndexes.cpp +351 -0
  67. data/vendor/faiss/MetaIndexes.h +126 -0
  68. data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
  69. data/vendor/faiss/OnDiskInvertedLists.h +127 -0
  70. data/vendor/faiss/VectorTransform.cpp +1157 -0
  71. data/vendor/faiss/VectorTransform.h +322 -0
  72. data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
  73. data/vendor/faiss/c_api/AutoTune_c.h +64 -0
  74. data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
  75. data/vendor/faiss/c_api/Clustering_c.h +117 -0
  76. data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
  77. data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
  78. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
  79. data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
  80. data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
  81. data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
  82. data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
  83. data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
  84. data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
  85. data/vendor/faiss/c_api/IndexShards_c.h +42 -0
  86. data/vendor/faiss/c_api/Index_c.cpp +105 -0
  87. data/vendor/faiss/c_api/Index_c.h +183 -0
  88. data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
  89. data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
  90. data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
  91. data/vendor/faiss/c_api/clone_index_c.h +32 -0
  92. data/vendor/faiss/c_api/error_c.h +42 -0
  93. data/vendor/faiss/c_api/error_impl.cpp +27 -0
  94. data/vendor/faiss/c_api/error_impl.h +16 -0
  95. data/vendor/faiss/c_api/faiss_c.h +58 -0
  96. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
  97. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
  98. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
  99. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
  100. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
  101. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
  102. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
  103. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
  104. data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
  105. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
  106. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
  107. data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
  108. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
  109. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
  110. data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
  111. data/vendor/faiss/c_api/index_factory_c.h +30 -0
  112. data/vendor/faiss/c_api/index_io_c.cpp +42 -0
  113. data/vendor/faiss/c_api/index_io_c.h +50 -0
  114. data/vendor/faiss/c_api/macros_impl.h +110 -0
  115. data/vendor/faiss/clone_index.cpp +147 -0
  116. data/vendor/faiss/clone_index.h +38 -0
  117. data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
  118. data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
  119. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
  120. data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
  121. data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
  122. data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
  123. data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
  124. data/vendor/faiss/gpu/GpuCloner.h +82 -0
  125. data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
  126. data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
  127. data/vendor/faiss/gpu/GpuDistance.h +52 -0
  128. data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
  129. data/vendor/faiss/gpu/GpuIndex.h +148 -0
  130. data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
  131. data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
  132. data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
  133. data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
  134. data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
  135. data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
  136. data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
  137. data/vendor/faiss/gpu/GpuResources.cpp +52 -0
  138. data/vendor/faiss/gpu/GpuResources.h +73 -0
  139. data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
  140. data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
  141. data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
  142. data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
  143. data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
  144. data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
  145. data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
  146. data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
  147. data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
  148. data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
  149. data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
  150. data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
  151. data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
  152. data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
  153. data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
  154. data/vendor/faiss/gpu/test/TestUtils.h +93 -0
  155. data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
  156. data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
  157. data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
  158. data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
  159. data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
  160. data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
  161. data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
  162. data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
  163. data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
  164. data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
  165. data/vendor/faiss/gpu/utils/Timer.h +52 -0
  166. data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
  167. data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
  168. data/vendor/faiss/impl/FaissAssert.h +95 -0
  169. data/vendor/faiss/impl/FaissException.cpp +66 -0
  170. data/vendor/faiss/impl/FaissException.h +71 -0
  171. data/vendor/faiss/impl/HNSW.cpp +818 -0
  172. data/vendor/faiss/impl/HNSW.h +275 -0
  173. data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
  174. data/vendor/faiss/impl/PolysemousTraining.h +158 -0
  175. data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
  176. data/vendor/faiss/impl/ProductQuantizer.h +242 -0
  177. data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
  178. data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
  179. data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
  180. data/vendor/faiss/impl/ThreadedIndex.h +80 -0
  181. data/vendor/faiss/impl/index_read.cpp +793 -0
  182. data/vendor/faiss/impl/index_write.cpp +558 -0
  183. data/vendor/faiss/impl/io.cpp +142 -0
  184. data/vendor/faiss/impl/io.h +98 -0
  185. data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
  186. data/vendor/faiss/impl/lattice_Zn.h +199 -0
  187. data/vendor/faiss/index_factory.cpp +392 -0
  188. data/vendor/faiss/index_factory.h +25 -0
  189. data/vendor/faiss/index_io.h +75 -0
  190. data/vendor/faiss/misc/test_blas.cpp +84 -0
  191. data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
  192. data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
  193. data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
  194. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
  195. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
  196. data/vendor/faiss/tests/test_merge.cpp +258 -0
  197. data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
  198. data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
  199. data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
  200. data/vendor/faiss/tests/test_params_override.cpp +231 -0
  201. data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
  202. data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
  203. data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
  204. data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
  205. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
  206. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
  207. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
  208. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
  209. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
  210. data/vendor/faiss/utils/Heap.cpp +122 -0
  211. data/vendor/faiss/utils/Heap.h +495 -0
  212. data/vendor/faiss/utils/WorkerThread.cpp +126 -0
  213. data/vendor/faiss/utils/WorkerThread.h +61 -0
  214. data/vendor/faiss/utils/distances.cpp +765 -0
  215. data/vendor/faiss/utils/distances.h +243 -0
  216. data/vendor/faiss/utils/distances_simd.cpp +809 -0
  217. data/vendor/faiss/utils/extra_distances.cpp +336 -0
  218. data/vendor/faiss/utils/extra_distances.h +54 -0
  219. data/vendor/faiss/utils/hamming-inl.h +472 -0
  220. data/vendor/faiss/utils/hamming.cpp +792 -0
  221. data/vendor/faiss/utils/hamming.h +220 -0
  222. data/vendor/faiss/utils/random.cpp +192 -0
  223. data/vendor/faiss/utils/random.h +60 -0
  224. data/vendor/faiss/utils/utils.cpp +783 -0
  225. data/vendor/faiss/utils/utils.h +181 -0
  226. metadata +216 -2
@@ -0,0 +1,275 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #pragma once
11
+
12
+ #include <vector>
13
+ #include <unordered_set>
14
+ #include <queue>
15
+
16
+ #include <omp.h>
17
+
18
+ #include <faiss/Index.h>
19
+ #include <faiss/impl/FaissAssert.h>
20
+ #include <faiss/utils/random.h>
21
+ #include <faiss/utils/Heap.h>
22
+
23
+
24
+ namespace faiss {
25
+
26
+
27
+ /** Implementation of the Hierarchical Navigable Small World
28
+ * datastructure.
29
+ *
30
+ * Efficient and robust approximate nearest neighbor search using
31
+ * Hierarchical Navigable Small World graphs
32
+ *
33
+ * Yu. A. Malkov, D. A. Yashunin, arXiv 2017
34
+ *
35
 + * This implementation is heavily influenced by the NMSlib
36
 + * implementation by Yury Malkov and Leonid Boytsov
37
+ * (https://github.com/searchivarius/nmslib)
38
+ *
39
+ * The HNSW object stores only the neighbor link structure, see
40
+ * IndexHNSW.h for the full index object.
41
+ */
42
+
43
+
44
+ struct VisitedTable;
45
+ struct DistanceComputer; // from AuxIndexStructures
46
+
47
+ struct HNSW {
48
+ /// internal storage of vectors (32 bits: this is expensive)
49
+ typedef int storage_idx_t;
50
+
51
+ /// Faiss results are 64-bit
52
+ typedef Index::idx_t idx_t;
53
+
54
+ typedef std::pair<float, storage_idx_t> Node;
55
+
56
+ /** Heap structure that allows fast
57
+ */
58
+ struct MinimaxHeap {
59
+ int n;
60
+ int k;
61
+ int nvalid;
62
+
63
+ std::vector<storage_idx_t> ids;
64
+ std::vector<float> dis;
65
+ typedef faiss::CMax<float, storage_idx_t> HC;
66
+
67
+ explicit MinimaxHeap(int n): n(n), k(0), nvalid(0), ids(n), dis(n) {}
68
+
69
+ void push(storage_idx_t i, float v);
70
+
71
+ float max() const;
72
+
73
+ int size() const;
74
+
75
+ void clear();
76
+
77
+ int pop_min(float *vmin_out = nullptr);
78
+
79
+ int count_below(float thresh);
80
+ };
81
+
82
+
83
 + /// to sort pairs of (id, distance) from nearest to farthest or the reverse
84
+ struct NodeDistCloser {
85
+ float d;
86
+ int id;
87
+ NodeDistCloser(float d, int id): d(d), id(id) {}
88
+ bool operator < (const NodeDistCloser &obj1) const { return d < obj1.d; }
89
+ };
90
+
91
+ struct NodeDistFarther {
92
+ float d;
93
+ int id;
94
+ NodeDistFarther(float d, int id): d(d), id(id) {}
95
+ bool operator < (const NodeDistFarther &obj1) const { return d > obj1.d; }
96
+ };
97
+
98
+
99
+ /// assignment probability to each layer (sum=1)
100
+ std::vector<double> assign_probas;
101
+
102
+ /// number of neighbors stored per layer (cumulative), should not
103
+ /// be changed after first add
104
+ std::vector<int> cum_nneighbor_per_level;
105
+
106
+ /// level of each vector (base level = 1), size = ntotal
107
+ std::vector<int> levels;
108
+
109
+ /// offsets[i] is the offset in the neighbors array where vector i is stored
110
+ /// size ntotal + 1
111
+ std::vector<size_t> offsets;
112
+
113
+ /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
114
+ /// for all levels. this is where all storage goes.
115
+ std::vector<storage_idx_t> neighbors;
116
+
117
 + /// entry point in the search structure (one of the points with maximum level)
118
+ storage_idx_t entry_point;
119
+
120
+ faiss::RandomGenerator rng;
121
+
122
+ /// maximum level
123
+ int max_level;
124
+
125
+ /// expansion factor at construction time
126
+ int efConstruction;
127
+
128
+ /// expansion factor at search time
129
+ int efSearch;
130
+
131
+ /// during search: do we check whether the next best distance is good enough?
132
+ bool check_relative_distance = true;
133
+
134
+ /// number of entry points in levels > 0.
135
+ int upper_beam;
136
+
137
+ /// use bounded queue during exploration
138
+ bool search_bounded_queue = true;
139
+
140
+ // methods that initialize the tree sizes
141
+
142
+ /// initialize the assign_probas and cum_nneighbor_per_level to
143
+ /// have 2*M links on level 0 and M links on levels > 0
144
+ void set_default_probas(int M, float levelMult);
145
+
146
+ /// set nb of neighbors for this level (before adding anything)
147
+ void set_nb_neighbors(int level_no, int n);
148
+
149
+ // methods that access the tree sizes
150
+
151
+ /// nb of neighbors for this level
152
+ int nb_neighbors(int layer_no) const;
153
+
154
 + /// cumulative nb up to (and excluding) this level
155
+ int cum_nb_neighbors(int layer_no) const;
156
+
157
+ /// range of entries in the neighbors table of vertex no at layer_no
158
+ void neighbor_range(idx_t no, int layer_no,
159
+ size_t * begin, size_t * end) const;
160
+
161
+ /// only mandatory parameter: nb of neighbors
162
+ explicit HNSW(int M = 32);
163
+
164
+ /// pick a random level for a new point
165
+ int random_level();
166
+
167
+ /// add n random levels to table (for debugging...)
168
+ void fill_with_random_links(size_t n);
169
+
170
+ void add_links_starting_from(DistanceComputer& ptdis,
171
+ storage_idx_t pt_id,
172
+ storage_idx_t nearest,
173
+ float d_nearest,
174
+ int level,
175
+ omp_lock_t *locks,
176
+ VisitedTable &vt);
177
+
178
+
179
+ /** add point pt_id on all levels <= pt_level and build the link
180
+ * structure for them. */
181
+ void add_with_locks(DistanceComputer& ptdis, int pt_level, int pt_id,
182
+ std::vector<omp_lock_t>& locks,
183
+ VisitedTable& vt);
184
+
185
+ int search_from_candidates(DistanceComputer& qdis, int k,
186
+ idx_t *I, float *D,
187
+ MinimaxHeap& candidates,
188
+ VisitedTable &vt,
189
+ int level, int nres_in = 0) const;
190
+
191
+ std::priority_queue<Node> search_from_candidate_unbounded(
192
+ const Node& node,
193
+ DistanceComputer& qdis,
194
+ int ef,
195
+ VisitedTable *vt
196
+ ) const;
197
+
198
+ /// search interface
199
+ void search(DistanceComputer& qdis, int k,
200
+ idx_t *I, float *D,
201
+ VisitedTable& vt) const;
202
+
203
+ void reset();
204
+
205
+ void clear_neighbor_tables(int level);
206
+ void print_neighbor_stats(int level) const;
207
+
208
+ int prepare_level_tab(size_t n, bool preset_levels = false);
209
+
210
+ static void shrink_neighbor_list(
211
+ DistanceComputer& qdis,
212
+ std::priority_queue<NodeDistFarther>& input,
213
+ std::vector<NodeDistFarther>& output,
214
+ int max_size);
215
+
216
+ };
217
+
218
+
219
+ /**************************************************************
220
+ * Auxiliary structures
221
+ **************************************************************/
222
+
223
+ /// set implementation optimized for fast access.
224
+ struct VisitedTable {
225
+ std::vector<uint8_t> visited;
226
+ int visno;
227
+
228
+ explicit VisitedTable(int size)
229
+ : visited(size), visno(1) {}
230
+
231
 + /// set flag #no to true
232
+ void set(int no) {
233
+ visited[no] = visno;
234
+ }
235
+
236
+ /// get flag #no
237
+ bool get(int no) const {
238
+ return visited[no] == visno;
239
+ }
240
+
241
+ /// reset all flags to false
242
+ void advance() {
243
+ visno++;
244
+ if (visno == 250) {
245
+ // 250 rather than 255 because sometimes we use visno and visno+1
246
+ memset(visited.data(), 0, sizeof(visited[0]) * visited.size());
247
+ visno = 1;
248
+ }
249
+ }
250
+ };
251
+
252
+
253
+ struct HNSWStats {
254
+ size_t n1, n2, n3;
255
+ size_t ndis;
256
+ size_t nreorder;
257
+ bool view;
258
+
259
+ HNSWStats() {
260
+ reset();
261
+ }
262
+
263
+ void reset() {
264
+ n1 = n2 = n3 = 0;
265
+ ndis = 0;
266
+ nreorder = 0;
267
+ view = false;
268
+ }
269
+ };
270
+
271
+ // global var that collects them all
272
+ extern HNSWStats hnsw_stats;
273
+
274
+
275
+ } // namespace faiss
@@ -0,0 +1,953 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #include <faiss/impl/PolysemousTraining.h>
11
+
12
+ #include <cstdlib>
13
+ #include <cmath>
14
+ #include <cstring>
15
+ #include <stdint.h>
16
+
17
+ #include <algorithm>
18
+
19
+ #include <faiss/utils/random.h>
20
+ #include <faiss/utils/utils.h>
21
+ #include <faiss/utils/distances.h>
22
+ #include <faiss/utils/hamming.h>
23
+
24
+ #include <faiss/impl/FaissAssert.h>
25
+
26
+ /*****************************************
27
+ * Mixed PQ / Hamming
28
+ ******************************************/
29
+
30
+ namespace faiss {
31
+
32
+
33
+ /****************************************************
34
+ * Optimization code
35
+ ****************************************************/
36
+
37
+ SimulatedAnnealingParameters::SimulatedAnnealingParameters ()
38
+ {
39
+ // set some reasonable defaults for the optimization
40
+ init_temperature = 0.7;
41
+ temperature_decay = pow (0.9, 1/500.);
42
+ // reduce by a factor 0.9 every 500 it
43
+ n_iter = 500000;
44
+ n_redo = 2;
45
+ seed = 123;
46
+ verbose = 0;
47
+ only_bit_flips = false;
48
+ init_random = false;
49
+ }
50
+
51
+ // what would the cost update be if iw and jw were swapped?
52
+ // default implementation just computes both and computes the difference
53
+ double PermutationObjective::cost_update (
54
+ const int *perm, int iw, int jw) const
55
+ {
56
+ double orig_cost = compute_cost (perm);
57
+
58
+ std::vector<int> perm2 (n);
59
+ for (int i = 0; i < n; i++)
60
+ perm2[i] = perm[i];
61
+ perm2[iw] = perm[jw];
62
+ perm2[jw] = perm[iw];
63
+
64
+ double new_cost = compute_cost (perm2.data());
65
+ return new_cost - orig_cost;
66
+ }
67
+
68
+
69
+
70
+
71
+ SimulatedAnnealingOptimizer::SimulatedAnnealingOptimizer (
72
+ PermutationObjective *obj,
73
+ const SimulatedAnnealingParameters &p):
74
+ SimulatedAnnealingParameters (p),
75
+ obj (obj),
76
+ n(obj->n),
77
+ logfile (nullptr)
78
+ {
79
+ rnd = new RandomGenerator (p.seed);
80
+ FAISS_THROW_IF_NOT (n < 100000 && n >=0 );
81
+ }
82
+
83
+ SimulatedAnnealingOptimizer::~SimulatedAnnealingOptimizer ()
84
+ {
85
+ delete rnd;
86
+ }
87
+
88
+ // run the optimization and return the best result in best_perm
89
+ double SimulatedAnnealingOptimizer::run_optimization (int * best_perm)
90
+ {
91
+ double min_cost = 1e30;
92
+
93
+ // just do a few runs of the annealing and keep the lowest output cost
94
+ for (int it = 0; it < n_redo; it++) {
95
+ std::vector<int> perm(n);
96
+ for (int i = 0; i < n; i++)
97
+ perm[i] = i;
98
+ if (init_random) {
99
+ for (int i = 0; i < n; i++) {
100
+ int j = i + rnd->rand_int (n - i);
101
+ std::swap (perm[i], perm[j]);
102
+ }
103
+ }
104
+ float cost = optimize (perm.data());
105
+ if (logfile) fprintf (logfile, "\n");
106
+ if(verbose > 1) {
107
+ printf (" optimization run %d: cost=%g %s\n",
108
+ it, cost, cost < min_cost ? "keep" : "");
109
+ }
110
+ if (cost < min_cost) {
111
+ memcpy (best_perm, perm.data(), sizeof(perm[0]) * n);
112
+ min_cost = cost;
113
+ }
114
+ }
115
+ return min_cost;
116
+ }
117
+
118
+ // perform the optimization loop, starting from and modifying
119
+ // permutation in-place
120
+ double SimulatedAnnealingOptimizer::optimize (int *perm)
121
+ {
122
+ double cost = init_cost = obj->compute_cost (perm);
123
+ int log2n = 0;
124
+ while (!(n <= (1 << log2n))) log2n++;
125
+ double temperature = init_temperature;
126
+ int n_swap = 0, n_hot = 0;
127
+ for (int it = 0; it < n_iter; it++) {
128
+ temperature = temperature * temperature_decay;
129
+ int iw, jw;
130
+ if (only_bit_flips) {
131
+ iw = rnd->rand_int (n);
132
+ jw = iw ^ (1 << rnd->rand_int (log2n));
133
+ } else {
134
+ iw = rnd->rand_int (n);
135
+ jw = rnd->rand_int (n - 1);
136
+ if (jw == iw) jw++;
137
+ }
138
+ double delta_cost = obj->cost_update (perm, iw, jw);
139
+ if (delta_cost < 0 || rnd->rand_float () < temperature) {
140
+ std::swap (perm[iw], perm[jw]);
141
+ cost += delta_cost;
142
+ n_swap++;
143
+ if (delta_cost >= 0) n_hot++;
144
+ }
145
+ if (verbose > 2 || (verbose > 1 && it % 10000 == 0)) {
146
+ printf (" iteration %d cost %g temp %g n_swap %d "
147
+ "(%d hot) \r",
148
+ it, cost, temperature, n_swap, n_hot);
149
+ fflush(stdout);
150
+ }
151
+ if (logfile) {
152
+ fprintf (logfile, "%d %g %g %d %d\n",
153
+ it, cost, temperature, n_swap, n_hot);
154
+ }
155
+ }
156
+ if (verbose > 1) printf("\n");
157
+ return cost;
158
+ }
159
+
160
+
161
+
162
+
163
+
164
+ /****************************************************
165
+ * Cost functions: ReproduceDistanceTable
166
+ ****************************************************/
167
+
168
+
169
+
170
+
171
+
172
+
173
+ static inline int hamming_dis (uint64_t a, uint64_t b)
174
+ {
175
+ return __builtin_popcountl (a ^ b);
176
+ }
177
+
178
+ namespace {
179
+
180
+ /// optimize permutation to reproduce a distance table with Hamming distances
181
+ struct ReproduceWithHammingObjective : PermutationObjective {
182
+ int nbits;
183
+ double dis_weight_factor;
184
+
185
+ static double sqr (double x) { return x * x; }
186
+
187
+
188
+ // weihgting of distances: it is more important to reproduce small
189
+ // distances well
190
+ double dis_weight (double x) const
191
+ {
192
+ return exp (-dis_weight_factor * x);
193
+ }
194
+
195
+ std::vector<double> target_dis; // wanted distances (size n^2)
196
+ std::vector<double> weights; // weights for each distance (size n^2)
197
+
198
+ // cost = quadratic difference between actual distance and Hamming distance
199
+ double compute_cost(const int* perm) const override {
200
+ double cost = 0;
201
+ for (int i = 0; i < n; i++) {
202
+ for (int j = 0; j < n; j++) {
203
+ double wanted = target_dis[i * n + j];
204
+ double w = weights[i * n + j];
205
+ double actual = hamming_dis(perm[i], perm[j]);
206
+ cost += w * sqr(wanted - actual);
207
+ }
208
+ }
209
+ return cost;
210
+ }
211
+
212
+
213
+ // what would the cost update be if iw and jw were swapped?
214
+ // computed in O(n) instead of O(n^2) for the full re-computation
215
+ double cost_update(const int* perm, int iw, int jw) const override {
216
+ double delta_cost = 0;
217
+
218
+ for (int i = 0; i < n; i++) {
219
+ if (i == iw) {
220
+ for (int j = 0; j < n; j++) {
221
+ double wanted = target_dis[i * n + j], w = weights[i * n + j];
222
+ double actual = hamming_dis(perm[i], perm[j]);
223
+ delta_cost -= w * sqr(wanted - actual);
224
+ double new_actual =
225
+ hamming_dis(perm[jw], perm[j == iw ? jw : j == jw ? iw : j]);
226
+ delta_cost += w * sqr(wanted - new_actual);
227
+ }
228
+ } else if (i == jw) {
229
+ for (int j = 0; j < n; j++) {
230
+ double wanted = target_dis[i * n + j], w = weights[i * n + j];
231
+ double actual = hamming_dis(perm[i], perm[j]);
232
+ delta_cost -= w * sqr(wanted - actual);
233
+ double new_actual =
234
+ hamming_dis(perm[iw], perm[j == iw ? jw : j == jw ? iw : j]);
235
+ delta_cost += w * sqr(wanted - new_actual);
236
+ }
237
+ } else {
238
+ int j = iw;
239
+ {
240
+ double wanted = target_dis[i * n + j], w = weights[i * n + j];
241
+ double actual = hamming_dis(perm[i], perm[j]);
242
+ delta_cost -= w * sqr(wanted - actual);
243
+ double new_actual = hamming_dis(perm[i], perm[jw]);
244
+ delta_cost += w * sqr(wanted - new_actual);
245
+ }
246
+ j = jw;
247
+ {
248
+ double wanted = target_dis[i * n + j], w = weights[i * n + j];
249
+ double actual = hamming_dis(perm[i], perm[j]);
250
+ delta_cost -= w * sqr(wanted - actual);
251
+ double new_actual = hamming_dis(perm[i], perm[iw]);
252
+ delta_cost += w * sqr(wanted - new_actual);
253
+ }
254
+ }
255
+ }
256
+
257
+ return delta_cost;
258
+ }
259
+
260
+
261
+
262
+ ReproduceWithHammingObjective (
263
+ int nbits,
264
+ const std::vector<double> & dis_table,
265
+ double dis_weight_factor):
266
+ nbits (nbits), dis_weight_factor (dis_weight_factor)
267
+ {
268
+ n = 1 << nbits;
269
+ FAISS_THROW_IF_NOT (dis_table.size() == n * n);
270
+ set_affine_target_dis (dis_table);
271
+ }
272
+
273
+ void set_affine_target_dis (const std::vector<double> & dis_table)
274
+ {
275
+ double sum = 0, sum2 = 0;
276
+ int n2 = n * n;
277
+ for (int i = 0; i < n2; i++) {
278
+ sum += dis_table [i];
279
+ sum2 += dis_table [i] * dis_table [i];
280
+ }
281
+ double mean = sum / n2;
282
+ double stddev = sqrt(sum2 / n2 - (sum / n2) * (sum / n2));
283
+
284
+ target_dis.resize (n2);
285
+
286
+ for (int i = 0; i < n2; i++) {
287
+ // the mapping function
288
+ double td = (dis_table [i] - mean) / stddev * sqrt(nbits / 4) +
289
+ nbits / 2;
290
+ target_dis[i] = td;
291
+ // compute a weight
292
+ weights.push_back (dis_weight (td));
293
+ }
294
+
295
+ }
296
+
297
+ ~ReproduceWithHammingObjective() override {}
298
+ };
299
+
300
+ } // anonymous namespace
301
+
302
+ // weihgting of distances: it is more important to reproduce small
303
+ // distances well
304
+ double ReproduceDistancesObjective::dis_weight (double x) const
305
+ {
306
+ return exp (-dis_weight_factor * x);
307
+ }
308
+
309
+
310
+ double ReproduceDistancesObjective::get_source_dis (int i, int j) const
311
+ {
312
+ return source_dis [i * n + j];
313
+ }
314
+
315
+ // cost = quadratic difference between actual distance and Hamming distance
316
+ double ReproduceDistancesObjective::compute_cost (const int *perm) const
317
+ {
318
+ double cost = 0;
319
+ for (int i = 0; i < n; i++) {
320
+ for (int j = 0; j < n; j++) {
321
+ double wanted = target_dis [i * n + j];
322
+ double w = weights [i * n + j];
323
+ double actual = get_source_dis (perm[i], perm[j]);
324
+ cost += w * sqr (wanted - actual);
325
+ }
326
+ }
327
+ return cost;
328
+ }
329
+
330
+ // what would the cost update be if iw and jw were swapped?
331
+ // computed in O(n) instead of O(n^2) for the full re-computation
332
+ double ReproduceDistancesObjective::cost_update(
333
+ const int *perm, int iw, int jw) const
334
+ {
335
+ double delta_cost = 0;
336
+ for (int i = 0; i < n; i++) {
337
+ if (i == iw) {
338
+ for (int j = 0; j < n; j++) {
339
+ double wanted = target_dis [i * n + j],
340
+ w = weights [i * n + j];
341
+ double actual = get_source_dis (perm[i], perm[j]);
342
+ delta_cost -= w * sqr (wanted - actual);
343
+ double new_actual = get_source_dis (
344
+ perm[jw],
345
+ perm[j == iw ? jw : j == jw ? iw : j]);
346
+ delta_cost += w * sqr (wanted - new_actual);
347
+ }
348
+ } else if (i == jw) {
349
+ for (int j = 0; j < n; j++) {
350
+ double wanted = target_dis [i * n + j],
351
+ w = weights [i * n + j];
352
+ double actual = get_source_dis (perm[i], perm[j]);
353
+ delta_cost -= w * sqr (wanted - actual);
354
+ double new_actual = get_source_dis (
355
+ perm[iw],
356
+ perm[j == iw ? jw : j == jw ? iw : j]);
357
+ delta_cost += w * sqr (wanted - new_actual);
358
+ }
359
+ } else {
360
+ int j = iw;
361
+ {
362
+ double wanted = target_dis [i * n + j],
363
+ w = weights [i * n + j];
364
+ double actual = get_source_dis (perm[i], perm[j]);
365
+ delta_cost -= w * sqr (wanted - actual);
366
+ double new_actual = get_source_dis (perm[i], perm[jw]);
367
+ delta_cost += w * sqr (wanted - new_actual);
368
+ }
369
+ j = jw;
370
+ {
371
+ double wanted = target_dis [i * n + j],
372
+ w = weights [i * n + j];
373
+ double actual = get_source_dis (perm[i], perm[j]);
374
+ delta_cost -= w * sqr (wanted - actual);
375
+ double new_actual = get_source_dis (perm[i], perm[iw]);
376
+ delta_cost += w * sqr (wanted - new_actual);
377
+ }
378
+ }
379
+ }
380
+ return delta_cost;
381
+ }
382
+
383
+
384
+
385
+ ReproduceDistancesObjective::ReproduceDistancesObjective (
386
+ int n,
387
+ const double *source_dis_in,
388
+ const double *target_dis_in,
389
+ double dis_weight_factor):
390
+ dis_weight_factor (dis_weight_factor),
391
+ target_dis (target_dis_in)
392
+ {
393
+ this->n = n;
394
+ set_affine_target_dis (source_dis_in);
395
+ }
396
+
397
+ void ReproduceDistancesObjective::compute_mean_stdev (
398
+ const double *tab, size_t n2,
399
+ double *mean_out, double *stddev_out)
400
+ {
401
+ double sum = 0, sum2 = 0;
402
+ for (int i = 0; i < n2; i++) {
403
+ sum += tab [i];
404
+ sum2 += tab [i] * tab [i];
405
+ }
406
+ double mean = sum / n2;
407
+ double stddev = sqrt(sum2 / n2 - (sum / n2) * (sum / n2));
408
+ *mean_out = mean;
409
+ *stddev_out = stddev;
410
+ }
411
+
412
+ void ReproduceDistancesObjective::set_affine_target_dis (
413
+ const double *source_dis_in)
414
+ {
415
+ int n2 = n * n;
416
+
417
+ double mean_src, stddev_src;
418
+ compute_mean_stdev (source_dis_in, n2, &mean_src, &stddev_src);
419
+
420
+ double mean_target, stddev_target;
421
+ compute_mean_stdev (target_dis, n2, &mean_target, &stddev_target);
422
+
423
+ printf ("map mean %g std %g -> mean %g std %g\n",
424
+ mean_src, stddev_src, mean_target, stddev_target);
425
+
426
+ source_dis.resize (n2);
427
+ weights.resize (n2);
428
+
429
+ for (int i = 0; i < n2; i++) {
430
+ // the mapping function
431
+ source_dis[i] = (source_dis_in[i] - mean_src) / stddev_src
432
+ * stddev_target + mean_target;
433
+
434
+ // compute a weight
435
+ weights [i] = dis_weight (target_dis[i]);
436
+ }
437
+
438
+ }
439
+
440
+ /****************************************************
441
+ * Cost functions: RankingScore
442
+ ****************************************************/
443
+
444
+ /// Maintains a 3D table of elementary costs.
445
+ /// Accumulates elements based on Hamming distance comparisons
446
+ template <typename Ttab, typename Taccu>
447
+ struct Score3Computer: PermutationObjective {
448
+
449
+ int nc;
450
+
451
+ // cost matrix of size nc * nc *nc
452
+ // n_gt (i,j,k) = count of d_gt(x, y-) < d_gt(x, y+)
453
+ // where x has PQ code i, y- PQ code j and y+ PQ code k
454
+ std::vector<Ttab> n_gt;
455
+
456
+
457
+ /// the cost is a triple loop on the nc * nc * nc matrix of entries.
458
+ ///
459
+ Taccu compute (const int * perm) const
460
+ {
461
+ Taccu accu = 0;
462
+ const Ttab *p = n_gt.data();
463
+ for (int i = 0; i < nc; i++) {
464
+ int ip = perm [i];
465
+ for (int j = 0; j < nc; j++) {
466
+ int jp = perm [j];
467
+ for (int k = 0; k < nc; k++) {
468
+ int kp = perm [k];
469
+ if (hamming_dis (ip, jp) <
470
+ hamming_dis (ip, kp)) {
471
+ accu += *p; // n_gt [ ( i * nc + j) * nc + k];
472
+ }
473
+ p++;
474
+ }
475
+ }
476
+ }
477
+ return accu;
478
+ }
479
+
480
+
481
+ /** cost update if entries iw and jw of the permutation would be
482
+ * swapped.
483
+ *
484
+ * The computation is optimized by avoiding elements in the
485
+ * nc*nc*nc cube that are known not to change. For nc=256, this
486
+ * reduces the nb of cells to visit to about 6/256 th of the
487
+ * cells. Practical speedup is about 8x, and the code is quite
488
+ * complex :-/
489
+ */
490
+ Taccu compute_update (const int *perm, int iw, int jw) const
491
+ {
492
+ assert (iw != jw);
493
+ if (iw > jw) std::swap (iw, jw);
494
+
495
+ Taccu accu = 0;
496
+ const Ttab * n_gt_i = n_gt.data();
497
+ for (int i = 0; i < nc; i++) {
498
+ int ip0 = perm [i];
499
+ int ip = perm [i == iw ? jw : i == jw ? iw : i];
500
+
501
+ //accu += update_i (perm, iw, jw, ip0, ip, n_gt_i);
502
+
503
+ accu += update_i_cross (perm, iw, jw,
504
+ ip0, ip, n_gt_i);
505
+
506
+ if (ip != ip0)
507
+ accu += update_i_plane (perm, iw, jw,
508
+ ip0, ip, n_gt_i);
509
+
510
+ n_gt_i += nc * nc;
511
+ }
512
+
513
+ return accu;
514
+ }
515
+
516
+
517
+ Taccu update_i (const int *perm, int iw, int jw,
518
+ int ip0, int ip, const Ttab * n_gt_i) const
519
+ {
520
+ Taccu accu = 0;
521
+ const Ttab *n_gt_ij = n_gt_i;
522
+ for (int j = 0; j < nc; j++) {
523
+ int jp0 = perm[j];
524
+ int jp = perm [j == iw ? jw : j == jw ? iw : j];
525
+ for (int k = 0; k < nc; k++) {
526
+ int kp0 = perm [k];
527
+ int kp = perm [k == iw ? jw : k == jw ? iw : k];
528
+ int ng = n_gt_ij [k];
529
+ if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) {
530
+ accu += ng;
531
+ }
532
+ if (hamming_dis (ip0, jp0) < hamming_dis (ip0, kp0)) {
533
+ accu -= ng;
534
+ }
535
+ }
536
+ n_gt_ij += nc;
537
+ }
538
+ return accu;
539
+ }
540
+
541
+ // 2 inner loops for the case ip0 != ip
542
+ Taccu update_i_plane (const int *perm, int iw, int jw,
543
+ int ip0, int ip, const Ttab * n_gt_i) const
544
+ {
545
+ Taccu accu = 0;
546
+ const Ttab *n_gt_ij = n_gt_i;
547
+
548
+ for (int j = 0; j < nc; j++) {
549
+ if (j != iw && j != jw) {
550
+ int jp = perm[j];
551
+ for (int k = 0; k < nc; k++) {
552
+ if (k != iw && k != jw) {
553
+ int kp = perm [k];
554
+ Ttab ng = n_gt_ij [k];
555
+ if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) {
556
+ accu += ng;
557
+ }
558
+ if (hamming_dis (ip0, jp) < hamming_dis (ip0, kp)) {
559
+ accu -= ng;
560
+ }
561
+ }
562
+ }
563
+ }
564
+ n_gt_ij += nc;
565
+ }
566
+ return accu;
567
+ }
568
+
569
+ /// used for the 8 cells were the 3 indices are swapped
570
+ inline Taccu update_k (const int *perm, int iw, int jw,
571
+ int ip0, int ip, int jp0, int jp,
572
+ int k,
573
+ const Ttab * n_gt_ij) const
574
+ {
575
+ Taccu accu = 0;
576
+ int kp0 = perm [k];
577
+ int kp = perm [k == iw ? jw : k == jw ? iw : k];
578
+ Ttab ng = n_gt_ij [k];
579
+ if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) {
580
+ accu += ng;
581
+ }
582
+ if (hamming_dis (ip0, jp0) < hamming_dis (ip0, kp0)) {
583
+ accu -= ng;
584
+ }
585
+ return accu;
586
+ }
587
+
588
+ /// compute update on a line of k's, where i and j are swapped
589
+ Taccu update_j_line (const int *perm, int iw, int jw,
590
+ int ip0, int ip, int jp0, int jp,
591
+ const Ttab * n_gt_ij) const
592
+ {
593
+ Taccu accu = 0;
594
+ for (int k = 0; k < nc; k++) {
595
+ if (k == iw || k == jw) continue;
596
+ int kp = perm [k];
597
+ Ttab ng = n_gt_ij [k];
598
+ if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) {
599
+ accu += ng;
600
+ }
601
+ if (hamming_dis (ip0, jp0) < hamming_dis (ip0, kp)) {
602
+ accu -= ng;
603
+ }
604
+ }
605
+ return accu;
606
+ }
607
+
608
+
609
+ /// considers the 2 pairs of crossing lines j=iw or jw and k = iw or kw
610
+ Taccu update_i_cross (const int *perm, int iw, int jw,
611
+ int ip0, int ip, const Ttab * n_gt_i) const
612
+ {
613
+ Taccu accu = 0;
614
+ const Ttab *n_gt_ij = n_gt_i;
615
+
616
+ for (int j = 0; j < nc; j++) {
617
+ int jp0 = perm[j];
618
+ int jp = perm [j == iw ? jw : j == jw ? iw : j];
619
+
620
+ accu += update_k (perm, iw, jw, ip0, ip, jp0, jp, iw, n_gt_ij);
621
+ accu += update_k (perm, iw, jw, ip0, ip, jp0, jp, jw, n_gt_ij);
622
+
623
+ if (jp != jp0)
624
+ accu += update_j_line (perm, iw, jw, ip0, ip, jp0, jp, n_gt_ij);
625
+
626
+ n_gt_ij += nc;
627
+ }
628
+ return accu;
629
+ }
630
+
631
+
632
+ /// PermutationObjective implementeation (just negates the scores
633
+ /// for minimization)
634
+
635
+ double compute_cost(const int* perm) const override {
636
+ return -compute(perm);
637
+ }
638
+
639
+ double cost_update(const int* perm, int iw, int jw) const override {
640
+ double ret = -compute_update(perm, iw, jw);
641
+ return ret;
642
+ }
643
+
644
+ ~Score3Computer() override {}
645
+ };
646
+
647
+
648
+
649
+
650
+
651
/// Comparator that orders indices by the values they point to in tab
/// (ascending); used with std::sort to build a rank table.
/// tab is not owned and must outlive the sort.
struct IndirectSort {
    const float *tab;
    // const-qualified: a Compare functor should be callable on a
    // const object (the original was not const)
    bool operator () (int a, int b) const {return tab[a] < tab[b]; }
};
655
+
656
+
657
+
658
/// Ranking-based objective: fills the Score3Computer cost cube from
/// ground-truth distances between queries and database points, with
/// rank-dependent weights.
struct RankingScore2: Score3Computer<float, double> {
    int nbits;                        // bits per code; nc = 2^nbits
    int nq, nb;                       // number of query / database points
    const uint32_t *qcodes, *bcodes;  // per-point codes (not owned)
    const float *gt_distances;        // nq * nb ground-truth distances (not owned)

    RankingScore2 (int nbits, int nq, int nb,
                   const uint32_t *qcodes, const uint32_t *bcodes,
                   const float *gt_distances):
        nbits(nbits), nq(nq), nb(nb), qcodes(qcodes),
        bcodes(bcodes), gt_distances(gt_distances)
    {
        n = nc = 1 << nbits;
        // cost cube is filled once at construction time
        n_gt.resize (nc * nc * nc);
        init_n_gt ();
    }


    /// weight of rank r in the score (decreasing with rank)
    double rank_weight (int r)
    {
        return 1.0 / (r + 1);
    }

    /// count nb of i, j in a x b st. i < j
    /// a and b should be sorted on input
    /// they are the ranks of j and k respectively.
    /// specific version for diff-of-rank weighting, cannot be optimized
    /// with a cumulative table
    double accum_gt_weight_diff (const std::vector<int> & a,
                                 const std::vector<int> & b)
    {
        // NOTE: these locals intentionally shadow the nq/nb members
        int nb = b.size(), na = a.size();

        double accu = 0;
        int j = 0;
        for (int i = 0; i < na; i++) {
            int ai = a[i];
            // advance j past all elements of b that are <= ai
            // (valid because a and b are sorted)
            while (j < nb && ai >= b[j]) j++;

            // sum the diff-of-rank weights for all b[k] > ai
            double accu_i = 0;
            for (int k = j; k < b.size(); k++)
                accu_i += rank_weight (b[k] - ai);

            accu += rank_weight (ai) * accu_i;

        }
        return accu;
    }

    /// Fill the nc*nc*nc cost cube from the ground-truth distances:
    /// for each query, rank the database points, bin the ranks by
    /// code, and accumulate pairwise rank weights per code pair.
    void init_n_gt ()
    {
        for (int q = 0; q < nq; q++) {
            const float *gtd = gt_distances + q * nb;
            const uint32_t *cb = bcodes;// all same codes
            // slice of the cube for this query's code
            float * n_gt_q = & n_gt [qcodes[q] * nc * nc];

            // progress indicator (overwrites the same terminal line)
            printf("init gt for q=%d/%d \r", q, nq); fflush(stdout);

            std::vector<int> rankv (nb);
            int * ranks = rankv.data();

            // elements in each code bin, ordered by rank within each bin
            std::vector<std::vector<int> > tab (nc);

            { // build rank table
                IndirectSort s = {gtd};
                for (int j = 0; j < nb; j++) ranks[j] = j;
                std::sort (ranks, ranks + nb, s);
            }

            // bin ranks by code; bins stay sorted because ranks are
            // visited in increasing order
            for (int rank = 0; rank < nb; rank++) {
                int i = ranks [rank];
                tab [cb[i]].push_back (rank);
            }


            // this is very expensive. Any suggestion for improvement
            // welcome.
            for (int i = 0; i < nc; i++) {
                std::vector<int> & di = tab[i];
                for (int j = 0; j < nc; j++) {
                    std::vector<int> & dj = tab[j];
                    n_gt_q [i * nc + j] += accum_gt_weight_diff (di, dj);

                }
            }

        }

    }

};
750
+
751
+
752
+ /*****************************************
753
+ * PolysemousTraining
754
+ ******************************************/
755
+
756
+
757
+
758
/// Set the default training parameters.
PolysemousTraining::PolysemousTraining ()
{
    // default criterion: reproduce centroid distances via an affine map
    optimization_type = OT_ReproduceDistances_affine;
    // NOTE(review): presumably 0 means "use all training points" —
    // confirm against the header documentation
    ntrain_permutation = 0;
    // decay factor for the distance weighting
    dis_weight_factor = log(2);
}
764
+
765
+
766
+
767
/// For each sub-quantizer, find a permutation of the centroids (by
/// simulated annealing) such that Hamming distances between codes
/// reproduce the L2 distances between centroids, then apply the
/// permutation to the centroid table in place.
/// @param pq  product quantizer whose centroids are permuted (modified)
void PolysemousTraining::optimize_reproduce_distances (
        ProductQuantizer &pq) const
{

    int dsub = pq.dsub;

    int n = pq.ksub;
    int nbits = pq.nbits;

    // sub-quantizers are independent: optimize them in parallel
#pragma omp parallel for
    for (int m = 0; m < pq.M; m++) {
        std::vector<double> dis_table;

        // printf ("Optimizing quantizer %d\n", m);

        float * centroids = pq.get_centroids (m, 0);

        // full n*n table of squared L2 distances between centroids
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                dis_table.push_back (fvec_L2sqr (centroids + i * dsub,
                                                 centroids + j * dsub,
                                                 dsub));
            }
        }

        std::vector<int> perm (n);
        ReproduceWithHammingObjective obj (
            nbits, dis_table,
            dis_weight_factor);


        SimulatedAnnealingOptimizer optim (&obj, *this);

        if (log_pattern.size()) {
            char fname[256];
            // log_pattern is a printf pattern with one %d slot for m
            snprintf (fname, 256, log_pattern.c_str(), m);
            printf ("opening log file %s\n", fname);
            optim.logfile = fopen (fname, "w");
            FAISS_THROW_IF_NOT_MSG (optim.logfile, "could not open logfile");
        }
        double final_cost = optim.run_optimization (perm.data());

        if (verbose > 0) {
            printf ("SimulatedAnnealingOptimizer for m=%d: %g -> %g\n",
                    m, optim.init_cost, final_cost);
        }

        if (log_pattern.size()) fclose (optim.logfile);

        // apply the permutation in place: centroid i moves to slot perm[i]
        std::vector<float> centroids_copy;
        for (int i = 0; i < dsub * n; i++)
            centroids_copy.push_back (centroids[i]);

        for (int i = 0; i < n; i++)
            memcpy (centroids + perm[i] * dsub,
                    centroids_copy.data() + i * dsub,
                    dsub * sizeof(centroids[0]));

    }

}
828
+
829
+
830
/// For each sub-quantizer, find a permutation of the centroids (by
/// simulated annealing on a RankingScore2 objective) and apply it to
/// the centroid table in place. With n > 0 the ground truth comes
/// from pairwise distances on the training set; with n == 0 it comes
/// from the quantizer's SDC table.
/// @param pq  product quantizer to optimize (modified)
/// @param n   number of training vectors (0 = use the SDC table)
/// @param x   training vectors, size n * pq.d
void PolysemousTraining::optimize_ranking (
        ProductQuantizer &pq, size_t n, const float *x) const
{

    int dsub = pq.dsub;

    int nbits = pq.nbits;

    std::vector<uint8_t> all_codes (pq.code_size * n);

    pq.compute_codes (x, all_codes.data(), n);

    // codes are stored below as one uint8 per sub-quantizer
    FAISS_THROW_IF_NOT (pq.nbits == 8);

    if (n == 0)
        pq.compute_sdc_table ();

    // sub-quantizers are independent: optimize them in parallel
#pragma omp parallel for
    for (int m = 0; m < pq.M; m++) {
        size_t nq, nb;
        std::vector <uint32_t> codes; // query codes, then db codes
        std::vector <float> gt_distances; // nq * nb matrix of distances

        if (n > 0) {
            // extract the m-th sub-vector of each training point
            std::vector<float> xtrain (n * dsub);
            for (int i = 0; i < n; i++)
                memcpy (xtrain.data() + i * dsub,
                        x + i * pq.d + m * dsub,
                        sizeof(float) * dsub);

            codes.resize (n);
            for (int i = 0; i < n; i++)
                codes [i] = all_codes [i * pq.code_size + m];

            // split training points: first quarter = queries, rest = db
            nq = n / 4; nb = n - nq;
            const float *xq = xtrain.data();
            const float *xb = xq + nq * dsub;

            gt_distances.resize (nq * nb);

            pairwise_L2sqr (dsub,
                            nq, xq,
                            nb, xb,
                            gt_distances.data());
        } else {
            // no training data: use one "point" per centroid and the
            // precomputed symmetric distance table as ground truth
            nq = nb = pq.ksub;
            codes.resize (2 * nq);
            for (int i = 0; i < nq; i++)
                codes[i] = codes [i + nq] = i;

            gt_distances.resize (nq * nb);

            memcpy (gt_distances.data (),
                    pq.sdc_table.data () + m * nq * nb,
                    sizeof (float) * nq * nb);
        }

        double t0 = getmillisecs ();

        PermutationObjective *obj = new RankingScore2 (
            nbits, nq, nb,
            codes.data(), codes.data() + nq,
            gt_distances.data ());
        // RAII-style deleter: frees obj at end of scope
        ScopeDeleter1<PermutationObjective> del (obj);

        if (verbose > 0) {
            printf(" m=%d, nq=%ld, nb=%ld, intialize RankingScore "
                   "in %.3f ms\n",
                   m, nq, nb, getmillisecs () - t0);
        }

        SimulatedAnnealingOptimizer optim (obj, *this);

        if (log_pattern.size()) {
            char fname[256];
            // log_pattern is a printf pattern with one %d slot for m
            snprintf (fname, 256, log_pattern.c_str(), m);
            printf ("opening log file %s\n", fname);
            optim.logfile = fopen (fname, "w");
            FAISS_THROW_IF_NOT_FMT (optim.logfile,
                                    "could not open logfile %s", fname);
        }

        std::vector<int> perm (pq.ksub);

        double final_cost = optim.run_optimization (perm.data());
        printf ("SimulatedAnnealingOptimizer for m=%d: %g -> %g\n",
                m, optim.init_cost, final_cost);

        if (log_pattern.size()) fclose (optim.logfile);

        float * centroids = pq.get_centroids (m, 0);

        // apply the permutation in place: centroid i moves to slot perm[i]
        std::vector<float> centroids_copy;
        for (int i = 0; i < dsub * pq.ksub; i++)
            centroids_copy.push_back (centroids[i]);

        for (int i = 0; i < pq.ksub; i++)
            memcpy (centroids + perm[i] * dsub,
                    centroids_copy.data() + i * dsub,
                    dsub * sizeof(centroids[0]));

    }

}
934
+
935
+
936
+
937
+ void PolysemousTraining::optimize_pq_for_hamming (ProductQuantizer &pq,
938
+ size_t n, const float *x) const
939
+ {
940
+ if (optimization_type == OT_None) {
941
+
942
+ } else if (optimization_type == OT_ReproduceDistances_affine) {
943
+ optimize_reproduce_distances (pq);
944
+ } else {
945
+ optimize_ranking (pq, n, x);
946
+ }
947
+
948
+ pq.compute_sdc_table ();
949
+
950
+ }
951
+
952
+
953
+ } // namespace faiss