RubyGems - faiss - Versions diffs - 0.1.0 → 0.1.1 - Mend

faiss 0.1.0 → 0.1.1

Files changed (226) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +103 -3
data/ext/faiss/ext.cpp +99 -32
data/ext/faiss/extconf.rb +12 -2
data/lib/faiss/ext.bundle +0 -0
data/lib/faiss/index.rb +3 -3
data/lib/faiss/index_binary.rb +3 -3
data/lib/faiss/kmeans.rb +1 -1
data/lib/faiss/pca_matrix.rb +2 -2
data/lib/faiss/product_quantizer.rb +3 -3
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/AutoTune.cpp +719 -0
data/vendor/faiss/AutoTune.h +212 -0
data/vendor/faiss/Clustering.cpp +261 -0
data/vendor/faiss/Clustering.h +101 -0
data/vendor/faiss/IVFlib.cpp +339 -0
data/vendor/faiss/IVFlib.h +132 -0
data/vendor/faiss/Index.cpp +171 -0
data/vendor/faiss/Index.h +261 -0
data/vendor/faiss/Index2Layer.cpp +437 -0
data/vendor/faiss/Index2Layer.h +85 -0
data/vendor/faiss/IndexBinary.cpp +77 -0
data/vendor/faiss/IndexBinary.h +163 -0
data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
data/vendor/faiss/IndexBinaryFlat.h +54 -0
data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
data/vendor/faiss/IndexBinaryHNSW.h +56 -0
data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
data/vendor/faiss/IndexBinaryIVF.h +211 -0
data/vendor/faiss/IndexFlat.cpp +508 -0
data/vendor/faiss/IndexFlat.h +175 -0
data/vendor/faiss/IndexHNSW.cpp +1090 -0
data/vendor/faiss/IndexHNSW.h +170 -0
data/vendor/faiss/IndexIVF.cpp +909 -0
data/vendor/faiss/IndexIVF.h +353 -0
data/vendor/faiss/IndexIVFFlat.cpp +502 -0
data/vendor/faiss/IndexIVFFlat.h +118 -0
data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
data/vendor/faiss/IndexIVFPQ.h +161 -0
data/vendor/faiss/IndexIVFPQR.cpp +219 -0
data/vendor/faiss/IndexIVFPQR.h +65 -0
data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
data/vendor/faiss/IndexLSH.cpp +225 -0
data/vendor/faiss/IndexLSH.h +87 -0
data/vendor/faiss/IndexLattice.cpp +143 -0
data/vendor/faiss/IndexLattice.h +68 -0
data/vendor/faiss/IndexPQ.cpp +1188 -0
data/vendor/faiss/IndexPQ.h +199 -0
data/vendor/faiss/IndexPreTransform.cpp +288 -0
data/vendor/faiss/IndexPreTransform.h +91 -0
data/vendor/faiss/IndexReplicas.cpp +123 -0
data/vendor/faiss/IndexReplicas.h +76 -0
data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
data/vendor/faiss/IndexScalarQuantizer.h +127 -0
data/vendor/faiss/IndexShards.cpp +317 -0
data/vendor/faiss/IndexShards.h +100 -0
data/vendor/faiss/InvertedLists.cpp +623 -0
data/vendor/faiss/InvertedLists.h +334 -0
data/vendor/faiss/LICENSE +21 -0
data/vendor/faiss/MatrixStats.cpp +252 -0
data/vendor/faiss/MatrixStats.h +62 -0
data/vendor/faiss/MetaIndexes.cpp +351 -0
data/vendor/faiss/MetaIndexes.h +126 -0
data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
data/vendor/faiss/OnDiskInvertedLists.h +127 -0
data/vendor/faiss/VectorTransform.cpp +1157 -0
data/vendor/faiss/VectorTransform.h +322 -0
data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
data/vendor/faiss/c_api/AutoTune_c.h +64 -0
data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
data/vendor/faiss/c_api/Clustering_c.h +117 -0
data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
data/vendor/faiss/c_api/IndexShards_c.h +42 -0
data/vendor/faiss/c_api/Index_c.cpp +105 -0
data/vendor/faiss/c_api/Index_c.h +183 -0
data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
data/vendor/faiss/c_api/clone_index_c.h +32 -0
data/vendor/faiss/c_api/error_c.h +42 -0
data/vendor/faiss/c_api/error_impl.cpp +27 -0
data/vendor/faiss/c_api/error_impl.h +16 -0
data/vendor/faiss/c_api/faiss_c.h +58 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
data/vendor/faiss/c_api/index_factory_c.h +30 -0
data/vendor/faiss/c_api/index_io_c.cpp +42 -0
data/vendor/faiss/c_api/index_io_c.h +50 -0
data/vendor/faiss/c_api/macros_impl.h +110 -0
data/vendor/faiss/clone_index.cpp +147 -0
data/vendor/faiss/clone_index.h +38 -0
data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
data/vendor/faiss/gpu/GpuCloner.h +82 -0
data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
data/vendor/faiss/gpu/GpuDistance.h +52 -0
data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
data/vendor/faiss/gpu/GpuIndex.h +148 -0
data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
data/vendor/faiss/gpu/GpuResources.cpp +52 -0
data/vendor/faiss/gpu/GpuResources.h +73 -0
data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
data/vendor/faiss/gpu/test/TestUtils.h +93 -0
data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
data/vendor/faiss/gpu/utils/Timer.h +52 -0
data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
data/vendor/faiss/impl/FaissAssert.h +95 -0
data/vendor/faiss/impl/FaissException.cpp +66 -0
data/vendor/faiss/impl/FaissException.h +71 -0
data/vendor/faiss/impl/HNSW.cpp +818 -0
data/vendor/faiss/impl/HNSW.h +275 -0
data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
data/vendor/faiss/impl/PolysemousTraining.h +158 -0
data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
data/vendor/faiss/impl/ProductQuantizer.h +242 -0
data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
data/vendor/faiss/impl/ThreadedIndex.h +80 -0
data/vendor/faiss/impl/index_read.cpp +793 -0
data/vendor/faiss/impl/index_write.cpp +558 -0
data/vendor/faiss/impl/io.cpp +142 -0
data/vendor/faiss/impl/io.h +98 -0
data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
data/vendor/faiss/impl/lattice_Zn.h +199 -0
data/vendor/faiss/index_factory.cpp +392 -0
data/vendor/faiss/index_factory.h +25 -0
data/vendor/faiss/index_io.h +75 -0
data/vendor/faiss/misc/test_blas.cpp +84 -0
data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
data/vendor/faiss/tests/test_merge.cpp +258 -0
data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
data/vendor/faiss/tests/test_params_override.cpp +231 -0
data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
data/vendor/faiss/utils/Heap.cpp +122 -0
data/vendor/faiss/utils/Heap.h +495 -0
data/vendor/faiss/utils/WorkerThread.cpp +126 -0
data/vendor/faiss/utils/WorkerThread.h +61 -0
data/vendor/faiss/utils/distances.cpp +765 -0
data/vendor/faiss/utils/distances.h +243 -0
data/vendor/faiss/utils/distances_simd.cpp +809 -0
data/vendor/faiss/utils/extra_distances.cpp +336 -0
data/vendor/faiss/utils/extra_distances.h +54 -0
data/vendor/faiss/utils/hamming-inl.h +472 -0
data/vendor/faiss/utils/hamming.cpp +792 -0
data/vendor/faiss/utils/hamming.h +220 -0
data/vendor/faiss/utils/random.cpp +192 -0
data/vendor/faiss/utils/random.h +60 -0
data/vendor/faiss/utils/utils.cpp +783 -0
data/vendor/faiss/utils/utils.h +181 -0
metadata +216 -2

data/vendor/faiss/OnDiskInvertedLists.h ADDED Viewed

@@ -0,0 +1,127 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#ifndef FAISS_ON_DISK_INVERTED_LISTS_H
+#define FAISS_ON_DISK_INVERTED_LISTS_H
+#include <vector>
+#include <list>
+#include <faiss/IndexIVF.h>
+namespace faiss {
+struct LockLevels;
+/** On-disk storage of inverted lists.
+ *
+ * The data is stored in a mmapped chunk of memory (base pointer ptr,
+ * size totsize). Each list (described by an object List) is a range
+ * of memory that contains:
+ *
+ * - uint8_t codes[capacity * code_size]
+ * - followed by idx_t ids[capacity]
+ *
+ * in each of the arrays, the size <= capacity first elements are
+ * used, the rest is not initialized.
+ *
+ * Addition and resize are supported by:
+ * - rounding up the capacity of the lists to a power of two
+ * - maintaining a list of empty slots, sorted by size.
+ * - resizing the mmapped block as needed.
+ *
+ * An OnDiskInvertedLists is compact if the size == capacity for all
+ * lists and there are no available slots.
+ *
+ * Addition to the invlists is slow. For incremental add it is better
+ * to use a default ArrayInvertedLists object and convert it to an
+ * OnDisk with merge_from.
+ *
+ * When it is known that a set of lists will be accessed, it is useful
+ * to call prefetch_lists, that launches a set of threads to read the
+ * lists in parallel.
+ */
+struct OnDiskInvertedLists: InvertedLists {
+    struct List {
+        size_t size;     // size of inverted list (entries)
+        size_t capacity; // allocated size (entries)
+        size_t offset;   // offset in buffer (bytes)
+        List ();
+    };
+    // one List descriptor per inverted list (size nlist)
+    std::vector<List> lists;
+    struct Slot {
+        size_t offset;    // bytes
+        size_t capacity;  // bytes
+        Slot (size_t offset, size_t capacity);
+        Slot ();
+    };
+    // empty slots; size is whatever space remains
+    std::list<Slot> slots;
+    std::string filename;
+    size_t totsize; // size of the mmapped chunk (bytes, presumably -- TODO confirm unit)
+    uint8_t *ptr; // mmap base pointer
+    bool read_only;  /// are inverted lists mapped read-only
+    OnDiskInvertedLists (size_t nlist, size_t code_size,
+                         const char *filename);
+    size_t list_size(size_t list_no) const override;
+    const uint8_t * get_codes (size_t list_no) const override;
+    const idx_t * get_ids (size_t list_no) const override;
+    size_t add_entries (
+           size_t list_no, size_t n_entry,
+           const idx_t* ids, const uint8_t *code) override;
+    void update_entries (size_t list_no, size_t offset, size_t n_entry,
+                         const idx_t *ids, const uint8_t *code) override;
+    void resize (size_t list_no, size_t new_size) override;
+    // copy all inverted lists into *this, in compact form (without
+    // allocating slots)
+    size_t merge_from (const InvertedLists **ils, int n_il, bool verbose=false);
+    /// restrict the inverted lists to l0:l1 without touching the mmapped region
+    void crop_invlists(size_t l0, size_t l1);
+    void prefetch_lists (const idx_t *list_nos, int nlist) const override;
+    virtual ~OnDiskInvertedLists ();
+    // private (by convention only: the members below are still public)
+    LockLevels * locks;
+    // encapsulates the threads that are busy prefetching
+    struct OngoingPrefetch;
+    OngoingPrefetch *pf;
+    int prefetch_nthread;
+    void do_mmap ();
+    void update_totsize (size_t new_totsize);
+    void resize_locked (size_t list_no, size_t new_size);
+    size_t allocate_slot (size_t capacity);
+    void free_slot (size_t offset, size_t capacity);
+    // empty constructor for the I/O functions
+    OnDiskInvertedLists ();
+};
+} // namespace faiss
+#endif

data/vendor/faiss/VectorTransform.cpp ADDED Viewed

@@ -0,0 +1,1157 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#include <faiss/VectorTransform.h>
+#include <cstdio>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <faiss/utils/distances.h>
+#include <faiss/utils/random.h>
+#include <faiss/utils/utils.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/IndexPQ.h>
+using namespace faiss;
+extern "C" {
+// fallback definition of FINTEGER when none is provided; this is to keep the clang syntax checker happy
+#ifndef FINTEGER
+#define FINTEGER int
+#endif
+/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/
+ * These are the Fortran-style entry points: every argument, scalars
+ * included, is passed by pointer.
+ *  - sgemm_ / dgemm_: general matrix-matrix product (float / double)
+ *  - ssyrk_: symmetric rank-k update (float) */
+int sgemm_ (
+        const char *transa, const char *transb, FINTEGER *m, FINTEGER *
+        n, FINTEGER *k, const float *alpha, const float *a,
+        FINTEGER *lda, const float *b,
+        FINTEGER *ldb, float *beta,
+        float *c, FINTEGER *ldc);
+int dgemm_ (
+        const char *transa, const char *transb, FINTEGER *m, FINTEGER *
+        n, FINTEGER *k, const double *alpha, const double *a,
+        FINTEGER *lda, const double *b,
+        FINTEGER *ldb, double *beta,
+        double *c, FINTEGER *ldc);
+int ssyrk_ (
+        const char *uplo, const char *trans, FINTEGER *n, FINTEGER *k,
+        float *alpha, float *a, FINTEGER *lda,
+        float *beta, float *c, FINTEGER *ldc);
+/* Lapack functions from http://www.netlib.org/clapack/old/single/
+ *  - ssyev_ / dsyev_: eigen-decomposition of a symmetric matrix
+ *  - sgesvd_ / dgesvd_: singular value decomposition
+ * Both families report status through *info and accept lwork = -1 as a
+ * workspace-size query (used that way further down in this file). */
+int ssyev_ (
+        const char *jobz, const char *uplo, FINTEGER *n, float *a,
+        FINTEGER *lda, float *w, float *work, FINTEGER *lwork,
+        FINTEGER *info);
+int dsyev_ (
+        const char *jobz, const char *uplo, FINTEGER *n, double *a,
+        FINTEGER *lda, double *w, double *work, FINTEGER *lwork,
+        FINTEGER *info);
+int sgesvd_(
+        const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n,
+        float *a, FINTEGER *lda, float *s, float *u, FINTEGER *ldu, float *vt,
+        FINTEGER *ldvt, float *work, FINTEGER *lwork, FINTEGER *info);
+int dgesvd_(
+     const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n,
+     double *a, FINTEGER *lda, double *s, double *u, FINTEGER *ldu, double *vt,
+     FINTEGER *ldvt, double *work, FINTEGER *lwork, FINTEGER *info);
+}
+/*********************************************
+ * VectorTransform
+ *********************************************/
+/// Allocating wrapper around apply_noalloc.
+///
+/// @param n  number of input vectors
+/// @param x  input vectors, passed through to apply_noalloc
+/// @return   buffer of n * d_out floats allocated with new[]; the caller
+///           owns it and must release it with delete [].
+///
+/// The buffer is held by a unique_ptr while apply_noalloc runs so that it
+/// is freed if apply_noalloc throws (e.g. transform not trained yet).
+float * VectorTransform::apply (Index::idx_t n, const float * x) const
+{
+    std::unique_ptr<float[]> xt (new float[n * d_out]);
+    apply_noalloc (n, x, xt.get());
+    return xt.release();
+}
+/// Default implementation of train: both arguments are ignored and
+/// nothing is done.
+void VectorTransform::train (idx_t /*n*/, const float * /*x*/)
+{
+    // intentionally empty
+}
+/// Default implementation of reverse_transform: always throws, all
+/// arguments are ignored.
+void VectorTransform::reverse_transform (
+        idx_t /*n*/, const float * /*xt*/, float * /*x*/) const
+{
+    FAISS_THROW_MSG ("reverse transform not implemented");
+}
+/*********************************************
+ * LinearTransform
+ *********************************************/
+/// Both d_in > d_out and d_in < d_out are supported.
+/// The transform starts out untrained, non-orthonormal and non-verbose.
+LinearTransform::LinearTransform (int d_in, int d_out, bool have_bias):
+    VectorTransform (d_in, d_out),
+    have_bias (have_bias),
+    is_orthonormal (false),
+    verbose (false)
+{
+    // not trained until A and b are initialized
+    is_trained = false;
+}
+/// Applies the linear transform to n vectors, writing into a
+/// caller-provided buffer.
+///
+/// @param n   number of vectors
+/// @param x   input, n vectors of d_in floats
+/// @param xt  output, n vectors of d_out floats
+///
+/// Throws if the transform is not trained, or if A (d_out * d_in entries)
+/// or, when have_bias is set, b (d_out entries) do not have the expected
+/// size.
+void LinearTransform::apply_noalloc (Index::idx_t n, const float * x,
+                               float * xt) const
+{
+    FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet");
+    float c_factor;
+    if (have_bias) {
+        FAISS_THROW_IF_NOT_MSG (b.size() == d_out, "Bias not initialized");
+        // pre-fill every output vector with the bias; sgemm_ below then
+        // accumulates onto it (beta = 1)
+        float * xi = xt;
+        // idx_t counter: n is 64-bit, an int counter would overflow
+        for (Index::idx_t i = 0; i < n; i++)
+            for(int j = 0; j < d_out; j++)
+                *xi++ = b[j];
+        c_factor = 1.0;
+    } else {
+        // beta = 0: the previous contents of xt are ignored
+        c_factor = 0.0;
+    }
+    FAISS_THROW_IF_NOT_MSG (A.size() == d_out * d_in,
+                      "Transformation matrix not initialized");
+    float one = 1;
+    FINTEGER nbiti = d_out, ni = n, di = d_in;
+    sgemm_ ("Transposed", "Not transposed",
+            &nbiti, &ni, &di,
+            &one, A.data(), &di, x, &di, &c_factor, xt, &nbiti);
+}
+/// Multiplies n vectors of d_out floats (y) by A through sgemm_ without
+/// transposition flags, producing n vectors of d_in floats (x). When
+/// have_bias is set, b is subtracted from each input vector first.
+void LinearTransform::transform_transpose (idx_t n, const float * y,
+                                           float *x) const
+{
+    // owns the bias-corrected copy of y, if one is needed
+    std::unique_ptr<float[]> unbiased;
+    if (have_bias) {
+        unbiased.reset (new float [n * d_out]);
+        const float *src = y;
+        float *dst = unbiased.get ();
+        for (idx_t row = 0; row < n; row++) {
+            for (int col = 0; col < d_out; col++) {
+                *dst++ = *src++ - b [col];
+            }
+        }
+        y = unbiased.get ();
+    }
+    FINTEGER dii = d_in, doi = d_out, ni = n;
+    float one = 1.0, zero = 0.0;
+    sgemm_ ("Not", "Not", &dii, &ni, &doi,
+            &one, A.data (), &dii, y, &doi, &zero, x, &dii);
+}
+/// Sets is_orthonormal by checking whether the d_out * d_out product
+/// computed from A by sgemm_ is the identity, up to an absolute
+/// tolerance of 4e-5 per entry.
+void LinearTransform::set_is_orthonormal ()
+{
+    if (d_out > d_in) {
+        // not clear what we should do in this case
+        is_orthonormal = false;
+        return;
+    }
+    if (d_out == 0) {
+        // borderline case, unnormalized matrix
+        is_orthonormal = true;
+        return;
+    }
+    const double tolerance = 4e-5;
+    FAISS_ASSERT(A.size() >= d_out * d_in);
+    std::vector<float> gram (d_out * d_out);
+    {
+        FINTEGER dii = d_in, doi = d_out;
+        float one = 1.0, zero = 0.0;
+        sgemm_ ("Transposed", "Not", &doi, &doi, &dii,
+                &one, A.data (), &dii,
+                A.data(), &dii,
+                &zero, gram.data(), &doi);
+    }
+    is_orthonormal = true;
+    for (long col = 0; col < d_out; col++) {
+        for (long row = 0; row < d_out; row++) {
+            float v = gram[row + col * d_out];
+            if (row == col) v -= 1; // diagonal entries are compared to 1
+            if (fabs(v) > tolerance) {
+                // one bad entry is enough, no need to scan the rest
+                is_orthonormal = false;
+                return;
+            }
+        }
+    }
+}
+/// Reverse transform, available only when is_orthonormal is set: it is
+/// then delegated to transform_transpose. Throws otherwise.
+void LinearTransform::reverse_transform (idx_t n, const float * xt,
+                                         float *x) const
+{
+    if (!is_orthonormal) {
+        FAISS_THROW_MSG ("reverse transform not implemented for non-orthonormal matrices");
+    }
+    transform_transpose (n, xt, x);
+}
+/// Debug helper: when verbose is set, prints the first n * d entries of
+/// mat to stdout as n lines of d values. Does nothing otherwise.
+/// Throws (after the header line was printed) if mat holds fewer than
+/// n * d entries.
+void LinearTransform::print_if_verbose (
+         const char*name, const std::vector<double> &mat,
+         int n, int d) const
+{
+    if (!verbose) return;
+    printf("matrix %s: %d*%d [\n", name, n, d);
+    FAISS_THROW_IF_NOT (mat.size() >= n * d);
+    for (int row = 0; row < n; row++) {
+        const double *line = mat.data() + row * d;
+        for (int col = 0; col < d; col++) {
+            printf("%10.5g ", line[col]);
+        }
+        printf("\n");
+    }
+    printf("]\n");
+}
+/*********************************************
+ * RandomRotationMatrix
+ *********************************************/
+/// Fills A with a matrix derived from random Gaussian entries (seeded
+/// with seed) passed through matrix_qr, then marks the transform as
+/// orthonormal and trained.
+void RandomRotationMatrix::init (int seed)
+{
+    if (d_out > d_in) {
+        // use tight-frame transformation: start from a square
+        // d_out * d_out matrix
+        A.resize (d_out * d_out);
+        float *q = A.data();
+        float_randn(q, d_out * d_out, seed);
+        matrix_qr(d_out, d_out, q);
+        // remove columns: keep the first d_in entries of each row,
+        // compacting in place (destination never runs ahead of source)
+        for (int row = 0; row < d_out; row++) {
+            for (int col = 0; col < d_in; col++) {
+                q[row * d_in + col] = q[row * d_out + col];
+            }
+        }
+        A.resize(d_in * d_out);
+    } else {
+        A.resize (d_out * d_in);
+        float *q = A.data();
+        float_randn(q, d_out * d_in, seed);
+        matrix_qr(d_in, d_out, q);
+    }
+    is_orthonormal = true;
+    is_trained = true;
+}
+/// Training ignores the data: it only calls init with a fixed seed.
+void RandomRotationMatrix::train (Index::idx_t /*n*/, const float */*x*/)
+{
+    // some arbitrary seed
+    const int arbitrary_seed = 12345;
+    init (arbitrary_seed);
+}
+/*********************************************
+ * PCAMatrix
+ *********************************************/
+/// Builds an untrained PCA transform. The LinearTransform base is
+/// created with have_bias = true.
+PCAMatrix::PCAMatrix (int d_in, int d_out,
+                      float eigen_power, bool random_rotation):
+    LinearTransform(d_in, d_out, true),
+    eigen_power(eigen_power),
+    random_rotation(random_rotation)
+{
+    // defaults for the remaining parameters
+    balanced_bins = 0;
+    max_points_per_d = 1000;
+    is_trained = false;
+}
+namespace {
+/// Compute the eigenvalue decomposition of symmetric matrix cov,
+/// dimensions d_in-by-d_in. Output eigenvectors in cov.
+///
+/// @param d_in         matrix dimension
+/// @param cov          in: symmetric matrix; out: eigenvectors, one per
+///                     block of d_in consecutive entries
+/// @param eigenvalues  out: d_in eigenvalues, in the same order as the
+///                     eigenvectors
+/// @param verbose      when non-zero and d_in <= 10, dump the result
+///
+/// The order produced by dsyev_ is reverted before returning. A
+/// non-zero info from dsyev_ is only reported on stderr, no exception
+/// is thrown.
+void eig(size_t d_in, double *cov, double *eigenvalues, int verbose)
+{
+    { // compute eigenvalues and vectors
+        FINTEGER info = 0, lwork = -1, di = d_in;
+        // initialized so that a failed workspace query cannot leave
+        // garbage behind
+        double workq = 0;
+        // lwork == -1: workspace size query, the size comes back in workq
+        dsyev_ ("Vectors as well", "Upper",
+                &di, cov, &di, eigenvalues, &workq, &lwork, &info);
+        lwork = FINTEGER(workq);
+        if (lwork < 1) {
+            // query failed: dsyev_ will reject the size and report it
+            // through info below
+            lwork = 1;
+        }
+        std::vector<double> work (lwork);
+        dsyev_ ("Vectors as well", "Upper",
+                &di, cov, &di, eigenvalues, work.data(), &lwork, &info);
+        if (info != 0) {
+            fprintf (stderr, "WARN dsyev info returns %d, "
+                     "a very bad PCA matrix is learnt\n",
+                     int(info));
+            // do not throw exception, as the matrix could still be useful
+        }
+        if(verbose && d_in <= 10) {
+            printf("info=%ld new eigvals=[", long(info));
+            for(size_t j = 0; j < d_in; j++) printf("%g ", eigenvalues[j]);
+            printf("]\n");
+            double *ci = cov;
+            printf("eigenvecs=\n");
+            for(size_t i = 0; i < d_in; i++) {
+                for(size_t j = 0; j < d_in; j++)
+                    printf("%10.4g ", *ci++);
+                printf("\n");
+            }
+        }
+    }
+    // revert order of eigenvectors & values
+    for(size_t i = 0; i < d_in / 2; i++) {
+        std::swap(eigenvalues[i], eigenvalues[d_in - 1 - i]);
+        double *v1 = cov + i * d_in;
+        double *v2 = cov + (d_in - 1 - i) * d_in;
+        for(size_t j = 0; j < d_in; j++)
+            std::swap(v1[j], v2[j]);
+    }
+}
+}
+/// Trains the PCA on n vectors of d_in floats.
+///
+/// The training set may first be subsampled by fvecs_maybe_subsample
+/// (limit: max_points_per_d * d_in). The mean is computed when have_bias
+/// is set (it stays at zero otherwise), then:
+///  - if n >= d_in, the eigen-decomposition of the d_in * d_in matrix
+///    accumulated by ssyrk_ is stored in PCAMat / eigenvalues;
+///  - otherwise the n * n Gram matrix of the centered data is
+///    decomposed, and PCAMat receives the n renormalized vectors
+///    obtained by multiplying the centered data with the eigenvectors.
+///    Only the first n entries of eigenvalues are filled, the others
+///    are zero.
+/// Finally prepare_Ab() derives A and b, and is_trained is set.
+void PCAMatrix::train (Index::idx_t n, const float *x)
+{
+    const float * x_in = x;
+    {
+        // the helper receives the number of points through a size_t*:
+        // go through a real size_t rather than casting the address of n
+        size_t n_sub = n;
+        x = fvecs_maybe_subsample (d_in, &n_sub,
+                                   max_points_per_d * d_in, x, verbose);
+        n = n_sub;
+    }
+    // frees the subsampled copy, if one was made
+    ScopeDeleter<float> del_x (x != x_in ? x : nullptr);
+    // compute mean
+    mean.clear(); mean.resize(d_in, 0.0);
+    if (have_bias) { // we may want to skip the bias
+        const float *xi = x;
+        // idx_t counter: n is 64-bit
+        for (Index::idx_t i = 0; i < n; i++) {
+            for(int j = 0; j < d_in; j++)
+                mean[j] += *xi++;
+        }
+        for(int j = 0; j < d_in; j++)
+            mean[j] /= n;
+    }
+    if(verbose) {
+        printf("mean=[");
+        for(int j = 0; j < d_in; j++) printf("%g ", mean[j]);
+        printf("]\n");
+    }
+    if(n >= d_in) {
+        // compute covariance matrix, store it in PCA matrix
+        PCAMat.resize(d_in * d_in);
+        float * cov = PCAMat.data();
+        { // initialize with  mean * mean^T term
+            float *ci = cov;
+            for(int i = 0; i < d_in; i++) {
+                for(int j = 0; j < d_in; j++)
+                    *ci++ = - n * mean[i] * mean[j];
+            }
+        }
+        {
+            // beta = 1 on purpose: accumulate onto the mean term above
+            FINTEGER di = d_in, ni = n;
+            float one = 1.0;
+            ssyrk_ ("Up", "Non transposed",
+                    &di, &ni, &one, (float*)x, &di, &one, cov, &di);
+        }
+        if(verbose && d_in <= 10) {
+            float *ci = cov;
+            printf("cov=\n");
+            for(int i = 0; i < d_in; i++) {
+                for(int j = 0; j < d_in; j++)
+                    printf("%10g ", *ci++);
+                printf("\n");
+            }
+        }
+        // the decomposition itself is done in double precision
+        std::vector<double> covd (d_in * d_in);
+        for (size_t i = 0; i < d_in * d_in; i++) covd [i] = cov [i];
+        std::vector<double> eigenvaluesd (d_in);
+        eig (d_in, covd.data (), eigenvaluesd.data (), verbose);
+        for (size_t i = 0; i < d_in * d_in; i++) PCAMat [i] = covd [i];
+        eigenvalues.resize (d_in);
+        for (size_t i = 0; i < d_in; i++)
+            eigenvalues [i] = eigenvaluesd [i];
+    } else {
+        // fewer points than dimensions: work on the centered data
+        std::vector<float> xc (n * d_in);
+        for (size_t i = 0; i < n; i++)
+            for(size_t j = 0; j < d_in; j++)
+                xc [i * d_in + j] = x [i * d_in + j] - mean[j];
+        // compute Gram matrix
+        std::vector<float> gram (n * n);
+        {
+            FINTEGER di = d_in, ni = n;
+            float one = 1.0, zero = 0.0;
+            ssyrk_ ("Up", "Transposed",
+                    &ni, &di, &one, xc.data(), &di, &zero, gram.data(), &ni);
+        }
+        if(verbose && d_in <= 10) {
+            float *ci = gram.data();
+            printf("gram=\n");
+            for(int i = 0; i < n; i++) {
+                for(int j = 0; j < n; j++)
+                    printf("%10g ", *ci++);
+                printf("\n");
+            }
+        }
+        std::vector<double> gramd (n * n);
+        for (size_t i = 0; i < n * n; i++)
+            gramd [i] = gram [i];
+        std::vector<double> eigenvaluesd (n);
+        // eig will fill in only the n first eigenvals
+        eig (n, gramd.data (), eigenvaluesd.data (), verbose);
+        PCAMat.resize(d_in * n);
+        for (size_t i = 0; i < n * n; i++)
+            gram [i] = gramd [i];
+        // clear first so that no value from a previous training survives
+        // beyond the n first entries
+        eigenvalues.clear(); eigenvalues.resize (d_in);
+        // fill in only the n first ones
+        for (size_t i = 0; i < n; i++)
+            eigenvalues [i] = eigenvaluesd [i];
+        { // compute PCAMat = x' * v
+            FINTEGER di = d_in, ni = n;
+            // beta = 0: PCAMat is overwritten, whatever it held before
+            // (it is not empty when the object is trained a second time)
+            float one = 1.0, zero = 0.0;
+            sgemm_ ("Non", "Non Trans",
+                    &di, &ni, &ni,
+                    &one, xc.data(), &di, gram.data(), &ni,
+                    &zero, PCAMat.data(), &di);
+        }
+        if(verbose && d_in <= 10) {
+            float *ci = PCAMat.data();
+            printf("PCAMat=\n");
+            for(int i = 0; i < n; i++) {
+                for(int j = 0; j < d_in; j++)
+                    printf("%10g ", *ci++);
+                printf("\n");
+            }
+        }
+        fvec_renorm_L2 (d_in, n, PCAMat.data());
+    }
+    prepare_Ab();
+    is_trained = true;
+}
+/// Copies the trained state (mean, eigenvalues, PCAMat) of another
+/// PCAMatrix, then rebuilds A and b with this object's own parameters.
+/// Throws if other is not trained.
+void PCAMatrix::copy_from (const PCAMatrix & other)
+{
+    FAISS_THROW_IF_NOT (other.is_trained);
+    PCAMat = other.PCAMat;
+    eigenvalues = other.eigenvalues;
+    mean = other.mean;
+    prepare_Ab ();
+    is_trained = true;
+}
+/// Derives the LinearTransform parameters A and b from PCAMat,
+/// eigenvalues and mean.
+///
+///  - Without random rotation: A is the first d_out rows of PCAMat,
+///    each scaled by eigenvalue^eigen_power when eigen_power != 0, and
+///    optionally redistributed over balanced_bins groups of rows.
+///  - With random rotation: A is the product (computed by sgemm_) of
+///    PCAMat with a d_out * d_out RandomRotationMatrix whose entries
+///    were scaled by the same factors.
+///  - b[i] is minus the dot product of mean with row i of A.
+///
+/// Throws if PCAMat is too small for d_out * d_in entries, if
+/// balanced_bins does not divide d_out, if balanced_bins is combined
+/// with random_rotation, or if a component cannot be assigned to any
+/// bin.
+void PCAMatrix::prepare_Ab ()
+{
+    FAISS_THROW_IF_NOT_FMT (
+            d_out * d_in <= PCAMat.size(),
+            "PCA matrix cannot output %d dimensions from %d ",
+            d_out, d_in);
+    if (!random_rotation) {
+        A = PCAMat;
+        A.resize(d_out * d_in); // strip off useless dimensions
+        // first scale the components
+        if (eigen_power != 0) {
+            float *ai = A.data();
+            for (int i = 0; i < d_out; i++) {
+                float factor = pow(eigenvalues[i], eigen_power);
+                for(int j = 0; j < d_in; j++)
+                    *ai++ *= factor;
+            }
+        }
+        if (balanced_bins != 0) {
+            FAISS_THROW_IF_NOT (d_out % balanced_bins == 0);
+            int dsub = d_out / balanced_bins;
+            std::vector <float> Ain;
+            std::swap(A, Ain);
+            A.resize(d_out * d_in);
+            std::vector <float> accu(balanced_bins);
+            std::vector <int> counter(balanced_bins);
+            // greedy assignment
+            for (int i = 0; i < d_out; i++) {
+                // find best bin: the non-full bin with the smallest
+                // accumulated eigenvalue mass
+                int best_j = -1;
+                float min_w = 1e30;
+                for (int j = 0; j < balanced_bins; j++) {
+                    if (counter[j] < dsub && accu[j] < min_w) {
+                        min_w = accu[j];
+                        best_j = j;
+                    }
+                }
+                // no bin qualifies when the accumulators are NaN or
+                // >= 1e30 (e.g. non-finite eigenvalues); indexing with
+                // -1 would be out of bounds
+                FAISS_THROW_IF_NOT_MSG (best_j >= 0,
+                     "could not assign a PCA component to a bin");
+                int row_dst = best_j * dsub + counter[best_j];
+                accu[best_j] += eigenvalues[i];
+                counter[best_j] ++;
+                memcpy (&A[row_dst * d_in], &Ain[i * d_in],
+                        d_in * sizeof (A[0]));
+            }
+            if (verbose) {
+                printf("  bin accu=[");
+                for (int i = 0; i < balanced_bins; i++)
+                    printf("%g ", accu[i]);
+                printf("]\n");
+            }
+        }
+    } else {
+        FAISS_THROW_IF_NOT_MSG (balanced_bins == 0,
+             "both balancing bins and applying a random rotation "
+             "does not make sense");
+        RandomRotationMatrix rr(d_out, d_out);
+        rr.init(5);
+        // apply scaling on the rotation matrix (right multiplication)
+        if (eigen_power != 0) {
+            for (int i = 0; i < d_out; i++) {
+                float factor = pow(eigenvalues[i], eigen_power);
+                for(int j = 0; j < d_out; j++)
+                   rr.A[j * d_out + i] *= factor;
+            }
+        }
+        A.resize(d_in * d_out);
+        {
+            FINTEGER dii = d_in, doo = d_out;
+            float one = 1.0, zero = 0.0;
+            sgemm_ ("Not", "Not", &dii, &doo, &doo,
+                    &one, PCAMat.data(), &dii, rr.A.data(), &doo, &zero,
+                    A.data(), &dii);
+        }
+    }
+    // bias: b = - A * mean
+    b.clear(); b.resize(d_out);
+    for (int i = 0; i < d_out; i++) {
+        float accu = 0;
+        for (int j = 0; j < d_in; j++)
+            accu -= mean[j] * A[j + i * d_in];
+        b[i] = accu;
+    }
+    // flagged orthonormal only when no eigenvalue scaling was applied
+    is_orthonormal = eigen_power == 0;
+}
+/*********************************************
+ * ITQMatrix
+ *********************************************/
+/// Square d * d linear transform without bias; max_iter defaults to 50
+/// and seed to 123.
+ITQMatrix::ITQMatrix (int d):
+    LinearTransform(d, d, false), max_iter (50), seed (123)
+{}
+/** translated from fbcode/deeplearning/catalyzer/catalyzer/quantizers.py
+ *
+ * Learns the d x d ITQ rotation in double precision and stores its
+ * transpose, converted to float, in A.
+ *
+ * The starting rotation is init_rotation when it has exactly d * d
+ * entries, otherwise a RandomRotationMatrix initialized with `seed`.
+ * Each of the max_iter rounds rotates the data, replaces every rotated
+ * component by +/-1, and rebuilds the rotation from the SVD of the
+ * product between the binarized data and the input data.
+ *
+ * @param n   number of training vectors
+ * @param xf  training vectors, n * d floats
+ * @throws    via FAISS_THROW_IF_NOT* when dgesvd reports info != 0
+ */
+void ITQMatrix::train (Index::idx_t n, const float* xf)
+{
+    size_t d = d_in;
+    std::vector<double> rotation (d * d);
+    if (init_rotation.size() == d * d) {
+        memcpy (rotation.data(), init_rotation.data(),
+                d * d * sizeof(rotation[0]));
+    } else {
+        RandomRotationMatrix rrot (d, d);
+        rrot.init (seed);
+        for (size_t i = 0; i < d * d; i++) {
+            rotation[i] = rrot.A[i];
+        }
+    }
+    // work on a double-precision copy of the training set
+    std::vector<double> x (n * d);
+    for (size_t i = 0; i < n * d; i++) {
+        x[i] = xf[i];
+    }
+    std::vector<double> rotated_x (n * d), cov_mat (d * d);
+    std::vector<double> u (d * d), vt (d * d), singvals (d);
+    for (int i = 0; i < max_iter; i++) {
+        print_if_verbose ("rotation", rotation, d, d);
+        { // rotated_data = np.dot(training_data, rotation)
+            FINTEGER di = d, ni = n;
+            double one = 1, zero = 0;
+            dgemm_ ("N", "N", &di, &ni, &di,
+                    &one, rotation.data(), &di, x.data(), &di,
+                    &zero, rotated_x.data(), &di);
+        }
+        print_if_verbose ("rotated_x", rotated_x, n, d);
+        // binarize
+        for (size_t j = 0; j < n * d; j++) {
+            rotated_x[j] = rotated_x[j] < 0 ? -1 : 1;
+        }
+        // covariance matrix
+        { // cov_mat = binarized rotated data * x^T (column-major, d x d)
+            FINTEGER di = d, ni = n;
+            double one = 1, zero = 0;
+            dgemm_ ("N", "T", &di, &di, &ni,
+                    &one, rotated_x.data(), &di, x.data(), &di,
+                    &zero, cov_mat.data(), &di);
+        }
+        print_if_verbose ("cov_mat", cov_mat, d, d);
+        // SVD
+        {
+            FINTEGER di = d;
+            FINTEGER lwork = -1, info;
+            double lwork1;
+            // workspace query: lwork == -1 only returns the optimal size
+            dgesvd_ ("A", "A", &di, &di, cov_mat.data(), &di,
+                     singvals.data(), u.data(), &di,
+                     vt.data(), &di,
+                     &lwork1, &lwork, &info);
+            FAISS_THROW_IF_NOT (info == 0);
+            lwork = FINTEGER (lwork1);
+            std::vector<double> work (lwork);
+            dgesvd_ ("A", "A", &di, &di, cov_mat.data(), &di,
+                     singvals.data(), u.data(), &di,
+                     vt.data(), &di,
+                     work.data(), &lwork, &info);
+            // the routine called above is dgesvd, so name it correctly
+            FAISS_THROW_IF_NOT_FMT (info == 0, "dgesvd returned info=%d", info);
+        }
+        print_if_verbose ("u", u, d, d);
+        print_if_verbose ("vt", vt, d, d);
+        // update rotation: rotation = u * vt^T (column-major)
+        {
+            FINTEGER di = d;
+            double one = 1, zero = 0;
+            dgemm_ ("N", "T", &di, &di, &di,
+                    &one, u.data(), &di, vt.data(), &di,
+                    &zero, rotation.data(), &di);
+        }
+        print_if_verbose ("final rot", rotation, d, d);
+    }
+    // store the transposed rotation as the float matrix A
+    A.resize (d * d);
+    for (size_t i = 0; i < d; i++) {
+        for (size_t j = 0; j < d; j++) {
+            A[i + d * j] = rotation[j + d * i];
+        }
+    }
+    is_trained = true;
+}
+/// Sets up the ITQ stage (d_out dimensions) and the merged d_in -> d_out
+/// linear transform. When do_pca is false the two dimensions must match,
+/// otherwise the constructor throws. The transform starts untrained with
+/// max_train_per_dim = 10.
+ITQTransform::ITQTransform (int d_in, int d_out, bool do_pca)
+    : VectorTransform (d_in, d_out),
+      do_pca (do_pca),
+      itq (d_out),
+      pca_then_itq (d_in, d_out, false)
+{
+    is_trained = false;
+    max_train_per_dim = 10;
+    if (!do_pca) {
+        FAISS_THROW_IF_NOT (d_in == d_out);
+    }
+}
+/**
+ * Trains the centering, the optional PCA and the ITQ rotation, then folds
+ * PCA and ITQ into the single matrix pca_then_itq.A.
+ *
+ * The training set is passed through fvecs_maybe_subsample with a cap of
+ * max(d_in * max_train_per_dim, 32768) points, centered on its mean and
+ * L2-renormalized before PCA / ITQ are trained.
+ *
+ * @param n  number of training vectors
+ * @param x  training vectors, n * d_in floats
+ * @throws   if the transform is already trained
+ */
+void ITQTransform::train (idx_t n, const float *x)
+{
+    FAISS_THROW_IF_NOT (!is_trained);
+    const float * x_in = x;
+    size_t max_train_points = std::max(d_in * max_train_per_dim, 32768);
+    x = fvecs_maybe_subsample (d_in, (size_t*)&n, max_train_points, x);
+    // only release x when subsampling handed back a different buffer
+    ScopeDeleter<float> del_x (x != x_in ? x : nullptr);
+    std::unique_ptr<float []> x_norm(new float[n * d_in]);
+    { // normalize
+        int d = d_in;
+        // assign() rather than resize(): resize would keep any values
+        // already stored in mean and the sums below would start from them
+        mean.assign (d, 0.f);
+        for (idx_t i = 0; i < n; i++) {
+            for (idx_t j = 0; j < d; j++) {
+                mean[j] += x[i * d + j];
+            }
+        }
+        for (idx_t j = 0; j < d; j++) {
+            mean[j] /= n;
+        }
+        for (idx_t i = 0; i < n; i++) {
+            for (idx_t j = 0; j < d; j++) {
+                x_norm[i * d + j] = x[i * d + j] - mean[j];
+            }
+        }
+        fvec_renorm_L2 (d_in, n, x_norm.get());
+    }
+    // train PCA
+    PCAMatrix pca (d_in, d_out);
+    float *x_pca;
+    // owns the buffer returned by pca.apply(); stays empty without PCA
+    std::unique_ptr<float []> x_pca_del;
+    if (do_pca) {
+        pca.have_bias = false;  // for consistency with reference implem
+        pca.train (n, x_norm.get());
+        x_pca = pca.apply (n, x_norm.get());
+        x_pca_del.reset(x_pca);
+    } else {
+        x_pca = x_norm.get();
+    }
+    // train ITQ
+    itq.train (n, x_pca);
+    // merge PCA and ITQ
+    if (do_pca) {
+        FINTEGER di = d_out, dini = d_in;
+        float one = 1, zero = 0;
+        pca_then_itq.A.resize(d_in * d_out);
+        sgemm_ ("N", "N", &dini, &di, &di,
+                &one, pca.A.data(), &dini,
+                itq.A.data(), &di,
+                &zero, pca_then_itq.A.data(), &dini);
+    } else {
+        pca_then_itq.A = itq.A;
+    }
+    pca_then_itq.is_trained = true;
+    is_trained = true;
+}
+/// Applies the trained transform to n vectors: subtract the stored mean,
+/// L2-renormalize, then run the merged pca_then_itq linear transform,
+/// which writes into xt. Throws if the transform is not trained.
+void ITQTransform::apply_noalloc (Index::idx_t n, const float * x,
+                               float * xt) const
+{
+    FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet");
+    std::unique_ptr<float []> centered (new float[n * d_in]);
+    const int dim = d_in;
+    // center each input row on the training mean
+    for (idx_t row = 0; row < n; row++) {
+        const float *src = x + row * dim;
+        float *dst = centered.get() + row * dim;
+        for (idx_t col = 0; col < dim; col++) {
+            dst[col] = src[col] - mean[col];
+        }
+    }
+    // renormalizing matters little if the output gets binarized right
+    // afterwards, but it mirrors what train() does
+    fvec_renorm_L2 (d_in, n, centered.get());
+    pca_then_itq.apply_noalloc (n, centered.get(), xt);
+}
+/*********************************************
+ * OPQMatrix
+ *********************************************/
+/// OPQ rotation from d to d2 dimensions (d2 == -1 means "same as d") for
+/// a product quantizer with M sub-quantizers. No external quantizer is
+/// attached (pq is null) and the matrix starts untrained.
+OPQMatrix::OPQMatrix (int d, int M, int d2)
+    : LinearTransform (d, d2 == -1 ? d : d2, false), M(M),
+      niter (50),
+      niter_pq (4), niter_pq_0 (40),
+      verbose(false),
+      pq(nullptr)
+{
+    // OPQ training is costly, so cap the training set size up front
+    max_train_points = 256 * 256;
+    is_trained = false;
+}
+/**
+ * Trains the OPQ matrix A by alternating, niter times, between training
+ * a product quantizer on the projected data and recomputing the
+ * projection from an SVD.
+ *
+ * The training set goes through fvecs_maybe_subsample (cap:
+ * max_train_points), is centered, and is zero-padded to
+ * d = max(d_in, d_out) columns. If A is empty it is seeded with the QR
+ * factorization of a Gaussian matrix (fixed seed 1234); otherwise A must
+ * already hold d * d_out entries. When `pq` is set it is used (and
+ * modified: cp.niter, cp.max_points_per_centroid, verbose, train_type)
+ * instead of a default 8-bit ProductQuantizer.
+ *
+ * @param n  number of training vectors
+ * @param x  training vectors, n * d_in floats
+ * @throws   if a preset A has the wrong size, or if sgesvd reports
+ *           info != 0
+ */
+void OPQMatrix::train (Index::idx_t n, const float *x)
+{
+    const float * x_in = x;
+    x = fvecs_maybe_subsample (d_in, (size_t*)&n,
+                               max_train_points, x, verbose);
+    // only release x when subsampling handed back a different buffer
+    ScopeDeleter<float> del_x (x != x_in ? x : nullptr);
+    // To support d_out > d_in, we pad input vectors with 0s to d_out
+    size_t d = d_out <= d_in ? d_in : d_out;
+    size_t d2 = d_out;
+#if 0
+    // what this test shows: the only way of getting bit-exact
+    // reproducible results with sgeqrf and sgesvd seems to be forcing
+    // single-threading.
+    { // test repro
+        std::vector<float> r (d * d);
+        float * rotation = r.data();
+        float_randn (rotation, d * d, 1234);
+        printf("CS0: %016lx\n",
+               ivec_checksum (128*128, (int*)rotation));
+        matrix_qr (d, d, rotation);
+        printf("CS1: %016lx\n",
+               ivec_checksum (128*128, (int*)rotation));
+        return;
+    }
+#endif
+    if (verbose) {
+        printf ("OPQMatrix::train: training an OPQ rotation matrix "
+                "for M=%d from %ld vectors in %dD -> %dD\n",
+                M, n, d_in, d_out);
+    }
+    // zero-initialized, so the padding columns [d_in, d) stay at 0
+    std::vector<float> xtrain (n * d);
+    // center x
+    {
+        std::vector<float> sum (d);
+        const float *xi = x;
+        for (size_t i = 0; i < n; i++) {
+            for (int j = 0; j < d_in; j++)
+                sum [j] += *xi++;
+        }
+        for (size_t i = 0; i < d; i++) sum[i] /= n;
+        float *yi = xtrain.data();
+        xi = x;
+        for (size_t i = 0; i < n; i++) {
+            for (int j = 0; j < d_in; j++)
+                *yi++ = *xi++ - sum[j];
+            yi += d - d_in; // skip the padding columns
+        }
+    }
+    float *rotation;
+    if (A.size () == 0) {
+        A.resize (d * d);
+        rotation = A.data();
+        if (verbose)
+            // d is a size_t, so print it with %zu
+            printf("  OPQMatrix::train: making random %zu*%zu rotation\n",
+                   d, d);
+        float_randn (rotation, d * d, 1234);
+        matrix_qr (d, d, rotation);
+        // we use only the d * d2 upper part of the matrix
+        A.resize (d * d2);
+    } else {
+        FAISS_THROW_IF_NOT (A.size() == d * d2);
+        rotation = A.data();
+    }
+    std::vector<float>
+        xproj (d2 * n), pq_recons (d2 * n), xxr (d * n),
+        tmp(d * d * 4);
+    ProductQuantizer pq_default (d2, M, 8);
+    ProductQuantizer &pq_regular = pq ? *pq : pq_default;
+    std::vector<uint8_t> codes (pq_regular.code_size * n);
+    double t0 = getmillisecs();
+    for (int iter = 0; iter < niter; iter++) {
+        { // torch.mm(xtrain, rotation:t())
+            FINTEGER di = d, d2i = d2, ni = n;
+            float zero = 0, one = 1;
+            sgemm_ ("Transposed", "Not transposed",
+                    &d2i, &ni, &di,
+                    &one, rotation, &di,
+                    xtrain.data(), &di,
+                    &zero, xproj.data(), &d2i);
+        }
+        pq_regular.cp.max_points_per_centroid = 1000;
+        // the first round gets a longer PQ training than the later ones
+        pq_regular.cp.niter = iter == 0 ? niter_pq_0 : niter_pq;
+        pq_regular.verbose = verbose;
+        pq_regular.train (n, xproj.data());
+        if (verbose) {
+            printf("    encode / decode\n");
+        }
+        if (pq_regular.assign_index) {
+            pq_regular.compute_codes_with_assign_index
+                (xproj.data(), codes.data(), n);
+        } else {
+            pq_regular.compute_codes (xproj.data(), codes.data(), n);
+        }
+        pq_regular.decode (codes.data(), pq_recons.data(), n);
+        float pq_err = fvec_L2sqr (pq_recons.data(), xproj.data(), n * d2) / n;
+        if (verbose)
+            printf ("    Iteration %d (%d PQ iterations):"
+                    "%.3f s, obj=%g\n", iter, pq_regular.cp.niter,
+                    (getmillisecs () - t0) / 1000.0, pq_err);
+        {
+            // tmp is carved into u, vt and the singular values
+            float *u = tmp.data(), *vt = &tmp [d * d];
+            float *sing_val = &tmp [2 * d * d];
+            FINTEGER di = d, d2i = d2, ni = n;
+            float one = 1, zero = 0;
+            if (verbose) {
+                printf("    X * recons\n");
+            }
+            // torch.mm(xtrain:t(), pq_recons)
+            sgemm_ ("Not", "Transposed",
+                    &d2i, &di, &ni,
+                    &one, pq_recons.data(), &d2i,
+                    xtrain.data(), &di,
+                    &zero, xxr.data(), &d2i);
+            FINTEGER lwork = -1, info = -1;
+            float worksz;
+            // workspace query
+            sgesvd_ ("All", "All",
+                     &d2i, &di, xxr.data(), &d2i,
+                     sing_val,
+                     vt, &d2i, u, &di,
+                     &worksz, &lwork, &info);
+            // do not size the workspace from a failed query
+            FAISS_THROW_IF_NOT_FMT (info == 0,
+                     "sgesvd workspace query returned info=%d", int (info));
+            lwork = int(worksz);
+            std::vector<float> work (lwork);
+            // u and vt swapped
+            sgesvd_ ("All", "All",
+                     &d2i, &di, xxr.data(), &d2i,
+                     sing_val,
+                     vt, &d2i, u, &di,
+                     work.data(), &lwork, &info);
+            // a failed SVD would otherwise silently corrupt the rotation
+            FAISS_THROW_IF_NOT_FMT (info == 0,
+                     "sgesvd returned info=%d", int (info));
+            sgemm_ ("Transposed", "Transposed",
+                    &di, &d2i, &d2i,
+                    &one, u, &di, vt, &d2i,
+                    &zero, rotation, &di);
+        }
+        // later rounds start from the centroids of the previous one
+        pq_regular.train_type = ProductQuantizer::Train_hot_start;
+    }
+    // revert A matrix: drop the padding so each row has d_in entries again
+    if (d > d_in) {
+        for (long i = 0; i < d_out; i++)
+            memmove (&A[i * d_in], &A[i * d], sizeof(A[0]) * d_in);
+        A.resize (d_in * d_out);
+    }
+    is_trained = true;
+    is_orthonormal = true;
+}
+/*********************************************
+ * NormalizationTransform
+ *********************************************/
+/// Normalization over d-dimensional vectors (input and output sizes are
+/// both d); `norm` selects which norm apply_noalloc uses.
+NormalizationTransform::NormalizationTransform (int d, float norm)
+    : VectorTransform (d, d),
+      norm (norm)
+{}
+/// Default constructor: dimensions and norm are all set to -1.
+NormalizationTransform::NormalizationTransform ()
+    : VectorTransform (-1, -1),
+      norm (-1)
+{}
+/// Copies the n input vectors into xt and L2-renormalizes them in place.
+/// Only norm == 2.0 is supported; any other value throws.
+void NormalizationTransform::apply_noalloc
+      (idx_t n, const float* x, float* xt) const
+{
+    if (norm != 2.0) {
+        FAISS_THROW_MSG ("not implemented");
+    }
+    memcpy (xt, x, sizeof (x[0]) * n * d_in);
+    fvec_renorm_L2 (d_in, n, xt);
+}
+/// Copies xt to x unchanged (n * d_in floats); the original norms are
+/// not restored.
+void NormalizationTransform::reverse_transform (idx_t n, const float* xt,
+                                                float* x) const
+{
+    const size_t n_bytes = sizeof (xt[0]) * n * d_in;
+    memcpy (x, xt, n_bytes);
+}
+/*********************************************
+ * CenteringTransform
+ *********************************************/
+/// Centering over d-dimensional vectors; starts untrained because the
+/// mean is only computed by train().
+CenteringTransform::CenteringTransform (int d)
+    : VectorTransform (d, d)
+{
+    is_trained = false;
+}
+/**
+ * Computes the per-dimension mean of the training vectors.
+ *
+ * @param n  number of training vectors, must be > 0
+ * @param x  training vectors, n * d_in floats
+ * @throws   if n <= 0
+ */
+void CenteringTransform::train(Index::idx_t n, const float *x) {
+    FAISS_THROW_IF_NOT_MSG(n > 0, "need at least one training vector");
+    // assign() rather than resize(): resize keeps existing entries, so a
+    // second call to train() would add the new sums to the previous mean
+    mean.assign (d_in, 0.f);
+    for (idx_t i = 0; i < n; i++) {
+        for (size_t j = 0; j < d_in; j++) {
+            mean[j] += *x++;
+        }
+    }
+    for (size_t j = 0; j < d_in; j++) {
+        mean[j] /= n;
+    }
+    is_trained = true;
+}
+/// Writes x minus the trained mean into xt for n vectors of d_in
+/// components. Throws if the transform is not trained.
+void CenteringTransform::apply_noalloc
+      (idx_t n, const float* x, float* xt) const
+{
+    FAISS_THROW_IF_NOT (is_trained);
+    for (idx_t row = 0; row < n; row++) {
+        const float *src = x + row * d_in;
+        float *dst = xt + row * d_in;
+        for (size_t col = 0; col < d_in; col++) {
+            dst[col] = src[col] - mean[col];
+        }
+    }
+}
+/// Inverse of apply_noalloc: adds the trained mean back to each of the n
+/// vectors in xt and stores the result in x. Throws if not trained.
+void CenteringTransform::reverse_transform (idx_t n, const float* xt,
+                                                float* x) const
+{
+    FAISS_THROW_IF_NOT (is_trained);
+    for (idx_t row = 0; row < n; row++) {
+        const float *src = xt + row * d_in;
+        float *dst = x + row * d_in;
+        for (size_t col = 0; col < d_in; col++) {
+            dst[col] = src[col] + mean[col];
+        }
+    }
+}
+/*********************************************
+ * RemapDimensionsTransform
+ *********************************************/
+/// Builds the remapping from an explicit table: map_in[i] is the input
+/// component copied to output component i, or -1 for none. Any entry
+/// outside {-1} U [0, d_in) makes the constructor throw.
+RemapDimensionsTransform::RemapDimensionsTransform (
+        int d_in, int d_out, const int *map_in)
+    : VectorTransform (d_in, d_out)
+{
+    map.resize (d_out);
+    int i = 0;
+    while (i < d_out) {
+        map[i] = map_in[i];
+        FAISS_THROW_IF_NOT (map[i] == -1 || (map[i] >= 0 && map[i] < d_in));
+        ++i;
+    }
+}
+/// Builds a default remapping. With uniform == false the first
+/// min(d_in, d_out) components map to themselves. With uniform == true
+/// the components are spread evenly: input i goes to output
+/// i * d_out / d_in when expanding, and output i reads input
+/// i * d_in / d_out otherwise. Unassigned outputs stay at -1.
+RemapDimensionsTransform::RemapDimensionsTransform (
+      int d_in, int d_out, bool uniform)
+    : VectorTransform (d_in, d_out)
+{
+    // -1 marks an output slot that has no source component
+    map.resize (d_out, -1);
+    if (!uniform) {
+        const int n_common = d_in < d_out ? d_in : d_out;
+        for (int k = 0; k < n_common; k++) {
+            map [k] = k;
+        }
+        return;
+    }
+    if (d_in < d_out) {
+        // expanding: scatter the inputs over the larger output
+        for (int src = 0; src < d_in; src++) {
+            map [src * d_out / d_in] = src;
+        }
+        return;
+    }
+    // shrinking (or same size): each output picks one input
+    for (int dst = 0; dst < d_out; dst++) {
+        map [dst] = dst * d_in / d_out;
+    }
+}
+/// For each of the n vectors, output component j is input component
+/// map[j], or 0 when map[j] is negative.
+void RemapDimensionsTransform::apply_noalloc (idx_t n, const float * x,
+                                              float *xt) const
+{
+    for (idx_t row = 0; row < n; row++) {
+        const float *src = x + row * d_in;
+        float *dst = xt + row * d_out;
+        for (int j = 0; j < d_out; j++) {
+            const int from = map[j];
+            if (from < 0) {
+                dst[j] = 0;
+            } else {
+                dst[j] = src[from];
+            }
+        }
+    }
+}
+/// Scatters each transformed vector back to its input layout: x is
+/// zero-filled first, then x[map[j]] receives xt[j] for every mapped
+/// output component j. Input components that no output reads stay 0.
+void RemapDimensionsTransform::reverse_transform (idx_t n, const float * xt,
+                                                  float *x) const
+{
+    memset (x, 0, sizeof (*x) * n * d_in);
+    for (idx_t row = 0; row < n; row++) {
+        const float *src = xt + row * d_out;
+        float *dst = x + row * d_in;
+        for (int j = 0; j < d_out; j++) {
+            const int to = map[j];
+            if (to >= 0) {
+                dst[to] = src[j];
+            }
+        }
+    }
+}