RubyGems - faiss - Versions diffs - 0.1.3 → 0.2.0 - Mend

faiss 0.1.3 → 0.2.0

Files changed (199) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +25 -0
data/LICENSE.txt +1 -1
data/README.md +16 -4
data/ext/faiss/ext.cpp +12 -308
data/ext/faiss/extconf.rb +6 -3
data/ext/faiss/index.cpp +189 -0
data/ext/faiss/index_binary.cpp +75 -0
data/ext/faiss/kmeans.cpp +40 -0
data/ext/faiss/numo.hpp +867 -0
data/ext/faiss/pca_matrix.cpp +33 -0
data/ext/faiss/product_quantizer.cpp +53 -0
data/ext/faiss/utils.cpp +13 -0
data/ext/faiss/utils.h +5 -0
data/lib/faiss.rb +0 -5
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +36 -33
data/vendor/faiss/faiss/AutoTune.h +6 -3
data/vendor/faiss/faiss/Clustering.cpp +16 -12
data/vendor/faiss/faiss/Index.cpp +3 -4
data/vendor/faiss/faiss/Index.h +3 -3
data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
data/vendor/faiss/faiss/IndexBinary.h +1 -1
data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
data/vendor/faiss/faiss/IndexFlat.h +0 -51
data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
data/vendor/faiss/faiss/IndexIVF.h +22 -15
data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
data/vendor/faiss/faiss/IndexRefine.h +73 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
data/vendor/faiss/faiss/impl/io.cpp +33 -2
data/vendor/faiss/faiss/impl/io.h +7 -2
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
data/vendor/faiss/faiss/index_factory.cpp +112 -7
data/vendor/faiss/faiss/index_io.h +1 -48
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
data/vendor/faiss/faiss/utils/Heap.h +61 -50
data/vendor/faiss/faiss/utils/distances.cpp +164 -319
data/vendor/faiss/faiss/utils/distances.h +28 -20
data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
data/vendor/faiss/faiss/utils/hamming.h +2 -7
data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
data/vendor/faiss/faiss/utils/partitioning.h +69 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
data/vendor/faiss/faiss/utils/simdlib.h +31 -0
data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
metadata +54 -149
data/lib/faiss/index.rb +0 -20
data/lib/faiss/index_binary.rb +0 -20
data/lib/faiss/kmeans.rb +0 -15
data/lib/faiss/pca_matrix.rb +0 -15
data/lib/faiss/product_quantizer.rb +0 -22
data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
data/vendor/faiss/c_api/AutoTune_c.h +0 -66
data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
data/vendor/faiss/c_api/Clustering_c.h +0 -123
data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
data/vendor/faiss/c_api/IndexShards_c.h +0 -39
data/vendor/faiss/c_api/Index_c.cpp +0 -105
data/vendor/faiss/c_api/Index_c.h +0 -183
data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
data/vendor/faiss/c_api/clone_index_c.h +0 -32
data/vendor/faiss/c_api/error_c.h +0 -42
data/vendor/faiss/c_api/error_impl.cpp +0 -27
data/vendor/faiss/c_api/error_impl.h +0 -16
data/vendor/faiss/c_api/faiss_c.h +0 -58
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
data/vendor/faiss/c_api/index_factory_c.h +0 -30
data/vendor/faiss/c_api/index_io_c.cpp +0 -42
data/vendor/faiss/c_api/index_io_c.h +0 -50
data/vendor/faiss/c_api/macros_impl.h +0 -110
data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
data/vendor/faiss/misc/test_blas.cpp +0 -87
data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
data/vendor/faiss/tests/test_merge.cpp +0 -260
data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
data/vendor/faiss/tests/test_params_override.cpp +0 -236
data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104

data/vendor/faiss/faiss/utils/distances.h CHANGED Viewed

@@ -156,6 +156,14 @@ void pairwise_indexed_inner_product (
 // threshold on nx above which we switch to BLAS to compute distances
 FAISS_API extern int distance_compute_blas_threshold;
+// block sizes for BLAS distance computations
+FAISS_API extern int distance_compute_blas_query_bs;
+FAISS_API extern int distance_compute_blas_database_bs;
+// above this number of results we switch to a reservoir to collect results
+// rather than a heap
+FAISS_API extern int distance_compute_min_k_reservoir;
 /** Return the k nearest neighbors of each of the nx vectors x among the ny
  *  vectors y, w.r.t. max inner product
  *
@@ -169,27 +177,17 @@ void knn_inner_product (
         size_t d, size_t nx, size_t ny,
         float_minheap_array_t * res);
-/** Same as knn_inner_product, for the L2 distance */
+/** Same as knn_inner_product, for the L2 distance
+ *  @param y_norm2    norms for the y vectors (nullptr or size ny)
+ */
 void knn_L2sqr (
         const float * x,
         const float * y,
         size_t d, size_t nx, size_t ny,
-        float_maxheap_array_t * res);
+        float_maxheap_array_t * res,
+        const float *y_norm2 = nullptr);
-/** same as knn_L2sqr, but base_shift[bno] is subtracted to all
- * computed distances.
- *
- * @param base_shift   size ny
- */
-void knn_L2sqr_base_shift (
-         const float * x,
-         const float * y,
-         size_t d, size_t nx, size_t ny,
-         float_maxheap_array_t * res,
-         const float *base_shift);
 /* Find the nearest neighbors for nx queries in a set of ny vectors
  * indexed by ids. May be useful for re-ranking a pre-selected vector list
  */
@@ -200,11 +198,12 @@ void knn_inner_products_by_idx (
         size_t d, size_t nx, size_t ny,
         float_minheap_array_t * res);
-void knn_L2sqr_by_idx (const float * x,
-                       const float * y,
-                       const int64_t * ids,
-                       size_t d, size_t nx, size_t ny,
-                       float_maxheap_array_t * res);
+void knn_L2sqr_by_idx (
+        const float * x,
+        const float * y,
+        const int64_t * ids,
+        size_t d, size_t nx, size_t ny,
+        float_maxheap_array_t * res);
 /***************************************************************************
  * Range search
@@ -239,6 +238,15 @@ void range_search_inner_product (
         RangeSearchResult *result);
+/***************************************************************************
+ * PQ tables computations
+ ***************************************************************************/
+/// specialized function for PQ with 2-dimensional sub-vectors (dsub = 2)
+void compute_PQ_dis_tables_dsub2(
+        size_t d, size_t ksub, const float *centroids,
+        size_t nx, const float * x,
+        bool is_inner_product,
+        float * dis_tables);
 } // namespace faiss

data/vendor/faiss/faiss/utils/distances_simd.cpp CHANGED Viewed

@@ -14,6 +14,9 @@
 #include <cstring>
 #include <cmath>
+#include <faiss/utils/simdlib.h>
+#include <faiss/impl/FaissAssert.h>
 #ifdef __SSE3__
 #include <immintrin.h>
 #endif
@@ -127,6 +130,29 @@ void fvec_L2sqr_ny_ref (float * dis,
 }
+void fvec_inner_products_ny_ref (float * ip,
+                             const float * x,
+                             const float * y,
+                             size_t d, size_t ny)
+{
+    // BLAS slower for the use cases here
+#if 0
+    {
+        FINTEGER di = d;
+        FINTEGER nyi = ny;
+        float one = 1.0, zero = 0.0;
+        FINTEGER onei = 1;
+        sgemv_ ("T", &di, &nyi, &one, y, &di, x, &onei, &zero, ip, &onei);
+    }
+#endif
+    for (size_t i = 0; i < ny; i++) {
+        ip[i] = fvec_inner_product (x, y, d);
+        y += d;
+    }
+}
 /*********************************************************
@@ -174,12 +200,39 @@ float fvec_norm_L2sqr (const float *  x,
 namespace {
-float sqr (float x) {
-    return x * x;
-}
+/// Struct whose static op() overloads do a component-wise operation between
+/// x and y to compute L2 distances. It can be passed as the ElementOp
+/// template argument of the fvec_op_ny_* functions below
+struct ElementOpL2 {
+    static float op (float x, float y) {
+        float tmp = x - y;
+        return tmp * tmp;
+    }
+    static __m128 op (__m128 x, __m128 y) {
+        __m128 tmp = x - y;
+        return tmp * tmp;
+    }
+};
+/// Struct whose static op() overloads do a component-wise operation between
+/// x and y to compute inner products
+struct ElementOpIP {
-void fvec_L2sqr_ny_D1 (float * dis, const float * x,
+    static float op (float x, float y) {
+        return x * y;
+    }
+    static __m128 op (__m128 x, __m128 y) {
+        return x * y;
+    }
+};
+template<class ElementOp>
+void fvec_op_ny_D1 (float * dis, const float * x,
                        const float * y, size_t ny)
 {
     float x0s = x[0];
@@ -187,11 +240,9 @@ void fvec_L2sqr_ny_D1 (float * dis, const float * x,
     size_t i;
     for (i = 0; i + 3 < ny; i += 4) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
         dis[i] = _mm_cvtss_f32 (accu);
-        tmp = _mm_shuffle_ps (accu, accu, 1);
+        __m128 tmp = _mm_shuffle_ps (accu, accu, 1);
         dis[i + 1] = _mm_cvtss_f32 (tmp);
         tmp = _mm_shuffle_ps (accu, accu, 2);
         dis[i + 2] = _mm_cvtss_f32 (tmp);
@@ -199,69 +250,63 @@ void fvec_L2sqr_ny_D1 (float * dis, const float * x,
         dis[i + 3] = _mm_cvtss_f32 (tmp);
     }
     while (i < ny) { // handle non-multiple-of-4 case
-        dis[i++] = sqr(x0s - *y++);
+        dis[i++] = ElementOp::op(x0s, *y++);
     }
 }
-void fvec_L2sqr_ny_D2 (float * dis, const float * x,
+template<class ElementOp>
+void fvec_op_ny_D2 (float * dis, const float * x,
                        const float * y, size_t ny)
 {
     __m128 x0 = _mm_set_ps (x[1], x[0], x[1], x[0]);
     size_t i;
     for (i = 0; i + 1 < ny; i += 2) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
         accu = _mm_hadd_ps (accu, accu);
         dis[i] = _mm_cvtss_f32 (accu);
         accu = _mm_shuffle_ps (accu, accu, 3);
         dis[i + 1] = _mm_cvtss_f32 (accu);
     }
     if (i < ny) { // handle odd case
-        dis[i] = sqr(x[0] - y[0]) + sqr(x[1] - y[1]);
+        dis[i] = ElementOp::op(x[0], y[0]) + ElementOp::op(x[1], y[1]);
     }
 }
-void fvec_L2sqr_ny_D4 (float * dis, const float * x,
+template<class ElementOp>
+void fvec_op_ny_D4 (float * dis, const float * x,
                         const float * y, size_t ny)
 {
     __m128 x0 = _mm_loadu_ps(x);
     for (size_t i = 0; i < ny; i++) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
         accu = _mm_hadd_ps (accu, accu);
         accu = _mm_hadd_ps (accu, accu);
         dis[i] = _mm_cvtss_f32 (accu);
     }
 }
-void fvec_L2sqr_ny_D8 (float * dis, const float * x,
+template<class ElementOp>
+void fvec_op_ny_D8 (float * dis, const float * x,
                         const float * y, size_t ny)
 {
     __m128 x0 = _mm_loadu_ps(x);
     __m128 x1 = _mm_loadu_ps(x + 4);
     for (size_t i = 0; i < ny; i++) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
-        tmp = x1 - _mm_loadu_ps (y); y += 4;
-        accu += tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
+        accu       += ElementOp::op(x1, _mm_loadu_ps (y)); y += 4;
         accu = _mm_hadd_ps (accu, accu);
         accu = _mm_hadd_ps (accu, accu);
         dis[i] = _mm_cvtss_f32 (accu);
     }
 }
-void fvec_L2sqr_ny_D12 (float * dis, const float * x,
+template<class ElementOp>
+void fvec_op_ny_D12 (float * dis, const float * x,
                         const float * y, size_t ny)
 {
     __m128 x0 = _mm_loadu_ps(x);
@@ -269,13 +314,9 @@ void fvec_L2sqr_ny_D12 (float * dis, const float * x,
     __m128 x2 = _mm_loadu_ps(x + 8);
     for (size_t i = 0; i < ny; i++) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
-        tmp = x1 - _mm_loadu_ps (y); y += 4;
-        accu += tmp * tmp;
-        tmp = x2 - _mm_loadu_ps (y); y += 4;
-        accu += tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
+        accu       += ElementOp::op(x1, _mm_loadu_ps (y)); y += 4;
+        accu       += ElementOp::op(x2, _mm_loadu_ps (y)); y += 4;
         accu = _mm_hadd_ps (accu, accu);
         accu = _mm_hadd_ps (accu, accu);
         dis[i] = _mm_cvtss_f32 (accu);
@@ -283,31 +324,52 @@ void fvec_L2sqr_ny_D12 (float * dis, const float * x,
 }
 } // anonymous namespace
 void fvec_L2sqr_ny (float * dis, const float * x,
                         const float * y, size_t d, size_t ny) {
     // optimized for a few special cases
-    switch(d) {
-    case 1:
-        fvec_L2sqr_ny_D1 (dis, x, y, ny);
-        return;
-    case 2:
-        fvec_L2sqr_ny_D2 (dis, x, y, ny);
-        return;
-    case 4:
-        fvec_L2sqr_ny_D4 (dis, x, y, ny);
+#define DISPATCH(dval) \
+    case dval:\
+        fvec_op_ny_D ## dval <ElementOpL2> (dis, x, y, ny); \
         return;
-    case 8:
-        fvec_L2sqr_ny_D8 (dis, x, y, ny);
+    switch(d) {
+        DISPATCH(1)
+        DISPATCH(2)
+        DISPATCH(4)
+        DISPATCH(8)
+        DISPATCH(12)
+    default:
+        fvec_L2sqr_ny_ref (dis, x, y, d, ny);
         return;
-    case 12:
-        fvec_L2sqr_ny_D12 (dis, x, y, ny);
+    }
+#undef DISPATCH
+}
+void fvec_inner_products_ny (float * dis, const float * x,
+                        const float * y, size_t d, size_t ny) {
+#define DISPATCH(dval) \
+    case dval:\
+        fvec_op_ny_D ## dval <ElementOpIP> (dis, x, y, ny); \
         return;
+    switch(d) {
+        DISPATCH(1)
+        DISPATCH(2)
+        DISPATCH(4)
+        DISPATCH(8)
+        DISPATCH(12)
     default:
-        fvec_L2sqr_ny_ref (dis, x, y, d, ny);
+        fvec_inner_products_ny_ref (dis, x, y, d, ny);
         return;
     }
+#undef DISPATCH
 }
@@ -644,6 +706,11 @@ void fvec_L2sqr_ny (float * dis, const float * x,
     fvec_L2sqr_ny_ref (dis, x, y, d, ny);
 }
+void fvec_inner_products_ny (float * dis, const float * x,
+                        const float * y, size_t d, size_t ny) {
+    fvec_inner_products_ny_ref (dis, x, y, d, ny);
+}
 #endif
@@ -803,6 +870,167 @@ int fvec_madd_and_argmin (size_t n, const float *a,
 #endif
+/***************************************************************************
+ * PQ tables computations
+ ***************************************************************************/
+#ifdef __AVX2__
+namespace {
+// get even float32's of a and b, interleaved
+simd8float32 geteven(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+        _mm256_shuffle_ps(a.f, b.f, 0 << 0 | 2 << 2 | 0 << 4 | 2 << 6)
+    );
+}
+// get odd float32's of a and b, interleaved
+simd8float32 getodd(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+        _mm256_shuffle_ps(a.f, b.f, 1 << 0 | 3 << 2 | 1 << 4 | 3 << 6)
+    );
+}
+// 3 cycles
+// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
+simd8float32 getlow128(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+        _mm256_permute2f128_ps(a.f, b.f, 0 | 2 << 4)
+    );
+}
+simd8float32 gethigh128(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+        _mm256_permute2f128_ps(a.f, b.f, 1 | 3 << 4)
+    );
+}
+/// compute the inner products (or the squared L2 distances when
+/// is_inner_product is false) for dsub = 2, for 8 centroids and
+/// 4 sub-vectors at a time
+template<bool is_inner_product>
+void pq2_8cents_table(
+        const simd8float32 centroids[8],
+        const simd8float32 x,
+        float *out, size_t ldo, size_t nout = 4
+) {
+    simd8float32 ips[4];
+    for(int i = 0; i < 4; i++) {
+        simd8float32 p1, p2;
+        if (is_inner_product) {
+            p1 = x * centroids[2 * i];
+            p2 = x * centroids[2 * i + 1];
+        } else {
+            p1 = (x - centroids[2 * i]);
+            p1 = p1 * p1;
+            p2 = (x - centroids[2 * i + 1]);
+            p2 = p2 * p2;
+        }
+        ips[i] = hadd(p1, p2);
+    }
+    simd8float32 ip02a = geteven(ips[0], ips[1]);
+    simd8float32 ip02b = geteven(ips[2], ips[3]);
+    simd8float32 ip0 = getlow128(ip02a, ip02b);
+    simd8float32 ip2 = gethigh128(ip02a, ip02b);
+    simd8float32 ip13a = getodd(ips[0], ips[1]);
+    simd8float32 ip13b = getodd(ips[2], ips[3]);
+    simd8float32 ip1 = getlow128(ip13a, ip13b);
+    simd8float32 ip3 = gethigh128(ip13a, ip13b);
+    switch(nout) {
+    case 4:
+        ip3.storeu(out + 3 * ldo);
+    case 3:
+        ip2.storeu(out + 2 * ldo);
+    case 2:
+        ip1.storeu(out + 1 * ldo);
+    case 1:
+        ip0.storeu(out);
+    }
+}
+simd8float32 load_simd8float32_partial(const float *x, int n) {
+    ALIGNED(32) float tmp[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    float *wp = tmp;
+    for (int i = 0; i < n; i++) {
+        *wp++ = *x++;
+    }
+    return simd8float32(tmp);
+}
+} // anonymous namespace
+void compute_PQ_dis_tables_dsub2(
+        size_t d, size_t ksub, const float *all_centroids,
+        size_t nx, const float * x,
+        bool is_inner_product,
+        float * dis_tables)
+{
+    size_t M = d / 2;
+    FAISS_THROW_IF_NOT(ksub % 8 == 0);
+    for(size_t m0 = 0; m0 < M; m0 += 4) {
+        int m1 = std::min(M, m0 + 4);
+        for(int k0 = 0; k0 < ksub; k0 += 8) {
+            simd8float32 centroids[8];
+            for (int k = 0; k < 8; k++) {
+                float centroid[8] __attribute__((aligned(32)));
+                size_t wp = 0;
+                size_t rp = (m0 * ksub + k + k0) * 2;
+                for (int m = m0; m < m1; m++) {
+                    centroid[wp++] = all_centroids[rp];
+                    centroid[wp++] = all_centroids[rp + 1];
+                    rp += 2 * ksub;
+                }
+                centroids[k] = simd8float32(centroid);
+            }
+            for(size_t i = 0; i < nx; i++) {
+                simd8float32 xi;
+                if (m1 == m0 + 4) {
+                    xi.loadu(x + i * d + m0 * 2);
+                } else {
+                    xi = load_simd8float32_partial(x + i * d + m0 * 2, 2 * (m1 - m0));
+                }
+                if(is_inner_product) {
+                    pq2_8cents_table<true>(
+                        centroids, xi,
+                        dis_tables + (i * M + m0) * ksub + k0,
+                        ksub, m1 - m0
+                    );
+                } else {
+                    pq2_8cents_table<false>(
+                        centroids, xi,
+                        dis_tables + (i * M + m0) * ksub + k0,
+                        ksub, m1 - m0
+                    );
+                }
+            }
+        }
+    }
+}
+#else
+void compute_PQ_dis_tables_dsub2(
+        size_t d, size_t ksub, const float *all_centroids,
+        size_t nx, const float * x,
+        bool is_inner_product,
+        float * dis_tables)
+{
+    FAISS_THROW_MSG("only implemented for AVX2");
+}
+#endif
 } // namespace faiss