RubyGems - faiss - Versions diffs - 0.1.3 → 0.1.4 - Mend

faiss 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (184) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/ext/faiss/extconf.rb +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +36 -33
data/vendor/faiss/faiss/AutoTune.h +6 -3
data/vendor/faiss/faiss/Clustering.cpp +16 -12
data/vendor/faiss/faiss/Index.cpp +3 -4
data/vendor/faiss/faiss/Index.h +3 -3
data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
data/vendor/faiss/faiss/IndexBinary.h +1 -1
data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
data/vendor/faiss/faiss/IndexFlat.h +0 -51
data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
data/vendor/faiss/faiss/IndexIVF.h +22 -15
data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
data/vendor/faiss/faiss/IndexRefine.h +73 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
data/vendor/faiss/faiss/impl/io.cpp +33 -2
data/vendor/faiss/faiss/impl/io.h +7 -2
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
data/vendor/faiss/faiss/index_factory.cpp +112 -7
data/vendor/faiss/faiss/index_io.h +1 -48
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
data/vendor/faiss/faiss/utils/Heap.h +61 -50
data/vendor/faiss/faiss/utils/distances.cpp +164 -319
data/vendor/faiss/faiss/utils/distances.h +28 -20
data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
data/vendor/faiss/faiss/utils/hamming.h +2 -7
data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
data/vendor/faiss/faiss/utils/partitioning.h +69 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
data/vendor/faiss/faiss/utils/simdlib.h +31 -0
data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
metadata +43 -141
data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
data/vendor/faiss/c_api/AutoTune_c.h +0 -66
data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
data/vendor/faiss/c_api/Clustering_c.h +0 -123
data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
data/vendor/faiss/c_api/IndexShards_c.h +0 -39
data/vendor/faiss/c_api/Index_c.cpp +0 -105
data/vendor/faiss/c_api/Index_c.h +0 -183
data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
data/vendor/faiss/c_api/clone_index_c.h +0 -32
data/vendor/faiss/c_api/error_c.h +0 -42
data/vendor/faiss/c_api/error_impl.cpp +0 -27
data/vendor/faiss/c_api/error_impl.h +0 -16
data/vendor/faiss/c_api/faiss_c.h +0 -58
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
data/vendor/faiss/c_api/index_factory_c.h +0 -30
data/vendor/faiss/c_api/index_io_c.cpp +0 -42
data/vendor/faiss/c_api/index_io_c.h +0 -50
data/vendor/faiss/c_api/macros_impl.h +0 -110
data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
data/vendor/faiss/misc/test_blas.cpp +0 -87
data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
data/vendor/faiss/tests/test_merge.cpp +0 -260
data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
data/vendor/faiss/tests/test_params_override.cpp +0 -236
data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104

data/vendor/faiss/faiss/utils/distances.h CHANGED Viewed

@@ -156,6 +156,14 @@ void pairwise_indexed_inner_product (
 // threshold on nx above which we switch to BLAS to compute distances
 FAISS_API extern int distance_compute_blas_threshold;
+// block sizes for BLAS distance computations
+FAISS_API extern int distance_compute_blas_query_bs;
+FAISS_API extern int distance_compute_blas_database_bs;
+// above this number of results we switch to a reservoir to collect results
+// rather than a heap
+FAISS_API extern int distance_compute_min_k_reservoir;
 /** Return the k nearest neighbors of each of the nx vectors x among the ny
  *  vectors y, w.r.t. max inner product
  *
@@ -169,27 +177,17 @@ void knn_inner_product (
         size_t d, size_t nx, size_t ny,
         float_minheap_array_t * res);
-/** Same as knn_inner_product, for the L2 distance */
+/** Same as knn_inner_product, for the L2 distance
+ *  @param y_norm2    norms for the y vectors (nullptr or size ny)
+ */
 void knn_L2sqr (
         const float * x,
         const float * y,
         size_t d, size_t nx, size_t ny,
-        float_maxheap_array_t * res);
+        float_maxheap_array_t * res,
+        const float *y_norm2 = nullptr);
-/** same as knn_L2sqr, but base_shift[bno] is subtracted to all
- * computed distances.
- *
- * @param base_shift   size ny
- */
-void knn_L2sqr_base_shift (
-         const float * x,
-         const float * y,
-         size_t d, size_t nx, size_t ny,
-         float_maxheap_array_t * res,
-         const float *base_shift);
 /* Find the nearest neighbors for nx queries in a set of ny vectors
  * indexed by ids. May be useful for re-ranking a pre-selected vector list
  */
@@ -200,11 +198,12 @@ void knn_inner_products_by_idx (
         size_t d, size_t nx, size_t ny,
         float_minheap_array_t * res);
-void knn_L2sqr_by_idx (const float * x,
-                       const float * y,
-                       const int64_t * ids,
-                       size_t d, size_t nx, size_t ny,
-                       float_maxheap_array_t * res);
+void knn_L2sqr_by_idx (
+        const float * x,
+        const float * y,
+        const int64_t * ids,
+        size_t d, size_t nx, size_t ny,
+        float_maxheap_array_t * res);
 /***************************************************************************
  * Range search
@@ -239,6 +238,15 @@ void range_search_inner_product (
         RangeSearchResult *result);
+/***************************************************************************
+ * PQ tables computations
+ ***************************************************************************/
+/// specialized function for PQ2
+void compute_PQ_dis_tables_dsub2(
+        size_t d, size_t ksub, const float *centroids,
+        size_t nx, const float * x,
+        bool is_inner_product,
+        float * dis_tables);
 } // namespace faiss

data/vendor/faiss/faiss/utils/distances_simd.cpp CHANGED Viewed

@@ -14,6 +14,9 @@
 #include <cstring>
 #include <cmath>
+#include <faiss/utils/simdlib.h>
+#include <faiss/impl/FaissAssert.h>
 #ifdef __SSE3__
 #include <immintrin.h>
 #endif
@@ -127,6 +130,29 @@ void fvec_L2sqr_ny_ref (float * dis,
 }
+void fvec_inner_products_ny_ref (float * ip,
+                             const float * x,
+                             const float * y,
+                             size_t d, size_t ny)
+{
+    // BLAS slower for the use cases here
+#if 0
+    {
+        FINTEGER di = d;
+        FINTEGER nyi = ny;
+        float one = 1.0, zero = 0.0;
+        FINTEGER onei = 1;
+        sgemv_ ("T", &di, &nyi, &one, y, &di, x, &onei, &zero, ip, &onei);
+    }
+#endif
+    for (size_t i = 0; i < ny; i++) {
+        ip[i] = fvec_inner_product (x, y, d);
+        y += d;
+    }
+}
 /*********************************************************
@@ -174,12 +200,39 @@ float fvec_norm_L2sqr (const float *  x,
 namespace {
-float sqr (float x) {
-    return x * x;
-}
+/// Function that does a component-wise operation between x and y
+/// to compute L2 distances. ElementOp can then be used in the fvec_op_ny
+/// functions below
+struct ElementOpL2 {
+    static float op (float x, float y) {
+        float tmp = x - y;
+        return tmp * tmp;
+    }
+    static __m128 op (__m128 x, __m128 y) {
+        __m128 tmp = x - y;
+        return tmp * tmp;
+    }
+};
+/// Function that does a component-wise operation between x and y
+/// to compute inner products
+struct ElementOpIP {
-void fvec_L2sqr_ny_D1 (float * dis, const float * x,
+    static float op (float x, float y) {
+        return x * y;
+    }
+    static __m128 op (__m128 x, __m128 y) {
+        return x * y;
+    }
+};
+template<class ElementOp>
+void fvec_op_ny_D1 (float * dis, const float * x,
                        const float * y, size_t ny)
 {
     float x0s = x[0];
@@ -187,11 +240,9 @@ void fvec_L2sqr_ny_D1 (float * dis, const float * x,
     size_t i;
     for (i = 0; i + 3 < ny; i += 4) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
         dis[i] = _mm_cvtss_f32 (accu);
-        tmp = _mm_shuffle_ps (accu, accu, 1);
+        __m128 tmp = _mm_shuffle_ps (accu, accu, 1);
         dis[i + 1] = _mm_cvtss_f32 (tmp);
         tmp = _mm_shuffle_ps (accu, accu, 2);
         dis[i + 2] = _mm_cvtss_f32 (tmp);
@@ -199,69 +250,63 @@ void fvec_L2sqr_ny_D1 (float * dis, const float * x,
         dis[i + 3] = _mm_cvtss_f32 (tmp);
     }
     while (i < ny) { // handle non-multiple-of-4 case
-        dis[i++] = sqr(x0s - *y++);
+        dis[i++] = ElementOp::op(x0s, *y++);
     }
 }
-void fvec_L2sqr_ny_D2 (float * dis, const float * x,
+template<class ElementOp>
+void fvec_op_ny_D2 (float * dis, const float * x,
                        const float * y, size_t ny)
 {
     __m128 x0 = _mm_set_ps (x[1], x[0], x[1], x[0]);
     size_t i;
     for (i = 0; i + 1 < ny; i += 2) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
         accu = _mm_hadd_ps (accu, accu);
         dis[i] = _mm_cvtss_f32 (accu);
         accu = _mm_shuffle_ps (accu, accu, 3);
         dis[i + 1] = _mm_cvtss_f32 (accu);
     }
     if (i < ny) { // handle odd case
-        dis[i] = sqr(x[0] - y[0]) + sqr(x[1] - y[1]);
+        dis[i] = ElementOp::op(x[0], y[0]) + ElementOp::op(x[1], y[1]);
     }
 }
-void fvec_L2sqr_ny_D4 (float * dis, const float * x,
+template<class ElementOp>
+void fvec_op_ny_D4 (float * dis, const float * x,
                         const float * y, size_t ny)
 {
     __m128 x0 = _mm_loadu_ps(x);
     for (size_t i = 0; i < ny; i++) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
         accu = _mm_hadd_ps (accu, accu);
         accu = _mm_hadd_ps (accu, accu);
         dis[i] = _mm_cvtss_f32 (accu);
     }
 }
-void fvec_L2sqr_ny_D8 (float * dis, const float * x,
+template<class ElementOp>
+void fvec_op_ny_D8 (float * dis, const float * x,
                         const float * y, size_t ny)
 {
     __m128 x0 = _mm_loadu_ps(x);
     __m128 x1 = _mm_loadu_ps(x + 4);
     for (size_t i = 0; i < ny; i++) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
-        tmp = x1 - _mm_loadu_ps (y); y += 4;
-        accu += tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
+        accu       += ElementOp::op(x1, _mm_loadu_ps (y)); y += 4;
         accu = _mm_hadd_ps (accu, accu);
         accu = _mm_hadd_ps (accu, accu);
         dis[i] = _mm_cvtss_f32 (accu);
     }
 }
-void fvec_L2sqr_ny_D12 (float * dis, const float * x,
+template<class ElementOp>
+void fvec_op_ny_D12 (float * dis, const float * x,
                         const float * y, size_t ny)
 {
     __m128 x0 = _mm_loadu_ps(x);
@@ -269,13 +314,9 @@ void fvec_L2sqr_ny_D12 (float * dis, const float * x,
     __m128 x2 = _mm_loadu_ps(x + 8);
     for (size_t i = 0; i < ny; i++) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
-        tmp = x1 - _mm_loadu_ps (y); y += 4;
-        accu += tmp * tmp;
-        tmp = x2 - _mm_loadu_ps (y); y += 4;
-        accu += tmp * tmp;
+        __m128 accu = ElementOp::op(x0, _mm_loadu_ps (y)); y += 4;
+        accu       += ElementOp::op(x1, _mm_loadu_ps (y)); y += 4;
+        accu       += ElementOp::op(x2, _mm_loadu_ps (y)); y += 4;
         accu = _mm_hadd_ps (accu, accu);
         accu = _mm_hadd_ps (accu, accu);
         dis[i] = _mm_cvtss_f32 (accu);
@@ -283,31 +324,52 @@ void fvec_L2sqr_ny_D12 (float * dis, const float * x,
 }
 } // anonymous namespace
 void fvec_L2sqr_ny (float * dis, const float * x,
                         const float * y, size_t d, size_t ny) {
     // optimized for a few special cases
-    switch(d) {
-    case 1:
-        fvec_L2sqr_ny_D1 (dis, x, y, ny);
-        return;
-    case 2:
-        fvec_L2sqr_ny_D2 (dis, x, y, ny);
-        return;
-    case 4:
-        fvec_L2sqr_ny_D4 (dis, x, y, ny);
+#define DISPATCH(dval) \
+    case dval:\
+        fvec_op_ny_D ## dval <ElementOpL2> (dis, x, y, ny); \
         return;
-    case 8:
-        fvec_L2sqr_ny_D8 (dis, x, y, ny);
+    switch(d) {
+        DISPATCH(1)
+        DISPATCH(2)
+        DISPATCH(4)
+        DISPATCH(8)
+        DISPATCH(12)
+    default:
+        fvec_L2sqr_ny_ref (dis, x, y, d, ny);
         return;
-    case 12:
-        fvec_L2sqr_ny_D12 (dis, x, y, ny);
+    }
+#undef DISPATCH
+}
+void fvec_inner_products_ny (float * dis, const float * x,
+                        const float * y, size_t d, size_t ny) {
+#define DISPATCH(dval) \
+    case dval:\
+        fvec_op_ny_D ## dval <ElementOpIP> (dis, x, y, ny); \
         return;
+    switch(d) {
+        DISPATCH(1)
+        DISPATCH(2)
+        DISPATCH(4)
+        DISPATCH(8)
+        DISPATCH(12)
     default:
-        fvec_L2sqr_ny_ref (dis, x, y, d, ny);
+        fvec_inner_products_ny_ref (dis, x, y, d, ny);
         return;
     }
+#undef DISPATCH
 }
@@ -644,6 +706,11 @@ void fvec_L2sqr_ny (float * dis, const float * x,
     fvec_L2sqr_ny_ref (dis, x, y, d, ny);
 }
+void fvec_inner_products_ny (float * dis, const float * x,
+                        const float * y, size_t d, size_t ny) {
+    fvec_inner_products_ny_ref (dis, x, y, d, ny);
+}
 #endif
@@ -803,6 +870,167 @@ int fvec_madd_and_argmin (size_t n, const float *a,
 #endif
+/***************************************************************************
+ * PQ tables computations
+ ***************************************************************************/
+#ifdef __AVX2__
+namespace {
+// get even float32's of a and b, interleaved
+simd8float32 geteven(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+        _mm256_shuffle_ps(a.f, b.f, 0 << 0 | 2 << 2 | 0 << 4 | 2 << 6)
+    );
+}
+// get odd float32's of a and b, interleaved
+simd8float32 getodd(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+        _mm256_shuffle_ps(a.f, b.f, 1 << 0 | 3 << 2 | 1 << 4 | 3 << 6)
+    );
+}
+// 3 cycles
+// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
+simd8float32 getlow128(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+        _mm256_permute2f128_ps(a.f, b.f, 0 | 2 << 4)
+    );
+}
+simd8float32 gethigh128(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+        _mm256_permute2f128_ps(a.f, b.f, 1 | 3 << 4)
+    );
+}
+/// compute the table entries (inner products, or squared L2 distances when is_inner_product is false) for dsub = 2, for 8 centroids and 4 sub-vectors at a time
+template<bool is_inner_product>
+void pq2_8cents_table(
+        const simd8float32 centroids[8],
+        const simd8float32 x,
+        float *out, size_t ldo, size_t nout = 4
+) {
+    simd8float32 ips[4];
+    for(int i = 0; i < 4; i++) {
+        simd8float32 p1, p2;
+        if (is_inner_product) {
+            p1 = x * centroids[2 * i];
+            p2 = x * centroids[2 * i + 1];
+        } else {
+            p1 = (x - centroids[2 * i]);
+            p1 = p1 * p1;
+            p2 = (x - centroids[2 * i + 1]);
+            p2 = p2 * p2;
+        }
+        ips[i] = hadd(p1, p2);
+    }
+    simd8float32 ip02a = geteven(ips[0], ips[1]);
+    simd8float32 ip02b = geteven(ips[2], ips[3]);
+    simd8float32 ip0 = getlow128(ip02a, ip02b);
+    simd8float32 ip2 = gethigh128(ip02a, ip02b);
+    simd8float32 ip13a = getodd(ips[0], ips[1]);
+    simd8float32 ip13b = getodd(ips[2], ips[3]);
+    simd8float32 ip1 = getlow128(ip13a, ip13b);
+    simd8float32 ip3 = gethigh128(ip13a, ip13b);
+    switch(nout) {
+    case 4:
+        ip3.storeu(out + 3 * ldo);
+    case 3:
+        ip2.storeu(out + 2 * ldo);
+    case 2:
+        ip1.storeu(out + 1 * ldo);
+    case 1:
+        ip0.storeu(out);
+    }
+}
+simd8float32 load_simd8float32_partial(const float *x, int n) {
+    ALIGNED(32) float tmp[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    float *wp = tmp;
+    for (int i = 0; i < n; i++) {
+        *wp++ = *x++;
+    }
+    return simd8float32(tmp);
+}
+} // anonymous namespace
+void compute_PQ_dis_tables_dsub2(
+        size_t d, size_t ksub, const float *all_centroids,
+        size_t nx, const float * x,
+        bool is_inner_product,
+        float * dis_tables)
+{
+    size_t M = d / 2;
+    FAISS_THROW_IF_NOT(ksub % 8 == 0);
+    for(size_t m0 = 0; m0 < M; m0 += 4) {
+        int m1 = std::min(M, m0 + 4);
+        for(int k0 = 0; k0 < ksub; k0 += 8) {
+            simd8float32 centroids[8];
+            for (int k = 0; k < 8; k++) {
+                float centroid[8] __attribute__((aligned(32)));
+                size_t wp = 0;
+                size_t rp = (m0 * ksub + k + k0) * 2;
+                for (int m = m0; m < m1; m++) {
+                    centroid[wp++] = all_centroids[rp];
+                    centroid[wp++] = all_centroids[rp + 1];
+                    rp += 2 * ksub;
+                }
+                centroids[k] = simd8float32(centroid);
+            }
+            for(size_t i = 0; i < nx; i++) {
+                simd8float32 xi;
+                if (m1 == m0 + 4) {
+                    xi.loadu(x + i * d + m0 * 2);
+                } else {
+                    xi = load_simd8float32_partial(x + i * d + m0 * 2, 2 * (m1 - m0));
+                }
+                if(is_inner_product) {
+                    pq2_8cents_table<true>(
+                        centroids, xi,
+                        dis_tables + (i * M + m0) * ksub + k0,
+                        ksub, m1 - m0
+                    );
+                } else {
+                    pq2_8cents_table<false>(
+                        centroids, xi,
+                        dis_tables + (i * M + m0) * ksub + k0,
+                        ksub, m1 - m0
+                    );
+                }
+            }
+        }
+    }
+}
+#else
+void compute_PQ_dis_tables_dsub2(
+        size_t d, size_t ksub, const float *all_centroids,
+        size_t nx, const float * x,
+        bool is_inner_product,
+        float * dis_tables)
+{
+    FAISS_THROW_MSG("only implemented for AVX2");
+}
+#endif
 } // namespace faiss