faiss 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +36 -33
- data/vendor/faiss/faiss/AutoTune.h +6 -3
- data/vendor/faiss/faiss/Clustering.cpp +16 -12
- data/vendor/faiss/faiss/Index.cpp +3 -4
- data/vendor/faiss/faiss/Index.h +3 -3
- data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
- data/vendor/faiss/faiss/IndexBinary.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
- data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
- data/vendor/faiss/faiss/IndexFlat.h +0 -51
- data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
- data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
- data/vendor/faiss/faiss/IndexIVF.h +22 -15
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
- data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
- data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
- data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
- data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
- data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
- data/vendor/faiss/faiss/IndexRefine.h +73 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
- data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
- data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
- data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
- data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
- data/vendor/faiss/faiss/impl/io.cpp +33 -2
- data/vendor/faiss/faiss/impl/io.h +7 -2
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
- data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
- data/vendor/faiss/faiss/index_factory.cpp +112 -7
- data/vendor/faiss/faiss/index_io.h +1 -48
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
- data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
- data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
- data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
- data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
- data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
- data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
- data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
- data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
- data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
- data/vendor/faiss/faiss/utils/Heap.h +61 -50
- data/vendor/faiss/faiss/utils/distances.cpp +164 -319
- data/vendor/faiss/faiss/utils/distances.h +28 -20
- data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
- data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
- data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
- data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
- data/vendor/faiss/faiss/utils/hamming.h +2 -7
- data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
- data/vendor/faiss/faiss/utils/partitioning.h +69 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
- data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
- data/vendor/faiss/faiss/utils/simdlib.h +31 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
- metadata +43 -141
- data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
- data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
- data/vendor/faiss/c_api/AutoTune_c.h +0 -66
- data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
- data/vendor/faiss/c_api/Clustering_c.h +0 -123
- data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
- data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
- data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
- data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
- data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
- data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
- data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
- data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
- data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
- data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
- data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
- data/vendor/faiss/c_api/IndexShards_c.h +0 -39
- data/vendor/faiss/c_api/Index_c.cpp +0 -105
- data/vendor/faiss/c_api/Index_c.h +0 -183
- data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
- data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
- data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
- data/vendor/faiss/c_api/clone_index_c.h +0 -32
- data/vendor/faiss/c_api/error_c.h +0 -42
- data/vendor/faiss/c_api/error_impl.cpp +0 -27
- data/vendor/faiss/c_api/error_impl.h +0 -16
- data/vendor/faiss/c_api/faiss_c.h +0 -58
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
- data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
- data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
- data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
- data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
- data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
- data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
- data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
- data/vendor/faiss/c_api/index_factory_c.h +0 -30
- data/vendor/faiss/c_api/index_io_c.cpp +0 -42
- data/vendor/faiss/c_api/index_io_c.h +0 -50
- data/vendor/faiss/c_api/macros_impl.h +0 -110
- data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
- data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
- data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
- data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
- data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
- data/vendor/faiss/misc/test_blas.cpp +0 -87
- data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
- data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
- data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
- data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
- data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
- data/vendor/faiss/tests/test_merge.cpp +0 -260
- data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
- data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
- data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
- data/vendor/faiss/tests/test_params_override.cpp +0 -236
- data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
- data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
- data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
- data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
- data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
- data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
- data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
- data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
- data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
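
The headline additions in this release are the 4-bit "fast scan" indexes (`IndexPQFastScan`, `IndexIVFPQFastScan`) together with the enlarged `index_factory`. As a rough orientation only, here is a minimal C++ sketch of driving the vendored library with one of these indexes; the factory string `"IVF256,PQ32x4fs"` and the calls shown follow upstream FAISS conventions for this vintage and are an illustrative assumption, not code from this diff or part of the Ruby gem's API.

```cpp
#include <faiss/Index.h>
#include <faiss/index_factory.h>

#include <memory>
#include <random>
#include <vector>

int main() {
    int d = 64;         // vector dimensionality
    size_t nb = 20000;  // database size (also used for training here)

    // synthetic data
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> dis;
    std::vector<float> xb(nb * d);
    for (auto& x : xb) x = dis(rng);

    // "PQ32x4fs" requests 32 sub-quantizers with 4-bit codes and the
    // SIMD fast-scan search path implemented by the pq4_* files below.
    std::unique_ptr<faiss::Index> index(
            faiss::index_factory(d, "IVF256,PQ32x4fs"));

    index->train(nb, xb.data());
    index->add(nb, xb.data());

    // query the first 5 database vectors
    faiss::Index::idx_t k = 10, nq = 5;
    std::vector<faiss::Index::idx_t> labels(nq * k);
    std::vector<float> distances(nq * k);
    index->search(nq, xb.data(), k, distances.data(), labels.data());
    return 0;
}
```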
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp (new file)
@@ -0,0 +1,180 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+
+#include <faiss/impl/pq4_fast_scan.h>
+
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/simd_result_handlers.h>
+
+
+namespace faiss {
+
+
+using namespace simd_result_handlers;
+
+/***************************************************************
+ * accumulation functions
+ ***************************************************************/
+
+namespace {
+
+/*
+ * The computation kernel
+ * It accumulates results for NQ queries and BB * 32 database elements
+ * writes results in a ResultHandler
+ */
+
+template<int NQ, int BB, class ResultHandler>
+void kernel_accumulate_block(
+        int nsq,
+        const uint8_t *codes,
+        const uint8_t *LUT,
+        ResultHandler & res)
+{
+    // distance accumulators
+    simd16uint16 accu[NQ][BB][4];
+
+    for(int q = 0; q < NQ; q++) {
+        for(int b = 0; b < BB; b++) {
+            accu[q][b][0].clear();
+            accu[q][b][1].clear();
+            accu[q][b][2].clear();
+            accu[q][b][3].clear();
+        }
+    }
+
+    for(int sq = 0; sq < nsq; sq += 2) {
+        simd32uint8 lut_cache[NQ];
+        for(int q = 0; q < NQ; q++) {
+            lut_cache[q] = simd32uint8(LUT);
+            LUT += 32;
+        }
+
+        for (int b = 0; b < BB; b++) {
+            simd32uint8 c = simd32uint8(codes);
+            codes += 32;
+            simd32uint8 mask(15);
+            simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
+            simd32uint8 clo = c & mask;
+
+            for(int q = 0; q < NQ; q++) {
+                simd32uint8 lut = lut_cache[q];
+                simd32uint8 res0 = lut.lookup_2_lanes(clo);
+                simd32uint8 res1 = lut.lookup_2_lanes(chi);
+
+                accu[q][b][0] += simd16uint16(res0);
+                accu[q][b][1] += simd16uint16(res0) >> 8;
+
+                accu[q][b][2] += simd16uint16(res1);
+                accu[q][b][3] += simd16uint16(res1) >> 8;
+            }
+        }
+    }
+
+    for(int q = 0; q < NQ; q++) {
+        for (int b = 0; b < BB; b++) {
+
+            accu[q][b][0] -= accu[q][b][1] << 8;
+            simd16uint16 dis0 = combine2x2(accu[q][b][0], accu[q][b][1]);
+
+            accu[q][b][2] -= accu[q][b][3] << 8;
+            simd16uint16 dis1 = combine2x2(accu[q][b][2], accu[q][b][3]);
+
+            res.handle(q, b, dis0, dis1);
+        }
+    }
+
+
+}
+
+
+template<int NQ, int BB, class ResultHandler>
+void accumulate_fixed_blocks(
+        size_t nb,
+        int nsq,
+        const uint8_t *codes,
+        const uint8_t *LUT,
+        ResultHandler & res)
+{
+    constexpr int bbs = 32 * BB;
+    for (int64_t j0 = 0; j0 < nb; j0 += bbs) {
+        FixedStorageHandler<NQ, 2 * BB> res2;
+        kernel_accumulate_block<NQ, BB>(nsq, codes, LUT, res2);
+        res.set_block_origin(0, j0);
+        res2.to_other_handler(res);
+        codes += bbs * nsq / 2;
+    }
+}
+
+
+} // anonymous namespace
+
+template<class ResultHandler>
+void pq4_accumulate_loop(
+        int nq,
+        size_t nb, int bbs,
+        int nsq,
+        const uint8_t *codes,
+        const uint8_t *LUT,
+        ResultHandler & res)
+{
+    FAISS_THROW_IF_NOT(is_aligned_pointer(codes));
+    FAISS_THROW_IF_NOT(is_aligned_pointer(LUT));
+    FAISS_THROW_IF_NOT(bbs % 32 == 0);
+    FAISS_THROW_IF_NOT(nb % bbs == 0);
+
+#define DISPATCH(NQ, BB) \
+    case NQ * 1000 + BB: \
+        accumulate_fixed_blocks<NQ, BB>(nb, nsq, codes, LUT, res); \
+        break
+
+    switch(nq * 1000 + bbs / 32) {
+        DISPATCH(1, 1);
+        DISPATCH(1, 2);
+        DISPATCH(1, 3);
+        DISPATCH(1, 4);
+        DISPATCH(1, 5);
+        DISPATCH(2, 1);
+        DISPATCH(2, 2);
+        DISPATCH(3, 1);
+        DISPATCH(4, 1);
+    default:
+        FAISS_THROW_FMT("nq=%d bbs=%d not instantiated", nq, bbs);
+    }
+#undef DISPATCH
+
+}
+
+// explicit template instantiations
+
+
+
+
+#define INSTANTIATE_ACCUMULATE(TH, C, with_id_map) \
+    template void pq4_accumulate_loop<TH<C, with_id_map>> \
+        (int, size_t, int, int, const uint8_t *, const uint8_t *, TH<C, with_id_map> &);
+
+#define INSTANTIATE_3(C, with_id_map) \
+    INSTANTIATE_ACCUMULATE(SingleResultHandler, C, with_id_map) \
+    INSTANTIATE_ACCUMULATE(HeapHandler, C, with_id_map) \
+    INSTANTIATE_ACCUMULATE(ReservoirHandler, C, with_id_map) \
+
+using Csi = CMax<uint16_t, int>;
+INSTANTIATE_3(Csi, false);
+using CsiMin = CMin<uint16_t, int>;
+INSTANTIATE_3(CsiMin, false);
+
+using Csl = CMax<uint16_t, int64_t>;
+INSTANTIATE_3(Csl, true);
+using CslMin = CMin<uint16_t, int64_t>;
+INSTANTIATE_3(CslMin, true);
+
+
+
+} // namespace faiss
+
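
The kernel above accumulates 8-bit LUT lookups into 16-bit lanes in which two database elements share a lane: `accu[...][0]` sums the raw packed words while `accu[...][1]` sums the words shifted right by 8, and the final `accu[0] -= accu[1] << 8` / `combine2x2` step separates the two elements again. A scalar sketch with made-up values (not faiss code) showing why the subtraction recovers the exact low-byte sum, carries included:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    uint16_t accu0 = 0, accu1 = 0;
    uint32_t reference = 0;

    // 20 iterations of hypothetical 8-bit lookup results lo=200, hi=123,
    // packed into one 16-bit word as they are in res0.
    for (int i = 0; i < 20; i++) {
        uint8_t lo = 200, hi = 123;
        uint16_t word = uint16_t(uint16_t(hi) << 8 | lo);
        accu0 += word;                    // accu[q][b][0] += simd16uint16(res0)
        accu1 += uint16_t(word >> 8);     // accu[q][b][1] += simd16uint16(res0) >> 8
        reference += lo;                  // true sum of the low-byte distances
    }

    // accu0 holds lo-sums (with their carries) contaminated by 256 * hi-sums;
    // subtracting accu1 << 8 cancels that contamination modulo 2^16.
    uint16_t lo_sum = uint16_t(accu0 - (accu1 << 8));
    printf("recovered %u, expected %u\n", unsigned(lo_sum), unsigned(reference));
    return 0;  // prints "recovered 4000, expected 4000"
}
```

The hi-byte element needs no such correction: its exact 16-bit sum is already in `accu1`, which is why `combine2x2` can merge the two accumulators directly.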
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp (new file)
@@ -0,0 +1,354 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/impl/pq4_fast_scan.h>
+
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/utils/simdlib.h>
+#include <faiss/impl/simd_result_handlers.h>
+
+
+namespace faiss {
+
+
+using namespace simd_result_handlers;
+
+/************************************************************
+ * Accumulation functions
+ ************************************************************/
+
+namespace {
+
+/*
+ * The computation kernel
+ * It accumulates results for NQ queries and 2 * 16 database elements
+ * writes results in a ResultHandler
+ */
+
+template<int NQ, class ResultHandler>
+void kernel_accumulate_block(
+        int nsq,
+        const uint8_t *codes,
+        const uint8_t *LUT,
+        ResultHandler & res)
+{
+    // dummy alloc to keep the windows compiler happy
+    constexpr int NQA = NQ > 0 ? NQ : 1;
+    // distance accumulators
+    simd16uint16 accu[NQA][4];
+
+    for(int q = 0; q < NQ; q++) {
+        for(int b = 0; b < 4; b++) {
+            accu[q][b].clear();
+        }
+    }
+
+    // _mm_prefetch(codes + 768, 0);
+    for(int sq = 0; sq < nsq; sq += 2) {
+
+        // prefetch
+        simd32uint8 c(codes);
+        codes += 32;
+
+        simd32uint8 mask(0xf);
+        // shift op does not exist for int8...
+        simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
+        simd32uint8 clo = c & mask;
+
+        for(int q = 0; q < NQ; q++) {
+            // load LUTs for 2 quantizers
+            simd32uint8 lut(LUT);
+            LUT += 32;
+
+            simd32uint8 res0 = lut.lookup_2_lanes(clo);
+            simd32uint8 res1 = lut.lookup_2_lanes(chi);
+
+            accu[q][0] += simd16uint16(res0);
+            accu[q][1] += simd16uint16(res0) >> 8;
+
+            accu[q][2] += simd16uint16(res1);
+            accu[q][3] += simd16uint16(res1) >> 8;
+        }
+    }
+
+    for(int q = 0; q < NQ; q++) {
+        accu[q][0] -= accu[q][1] << 8;
+        simd16uint16 dis0 = combine2x2(accu[q][0], accu[q][1]);
+        accu[q][2] -= accu[q][3] << 8;
+        simd16uint16 dis1 = combine2x2(accu[q][2], accu[q][3]);
+        res.handle(q, 0, dis0, dis1);
+    }
+
+}
+
+// handle at most 4 blocks of queries
+template<int QBS, class ResultHandler>
+void accumulate_q_4step(
+        size_t ntotal2,
+        int nsq,
+        const uint8_t *codes,
+        const uint8_t *LUT0,
+        ResultHandler & res)
+{
+
+    constexpr int Q1 = QBS & 15;
+    constexpr int Q2 = (QBS >> 4) & 15;
+    constexpr int Q3 = (QBS >> 8) & 15;
+    constexpr int Q4 = (QBS >> 12) & 15;
+    constexpr int SQ = Q1 + Q2 + Q3 + Q4;
+
+    for (int64_t j0 = 0; j0 < ntotal2; j0 += 32) {
+        FixedStorageHandler<SQ, 2> res2;
+        const uint8_t *LUT = LUT0;
+        kernel_accumulate_block<Q1>(nsq, codes, LUT, res2);
+        LUT += Q1 * nsq * 16;
+        if (Q2 > 0) {
+            res2.set_block_origin(Q1, 0);
+            kernel_accumulate_block<Q2>(nsq, codes, LUT, res2);
+            LUT += Q2 * nsq * 16;
+        }
+        if (Q3 > 0) {
+            res2.set_block_origin(Q1 + Q2, 0);
+            kernel_accumulate_block<Q3>(nsq, codes, LUT, res2);
+            LUT += Q3 * nsq * 16;
+        }
+        if (Q4 > 0) {
+            res2.set_block_origin(Q1 + Q2 + Q3, 0);
+            kernel_accumulate_block<Q4>(nsq, codes, LUT, res2);
+        }
+        res.set_block_origin(0, j0);
+        res2.to_other_handler(res);
+        codes += 32 * nsq / 2;
+    }
+}
+
+
+
+
+template<int NQ, class ResultHandler>
+void kernel_accumulate_block_loop(
+        size_t ntotal2,
+        int nsq,
+        const uint8_t *codes,
+        const uint8_t *LUT,
+        ResultHandler & res)
+{
+
+    for (int64_t j0 = 0; j0 < ntotal2; j0 += 32) {
+        res.set_block_origin(0, j0);
+        kernel_accumulate_block<NQ, ResultHandler>
+                (nsq, codes + j0 * nsq / 2, LUT, res);
+    }
+
+}
+
+// non-template version of accumulate kernel -- dispatches dynamically
+template<class ResultHandler>
+void accumulate(
+        int nq,
+        size_t ntotal2,
+        int nsq,
+        const uint8_t *codes,
+        const uint8_t *LUT,
+        ResultHandler & res)
+{
+
+    assert(nsq % 2 == 0);
+    assert(is_aligned_pointer(codes));
+    assert(is_aligned_pointer(LUT));
+
+#define DISPATCH(NQ) \
+    case NQ: \
+        kernel_accumulate_block_loop<NQ, ResultHandler> \
+            (ntotal2, nsq, codes, LUT, res); \
+        return
+
+    switch(nq) {
+        DISPATCH(1);
+        DISPATCH(2);
+        DISPATCH(3);
+        DISPATCH(4);
+    }
+    FAISS_THROW_FMT("accumulate nq=%d not instanciated",
+            nq);
+
+#undef DISPATCH
+}
+
+
+} // anonumous namespace
+
+
+
+template<class ResultHandler>
+void pq4_accumulate_loop_qbs(
+        int qbs,
+        size_t ntotal2,
+        int nsq,
+        const uint8_t *codes,
+        const uint8_t *LUT0,
+        ResultHandler & res)
+{
+
+    assert(nsq % 2 == 0);
+    assert(is_aligned_pointer(codes));
+    assert(is_aligned_pointer(LUT0));
+
+    // try out optimized versions
+    switch(qbs) {
+#define DISPATCH(QBS) \
+        case QBS: accumulate_q_4step<QBS> \
+            (ntotal2, nsq, codes, LUT0, res); \
+        return;
+        DISPATCH(0x3333); // 12
+        DISPATCH(0x2333); // 11
+        DISPATCH(0x2233); // 10
+        DISPATCH(0x333); // 9
+        DISPATCH(0x2223); // 9
+        DISPATCH(0x233); // 8
+        DISPATCH(0x1223); // 8
+        DISPATCH(0x223); // 7
+        DISPATCH(0x34); // 7
+        DISPATCH(0x133); // 7
+        DISPATCH(0x6); // 6
+        DISPATCH(0x33); // 6
+        DISPATCH(0x123); // 6
+        DISPATCH(0x222); // 6
+        DISPATCH(0x23); // 5
+        DISPATCH(0x5); // 5
+        DISPATCH(0x13); // 4
+        DISPATCH(0x22); // 4
+        DISPATCH(0x4); // 4
+        DISPATCH(0x3); // 3
+        DISPATCH(0x21); // 3
+        DISPATCH(0x2); // 2
+        DISPATCH(0x1); // 1
+#undef DISPATCH
+    }
+
+    // default implementation where qbs is not known at compile time
+
+    for (int64_t j0 = 0; j0 < ntotal2; j0 += 32) {
+        const uint8_t *LUT = LUT0;
+        int qi = qbs;
+        int i0 = 0;
+        while(qi) {
+            int nq = qi & 15;
+            qi >>= 4;
+            res.set_block_origin(i0, j0);
+#define DISPATCH(NQ) \
+            case NQ: \
+                kernel_accumulate_block<NQ, ResultHandler> \
+                    (nsq, codes, LUT, res); \
+                break
+            switch(nq) {
+                DISPATCH(1);
+                DISPATCH(2);
+                DISPATCH(3);
+                DISPATCH(4);
+#undef DISPATCH
+            default:
+                FAISS_THROW_FMT("accumulate nq=%d not instanciated",
+                        nq);
+            }
+            i0 += nq;
+            LUT += nq * nsq * 16;
+        }
+        codes += 32 * nsq / 2;
+    }
+}
+
+
+
+// explicit template instantiations
+
+
+#define INSTANTIATE_ACCUMULATE_Q(RH) \
+    template void pq4_accumulate_loop_qbs<RH> \
+        (int, size_t, int, const uint8_t *, const uint8_t *, RH &);
+
+using Csi = CMax<uint16_t, int>;
+INSTANTIATE_ACCUMULATE_Q(SingleResultHandler<Csi>)
+INSTANTIATE_ACCUMULATE_Q(HeapHandler<Csi>)
+INSTANTIATE_ACCUMULATE_Q(ReservoirHandler<Csi>)
+using Csi2 = CMin<uint16_t, int>;
+INSTANTIATE_ACCUMULATE_Q(SingleResultHandler<Csi2>)
+INSTANTIATE_ACCUMULATE_Q(HeapHandler<Csi2>)
+INSTANTIATE_ACCUMULATE_Q(ReservoirHandler<Csi2>)
+
+using Cfl = CMax<uint16_t, int64_t>;
+using HHCsl = HeapHandler<Cfl, true>;
+using RHCsl = ReservoirHandler<Cfl, true>;
+using SHCsl = SingleResultHandler<Cfl, true>;
+INSTANTIATE_ACCUMULATE_Q(HHCsl)
+INSTANTIATE_ACCUMULATE_Q(RHCsl)
+INSTANTIATE_ACCUMULATE_Q(SHCsl)
+using Cfl2 = CMin<uint16_t, int64_t>;
+using HHCsl2 = HeapHandler<Cfl2, true>;
+using RHCsl2 = ReservoirHandler<Cfl2, true>;
+using SHCsl2 = SingleResultHandler<Cfl2, true>;
+INSTANTIATE_ACCUMULATE_Q(HHCsl2)
+INSTANTIATE_ACCUMULATE_Q(RHCsl2)
+INSTANTIATE_ACCUMULATE_Q(SHCsl2)
+
+
+/***************************************************************
+ * Packing functions
+ ***************************************************************/
+
+int pq4_qbs_to_nq(int qbs) {
+    int i0 = 0;
+    int qi = qbs;
+    while(qi) {
+        int nq = qi & 15;
+        qi >>= 4;
+        i0 += nq;
+    }
+    return i0;
+}
+
+
+
+void accumulate_to_mem(
+        int nq,
+        size_t ntotal2,
+        int nsq,
+        const uint8_t *codes,
+        const uint8_t *LUT,
+        uint16_t* accu)
+{
+    FAISS_THROW_IF_NOT(ntotal2 % 32 == 0);
+    StoreResultHandler handler(accu, ntotal2);
+    accumulate(nq, ntotal2, nsq, codes, LUT, handler);
+}
+
+
+int pq4_preferred_qbs(int n) {
+    // from timmings in P141901742, P141902828
+    static int map[12] = {
+        0, 1, 2, 3, 0x13,
+        0x23, 0x33, 0x223, 0x233, 0x333,
+        0x2233, 0x2333
+    };
+    if (n <= 11) {
+        return map[n];
+    } else if (n <= 24) {
+        // override qbs: all first stages with 3 steps
+        // then 1 stage with the rest
+        int nbit = 4 * (n / 3); // nbits with only 3s
+        int qbs = 0x33333333 & ((1 << nbit) - 1);
+        qbs |= (n % 3) << nbit;
+        return qbs;
+    } else {
+        FAISS_THROW_FMT("number of queries %d too large", n);
+    }
+}
+
+
+
+} // namespace faiss
+
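
Throughout this file the `qbs` argument packs the sizes of successive query blocks as 4-bit nibbles, least-significant nibble first: `0x223` means a first kernel call covering 3 queries, then one covering 2, then another 2, for 7 queries total, which is what `pq4_qbs_to_nq` computes and what the `// n` annotations on the `DISPATCH` cases count. A small standalone sketch of that decoding (the value `0x223` is just one of the specialised cases above):

```cpp
#include <cstdio>

// Decode a qbs value the same way pq4_accumulate_loop_qbs and
// pq4_qbs_to_nq walk it: one 4-bit nibble per query block.
int main() {
    int qbs = 0x223;
    int total = 0;
    for (int qi = qbs; qi != 0; qi >>= 4) {
        int nq = qi & 15;                     // size of the next query block
        printf("block of %d queries\n", nq);  // prints 3, then 2, then 2
        total += nq;
    }
    printf("total nq = %d\n", total);         // 7, as in pq4_qbs_to_nq(0x223)
    return 0;
}
```

`pq4_preferred_qbs` works in the other direction: given a query count, it returns the nibble packing measured to run fastest (e.g. 7 queries map to `0x223`), falling back to a run of 3-query blocks plus a remainder for up to 24 queries.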