RubyGems - faiss - Versions diffs - 0.3.0 → 0.3.1 - Mend

faiss 0.3.0 → 0.3.1

Files changed (171) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/ext/faiss/extconf.rb +9 -2
data/ext/faiss/index.cpp +1 -1
data/ext/faiss/index_binary.cpp +2 -2
data/ext/faiss/product_quantizer.cpp +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +7 -7
data/vendor/faiss/faiss/AutoTune.h +0 -1
data/vendor/faiss/faiss/Clustering.cpp +4 -18
data/vendor/faiss/faiss/Clustering.h +31 -21
data/vendor/faiss/faiss/IVFlib.cpp +22 -11
data/vendor/faiss/faiss/Index.cpp +1 -1
data/vendor/faiss/faiss/Index.h +20 -5
data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
data/vendor/faiss/faiss/IndexBinary.h +8 -19
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +106 -187
data/vendor/faiss/faiss/IndexFastScan.cpp +90 -159
data/vendor/faiss/faiss/IndexFastScan.h +9 -8
data/vendor/faiss/faiss/IndexFlat.cpp +195 -3
data/vendor/faiss/faiss/IndexFlat.h +20 -1
data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +3 -1
data/vendor/faiss/faiss/IndexHNSW.cpp +112 -316
data/vendor/faiss/faiss/IndexHNSW.h +12 -48
data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
data/vendor/faiss/faiss/IndexIDMap.h +24 -2
data/vendor/faiss/faiss/IndexIVF.cpp +159 -53
data/vendor/faiss/faiss/IndexIVF.h +37 -5
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +18 -26
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -2
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
data/vendor/faiss/faiss/IndexIVFFastScan.cpp +433 -405
data/vendor/faiss/faiss/IndexIVFFastScan.h +56 -26
data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
data/vendor/faiss/faiss/IndexIVFPQ.cpp +78 -122
data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +18 -50
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -4
data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
data/vendor/faiss/faiss/IndexNSG.h +10 -10
data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
data/vendor/faiss/faiss/IndexPQ.h +1 -4
data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
data/vendor/faiss/faiss/IndexRefine.cpp +49 -19
data/vendor/faiss/faiss/IndexRefine.h +7 -0
data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +22 -16
data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
data/vendor/faiss/faiss/IndexShards.cpp +21 -29
data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
data/vendor/faiss/faiss/MatrixStats.h +21 -9
data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
data/vendor/faiss/faiss/VectorTransform.h +7 -7
data/vendor/faiss/faiss/clone_index.cpp +15 -10
data/vendor/faiss/faiss/clone_index.h +3 -0
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +87 -4
data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +7 -0
data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +8 -9
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +18 -3
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +117 -17
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +1 -1
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +267 -40
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +1 -2
data/vendor/faiss/faiss/impl/DistanceComputer.h +24 -1
data/vendor/faiss/faiss/impl/FaissException.h +13 -34
data/vendor/faiss/faiss/impl/HNSW.cpp +321 -70
data/vendor/faiss/faiss/impl/HNSW.h +9 -8
data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +3 -1
data/vendor/faiss/faiss/impl/NNDescent.cpp +29 -19
data/vendor/faiss/faiss/impl/NSG.h +1 -1
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +24 -22
data/vendor/faiss/faiss/impl/ProductQuantizer.h +1 -1
data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
data/vendor/faiss/faiss/impl/ResultHandler.h +232 -176
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +444 -104
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -8
data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +280 -42
data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
data/vendor/faiss/faiss/impl/index_read.cpp +45 -19
data/vendor/faiss/faiss/impl/index_write.cpp +60 -41
data/vendor/faiss/faiss/impl/io.cpp +10 -10
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
data/vendor/faiss/faiss/impl/platform_macros.h +18 -1
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +3 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +40 -49
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
data/vendor/faiss/faiss/impl/simd_result_handlers.h +374 -202
data/vendor/faiss/faiss/index_factory.cpp +10 -7
data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +27 -9
data/vendor/faiss/faiss/invlists/InvertedLists.h +12 -3
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
data/vendor/faiss/faiss/utils/distances.cpp +128 -74
data/vendor/faiss/faiss/utils/distances.h +81 -4
data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
data/vendor/faiss/faiss/utils/distances_simd.cpp +428 -70
data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
data/vendor/faiss/faiss/utils/fp16.h +2 -0
data/vendor/faiss/faiss/utils/hamming.cpp +162 -110
data/vendor/faiss/faiss/utils/hamming.h +58 -0
data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +15 -87
data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +57 -0
data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
data/vendor/faiss/faiss/utils/prefetch.h +77 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
data/vendor/faiss/faiss/utils/simdlib_neon.h +72 -77
data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
data/vendor/faiss/faiss/utils/sorting.h +27 -0
data/vendor/faiss/faiss/utils/utils.cpp +112 -6
data/vendor/faiss/faiss/utils/utils.h +57 -20
metadata +10 -3

data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp CHANGED Viewed

@@ -65,42 +65,65 @@ using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer;
  */
 struct Codec8bit {
-    static void encode_component(float x, uint8_t* code, int i) {
+    static FAISS_ALWAYS_INLINE void encode_component(
+            float x,
+            uint8_t* code,
+            int i) {
         code[i] = (int)(255 * x);
     }
-    static float decode_component(const uint8_t* code, int i) {
+    static FAISS_ALWAYS_INLINE float decode_component(
+            const uint8_t* code,
+            int i) {
         return (code[i] + 0.5f) / 255.0f;
     }
 #ifdef __AVX2__
-    static __m256 decode_8_components(const uint8_t* code, int i) {
-        uint64_t c8 = *(uint64_t*)(code + i);
-        __m128i c4lo = _mm_cvtepu8_epi32(_mm_set1_epi32(c8));
-        __m128i c4hi = _mm_cvtepu8_epi32(_mm_set1_epi32(c8 >> 32));
-        // __m256i i8 = _mm256_set_m128i(c4lo, c4hi);
-        __m256i i8 = _mm256_castsi128_si256(c4lo);
-        i8 = _mm256_insertf128_si256(i8, c4hi, 1);
-        __m256 f8 = _mm256_cvtepi32_ps(i8);
-        __m256 half = _mm256_set1_ps(0.5f);
-        f8 = _mm256_add_ps(f8, half);
-        __m256 one_255 = _mm256_set1_ps(1.f / 255.f);
-        return _mm256_mul_ps(f8, one_255);
+    static FAISS_ALWAYS_INLINE __m256
+    decode_8_components(const uint8_t* code, int i) {
+        const uint64_t c8 = *(uint64_t*)(code + i);
+        const __m128i i8 = _mm_set1_epi64x(c8);
+        const __m256i i32 = _mm256_cvtepu8_epi32(i8);
+        const __m256 f8 = _mm256_cvtepi32_ps(i32);
+        const __m256 half_one_255 = _mm256_set1_ps(0.5f / 255.f);
+        const __m256 one_255 = _mm256_set1_ps(1.f / 255.f);
+        return _mm256_fmadd_ps(f8, one_255, half_one_255);
+    }
+#endif
+#ifdef __aarch64__
+    static FAISS_ALWAYS_INLINE float32x4x2_t
+    decode_8_components(const uint8_t* code, int i) {
+        float32_t result[8] = {};
+        for (size_t j = 0; j < 8; j++) {
+            result[j] = decode_component(code, i + j);
+        }
+        float32x4_t res1 = vld1q_f32(result);
+        float32x4_t res2 = vld1q_f32(result + 4);
+        float32x4x2_t res = vzipq_f32(res1, res2);
+        return vuzpq_f32(res.val[0], res.val[1]);
     }
 #endif
 };
 struct Codec4bit {
-    static void encode_component(float x, uint8_t* code, int i) {
+    static FAISS_ALWAYS_INLINE void encode_component(
+            float x,
+            uint8_t* code,
+            int i) {
         code[i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
     }
-    static float decode_component(const uint8_t* code, int i) {
+    static FAISS_ALWAYS_INLINE float decode_component(
+            const uint8_t* code,
+            int i) {
         return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
     }
 #ifdef __AVX2__
-    static __m256 decode_8_components(const uint8_t* code, int i) {
+    static FAISS_ALWAYS_INLINE __m256
+    decode_8_components(const uint8_t* code, int i) {
         uint32_t c4 = *(uint32_t*)(code + (i >> 1));
         uint32_t mask = 0x0f0f0f0f;
         uint32_t c4ev = c4 & mask;
@@ -120,10 +143,27 @@ struct Codec4bit {
         return _mm256_mul_ps(f8, one_255);
     }
 #endif
+#ifdef __aarch64__
+    static FAISS_ALWAYS_INLINE float32x4x2_t
+    decode_8_components(const uint8_t* code, int i) {
+        float32_t result[8] = {};
+        for (size_t j = 0; j < 8; j++) {
+            result[j] = decode_component(code, i + j);
+        }
+        float32x4_t res1 = vld1q_f32(result);
+        float32x4_t res2 = vld1q_f32(result + 4);
+        float32x4x2_t res = vzipq_f32(res1, res2);
+        return vuzpq_f32(res.val[0], res.val[1]);
+    }
+#endif
 };
 struct Codec6bit {
-    static void encode_component(float x, uint8_t* code, int i) {
+    static FAISS_ALWAYS_INLINE void encode_component(
+            float x,
+            uint8_t* code,
+            int i) {
         int bits = (int)(x * 63.0);
         code += (i >> 2) * 3;
         switch (i & 3) {
@@ -144,7 +184,9 @@ struct Codec6bit {
         }
     }
-    static float decode_component(const uint8_t* code, int i) {
+    static FAISS_ALWAYS_INLINE float decode_component(
+            const uint8_t* code,
+            int i) {
         uint8_t bits;
         code += (i >> 2) * 3;
         switch (i & 3) {
@@ -170,7 +212,7 @@ struct Codec6bit {
     /* Load 6 bytes that represent 8 6-bit values, return them as a
      * 8*32 bit vector register */
-    static __m256i load6(const uint16_t* code16) {
+    static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) {
         const __m128i perm = _mm_set_epi8(
                 -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 1, 0, -1, 0);
         const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0);
@@ -189,18 +231,45 @@ struct Codec6bit {
         return c5;
     }
-    static __m256 decode_8_components(const uint8_t* code, int i) {
+    static FAISS_ALWAYS_INLINE __m256
+    decode_8_components(const uint8_t* code, int i) {
+        // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here
+        // // for the reference, maybe, it becomes used oned day.
+        // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3);
+        // const uint32_t* data32 = (const uint32_t*)data16;
+        // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32);
+        // const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL);
+        // const __m128i i8 = _mm_set1_epi64x(vext);
+        // const __m256i i32 = _mm256_cvtepi8_epi32(i8);
+        // const __m256 f8 = _mm256_cvtepi32_ps(i32);
+        // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f);
+        // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f);
+        // return _mm256_fmadd_ps(f8, one_255, half_one_255);
         __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3));
         __m256 f8 = _mm256_cvtepi32_ps(i8);
         // this could also be done with bit manipulations but it is
         // not obviously faster
-        __m256 half = _mm256_set1_ps(0.5f);
-        f8 = _mm256_add_ps(f8, half);
-        __m256 one_63 = _mm256_set1_ps(1.f / 63.f);
-        return _mm256_mul_ps(f8, one_63);
+        const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f);
+        const __m256 one_255 = _mm256_set1_ps(1.f / 63.f);
+        return _mm256_fmadd_ps(f8, one_255, half_one_255);
     }
 #endif
+#ifdef __aarch64__
+    static FAISS_ALWAYS_INLINE float32x4x2_t
+    decode_8_components(const uint8_t* code, int i) {
+        float32_t result[8] = {};
+        for (size_t j = 0; j < 8; j++) {
+            result[j] = decode_component(code, i + j);
+        }
+        float32x4_t res1 = vld1q_f32(result);
+        float32x4_t res2 = vld1q_f32(result + 4);
+        float32x4x2_t res = vzipq_f32(res1, res2);
+        return vuzpq_f32(res.val[0], res.val[1]);
+    }
+#endif
 };
 /*******************************************************************
@@ -242,7 +311,8 @@ struct QuantizerTemplate<Codec, true, 1> : ScalarQuantizer::SQuantizer {
         }
     }
-    float reconstruct_component(const uint8_t* code, int i) const {
+    FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i)
+            const {
         float xi = Codec::decode_component(code, i);
         return vmin + xi * vdiff;
     }
@@ -255,11 +325,36 @@ struct QuantizerTemplate<Codec, true, 8> : QuantizerTemplate<Codec, true, 1> {
     QuantizerTemplate(size_t d, const std::vector<float>& trained)
             : QuantizerTemplate<Codec, true, 1>(d, trained) {}
-    __m256 reconstruct_8_components(const uint8_t* code, int i) const {
+    FAISS_ALWAYS_INLINE __m256
+    reconstruct_8_components(const uint8_t* code, int i) const {
         __m256 xi = Codec::decode_8_components(code, i);
-        return _mm256_add_ps(
-                _mm256_set1_ps(this->vmin),
-                _mm256_mul_ps(xi, _mm256_set1_ps(this->vdiff)));
+        return _mm256_fmadd_ps(
+                xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin));
+    }
+};
+#endif
+#ifdef __aarch64__
+template <class Codec>
+struct QuantizerTemplate<Codec, true, 8> : QuantizerTemplate<Codec, true, 1> {
+    QuantizerTemplate(size_t d, const std::vector<float>& trained)
+            : QuantizerTemplate<Codec, true, 1>(d, trained) {}
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        float32x4x2_t xi = Codec::decode_8_components(code, i);
+        float32x4x2_t res = vzipq_f32(
+                vfmaq_f32(
+                        vdupq_n_f32(this->vmin),
+                        xi.val[0],
+                        vdupq_n_f32(this->vdiff)),
+                vfmaq_f32(
+                        vdupq_n_f32(this->vmin),
+                        xi.val[1],
+                        vdupq_n_f32(this->vdiff)));
+        return vuzpq_f32(res.val[0], res.val[1]);
     }
 };
@@ -296,7 +391,8 @@ struct QuantizerTemplate<Codec, false, 1> : ScalarQuantizer::SQuantizer {
         }
     }
-    float reconstruct_component(const uint8_t* code, int i) const {
+    FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i)
+            const {
         float xi = Codec::decode_component(code, i);
         return vmin[i] + xi * vdiff[i];
     }
@@ -309,11 +405,36 @@ struct QuantizerTemplate<Codec, false, 8> : QuantizerTemplate<Codec, false, 1> {
     QuantizerTemplate(size_t d, const std::vector<float>& trained)
             : QuantizerTemplate<Codec, false, 1>(d, trained) {}
-    __m256 reconstruct_8_components(const uint8_t* code, int i) const {
+    FAISS_ALWAYS_INLINE __m256
+    reconstruct_8_components(const uint8_t* code, int i) const {
         __m256 xi = Codec::decode_8_components(code, i);
-        return _mm256_add_ps(
-                _mm256_loadu_ps(this->vmin + i),
-                _mm256_mul_ps(xi, _mm256_loadu_ps(this->vdiff + i)));
+        return _mm256_fmadd_ps(
+                xi,
+                _mm256_loadu_ps(this->vdiff + i),
+                _mm256_loadu_ps(this->vmin + i));
+    }
+};
+#endif
+#ifdef __aarch64__
+template <class Codec>
+struct QuantizerTemplate<Codec, false, 8> : QuantizerTemplate<Codec, false, 1> {
+    QuantizerTemplate(size_t d, const std::vector<float>& trained)
+            : QuantizerTemplate<Codec, false, 1>(d, trained) {}
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        float32x4x2_t xi = Codec::decode_8_components(code, i);
+        float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i);
+        float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i);
+        float32x4x2_t res = vzipq_f32(
+                vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]),
+                vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1]));
+        return vuzpq_f32(res.val[0], res.val[1]);
     }
 };
@@ -344,7 +465,8 @@ struct QuantizerFP16<1> : ScalarQuantizer::SQuantizer {
         }
     }
-    float reconstruct_component(const uint8_t* code, int i) const {
+    FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i)
+            const {
         return decode_fp16(((uint16_t*)code)[i]);
     }
 };
@@ -356,7 +478,8 @@ struct QuantizerFP16<8> : QuantizerFP16<1> {
     QuantizerFP16(size_t d, const std::vector<float>& trained)
             : QuantizerFP16<1>(d, trained) {}
-    __m256 reconstruct_8_components(const uint8_t* code, int i) const {
+    FAISS_ALWAYS_INLINE __m256
+    reconstruct_8_components(const uint8_t* code, int i) const {
         __m128i codei = _mm_loadu_si128((const __m128i*)(code + 2 * i));
         return _mm256_cvtph_ps(codei);
     }
@@ -364,6 +487,23 @@ struct QuantizerFP16<8> : QuantizerFP16<1> {
 #endif
+#ifdef __aarch64__
+template <>
+struct QuantizerFP16<8> : QuantizerFP16<1> {
+    QuantizerFP16(size_t d, const std::vector<float>& trained)
+            : QuantizerFP16<1>(d, trained) {}
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        uint16x4x2_t codei = vld2_u16((const uint16_t*)(code + 2 * i));
+        return vzipq_f32(
+                vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])),
+                vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1])));
+    }
+};
+#endif
 /*******************************************************************
  * 8bit_direct quantizer
  *******************************************************************/
@@ -390,7 +530,8 @@ struct Quantizer8bitDirect<1> : ScalarQuantizer::SQuantizer {
         }
     }
-    float reconstruct_component(const uint8_t* code, int i) const {
+    FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i)
+            const {
         return code[i];
     }
 };
@@ -402,7 +543,8 @@ struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> {
     Quantizer8bitDirect(size_t d, const std::vector<float>& trained)
             : Quantizer8bitDirect<1>(d, trained) {}
-    __m256 reconstruct_8_components(const uint8_t* code, int i) const {
+    FAISS_ALWAYS_INLINE __m256
+    reconstruct_8_components(const uint8_t* code, int i) const {
         __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8
         __m256i y8 = _mm256_cvtepu8_epi32(x8);              // 8 * int32
         return _mm256_cvtepi32_ps(y8);                      // 8 * float32
@@ -411,6 +553,28 @@ struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> {
 #endif
+#ifdef __aarch64__
+template <>
+struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> {
+    Quantizer8bitDirect(size_t d, const std::vector<float>& trained)
+            : Quantizer8bitDirect<1>(d, trained) {}
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        float32_t result[8] = {};
+        for (size_t j = 0; j < 8; j++) {
+            result[j] = code[i + j];
+        }
+        float32x4_t res1 = vld1q_f32(result);
+        float32x4_t res2 = vld1q_f32(result + 4);
+        float32x4x2_t res = vzipq_f32(res1, res2);
+        return vuzpq_f32(res.val[0], res.val[1]);
+    }
+};
+#endif
 template <int SIMDWIDTH>
 ScalarQuantizer::SQuantizer* select_quantizer_1(
         QuantizerType qtype,
@@ -486,7 +650,7 @@ void train_Uniform(
     } else if (rs == ScalarQuantizer::RS_quantiles) {
         std::vector<float> x_copy(n);
         memcpy(x_copy.data(), x, n * sizeof(*x));
-        // TODO just do a qucikselect
+        // TODO just do a quickselect
         std::sort(x_copy.begin(), x_copy.end());
         int o = int(rs_arg * n);
         if (o < 0)
@@ -632,22 +796,22 @@ struct SimilarityL2<1> {
     float accu;
-    void begin() {
+    FAISS_ALWAYS_INLINE void begin() {
         accu = 0;
         yi = y;
     }
-    void add_component(float x) {
+    FAISS_ALWAYS_INLINE void add_component(float x) {
         float tmp = *yi++ - x;
         accu += tmp * tmp;
     }
-    void add_component_2(float x1, float x2) {
+    FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) {
         float tmp = x1 - x2;
         accu += tmp * tmp;
     }
-    float result() {
+    FAISS_ALWAYS_INLINE float result() {
         return accu;
     }
 };
@@ -663,34 +827,89 @@ struct SimilarityL2<8> {
     explicit SimilarityL2(const float* y) : y(y) {}
     __m256 accu8;
-    void begin_8() {
+    FAISS_ALWAYS_INLINE void begin_8() {
         accu8 = _mm256_setzero_ps();
         yi = y;
     }
-    void add_8_components(__m256 x) {
+    FAISS_ALWAYS_INLINE void add_8_components(__m256 x) {
         __m256 yiv = _mm256_loadu_ps(yi);
         yi += 8;
         __m256 tmp = _mm256_sub_ps(yiv, x);
-        accu8 = _mm256_add_ps(accu8, _mm256_mul_ps(tmp, tmp));
+        accu8 = _mm256_fmadd_ps(tmp, tmp, accu8);
     }
-    void add_8_components_2(__m256 x, __m256 y) {
-        __m256 tmp = _mm256_sub_ps(y, x);
-        accu8 = _mm256_add_ps(accu8, _mm256_mul_ps(tmp, tmp));
+    FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x, __m256 y_2) {
+        __m256 tmp = _mm256_sub_ps(y_2, x);
+        accu8 = _mm256_fmadd_ps(tmp, tmp, accu8);
     }
-    float result_8() {
-        __m256 sum = _mm256_hadd_ps(accu8, accu8);
-        __m256 sum2 = _mm256_hadd_ps(sum, sum);
-        // now add the 0th and 4th component
-        return _mm_cvtss_f32(_mm256_castps256_ps128(sum2)) +
-                _mm_cvtss_f32(_mm256_extractf128_ps(sum2, 1));
+    FAISS_ALWAYS_INLINE float result_8() {
+        const __m128 sum = _mm_add_ps(
+                _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1));
+        const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2));
+        const __m128 v1 = _mm_add_ps(sum, v0);
+        __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1));
+        const __m128 v3 = _mm_add_ps(v1, v2);
+        return _mm_cvtss_f32(v3);
     }
 };
 #endif
+#ifdef __aarch64__
+template <>
+struct SimilarityL2<8> {
+    static constexpr int simdwidth = 8;
+    static constexpr MetricType metric_type = METRIC_L2;
+    const float *y, *yi;
+    explicit SimilarityL2(const float* y) : y(y) {}
+    float32x4x2_t accu8;
+    FAISS_ALWAYS_INLINE void begin_8() {
+        accu8 = vzipq_f32(vdupq_n_f32(0.0f), vdupq_n_f32(0.0f));
+        yi = y;
+    }
+    FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) {
+        float32x4x2_t yiv = vld1q_f32_x2(yi);
+        yi += 8;
+        float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]);
+        float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]);
+        float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0);
+        float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1);
+        float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1);
+        accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]);
+    }
+    FAISS_ALWAYS_INLINE void add_8_components_2(
+            float32x4x2_t x,
+            float32x4x2_t y) {
+        float32x4_t sub0 = vsubq_f32(y.val[0], x.val[0]);
+        float32x4_t sub1 = vsubq_f32(y.val[1], x.val[1]);
+        float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0);
+        float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1);
+        float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1);
+        accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]);
+    }
+    FAISS_ALWAYS_INLINE float result_8() {
+        float32x4_t sum_0 = vpaddq_f32(accu8.val[0], accu8.val[0]);
+        float32x4_t sum_1 = vpaddq_f32(accu8.val[1], accu8.val[1]);
+        float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0);
+        float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1);
+        return vgetq_lane_f32(sum2_0, 0) + vgetq_lane_f32(sum2_1, 0);
+    }
+};
+#endif
 template <int SIMDWIDTH>
 struct SimilarityIP {};
@@ -704,20 +923,20 @@ struct SimilarityIP<1> {
     explicit SimilarityIP(const float* y) : y(y) {}
-    void begin() {
+    FAISS_ALWAYS_INLINE void begin() {
         accu = 0;
         yi = y;
     }
-    void add_component(float x) {
+    FAISS_ALWAYS_INLINE void add_component(float x) {
         accu += *yi++ * x;
     }
-    void add_component_2(float x1, float x2) {
+    FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) {
         accu += x1 * x2;
     }
-    float result() {
+    FAISS_ALWAYS_INLINE float result() {
         return accu;
     }
 };
@@ -737,27 +956,79 @@ struct SimilarityIP<8> {
     __m256 accu8;
-    void begin_8() {
+    FAISS_ALWAYS_INLINE void begin_8() {
         accu8 = _mm256_setzero_ps();
         yi = y;
     }
-    void add_8_components(__m256 x) {
+    FAISS_ALWAYS_INLINE void add_8_components(__m256 x) {
         __m256 yiv = _mm256_loadu_ps(yi);
         yi += 8;
-        accu8 = _mm256_add_ps(accu8, _mm256_mul_ps(yiv, x));
+        accu8 = _mm256_fmadd_ps(yiv, x, accu8);
+    }
+    FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x1, __m256 x2) {
+        accu8 = _mm256_fmadd_ps(x1, x2, accu8);
+    }
+    FAISS_ALWAYS_INLINE float result_8() {
+        const __m128 sum = _mm_add_ps(
+                _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1));
+        const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2));
+        const __m128 v1 = _mm_add_ps(sum, v0);
+        __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1));
+        const __m128 v3 = _mm_add_ps(v1, v2);
+        return _mm_cvtss_f32(v3);
+    }
+};
+#endif
+#ifdef __aarch64__
+template <>
+struct SimilarityIP<8> {
+    static constexpr int simdwidth = 8;
+    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
+    const float *y, *yi;
+    explicit SimilarityIP(const float* y) : y(y) {}
+    float32x4x2_t accu8;
+    FAISS_ALWAYS_INLINE void begin_8() {
+        accu8 = vzipq_f32(vdupq_n_f32(0.0f), vdupq_n_f32(0.0f));
+        yi = y;
+    }
+    FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) {
+        float32x4x2_t yiv = vld1q_f32_x2(yi);
+        yi += 8;
+        float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]);
+        float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]);
+        float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1);
+        accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]);
     }
-    void add_8_components_2(__m256 x1, __m256 x2) {
-        accu8 = _mm256_add_ps(accu8, _mm256_mul_ps(x1, x2));
+    FAISS_ALWAYS_INLINE void add_8_components_2(
+            float32x4x2_t x1,
+            float32x4x2_t x2) {
+        float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]);
+        float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]);
+        float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1);
+        accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]);
     }
-    float result_8() {
-        __m256 sum = _mm256_hadd_ps(accu8, accu8);
-        __m256 sum2 = _mm256_hadd_ps(sum, sum);
-        // now add the 0th and 4th component
-        return _mm_cvtss_f32(_mm256_castps256_ps128(sum2)) +
-                _mm_cvtss_f32(_mm256_extractf128_ps(sum2, 1));
+    FAISS_ALWAYS_INLINE float result_8() {
+        float32x4x2_t sum_tmp = vzipq_f32(
+                vpaddq_f32(accu8.val[0], accu8.val[0]),
+                vpaddq_f32(accu8.val[1], accu8.val[1]));
+        float32x4x2_t sum = vuzpq_f32(sum_tmp.val[0], sum_tmp.val[1]);
+        float32x4x2_t sum2_tmp = vzipq_f32(
+                vpaddq_f32(sum.val[0], sum.val[0]),
+                vpaddq_f32(sum.val[1], sum.val[1]));
+        float32x4x2_t sum2 = vuzpq_f32(sum2_tmp.val[0], sum2_tmp.val[1]);
+        return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0);
     }
 };
 #endif
@@ -864,6 +1135,53 @@ struct DCTemplate<Quantizer, Similarity, 8> : SQDistanceComputer {
 #endif
+#ifdef __aarch64__
+template <class Quantizer, class Similarity>
+struct DCTemplate<Quantizer, Similarity, 8> : SQDistanceComputer {
+    using Sim = Similarity;
+    Quantizer quant;
+    DCTemplate(size_t d, const std::vector<float>& trained)
+            : quant(d, trained) {}
+    float compute_distance(const float* x, const uint8_t* code) const {
+        Similarity sim(x);
+        sim.begin_8();
+        for (size_t i = 0; i < quant.d; i += 8) {
+            float32x4x2_t xi = quant.reconstruct_8_components(code, i);
+            sim.add_8_components(xi);
+        }
+        return sim.result_8();
+    }
+    float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
+            const {
+        Similarity sim(nullptr);
+        sim.begin_8();
+        for (size_t i = 0; i < quant.d; i += 8) {
+            float32x4x2_t x1 = quant.reconstruct_8_components(code1, i);
+            float32x4x2_t x2 = quant.reconstruct_8_components(code2, i);
+            sim.add_8_components_2(x1, x2);
+        }
+        return sim.result_8();
+    }
+    void set_query(const float* x) final {
+        q = x;
+    }
+    float symmetric_dis(idx_t i, idx_t j) override {
+        return compute_code_distance(
+                codes + i * code_size, codes + j * code_size);
+    }
+    float query_to_code(const uint8_t* code) const final {
+        return compute_distance(q, code);
+    }
+};
+#endif
 /*******************************************************************
  * DistanceComputerByte: computes distances in the integer domain
  *******************************************************************/
@@ -980,6 +1298,54 @@ struct DistanceComputerByte<Similarity, 8> : SQDistanceComputer {
 #endif
+#ifdef __aarch64__
+template <class Similarity>
+struct DistanceComputerByte<Similarity, 8> : SQDistanceComputer {
+    using Sim = Similarity;
+    int d;
+    std::vector<uint8_t> tmp;
+    DistanceComputerByte(int d, const std::vector<float>&) : d(d), tmp(d) {}
+    int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
+            const {
+        int accu = 0;
+        for (int i = 0; i < d; i++) {
+            if (Sim::metric_type == METRIC_INNER_PRODUCT) {
+                accu += int(code1[i]) * code2[i];
+            } else {
+                int diff = int(code1[i]) - code2[i];
+                accu += diff * diff;
+            }
+        }
+        return accu;
+    }
+    void set_query(const float* x) final {
+        for (int i = 0; i < d; i++) {
+            tmp[i] = int(x[i]);
+        }
+    }
+    int compute_distance(const float* x, const uint8_t* code) {
+        set_query(x);
+        return compute_code_distance(tmp.data(), code);
+    }
+    float symmetric_dis(idx_t i, idx_t j) override {
+        return compute_code_distance(
+                codes + i * code_size, codes + j * code_size);
+    }
+    float query_to_code(const uint8_t* code) const final {
+        return compute_code_distance(tmp.data(), code);
+    }
+};
+#endif
 /*******************************************************************
  * select_distance_computer: runtime selection of template
  * specialization
@@ -1115,34 +1481,8 @@ void ScalarQuantizer::train(size_t n, const float* x) {
     }
 }
-void ScalarQuantizer::train_residual(
-        size_t n,
-        const float* x,
-        Index* quantizer,
-        bool by_residual,
-        bool verbose) {
-    const float* x_in = x;
-    // 100k points more than enough
-    x = fvecs_maybe_subsample(d, (size_t*)&n, 100000, x, verbose, 1234);
-    ScopeDeleter<float> del_x(x_in == x ? nullptr : x);
-    if (by_residual) {
-        std::vector<idx_t> idx(n);
-        quantizer->assign(n, x, idx.data());
-        std::vector<float> residuals(n * d);
-        quantizer->compute_residual_n(n, x, residuals.data(), idx.data());
-        train(n, residuals.data());
-    } else {
-        train(n, x);
-    }
-}
 ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const {
-#ifdef USE_F16C
+#if defined(USE_F16C) || defined(__aarch64__)
     if (d % 8 == 0) {
         return select_quantizer_1<8>(qtype, d, trained);
     } else
@@ -1173,7 +1513,7 @@ void ScalarQuantizer::decode(const uint8_t* codes, float* x, size_t n) const {
 SQDistanceComputer* ScalarQuantizer::get_distance_computer(
         MetricType metric) const {
     FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT);
-#ifdef USE_F16C
+#if defined(USE_F16C) || defined(__aarch64__)
     if (d % 8 == 0) {
         if (metric == METRIC_L2) {
             return select_distance_computer<SimilarityL2<8>>(qtype, d, trained);
@@ -1204,7 +1544,6 @@ template <class DCClass, int use_sel>
 struct IVFSQScannerIP : InvertedListScanner {
     DCClass dc;
     bool by_residual;
-    const IDSelector* sel;
     float accu0; /// added to all distances
@@ -1215,9 +1554,11 @@ struct IVFSQScannerIP : InvertedListScanner {
             bool store_pairs,
             const IDSelector* sel,
             bool by_residual)
-            : dc(d, trained), by_residual(by_residual), sel(sel), accu0(0) {
+            : dc(d, trained), by_residual(by_residual), accu0(0) {
         this->store_pairs = store_pairs;
+        this->sel = sel;
         this->code_size = code_size;
+        this->keep_max = true;
     }
     void set_query(const float* query) override {
@@ -1288,7 +1629,6 @@ struct IVFSQScannerL2 : InvertedListScanner {
     bool by_residual;
     const Index* quantizer;
-    const IDSelector* sel;
     const float* x; /// current query
     std::vector<float> tmp;
@@ -1304,10 +1644,10 @@ struct IVFSQScannerL2 : InvertedListScanner {
             : dc(d, trained),
               by_residual(by_residual),
               quantizer(quantizer),
-              sel(sel),
               x(nullptr),
               tmp(d) {
         this->store_pairs = store_pairs;
+        this->sel = sel;
         this->code_size = code_size;
     }
@@ -1509,7 +1849,7 @@ InvertedListScanner* ScalarQuantizer::select_InvertedListScanner(
         bool store_pairs,
         const IDSelector* sel,
         bool by_residual) const {
-#ifdef USE_F16C
+#if defined(USE_F16C) || defined(__aarch64__)
     if (d % 8 == 0) {
         return sel0_InvertedListScanner<8>(
                 mt, this, quantizer, store_pairs, sel, by_residual);