faiss 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/Index.h +1 -1
  5. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
  6. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
  7. data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
  8. data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
  9. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
  10. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
  11. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
  12. data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
  13. data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
  14. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
  15. data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
  16. data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
  17. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
  18. data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
  19. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
  20. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
  21. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
  22. data/vendor/faiss/faiss/factory_tools.cpp +4 -0
  23. data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
  24. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
  25. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
  26. data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
  27. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
  28. data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
  29. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
  30. data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
  31. data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
  32. data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
  33. data/vendor/faiss/faiss/impl/HNSW.h +51 -13
  34. data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
  35. data/vendor/faiss/faiss/impl/Panorama.h +11 -0
  36. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
  37. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
  38. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
  39. data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
  40. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
  41. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
  42. data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
  43. data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
  44. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
  45. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
  46. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
  47. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
  48. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
  49. data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
  50. data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
  51. data/vendor/faiss/faiss/impl/io_macros.h +25 -0
  52. data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
  53. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
  54. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
  55. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
  56. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
  57. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
  58. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
  59. data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
  60. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
  61. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
  62. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
  63. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
  64. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
  65. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
  66. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
  67. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
  68. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
  69. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
  70. data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
  71. data/vendor/faiss/faiss/index_factory.cpp +5 -1
  72. data/vendor/faiss/faiss/index_io.h +16 -0
  73. data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
  74. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
  75. data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
  76. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
  77. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
  78. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
  79. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
  80. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
  81. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
  82. data/vendor/faiss/faiss/utils/bf16.h +34 -0
  83. data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
  84. data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
  85. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
  86. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
  87. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
  88. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
  89. data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
  90. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
  91. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
  92. data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
  93. metadata +12 -2
@@ -196,6 +196,12 @@ struct QuantizerTemplate<
196
196
  return simd8float32(_mm256_fmadd_ps(
197
197
  xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin)));
198
198
  }
199
+
200
+ /// Raw codec decode without denormalization
201
+ FAISS_ALWAYS_INLINE simd8float32
202
+ decode_8_raw(const uint8_t* code, int i) const {
203
+ return Codec::decode_8_components(code, i);
204
+ }
199
205
  };
200
206
 
201
207
  template <class Codec>
@@ -229,32 +235,139 @@ struct QuantizerTemplate<
229
235
  * TurboQuant MSE quantizer
230
236
  **********************************************************/
231
237
 
232
- #define DEFINE_TQMSE_AVX2_SPECIALIZATION(NBITS, INDEX_EXPR) \
233
- template <> \
234
- struct QuantizerTurboQuantMSE<NBITS, SIMDLevel::AVX2> \
235
- : QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE> { \
236
- using Base = QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE>; \
237
- \
238
- QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained) \
239
- : Base(d, trained) { \
240
- assert(d % 8 == 0); \
241
- } \
242
- \
243
- FAISS_ALWAYS_INLINE simd8float32 \
244
- reconstruct_8_components(const uint8_t* code, int i) const { \
245
- const __m256i indices = (INDEX_EXPR); \
246
- return simd8float32(_mm256_i32gather_ps( \
247
- this->centroids, indices, sizeof(float))); \
248
- } \
249
- }
250
-
251
- DEFINE_TQMSE_AVX2_SPECIALIZATION(1, unpack_8x1bit_to_u32(code, i));
252
- DEFINE_TQMSE_AVX2_SPECIALIZATION(2, unpack_8x2bit_to_u32(code, i));
253
- DEFINE_TQMSE_AVX2_SPECIALIZATION(3, unpack_8x3bit_to_u32(code, i));
254
- DEFINE_TQMSE_AVX2_SPECIALIZATION(4, unpack_8x4bit_to_u32(code, i));
255
-
256
- #undef DEFINE_TQMSE_AVX2_SPECIALIZATION
238
+ // 1-bit MSE: boundary is always at centroids midpoint.
239
+ // Encode: 8 comparisons → 1 byte via movemask.
240
+ // Decode: gather 8 centroids via index unpack.
241
+ // NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization,facebook-hte-ShadowingClass)
242
+ template <>
243
+ struct QuantizerTurboQuantMSE<1, SIMDLevel::AVX2>
244
+ : QuantizerTurboQuantMSE<1, SIMDLevel::NONE> {
245
+ using Base = QuantizerTurboQuantMSE<1, SIMDLevel::NONE>;
246
+
247
+ QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained)
248
+ : Base(d, trained) {
249
+ assert(d % 8 == 0);
250
+ }
251
+
252
+ FAISS_ALWAYS_INLINE simd8float32
253
+ reconstruct_8_components(const uint8_t* code, int i) const {
254
+ return simd8float32(_mm256_i32gather_ps(
255
+ this->centroids, unpack_8x1bit_to_u32(code, i), sizeof(float)));
256
+ }
257
+
258
+ void encode_vector(const float* x, uint8_t* code) const final {
259
+ __m256 boundary = _mm256_set1_ps(this->boundaries[0]);
260
+ for (size_t i = 0; i < this->d; i += 8) {
261
+ __m256 vals = _mm256_loadu_ps(x + i);
262
+ int mask = _mm256_movemask_ps(
263
+ _mm256_cmp_ps(vals, boundary, _CMP_GT_OQ));
264
+ code[i / 8] = static_cast<uint8_t>(mask);
265
+ }
266
+ }
267
+
268
+ void decode_vector(const uint8_t* code, float* x) const final {
269
+ for (size_t i = 0; i < this->d; i += 8) {
270
+ simd8float32 xi =
271
+ reconstruct_8_components(code, static_cast<int>(i));
272
+ _mm256_storeu_ps(x + i, xi.f);
273
+ }
274
+ }
275
+ };
276
+
277
+ // 2-bit MSE: 4 centroids, 3 boundaries.
278
+ // Encode: branchless index = sum of 3 comparisons per component.
279
+ // Decode: gather via index unpack.
280
+ // NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization,facebook-hte-ShadowingClass)
281
+ template <>
282
+ struct QuantizerTurboQuantMSE<2, SIMDLevel::AVX2>
283
+ : QuantizerTurboQuantMSE<2, SIMDLevel::NONE> {
284
+ using Base = QuantizerTurboQuantMSE<2, SIMDLevel::NONE>;
285
+
286
+ QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained)
287
+ : Base(d, trained) {
288
+ assert(d % 8 == 0);
289
+ }
290
+
291
+ FAISS_ALWAYS_INLINE simd8float32
292
+ reconstruct_8_components(const uint8_t* code, int i) const {
293
+ return simd8float32(_mm256_i32gather_ps(
294
+ this->centroids, unpack_8x2bit_to_u32(code, i), sizeof(float)));
295
+ }
296
+
297
+ void encode_vector(const float* x, uint8_t* code) const final {
298
+ // 3 boundaries → branchless: idx = (x>b0) + (x>b1) + (x>b2)
299
+ // _mm256_cmp_ps returns all-ones (-1 as int32) for true,
300
+ // so we negate the sum to get positive indices.
301
+ __m256 b0 = _mm256_set1_ps(this->boundaries[0]);
302
+ __m256 b1 = _mm256_set1_ps(this->boundaries[1]);
303
+ __m256 b2 = _mm256_set1_ps(this->boundaries[2]);
304
+ for (size_t i = 0; i < this->d; i += 8) {
305
+ __m256 vals = _mm256_loadu_ps(x + i);
306
+ __m256i gt0 =
307
+ _mm256_castps_si256(_mm256_cmp_ps(vals, b0, _CMP_GT_OQ));
308
+ __m256i gt1 =
309
+ _mm256_castps_si256(_mm256_cmp_ps(vals, b1, _CMP_GT_OQ));
310
+ __m256i gt2 =
311
+ _mm256_castps_si256(_mm256_cmp_ps(vals, b2, _CMP_GT_OQ));
312
+ // Each gt is 0 or -1 (0xFFFFFFFF). Sum = -(index).
313
+ __m256i idx = _mm256_sub_epi32(
314
+ _mm256_setzero_si256(),
315
+ _mm256_add_epi32(_mm256_add_epi32(gt0, gt1), gt2));
316
+ // Pack 8 x 2-bit indices into 2 bytes.
317
+ // Store to temp array and pack scalarly - faster than
318
+ // extract+permute.
319
+ alignas(32) int32_t idx_array[8];
320
+ _mm256_store_si256((__m256i*)idx_array, idx);
321
+ for (int j = 0; j < 8; j++) {
322
+ this->encode_index(
323
+ static_cast<uint8_t>(idx_array[j] & 0x3), code, i + j);
324
+ }
325
+ }
326
+ }
327
+
328
+ void decode_vector(const uint8_t* code, float* x) const final {
329
+ for (size_t i = 0; i < this->d; i += 8) {
330
+ simd8float32 xi =
331
+ reconstruct_8_components(code, static_cast<int>(i));
332
+ _mm256_storeu_ps(x + i, xi.f);
333
+ }
334
+ }
335
+ };
257
336
 
337
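+ // 3-bit and 4-bit MSE: only decode is specialized here (gather via index unpack); encode is not overridden and comes from the SIMDLevel::NONE base, which presumably uses the branchless comparison chain.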
338
+ // k centroids → k-1 boundaries → idx = sum of k-1 comparisons.
339
+ #define DEFINE_TQMSE_AVX2_MULTIBIT(NBITS, UNPACK_EXPR) \
340
+ template <> \
341
+ struct QuantizerTurboQuantMSE<NBITS, SIMDLevel::AVX2> \
342
+ : QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE> { \
343
+ using Base = QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE>; \
344
+ \
345
+ QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained) \
346
+ : Base(d, trained) { \
347
+ assert(d % 8 == 0); \
348
+ } \
349
+ \
350
+ FAISS_ALWAYS_INLINE simd8float32 \
351
+ reconstruct_8_components(const uint8_t* code, int i) const { \
352
+ return simd8float32(_mm256_i32gather_ps( \
353
+ this->centroids, (UNPACK_EXPR), sizeof(float))); \
354
+ } \
355
+ \
356
+ void decode_vector(const uint8_t* code, float* x) const final { \
357
+ for (size_t i = 0; i < this->d; i += 8) { \
358
+ simd8float32 xi = \
359
+ reconstruct_8_components(code, static_cast<int>(i)); \
360
+ _mm256_storeu_ps(x + i, xi.f); \
361
+ } \
362
+ } \
363
+ }
364
+
365
+ DEFINE_TQMSE_AVX2_MULTIBIT(3, unpack_8x3bit_to_u32(code, i));
366
+ DEFINE_TQMSE_AVX2_MULTIBIT(4, unpack_8x4bit_to_u32(code, i));
367
+
368
+ #undef DEFINE_TQMSE_AVX2_MULTIBIT
369
+
370
+ // 8-bit MSE: indices are raw bytes, no bit packing.
258
371
  template <>
259
372
  struct QuantizerTurboQuantMSE<8, SIMDLevel::AVX2>
260
373
  : QuantizerTurboQuantMSE<8, SIMDLevel::NONE> {
@@ -273,6 +386,14 @@ struct QuantizerTurboQuantMSE<8, SIMDLevel::AVX2>
273
386
  return simd8float32(
274
387
  _mm256_i32gather_ps(this->centroids, indices, sizeof(float)));
275
388
  }
389
+
390
+ void decode_vector(const uint8_t* code, float* x) const final {
391
+ for (size_t i = 0; i < this->d; i += 8) {
392
+ simd8float32 xi =
393
+ reconstruct_8_components(code, static_cast<int>(i));
394
+ _mm256_storeu_ps(x + i, xi.f);
395
+ }
396
+ }
276
397
  };
277
398
 
278
399
  /**********************************************************
@@ -399,6 +520,22 @@ struct SimilarityL2<SIMDLevel::AVX2> {
399
520
  const __m128 v3 = _mm_add_ps(v1, v2);
400
521
  return _mm_cvtss_f32(v3);
401
522
  }
523
+
524
+ static void adjust_query_for_raw_decode(
525
+ const float* x,
526
+ float* q_adj,
527
+ size_t d,
528
+ float vmin,
529
+ float vdiff,
530
+ float& scale_factor,
531
+ float& bias) {
532
+ float inv_vdiff = (vdiff != 0) ? 1.0f / vdiff : 0.0f;
533
+ for (size_t i = 0; i < d; i++) {
534
+ q_adj[i] = (x[i] - vmin) * inv_vdiff;
535
+ }
536
+ scale_factor = vdiff * vdiff;
537
+ bias = 0;
538
+ }
402
539
  };
403
540
 
404
541
  template <>
@@ -442,6 +579,23 @@ struct SimilarityIP<SIMDLevel::AVX2> {
442
579
  const __m128 v3 = _mm_add_ps(v1, v2);
443
580
  return _mm_cvtss_f32(v3);
444
581
  }
582
+
583
+ static void adjust_query_for_raw_decode(
584
+ const float* x,
585
+ float* q_adj,
586
+ size_t d,
587
+ float vmin,
588
+ float vdiff,
589
+ float& scale_factor,
590
+ float& bias) {
591
+ float sum_q = 0;
592
+ for (size_t i = 0; i < d; i++) {
593
+ q_adj[i] = x[i];
594
+ sum_q += x[i];
595
+ }
596
+ scale_factor = vdiff;
597
+ bias = vmin * sum_q;
598
+ }
445
599
  };
446
600
 
447
601
  /**********************************************************
@@ -454,8 +608,23 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX2> : SQDistanceComputer {
454
608
 
455
609
  Quantizer quant;
456
610
 
611
+ // Pre-adjusted query buffer for uniform quantizers
612
+ std::vector<float> q_adj;
613
+ float scale_factor = 0;
614
+ float bias = 0;
615
+
616
+ static constexpr bool has_decode_raw() {
617
+ return requires(const Quantizer& q, const uint8_t* c, int i) {
618
+ { q.decode_8_raw(c, i) };
619
+ };
620
+ }
621
+
457
622
  DCTemplate(size_t d, const std::vector<float>& trained)
458
- : quant(d, trained) {}
623
+ : quant(d, trained) {
624
+ if constexpr (has_decode_raw()) {
625
+ q_adj.resize(d);
626
+ }
627
+ }
459
628
 
460
629
  float compute_distance(const float* x, const uint8_t* code) const {
461
630
  Similarity sim(x);
@@ -484,6 +653,26 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX2> : SQDistanceComputer {
484
653
 
485
654
  void set_query(const float* x) final {
486
655
  q = x;
656
+ if constexpr (has_decode_raw()) {
657
+ Sim::adjust_query_for_raw_decode(
658
+ x,
659
+ q_adj.data(),
660
+ quant.d,
661
+ quant.vmin,
662
+ quant.vdiff,
663
+ scale_factor,
664
+ bias);
665
+ }
666
+ }
667
+
668
+ float query_to_code_predecoded(const uint8_t* code) const {
669
+ Similarity sim(q_adj.data());
670
+ sim.begin_8();
671
+ for (size_t i = 0; i < quant.d; i += 8) {
672
+ simd8float32 xi = quant.decode_8_raw(code, static_cast<int>(i));
673
+ sim.add_8_components(xi);
674
+ }
675
+ return bias + scale_factor * sim.result_8();
487
676
  }
488
677
 
489
678
  float symmetric_dis(idx_t i, idx_t j) override {
@@ -492,7 +681,11 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX2> : SQDistanceComputer {
492
681
  }
493
682
 
494
683
  float query_to_code(const uint8_t* code) const final {
495
- return compute_distance(q, code);
684
+ if constexpr (has_decode_raw()) {
685
+ return query_to_code_predecoded(code);
686
+ } else {
687
+ return compute_distance(q, code);
688
+ }
496
689
  }
497
690
 
498
691
  void query_to_codes_batch_4(
@@ -594,6 +787,50 @@ struct DistanceComputerByte<Similarity, SIMDLevel::AVX2> : SQDistanceComputer {
594
787
  }
595
788
  };
596
789
 
790
+ /**********************************************************
791
+ * TurboQuant masked_sum AVX2 specialization
792
+ **********************************************************/
793
+
794
+ template <SIMDLevel SL0>
795
+ float turboq_masked_sum(const float* arr, const uint8_t* bits, size_t d);
796
+
797
+ template <>
798
+ float turboq_masked_sum<SIMDLevel::AVX2>(
799
+ const float* arr,
800
+ const uint8_t* bits,
801
+ size_t d) {
802
+ const __m256i bit_masks = _mm256_set_epi32(128, 64, 32, 16, 8, 4, 2, 1);
803
+ __m256 acc = _mm256_setzero_ps();
804
+ size_t full_bytes = d / 8;
805
+ for (size_t byte_idx = 0; byte_idx < full_bytes; byte_idx++) {
806
+ __m256i byte_broadcast =
807
+ _mm256_set1_epi32(static_cast<int>(bits[byte_idx]));
808
+ __m256i masked = _mm256_and_si256(byte_broadcast, bit_masks);
809
+ __m256i cmp = _mm256_cmpeq_epi32(masked, bit_masks);
810
+ __m256 mask = _mm256_castsi256_ps(cmp);
811
+ __m256 vals = _mm256_loadu_ps(arr + byte_idx * 8);
812
+ acc = _mm256_add_ps(acc, _mm256_and_ps(mask, vals));
813
+ }
814
+ __m128 hi = _mm256_extractf128_ps(acc, 1);
815
+ __m128 lo = _mm256_castps256_ps128(acc);
816
+ __m128 sum128 = _mm_add_ps(lo, hi);
817
+ __m128 shuf = _mm_movehdup_ps(sum128);
818
+ __m128 sums = _mm_add_ps(sum128, shuf);
819
+ shuf = _mm_movehl_ps(shuf, sums);
820
+ sums = _mm_add_ss(sums, shuf);
821
+ float result = _mm_cvtss_f32(sums);
822
+ size_t tail_start = full_bytes * 8;
823
+ if (tail_start < d) {
824
+ uint8_t last_byte = bits[full_bytes];
825
+ for (size_t j = tail_start; j < d; j++) {
826
+ if (last_byte & (1 << (j - tail_start))) {
827
+ result += arr[j];
828
+ }
829
+ }
830
+ }
831
+ return result;
832
+ }
833
+
597
834
  } // namespace scalar_quantizer
598
835
  } // namespace faiss
599
836