faiss 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/Index.h +1 -1
  5. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
  6. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
  7. data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
  8. data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
  9. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
  10. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
  11. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
  12. data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
  13. data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
  14. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
  15. data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
  16. data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
  17. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
  18. data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
  19. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
  20. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
  21. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
  22. data/vendor/faiss/faiss/factory_tools.cpp +4 -0
  23. data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
  24. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
  25. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
  26. data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
  27. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
  28. data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
  29. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
  30. data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
  31. data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
  32. data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
  33. data/vendor/faiss/faiss/impl/HNSW.h +51 -13
  34. data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
  35. data/vendor/faiss/faiss/impl/Panorama.h +11 -0
  36. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
  37. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
  38. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
  39. data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
  40. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
  41. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
  42. data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
  43. data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
  44. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
  45. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
  46. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
  47. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
  48. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
  49. data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
  50. data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
  51. data/vendor/faiss/faiss/impl/io_macros.h +25 -0
  52. data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
  53. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
  54. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
  55. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
  56. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
  57. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
  58. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
  59. data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
  60. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
  61. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
  62. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
  63. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
  64. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
  65. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
  66. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
  67. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
  68. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
  69. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
  70. data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
  71. data/vendor/faiss/faiss/index_factory.cpp +5 -1
  72. data/vendor/faiss/faiss/index_io.h +16 -0
  73. data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
  74. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
  75. data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
  76. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
  77. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
  78. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
  79. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
  80. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
  81. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
  82. data/vendor/faiss/faiss/utils/bf16.h +34 -0
  83. data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
  84. data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
  85. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
  86. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
  87. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
  88. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
  89. data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
  90. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
  91. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
  92. data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
  93. metadata +12 -2
@@ -214,6 +214,12 @@ struct QuantizerTemplate<
214
214
  return simd16float32(_mm512_fmadd_ps(
215
215
  xi, _mm512_set1_ps(this->vdiff), _mm512_set1_ps(this->vmin)));
216
216
  }
217
+
218
+ /// Raw codec decode without denormalization: returns the 16 components produced by Codec::decode_16_components(code, i) as-is, i.e. without the value * vdiff + vmin mapping that the reconstruction just above applies.
219
+ FAISS_ALWAYS_INLINE simd16float32
220
+ decode_16_raw(const uint8_t* code, int i) const {
221
+ return Codec::decode_16_components(code, i);
222
+ }
217
223
  };
218
224
 
219
225
  template <class Codec>
@@ -247,32 +253,78 @@ struct QuantizerTemplate<
247
253
  * TurboQuant MSE quantizer
248
254
  **********************************************************/
249
255
 
250
- #define DEFINE_TQMSE_AVX512_SPECIALIZATION(NBITS, INDEX_EXPR) \
251
- template <> \
252
- struct QuantizerTurboQuantMSE<NBITS, SIMDLevel::AVX512> \
253
- : QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE> { \
254
- using Base = QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE>; \
255
- \
256
- QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained) \
257
- : Base(d, trained) { \
258
- assert(d % 16 == 0); \
259
- } \
260
- \
261
- FAISS_ALWAYS_INLINE simd16float32 \
262
- reconstruct_16_components(const uint8_t* code, int i) const { \
263
- const __m512i indices = (INDEX_EXPR); \
264
- return simd16float32(_mm512_i32gather_ps( \
265
- indices, this->centroids, sizeof(float))); \
266
- } \
267
- }
268
-
269
- DEFINE_TQMSE_AVX512_SPECIALIZATION(1, unpack_16x1bit_to_u32(code, i));
270
- DEFINE_TQMSE_AVX512_SPECIALIZATION(2, unpack_16x2bit_to_u32(code, i));
271
- DEFINE_TQMSE_AVX512_SPECIALIZATION(3, unpack_16x3bit_to_u32(code, i));
272
- DEFINE_TQMSE_AVX512_SPECIALIZATION(4, unpack_16x4bit_to_u32(code, i));
273
-
274
- #undef DEFINE_TQMSE_AVX512_SPECIALIZATION
256
+ // 1-bit MSE AVX512: 16 comparisons → 2 bytes via mask compare. This NBITS = 1 specialization derives from the SIMDLevel::NONE quantizer and overrides encode_vector / decode_vector with loops that handle 16 components per iteration.
257
+ template <>
258
+ struct QuantizerTurboQuantMSE<1, SIMDLevel::AVX512>
259
+ : QuantizerTurboQuantMSE<1, SIMDLevel::NONE> {
260
+ using Base = QuantizerTurboQuantMSE<1, SIMDLevel::NONE>;
261
+
262
+ QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained)
263
+ : Base(d, trained) {
264
+ assert(d % 16 == 0); // the loops below advance 16 components per iteration; only checked when assertions are compiled in
265
+ }
266
+
267
+ FAISS_ALWAYS_INLINE simd16float32
268
+ reconstruct_16_components(const uint8_t* code, int i) const {
269
+ return simd16float32(_mm512_i32gather_ps(
270
+ unpack_16x1bit_to_u32(code, i), // 16 gather indices; presumably one per 1-bit code starting at component i (helper not visible here)
271
+ this->centroids, // base address of the lookup table being gathered from
272
+ sizeof(float))); // scale: each index addresses one float element
273
+ }
274
+
275
+ void encode_vector(const float* x, uint8_t* code) const final {
276
+ __m512 boundary = _mm512_set1_ps(this->boundaries[0]); // single threshold broadcast to all 16 lanes
277
+ for (size_t i = 0; i < this->d; i += 16) {
278
+ __m512 vals = _mm512_loadu_ps(x + i);
279
+ __mmask16 mask = _mm512_cmp_ps_mask(vals, boundary, _CMP_GT_OQ); // mask bit j = (x[i + j] > boundary); ordered compare, so a NaN input yields 0
280
+ uint16_t bits = _cvtmask16_u32(mask);
281
+ memcpy(code + i / 8, &bits, sizeof(uint16_t)); // 16 one-bit codes = 2 bytes, written at byte offset i / 8
282
+ }
283
+ }
284
+
285
+ void decode_vector(const uint8_t* code, float* x) const final {
286
+ for (size_t i = 0; i < this->d; i += 16) {
287
+ simd16float32 xi =
288
+ reconstruct_16_components(code, static_cast<int>(i));
289
+ _mm512_storeu_ps(x + i, xi.f); // unaligned store of the 16 reconstructed floats
290
+ }
291
+ }
292
+ };
275
293
 
294
+ // 2-4 bit MSE AVX512: decode via gather, encode stays scalar. The macro generates one specialization per NBITS with a constructor, reconstruct_16_components and decode_vector only; it adds no encode_vector override. UNPACK_EXPR is pasted inside reconstruct_16_components, so it may refer to that function's `code` and `i` parameters.
295
+ #define DEFINE_TQMSE_AVX512_MULTIBIT(NBITS, UNPACK_EXPR) \
296
+ template <> \
297
+ struct QuantizerTurboQuantMSE<NBITS, SIMDLevel::AVX512> \
298
+ : QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE> { \
299
+ using Base = QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE>; \
300
+ \
301
+ QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained) \
302
+ : Base(d, trained) { \
303
+ assert(d % 16 == 0); \
304
+ } \
305
+ \
306
+ FAISS_ALWAYS_INLINE simd16float32 \
307
+ reconstruct_16_components(const uint8_t* code, int i) const { \
308
+ return simd16float32(_mm512_i32gather_ps( \
309
+ (UNPACK_EXPR), this->centroids, sizeof(float))); \
310
+ } \
311
+ \
312
+ void decode_vector(const uint8_t* code, float* x) const final { \
313
+ for (size_t i = 0; i < this->d; i += 16) { \
314
+ simd16float32 xi = \
315
+ reconstruct_16_components(code, static_cast<int>(i)); \
316
+ _mm512_storeu_ps(x + i, xi.f); \
317
+ } \
318
+ } \
319
+ }
320
+
321
+ DEFINE_TQMSE_AVX512_MULTIBIT(2, unpack_16x2bit_to_u32(code, i)); // NBITS = 2 specialization
322
+ DEFINE_TQMSE_AVX512_MULTIBIT(3, unpack_16x3bit_to_u32(code, i)); // NBITS = 3 specialization
323
+ DEFINE_TQMSE_AVX512_MULTIBIT(4, unpack_16x4bit_to_u32(code, i)); // NBITS = 4 specialization
324
+
325
+ #undef DEFINE_TQMSE_AVX512_MULTIBIT
326
+
327
+ // 8-bit MSE AVX512
276
328
  template <>
277
329
  struct QuantizerTurboQuantMSE<8, SIMDLevel::AVX512>
278
330
  : QuantizerTurboQuantMSE<8, SIMDLevel::NONE> {
@@ -291,6 +343,14 @@ struct QuantizerTurboQuantMSE<8, SIMDLevel::AVX512>
291
343
  return simd16float32(
292
344
  _mm512_i32gather_ps(indices, this->centroids, sizeof(float)));
293
345
  }
346
+
347
+ void decode_vector(const uint8_t* code, float* x) const final { // writes this->d reconstructed floats to x, 16 per iteration
348
+ for (size_t i = 0; i < this->d; i += 16) {
349
+ simd16float32 xi =
350
+ reconstruct_16_components(code, static_cast<int>(i)); // components i .. i + 15 of this code
351
+ _mm512_storeu_ps(x + i, xi.f); // unaligned store of 16 floats
352
+ }
353
+ }
294
354
  };
295
355
 
296
356
  /**********************************************************
@@ -411,6 +471,22 @@ struct SimilarityL2<SIMDLevel::AVX512> {
411
471
  FAISS_ALWAYS_INLINE float result_16() { // final scalar value of the 16-lane accumulation
412
472
  return horizontal_add(accu16); // presumably the sum of the lanes of accu16 (helper not visible here)
413
473
  }
474
+
/// Prepare a query for L2 distance evaluation against raw
/// (non-denormalized) codes.
///
/// With a uniform quantizer whose reconstruction is
/// vmin + vdiff * raw, the squared L2 distance factors as
///   sum_i (x[i] - vmin - vdiff * raw[i])^2
///     = vdiff^2 * sum_i ((x[i] - vmin) / vdiff - raw[i])^2,
/// so the query is mapped into the raw code domain once and the caller
/// evaluates bias + scale_factor * (raw-domain squared distance).
///
/// @param x            query vector, d floats
/// @param q_adj        output, d floats: the query mapped to the raw domain
/// @param d            number of dimensions
/// @param vmin         offset of the uniform quantizer
/// @param vdiff        scale of the uniform quantizer
/// @param scale_factor output: vdiff * vdiff
/// @param bias         output: 0, except in the degenerate vdiff == 0
///                     case described below
static void adjust_query_for_raw_decode(
        const float* x,
        float* q_adj,
        size_t d,
        float vmin,
        float vdiff,
        float& scale_factor,
        float& bias) {
    scale_factor = vdiff * vdiff;
    bias = 0;

    if (vdiff == 0) {
        // Degenerate range: every component reconstructs to vmin, so the
        // distance no longer depends on the code. scale_factor is 0 and
        // cancels the raw-domain term; carry the exact constant distance
        // sum_i (x[i] - vmin)^2 in bias instead of reporting 0.
        for (size_t i = 0; i < d; i++) {
            const float delta = x[i] - vmin;
            q_adj[i] = 0.0f;
            bias += delta * delta;
        }
        return;
    }

    const float inv_vdiff = 1.0f / vdiff;
    for (size_t i = 0; i < d; i++) {
        q_adj[i] = (x[i] - vmin) * inv_vdiff;
    }
}
414
490
  };
415
491
 
416
492
  template <>
@@ -445,6 +521,23 @@ struct SimilarityIP<SIMDLevel::AVX512> {
445
521
  FAISS_ALWAYS_INLINE float result_16() { // final scalar value of the 16-lane accumulation
446
522
  return horizontal_add(accu16); // presumably the sum of the lanes of accu16 (helper not visible here)
447
523
  }
524
+
525
+ static void adjust_query_for_raw_decode( // inner-product variant: copies the query and derives the constants that turn a raw-code inner product into the real one
526
+ const float* x, // query vector, d floats
527
+ float* q_adj, // output, d floats: receives an unmodified copy of x
528
+ size_t d, // number of dimensions
529
+ float vmin, // quantizer offset
530
+ float vdiff, // quantizer scale
531
+ float& scale_factor, // output: set to vdiff
532
+ float& bias) { // output: set to vmin * sum(x)
533
+ float sum_q = 0;
534
+ for (size_t i = 0; i < d; i++) {
535
+ q_adj[i] = x[i];
536
+ sum_q += x[i];
537
+ }
538
+ scale_factor = vdiff; // with decoded[i] = vmin + vdiff * raw[i]: sum x[i] * decoded[i] = vmin * sum(x) + vdiff * sum(x[i] * raw[i])
539
+ bias = vmin * sum_q;
540
+ }
448
541
  };
449
542
 
450
543
  /**********************************************************
@@ -458,8 +551,23 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX512>
458
551
 
459
552
  Quantizer quant;
460
553
 
554
+ // Pre-adjusted query buffer for uniform quantizers
555
+ std::vector<float> q_adj;
556
+ float scale_factor = 0; // written by set_query; multiplies the raw similarity in query_to_code_predecoded
557
+ float bias = 0; // written by set_query; added to the scaled similarity in query_to_code_predecoded
558
+
559
+ static constexpr bool has_decode_raw() { // compile-time probe: true iff decode_16_raw(const uint8_t*, int) can be called on a const Quantizer (uses a C++20 requires-expression)
560
+ return requires(const Quantizer& q, const uint8_t* c, int i) {
561
+ { q.decode_16_raw(c, i) };
562
+ };
563
+ }
564
+
461
565
  DCTemplate(size_t d, const std::vector<float>& trained)
462
- : quant(d, trained) {}
566
+ : quant(d, trained) {
567
+ if constexpr (has_decode_raw()) { // only quantizers that expose decode_16_raw use the pre-adjusted query
568
+ q_adj.resize(d); // one float per dimension, allocated once here; set_query overwrites the contents
569
+ }
570
+ }
463
571
 
464
572
  float compute_distance(const float* x, const uint8_t* code) const {
465
573
  Similarity sim(x);
@@ -485,6 +593,26 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX512>
485
593
 
486
594
  void set_query(const float* x) final { // remembers the query and, for quantizers with decode_16_raw, precomputes q_adj / scale_factor / bias
487
595
  q = x; // stores the pointer only; x is not copied
596
+ if constexpr (has_decode_raw()) {
597
+ Sim::adjust_query_for_raw_decode( // NOTE(review): Sim is declared outside this hunk, presumably an alias of Similarity; confirm
598
+ x,
599
+ q_adj.data(),
600
+ quant.d,
601
+ quant.vmin,
602
+ quant.vdiff,
603
+ scale_factor,
604
+ bias);
605
+ }
606
+ }
607
+
608
+ float query_to_code_predecoded(const uint8_t* code) const { // similarity between the query prepared by set_query and one code, using raw (non-denormalized) decoded components
609
+ Similarity sim(q_adj.data()); // accumulate against the pre-adjusted query, not the original q
610
+ sim.begin_16();
611
+ for (size_t i = 0; i < quant.d; i += 16) {
612
+ simd16float32 xi = quant.decode_16_raw(code, i); // i is size_t here while has_decode_raw probes an int index, so this converts implicitly
613
+ sim.add_16_components(xi);
614
+ }
615
+ return bias + scale_factor * sim.result_16(); // map the raw-domain result back using the constants computed in set_query
488
616
  }
489
617
 
490
618
  float symmetric_dis(idx_t i, idx_t j) override {
@@ -493,7 +621,11 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX512>
493
621
  }
494
622
 
495
623
  float query_to_code(const uint8_t* code) const final { // chooses the evaluation path at compile time
496
- return compute_distance(q, code);
624
+ if constexpr (has_decode_raw()) {
625
+ return query_to_code_predecoded(code); // quantizer offers decode_16_raw: use the pre-adjusted query
626
+ } else {
627
+ return compute_distance(q, code); // otherwise: same call as before this change
628
+ }
497
629
  }
498
630
 
499
631
  void query_to_codes_batch_4(
@@ -588,10 +720,50 @@ struct DistanceComputerByte<Similarity, SIMDLevel::AVX512>
588
720
  }
589
721
  };
590
722
 
723
+ /**********************************************************
724
+ * TurboQuant masked_sum AVX512 specialization
725
+ **********************************************************/
726
+
727
+ template <SIMDLevel SL0>
728
+ float turboq_masked_sum(const float* arr, const uint8_t* bits, size_t d); // primary template, declared only here; returns the sum of arr[j] for j < d whose bit j is set in the packed bit array `bits`
729
+
730
+ template <>
731
+ float turboq_masked_sum<SIMDLevel::AVX512>(
732
+ const float* arr,
733
+ const uint8_t* bits,
734
+ size_t d) {
735
+ __m512 acc = _mm512_setzero_ps();
736
+ size_t i = 0;
737
+ size_t full_16 = (d / 16) * 16; // largest multiple of 16 not exceeding d
738
+ for (; i < full_16; i += 16) {
739
+ uint16_t mask16;
740
+ memcpy(&mask16, bits + i / 8, sizeof(mask16)); // 16 selector bits = 2 bytes at byte offset i / 8
741
+ __mmask16 k = _cvtu32_mask16(mask16);
742
+ __m512 vals = _mm512_loadu_ps(arr + i);
743
+ acc = _mm512_mask_add_ps(acc, k, acc, vals); // lanes with a set mask bit add vals; the others keep acc unchanged
744
+ }
745
+ float result = _mm512_reduce_add_ps(acc);
746
+ if (i < d) { // 1 to 15 trailing elements
747
+ size_t remaining = d - i;
748
+ __mmask16 tail_mask = _cvtu32_mask16((1u << remaining) - 1); // enables the low `remaining` lanes
749
+ __m512 tail_vals = _mm512_maskz_loadu_ps(tail_mask, arr + i); // lanes outside tail_mask are zeroed
750
+ uint16_t bits_tail = 0;
751
+ size_t bytes_remaining = (remaining + 7) / 8; // 1 or 2 bytes hold the remaining selector bits
752
+ memcpy(&bits_tail, bits + i / 8, bytes_remaining);
753
+ __mmask16 bits_k = _cvtu32_mask16(bits_tail);
754
+ __mmask16 combined = _kand_mask16(tail_mask, bits_k); // ignore selector bits at positions >= d
755
+ __m512 masked_tail = _mm512_maskz_mov_ps(combined, tail_vals);
756
+ result += _mm512_reduce_add_ps(masked_tail);
757
+ }
758
+ return result;
759
+ }
760
+
591
761
  } // namespace scalar_quantizer
592
762
  } // namespace faiss
593
763
 
764
+ #ifndef SQ_AVX512_SKIP_DISPATCH
594
765
  #define THE_LEVEL_TO_DISPATCH SIMDLevel::AVX512
595
766
  #include <faiss/impl/scalar_quantizer/sq-dispatch.h>
767
+ #endif
596
768
 
597
769
  #endif // COMPILE_SIMD_AVX512