faiss 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/faiss/extconf.rb +1 -1
  4. data/lib/faiss/version.rb +1 -1
  5. data/lib/faiss.rb +2 -2
  6. data/vendor/faiss/faiss/AutoTune.cpp +15 -4
  7. data/vendor/faiss/faiss/AutoTune.h +0 -1
  8. data/vendor/faiss/faiss/Clustering.cpp +1 -5
  9. data/vendor/faiss/faiss/Clustering.h +0 -2
  10. data/vendor/faiss/faiss/IVFlib.h +0 -2
  11. data/vendor/faiss/faiss/Index.h +1 -2
  12. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +17 -3
  13. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +10 -1
  14. data/vendor/faiss/faiss/IndexBinary.h +0 -1
  15. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +2 -1
  16. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -0
  17. data/vendor/faiss/faiss/IndexBinaryHash.cpp +1 -3
  18. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +273 -48
  19. data/vendor/faiss/faiss/IndexBinaryIVF.h +18 -11
  20. data/vendor/faiss/faiss/IndexFastScan.cpp +13 -10
  21. data/vendor/faiss/faiss/IndexFastScan.h +5 -1
  22. data/vendor/faiss/faiss/IndexFlat.cpp +16 -3
  23. data/vendor/faiss/faiss/IndexFlat.h +1 -1
  24. data/vendor/faiss/faiss/IndexFlatCodes.cpp +5 -0
  25. data/vendor/faiss/faiss/IndexFlatCodes.h +7 -2
  26. data/vendor/faiss/faiss/IndexHNSW.cpp +3 -6
  27. data/vendor/faiss/faiss/IndexHNSW.h +0 -1
  28. data/vendor/faiss/faiss/IndexIDMap.cpp +4 -4
  29. data/vendor/faiss/faiss/IndexIDMap.h +0 -2
  30. data/vendor/faiss/faiss/IndexIVF.cpp +155 -129
  31. data/vendor/faiss/faiss/IndexIVF.h +121 -61
  32. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
  33. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +12 -11
  34. data/vendor/faiss/faiss/IndexIVFFastScan.h +6 -1
  35. data/vendor/faiss/faiss/IndexIVFPQ.cpp +221 -165
  36. data/vendor/faiss/faiss/IndexIVFPQ.h +1 -0
  37. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +6 -1
  38. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +0 -2
  39. data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -2
  40. data/vendor/faiss/faiss/IndexNNDescent.h +0 -1
  41. data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
  42. data/vendor/faiss/faiss/IndexPQ.cpp +7 -9
  43. data/vendor/faiss/faiss/IndexRefine.cpp +1 -1
  44. data/vendor/faiss/faiss/IndexReplicas.cpp +3 -4
  45. data/vendor/faiss/faiss/IndexReplicas.h +0 -1
  46. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +8 -1
  47. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +7 -0
  48. data/vendor/faiss/faiss/IndexShards.cpp +26 -109
  49. data/vendor/faiss/faiss/IndexShards.h +2 -3
  50. data/vendor/faiss/faiss/IndexShardsIVF.cpp +246 -0
  51. data/vendor/faiss/faiss/IndexShardsIVF.h +42 -0
  52. data/vendor/faiss/faiss/MetaIndexes.cpp +86 -0
  53. data/vendor/faiss/faiss/MetaIndexes.h +29 -0
  54. data/vendor/faiss/faiss/MetricType.h +14 -0
  55. data/vendor/faiss/faiss/VectorTransform.cpp +8 -10
  56. data/vendor/faiss/faiss/VectorTransform.h +1 -3
  57. data/vendor/faiss/faiss/clone_index.cpp +232 -18
  58. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +25 -3
  59. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +7 -0
  60. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +78 -0
  61. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +20 -6
  62. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +7 -1
  63. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +21 -7
  64. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +7 -0
  65. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +7 -0
  66. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +10 -3
  67. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +7 -1
  68. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +11 -3
  69. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +25 -2
  70. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +76 -29
  71. data/vendor/faiss/faiss/gpu/GpuCloner.h +2 -2
  72. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +14 -13
  73. data/vendor/faiss/faiss/gpu/GpuDistance.h +18 -6
  74. data/vendor/faiss/faiss/gpu/GpuIndex.h +23 -21
  75. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +10 -10
  76. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -12
  77. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +29 -50
  78. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +3 -3
  79. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +8 -8
  80. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +4 -4
  81. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +2 -5
  82. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +9 -7
  83. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +4 -4
  84. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +2 -2
  85. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +1 -1
  86. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +55 -6
  87. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +20 -6
  88. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +95 -25
  89. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +67 -16
  90. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +4 -4
  91. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +7 -7
  92. data/vendor/faiss/faiss/gpu/test/TestUtils.h +4 -4
  93. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
  94. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
  95. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +0 -7
  96. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +9 -9
  97. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
  98. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +2 -7
  99. data/vendor/faiss/faiss/impl/CodePacker.cpp +67 -0
  100. data/vendor/faiss/faiss/impl/CodePacker.h +71 -0
  101. data/vendor/faiss/faiss/impl/DistanceComputer.h +0 -2
  102. data/vendor/faiss/faiss/impl/HNSW.cpp +3 -7
  103. data/vendor/faiss/faiss/impl/HNSW.h +6 -9
  104. data/vendor/faiss/faiss/impl/IDSelector.cpp +1 -1
  105. data/vendor/faiss/faiss/impl/IDSelector.h +39 -1
  106. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +62 -51
  107. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +11 -12
  108. data/vendor/faiss/faiss/impl/NNDescent.cpp +3 -9
  109. data/vendor/faiss/faiss/impl/NNDescent.h +10 -10
  110. data/vendor/faiss/faiss/impl/NSG.cpp +1 -6
  111. data/vendor/faiss/faiss/impl/NSG.h +4 -7
  112. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +1 -15
  113. data/vendor/faiss/faiss/impl/PolysemousTraining.h +11 -10
  114. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +0 -7
  115. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -12
  116. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -4
  117. data/vendor/faiss/faiss/impl/Quantizer.h +6 -3
  118. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +796 -174
  119. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +16 -8
  120. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +3 -5
  121. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +4 -4
  122. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +3 -3
  123. data/vendor/faiss/faiss/impl/ThreadedIndex.h +4 -4
  124. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +291 -0
  125. data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +74 -0
  126. data/vendor/faiss/faiss/impl/code_distance/code_distance.h +123 -0
  127. data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +102 -0
  128. data/vendor/faiss/faiss/impl/index_read.cpp +13 -10
  129. data/vendor/faiss/faiss/impl/index_write.cpp +3 -4
  130. data/vendor/faiss/faiss/impl/kmeans1d.cpp +0 -1
  131. data/vendor/faiss/faiss/impl/kmeans1d.h +3 -3
  132. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
  133. data/vendor/faiss/faiss/impl/platform_macros.h +61 -0
  134. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +48 -4
  135. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +18 -4
  136. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +2 -2
  137. data/vendor/faiss/faiss/index_factory.cpp +8 -10
  138. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +29 -12
  139. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +8 -2
  140. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
  141. data/vendor/faiss/faiss/invlists/DirectMap.h +2 -4
  142. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +118 -18
  143. data/vendor/faiss/faiss/invlists/InvertedLists.h +44 -4
  144. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
  145. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  146. data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
  147. data/vendor/faiss/faiss/python/python_callbacks.h +1 -1
  148. data/vendor/faiss/faiss/utils/AlignedTable.h +3 -1
  149. data/vendor/faiss/faiss/utils/Heap.cpp +139 -3
  150. data/vendor/faiss/faiss/utils/Heap.h +35 -1
  151. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +84 -0
  152. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +196 -0
  153. data/vendor/faiss/faiss/utils/approx_topk/generic.h +138 -0
  154. data/vendor/faiss/faiss/utils/approx_topk/mode.h +34 -0
  155. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +367 -0
  156. data/vendor/faiss/faiss/utils/distances.cpp +61 -7
  157. data/vendor/faiss/faiss/utils/distances.h +11 -0
  158. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +346 -0
  159. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +36 -0
  160. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +42 -0
  161. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +40 -0
  162. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +352 -0
  163. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +32 -0
  164. data/vendor/faiss/faiss/utils/distances_simd.cpp +515 -327
  165. data/vendor/faiss/faiss/utils/extra_distances-inl.h +17 -1
  166. data/vendor/faiss/faiss/utils/extra_distances.cpp +37 -8
  167. data/vendor/faiss/faiss/utils/extra_distances.h +2 -1
  168. data/vendor/faiss/faiss/utils/fp16-fp16c.h +7 -0
  169. data/vendor/faiss/faiss/utils/fp16-inl.h +7 -0
  170. data/vendor/faiss/faiss/utils/fp16.h +7 -0
  171. data/vendor/faiss/faiss/utils/hamming-inl.h +0 -456
  172. data/vendor/faiss/faiss/utils/hamming.cpp +104 -120
  173. data/vendor/faiss/faiss/utils/hamming.h +21 -10
  174. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +535 -0
  175. data/vendor/faiss/faiss/utils/hamming_distance/common.h +48 -0
  176. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +519 -0
  177. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +26 -0
  178. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +614 -0
  179. data/vendor/faiss/faiss/utils/partitioning.cpp +21 -25
  180. data/vendor/faiss/faiss/utils/simdlib_avx2.h +344 -3
  181. data/vendor/faiss/faiss/utils/simdlib_emulated.h +390 -0
  182. data/vendor/faiss/faiss/utils/simdlib_neon.h +655 -130
  183. data/vendor/faiss/faiss/utils/sorting.cpp +692 -0
  184. data/vendor/faiss/faiss/utils/sorting.h +71 -0
  185. data/vendor/faiss/faiss/utils/transpose/transpose-avx2-inl.h +165 -0
  186. data/vendor/faiss/faiss/utils/utils.cpp +4 -176
  187. data/vendor/faiss/faiss/utils/utils.h +2 -9
  188. metadata +29 -3
  189. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +0 -26
@@ -33,9 +33,7 @@
33
33
 
34
34
  #include <faiss/impl/ProductQuantizer.h>
35
35
 
36
- #ifdef __AVX2__
37
- #include <immintrin.h>
38
- #endif
36
+ #include <faiss/impl/code_distance/code_distance.h>
39
37
 
40
38
  namespace faiss {
41
39
 
@@ -51,7 +49,6 @@ IndexIVFPQ::IndexIVFPQ(
51
49
  size_t nbits_per_idx,
52
50
  MetricType metric)
53
51
  : IndexIVF(quantizer, d, nlist, 0, metric), pq(d, M, nbits_per_idx) {
54
- FAISS_THROW_IF_NOT(nbits_per_idx <= 8);
55
52
  code_size = pq.code_size;
56
53
  invlists->code_size = code_size;
57
54
  is_trained = false;
@@ -198,9 +195,9 @@ void IndexIVFPQ::add_core(
198
195
 
199
196
  static float* compute_residuals(
200
197
  const Index* quantizer,
201
- Index::idx_t n,
198
+ idx_t n,
202
199
  const float* x,
203
- const Index::idx_t* list_nos) {
200
+ const idx_t* list_nos) {
204
201
  size_t d = quantizer->d;
205
202
  float* residuals = new float[n * d];
206
203
  // TODO: parallelize?
@@ -423,6 +420,7 @@ void initialize_IVFPQ_precomputed_table(
423
420
  const Index* quantizer,
424
421
  const ProductQuantizer& pq,
425
422
  AlignedTable<float>& precomputed_table,
423
+ bool by_residual,
426
424
  bool verbose) {
427
425
  size_t nlist = quantizer->ntotal;
428
426
  size_t d = quantizer->d;
@@ -434,10 +432,10 @@ void initialize_IVFPQ_precomputed_table(
434
432
  }
435
433
 
436
434
  if (use_precomputed_table == 0) { // then choose the type of table
437
- if (quantizer->metric_type == METRIC_INNER_PRODUCT) {
435
+ if (!(quantizer->metric_type == METRIC_L2 && by_residual)) {
438
436
  if (verbose) {
439
437
  printf("IndexIVFPQ::precompute_table: precomputed "
440
- "tables not needed for inner product quantizers\n");
438
+ "tables needed only for L2 metric and by_residual is enabled\n");
441
439
  }
442
440
  precomputed_table.resize(0);
443
441
  return;
@@ -516,13 +514,16 @@ void initialize_IVFPQ_precomputed_table(
516
514
 
517
515
  void IndexIVFPQ::precompute_table() {
518
516
  initialize_IVFPQ_precomputed_table(
519
- use_precomputed_table, quantizer, pq, precomputed_table, verbose);
517
+ use_precomputed_table,
518
+ quantizer,
519
+ pq,
520
+ precomputed_table,
521
+ by_residual,
522
+ verbose);
520
523
  }
521
524
 
522
525
  namespace {
523
526
 
524
- using idx_t = Index::idx_t;
525
-
526
527
  #define TIC t0 = get_cycles()
527
528
  #define TOC get_cycles() - t0
528
529
 
@@ -623,7 +624,7 @@ struct QueryTables {
623
624
  *****************************************************/
624
625
 
625
626
  // fields specific to list
626
- Index::idx_t key;
627
+ idx_t key;
627
628
  float coarse_dis;
628
629
  std::vector<uint8_t> q_code;
629
630
 
@@ -886,140 +887,29 @@ struct IVFPQScannerT : QueryTables {
886
887
  * Scaning the codes: simple PQ scan.
887
888
  *****************************************************/
888
889
 
889
- #ifdef __AVX2__
890
- /// Returns the distance to a single code.
891
- /// General-purpose version.
892
- template <class SearchResultType, typename T = PQDecoder>
893
- typename std::enable_if<!(std::is_same<T, PQDecoder8>::value), float>::
894
- type inline distance_single_code(const uint8_t* code) const {
895
- PQDecoder decoder(code, pq.nbits);
896
-
897
- const float* tab = sim_table;
898
- float result = 0;
899
-
900
- for (size_t m = 0; m < pq.M; m++) {
901
- result += tab[decoder.decode()];
902
- tab += pq.ksub;
903
- }
904
-
905
- return result;
906
- }
907
-
908
- /// Returns the distance to a single code.
909
- /// Specialized AVX2 PQDecoder8 version.
910
- template <class SearchResultType, typename T = PQDecoder>
911
- typename std::enable_if<(std::is_same<T, PQDecoder8>::value), float>::
912
- type inline distance_single_code(const uint8_t* code) const {
913
- float result = 0;
914
-
915
- size_t m = 0;
916
- const size_t pqM16 = pq.M / 16;
917
-
918
- const float* tab = sim_table;
919
-
920
- if (pqM16 > 0) {
921
- // process 16 values per loop
922
-
923
- const __m256i ksub = _mm256_set1_epi32(pq.ksub);
924
- __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
925
- offsets_0 = _mm256_mullo_epi32(offsets_0, ksub);
926
-
927
- // accumulators of partial sums
928
- __m256 partialSum = _mm256_setzero_ps();
929
-
930
- // loop
931
- for (m = 0; m < pqM16 * 16; m += 16) {
932
- // load 16 uint8 values
933
- const __m128i mm1 =
934
- _mm_loadu_si128((const __m128i_u*)(code + m));
935
- {
936
- // convert uint8 values (low part of __m128i) to int32
937
- // values
938
- const __m256i idx1 = _mm256_cvtepu8_epi32(mm1);
939
-
940
- // add offsets
941
- const __m256i indices_to_read_from =
942
- _mm256_add_epi32(idx1, offsets_0);
943
-
944
- // gather 8 values, similar to 8 operations of tab[idx]
945
- __m256 collected = _mm256_i32gather_ps(
946
- tab, indices_to_read_from, sizeof(float));
947
- tab += pq.ksub * 8;
948
-
949
- // collect partial sums
950
- partialSum = _mm256_add_ps(partialSum, collected);
951
- }
952
-
953
- // move high 8 uint8 to low ones
954
- const __m128i mm2 =
955
- _mm_unpackhi_epi64(mm1, _mm_setzero_si128());
956
- {
957
- // convert uint8 values (low part of __m128i) to int32
958
- // values
959
- const __m256i idx1 = _mm256_cvtepu8_epi32(mm2);
960
-
961
- // add offsets
962
- const __m256i indices_to_read_from =
963
- _mm256_add_epi32(idx1, offsets_0);
964
-
965
- // gather 8 values, similar to 8 operations of tab[idx]
966
- __m256 collected = _mm256_i32gather_ps(
967
- tab, indices_to_read_from, sizeof(float));
968
- tab += pq.ksub * 8;
969
-
970
- // collect partial sums
971
- partialSum = _mm256_add_ps(partialSum, collected);
972
- }
973
- }
974
-
975
- // horizontal sum for partialSum
976
- const __m256 h0 = _mm256_hadd_ps(partialSum, partialSum);
977
- const __m256 h1 = _mm256_hadd_ps(h0, h0);
978
-
979
- // extract high and low __m128 regs from __m256
980
- const __m128 h2 = _mm256_extractf128_ps(h1, 1);
981
- const __m128 h3 = _mm256_castps256_ps128(h1);
982
-
983
- // get a final hsum into all 4 regs
984
- const __m128 h4 = _mm_add_ss(h2, h3);
985
-
986
- // extract f[0] from __m128
987
- const float hsum = _mm_cvtss_f32(h4);
988
- result += hsum;
989
- }
990
-
991
- //
992
- if (m < pq.M) {
993
- // process leftovers
994
- PQDecoder decoder(code + m, pq.nbits);
995
-
996
- for (; m < pq.M; m++) {
997
- result += tab[decoder.decode()];
998
- tab += pq.ksub;
999
- }
1000
- }
1001
-
1002
- return result;
1003
- }
1004
-
1005
- #else
1006
- /// Returns the distance to a single code.
1007
- /// General-purpose version.
1008
- template <class SearchResultType>
1009
- inline float distance_single_code(const uint8_t* code) const {
1010
- PQDecoder decoder(code, pq.nbits);
1011
-
1012
- const float* tab = sim_table;
1013
- float result = 0;
1014
-
1015
- for (size_t m = 0; m < pq.M; m++) {
1016
- result += tab[decoder.decode()];
1017
- tab += pq.ksub;
1018
- }
1019
-
1020
- return result;
1021
- }
1022
- #endif
890
+ // This is the baseline version of scan_list_with_tables().
891
+ // It demonstrates what this function actually does.
892
+ //
893
+ // /// version of the scan where we use precomputed tables.
894
+ // template <class SearchResultType>
895
+ // void scan_list_with_table(
896
+ // size_t ncode,
897
+ // const uint8_t* codes,
898
+ // SearchResultType& res) const {
899
+ //
900
+ // for (size_t j = 0; j < ncode; j++, codes += pq.code_size) {
901
+ // if (res.skip_entry(j)) {
902
+ // continue;
903
+ // }
904
+ // float dis = dis0 + distance_single_code<PQDecoder>(
905
+ // pq, sim_table, codes);
906
+ // res.add(j, dis);
907
+ // }
908
+ // }
909
+
910
+ // This is the modified version of scan_list_with_tables().
911
+ // It was observed that doing manual unrolling of the loop that
912
+ // utilizes distance_single_code() speeds up the computations.
1023
913
 
1024
914
  /// version of the scan where we use precomputed tables.
1025
915
  template <class SearchResultType>
@@ -1027,12 +917,65 @@ struct IVFPQScannerT : QueryTables {
1027
917
  size_t ncode,
1028
918
  const uint8_t* codes,
1029
919
  SearchResultType& res) const {
1030
- for (size_t j = 0; j < ncode; j++, codes += pq.code_size) {
920
+ int counter = 0;
921
+
922
+ size_t saved_j[4] = {0, 0, 0, 0};
923
+ for (size_t j = 0; j < ncode; j++) {
1031
924
  if (res.skip_entry(j)) {
1032
925
  continue;
1033
926
  }
1034
- float dis = dis0 + distance_single_code<SearchResultType>(codes);
1035
- res.add(j, dis);
927
+
928
+ saved_j[0] = (counter == 0) ? j : saved_j[0];
929
+ saved_j[1] = (counter == 1) ? j : saved_j[1];
930
+ saved_j[2] = (counter == 2) ? j : saved_j[2];
931
+ saved_j[3] = (counter == 3) ? j : saved_j[3];
932
+
933
+ counter += 1;
934
+ if (counter == 4) {
935
+ float distance_0 = 0;
936
+ float distance_1 = 0;
937
+ float distance_2 = 0;
938
+ float distance_3 = 0;
939
+ distance_four_codes<PQDecoder>(
940
+ pq,
941
+ sim_table,
942
+ codes + saved_j[0] * pq.code_size,
943
+ codes + saved_j[1] * pq.code_size,
944
+ codes + saved_j[2] * pq.code_size,
945
+ codes + saved_j[3] * pq.code_size,
946
+ distance_0,
947
+ distance_1,
948
+ distance_2,
949
+ distance_3);
950
+
951
+ res.add(saved_j[0], dis0 + distance_0);
952
+ res.add(saved_j[1], dis0 + distance_1);
953
+ res.add(saved_j[2], dis0 + distance_2);
954
+ res.add(saved_j[3], dis0 + distance_3);
955
+ counter = 0;
956
+ }
957
+ }
958
+
959
+ if (counter >= 1) {
960
+ float dis =
961
+ dis0 +
962
+ distance_single_code<PQDecoder>(
963
+ pq, sim_table, codes + saved_j[0] * pq.code_size);
964
+ res.add(saved_j[0], dis);
965
+ }
966
+ if (counter >= 2) {
967
+ float dis =
968
+ dis0 +
969
+ distance_single_code<PQDecoder>(
970
+ pq, sim_table, codes + saved_j[1] * pq.code_size);
971
+ res.add(saved_j[1], dis);
972
+ }
973
+ if (counter >= 3) {
974
+ float dis =
975
+ dis0 +
976
+ distance_single_code<PQDecoder>(
977
+ pq, sim_table, codes + saved_j[2] * pq.code_size);
978
+ res.add(saved_j[2], dis);
1036
979
  }
1037
980
  }
1038
981
 
@@ -1101,6 +1044,46 @@ struct IVFPQScannerT : QueryTables {
1101
1044
  * Scanning codes with polysemous filtering
1102
1045
  *****************************************************/
1103
1046
 
1047
+ // This is the baseline version of scan_list_polysemous_hc().
1048
+ // It demonstrates what this function actually does.
1049
+
1050
+ // template <class HammingComputer, class SearchResultType>
1051
+ // void scan_list_polysemous_hc(
1052
+ // size_t ncode,
1053
+ // const uint8_t* codes,
1054
+ // SearchResultType& res) const {
1055
+ // int ht = ivfpq.polysemous_ht;
1056
+ // size_t n_hamming_pass = 0, nup = 0;
1057
+ //
1058
+ // int code_size = pq.code_size;
1059
+ //
1060
+ // HammingComputer hc(q_code.data(), code_size);
1061
+ //
1062
+ // for (size_t j = 0; j < ncode; j++, codes += code_size) {
1063
+ // if (res.skip_entry(j)) {
1064
+ // continue;
1065
+ // }
1066
+ // const uint8_t* b_code = codes;
1067
+ // int hd = hc.hamming(b_code);
1068
+ // if (hd < ht) {
1069
+ // n_hamming_pass++;
1070
+ //
1071
+ // float dis =
1072
+ // dis0 +
1073
+ // distance_single_code<PQDecoder>(
1074
+ // pq, sim_table, codes);
1075
+ //
1076
+ // res.add(j, dis);
1077
+ // }
1078
+ // }
1079
+ // #pragma omp critical
1080
+ // { indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; }
1081
+ // }
1082
+
1083
+ // This is the modified version of scan_list_with_tables().
1084
+ // It was observed that doing manual unrolling of the loop that
1085
+ // utilizes distance_single_code() speeds up the computations.
1086
+
1104
1087
  template <class HammingComputer, class SearchResultType>
1105
1088
  void scan_list_polysemous_hc(
1106
1089
  size_t ncode,
@@ -1111,23 +1094,103 @@ struct IVFPQScannerT : QueryTables {
1111
1094
 
1112
1095
  int code_size = pq.code_size;
1113
1096
 
1097
+ size_t saved_j[8];
1098
+ int counter = 0;
1099
+
1114
1100
  HammingComputer hc(q_code.data(), code_size);
1115
1101
 
1116
- for (size_t j = 0; j < ncode; j++, codes += code_size) {
1102
+ for (size_t j = 0; j < (ncode / 4) * 4; j += 4) {
1103
+ const uint8_t* b_code = codes + j * code_size;
1104
+
1105
+ // Unrolling is a key. Basically, doing multiple popcount
1106
+ // operations one after another speeds things up.
1107
+
1108
+ // 9999999 is just an arbitrary large number
1109
+ int hd0 = (res.skip_entry(j + 0))
1110
+ ? 99999999
1111
+ : hc.hamming(b_code + 0 * code_size);
1112
+ int hd1 = (res.skip_entry(j + 1))
1113
+ ? 99999999
1114
+ : hc.hamming(b_code + 1 * code_size);
1115
+ int hd2 = (res.skip_entry(j + 2))
1116
+ ? 99999999
1117
+ : hc.hamming(b_code + 2 * code_size);
1118
+ int hd3 = (res.skip_entry(j + 3))
1119
+ ? 99999999
1120
+ : hc.hamming(b_code + 3 * code_size);
1121
+
1122
+ saved_j[counter] = j + 0;
1123
+ counter = (hd0 < ht) ? (counter + 1) : counter;
1124
+ saved_j[counter] = j + 1;
1125
+ counter = (hd1 < ht) ? (counter + 1) : counter;
1126
+ saved_j[counter] = j + 2;
1127
+ counter = (hd2 < ht) ? (counter + 1) : counter;
1128
+ saved_j[counter] = j + 3;
1129
+ counter = (hd3 < ht) ? (counter + 1) : counter;
1130
+
1131
+ if (counter >= 4) {
1132
+ // process four codes at the same time
1133
+ n_hamming_pass += 4;
1134
+
1135
+ float distance_0 = dis0;
1136
+ float distance_1 = dis0;
1137
+ float distance_2 = dis0;
1138
+ float distance_3 = dis0;
1139
+ distance_four_codes<PQDecoder>(
1140
+ pq,
1141
+ sim_table,
1142
+ codes + saved_j[0] * pq.code_size,
1143
+ codes + saved_j[1] * pq.code_size,
1144
+ codes + saved_j[2] * pq.code_size,
1145
+ codes + saved_j[3] * pq.code_size,
1146
+ distance_0,
1147
+ distance_1,
1148
+ distance_2,
1149
+ distance_3);
1150
+
1151
+ res.add(saved_j[0], dis0 + distance_0);
1152
+ res.add(saved_j[1], dis0 + distance_1);
1153
+ res.add(saved_j[2], dis0 + distance_2);
1154
+ res.add(saved_j[3], dis0 + distance_3);
1155
+
1156
+ //
1157
+ counter -= 4;
1158
+ saved_j[0] = saved_j[4];
1159
+ saved_j[1] = saved_j[5];
1160
+ saved_j[2] = saved_j[6];
1161
+ saved_j[3] = saved_j[7];
1162
+ }
1163
+ }
1164
+
1165
+ for (size_t kk = 0; kk < counter; kk++) {
1166
+ n_hamming_pass++;
1167
+
1168
+ float dis =
1169
+ dis0 +
1170
+ distance_single_code<PQDecoder>(
1171
+ pq, sim_table, codes + saved_j[kk] * pq.code_size);
1172
+
1173
+ res.add(saved_j[kk], dis);
1174
+ }
1175
+
1176
+ // process leftovers
1177
+ for (size_t j = (ncode / 4) * 4; j < ncode; j++) {
1117
1178
  if (res.skip_entry(j)) {
1118
1179
  continue;
1119
1180
  }
1120
- const uint8_t* b_code = codes;
1181
+ const uint8_t* b_code = codes + j * code_size;
1121
1182
  int hd = hc.hamming(b_code);
1122
1183
  if (hd < ht) {
1123
1184
  n_hamming_pass++;
1124
1185
 
1125
- float dis =
1126
- dis0 + distance_single_code<SearchResultType>(codes);
1186
+ float dis = dis0 +
1187
+ distance_single_code<PQDecoder>(
1188
+ pq, sim_table, codes + j * code_size);
1127
1189
 
1128
1190
  res.add(j, dis);
1129
1191
  }
1130
1192
  }
1193
+
1131
1194
  #pragma omp critical
1132
1195
  { indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; }
1133
1196
  }
@@ -1171,7 +1234,7 @@ struct IVFPQScannerT : QueryTables {
1171
1234
  * use_sel: store or ignore the IDSelector
1172
1235
  */
1173
1236
  template <MetricType METRIC_TYPE, class C, class PQDecoder, bool use_sel>
1174
- struct IVFPQScanner : IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>,
1237
+ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQDecoder>,
1175
1238
  InvertedListScanner {
1176
1239
  int precompute_mode;
1177
1240
  const IDSelector* sel;
@@ -1181,9 +1244,7 @@ struct IVFPQScanner : IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>,
1181
1244
  bool store_pairs,
1182
1245
  int precompute_mode,
1183
1246
  const IDSelector* sel)
1184
- : IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>(
1185
- ivfpq,
1186
- nullptr),
1247
+ : IVFPQScannerT<idx_t, METRIC_TYPE, PQDecoder>(ivfpq, nullptr),
1187
1248
  precompute_mode(precompute_mode),
1188
1249
  sel(sel) {
1189
1250
  this->store_pairs = store_pairs;
@@ -1200,14 +1261,9 @@ struct IVFPQScanner : IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>,
1200
1261
 
1201
1262
  float distance_to_code(const uint8_t* code) const override {
1202
1263
  assert(precompute_mode == 2);
1203
- float dis = this->dis0;
1204
- const float* tab = this->sim_table;
1205
- PQDecoder decoder(code, this->pq.nbits);
1206
-
1207
- for (size_t m = 0; m < this->pq.M; m++) {
1208
- dis += tab[decoder.decode()];
1209
- tab += this->pq.ksub;
1210
- }
1264
+ float dis = this->dis0 +
1265
+ distance_single_code<PQDecoder>(
1266
+ this->pq, this->sim_table, code);
1211
1267
  return dis;
1212
1268
  }
1213
1269
 
@@ -162,6 +162,7 @@ void initialize_IVFPQ_precomputed_table(
162
162
  const Index* quantizer,
163
163
  const ProductQuantizer& pq,
164
164
  AlignedTable<float>& precomputed_table,
165
+ bool by_residual,
165
166
  bool verbose);
166
167
 
167
168
  /// statistics are robust to internal threading, but not if
@@ -156,7 +156,12 @@ void IndexIVFPQFastScan::train_residual(idx_t n, const float* x_in) {
156
156
 
157
157
  void IndexIVFPQFastScan::precompute_table() {
158
158
  initialize_IVFPQ_precomputed_table(
159
- use_precomputed_table, quantizer, pq, precomputed_table, verbose);
159
+ use_precomputed_table,
160
+ quantizer,
161
+ pq,
162
+ precomputed_table,
163
+ by_residual,
164
+ verbose);
160
165
  }
161
166
 
162
167
  /*********************************************************
@@ -213,8 +213,6 @@ struct IVFScanner : InvertedListScanner {
213
213
  std::vector<uint8_t> qcode;
214
214
  HammingComputer hc;
215
215
 
216
- using idx_t = Index::idx_t;
217
-
218
216
  IVFScanner(const IndexIVFSpectralHash* index, bool store_pairs)
219
217
  : index(index),
220
218
  nbit(index->nbit),
@@ -50,7 +50,6 @@ int sgemm_(
50
50
 
51
51
  namespace faiss {
52
52
 
53
- using idx_t = Index::idx_t;
54
53
  using storage_idx_t = NNDescent::storage_idx_t;
55
54
 
56
55
  /**************************************************************
@@ -89,7 +88,7 @@ struct NegativeDistanceComputer : DistanceComputer {
89
88
  };
90
89
 
91
90
  DistanceComputer* storage_distance_computer(const Index* storage) {
92
- if (storage->metric_type == METRIC_INNER_PRODUCT) {
91
+ if (is_similarity_metric(storage->metric_type)) {
93
92
  return new NegativeDistanceComputer(storage->get_distance_computer());
94
93
  } else {
95
94
  return storage->get_distance_computer();
@@ -25,7 +25,6 @@ struct IndexNNDescent : Index {
25
25
  using storage_idx_t = NNDescent::storage_idx_t;
26
26
 
27
27
  /// Faiss results are 64-bit
28
- using idx_t = Index::idx_t;
29
28
 
30
29
  // the link strcuture
31
30
  NNDescent nndescent;
@@ -23,7 +23,6 @@
23
23
 
24
24
  namespace faiss {
25
25
 
26
- using idx_t = Index::idx_t;
27
26
  using namespace nsg;
28
27
 
29
28
  /**************************************************************
@@ -113,7 +112,7 @@ void IndexNSG::search(
113
112
  InterruptCallback::check();
114
113
  }
115
114
 
116
- if (metric_type == METRIC_INNER_PRODUCT) {
115
+ if (is_similarity_metric(metric_type)) {
117
116
  // we need to revert the negated distances
118
117
  for (size_t i = 0; i < k * n; i++) {
119
118
  distances[i] = -distances[i];
@@ -19,6 +19,8 @@
19
19
  #include <faiss/impl/FaissAssert.h>
20
20
  #include <faiss/utils/hamming.h>
21
21
 
22
+ #include <faiss/impl/code_distance/code_distance.h>
23
+
22
24
  namespace faiss {
23
25
 
24
26
  /*********************************************************
@@ -74,22 +76,18 @@ template <class PQDecoder>
74
76
  struct PQDistanceComputer : FlatCodesDistanceComputer {
75
77
  size_t d;
76
78
  MetricType metric;
77
- Index::idx_t nb;
79
+ idx_t nb;
78
80
  const ProductQuantizer& pq;
79
81
  const float* sdc;
80
82
  std::vector<float> precomputed_table;
81
83
  size_t ndis;
82
84
 
83
85
  float distance_to_code(const uint8_t* code) final {
84
- const float* dt = precomputed_table.data();
85
- PQDecoder decoder(code, pq.nbits);
86
- float accu = 0;
87
- for (int j = 0; j < pq.M; j++) {
88
- accu += dt[decoder.decode()];
89
- dt += 1 << decoder.nbits;
90
- }
91
86
  ndis++;
92
- return accu;
87
+
88
+ float dis = distance_single_code<PQDecoder>(
89
+ pq, precomputed_table.data(), code);
90
+ return dis;
93
91
  }
94
92
 
95
93
  float symmetric_dis(idx_t i, idx_t j) override {
@@ -62,7 +62,7 @@ void IndexRefine::reset() {
62
62
 
63
63
  namespace {
64
64
 
65
- typedef faiss::Index::idx_t idx_t;
65
+ typedef faiss::idx_t idx_t;
66
66
 
67
67
  template <class C>
68
68
  static void reorder_2_heaps(
@@ -123,14 +123,13 @@ void IndexReplicasTemplate<IndexT>::search(
123
123
  size_t componentsPerVec = sizeof(component_t) == 1 ? (dim + 7) / 8 : dim;
124
124
 
125
125
  // Partition the query by the number of indices we have
126
- faiss::Index::idx_t queriesPerIndex =
127
- (faiss::Index::idx_t)(n + this->count() - 1) /
128
- (faiss::Index::idx_t)this->count();
126
+ faiss::idx_t queriesPerIndex =
127
+ (faiss::idx_t)(n + this->count() - 1) / (faiss::idx_t)this->count();
129
128
  FAISS_ASSERT(n / queriesPerIndex <= this->count());
130
129
 
131
130
  auto fn = [queriesPerIndex, componentsPerVec, n, x, k, distances, labels](
132
131
  int i, const IndexT* index) {
133
- faiss::Index::idx_t base = (faiss::Index::idx_t)i * queriesPerIndex;
132
+ faiss::idx_t base = (faiss::idx_t)i * queriesPerIndex;
134
133
 
135
134
  if (base < n) {
136
135
  auto numForIndex = std::min(queriesPerIndex, n - base);
@@ -20,7 +20,6 @@ namespace faiss {
20
20
  template <typename IndexT>
21
21
  class IndexReplicasTemplate : public ThreadedIndex<IndexT> {
22
22
  public:
23
- using idx_t = typename IndexT::idx_t;
24
23
  using component_t = typename IndexT::component_t;
25
24
  using distance_t = typename IndexT::distance_t;
26
25
 
@@ -1,3 +1,10 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
1
8
  #include <faiss/IndexRowwiseMinMax.h>
2
9
 
3
10
  #include <cstdint>
@@ -11,7 +18,7 @@ namespace faiss {
11
18
 
12
19
  namespace {
13
20
 
14
- using idx_t = faiss::Index::idx_t;
21
+ using idx_t = faiss::idx_t;
15
22
 
16
23
  struct StorageMinMaxFP16 {
17
24
  uint16_t scaler;