faiss 0.1.3 → 0.1.4

Files changed (184)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +1 -1
  6. data/lib/faiss/version.rb +1 -1
  7. data/vendor/faiss/faiss/AutoTune.cpp +36 -33
  8. data/vendor/faiss/faiss/AutoTune.h +6 -3
  9. data/vendor/faiss/faiss/Clustering.cpp +16 -12
  10. data/vendor/faiss/faiss/Index.cpp +3 -4
  11. data/vendor/faiss/faiss/Index.h +3 -3
  12. data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
  13. data/vendor/faiss/faiss/IndexBinary.h +1 -1
  14. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
  15. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
  16. data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
  17. data/vendor/faiss/faiss/IndexFlat.h +0 -51
  18. data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
  19. data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
  20. data/vendor/faiss/faiss/IndexIVF.h +22 -15
  21. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
  22. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
  24. data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
  25. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
  26. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
  27. data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
  28. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  29. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
  30. data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
  31. data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
  32. data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
  33. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
  34. data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
  35. data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
  36. data/vendor/faiss/faiss/IndexRefine.h +73 -0
  37. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
  38. data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
  39. data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
  40. data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
  41. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
  42. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
  43. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
  44. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
  47. data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
  48. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
  49. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
  50. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
  51. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
  52. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
  53. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
  54. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
  55. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
  56. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
  57. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
  58. data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
  59. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
  60. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
  61. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
  62. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  63. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
  64. data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
  65. data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
  71. data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
  72. data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
  73. data/vendor/faiss/faiss/impl/io.cpp +33 -2
  74. data/vendor/faiss/faiss/impl/io.h +7 -2
  75. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
  76. data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
  77. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
  78. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
  79. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
  81. data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
  82. data/vendor/faiss/faiss/index_factory.cpp +112 -7
  83. data/vendor/faiss/faiss/index_io.h +1 -48
  84. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
  85. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
  86. data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
  87. data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
  88. data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
  89. data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
  90. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
  91. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
  92. data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
  93. data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
  94. data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
  95. data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
  96. data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
  97. data/vendor/faiss/faiss/utils/Heap.h +61 -50
  98. data/vendor/faiss/faiss/utils/distances.cpp +164 -319
  99. data/vendor/faiss/faiss/utils/distances.h +28 -20
  100. data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
  101. data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
  102. data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
  103. data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
  104. data/vendor/faiss/faiss/utils/hamming.h +2 -7
  105. data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
  106. data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
  107. data/vendor/faiss/faiss/utils/partitioning.h +69 -0
  108. data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
  109. data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
  110. data/vendor/faiss/faiss/utils/simdlib.h +31 -0
  111. data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
  112. data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
  113. metadata +43 -141
  114. data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
  115. data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
  116. data/vendor/faiss/c_api/AutoTune_c.h +0 -66
  117. data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
  118. data/vendor/faiss/c_api/Clustering_c.h +0 -123
  119. data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
  120. data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
  121. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
  122. data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
  123. data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
  124. data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
  125. data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
  126. data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
  127. data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
  128. data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
  129. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
  130. data/vendor/faiss/c_api/IndexShards_c.h +0 -39
  131. data/vendor/faiss/c_api/Index_c.cpp +0 -105
  132. data/vendor/faiss/c_api/Index_c.h +0 -183
  133. data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
  134. data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
  135. data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
  136. data/vendor/faiss/c_api/clone_index_c.h +0 -32
  137. data/vendor/faiss/c_api/error_c.h +0 -42
  138. data/vendor/faiss/c_api/error_impl.cpp +0 -27
  139. data/vendor/faiss/c_api/error_impl.h +0 -16
  140. data/vendor/faiss/c_api/faiss_c.h +0 -58
  141. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
  142. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
  143. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
  144. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
  145. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
  146. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
  147. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
  148. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
  149. data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
  150. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
  151. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
  152. data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
  153. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
  154. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
  155. data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
  156. data/vendor/faiss/c_api/index_factory_c.h +0 -30
  157. data/vendor/faiss/c_api/index_io_c.cpp +0 -42
  158. data/vendor/faiss/c_api/index_io_c.h +0 -50
  159. data/vendor/faiss/c_api/macros_impl.h +0 -110
  160. data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
  161. data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
  162. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
  163. data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
  164. data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
  165. data/vendor/faiss/misc/test_blas.cpp +0 -87
  166. data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
  167. data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
  168. data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
  169. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
  170. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
  171. data/vendor/faiss/tests/test_merge.cpp +0 -260
  172. data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
  173. data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
  174. data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
  175. data/vendor/faiss/tests/test_params_override.cpp +0 -236
  176. data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
  177. data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
  178. data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
  179. data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
  180. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
  181. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
  182. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
  183. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
  184. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
data/vendor/faiss/faiss/IndexIVFPQ.cpp

@@ -371,7 +371,7 @@ void IndexIVFPQ::reconstruct_from_offset (int64_t list_no, int64_t offset,
 
 
 /// 2G by default, accommodates tables up to PQ32 w/ 65536 centroids
-size_t IndexIVFPQ::precomputed_table_max_bytes = ((size_t)1) << 31;
+size_t precomputed_table_max_bytes = ((size_t)1) << 31;
 
 /** Precomputed tables for residuals
  *
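Note: the 2 GiB default is not arbitrary; it is exactly the size of a type-1
table at the configuration named in the comment. A quick check of the
arithmetic (illustrative values, not code from the diff):

    // bytes = nlist * pq.M * pq.ksub * sizeof(float)
    static_assert(65536ULL * 32 * 256 * 4 == (1ULL << 31),
                  "PQ32 with 65536 centroids fills the 2 GiB budget exactly");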
@@ -403,10 +403,22 @@ size_t IndexIVFPQ::precomputed_table_max_bytes = ((size_t)1) << 31;
  * is faster when the length of the lists is > ksub * M.
  */
 
-void IndexIVFPQ::precompute_table ()
+void initialize_IVFPQ_precomputed_table (
+        int &use_precomputed_table,
+        const Index *quantizer,
+        const ProductQuantizer &pq,
+        AlignedTable<float> & precomputed_table,
+        bool verbose
+)
 {
-    if (use_precomputed_table == -1)
+    size_t nlist = quantizer->ntotal;
+    size_t d = quantizer->d;
+    FAISS_THROW_IF_NOT(d == pq.d);
+
+    if (use_precomputed_table == -1) {
+        precomputed_table.resize (0);
         return;
+    }
 
     if (use_precomputed_table == 0) { // then choose the type of table
         if (quantizer->metric_type == METRIC_INNER_PRODUCT) {
@@ -414,6 +426,7 @@ void IndexIVFPQ::precompute_table ()
             printf("IndexIVFPQ::precompute_table: precomputed "
                    "tables not needed for inner product quantizers\n");
         }
+        precomputed_table.resize (0);
         return;
     }
     const MultiIndexQuantizer *miq =
@@ -492,6 +505,16 @@ void IndexIVFPQ::precompute_table ()
 
 }
 
+void IndexIVFPQ::precompute_table ()
+{
+    initialize_IVFPQ_precomputed_table (
+        use_precomputed_table, quantizer, pq, precomputed_table,
+        verbose
+    );
+}
+
+
+
 namespace {
 
 using idx_t = Index::idx_t;
@@ -676,11 +699,12 @@ struct QueryTables {
         } else if (use_precomputed_table == 1) {
             dis0 = coarse_dis;
 
-            fvec_madd (pq.M * pq.ksub,
-                   &ivfpq.precomputed_table [key * pq.ksub * pq.M],
-                   -2.0, sim_table_2,
-                   sim_table);
-
+            fvec_madd (
+                pq.M * pq.ksub,
+                ivfpq.precomputed_table.data() + key * pq.ksub * pq.M,
+                -2.0, sim_table_2,
+                sim_table
+            );
 
             if (polysemous_ht != 0) {
                 ivfpq.quantizer->compute_residual (qi, residual_vec, key);
@@ -706,8 +730,8 @@ struct QueryTables {
                 k >>= cpq.nbits;
 
                 // get corresponding table
-                const float *pc = &ivfpq.precomputed_table
-                    [(ki * pq.M + cm * Mf) * pq.ksub];
+                const float *pc = ivfpq.precomputed_table.data() +
+                    (ki * pq.M + cm * Mf) * pq.ksub;
 
                 if (polysemous_ht == 0) {
 
@@ -741,7 +765,8 @@ struct QueryTables {
         if (use_precomputed_table == 1) {
             dis0 = coarse_dis;
 
-            const float * s = &ivfpq.precomputed_table [key * pq.ksub * pq.M];
+            const float * s = ivfpq.precomputed_table.data() +
+                key * pq.ksub * pq.M;
             for (int m = 0; m < pq.M; m++) {
                 sim_table_ptrs [m] = s;
                 s += pq.ksub;
@@ -761,8 +786,8 @@ struct QueryTables {
                 int ki = k & ((uint64_t(1) << cpq.nbits) - 1);
                 k >>= cpq.nbits;
 
-                const float *pc = &ivfpq.precomputed_table
-                    [(ki * pq.M + cm * Mf) * pq.ksub];
+                const float *pc = ivfpq.precomputed_table.data() +
+                    (ki * pq.M + cm * Mf) * pq.ksub;
 
                 for (int m = m0; m < m0 + Mf; m++) {
                     sim_table_ptrs [m] = pc;
@@ -803,9 +828,8 @@ struct KnnSearchResults {
 
     inline void add (idx_t j, float dis) {
         if (C::cmp (heap_sim[0], dis)) {
-            heap_pop<C> (k, heap_sim, heap_ids);
             idx_t id = ids ? ids[j] : lo_build (key, j);
-            heap_push<C> (k, heap_sim, heap_ids, dis, id);
+            heap_replace_top<C> (k, heap_sim, heap_ids, dis, id);
             nup++;
         }
     }
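Note: heap_replace_top collapses the former pop+push pair into a single
sift-down, halving the heap traffic on the hot result-collection path. A
minimal standalone sketch of the semantics, for a 0-based max-heap with plain
float keys (hypothetical helper, not the faiss implementation, which
templates the comparator as C):

    static void replace_top_maxheap(size_t k, float* val, int64_t* ids,
                                    float v, int64_t id) {
        // overwrite the top, then restore the heap property in one pass
        size_t i = 0;
        for (;;) {
            size_t l = 2 * i + 1, r = l + 1, child = l;
            if (l >= k) break;                        // leaf reached
            if (r < k && val[r] > val[l]) child = r;  // pick larger child
            if (val[child] <= v) break;               // heap property holds
            val[i] = val[child];                      // shift child up
            ids[i] = ids[child];
            i = child;
        }
        val[i] = v;
        ids[i] = id;
    }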
data/vendor/faiss/faiss/IndexIVFPQ.h

@@ -16,7 +16,7 @@
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexPQ.h>
 #include <faiss/impl/platform_macros.h>
-
+#include <faiss/utils/AlignedTable.h>
 
 namespace faiss {
 
@@ -28,10 +28,14 @@ struct IVFPQSearchParameters: IVFSearchParameters {
 };
 
 
+
+FAISS_API extern size_t precomputed_table_max_bytes;
+
+
 /** Inverted file with Product Quantizer encoding. Each residual
  * vector is encoded as a product quantizer code.
  */
-struct FAISS_API IndexIVFPQ: IndexIVF {
+struct IndexIVFPQ: IndexIVF {
     bool by_residual; ///< Encode residual or plain vector?
 
     ProductQuantizer pq; ///< produces the codes
@@ -45,18 +49,12 @@ struct FAISS_API IndexIVFPQ: IndexIVF {
 
     /** Precompute table that speed up query preprocessing at some
      * memory cost (used only for by_residual with L2 metric)
-     * =-1: force disable
-     * =0: decide heuristically (default: use tables only if they are
-     *     < precomputed_tables_max_bytes)
-     * =1: tables that work for all quantizers (size 256 * nlist * M)
-     * =2: specific version for MultiIndexQuantizer (much more compact)
      */
     int use_precomputed_table;
-    static size_t precomputed_table_max_bytes;
 
     /// if use_precompute_table
     /// size nlist * pq.M * pq.ksub
-    std::vector <float> precomputed_table;
+    AlignedTable<float> precomputed_table;
 
     IndexIVFPQ (
         Index * quantizer, size_t d, size_t nlist,
@@ -133,6 +131,24 @@ struct FAISS_API IndexIVFPQ: IndexIVF {
 
 };
 
+/** Pre-compute distance tables for IVFPQ with by-residual and METRIC_L2
+ *
+ * @param use_precomputed_table (I/O)
+ *        =-1: force disable
+ *        =0: decide heuristically (default: use tables only if they are
+ *            < precomputed_tables_max_bytes), set use_precomputed_table on output
+ *        =1: tables that work for all quantizers (size 256 * nlist * M)
+ *        =2: specific version for MultiIndexQuantizer (much more compact)
+ * @param precomputed_table precomputed table to initialize
+ */
+
+void initialize_IVFPQ_precomputed_table(
+    int &use_precomputed_table,
+    const Index *quantizer,
+    const ProductQuantizer &pq,
+    AlignedTable<float> & precomputed_table,
+    bool verbose
+);
 
 /// statistics are robust to internal threading, but not if
 /// IndexIVFPQ::search_preassigned is called by multiple threads
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp (new file)

@@ -0,0 +1,1116 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/IndexIVFPQFastScan.h>
+
+#include <cassert>
+#include <cstdio>
+#include <inttypes.h>
+
+#include <omp.h>
+
+#include <memory>
+
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/utils/utils.h>
+#include <faiss/utils/distances.h>
+#include <faiss/utils/simdlib.h>
+#include <faiss/impl/AuxIndexStructures.h>
+
+#include <faiss/invlists/BlockInvertedLists.h>
+
+#include <faiss/impl/simd_result_handlers.h>
+#include <faiss/utils/quantize_lut.h>
+#include <faiss/impl/pq4_fast_scan.h>
+
+namespace faiss {
+
+using namespace simd_result_handlers;
+
+
+inline size_t roundup(size_t a, size_t b) {
+    return (a + b - 1) / b * b;
+}
+
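Note: roundup pads a count up to the next multiple of a block size; list
lengths are padded to multiples of bbs below so the kernels always scan whole
SIMD blocks. The integer arithmetic, spelled out:

    // roundup(a, b) = ceil(a / b) * b
    static_assert((0  + 32 - 1) / 32 * 32 == 0,  "roundup(0, 32)  == 0");
    static_assert((1  + 32 - 1) / 32 * 32 == 32, "roundup(1, 32)  == 32");
    static_assert((65 + 32 - 1) / 32 * 32 == 96, "roundup(65, 32) == 96");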
+
+
+IndexIVFPQFastScan::IndexIVFPQFastScan (
+        Index * quantizer, size_t d, size_t nlist,
+        size_t M, size_t nbits_per_idx,
+        MetricType metric, int bbs):
+    IndexIVF (quantizer, d, nlist, 0, metric),
+    pq (d, M, nbits_per_idx),
+    bbs (bbs)
+{
+    FAISS_THROW_IF_NOT(nbits_per_idx == 4);
+    M2 = roundup(pq.M, 2);
+    by_residual = false; // set to false by default because it's much faster
+    is_trained = false;
+    code_size = pq.code_size;
+
+    replace_invlists(
+        new BlockInvertedLists(nlist, bbs, bbs * M2 / 2),
+        true
+    );
+}
+
+IndexIVFPQFastScan::IndexIVFPQFastScan ()
+{
+    by_residual = false;
+    bbs = 0;
+    M2 = 0;
+}
+
+
+IndexIVFPQFastScan::IndexIVFPQFastScan(const IndexIVFPQ & orig, int bbs):
+    IndexIVF(
+        orig.quantizer, orig.d, orig.nlist,
+        orig.pq.code_size, orig.metric_type),
+    pq(orig.pq),
+    bbs(bbs)
+{
+    FAISS_THROW_IF_NOT(orig.pq.nbits == 4);
+
+    by_residual = orig.by_residual;
+    ntotal = orig.ntotal;
+    is_trained = orig.is_trained;
+    nprobe = orig.nprobe;
+    size_t M = pq.M;
+
+    M2 = roundup(M, 2);
+
+    replace_invlists(
+        new BlockInvertedLists(orig.nlist, bbs, bbs * M2 / 2),
+        true
+    );
+
+    precomputed_table.resize(orig.precomputed_table.size());
+
+    if (precomputed_table.nbytes() > 0) {
+        memcpy(precomputed_table.get(), orig.precomputed_table.data(),
+               precomputed_table.nbytes()
+        );
+    }
+
+    for(size_t i = 0; i < nlist; i++) {
+        size_t nb = orig.invlists->list_size(i);
+        size_t nb2 = roundup(nb, bbs);
+        AlignedTable<uint8_t> tmp(nb2 * M2 / 2);
+        pq4_pack_codes(
+            InvertedLists::ScopedCodes(orig.invlists, i).get(),
+            nb, M, nb2, bbs, M2,
+            tmp.get()
+        );
+        invlists->add_entries(
+            i, nb,
+            InvertedLists::ScopedIds(orig.invlists, i).get(),
+            tmp.get()
+        );
+    }
+
+    orig_invlists = orig.invlists;
+}
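Note: this converting constructor repacks every inverted list into the
bbs-blocked 4-bit layout with pq4_pack_codes and keeps a pointer to the
original lists in orig_invlists for the reference implementations. A minimal
usage sketch, assuming `index` is a trained, populated IndexIVFPQ whose
pq.nbits == 4 (nq, queries, k, distances, labels are caller-provided):

    faiss::IndexIVFPQFastScan fast(index, 32); // default block size bbs = 32
    fast.nprobe = 16;                          // usual IVF search-time knob
    fast.search(nq, queries, k, distances, labels);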
+
+
+
+/*********************************************************
+ * Training
+ *********************************************************/
+
+void IndexIVFPQFastScan::train_residual (idx_t n, const float *x_in)
+{
+
+    const float * x = fvecs_maybe_subsample (
+        d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub,
+        x_in, verbose, pq.cp.seed);
+
+    std::unique_ptr<float []> del_x;
+    if (x != x_in) {
+        del_x.reset((float*)x);
+    }
+
+    const float *trainset;
+    AlignedTable<float> residuals;
+
+    if (by_residual) {
+        if(verbose) printf("computing residuals\n");
+        std::vector<idx_t> assign(n);
+        quantizer->assign (n, x, assign.data());
+        residuals.resize(n * d);
+        for (idx_t i = 0; i < n; i++) {
+            quantizer->compute_residual (
+                x + i * d,
+                residuals.data() + i * d,
+                assign[i]
+            );
+        }
+        trainset = residuals.data();
+    } else {
+        trainset = x;
+    }
+
+    if (verbose) {
+        printf ("training %zdx%zd product quantizer on %zd vectors in %dD\n",
+                pq.M, pq.ksub, long(n), d);
+    }
+    pq.verbose = verbose;
+    pq.train (n, trainset);
+
+    if (by_residual && metric_type == METRIC_L2) {
+        precompute_table();
+    }
+
+}
+
+void IndexIVFPQFastScan::precompute_table ()
+{
+    initialize_IVFPQ_precomputed_table(
+        use_precomputed_table,
+        quantizer, pq, precomputed_table, verbose
+    );
+}
+
+
+/*********************************************************
+ * Code management functions
+ *********************************************************/
+
+
+
+void IndexIVFPQFastScan::encode_vectors(
+        idx_t n, const float* x, const idx_t *list_nos,
+        uint8_t * codes, bool include_listnos) const
+{
+
+    if (by_residual) {
+        AlignedTable<float> residuals (n * d);
+        for (size_t i = 0; i < n; i++) {
+            if (list_nos[i] < 0) {
+                memset (residuals.data() + i * d, 0, sizeof(residuals[0]) * d);
+            } else {
+                quantizer->compute_residual (
+                    x + i * d, residuals.data() + i * d, list_nos[i]);
+            }
+        }
+        pq.compute_codes (residuals.data(), codes, n);
+    } else {
+        pq.compute_codes (x, codes, n);
+    }
+
+    if (include_listnos) {
+        size_t coarse_size = coarse_code_size();
+        for (idx_t i = n - 1; i >= 0; i--) {
+            uint8_t * code = codes + i * (coarse_size + code_size);
+            memmove (code + coarse_size,
+                     codes + i * code_size, code_size);
+            encode_listno (list_nos[i], code);
+        }
+    }
+}
+
+
+
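Note: when include_listnos is set, the codes written contiguously by the
first pass are spread out in place to make room for a per-vector coarse id.
Iterating from the last vector backwards is what makes the in-place memmove
safe:

    // packed:   [c0][c1][c2]...          stride = code_size
    // expanded: [h0 c0][h1 c1][h2 c2]... stride = coarse_size + code_size
    // for i = n-1 .. 0:
    //     dst_i = i * (coarse_size + code_size) + coarse_size
    //     src_i = i * code_size
    // dst_i >= src_i, so record i is moved before anything overwrites it.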
+void IndexIVFPQFastScan::add_with_ids (
+        idx_t n, const float * x, const idx_t *xids) {
+
+    // copied from IndexIVF::add_with_ids --->
+
+    // do some blocking to avoid excessive allocs
+    idx_t bs = 65536;
+    if (n > bs) {
+        for (idx_t i0 = 0; i0 < n; i0 += bs) {
+            idx_t i1 = std::min (n, i0 + bs);
+            if (verbose) {
+                printf("   IndexIVFPQFastScan::add_with_ids %zd: %zd",
+                       size_t(i0), size_t(i1));
+            }
+            add_with_ids (i1 - i0, x + i0 * d,
+                          xids ? xids + i0 : nullptr);
+        }
+        return;
+    }
+    InterruptCallback::check();
+
+    AlignedTable<uint8_t> codes(n * code_size);
+
+    FAISS_THROW_IF_NOT (is_trained);
+    direct_map.check_can_add (xids);
+
+    std::unique_ptr<idx_t []> idx(new idx_t[n]);
+    quantizer->assign (n, x, idx.get());
+    size_t nadd = 0, nminus1 = 0;
+
+    for (size_t i = 0; i < n; i++) {
+        if (idx[i] < 0) nminus1++;
+    }
+
+    AlignedTable<uint8_t> flat_codes(n * code_size);
+    encode_vectors (n, x, idx.get(), flat_codes.get());
+
+    DirectMapAdd dm_adder(direct_map, n, xids);
+
+    // <---
+
+    BlockInvertedLists *bil = dynamic_cast<BlockInvertedLists*>(invlists);
+    FAISS_THROW_IF_NOT_MSG (bil, "only block inverted lists supported");
+
+    // prepare batches
+    std::vector<idx_t> order(n);
+    for(idx_t i = 0; i < n ; i++) { order[i] = i; }
+
+    // TODO should not need stable
+    std::stable_sort(order.begin(), order.end(),
+        [&idx](idx_t a, idx_t b) {
+            return idx[a] < idx[b];
+        }
+    );
+
+    // TODO parallelize
+    idx_t i0 = 0;
+    while (i0 < n) {
+        idx_t list_no = idx[order[i0]];
+        idx_t i1 = i0 + 1;
+        while (i1 < n && idx[order[i1]] == list_no) {
+            i1 ++;
+        }
+
+        if (list_no == -1) {
+            i0 = i1;
+            continue;
+        }
+
+        // make linear array
+        AlignedTable<uint8_t> list_codes((i1 - i0) * code_size);
+        size_t list_size = bil->list_size(list_no);
+
+        bil->resize(list_no, list_size + i1 - i0);
+
+        for(idx_t i = i0; i < i1; i++) {
+            size_t ofs = list_size + i - i0;
+            idx_t id = xids ? xids[order[i]] : ntotal + order[i];
+            dm_adder.add (order[i], list_no, ofs);
+            bil->ids[list_no][ofs] = id;
+            memcpy(
+                list_codes.data() + (i - i0) * code_size,
+                flat_codes.data() + order[i] * code_size,
+                code_size
+            );
+            nadd++;
+        }
+        pq4_pack_codes_range(
+            list_codes.data(), pq.M,
+            list_size, list_size + i1 - i0,
+            bbs, M2, bil->codes[list_no].data()
+        );
+
+        i0 = i1;
+    }
+
+    ntotal += n;
+
+}
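Note: the adds are sorted by destination list so that each
BlockInvertedLists entry is resized and re-packed once per batch instead of
once per vector. The grouping idiom, reduced to its core (a restatement of
the loop above, not additional code from the diff):

    idx_t i0 = 0;
    while (i0 < n) {                      // process runs of equal keys
        idx_t key = idx[order[i0]];
        idx_t i1 = i0 + 1;
        while (i1 < n && idx[order[i1]] == key) i1++;
        // ... handle order[i0 .. i1) together ...
        i0 = i1;
    }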
+
+
+
+/*********************************************************
+ * search
+ *********************************************************/
+
+
+namespace {
+
+// from impl/ProductQuantizer.cpp
+template <class C, typename dis_t>
+void pq_estimators_from_tables_generic(
+        const ProductQuantizer& pq, size_t nbits,
+        const uint8_t *codes, size_t ncodes,
+        const dis_t *dis_table, const int64_t * ids,
+        float dis0,
+        size_t k, typename C::T *heap_dis, int64_t *heap_ids)
+{
+    using accu_t = typename C::T;
+    const size_t M = pq.M;
+    const size_t ksub = pq.ksub;
+    for (size_t j = 0; j < ncodes; ++j) {
+        PQDecoderGeneric decoder(
+            codes + j * pq.code_size, nbits
+        );
+        accu_t dis = dis0;
+        const dis_t * dt = dis_table;
+        for (size_t m = 0; m < M; m++) {
+            uint64_t c = decoder.decode();
+            dis += dt[c];
+            dt += ksub;
+        }
+
+        if (C::cmp(heap_dis[0], dis)) {
+            heap_pop<C>(k, heap_dis, heap_ids);
+            heap_push<C>(k, heap_dis, heap_ids, dis, ids[j]);
+        }
+    }
+}
+
+using idx_t = Index::idx_t;
+using namespace quantize_lut;
+
+void fvec_madd_avx (
+        size_t n, const float *a,
+        float bf, const float *b, float *c)
+{
+    assert(is_aligned_pointer(a));
+    assert(is_aligned_pointer(b));
+    assert(is_aligned_pointer(c));
+    assert(n % 8 == 0);
+    simd8float32 bf8(bf);
+    n /= 8;
+    for(size_t i = 0; i < n; i++) {
+        simd8float32 ai(a);
+        simd8float32 bi(b);
+
+        simd8float32 ci = fmadd(bf8, bi, ai);
+        ci.store(c);
+        c += 8;
+        a += 8;
+        b += 8;
+    }
+
+}
+
+} // anonymous namespace
+
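Note: fvec_madd_avx asserts 32-byte-aligned pointers and n % 8 == 0. Both
hold at its call site in compute_LUT below: every buffer involved is an
AlignedTable, and dim12 = pq.ksub * pq.M is a multiple of 8 since ksub is 16
for the 4-bit codes this index requires. The scalar equivalent of the kernel:

    // c[i] = a[i] + bf * b[i]   for i in [0, n)
    for (size_t i = 0; i < n; i++) c[i] = a[i] + bf * b[i];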
+/*********************************************************
+ * Look-Up Table functions
+ *********************************************************/
+
+
+void IndexIVFPQFastScan::compute_LUT(
+        size_t n, const float *x,
+        const idx_t *coarse_ids, const float *coarse_dis,
+        AlignedTable<float> & dis_tables,
+        AlignedTable<float> & biases
+) const
+{
+    const IndexIVFPQFastScan & ivfpq = *this;
+    size_t dim12 = pq.ksub * pq.M;
+    size_t d = pq.d;
+    size_t nprobe = ivfpq.nprobe;
+
+    if (ivfpq.by_residual) {
+
+        if (ivfpq.metric_type == METRIC_L2) {
+
+            dis_tables.resize(n * nprobe * dim12);
+
+            if (ivfpq.use_precomputed_table == 1) {
+                biases.resize(n * nprobe);
+                memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
+
+                AlignedTable<float> ip_table(n * dim12);
+                pq.compute_inner_prod_tables (n, x, ip_table.get());
+
+#pragma omp parallel for if (n * nprobe > 8000)
+                for(idx_t ij = 0; ij < n * nprobe; ij++) {
+                    idx_t i = ij / nprobe;
+                    float *tab = dis_tables.get() + ij * dim12;
+                    idx_t cij = coarse_ids[ij];
+
+                    if (cij >= 0) {
+                        fvec_madd_avx (
+                            dim12,
+                            precomputed_table.get() + cij * dim12,
+                            -2, ip_table.get() + i * dim12,
+                            tab
+                        );
+                    } else {
+                        // fill with NaNs so that they are ignored during
+                        // LUT quantization
+                        memset (tab, -1, sizeof(float) * dim12);
+                    }
+                }
+
+            } else {
+
+                std::unique_ptr<float[]> xrel(new float[n * nprobe * d]);
+                biases.resize(n * nprobe);
+                memset(biases.get(), 0, sizeof(float) * n * nprobe);
+
+#pragma omp parallel for if (n * nprobe > 8000)
+                for(idx_t ij = 0; ij < n * nprobe; ij++) {
+                    idx_t i = ij / nprobe;
+                    float *xij = &xrel[ij * d];
+                    idx_t cij = coarse_ids[ij];
+
+                    if (cij >= 0) {
+                        ivfpq.quantizer->compute_residual(
+                            x + i * d, xij, cij);
+                    } else {
+                        // will fill with NaNs
+                        memset(xij, -1, sizeof(float) * d);
+                    }
+                }
+
+                pq.compute_distance_tables (
+                    n * nprobe, xrel.get(), dis_tables.get());
+
+            }
+
+        } else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) {
+            dis_tables.resize(n * dim12);
+            pq.compute_inner_prod_tables (n, x, dis_tables.get());
+            // compute_inner_prod_tables(pq, n, x, dis_tables.get());
+
+            biases.resize(n * nprobe);
+            memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
+        } else {
+            FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type);
+        }
+
+    } else {
+        dis_tables.resize(n * dim12);
+        if (ivfpq.metric_type == METRIC_L2) {
+            pq.compute_distance_tables (n, x, dis_tables.get());
+        } else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) {
+            pq.compute_inner_prod_tables (n, x, dis_tables.get());
+        } else {
+            FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type);
+        }
+    }
+
+}
+
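Note: the use_precomputed_table == 1 branch is the standard IVFPQ
decomposition of the residual L2 distance, restated in the names used above:

    // || x - (y_C + y_R) ||^2
    //   = || x - y_C ||^2              -> biases, copied from coarse_dis
    //   + || y_R ||^2 + 2 <y_C, y_R>   -> precomputed_table row for cij
    //   - 2 <x, y_R>                   -> ip_table row (query-dependent only)
    // hence tab = precomputed_table_row + (-2) * ip_table_row via fvec_madd_avx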
+void IndexIVFPQFastScan::compute_LUT_uint8(
+        size_t n, const float *x,
+        const idx_t *coarse_ids, const float *coarse_dis,
+        AlignedTable<uint8_t> & dis_tables,
+        AlignedTable<uint16_t> & biases,
+        float * normalizers
+) const {
+    const IndexIVFPQFastScan & ivfpq = *this;
+    AlignedTable<float> dis_tables_float;
+    AlignedTable<float> biases_float;
+
+    uint64_t t0 = get_cy();
+    compute_LUT(
+        n, x,
+        coarse_ids, coarse_dis,
+        dis_tables_float, biases_float
+    );
+    IVFFastScan_stats.t_compute_distance_tables += get_cy() - t0;
+
+    bool lut_is_3d = ivfpq.by_residual && ivfpq.metric_type == METRIC_L2;
+    size_t dim123 = pq.ksub * pq.M;
+    size_t dim123_2 = pq.ksub * M2;
+    if (lut_is_3d) {
+        dim123 *= nprobe;
+        dim123_2 *= nprobe;
+    }
+    dis_tables.resize(n * dim123_2);
+    if (biases_float.get()) {
+        biases.resize(n * nprobe);
+    }
+    uint64_t t1 = get_cy();
+
+#pragma omp parallel for if (n > 100)
+    for(int64_t i = 0; i < n; i++) {
+        const float *t_in = dis_tables_float.get() + i * dim123;
+        const float *b_in = nullptr;
+        uint8_t *t_out = dis_tables.get() + i * dim123_2;
+        uint16_t *b_out = nullptr;
+        if (biases_float.get()) {
+            b_in = biases_float.get() + i * nprobe;
+            b_out = biases.get() + i * nprobe;
+        }
+
+        quantize_LUT_and_bias(
+            nprobe, pq.M, pq.ksub, lut_is_3d,
+            t_in, b_in,
+            t_out, M2, b_out,
+            normalizers + 2 * i, normalizers + 2 * i + 1
+        );
+    }
+    IVFFastScan_stats.t_round += get_cy() - t1;
+
+}
+
+
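Note: quantize_LUT_and_bias stores one (a, b) pair per query in normalizers;
the uint16 accumulator values map back to floats as

    // dis_float = b + dis_u16 / a
    float one_a = 1 / normalizers[2 * i];
    float b     = normalizers[2 * i + 1];

which is exactly the inverse applied in search_implem_2 below, and skipped
(one_a = 1, b = 0) when the `skip & 16` debug flag is set.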
+/*********************************************************
+ * Search functions
+ *********************************************************/
+
+template<bool is_max>
+void IndexIVFPQFastScan::search_dispatch_implem(
+        idx_t n,
+        const float* x,
+        idx_t k,
+        float* distances,
+        idx_t* labels) const
+{
+    using Cfloat = typename std::conditional<is_max,
+        CMax<float, int64_t>, CMin<float, int64_t> >::type;
+
+    using C = typename std::conditional<is_max,
+        CMax<uint16_t, int64_t>, CMin<uint16_t, int64_t> >::type;
+
+    if (n == 0) {
+        return;
+    }
+
+    // actual implementation used
+    int impl = implem;
+
+    if (impl == 0) {
+        if (bbs == 32) {
+            impl = 12;
+        } else {
+            impl = 10;
+        }
+        if (k > 20) {
+            impl ++;
+        }
+    }
+
+    if (impl == 1) {
+        search_implem_1<Cfloat>(n, x, k, distances, labels);
+    } else if (impl == 2) {
+        search_implem_2<C>(n, x, k, distances, labels);
+
+    } else if (impl >= 10 && impl <= 13) {
+        size_t ndis = 0, nlist_visited = 0;
+
+        if (n < 2) {
+            if (impl == 12 || impl == 13) {
+                search_implem_12<C>
+                    (n, x, k, distances, labels, impl, &ndis, &nlist_visited);
+            } else {
+                search_implem_10<C>
+                    (n, x, k, distances, labels, impl, &ndis, &nlist_visited);
+            }
+        } else {
+            // explicitly slice over threads
+            int nslice;
+            if (n <= omp_get_max_threads()) {
+                nslice = n;
+            } else if (by_residual && metric_type == METRIC_L2) {
+                // make sure we don't make too big LUT tables
+                size_t lut_size_per_query =
+                    pq.M * pq.ksub * nprobe * (sizeof(float) + sizeof(uint8_t));
+
+                size_t max_lut_size = precomputed_table_max_bytes;
+                // how many queries we can handle within mem budget
+                size_t nq_ok = std::max(max_lut_size / lut_size_per_query, size_t(1));
+                nslice = roundup(std::max(size_t(n / nq_ok), size_t(1)), omp_get_max_threads());
+            } else {
+                // LUTs unlikely to be a limiting factor
+                nslice = omp_get_max_threads();
+            }
+
+#pragma omp parallel for reduction(+: ndis, nlist_visited)
+            for (int slice = 0; slice < nslice; slice++) {
+                idx_t i0 = n * slice / nslice;
+                idx_t i1 = n * (slice + 1) / nslice;
+                float *dis_i = distances + i0 * k;
+                idx_t *lab_i = labels + i0 * k;
+                if (impl == 12 || impl == 13) {
+                    search_implem_12<C>(
+                        i1 - i0, x + i0 * d, k, dis_i, lab_i,
+                        impl, &ndis, &nlist_visited
+                    );
+                } else {
+                    search_implem_10<C>(
+                        i1 - i0, x + i0 * d, k, dis_i, lab_i,
+                        impl, &ndis, &nlist_visited
+                    );
+                }
+            }
+        }
+        indexIVF_stats.nq += n;
+        indexIVF_stats.ndis += ndis;
+        indexIVF_stats.nlist += nlist_visited;
+    } else {
+        FAISS_THROW_FMT("implem %d does not exist", implem);
+    }
+
+}
+
+
+void IndexIVFPQFastScan::search(
+        idx_t n, const float* x, idx_t k,
+        float* distances, idx_t* labels) const
+{
+    if (metric_type == METRIC_L2) {
+        search_dispatch_implem<true>(n, x, k, distances, labels);
+    } else {
+        search_dispatch_implem<false>(n, x, k, distances, labels);
+    }
+}
+
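Note: for reference, the implem codes dispatched above (implem == 0 picks
automatically: 12 when bbs == 32, otherwise 10, bumped by one when k > 20):

    // 1       float LUTs, generic PQ scan over orig_invlists (reference)
    // 2       quantized uint8 LUTs, still the generic scan (reference)
    // 10 / 11 SIMD kernel, one query at a time; heap / reservoir handler
    // 12 / 13 SIMD kernel, queries batched per inverted list
    //         (requires bbs == 32); heap / reservoir handler
    // k == 1 always uses the single-result handler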
+template<class C>
+void IndexIVFPQFastScan::search_implem_1(
+        idx_t n, const float* x, idx_t k,
+        float* distances, idx_t* labels) const
+{
+    FAISS_THROW_IF_NOT(orig_invlists);
+
+    std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+    std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+
+    quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+
+    size_t dim12 = pq.ksub * pq.M;
+    AlignedTable<float> dis_tables;
+    AlignedTable<float> biases;
+
+    compute_LUT (
+        n, x,
+        coarse_ids.get(), coarse_dis.get(),
+        dis_tables, biases
+    );
+
+    bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+
+    size_t ndis = 0, nlist_visited = 0;
+
+#pragma omp parallel for reduction(+: ndis, nlist_visited)
+    for(idx_t i = 0; i < n; i++) {
+        int64_t *heap_ids = labels + i * k;
+        float *heap_dis = distances + i * k;
+        heap_heapify<C> (k, heap_dis, heap_ids);
+        float *LUT = nullptr;
+
+        if (single_LUT) {
+            LUT = dis_tables.get() + i * dim12;
+        }
+        for(idx_t j = 0; j < nprobe; j++) {
+            if (!single_LUT) {
+                LUT = dis_tables.get() + (i * nprobe + j) * dim12;
+            }
+            idx_t list_no = coarse_ids[i * nprobe + j];
+            if (list_no < 0) continue;
+            size_t ls = orig_invlists->list_size(list_no);
+            if (ls == 0) continue;
+            InvertedLists::ScopedCodes codes(orig_invlists, list_no);
+            InvertedLists::ScopedIds ids(orig_invlists, list_no);
+
+            float bias = biases.get() ? biases[i * nprobe + j] : 0;
+
+            pq_estimators_from_tables_generic<C>(
+                pq, pq.nbits, codes.get(), ls,
+                LUT, ids.get(), bias,
+                k, heap_dis, heap_ids
+            );
+            nlist_visited ++;
+            ndis ++;
+        }
+        heap_reorder<C> (k, heap_dis, heap_ids);
+    }
+    indexIVF_stats.nq += n;
+    indexIVF_stats.ndis += ndis;
+    indexIVF_stats.nlist += nlist_visited;
+}
+
+template<class C>
+void IndexIVFPQFastScan::search_implem_2(
+        idx_t n, const float* x, idx_t k,
+        float* distances, idx_t* labels) const
+{
+    FAISS_THROW_IF_NOT(orig_invlists);
+
+    std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+    std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+
+    quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+
+    size_t dim12 = pq.ksub * M2;
+    AlignedTable<uint8_t> dis_tables;
+    AlignedTable<uint16_t> biases;
+    std::unique_ptr<float[]> normalizers(new float[2 * n]);
+
+    compute_LUT_uint8 (
+        n, x,
+        coarse_ids.get(), coarse_dis.get(),
+        dis_tables, biases,
+        normalizers.get()
+    );
+
+
+    bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+
+    size_t ndis = 0, nlist_visited = 0;
+
+#pragma omp parallel for reduction(+: ndis, nlist_visited)
+    for(idx_t i = 0; i < n; i++) {
+        std::vector<uint16_t> tmp_dis(k);
+        int64_t *heap_ids = labels + i * k;
+        uint16_t *heap_dis = tmp_dis.data();
+        heap_heapify<C> (k, heap_dis, heap_ids);
+        const uint8_t *LUT = nullptr;
+
+        if (single_LUT) {
+            LUT = dis_tables.get() + i * dim12;
+        }
+        for(idx_t j = 0; j < nprobe; j++) {
+            if (!single_LUT) {
+                LUT = dis_tables.get() + (i * nprobe + j) * dim12;
+            }
+            idx_t list_no = coarse_ids[i * nprobe + j];
+            if (list_no < 0) continue;
+            size_t ls = orig_invlists->list_size(list_no);
+            if (ls == 0) continue;
+            InvertedLists::ScopedCodes codes(orig_invlists, list_no);
+            InvertedLists::ScopedIds ids(orig_invlists, list_no);
+
+            uint16_t bias = biases.get() ? biases[i * nprobe + j] : 0;
+
+            pq_estimators_from_tables_generic<C>(
+                pq, pq.nbits, codes.get(), ls,
+                LUT, ids.get(), bias,
+                k, heap_dis, heap_ids
+            );
+
+            nlist_visited++;
+            ndis += ls;
+        }
+        heap_reorder<C> (k, heap_dis, heap_ids);
+        // convert distances to float
+        {
+            float one_a = 1 / normalizers[2 * i], b = normalizers[2 * i + 1];
+            if (skip & 16) {
+                one_a = 1;
+                b = 0;
+            }
+            float *heap_dis_float = distances + i * k;
+            for (int j = 0; j < k; j++) {
+                heap_dis_float[j] = b + heap_dis[j] * one_a;
+            }
+        }
+    }
+    indexIVF_stats.nq += n;
+    indexIVF_stats.ndis += ndis;
+    indexIVF_stats.nlist += nlist_visited;
+}
+
+
+
+template<class C>
+void IndexIVFPQFastScan::search_implem_10(
+        idx_t n, const float* x, idx_t k,
+        float* distances, idx_t* labels,
+        int impl, size_t *ndis_out, size_t *nlist_out) const
+{
+    memset(distances, -1, sizeof(float) * k * n);
+    memset(labels, -1, sizeof(idx_t) * k * n);
+
+    using HeapHC = HeapHandler<C, true>;
+    using ReservoirHC = ReservoirHandler<C, true>;
+    using SingleResultHC = SingleResultHandler<C, true>;
+
+
+    std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+    std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+
+    uint64_t times[10];
+    memset(times, 0, sizeof(times));
+    int ti = 0;
+#define TIC times[ti++] = get_cy()
+    TIC;
+
+    quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+
+    TIC;
+
+    size_t dim12 = pq.ksub * M2;
+    AlignedTable<uint8_t> dis_tables;
+    AlignedTable<uint16_t> biases;
+    std::unique_ptr<float[]> normalizers (new float[2 * n]);
+
+    compute_LUT_uint8 (
+        n, x,
+        coarse_ids.get(), coarse_dis.get(),
+        dis_tables, biases, normalizers.get()
+    );
+
+    TIC;
+
+    bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+
+    TIC;
+    size_t ndis = 0, nlist_visited = 0;
+
+    {
+        AlignedTable<uint16_t> tmp_distances(k);
+        for(idx_t i = 0; i < n; i++) {
+            const uint8_t *LUT = nullptr;
+            int qmap1[1] = {0};
+            std::unique_ptr<SIMDResultHandler<C, true> > handler;
+
+            if (k == 1) {
+                handler.reset(new SingleResultHC(1, 0));
+            } else if (impl == 10) {
+                handler.reset(new HeapHC(1, tmp_distances.get(), labels + i * k, k, 0));
+            } else if (impl == 11) {
+                handler.reset(new ReservoirHC(1, 0, k, 2 * k));
+            } else {
+                FAISS_THROW_MSG("invalid");
+            }
+
+            handler->q_map = qmap1;
+
+            if (single_LUT) {
+                LUT = dis_tables.get() + i * dim12;
+            }
+            for(idx_t j = 0; j < nprobe; j++) {
+                size_t ij = i * nprobe + j;
+                if (!single_LUT) {
+                    LUT = dis_tables.get() + ij * dim12;
+                }
+                if (biases.get()) {
+                    handler->dbias = biases.get() + ij;
+                }
+
+                idx_t list_no = coarse_ids[ij];
+                if (list_no < 0) continue;
+                size_t ls = invlists->list_size(list_no);
+                if (ls == 0) continue;
+
+                InvertedLists::ScopedCodes codes(invlists, list_no);
+                InvertedLists::ScopedIds ids(invlists, list_no);
+
+                handler->ntotal = ls;
+                handler->id_map = ids.get();
+
+#define DISPATCH(classHC) \
+                if(auto *res = dynamic_cast<classHC* > (handler.get())) { \
+                    pq4_accumulate_loop( \
+                        1, roundup(ls, bbs), bbs, M2, \
+                        codes.get(), LUT, \
+                        *res \
+                    ); \
+                }
+                DISPATCH(HeapHC)
+                else DISPATCH(ReservoirHC)
+                else DISPATCH(SingleResultHC)
+#undef DISPATCH
+
+                nlist_visited ++;
+                ndis ++;
+            }
+
+            handler->to_flat_arrays(
+                distances + i * k, labels + i * k,
+                skip & 16 ? nullptr : normalizers.get() + i * 2
+            );
+        }
+    }
+    *ndis_out = ndis;
+    *nlist_out = nlist;
+}
+
+
+
+template<class C>
+void IndexIVFPQFastScan::search_implem_12(
+        idx_t n, const float* x, idx_t k,
+        float* distances, idx_t* labels,
+        int impl, size_t *ndis_out, size_t *nlist_out) const
+{
+    if (n == 0) { // does not work well with reservoir
+        return;
+    }
+    FAISS_THROW_IF_NOT(bbs == 32);
+
+    std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+    std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+
+    uint64_t times[10];
+    memset(times, 0, sizeof(times));
+    int ti = 0;
+#define TIC times[ti++] = get_cy()
+    TIC;
+
+    quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+
+    TIC;
+
+    size_t dim12 = pq.ksub * M2;
+    AlignedTable<uint8_t> dis_tables;
+    AlignedTable<uint16_t> biases;
+    std::unique_ptr<float[]> normalizers (new float[2 * n]);
+
+    compute_LUT_uint8 (
+        n, x,
+        coarse_ids.get(), coarse_dis.get(),
+        dis_tables, biases, normalizers.get()
+    );
+
+    TIC;
+
+    struct QC {
+        int qno;     // sequence number of the query
+        int list_no; // list to visit
+        int rank;    // this is the rank'th result of the coarse quantizer
+    };
+    bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+
+    std::vector<QC> qcs;
+    {
+        int ij = 0;
+        for(int i = 0; i < n; i++) {
+            for(int j = 0; j < nprobe; j++) {
+                if (coarse_ids[ij] >= 0) {
+                    qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)});
+                }
+                ij++;
+            }
+        }
+        std::sort(
+            qcs.begin(), qcs.end(),
+            [](const QC &a, const QC & b) {
+                return a.list_no < b.list_no;
+            }
+        );
+    }
+    TIC;
+
+    // prepare the result handlers
+
+    std::unique_ptr<SIMDResultHandler<C, true> > handler;
+    AlignedTable<uint16_t> tmp_distances;
+
+    using HeapHC = HeapHandler<C, true>;
+    using ReservoirHC = ReservoirHandler<C, true>;
+    using SingleResultHC = SingleResultHandler<C, true>;
+
+    if (k == 1) {
+        handler.reset(new SingleResultHC(n, 0));
+    } else if (impl == 12) {
+        tmp_distances.resize(n * k);
+        handler.reset(new HeapHC(n, tmp_distances.get(), labels, k, 0));
+    } else if (impl == 13) {
+        handler.reset(new ReservoirHC(n, 0, k, 2 * k));
+    }
+
+    int qbs2 = this->qbs2 ? this->qbs2 : 11;
+
+    std::vector<uint16_t> tmp_bias;
+    if (biases.get()) {
+        tmp_bias.resize(qbs2);
+        handler->dbias = tmp_bias.data();
+    }
+    TIC;
+
+    size_t ndis = 0;
+
+    size_t i0 = 0;
+    uint64_t t_copy_pack = 0, t_scan = 0;
+    while (i0 < qcs.size()) {
+        uint64_t tt0 = get_cy();
+
+        // find all queries that access this inverted list
+        int list_no = qcs[i0].list_no;
+        size_t i1 = i0 + 1;
+
+        while(i1 < qcs.size() && i1 < i0 + qbs2) {
+            if (qcs[i1].list_no != list_no) {
+                break;
+            }
+            i1++;
+        }
+
+        size_t list_size = invlists->list_size(list_no);
+
+        if (list_size == 0) {
+            i0 = i1;
+            continue;
+        }
+
+        // re-organize LUTs and biases into the right order
+        int nc = i1 - i0;
+
+        std::vector<int> q_map(nc), lut_entries(nc);
+        AlignedTable<uint8_t> LUT(nc * dim12);
+        memset(LUT.get(), -1, nc * dim12);
+        int qbs = pq4_preferred_qbs(nc);
+
+        for(size_t i = i0; i < i1; i++) {
+            const QC & qc = qcs[i];
+            q_map[i - i0] = qc.qno;
+            int ij = qc.qno * nprobe + qc.rank;
+            lut_entries[i - i0] = single_LUT ? qc.qno : ij;
+            if (biases.get()) {
+                tmp_bias[i - i0] = biases[ij];
+            }
+        }
+        pq4_pack_LUT_qbs_q_map(
+            qbs, M2, dis_tables.get(), lut_entries.data(),
+            LUT.get()
+        );
+
+        // access the inverted list
+
+        ndis += (i1 - i0) * list_size;
+
+        InvertedLists::ScopedCodes codes(invlists, list_no);
+        InvertedLists::ScopedIds ids(invlists, list_no);
+
+        // prepare the handler
+
+        handler->ntotal = list_size;
+        handler->q_map = q_map.data();
+        handler->id_map = ids.get();
+        uint64_t tt1 = get_cy();
+
+#define DISPATCH(classHC) \
+        if(auto *res = dynamic_cast<classHC* > (handler.get())) { \
+            pq4_accumulate_loop_qbs( \
+                qbs, list_size, M2, \
+                codes.get(), LUT.get(), \
+                *res \
+            ); \
+        }
+        DISPATCH(HeapHC)
+        else DISPATCH(ReservoirHC)
+        else DISPATCH(SingleResultHC)
+
+        // prepare for next loop
+        i0 = i1;
+
+        uint64_t tt2 = get_cy();
+        t_copy_pack += tt1 - tt0;
+        t_scan += tt2 - tt1;
+    }
+    TIC;
+
+    // labels is in-place for HeapHC
+    handler->to_flat_arrays(
+        distances, labels,
+        skip & 16 ? nullptr : normalizers.get()
+    );
+
+    TIC;
+
+    // these stats are not thread-safe
+
+    for(int i = 1; i < ti; i++) {
+        IVFFastScan_stats.times[i] += times[i] - times[i-1];
+    }
+    IVFFastScan_stats.t_copy_pack += t_copy_pack;
+    IVFFastScan_stats.t_scan += t_scan;
+
+    if (auto *rh = dynamic_cast<ReservoirHC*> (handler.get())) {
+        for (int i = 0; i < 4; i++) {
+            IVFFastScan_stats.reservoir_times[i] += rh->times[i];
+        }
+    }
+
+    *ndis_out = ndis;
+    *nlist_out = nlist;
+
+}
+
+
+IVFFastScanStats IVFFastScan_stats;
+
+} // namespace faiss