faiss 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +1 -1
  6. data/lib/faiss/version.rb +1 -1
  7. data/vendor/faiss/faiss/AutoTune.cpp +36 -33
  8. data/vendor/faiss/faiss/AutoTune.h +6 -3
  9. data/vendor/faiss/faiss/Clustering.cpp +16 -12
  10. data/vendor/faiss/faiss/Index.cpp +3 -4
  11. data/vendor/faiss/faiss/Index.h +3 -3
  12. data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
  13. data/vendor/faiss/faiss/IndexBinary.h +1 -1
  14. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
  15. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
  16. data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
  17. data/vendor/faiss/faiss/IndexFlat.h +0 -51
  18. data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
  19. data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
  20. data/vendor/faiss/faiss/IndexIVF.h +22 -15
  21. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
  22. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
  24. data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
  25. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
  26. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
  27. data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
  28. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  29. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
  30. data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
  31. data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
  32. data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
  33. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
  34. data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
  35. data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
  36. data/vendor/faiss/faiss/IndexRefine.h +73 -0
  37. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
  38. data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
  39. data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
  40. data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
  41. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
  42. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
  43. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
  44. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
  47. data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
  48. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
  49. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
  50. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
  51. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
  52. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
  53. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
  54. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
  55. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
  56. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
  57. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
  58. data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
  59. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
  60. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
  61. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
  62. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  63. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
  64. data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
  65. data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
  71. data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
  72. data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
  73. data/vendor/faiss/faiss/impl/io.cpp +33 -2
  74. data/vendor/faiss/faiss/impl/io.h +7 -2
  75. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
  76. data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
  77. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
  78. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
  79. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
  81. data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
  82. data/vendor/faiss/faiss/index_factory.cpp +112 -7
  83. data/vendor/faiss/faiss/index_io.h +1 -48
  84. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
  85. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
  86. data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
  87. data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
  88. data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
  89. data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
  90. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
  91. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
  92. data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
  93. data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
  94. data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
  95. data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
  96. data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
  97. data/vendor/faiss/faiss/utils/Heap.h +61 -50
  98. data/vendor/faiss/faiss/utils/distances.cpp +164 -319
  99. data/vendor/faiss/faiss/utils/distances.h +28 -20
  100. data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
  101. data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
  102. data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
  103. data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
  104. data/vendor/faiss/faiss/utils/hamming.h +2 -7
  105. data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
  106. data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
  107. data/vendor/faiss/faiss/utils/partitioning.h +69 -0
  108. data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
  109. data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
  110. data/vendor/faiss/faiss/utils/simdlib.h +31 -0
  111. data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
  112. data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
  113. metadata +43 -141
  114. data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
  115. data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
  116. data/vendor/faiss/c_api/AutoTune_c.h +0 -66
  117. data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
  118. data/vendor/faiss/c_api/Clustering_c.h +0 -123
  119. data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
  120. data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
  121. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
  122. data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
  123. data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
  124. data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
  125. data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
  126. data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
  127. data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
  128. data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
  129. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
  130. data/vendor/faiss/c_api/IndexShards_c.h +0 -39
  131. data/vendor/faiss/c_api/Index_c.cpp +0 -105
  132. data/vendor/faiss/c_api/Index_c.h +0 -183
  133. data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
  134. data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
  135. data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
  136. data/vendor/faiss/c_api/clone_index_c.h +0 -32
  137. data/vendor/faiss/c_api/error_c.h +0 -42
  138. data/vendor/faiss/c_api/error_impl.cpp +0 -27
  139. data/vendor/faiss/c_api/error_impl.h +0 -16
  140. data/vendor/faiss/c_api/faiss_c.h +0 -58
  141. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
  142. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
  143. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
  144. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
  145. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
  146. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
  147. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
  148. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
  149. data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
  150. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
  151. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
  152. data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
  153. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
  154. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
  155. data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
  156. data/vendor/faiss/c_api/index_factory_c.h +0 -30
  157. data/vendor/faiss/c_api/index_io_c.cpp +0 -42
  158. data/vendor/faiss/c_api/index_io_c.h +0 -50
  159. data/vendor/faiss/c_api/macros_impl.h +0 -110
  160. data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
  161. data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
  162. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
  163. data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
  164. data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
  165. data/vendor/faiss/misc/test_blas.cpp +0 -87
  166. data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
  167. data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
  168. data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
  169. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
  170. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
  171. data/vendor/faiss/tests/test_merge.cpp +0 -260
  172. data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
  173. data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
  174. data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
  175. data/vendor/faiss/tests/test_params_override.cpp +0 -236
  176. data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
  177. data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
  178. data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
  179. data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
  180. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
  181. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
  182. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
  183. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
  184. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
data/vendor/faiss/faiss/IndexIVFPQ.cpp
@@ -371,7 +371,7 @@ void IndexIVFPQ::reconstruct_from_offset (int64_t list_no, int64_t offset,
 
 
  /// 2G by default, accommodates tables up to PQ32 w/ 65536 centroids
- size_t IndexIVFPQ::precomputed_table_max_bytes = ((size_t)1) << 31;
+ size_t precomputed_table_max_bytes = ((size_t)1) << 31;
 
  /** Precomputed tables for residuals
   *
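Note: with this hunk the table budget is no longer the static member IndexIVFPQ::precomputed_table_max_bytes but a namespace-level global (its FAISS_API extern declaration appears in the IndexIVFPQ.h hunk below), so the limit is shared with the new IndexIVFPQFastScan. A minimal sketch of overriding it from application code; the 4 GB value is illustrative, not a recommendation:

    #include <faiss/IndexIVFPQ.h>

    int main() {
        // raise the budget so that larger type-1 tables are still precomputed
        faiss::precomputed_table_max_bytes = (size_t)1 << 32; // 4 GB
    }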
@@ -403,10 +403,22 @@ size_t IndexIVFPQ::precomputed_table_max_bytes = ((size_t)1) << 31;
   * is faster when the length of the lists is > ksub * M.
   */
 
- void IndexIVFPQ::precompute_table ()
+ void initialize_IVFPQ_precomputed_table (
+         int &use_precomputed_table,
+         const Index *quantizer,
+         const ProductQuantizer &pq,
+         AlignedTable<float> & precomputed_table,
+         bool verbose
+ )
  {
-     if (use_precomputed_table == -1)
+     size_t nlist = quantizer->ntotal;
+     size_t d = quantizer->d;
+     FAISS_THROW_IF_NOT(d == pq.d);
+ 
+     if (use_precomputed_table == -1) {
+         precomputed_table.resize (0);
          return;
+     }
 
      if (use_precomputed_table == 0) { // then choose the type of table
          if (quantizer->metric_type == METRIC_INNER_PRODUCT) {
@@ -414,6 +426,7 @@ void IndexIVFPQ::precompute_table ()
              printf("IndexIVFPQ::precompute_table: precomputed "
                     "tables not needed for inner product quantizers\n");
          }
+         precomputed_table.resize (0);
          return;
      }
      const MultiIndexQuantizer *miq =
@@ -492,6 +505,16 @@ void IndexIVFPQ::precompute_table ()
 
  }
 
+ void IndexIVFPQ::precompute_table ()
+ {
+     initialize_IVFPQ_precomputed_table (
+             use_precomputed_table, quantizer, pq, precomputed_table,
+             verbose
+     );
+ }
+ 
+ 
+ 
  namespace {
 
  using idx_t = Index::idx_t;
@@ -676,11 +699,12 @@ struct QueryTables {
          } else if (use_precomputed_table == 1) {
              dis0 = coarse_dis;
 
-             fvec_madd (pq.M * pq.ksub,
-                        &ivfpq.precomputed_table [key * pq.ksub * pq.M],
-                        -2.0, sim_table_2,
-                        sim_table);
- 
+             fvec_madd (
+                 pq.M * pq.ksub,
+                 ivfpq.precomputed_table.data() + key * pq.ksub * pq.M,
+                 -2.0, sim_table_2,
+                 sim_table
+             );
 
              if (polysemous_ht != 0) {
                  ivfpq.quantizer->compute_residual (qi, residual_vec, key);
@@ -706,8 +730,8 @@ struct QueryTables {
                  k >>= cpq.nbits;
 
                  // get corresponding table
-                 const float *pc = &ivfpq.precomputed_table
-                     [(ki * pq.M + cm * Mf) * pq.ksub];
+                 const float *pc = ivfpq.precomputed_table.data() +
+                     (ki * pq.M + cm * Mf) * pq.ksub;
 
                  if (polysemous_ht == 0) {
 
@@ -741,7 +765,8 @@ struct QueryTables {
          if (use_precomputed_table == 1) {
              dis0 = coarse_dis;
 
-             const float * s = &ivfpq.precomputed_table [key * pq.ksub * pq.M];
+             const float * s = ivfpq.precomputed_table.data() +
+                 key * pq.ksub * pq.M;
              for (int m = 0; m < pq.M; m++) {
                  sim_table_ptrs [m] = s;
                  s += pq.ksub;
@@ -761,8 +786,8 @@ struct QueryTables {
              int ki = k & ((uint64_t(1) << cpq.nbits) - 1);
              k >>= cpq.nbits;
 
-             const float *pc = &ivfpq.precomputed_table
-                 [(ki * pq.M + cm * Mf) * pq.ksub];
+             const float *pc = ivfpq.precomputed_table.data() +
+                 (ki * pq.M + cm * Mf) * pq.ksub;
 
              for (int m = m0; m < m0 + Mf; m++) {
                  sim_table_ptrs [m] = pc;
@@ -803,9 +828,8 @@ struct KnnSearchResults {
 
      inline void add (idx_t j, float dis) {
          if (C::cmp (heap_sim[0], dis)) {
-             heap_pop<C> (k, heap_sim, heap_ids);
              idx_t id = ids ? ids[j] : lo_build (key, j);
-             heap_push<C> (k, heap_sim, heap_ids, dis, id);
+             heap_replace_top<C> (k, heap_sim, heap_ids, dis, id);
              nup++;
          }
      }
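This hunk swaps a heap_pop immediately followed by a heap_push for the new heap_replace_top (utils/Heap.h is reworked in this release, +61 -50 above): the root of the bounded heap is overwritten and sifted down once instead of paying for two traversals. A self-contained sketch of the idea, using a plain array max-heap rather than the faiss implementation:

    #include <cstdio>
    #include <vector>

    // Max-heap of the k smallest distances seen so far (0-based layout).
    // Overwrite the root with `dis` and sift down once; equivalent to
    // pop + push, but with a single traversal, as in heap_replace_top.
    static void replace_top(std::vector<float>& h, float dis) {
        size_t k = h.size(), i = 0;
        for (;;) {
            size_t l = 2 * i + 1, r = l + 1;
            if (l >= k) break;
            size_t child = (r < k && h[r] > h[l]) ? r : l; // larger child
            if (h[child] <= dis) break;  // heap property already holds
            h[i] = h[child];             // pull the larger child up
            i = child;
        }
        h[i] = dis;  // the new value lands where the sift-down stopped
    }

    int main() {
        std::vector<float> heap = {9, 7, 8, 3, 5};    // a valid max-heap
        if (4.0f < heap[0]) replace_top(heap, 4.0f);  // 9 is evicted
        for (float v : heap) printf("%g ", v);        // prints: 8 7 4 3 5
        printf("\n");
    }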
data/vendor/faiss/faiss/IndexIVFPQ.h
@@ -16,7 +16,7 @@
  #include <faiss/IndexIVF.h>
  #include <faiss/IndexPQ.h>
  #include <faiss/impl/platform_macros.h>
- 
+ #include <faiss/utils/AlignedTable.h>
 
  namespace faiss {
 
@@ -28,10 +28,14 @@ struct IVFPQSearchParameters: IVFSearchParameters {
  };
 
 
+ 
+ FAISS_API extern size_t precomputed_table_max_bytes;
+ 
+ 
  /** Inverted file with Product Quantizer encoding. Each residual
   * vector is encoded as a product quantizer code.
   */
- struct FAISS_API IndexIVFPQ: IndexIVF {
+ struct IndexIVFPQ: IndexIVF {
      bool by_residual;       ///< Encode residual or plain vector?
 
      ProductQuantizer pq;    ///< produces the codes
@@ -45,18 +49,12 @@ struct FAISS_API IndexIVFPQ: IndexIVF {
 
      /** Precompute table that speed up query preprocessing at some
       * memory cost (used only for by_residual with L2 metric)
-      * =-1: force disable
-      * =0: decide heuristically (default: use tables only if they are
-      *     < precomputed_tables_max_bytes)
-      * =1: tables that work for all quantizers (size 256 * nlist * M)
-      * =2: specific version for MultiIndexQuantizer (much more compact)
       */
      int use_precomputed_table;
-     static size_t precomputed_table_max_bytes;
 
      /// if use_precompute_table
      /// size nlist * pq.M * pq.ksub
-     std::vector <float> precomputed_table;
+     AlignedTable<float> precomputed_table;
 
      IndexIVFPQ (
          Index * quantizer, size_t d, size_t nlist,
@@ -133,6 +131,24 @@ struct FAISS_API IndexIVFPQ: IndexIVF {
 
  };
 
+ /** Pre-compute distance tables for IVFPQ with by-residual and METRIC_L2
+  *
+  * @param use_precomputed_table (I/O)
+  *        =-1: force disable
+  *        =0: decide heuristically (default: use tables only if they are
+  *            < precomputed_tables_max_bytes), set use_precomputed_table on output
+  *        =1: tables that work for all quantizers (size 256 * nlist * M)
+  *        =2: specific version for MultiIndexQuantizer (much more compact)
+  * @param precomputed_table precomputed table to initialize
+  */
+ 
+ void initialize_IVFPQ_precomputed_table(
+         int &use_precomputed_table,
+         const Index *quantizer,
+         const ProductQuantizer &pq,
+         AlignedTable<float> & precomputed_table,
+         bool verbose
+ );
 
  /// statistics are robust to internal threading, but not if
  /// IndexIVFPQ::search_preassigned is called by multiple threads
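Since table construction is now a free function, it can be driven outside IndexIVFPQ::precompute_table(), which is how the new IndexIVFPQFastScan below reuses it. A hedged sketch of calling it directly; the data is random and the sizes are small illustrative values:

    #include <cstdlib>
    #include <vector>

    #include <faiss/IndexFlat.h>
    #include <faiss/IndexIVFPQ.h>

    int main() {
        size_t d = 32, nlist = 64, nt = 4096;
        std::vector<float> xt(nt * d);
        for (float& v : xt) v = rand() / float(RAND_MAX);

        faiss::IndexFlatL2 coarse(d);
        faiss::IndexIVFPQ index(&coarse, d, nlist, /*M=*/4, /*nbits=*/8);
        index.train(nt, xt.data()); // trains the coarse centroids and the PQ

        // same computation as index.precompute_table(), but the caller owns
        // the output table and the (I/O) table-type flag
        int table_type = 0; // 0 = decide heuristically, set on output
        faiss::AlignedTable<float> tab;
        faiss::initialize_IVFPQ_precomputed_table(
                table_type, &coarse, index.pq, tab, /*verbose=*/true);
    }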
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp (new file)
@@ -0,0 +1,1116 @@
+ /**
+  * Copyright (c) Facebook, Inc. and its affiliates.
+  *
+  * This source code is licensed under the MIT license found in the
+  * LICENSE file in the root directory of this source tree.
+  */
+ 
+ #include <faiss/IndexIVFPQFastScan.h>
+ 
+ #include <cassert>
+ #include <cstdio>
+ #include <inttypes.h>
+ 
+ #include <omp.h>
+ 
+ #include <memory>
+ 
+ #include <faiss/impl/FaissAssert.h>
+ #include <faiss/utils/utils.h>
+ #include <faiss/utils/distances.h>
+ #include <faiss/utils/simdlib.h>
+ #include <faiss/impl/AuxIndexStructures.h>
+ 
+ #include <faiss/invlists/BlockInvertedLists.h>
+ 
+ #include <faiss/impl/simd_result_handlers.h>
+ #include <faiss/utils/quantize_lut.h>
+ #include <faiss/impl/pq4_fast_scan.h>
+ 
+ namespace faiss {
+ 
+ using namespace simd_result_handlers;
+ 
+ 
+ inline size_t roundup(size_t a, size_t b) {
+     return (a + b - 1) / b * b;
+ }
+ 
+ 
+ IndexIVFPQFastScan::IndexIVFPQFastScan (
+         Index * quantizer, size_t d, size_t nlist,
+         size_t M, size_t nbits_per_idx,
+         MetricType metric, int bbs):
+     IndexIVF (quantizer, d, nlist, 0, metric),
+     pq (d, M, nbits_per_idx),
+     bbs (bbs)
+ {
+     FAISS_THROW_IF_NOT(nbits_per_idx == 4);
+     M2 = roundup(pq.M, 2);
+     by_residual = false; // set to false by default because it's much faster
+     is_trained = false;
+     code_size = pq.code_size;
+ 
+     replace_invlists(
+         new BlockInvertedLists(nlist, bbs, bbs * M2 / 2),
+         true
+     );
+ }
+ 
+ IndexIVFPQFastScan::IndexIVFPQFastScan ()
+ {
+     by_residual = false;
+     bbs = 0;
+     M2 = 0;
+ }
+ 
+ 
+ IndexIVFPQFastScan::IndexIVFPQFastScan(const IndexIVFPQ & orig, int bbs):
+     IndexIVF(
+         orig.quantizer, orig.d, orig.nlist,
+         orig.pq.code_size, orig.metric_type),
+     pq(orig.pq),
+     bbs(bbs)
+ {
+     FAISS_THROW_IF_NOT(orig.pq.nbits == 4);
+ 
+     by_residual = orig.by_residual;
+     ntotal = orig.ntotal;
+     is_trained = orig.is_trained;
+     nprobe = orig.nprobe;
+     size_t M = pq.M;
+ 
+     M2 = roundup(M, 2);
+ 
+     replace_invlists(
+         new BlockInvertedLists(orig.nlist, bbs, bbs * M2 / 2),
+         true
+     );
+ 
+     precomputed_table.resize(orig.precomputed_table.size());
+ 
+     if (precomputed_table.nbytes() > 0) {
+         memcpy(precomputed_table.get(), orig.precomputed_table.data(),
+                precomputed_table.nbytes()
+         );
+     }
+ 
+     for(size_t i = 0; i < nlist; i++) {
+         size_t nb = orig.invlists->list_size(i);
+         size_t nb2 = roundup(nb, bbs);
+         AlignedTable<uint8_t> tmp(nb2 * M2 / 2);
+         pq4_pack_codes(
+             InvertedLists::ScopedCodes(orig.invlists, i).get(),
+             nb, M, nb2, bbs, M2,
+             tmp.get()
+         );
+         invlists->add_entries(
+             i, nb,
+             InvertedLists::ScopedIds(orig.invlists, i).get(),
+             tmp.get()
+         );
+     }
+ 
+     orig_invlists = orig.invlists;
+ }
+ 
+ 
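The conversion constructor above repacks the codes of an existing IndexIVFPQ into the blocked layout (bbs vectors per block, two 4-bit codes per byte). A hedged sketch of upgrading a trained 4-bit IVFPQ index; the data is random, the sizes are illustrative, and only pq.nbits == 4 is accepted:

    #include <cstdlib>
    #include <vector>

    #include <faiss/IndexFlat.h>
    #include <faiss/IndexIVFPQ.h>
    #include <faiss/IndexIVFPQFastScan.h>

    int main() {
        size_t d = 32, nlist = 64, nb = 10000;
        std::vector<float> xb(nb * d);
        for (float& v : xb) v = rand() / float(RAND_MAX);

        faiss::IndexFlatL2 coarse(d);
        // nbits_per_idx must be 4: the kernels scan 4-bit codes with SIMD
        faiss::IndexIVFPQ index(&coarse, d, nlist, /*M=*/8, /*nbits=*/4);
        index.train(nb, xb.data());
        index.add(nb, xb.data());

        // repack the inverted lists into blocks of bbs = 32 vectors
        faiss::IndexIVFPQFastScan fast(index, /*bbs=*/32);
        fast.nprobe = 8;

        std::vector<float> dis(5);
        std::vector<faiss::Index::idx_t> ids(5);
        fast.search(1, xb.data(), 5, dis.data(), ids.data());
    }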
+ 
+ /*********************************************************
+  * Training
+  *********************************************************/
+ 
+ void IndexIVFPQFastScan::train_residual (idx_t n, const float *x_in)
+ {
+ 
+     const float * x = fvecs_maybe_subsample (
+         d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub,
+         x_in, verbose, pq.cp.seed);
+ 
+     std::unique_ptr<float []> del_x;
+     if (x != x_in) {
+         del_x.reset((float*)x);
+     }
+ 
+     const float *trainset;
+     AlignedTable<float> residuals;
+ 
+     if (by_residual) {
+         if(verbose) printf("computing residuals\n");
+         std::vector<idx_t> assign(n);
+         quantizer->assign (n, x, assign.data());
+         residuals.resize(n * d);
+         for (idx_t i = 0; i < n; i++) {
+             quantizer->compute_residual (
+                 x + i * d,
+                 residuals.data() + i * d,
+                 assign[i]
+             );
+         }
+         trainset = residuals.data();
+     } else {
+         trainset = x;
+     }
+ 
+     if (verbose) {
+         printf ("training %zdx%zd product quantizer on %zd vectors in %dD\n",
+                 pq.M, pq.ksub, long(n), d);
+     }
+     pq.verbose = verbose;
+     pq.train (n, trainset);
+ 
+     if (by_residual && metric_type == METRIC_L2) {
+         precompute_table();
+     }
+ 
+ }
+ 
+ void IndexIVFPQFastScan::precompute_table ()
+ {
+     initialize_IVFPQ_precomputed_table(
+         use_precomputed_table,
+         quantizer, pq, precomputed_table, verbose
+     );
+ }
+ 
+ 
+ /*********************************************************
+  * Code management functions
+  *********************************************************/
+ 
+ 
+ 
+ void IndexIVFPQFastScan::encode_vectors(
+         idx_t n, const float* x, const idx_t *list_nos,
+         uint8_t * codes, bool include_listnos) const
+ {
+ 
+     if (by_residual) {
+         AlignedTable<float> residuals (n * d);
+         for (size_t i = 0; i < n; i++) {
+             if (list_nos[i] < 0) {
+                 memset (residuals.data() + i * d, 0, sizeof(residuals[0]) * d);
+             } else {
+                 quantizer->compute_residual (
+                     x + i * d, residuals.data() + i * d, list_nos[i]);
+             }
+         }
+         pq.compute_codes (residuals.data(), codes, n);
+     } else {
+         pq.compute_codes (x, codes, n);
+     }
+ 
+     if (include_listnos) {
+         size_t coarse_size = coarse_code_size();
+         for (idx_t i = n - 1; i >= 0; i--) {
+             uint8_t * code = codes + i * (coarse_size + code_size);
+             memmove (code + coarse_size,
+                      codes + i * code_size, code_size);
+             encode_listno (list_nos[i], code);
+         }
+     }
+ }
+ 
+ 
+ 
+ void IndexIVFPQFastScan::add_with_ids (
+         idx_t n, const float * x, const idx_t *xids) {
+ 
+     // copied from IndexIVF::add_with_ids --->
+ 
+     // do some blocking to avoid excessive allocs
+     idx_t bs = 65536;
+     if (n > bs) {
+         for (idx_t i0 = 0; i0 < n; i0 += bs) {
+             idx_t i1 = std::min (n, i0 + bs);
+             if (verbose) {
+                 printf(" IndexIVFPQFastScan::add_with_ids %zd: %zd",
+                        size_t(i0), size_t(i1));
+             }
+             add_with_ids (i1 - i0, x + i0 * d,
+                           xids ? xids + i0 : nullptr);
+         }
+         return;
+     }
+     InterruptCallback::check();
+ 
+     AlignedTable<uint8_t> codes(n * code_size);
+ 
+     FAISS_THROW_IF_NOT (is_trained);
+     direct_map.check_can_add (xids);
+ 
+     std::unique_ptr<idx_t []> idx(new idx_t[n]);
+     quantizer->assign (n, x, idx.get());
+     size_t nadd = 0, nminus1 = 0;
+ 
+     for (size_t i = 0; i < n; i++) {
+         if (idx[i] < 0) nminus1++;
+     }
+ 
+     AlignedTable<uint8_t> flat_codes(n * code_size);
+     encode_vectors (n, x, idx.get(), flat_codes.get());
+ 
+     DirectMapAdd dm_adder(direct_map, n, xids);
+ 
+     // <---
+ 
+     BlockInvertedLists *bil = dynamic_cast<BlockInvertedLists*>(invlists);
+     FAISS_THROW_IF_NOT_MSG (bil, "only block inverted lists supported");
+ 
+     // prepare batches
+     std::vector<idx_t> order(n);
+     for(idx_t i = 0; i < n ; i++) { order[i] = i; }
+ 
+     // TODO should not need stable
+     std::stable_sort(order.begin(), order.end(),
+         [&idx](idx_t a, idx_t b) {
+             return idx[a] < idx[b];
+         }
+     );
+ 
+     // TODO parallelize
+     idx_t i0 = 0;
+     while (i0 < n) {
+         idx_t list_no = idx[order[i0]];
+         idx_t i1 = i0 + 1;
+         while (i1 < n && idx[order[i1]] == list_no) {
+             i1 ++;
+         }
+ 
+         if (list_no == -1) {
+             i0 = i1;
+             continue;
+         }
+ 
+         // make linear array
+         AlignedTable<uint8_t> list_codes((i1 - i0) * code_size);
+         size_t list_size = bil->list_size(list_no);
+ 
+         bil->resize(list_no, list_size + i1 - i0);
+ 
+         for(idx_t i = i0; i < i1; i++) {
+             size_t ofs = list_size + i - i0;
+             idx_t id = xids ? xids[order[i]] : ntotal + order[i];
+             dm_adder.add (order[i], list_no, ofs);
+             bil->ids[list_no][ofs] = id;
+             memcpy(
+                 list_codes.data() + (i - i0) * code_size,
+                 flat_codes.data() + order[i] * code_size,
+                 code_size
+             );
+             nadd++;
+         }
+         pq4_pack_codes_range(
+             list_codes.data(), pq.M,
+             list_size, list_size + i1 - i0,
+             bbs, M2, bil->codes[list_no].data()
+         );
+ 
+         i0 = i1;
+     }
+ 
+     ntotal += n;
+ 
+ }
+ 
+ 
+ 
+ /*********************************************************
+  * search
+  *********************************************************/
+ 
+ 
+ namespace {
+ 
+ // from impl/ProductQuantizer.cpp
+ template <class C, typename dis_t>
+ void pq_estimators_from_tables_generic(
+         const ProductQuantizer& pq, size_t nbits,
+         const uint8_t *codes, size_t ncodes,
+         const dis_t *dis_table, const int64_t * ids,
+         float dis0,
+         size_t k, typename C::T *heap_dis, int64_t *heap_ids)
+ {
+     using accu_t = typename C::T;
+     const size_t M = pq.M;
+     const size_t ksub = pq.ksub;
+     for (size_t j = 0; j < ncodes; ++j) {
+         PQDecoderGeneric decoder(
+             codes + j * pq.code_size, nbits
+         );
+         accu_t dis = dis0;
+         const dis_t * dt = dis_table;
+         for (size_t m = 0; m < M; m++) {
+             uint64_t c = decoder.decode();
+             dis += dt[c];
+             dt += ksub;
+         }
+ 
+         if (C::cmp(heap_dis[0], dis)) {
+             heap_pop<C>(k, heap_dis, heap_ids);
+             heap_push<C>(k, heap_dis, heap_ids, dis, ids[j]);
+         }
+     }
+ }
+ 
+ using idx_t = Index::idx_t;
+ using namespace quantize_lut;
+ 
+ void fvec_madd_avx (
+         size_t n, const float *a,
+         float bf, const float *b, float *c)
+ {
+     assert(is_aligned_pointer(a));
+     assert(is_aligned_pointer(b));
+     assert(is_aligned_pointer(c));
+     assert(n % 8 == 0);
+     simd8float32 bf8(bf);
+     n /= 8;
+     for(size_t i = 0; i < n; i++) {
+         simd8float32 ai(a);
+         simd8float32 bi(b);
+ 
+         simd8float32 ci = fmadd(bf8, bi, ai);
+         ci.store(c);
+         c += 8;
+         a += 8;
+         b += 8;
+     }
+ 
+ }
+ 
+ } // anonymous namespace
+ 
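fvec_madd_avx above is written against the simdlib wrappers (faiss/utils/simdlib.h, also new in this release) rather than raw intrinsics: simd8float32 holds 8 floats and maps to AVX2 or to the emulated fallback. A hedged sketch of the same fused-multiply-add pattern in isolation, using only the operations the code above relies on:

    #include <cstdio>

    #include <faiss/utils/simdlib.h>

    int main() {
        float a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        float b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        float c[8];

        faiss::simd8float32 av(a);  // load 8 floats
        faiss::simd8float32 bv(b);
        // c[i] = 2 * b[i] + a[i], as in fvec_madd_avx's inner loop
        faiss::simd8float32 cv = faiss::fmadd(faiss::simd8float32(2.0f), bv, av);
        cv.store(c);

        for (float v : c) printf("%g ", v); // 17 16 15 14 13 12 11 10
        printf("\n");
    }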
+ /*********************************************************
+  * Look-Up Table functions
+  *********************************************************/
+ 
+ 
+ void IndexIVFPQFastScan::compute_LUT(
+         size_t n, const float *x,
+         const idx_t *coarse_ids, const float *coarse_dis,
+         AlignedTable<float> & dis_tables,
+         AlignedTable<float> & biases
+ ) const
+ {
+     const IndexIVFPQFastScan & ivfpq = *this;
+     size_t dim12 = pq.ksub * pq.M;
+     size_t d = pq.d;
+     size_t nprobe = ivfpq.nprobe;
+ 
+     if (ivfpq.by_residual) {
+ 
+         if (ivfpq.metric_type == METRIC_L2) {
+ 
+             dis_tables.resize(n * nprobe * dim12);
+ 
+             if (ivfpq.use_precomputed_table == 1) {
+                 biases.resize(n * nprobe);
+                 memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
+ 
+                 AlignedTable<float> ip_table(n * dim12);
+                 pq.compute_inner_prod_tables (n, x, ip_table.get());
+ 
+ #pragma omp parallel for if (n * nprobe > 8000)
+                 for(idx_t ij = 0; ij < n * nprobe; ij++) {
+                     idx_t i = ij / nprobe;
+                     float *tab = dis_tables.get() + ij * dim12;
+                     idx_t cij = coarse_ids[ij];
+ 
+                     if (cij >= 0) {
+                         fvec_madd_avx (
+                             dim12,
+                             precomputed_table.get() + cij * dim12,
+                             -2, ip_table.get() + i * dim12,
+                             tab
+                         );
+                     } else {
+                         // fill with NaNs so that they are ignored during
+                         // LUT quantization
+                         memset (tab, -1, sizeof(float) * dim12);
+                     }
+                 }
+ 
+             } else {
+ 
+                 std::unique_ptr<float[]> xrel(new float[n * nprobe * d]);
+                 biases.resize(n * nprobe);
+                 memset(biases.get(), 0, sizeof(float) * n * nprobe);
+ 
+ #pragma omp parallel for if (n * nprobe > 8000)
+                 for(idx_t ij = 0; ij < n * nprobe; ij++) {
+                     idx_t i = ij / nprobe;
+                     float *xij = &xrel[ij * d];
+                     idx_t cij = coarse_ids[ij];
+ 
+                     if (cij >= 0) {
+                         ivfpq.quantizer->compute_residual(
+                             x + i * d, xij, cij);
+                     } else {
+                         // will fill with NaNs
+                         memset(xij, -1, sizeof(float) * d);
+                     }
+                 }
+ 
+                 pq.compute_distance_tables (
+                     n * nprobe, xrel.get(), dis_tables.get());
+ 
+             }
+ 
+         } else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) {
+             dis_tables.resize(n * dim12);
+             pq.compute_inner_prod_tables (n, x, dis_tables.get());
+             // compute_inner_prod_tables(pq, n, x, dis_tables.get());
+ 
+             biases.resize(n * nprobe);
+             memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
+         } else {
+             FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type);
+         }
+ 
+     } else {
+         dis_tables.resize(n * dim12);
+         if (ivfpq.metric_type == METRIC_L2) {
+             pq.compute_distance_tables (n, x, dis_tables.get());
+         } else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) {
+             pq.compute_inner_prod_tables (n, x, dis_tables.get());
+         } else {
+             FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type);
+         }
+     }
+ 
+ }
+ 
+ void IndexIVFPQFastScan::compute_LUT_uint8(
+         size_t n, const float *x,
+         const idx_t *coarse_ids, const float *coarse_dis,
+         AlignedTable<uint8_t> & dis_tables,
+         AlignedTable<uint16_t> & biases,
+         float * normalizers
+ ) const {
+     const IndexIVFPQFastScan & ivfpq = *this;
+     AlignedTable<float> dis_tables_float;
+     AlignedTable<float> biases_float;
+ 
+     uint64_t t0 = get_cy();
+     compute_LUT(
+         n, x,
+         coarse_ids, coarse_dis,
+         dis_tables_float, biases_float
+     );
+     IVFFastScan_stats.t_compute_distance_tables += get_cy() - t0;
+ 
+     bool lut_is_3d = ivfpq.by_residual && ivfpq.metric_type == METRIC_L2;
+     size_t dim123 = pq.ksub * pq.M;
+     size_t dim123_2 = pq.ksub * M2;
+     if (lut_is_3d) {
+         dim123 *= nprobe;
+         dim123_2 *= nprobe;
+     }
+     dis_tables.resize(n * dim123_2);
+     if (biases_float.get()) {
+         biases.resize(n * nprobe);
+     }
+     uint64_t t1 = get_cy();
+ 
+ #pragma omp parallel for if (n > 100)
+     for(int64_t i = 0; i < n; i++) {
+         const float *t_in = dis_tables_float.get() + i * dim123;
+         const float *b_in = nullptr;
+         uint8_t *t_out = dis_tables.get() + i * dim123_2;
+         uint16_t *b_out = nullptr;
+         if (biases_float.get()) {
+             b_in = biases_float.get() + i * nprobe;
+             b_out = biases.get() + i * nprobe;
+         }
+ 
+         quantize_LUT_and_bias(
+             nprobe, pq.M, pq.ksub, lut_is_3d,
+             t_in, b_in,
+             t_out, M2, b_out,
+             normalizers + 2 * i, normalizers + 2 * i + 1
+         );
+     }
+     IVFFastScan_stats.t_round += get_cy() - t1;
+ 
+ }
+ 
+ 
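compute_LUT_uint8 compresses each query's float LUT to uint8 with a per-query scale a and offset b (the two normalizers written per query), so the SIMD kernels can accumulate distances in uint16. The mapping back to float is the one search_implem_2 applies verbatim: dis_float = b + dis_u16 * (1 / a). A tiny sketch of that decode step; the normalizer values are made up for illustration:

    #include <cstdint>
    #include <cstdio>

    // Decode a uint16 fast-scan accumulator back to a float distance using
    // the per-query normalizers (scale a, offset b), as search_implem_2 does.
    float decode_distance(uint16_t accu, float a, float b) {
        float one_a = 1.0f / a;
        return b + accu * one_a;
    }

    int main() {
        printf("%.4f\n", decode_distance(1234, 12.5f, 0.4f)); // 99.1200
    }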
+ /*********************************************************
+  * Search functions
+  *********************************************************/
+ 
+ template<bool is_max>
+ void IndexIVFPQFastScan::search_dispatch_implem(
+         idx_t n,
+         const float* x,
+         idx_t k,
+         float* distances,
+         idx_t* labels) const
+ {
+     using Cfloat = typename std::conditional<is_max,
+         CMax<float, int64_t>, CMin<float, int64_t> >::type;
+ 
+     using C = typename std::conditional<is_max,
+         CMax<uint16_t, int64_t>, CMin<uint16_t, int64_t> >::type;
+ 
+     if (n == 0) {
+         return;
+     }
+ 
+     // actual implementation used
+     int impl = implem;
+ 
+     if (impl == 0) {
+         if (bbs == 32) {
+             impl = 12;
+         } else {
+             impl = 10;
+         }
+         if (k > 20) {
+             impl ++;
+         }
+     }
+ 
+     if (impl == 1) {
+         search_implem_1<Cfloat>(n, x, k, distances, labels);
+     } else if (impl == 2) {
+         search_implem_2<C>(n, x, k, distances, labels);
+ 
+     } else if (impl >= 10 && impl <= 13) {
+         size_t ndis = 0, nlist_visited = 0;
+ 
+         if (n < 2) {
+             if (impl == 12 || impl == 13) {
+                 search_implem_12<C>
+                     (n, x, k, distances, labels, impl, &ndis, &nlist_visited);
+             } else {
+                 search_implem_10<C>
+                     (n, x, k, distances, labels, impl, &ndis, &nlist_visited);
+             }
+         } else {
+             // explicitly slice over threads
+             int nslice;
+             if (n <= omp_get_max_threads()) {
+                 nslice = n;
+             } else if (by_residual && metric_type == METRIC_L2) {
+                 // make sure we don't make too big LUT tables
+                 size_t lut_size_per_query =
+                     pq.M * pq.ksub * nprobe * (sizeof(float) + sizeof(uint8_t));
+ 
+                 size_t max_lut_size = precomputed_table_max_bytes;
+                 // how many queries we can handle within mem budget
+                 size_t nq_ok = std::max(max_lut_size / lut_size_per_query, size_t(1));
+                 nslice = roundup(std::max(size_t(n / nq_ok), size_t(1)), omp_get_max_threads());
+             } else {
+                 // LUTs unlikely to be a limiting factor
+                 nslice = omp_get_max_threads();
+             }
+ 
+ #pragma omp parallel for reduction(+: ndis, nlist_visited)
+             for (int slice = 0; slice < nslice; slice++) {
+                 idx_t i0 = n * slice / nslice;
+                 idx_t i1 = n * (slice + 1) / nslice;
+                 float *dis_i = distances + i0 * k;
+                 idx_t *lab_i = labels + i0 * k;
+                 if (impl == 12 || impl == 13) {
+                     search_implem_12<C>(
+                         i1 - i0, x + i0 * d, k, dis_i, lab_i,
+                         impl, &ndis, &nlist_visited
+                     );
+                 } else {
+                     search_implem_10<C>(
+                         i1 - i0, x + i0 * d, k, dis_i, lab_i,
+                         impl, &ndis, &nlist_visited
+                     );
+                 }
+             }
+         }
+         indexIVF_stats.nq += n;
+         indexIVF_stats.ndis += ndis;
+         indexIVF_stats.nlist += nlist_visited;
+     } else {
+         FAISS_THROW_FMT("implem %d does not exist", implem);
+     }
+ 
+ }
+ 
+ 
+ void IndexIVFPQFastScan::search(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels) const
+ {
+     if (metric_type == METRIC_L2) {
+         search_dispatch_implem<true>(n, x, k, distances, labels);
+     } else {
+         search_dispatch_implem<false>(n, x, k, distances, labels);
+     }
+ }
+ 
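For reference, implem = 0 above lets the dispatcher choose: 12/13 when bbs == 32, 10/11 otherwise, with the +1 reservoir variants picked when k > 20. A hedged end-to-end sketch that forces one implementation; the data is random, the sizes and the implem value are illustrative, and the implem field is assumed public, as the dispatcher reads it:

    #include <cstdlib>
    #include <vector>

    #include <faiss/IndexFlat.h>
    #include <faiss/IndexIVFPQFastScan.h>

    int main() {
        size_t d = 32, nlist = 64, nb = 20000, k = 10;
        std::vector<float> xb(nb * d);
        for (float& v : xb) v = rand() / float(RAND_MAX);

        faiss::IndexFlatL2 coarse(d);
        faiss::IndexIVFPQFastScan index(&coarse, d, nlist, /*M=*/8,
                                        /*nbits_per_idx=*/4,
                                        faiss::METRIC_L2, /*bbs=*/32);
        index.train(nb, xb.data());
        index.add(nb, xb.data());
        index.nprobe = 8;

        index.implem = 10; // e.g. force the single-query kernel to compare

        std::vector<float> dis(k);
        std::vector<faiss::Index::idx_t> ids(k);
        index.search(1, xb.data(), k, dis.data(), ids.data());
    }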
+ template<class C>
+ void IndexIVFPQFastScan::search_implem_1(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels) const
+ {
+     FAISS_THROW_IF_NOT(orig_invlists);
+ 
+     std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+     std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+ 
+     quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+ 
+     size_t dim12 = pq.ksub * pq.M;
+     AlignedTable<float> dis_tables;
+     AlignedTable<float> biases;
+ 
+     compute_LUT (
+         n, x,
+         coarse_ids.get(), coarse_dis.get(),
+         dis_tables, biases
+     );
+ 
+     bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+ 
+     size_t ndis = 0, nlist_visited = 0;
+ 
+ #pragma omp parallel for reduction(+: ndis, nlist_visited)
+     for(idx_t i = 0; i < n; i++) {
+         int64_t *heap_ids = labels + i * k;
+         float *heap_dis = distances + i * k;
+         heap_heapify<C> (k, heap_dis, heap_ids);
+         float *LUT = nullptr;
+ 
+         if (single_LUT) {
+             LUT = dis_tables.get() + i * dim12;
+         }
+         for(idx_t j = 0; j < nprobe; j++) {
+             if (!single_LUT) {
+                 LUT = dis_tables.get() + (i * nprobe + j) * dim12;
+             }
+             idx_t list_no = coarse_ids[i * nprobe + j];
+             if (list_no < 0) continue;
+             size_t ls = orig_invlists->list_size(list_no);
+             if (ls == 0) continue;
+             InvertedLists::ScopedCodes codes(orig_invlists, list_no);
+             InvertedLists::ScopedIds ids(orig_invlists, list_no);
+ 
+             float bias = biases.get() ? biases[i * nprobe + j] : 0;
+ 
+             pq_estimators_from_tables_generic<C>(
+                 pq, pq.nbits, codes.get(), ls,
+                 LUT, ids.get(), bias,
+                 k, heap_dis, heap_ids
+             );
+             nlist_visited ++;
+             ndis ++;
+         }
+         heap_reorder<C> (k, heap_dis, heap_ids);
+     }
+     indexIVF_stats.nq += n;
+     indexIVF_stats.ndis += ndis;
+     indexIVF_stats.nlist += nlist_visited;
+ }
+ 
+ template<class C>
+ void IndexIVFPQFastScan::search_implem_2(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels) const
+ {
+     FAISS_THROW_IF_NOT(orig_invlists);
+ 
+     std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+     std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+ 
+     quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+ 
+     size_t dim12 = pq.ksub * M2;
+     AlignedTable<uint8_t> dis_tables;
+     AlignedTable<uint16_t> biases;
+     std::unique_ptr<float[]> normalizers(new float[2 * n]);
+ 
+     compute_LUT_uint8 (
+         n, x,
+         coarse_ids.get(), coarse_dis.get(),
+         dis_tables, biases,
+         normalizers.get()
+     );
+ 
+ 
+     bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+ 
+     size_t ndis = 0, nlist_visited = 0;
+ 
+ #pragma omp parallel for reduction(+: ndis, nlist_visited)
+     for(idx_t i = 0; i < n; i++) {
+         std::vector<uint16_t> tmp_dis(k);
+         int64_t *heap_ids = labels + i * k;
+         uint16_t *heap_dis = tmp_dis.data();
+         heap_heapify<C> (k, heap_dis, heap_ids);
+         const uint8_t *LUT = nullptr;
+ 
+         if (single_LUT) {
+             LUT = dis_tables.get() + i * dim12;
+         }
+         for(idx_t j = 0; j < nprobe; j++) {
+             if (!single_LUT) {
+                 LUT = dis_tables.get() + (i * nprobe + j) * dim12;
+             }
+             idx_t list_no = coarse_ids[i * nprobe + j];
+             if (list_no < 0) continue;
+             size_t ls = orig_invlists->list_size(list_no);
+             if (ls == 0) continue;
+             InvertedLists::ScopedCodes codes(orig_invlists, list_no);
+             InvertedLists::ScopedIds ids(orig_invlists, list_no);
+ 
+             uint16_t bias = biases.get() ? biases[i * nprobe + j] : 0;
+ 
+             pq_estimators_from_tables_generic<C>(
+                 pq, pq.nbits, codes.get(), ls,
+                 LUT, ids.get(), bias,
+                 k, heap_dis, heap_ids
+             );
+ 
+             nlist_visited++;
+             ndis += ls;
+         }
+         heap_reorder<C> (k, heap_dis, heap_ids);
+         // convert distances to float
+         {
+             float one_a = 1 / normalizers[2 * i], b = normalizers[2 * i + 1];
+             if (skip & 16) {
+                 one_a = 1;
+                 b = 0;
+             }
+             float *heap_dis_float = distances + i * k;
+             for (int j = 0; j < k; j++) {
+                 heap_dis_float[j] = b + heap_dis[j] * one_a;
+             }
+         }
+     }
+     indexIVF_stats.nq += n;
+     indexIVF_stats.ndis += ndis;
+     indexIVF_stats.nlist += nlist_visited;
+ }
+ 
+ 
+ 
+ template<class C>
+ void IndexIVFPQFastScan::search_implem_10(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels,
+         int impl, size_t *ndis_out, size_t *nlist_out) const
+ {
+     memset(distances, -1, sizeof(float) * k * n);
+     memset(labels, -1, sizeof(idx_t) * k * n);
+ 
+     using HeapHC = HeapHandler<C, true>;
+     using ReservoirHC = ReservoirHandler<C, true>;
+     using SingleResultHC = SingleResultHandler<C, true>;
+ 
+ 
+     std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+     std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+ 
+     uint64_t times[10];
+     memset(times, 0, sizeof(times));
+     int ti = 0;
+ #define TIC times[ti++] = get_cy()
+     TIC;
+ 
+     quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+ 
+     TIC;
+ 
+     size_t dim12 = pq.ksub * M2;
+     AlignedTable<uint8_t> dis_tables;
+     AlignedTable<uint16_t> biases;
+     std::unique_ptr<float[]> normalizers (new float[2 * n]);
+ 
+     compute_LUT_uint8 (
+         n, x,
+         coarse_ids.get(), coarse_dis.get(),
+         dis_tables, biases, normalizers.get()
+     );
+ 
+     TIC;
+ 
+     bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+ 
+     TIC;
+     size_t ndis = 0, nlist_visited = 0;
+ 
+     {
+         AlignedTable<uint16_t> tmp_distances(k);
+         for(idx_t i = 0; i < n; i++) {
+             const uint8_t *LUT = nullptr;
+             int qmap1[1] = {0};
+             std::unique_ptr<SIMDResultHandler<C, true> > handler;
+ 
+             if (k == 1) {
+                 handler.reset(new SingleResultHC(1, 0));
+             } else if (impl == 10) {
+                 handler.reset(new HeapHC(1, tmp_distances.get(), labels + i * k, k, 0));
+             } else if (impl == 11) {
+                 handler.reset(new ReservoirHC(1, 0, k, 2 * k));
+             } else {
+                 FAISS_THROW_MSG("invalid");
+             }
+ 
+             handler->q_map = qmap1;
+ 
+             if (single_LUT) {
+                 LUT = dis_tables.get() + i * dim12;
+             }
+             for(idx_t j = 0; j < nprobe; j++) {
+                 size_t ij = i * nprobe + j;
+                 if (!single_LUT) {
+                     LUT = dis_tables.get() + ij * dim12;
+                 }
+                 if (biases.get()) {
+                     handler->dbias = biases.get() + ij;
+                 }
+ 
+                 idx_t list_no = coarse_ids[ij];
+                 if (list_no < 0) continue;
+                 size_t ls = invlists->list_size(list_no);
+                 if (ls == 0) continue;
+ 
+                 InvertedLists::ScopedCodes codes(invlists, list_no);
+                 InvertedLists::ScopedIds ids(invlists, list_no);
+ 
+                 handler->ntotal = ls;
+                 handler->id_map = ids.get();
+ 
+ #define DISPATCH(classHC) \
+                 if(auto *res = dynamic_cast<classHC* > (handler.get())) { \
+                     pq4_accumulate_loop( \
+                         1, roundup(ls, bbs), bbs, M2, \
+                         codes.get(), LUT, \
+                         *res \
+                     ); \
+                 }
+                 DISPATCH(HeapHC)
+                 else DISPATCH(ReservoirHC)
+                 else DISPATCH(SingleResultHC)
+ #undef DISPATCH
+ 
+                 nlist_visited ++;
+                 ndis ++;
+             }
+ 
+             handler->to_flat_arrays(
+                 distances + i * k, labels + i * k,
+                 skip & 16 ? nullptr : normalizers.get() + i * 2
+             );
+         }
+     }
+     *ndis_out = ndis;
+     *nlist_out = nlist;
+ }
+ 
+ 
+ 
+ template<class C>
+ void IndexIVFPQFastScan::search_implem_12(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels,
+         int impl, size_t *ndis_out, size_t *nlist_out) const
+ {
+     if (n == 0) { // does not work well with reservoir
+         return;
+     }
+     FAISS_THROW_IF_NOT(bbs == 32);
+ 
+     std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+     std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+ 
+     uint64_t times[10];
+     memset(times, 0, sizeof(times));
+     int ti = 0;
+ #define TIC times[ti++] = get_cy()
+     TIC;
+ 
+     quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+ 
+     TIC;
+ 
+     size_t dim12 = pq.ksub * M2;
+     AlignedTable<uint8_t> dis_tables;
+     AlignedTable<uint16_t> biases;
+     std::unique_ptr<float[]> normalizers (new float[2 * n]);
+ 
+     compute_LUT_uint8 (
+         n, x,
+         coarse_ids.get(), coarse_dis.get(),
+         dis_tables, biases, normalizers.get()
+     );
+ 
+     TIC;
+ 
+     struct QC {
+         int qno;     // sequence number of the query
+         int list_no; // list to visit
+         int rank;    // this is the rank'th result of the coarse quantizer
+     };
+     bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+ 
+     std::vector<QC> qcs;
+     {
+         int ij = 0;
+         for(int i = 0; i < n; i++) {
+             for(int j = 0; j < nprobe; j++) {
+                 if (coarse_ids[ij] >= 0) {
+                     qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)});
+                 }
+                 ij++;
+             }
+         }
+         std::sort(
+             qcs.begin(), qcs.end(),
+             [](const QC &a, const QC & b) {
+                 return a.list_no < b.list_no;
+             }
+         );
+     }
+     TIC;
+ 
+     // prepare the result handlers
+ 
+     std::unique_ptr<SIMDResultHandler<C, true> > handler;
+     AlignedTable<uint16_t> tmp_distances;
+ 
+     using HeapHC = HeapHandler<C, true>;
+     using ReservoirHC = ReservoirHandler<C, true>;
+     using SingleResultHC = SingleResultHandler<C, true>;
+ 
+     if (k == 1) {
+         handler.reset(new SingleResultHC(n, 0));
+     } else if (impl == 12) {
+         tmp_distances.resize(n * k);
+         handler.reset(new HeapHC(n, tmp_distances.get(), labels, k, 0));
+     } else if (impl == 13) {
+         handler.reset(new ReservoirHC(n, 0, k, 2 * k));
+     }
+ 
+     int qbs2 = this->qbs2 ? this->qbs2 : 11;
+ 
+     std::vector<uint16_t> tmp_bias;
+     if (biases.get()) {
+         tmp_bias.resize(qbs2);
+         handler->dbias = tmp_bias.data();
+     }
+     TIC;
+ 
+     size_t ndis = 0;
+ 
+     size_t i0 = 0;
+     uint64_t t_copy_pack = 0, t_scan = 0;
+     while (i0 < qcs.size()) {
+         uint64_t tt0 = get_cy();
+ 
+         // find all queries that access this inverted list
+         int list_no = qcs[i0].list_no;
+         size_t i1 = i0 + 1;
+ 
+         while(i1 < qcs.size() && i1 < i0 + qbs2) {
+             if (qcs[i1].list_no != list_no) {
+                 break;
+             }
+             i1++;
+         }
+ 
+         size_t list_size = invlists->list_size(list_no);
+ 
+         if (list_size == 0) {
+             i0 = i1;
+             continue;
+         }
+ 
+         // re-organize LUTs and biases into the right order
+         int nc = i1 - i0;
+ 
+         std::vector<int> q_map(nc), lut_entries(nc);
+         AlignedTable<uint8_t> LUT(nc * dim12);
+         memset(LUT.get(), -1, nc * dim12);
+         int qbs = pq4_preferred_qbs(nc);
+ 
+         for(size_t i = i0; i < i1; i++) {
+             const QC & qc = qcs[i];
+             q_map[i - i0] = qc.qno;
+             int ij = qc.qno * nprobe + qc.rank;
+             lut_entries[i - i0] = single_LUT ? qc.qno : ij;
+             if (biases.get()) {
+                 tmp_bias[i - i0] = biases[ij];
+             }
+         }
+         pq4_pack_LUT_qbs_q_map(
+             qbs, M2, dis_tables.get(), lut_entries.data(),
+             LUT.get()
+         );
+ 
+         // access the inverted list
+ 
+         ndis += (i1 - i0) * list_size;
+ 
+         InvertedLists::ScopedCodes codes(invlists, list_no);
+         InvertedLists::ScopedIds ids(invlists, list_no);
+ 
+         // prepare the handler
+ 
+         handler->ntotal = list_size;
+         handler->q_map = q_map.data();
+         handler->id_map = ids.get();
+         uint64_t tt1 = get_cy();
+ 
+ #define DISPATCH(classHC) \
+         if(auto *res = dynamic_cast<classHC* > (handler.get())) { \
+             pq4_accumulate_loop_qbs( \
+                 qbs, list_size, M2, \
+                 codes.get(), LUT.get(), \
+                 *res \
+             ); \
+         }
+         DISPATCH(HeapHC)
+         else DISPATCH(ReservoirHC)
+         else DISPATCH(SingleResultHC)
+ 
+         // prepare for next loop
+         i0 = i1;
+ 
+         uint64_t tt2 = get_cy();
+         t_copy_pack += tt1 - tt0;
+         t_scan += tt2 - tt1;
+     }
+     TIC;
+ 
+     // labels is in-place for HeapHC
+     handler->to_flat_arrays(
+         distances, labels,
+         skip & 16 ? nullptr : normalizers.get()
+     );
+ 
+     TIC;
+ 
+     // these stats are not thread-safe
+ 
+     for(int i = 1; i < ti; i++) {
+         IVFFastScan_stats.times[i] += times[i] - times[i-1];
+     }
+     IVFFastScan_stats.t_copy_pack += t_copy_pack;
+     IVFFastScan_stats.t_scan += t_scan;
+ 
+     if (auto *rh = dynamic_cast<ReservoirHC*> (handler.get())) {
+         for (int i = 0; i < 4; i++) {
+             IVFFastScan_stats.reservoir_times[i] += rh->times[i];
+         }
+     }
+ 
+     *ndis_out = ndis;
+     *nlist_out = nlist;
+ 
+ }
+ 
+ 
+ IVFFastScanStats IVFFastScan_stats;
+ 
+ } // namespace faiss