faiss 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +103 -3
  4. data/ext/faiss/ext.cpp +99 -32
  5. data/ext/faiss/extconf.rb +12 -2
  6. data/lib/faiss/ext.bundle +0 -0
  7. data/lib/faiss/index.rb +3 -3
  8. data/lib/faiss/index_binary.rb +3 -3
  9. data/lib/faiss/kmeans.rb +1 -1
  10. data/lib/faiss/pca_matrix.rb +2 -2
  11. data/lib/faiss/product_quantizer.rb +3 -3
  12. data/lib/faiss/version.rb +1 -1
  13. data/vendor/faiss/AutoTune.cpp +719 -0
  14. data/vendor/faiss/AutoTune.h +212 -0
  15. data/vendor/faiss/Clustering.cpp +261 -0
  16. data/vendor/faiss/Clustering.h +101 -0
  17. data/vendor/faiss/IVFlib.cpp +339 -0
  18. data/vendor/faiss/IVFlib.h +132 -0
  19. data/vendor/faiss/Index.cpp +171 -0
  20. data/vendor/faiss/Index.h +261 -0
  21. data/vendor/faiss/Index2Layer.cpp +437 -0
  22. data/vendor/faiss/Index2Layer.h +85 -0
  23. data/vendor/faiss/IndexBinary.cpp +77 -0
  24. data/vendor/faiss/IndexBinary.h +163 -0
  25. data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
  26. data/vendor/faiss/IndexBinaryFlat.h +54 -0
  27. data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
  28. data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
  29. data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
  30. data/vendor/faiss/IndexBinaryHNSW.h +56 -0
  31. data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
  32. data/vendor/faiss/IndexBinaryIVF.h +211 -0
  33. data/vendor/faiss/IndexFlat.cpp +508 -0
  34. data/vendor/faiss/IndexFlat.h +175 -0
  35. data/vendor/faiss/IndexHNSW.cpp +1090 -0
  36. data/vendor/faiss/IndexHNSW.h +170 -0
  37. data/vendor/faiss/IndexIVF.cpp +909 -0
  38. data/vendor/faiss/IndexIVF.h +353 -0
  39. data/vendor/faiss/IndexIVFFlat.cpp +502 -0
  40. data/vendor/faiss/IndexIVFFlat.h +118 -0
  41. data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
  42. data/vendor/faiss/IndexIVFPQ.h +161 -0
  43. data/vendor/faiss/IndexIVFPQR.cpp +219 -0
  44. data/vendor/faiss/IndexIVFPQR.h +65 -0
  45. data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
  46. data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
  47. data/vendor/faiss/IndexLSH.cpp +225 -0
  48. data/vendor/faiss/IndexLSH.h +87 -0
  49. data/vendor/faiss/IndexLattice.cpp +143 -0
  50. data/vendor/faiss/IndexLattice.h +68 -0
  51. data/vendor/faiss/IndexPQ.cpp +1188 -0
  52. data/vendor/faiss/IndexPQ.h +199 -0
  53. data/vendor/faiss/IndexPreTransform.cpp +288 -0
  54. data/vendor/faiss/IndexPreTransform.h +91 -0
  55. data/vendor/faiss/IndexReplicas.cpp +123 -0
  56. data/vendor/faiss/IndexReplicas.h +76 -0
  57. data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
  58. data/vendor/faiss/IndexScalarQuantizer.h +127 -0
  59. data/vendor/faiss/IndexShards.cpp +317 -0
  60. data/vendor/faiss/IndexShards.h +100 -0
  61. data/vendor/faiss/InvertedLists.cpp +623 -0
  62. data/vendor/faiss/InvertedLists.h +334 -0
  63. data/vendor/faiss/LICENSE +21 -0
  64. data/vendor/faiss/MatrixStats.cpp +252 -0
  65. data/vendor/faiss/MatrixStats.h +62 -0
  66. data/vendor/faiss/MetaIndexes.cpp +351 -0
  67. data/vendor/faiss/MetaIndexes.h +126 -0
  68. data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
  69. data/vendor/faiss/OnDiskInvertedLists.h +127 -0
  70. data/vendor/faiss/VectorTransform.cpp +1157 -0
  71. data/vendor/faiss/VectorTransform.h +322 -0
  72. data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
  73. data/vendor/faiss/c_api/AutoTune_c.h +64 -0
  74. data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
  75. data/vendor/faiss/c_api/Clustering_c.h +117 -0
  76. data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
  77. data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
  78. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
  79. data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
  80. data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
  81. data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
  82. data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
  83. data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
  84. data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
  85. data/vendor/faiss/c_api/IndexShards_c.h +42 -0
  86. data/vendor/faiss/c_api/Index_c.cpp +105 -0
  87. data/vendor/faiss/c_api/Index_c.h +183 -0
  88. data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
  89. data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
  90. data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
  91. data/vendor/faiss/c_api/clone_index_c.h +32 -0
  92. data/vendor/faiss/c_api/error_c.h +42 -0
  93. data/vendor/faiss/c_api/error_impl.cpp +27 -0
  94. data/vendor/faiss/c_api/error_impl.h +16 -0
  95. data/vendor/faiss/c_api/faiss_c.h +58 -0
  96. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
  97. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
  98. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
  99. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
  100. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
  101. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
  102. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
  103. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
  104. data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
  105. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
  106. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
  107. data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
  108. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
  109. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
  110. data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
  111. data/vendor/faiss/c_api/index_factory_c.h +30 -0
  112. data/vendor/faiss/c_api/index_io_c.cpp +42 -0
  113. data/vendor/faiss/c_api/index_io_c.h +50 -0
  114. data/vendor/faiss/c_api/macros_impl.h +110 -0
  115. data/vendor/faiss/clone_index.cpp +147 -0
  116. data/vendor/faiss/clone_index.h +38 -0
  117. data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
  118. data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
  119. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
  120. data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
  121. data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
  122. data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
  123. data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
  124. data/vendor/faiss/gpu/GpuCloner.h +82 -0
  125. data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
  126. data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
  127. data/vendor/faiss/gpu/GpuDistance.h +52 -0
  128. data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
  129. data/vendor/faiss/gpu/GpuIndex.h +148 -0
  130. data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
  131. data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
  132. data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
  133. data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
  134. data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
  135. data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
  136. data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
  137. data/vendor/faiss/gpu/GpuResources.cpp +52 -0
  138. data/vendor/faiss/gpu/GpuResources.h +73 -0
  139. data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
  140. data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
  141. data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
  142. data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
  143. data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
  144. data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
  145. data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
  146. data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
  147. data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
  148. data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
  149. data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
  150. data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
  151. data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
  152. data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
  153. data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
  154. data/vendor/faiss/gpu/test/TestUtils.h +93 -0
  155. data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
  156. data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
  157. data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
  158. data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
  159. data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
  160. data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
  161. data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
  162. data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
  163. data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
  164. data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
  165. data/vendor/faiss/gpu/utils/Timer.h +52 -0
  166. data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
  167. data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
  168. data/vendor/faiss/impl/FaissAssert.h +95 -0
  169. data/vendor/faiss/impl/FaissException.cpp +66 -0
  170. data/vendor/faiss/impl/FaissException.h +71 -0
  171. data/vendor/faiss/impl/HNSW.cpp +818 -0
  172. data/vendor/faiss/impl/HNSW.h +275 -0
  173. data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
  174. data/vendor/faiss/impl/PolysemousTraining.h +158 -0
  175. data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
  176. data/vendor/faiss/impl/ProductQuantizer.h +242 -0
  177. data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
  178. data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
  179. data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
  180. data/vendor/faiss/impl/ThreadedIndex.h +80 -0
  181. data/vendor/faiss/impl/index_read.cpp +793 -0
  182. data/vendor/faiss/impl/index_write.cpp +558 -0
  183. data/vendor/faiss/impl/io.cpp +142 -0
  184. data/vendor/faiss/impl/io.h +98 -0
  185. data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
  186. data/vendor/faiss/impl/lattice_Zn.h +199 -0
  187. data/vendor/faiss/index_factory.cpp +392 -0
  188. data/vendor/faiss/index_factory.h +25 -0
  189. data/vendor/faiss/index_io.h +75 -0
  190. data/vendor/faiss/misc/test_blas.cpp +84 -0
  191. data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
  192. data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
  193. data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
  194. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
  195. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
  196. data/vendor/faiss/tests/test_merge.cpp +258 -0
  197. data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
  198. data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
  199. data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
  200. data/vendor/faiss/tests/test_params_override.cpp +231 -0
  201. data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
  202. data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
  203. data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
  204. data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
  205. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
  206. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
  207. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
  208. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
  209. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
  210. data/vendor/faiss/utils/Heap.cpp +122 -0
  211. data/vendor/faiss/utils/Heap.h +495 -0
  212. data/vendor/faiss/utils/WorkerThread.cpp +126 -0
  213. data/vendor/faiss/utils/WorkerThread.h +61 -0
  214. data/vendor/faiss/utils/distances.cpp +765 -0
  215. data/vendor/faiss/utils/distances.h +243 -0
  216. data/vendor/faiss/utils/distances_simd.cpp +809 -0
  217. data/vendor/faiss/utils/extra_distances.cpp +336 -0
  218. data/vendor/faiss/utils/extra_distances.h +54 -0
  219. data/vendor/faiss/utils/hamming-inl.h +472 -0
  220. data/vendor/faiss/utils/hamming.cpp +792 -0
  221. data/vendor/faiss/utils/hamming.h +220 -0
  222. data/vendor/faiss/utils/random.cpp +192 -0
  223. data/vendor/faiss/utils/random.h +60 -0
  224. data/vendor/faiss/utils/utils.cpp +783 -0
  225. data/vendor/faiss/utils/utils.h +181 -0
  226. metadata +216 -2
@@ -0,0 +1,161 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #ifndef FAISS_INDEX_IVFPQ_H
11
+ #define FAISS_INDEX_IVFPQ_H
12
+
13
+
14
+ #include <vector>
15
+
16
+ #include <faiss/IndexIVF.h>
17
+ #include <faiss/IndexPQ.h>
18
+
19
+
20
+ namespace faiss {
21
+
22
+ struct IVFPQSearchParameters: IVFSearchParameters {
23
+ size_t scan_table_threshold; ///< use table computation or on-the-fly?
24
+ int polysemous_ht; ///< Hamming thresh for polysemous filtering
25
+ ~IVFPQSearchParameters () {}
26
+ };
27
+
28
+
29
+ /** Inverted file with Product Quantizer encoding. Each residual
30
+ * vector is encoded as a product quantizer code.
31
+ */
32
+ struct IndexIVFPQ: IndexIVF {
33
+ bool by_residual; ///< Encode residual or plain vector?
34
+
35
+ ProductQuantizer pq; ///< produces the codes
36
+
37
+ bool do_polysemous_training; ///< reorder PQ centroids after training?
38
+ PolysemousTraining *polysemous_training; ///< if NULL, use default
39
+
40
+ // search-time parameters
41
+ size_t scan_table_threshold; ///< use table computation or on-the-fly?
42
+ int polysemous_ht; ///< Hamming thresh for polysemous filtering
43
+
44
+ /** Precompute table that speed up query preprocessing at some
45
+ * memory cost
46
+ * =-1: force disable
47
+ * =0: decide heuristically (default: use tables only if they are
48
+ * < precomputed_tables_max_bytes)
49
+ * =1: tables that work for all quantizers (size 256 * nlist * M)
50
+ * =2: specific version for MultiIndexQuantizer (much more compact)
51
+ */
52
+ int use_precomputed_table; ///< if by_residual, build precompute tables
53
+ static size_t precomputed_table_max_bytes;
54
+
55
+ /// if use_precompute_table
56
+ /// size nlist * pq.M * pq.ksub
57
+ std::vector <float> precomputed_table;
58
+
59
+ IndexIVFPQ (
60
+ Index * quantizer, size_t d, size_t nlist,
61
+ size_t M, size_t nbits_per_idx);
62
+
63
+ void add_with_ids(idx_t n, const float* x, const idx_t* xids = nullptr)
64
+ override;
65
+
66
+ void encode_vectors(idx_t n, const float* x,
67
+ const idx_t *list_nos,
68
+ uint8_t * codes,
69
+ bool include_listnos = false) const override;
70
+
71
+ void sa_decode (idx_t n, const uint8_t *bytes,
72
+ float *x) const override;
73
+
74
+
75
+ /// same as add_core, also:
76
+ /// - output 2nd level residuals if residuals_2 != NULL
77
+ /// - use precomputed list numbers if precomputed_idx != NULL
78
+ void add_core_o (idx_t n, const float *x,
79
+ const idx_t *xids, float *residuals_2,
80
+ const idx_t *precomputed_idx = nullptr);
81
+
82
+ /// trains the product quantizer
83
+ void train_residual(idx_t n, const float* x) override;
84
+
85
+ /// same as train_residual, also output 2nd level residuals
86
+ void train_residual_o (idx_t n, const float *x, float *residuals_2);
87
+
88
+ void reconstruct_from_offset (int64_t list_no, int64_t offset,
89
+ float* recons) const override;
90
+
91
+ /** Find exact duplicates in the dataset.
92
+ *
93
+ * the duplicates are returned in pre-allocated arrays (see the
94
+ * max sizes).
95
+ *
96
+ * @params lims limits between groups of duplicates
97
+ * (max size ntotal / 2 + 1)
98
+ * @params ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
99
+ * duplicates (max size ntotal)
100
+ * @return n number of groups found
101
+ */
102
+ size_t find_duplicates (idx_t *ids, size_t *lims) const;
103
+
104
+ // map a vector to a binary code knowning the index
105
+ void encode (idx_t key, const float * x, uint8_t * code) const;
106
+
107
+ /** Encode multiple vectors
108
+ *
109
+ * @param n nb vectors to encode
110
+ * @param keys posting list ids for those vectors (size n)
111
+ * @param x vectors (size n * d)
112
+ * @param codes output codes (size n * code_size)
113
+ * @param compute_keys if false, assume keys are precomputed,
114
+ * otherwise compute them
115
+ */
116
+ void encode_multiple (size_t n, idx_t *keys,
117
+ const float * x, uint8_t * codes,
118
+ bool compute_keys = false) const;
119
+
120
+ /// inverse of encode_multiple
121
+ void decode_multiple (size_t n, const idx_t *keys,
122
+ const uint8_t * xcodes, float * x) const;
123
+
124
+ InvertedListScanner *get_InvertedListScanner (bool store_pairs)
125
+ const override;
126
+
127
+ /// build precomputed table
128
+ void precompute_table ();
129
+
130
+ IndexIVFPQ ();
131
+
132
+ };
133
+
134
+
135
/// Counters collected while searching IVFPQ / IVFPQR indexes.
/// They are robust to internal threading, but not if
/// IndexIVFPQ::search_preassigned is called by multiple threads.
struct IndexIVFPQStats {
    /// nb of refines (IVFPQR)
    size_t nrefine;

    /// nb of passed Hamming distance tests (for polysemous)
    size_t n_hamming_pass;

    // timings measured with the CPU RTC, on all threads
    size_t search_cycles;
    /// only for IVFPQR
    size_t refine_cycles;

    /// A fresh object starts from whatever state reset() establishes.
    IndexIVFPQStats () {reset (); }
    void reset ();
};

/// global var that collects them all
extern IndexIVFPQStats indexIVFPQ_stats;
154
+
155
+
156
+
157
+
158
+ } // namespace faiss
159
+
160
+
161
+ #endif
@@ -0,0 +1,219 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #include <faiss/IndexIVFPQR.h>
11
+
12
+ #include <faiss/utils/Heap.h>
13
+ #include <faiss/utils/utils.h>
14
+ #include <faiss/utils/distances.h>
15
+
16
+ #include <faiss/impl/FaissAssert.h>
17
+
18
+
19
+ namespace faiss {
20
+
21
+ /*****************************************
22
+ * IndexIVFPQR implementation
23
+ ******************************************/
24
+
25
+ IndexIVFPQR::IndexIVFPQR (
26
+ Index * quantizer, size_t d, size_t nlist,
27
+ size_t M, size_t nbits_per_idx,
28
+ size_t M_refine, size_t nbits_per_idx_refine):
29
+ IndexIVFPQ (quantizer, d, nlist, M, nbits_per_idx),
30
+ refine_pq (d, M_refine, nbits_per_idx_refine),
31
+ k_factor (4)
32
+ {
33
+ by_residual = true;
34
+ }
35
+
36
+ IndexIVFPQR::IndexIVFPQR ():
37
+ k_factor (1)
38
+ {
39
+ by_residual = true;
40
+ }
41
+
42
+
43
+
44
+ void IndexIVFPQR::reset()
45
+ {
46
+ IndexIVFPQ::reset();
47
+ refine_codes.clear();
48
+ }
49
+
50
+
51
+
52
+
53
+ void IndexIVFPQR::train_residual (idx_t n, const float *x)
54
+ {
55
+
56
+ float * residual_2 = new float [n * d];
57
+ ScopeDeleter <float> del(residual_2);
58
+
59
+ train_residual_o (n, x, residual_2);
60
+
61
+ if (verbose)
62
+ printf ("training %zdx%zd 2nd level PQ quantizer on %ld %dD-vectors\n",
63
+ refine_pq.M, refine_pq.ksub, n, d);
64
+
65
+ refine_pq.cp.max_points_per_centroid = 1000;
66
+ refine_pq.cp.verbose = verbose;
67
+
68
+ refine_pq.train (n, residual_2);
69
+
70
+ }
71
+
72
+
73
+ void IndexIVFPQR::add_with_ids (idx_t n, const float *x, const idx_t *xids) {
74
+ add_core (n, x, xids, nullptr);
75
+ }
76
+
77
+ void IndexIVFPQR::add_core (idx_t n, const float *x, const idx_t *xids,
78
+ const idx_t *precomputed_idx) {
79
+
80
+ float * residual_2 = new float [n * d];
81
+ ScopeDeleter <float> del(residual_2);
82
+
83
+ idx_t n0 = ntotal;
84
+
85
+ add_core_o (n, x, xids, residual_2, precomputed_idx);
86
+
87
+ refine_codes.resize (ntotal * refine_pq.code_size);
88
+
89
+ refine_pq.compute_codes (
90
+ residual_2, &refine_codes[n0 * refine_pq.code_size], n);
91
+
92
+
93
+ }
94
+ #define TIC t0 = get_cycles()
95
+ #define TOC get_cycles () - t0
96
+
97
+
98
+ void IndexIVFPQR::search_preassigned (idx_t n, const float *x, idx_t k,
99
+ const idx_t *idx,
100
+ const float *L1_dis,
101
+ float *distances, idx_t *labels,
102
+ bool store_pairs,
103
+ const IVFSearchParameters *params
104
+ ) const
105
+ {
106
+ uint64_t t0;
107
+ TIC;
108
+ size_t k_coarse = long(k * k_factor);
109
+ idx_t *coarse_labels = new idx_t [k_coarse * n];
110
+ ScopeDeleter<idx_t> del1 (coarse_labels);
111
+ { // query with quantizer levels 1 and 2.
112
+ float *coarse_distances = new float [k_coarse * n];
113
+ ScopeDeleter<float> del(coarse_distances);
114
+
115
+ IndexIVFPQ::search_preassigned (
116
+ n, x, k_coarse,
117
+ idx, L1_dis, coarse_distances, coarse_labels,
118
+ true, params);
119
+ }
120
+
121
+
122
+ indexIVFPQ_stats.search_cycles += TOC;
123
+
124
+ TIC;
125
+
126
+ // 3rd level refinement
127
+ size_t n_refine = 0;
128
+ #pragma omp parallel reduction(+ : n_refine)
129
+ {
130
+ // tmp buffers
131
+ float *residual_1 = new float [2 * d];
132
+ ScopeDeleter<float> del (residual_1);
133
+ float *residual_2 = residual_1 + d;
134
+ #pragma omp for
135
+ for (idx_t i = 0; i < n; i++) {
136
+ const float *xq = x + i * d;
137
+ const idx_t * shortlist = coarse_labels + k_coarse * i;
138
+ float * heap_sim = distances + k * i;
139
+ idx_t * heap_ids = labels + k * i;
140
+ maxheap_heapify (k, heap_sim, heap_ids);
141
+
142
+ for (int j = 0; j < k_coarse; j++) {
143
+ idx_t sl = shortlist[j];
144
+
145
+ if (sl == -1) continue;
146
+
147
+ int list_no = sl >> 32;
148
+ int ofs = sl & 0xffffffff;
149
+
150
+ assert (list_no >= 0 && list_no < nlist);
151
+ assert (ofs >= 0 && ofs < invlists->list_size (list_no));
152
+
153
+ // 1st level residual
154
+ quantizer->compute_residual (xq, residual_1, list_no);
155
+
156
+ // 2nd level residual
157
+ const uint8_t * l2code =
158
+ invlists->get_single_code (list_no, ofs);
159
+
160
+ pq.decode (l2code, residual_2);
161
+ for (int l = 0; l < d; l++)
162
+ residual_2[l] = residual_1[l] - residual_2[l];
163
+
164
+ // 3rd level residual's approximation
165
+ idx_t id = invlists->get_single_id (list_no, ofs);
166
+ assert (0 <= id && id < ntotal);
167
+ refine_pq.decode (&refine_codes [id * refine_pq.code_size],
168
+ residual_1);
169
+
170
+ float dis = fvec_L2sqr (residual_1, residual_2, d);
171
+
172
+ if (dis < heap_sim[0]) {
173
+ maxheap_pop (k, heap_sim, heap_ids);
174
+ idx_t id_or_pair = store_pairs ? sl : id;
175
+ maxheap_push (k, heap_sim, heap_ids, dis, id_or_pair);
176
+ }
177
+ n_refine ++;
178
+ }
179
+ maxheap_reorder (k, heap_sim, heap_ids);
180
+ }
181
+ }
182
+ indexIVFPQ_stats.nrefine += n_refine;
183
+ indexIVFPQ_stats.refine_cycles += TOC;
184
+ }
185
+
186
+ void IndexIVFPQR::reconstruct_from_offset (int64_t list_no, int64_t offset,
187
+ float* recons) const
188
+ {
189
+ IndexIVFPQ::reconstruct_from_offset (list_no, offset, recons);
190
+
191
+ idx_t id = invlists->get_single_id (list_no, offset);
192
+ assert (0 <= id && id < ntotal);
193
+
194
+ std::vector<float> r3(d);
195
+ refine_pq.decode (&refine_codes [id * refine_pq.code_size], r3.data());
196
+ for (int i = 0; i < d; ++i) {
197
+ recons[i] += r3[i];
198
+ }
199
+ }
200
+
201
+ void IndexIVFPQR::merge_from (IndexIVF &other_in, idx_t add_id)
202
+ {
203
+ IndexIVFPQR *other = dynamic_cast<IndexIVFPQR *> (&other_in);
204
+ FAISS_THROW_IF_NOT(other);
205
+
206
+ IndexIVF::merge_from (other_in, add_id);
207
+
208
+ refine_codes.insert (refine_codes.end(),
209
+ other->refine_codes.begin(),
210
+ other->refine_codes.end());
211
+ other->refine_codes.clear();
212
+ }
213
+
214
+ size_t IndexIVFPQR::remove_ids(const IDSelector& /*sel*/) {
215
+ FAISS_THROW_MSG("not implemented");
216
+ return 0;
217
+ }
218
+
219
+ } // namespace faiss
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #pragma once
11
+
12
+ #include <vector>
13
+
14
+ #include <faiss/IndexIVFPQ.h>
15
+
16
+
17
+ namespace faiss {
18
+
19
+
20
+
21
+ /** Index with an additional level of PQ refinement */
22
+ struct IndexIVFPQR: IndexIVFPQ {
23
+ ProductQuantizer refine_pq; ///< 3rd level quantizer
24
+ std::vector <uint8_t> refine_codes; ///< corresponding codes
25
+
26
+ /// factor between k requested in search and the k requested from the IVFPQ
27
+ float k_factor;
28
+
29
+ IndexIVFPQR (
30
+ Index * quantizer, size_t d, size_t nlist,
31
+ size_t M, size_t nbits_per_idx,
32
+ size_t M_refine, size_t nbits_per_idx_refine);
33
+
34
+ void reset() override;
35
+
36
+ size_t remove_ids(const IDSelector& sel) override;
37
+
38
+ /// trains the two product quantizers
39
+ void train_residual(idx_t n, const float* x) override;
40
+
41
+ void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
42
+
43
+ /// same as add_with_ids, but optionally use the precomputed list ids
44
+ void add_core (idx_t n, const float *x, const idx_t *xids,
45
+ const idx_t *precomputed_idx = nullptr);
46
+
47
+ void reconstruct_from_offset (int64_t list_no, int64_t offset,
48
+ float* recons) const override;
49
+
50
+ void merge_from (IndexIVF &other, idx_t add_id) override;
51
+
52
+
53
+ void search_preassigned (idx_t n, const float *x, idx_t k,
54
+ const idx_t *assign,
55
+ const float *centroid_dis,
56
+ float *distances, idx_t *labels,
57
+ bool store_pairs,
58
+ const IVFSearchParameters *params=nullptr
59
+ ) const override;
60
+
61
+ IndexIVFPQR();
62
+ };
63
+
64
+
65
+ } // namespace faiss
@@ -0,0 +1,331 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+
11
+ #include <faiss/IndexIVFSpectralHash.h>
12
+
13
+ #include <memory>
14
+ #include <algorithm>
15
+ #include <stdint.h>
16
+
17
+ #include <faiss/utils/hamming.h>
18
+ #include <faiss/utils/utils.h>
19
+ #include <faiss/impl/FaissAssert.h>
20
+ #include <faiss/impl/AuxIndexStructures.h>
21
+ #include <faiss/VectorTransform.h>
22
+
23
+ namespace faiss {
24
+
25
+
26
+ IndexIVFSpectralHash::IndexIVFSpectralHash (
27
+ Index * quantizer, size_t d, size_t nlist,
28
+ int nbit, float period):
29
+ IndexIVF (quantizer, d, nlist, (nbit + 7) / 8, METRIC_L2),
30
+ nbit (nbit), period (period), threshold_type (Thresh_global)
31
+ {
32
+ FAISS_THROW_IF_NOT (code_size % 4 == 0);
33
+ RandomRotationMatrix *rr = new RandomRotationMatrix (d, nbit);
34
+ rr->init (1234);
35
+ vt = rr;
36
+ own_fields = true;
37
+ is_trained = false;
38
+ }
39
+
40
+ IndexIVFSpectralHash::IndexIVFSpectralHash():
41
+ IndexIVF(), vt(nullptr), own_fields(false),
42
+ nbit(0), period(0), threshold_type(Thresh_global)
43
+ {}
44
+
45
+ IndexIVFSpectralHash::~IndexIVFSpectralHash ()
46
+ {
47
+ if (own_fields) {
48
+ delete vt;
49
+ }
50
+ }
51
+
52
+ namespace {
53
+
54
+
55
/// Median of the n values starting at x.
///
/// The array is sorted in place as a side effect. For an even n the result
/// is the mean of the two middle values. An empty input yields 0 instead
/// of reading in front of the array.
float median (size_t n, float *x) {
    if (n == 0) {
        return 0.0f;
    }
    std::sort(x, x + n);
    if (n % 2 == 1) {
        return x [n / 2];
    } else {
        return (x [n / 2 - 1] + x [n / 2]) / 2;
    }
}
63
+
64
+ }
65
+
66
+
67
+ void IndexIVFSpectralHash::train_residual (idx_t n, const float *x)
68
+ {
69
+ if (!vt->is_trained) {
70
+ vt->train (n, x);
71
+ }
72
+
73
+ if (threshold_type == Thresh_global) {
74
+ // nothing to do
75
+ return;
76
+ } else if (threshold_type == Thresh_centroid ||
77
+ threshold_type == Thresh_centroid_half) {
78
+ // convert all centroids with vt
79
+ std::vector<float> centroids (nlist * d);
80
+ quantizer->reconstruct_n (0, nlist, centroids.data());
81
+ trained.resize(nlist * nbit);
82
+ vt->apply_noalloc (nlist, centroids.data(), trained.data());
83
+ if (threshold_type == Thresh_centroid_half) {
84
+ for (size_t i = 0; i < nlist * nbit; i++) {
85
+ trained[i] -= 0.25 * period;
86
+ }
87
+ }
88
+ return;
89
+ }
90
+ // otherwise train medians
91
+
92
+ // assign
93
+ std::unique_ptr<idx_t []> idx (new idx_t [n]);
94
+ quantizer->assign (n, x, idx.get());
95
+
96
+ std::vector<size_t> sizes(nlist + 1);
97
+ for (size_t i = 0; i < n; i++) {
98
+ FAISS_THROW_IF_NOT (idx[i] >= 0);
99
+ sizes[idx[i]]++;
100
+ }
101
+
102
+ size_t ofs = 0;
103
+ for (int j = 0; j < nlist; j++) {
104
+ size_t o0 = ofs;
105
+ ofs += sizes[j];
106
+ sizes[j] = o0;
107
+ }
108
+
109
+ // transform
110
+ std::unique_ptr<float []> xt (vt->apply (n, x));
111
+
112
+ // transpose + reorder
113
+ std::unique_ptr<float []> xo (new float[n * nbit]);
114
+
115
+ for (size_t i = 0; i < n; i++) {
116
+ size_t idest = sizes[idx[i]]++;
117
+ for (size_t j = 0; j < nbit; j++) {
118
+ xo[idest + n * j] = xt[i * nbit + j];
119
+ }
120
+ }
121
+
122
+ trained.resize (n * nbit);
123
+ // compute medians
124
+ #pragma omp for
125
+ for (int i = 0; i < nlist; i++) {
126
+ size_t i0 = i == 0 ? 0 : sizes[i - 1];
127
+ size_t i1 = sizes[i];
128
+ for (int j = 0; j < nbit; j++) {
129
+ float *xoi = xo.get() + i0 + n * j;
130
+ if (i0 == i1) { // nothing to train
131
+ trained[i * nbit + j] = 0.0;
132
+ } else if (i1 == i0 + 1) {
133
+ trained[i * nbit + j] = xoi[0];
134
+ } else {
135
+ trained[i * nbit + j] = median(i1 - i0, xoi);
136
+ }
137
+ }
138
+ }
139
+ }
140
+
141
+
142
+ namespace {
143
+
144
/// Turns nbit float components into nbit bits.
///
/// Bit i is the parity of floor((x[i] - c[i]) * freq). The (nbit + 7) / 8
/// output bytes are cleared first; bit i is stored in byte i / 8 at
/// position i % 8.
void binarize_with_freq(size_t nbit, float freq,
                        const float *x, const float *c,
                        uint8_t *codes)
{
    const size_t nbytes = (nbit + 7) / 8;
    memset (codes, 0, nbytes);

    for (size_t pos = 0; pos < nbit; ++pos) {
        const float centered = x[pos] - c[pos];
        const int cell = int(floor(centered * freq));
        const int parity = cell & 1;
        codes[pos / 8] |= parity << (pos % 8);
    }
}
156
+
157
+
158
+ };
159
+
160
+
161
+
162
+ void IndexIVFSpectralHash::encode_vectors(idx_t n, const float* x_in,
163
+ const idx_t *list_nos,
164
+ uint8_t * codes,
165
+ bool include_listnos) const
166
+ {
167
+ FAISS_THROW_IF_NOT (is_trained);
168
+ float freq = 2.0 / period;
169
+
170
+ FAISS_THROW_IF_NOT_MSG (!include_listnos, "listnos encoding not supported");
171
+
172
+ // transform with vt
173
+ std::unique_ptr<float []> x (vt->apply (n, x_in));
174
+
175
+ #pragma omp parallel
176
+ {
177
+ std::vector<float> zero (nbit);
178
+
179
+ // each thread takes care of a subset of lists
180
+ #pragma omp for
181
+ for (size_t i = 0; i < n; i++) {
182
+ int64_t list_no = list_nos [i];
183
+
184
+ if (list_no >= 0) {
185
+ const float *c;
186
+ if (threshold_type == Thresh_global) {
187
+ c = zero.data();
188
+ } else {
189
+ c = trained.data() + list_no * nbit;
190
+ }
191
+ binarize_with_freq (nbit, freq,
192
+ x.get() + i * nbit, c,
193
+ codes + i * code_size) ;
194
+ }
195
+ }
196
+ }
197
+ }
198
+
199
+ namespace {
200
+
201
+
202
+ template<class HammingComputer>
203
+ struct IVFScanner: InvertedListScanner {
204
+
205
+ // copied from index structure
206
+ const IndexIVFSpectralHash *index;
207
+ size_t code_size;
208
+ size_t nbit;
209
+ bool store_pairs;
210
+
211
+ float period, freq;
212
+ std::vector<float> q;
213
+ std::vector<float> zero;
214
+ std::vector<uint8_t> qcode;
215
+ HammingComputer hc;
216
+
217
+ using idx_t = Index::idx_t;
218
+
219
+ IVFScanner (const IndexIVFSpectralHash * index,
220
+ bool store_pairs):
221
+ index (index),
222
+ code_size(index->code_size),
223
+ nbit(index->nbit),
224
+ store_pairs(store_pairs),
225
+ period(index->period), freq(2.0 / index->period),
226
+ q(nbit), zero(nbit), qcode(code_size),
227
+ hc(qcode.data(), code_size)
228
+ {
229
+ }
230
+
231
+
232
+ void set_query (const float *query) override {
233
+ FAISS_THROW_IF_NOT(query);
234
+ FAISS_THROW_IF_NOT(q.size() == nbit);
235
+ index->vt->apply_noalloc (1, query, q.data());
236
+
237
+ if (index->threshold_type ==
238
+ IndexIVFSpectralHash::Thresh_global) {
239
+ binarize_with_freq
240
+ (nbit, freq, q.data(), zero.data(), qcode.data());
241
+ hc.set (qcode.data(), code_size);
242
+ }
243
+ }
244
+
245
+ idx_t list_no;
246
+
247
+ void set_list (idx_t list_no, float /*coarse_dis*/) override {
248
+ this->list_no = list_no;
249
+ if (index->threshold_type != IndexIVFSpectralHash::Thresh_global) {
250
+ const float *c = index->trained.data() + list_no * nbit;
251
+ binarize_with_freq (nbit, freq, q.data(), c, qcode.data());
252
+ hc.set (qcode.data(), code_size);
253
+ }
254
+ }
255
+
256
+ float distance_to_code (const uint8_t *code) const final {
257
+ return hc.hamming (code);
258
+ }
259
+
260
+ size_t scan_codes (size_t list_size,
261
+ const uint8_t *codes,
262
+ const idx_t *ids,
263
+ float *simi, idx_t *idxi,
264
+ size_t k) const override
265
+ {
266
+ size_t nup = 0;
267
+ for (size_t j = 0; j < list_size; j++) {
268
+
269
+ float dis = hc.hamming (codes);
270
+
271
+ if (dis < simi [0]) {
272
+ maxheap_pop (k, simi, idxi);
273
+ int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
274
+ maxheap_push (k, simi, idxi, dis, id);
275
+ nup++;
276
+ }
277
+ codes += code_size;
278
+ }
279
+ return nup;
280
+ }
281
+
282
+ void scan_codes_range (size_t list_size,
283
+ const uint8_t *codes,
284
+ const idx_t *ids,
285
+ float radius,
286
+ RangeQueryResult & res) const override
287
+ {
288
+ for (size_t j = 0; j < list_size; j++) {
289
+ float dis = hc.hamming (codes);
290
+ if (dis < radius) {
291
+ int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
292
+ res.add (dis, id);
293
+ }
294
+ codes += code_size;
295
+ }
296
+ }
297
+
298
+
299
+ };
300
+
301
+ } // anonymous namespace
302
+
303
+ InvertedListScanner* IndexIVFSpectralHash::get_InvertedListScanner
304
+ (bool store_pairs) const
305
+ {
306
+ switch (code_size) {
307
+ #define HANDLE_CODE_SIZE(cs) \
308
+ case cs: \
309
+ return new IVFScanner<HammingComputer ## cs> (this, store_pairs)
310
+ HANDLE_CODE_SIZE(4);
311
+ HANDLE_CODE_SIZE(8);
312
+ HANDLE_CODE_SIZE(16);
313
+ HANDLE_CODE_SIZE(20);
314
+ HANDLE_CODE_SIZE(32);
315
+ HANDLE_CODE_SIZE(64);
316
+ #undef HANDLE_CODE_SIZE
317
+ default:
318
+ if (code_size % 8 == 0) {
319
+ return new IVFScanner<HammingComputerM8>(this, store_pairs);
320
+ } else if (code_size % 4 == 0) {
321
+ return new IVFScanner<HammingComputerM4>(this, store_pairs);
322
+ } else {
323
+ FAISS_THROW_MSG("not supported");
324
+ }
325
+ }
326
+
327
+ }
328
+
329
+
330
+
331
+ } // namespace faiss