faiss 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +103 -3
  4. data/ext/faiss/ext.cpp +99 -32
  5. data/ext/faiss/extconf.rb +12 -2
  6. data/lib/faiss/ext.bundle +0 -0
  7. data/lib/faiss/index.rb +3 -3
  8. data/lib/faiss/index_binary.rb +3 -3
  9. data/lib/faiss/kmeans.rb +1 -1
  10. data/lib/faiss/pca_matrix.rb +2 -2
  11. data/lib/faiss/product_quantizer.rb +3 -3
  12. data/lib/faiss/version.rb +1 -1
  13. data/vendor/faiss/AutoTune.cpp +719 -0
  14. data/vendor/faiss/AutoTune.h +212 -0
  15. data/vendor/faiss/Clustering.cpp +261 -0
  16. data/vendor/faiss/Clustering.h +101 -0
  17. data/vendor/faiss/IVFlib.cpp +339 -0
  18. data/vendor/faiss/IVFlib.h +132 -0
  19. data/vendor/faiss/Index.cpp +171 -0
  20. data/vendor/faiss/Index.h +261 -0
  21. data/vendor/faiss/Index2Layer.cpp +437 -0
  22. data/vendor/faiss/Index2Layer.h +85 -0
  23. data/vendor/faiss/IndexBinary.cpp +77 -0
  24. data/vendor/faiss/IndexBinary.h +163 -0
  25. data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
  26. data/vendor/faiss/IndexBinaryFlat.h +54 -0
  27. data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
  28. data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
  29. data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
  30. data/vendor/faiss/IndexBinaryHNSW.h +56 -0
  31. data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
  32. data/vendor/faiss/IndexBinaryIVF.h +211 -0
  33. data/vendor/faiss/IndexFlat.cpp +508 -0
  34. data/vendor/faiss/IndexFlat.h +175 -0
  35. data/vendor/faiss/IndexHNSW.cpp +1090 -0
  36. data/vendor/faiss/IndexHNSW.h +170 -0
  37. data/vendor/faiss/IndexIVF.cpp +909 -0
  38. data/vendor/faiss/IndexIVF.h +353 -0
  39. data/vendor/faiss/IndexIVFFlat.cpp +502 -0
  40. data/vendor/faiss/IndexIVFFlat.h +118 -0
  41. data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
  42. data/vendor/faiss/IndexIVFPQ.h +161 -0
  43. data/vendor/faiss/IndexIVFPQR.cpp +219 -0
  44. data/vendor/faiss/IndexIVFPQR.h +65 -0
  45. data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
  46. data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
  47. data/vendor/faiss/IndexLSH.cpp +225 -0
  48. data/vendor/faiss/IndexLSH.h +87 -0
  49. data/vendor/faiss/IndexLattice.cpp +143 -0
  50. data/vendor/faiss/IndexLattice.h +68 -0
  51. data/vendor/faiss/IndexPQ.cpp +1188 -0
  52. data/vendor/faiss/IndexPQ.h +199 -0
  53. data/vendor/faiss/IndexPreTransform.cpp +288 -0
  54. data/vendor/faiss/IndexPreTransform.h +91 -0
  55. data/vendor/faiss/IndexReplicas.cpp +123 -0
  56. data/vendor/faiss/IndexReplicas.h +76 -0
  57. data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
  58. data/vendor/faiss/IndexScalarQuantizer.h +127 -0
  59. data/vendor/faiss/IndexShards.cpp +317 -0
  60. data/vendor/faiss/IndexShards.h +100 -0
  61. data/vendor/faiss/InvertedLists.cpp +623 -0
  62. data/vendor/faiss/InvertedLists.h +334 -0
  63. data/vendor/faiss/LICENSE +21 -0
  64. data/vendor/faiss/MatrixStats.cpp +252 -0
  65. data/vendor/faiss/MatrixStats.h +62 -0
  66. data/vendor/faiss/MetaIndexes.cpp +351 -0
  67. data/vendor/faiss/MetaIndexes.h +126 -0
  68. data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
  69. data/vendor/faiss/OnDiskInvertedLists.h +127 -0
  70. data/vendor/faiss/VectorTransform.cpp +1157 -0
  71. data/vendor/faiss/VectorTransform.h +322 -0
  72. data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
  73. data/vendor/faiss/c_api/AutoTune_c.h +64 -0
  74. data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
  75. data/vendor/faiss/c_api/Clustering_c.h +117 -0
  76. data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
  77. data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
  78. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
  79. data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
  80. data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
  81. data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
  82. data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
  83. data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
  84. data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
  85. data/vendor/faiss/c_api/IndexShards_c.h +42 -0
  86. data/vendor/faiss/c_api/Index_c.cpp +105 -0
  87. data/vendor/faiss/c_api/Index_c.h +183 -0
  88. data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
  89. data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
  90. data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
  91. data/vendor/faiss/c_api/clone_index_c.h +32 -0
  92. data/vendor/faiss/c_api/error_c.h +42 -0
  93. data/vendor/faiss/c_api/error_impl.cpp +27 -0
  94. data/vendor/faiss/c_api/error_impl.h +16 -0
  95. data/vendor/faiss/c_api/faiss_c.h +58 -0
  96. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
  97. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
  98. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
  99. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
  100. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
  101. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
  102. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
  103. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
  104. data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
  105. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
  106. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
  107. data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
  108. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
  109. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
  110. data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
  111. data/vendor/faiss/c_api/index_factory_c.h +30 -0
  112. data/vendor/faiss/c_api/index_io_c.cpp +42 -0
  113. data/vendor/faiss/c_api/index_io_c.h +50 -0
  114. data/vendor/faiss/c_api/macros_impl.h +110 -0
  115. data/vendor/faiss/clone_index.cpp +147 -0
  116. data/vendor/faiss/clone_index.h +38 -0
  117. data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
  118. data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
  119. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
  120. data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
  121. data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
  122. data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
  123. data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
  124. data/vendor/faiss/gpu/GpuCloner.h +82 -0
  125. data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
  126. data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
  127. data/vendor/faiss/gpu/GpuDistance.h +52 -0
  128. data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
  129. data/vendor/faiss/gpu/GpuIndex.h +148 -0
  130. data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
  131. data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
  132. data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
  133. data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
  134. data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
  135. data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
  136. data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
  137. data/vendor/faiss/gpu/GpuResources.cpp +52 -0
  138. data/vendor/faiss/gpu/GpuResources.h +73 -0
  139. data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
  140. data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
  141. data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
  142. data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
  143. data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
  144. data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
  145. data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
  146. data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
  147. data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
  148. data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
  149. data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
  150. data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
  151. data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
  152. data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
  153. data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
  154. data/vendor/faiss/gpu/test/TestUtils.h +93 -0
  155. data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
  156. data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
  157. data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
  158. data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
  159. data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
  160. data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
  161. data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
  162. data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
  163. data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
  164. data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
  165. data/vendor/faiss/gpu/utils/Timer.h +52 -0
  166. data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
  167. data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
  168. data/vendor/faiss/impl/FaissAssert.h +95 -0
  169. data/vendor/faiss/impl/FaissException.cpp +66 -0
  170. data/vendor/faiss/impl/FaissException.h +71 -0
  171. data/vendor/faiss/impl/HNSW.cpp +818 -0
  172. data/vendor/faiss/impl/HNSW.h +275 -0
  173. data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
  174. data/vendor/faiss/impl/PolysemousTraining.h +158 -0
  175. data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
  176. data/vendor/faiss/impl/ProductQuantizer.h +242 -0
  177. data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
  178. data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
  179. data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
  180. data/vendor/faiss/impl/ThreadedIndex.h +80 -0
  181. data/vendor/faiss/impl/index_read.cpp +793 -0
  182. data/vendor/faiss/impl/index_write.cpp +558 -0
  183. data/vendor/faiss/impl/io.cpp +142 -0
  184. data/vendor/faiss/impl/io.h +98 -0
  185. data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
  186. data/vendor/faiss/impl/lattice_Zn.h +199 -0
  187. data/vendor/faiss/index_factory.cpp +392 -0
  188. data/vendor/faiss/index_factory.h +25 -0
  189. data/vendor/faiss/index_io.h +75 -0
  190. data/vendor/faiss/misc/test_blas.cpp +84 -0
  191. data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
  192. data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
  193. data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
  194. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
  195. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
  196. data/vendor/faiss/tests/test_merge.cpp +258 -0
  197. data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
  198. data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
  199. data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
  200. data/vendor/faiss/tests/test_params_override.cpp +231 -0
  201. data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
  202. data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
  203. data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
  204. data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
  205. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
  206. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
  207. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
  208. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
  209. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
  210. data/vendor/faiss/utils/Heap.cpp +122 -0
  211. data/vendor/faiss/utils/Heap.h +495 -0
  212. data/vendor/faiss/utils/WorkerThread.cpp +126 -0
  213. data/vendor/faiss/utils/WorkerThread.h +61 -0
  214. data/vendor/faiss/utils/distances.cpp +765 -0
  215. data/vendor/faiss/utils/distances.h +243 -0
  216. data/vendor/faiss/utils/distances_simd.cpp +809 -0
  217. data/vendor/faiss/utils/extra_distances.cpp +336 -0
  218. data/vendor/faiss/utils/extra_distances.h +54 -0
  219. data/vendor/faiss/utils/hamming-inl.h +472 -0
  220. data/vendor/faiss/utils/hamming.cpp +792 -0
  221. data/vendor/faiss/utils/hamming.h +220 -0
  222. data/vendor/faiss/utils/random.cpp +192 -0
  223. data/vendor/faiss/utils/random.h +60 -0
  224. data/vendor/faiss/utils/utils.cpp +783 -0
  225. data/vendor/faiss/utils/utils.h +181 -0
  226. metadata +216 -2
@@ -0,0 +1,161 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #ifndef FAISS_INDEX_IVFPQ_H
11
+ #define FAISS_INDEX_IVFPQ_H
12
+
13
+
14
+ #include <vector>
15
+
16
+ #include <faiss/IndexIVF.h>
17
+ #include <faiss/IndexPQ.h>
18
+
19
+
20
+ namespace faiss {
21
+
22
+ struct IVFPQSearchParameters: IVFSearchParameters { // IVFSearchParameters plus the two IVFPQ search-time fields below
23
+ size_t scan_table_threshold; ///< use table computation or on-the-fly?
24
+ int polysemous_ht; ///< Hamming threshold for polysemous filtering
25
+ ~IVFPQSearchParameters () {} // empty destructor body
26
+ };
27
+
28
+
29
+ /** Inverted file with Product Quantizer encoding. Each residual
30
+ * vector is encoded as a product quantizer code.
31
+ */
32
+ struct IndexIVFPQ: IndexIVF {
33
+ bool by_residual; ///< Encode residual or plain vector?
34
+
35
+ ProductQuantizer pq; ///< produces the codes
36
+
37
+ bool do_polysemous_training; ///< reorder PQ centroids after training?
38
+ PolysemousTraining *polysemous_training; ///< if NULL, use default
39
+
40
+ // search-time parameters
41
+ size_t scan_table_threshold; ///< use table computation or on-the-fly?
42
+ int polysemous_ht; ///< Hamming threshold for polysemous filtering
43
+
44
+ /** Precomputed tables that speed up query preprocessing at some
45
+ * memory cost
46
+ * =-1: force disable
47
+ * =0: decide heuristically (default: use tables only if they are
48
+ * < precomputed_table_max_bytes)
49
+ * =1: tables that work for all quantizers (size 256 * nlist * M)
50
+ * =2: specific version for MultiIndexQuantizer (much more compact)
51
+ */
52
+ int use_precomputed_table; ///< if by_residual, build precompute tables
53
+ static size_t precomputed_table_max_bytes; // static: one value shared by all IndexIVFPQ instances
54
+
55
+ /// if use_precomputed_table
56
+ /// size nlist * pq.M * pq.ksub
57
+ std::vector <float> precomputed_table;
58
+
59
+ IndexIVFPQ (
60
+ Index * quantizer, size_t d, size_t nlist,
61
+ size_t M, size_t nbits_per_idx);
62
+
63
+ void add_with_ids(idx_t n, const float* x, const idx_t* xids = nullptr)
64
+ override;
65
+
66
+ void encode_vectors(idx_t n, const float* x,
67
+ const idx_t *list_nos,
68
+ uint8_t * codes,
69
+ bool include_listnos = false) const override;
70
+
71
+ void sa_decode (idx_t n, const uint8_t *bytes,
72
+ float *x) const override;
73
+
74
+
75
+ /// same as add_core, also:
76
+ /// - output 2nd level residuals if residuals_2 != NULL
77
+ /// - use precomputed list numbers if precomputed_idx != NULL
78
+ void add_core_o (idx_t n, const float *x,
79
+ const idx_t *xids, float *residuals_2,
80
+ const idx_t *precomputed_idx = nullptr);
81
+
82
+ /// trains the product quantizer
83
+ void train_residual(idx_t n, const float* x) override;
84
+
85
+ /// same as train_residual, also output 2nd level residuals
86
+ void train_residual_o (idx_t n, const float *x, float *residuals_2);
87
+
88
+ void reconstruct_from_offset (int64_t list_no, int64_t offset,
89
+ float* recons) const override;
90
+
91
+ /** Find exact duplicates in the dataset.
92
+ *
93
+ * the duplicates are returned in pre-allocated arrays (see the
94
+ * max sizes).
95
+ *
96
+ * @param lims limits between groups of duplicates
97
+ * (max size ntotal / 2 + 1)
98
+ * @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
99
+ * duplicates (max size ntotal)
100
+ * @return n number of groups found
101
+ */
102
+ size_t find_duplicates (idx_t *ids, size_t *lims) const;
103
+
104
+ // map a vector to a binary code knowing the index
105
+ void encode (idx_t key, const float * x, uint8_t * code) const;
106
+
107
+ /** Encode multiple vectors
108
+ *
109
+ * @param n nb vectors to encode
110
+ * @param keys posting list ids for those vectors (size n)
111
+ * @param x vectors (size n * d)
112
+ * @param codes output codes (size n * code_size)
113
+ * @param compute_keys if false, assume keys are precomputed,
114
+ * otherwise compute them
115
+ */
116
+ void encode_multiple (size_t n, idx_t *keys,
117
+ const float * x, uint8_t * codes,
118
+ bool compute_keys = false) const;
119
+
120
+ /// inverse of encode_multiple
121
+ void decode_multiple (size_t n, const idx_t *keys,
122
+ const uint8_t * xcodes, float * x) const;
123
+
124
+ InvertedListScanner *get_InvertedListScanner (bool store_pairs)
125
+ const override;
126
+
127
+ /// build precomputed table
128
+ void precompute_table ();
129
+
130
+ IndexIVFPQ ();
131
+
132
+ };
133
+
134
+
135
+ /// statistics are robust to internal threading, but not if
136
+ /// IndexIVFPQ::search_preassigned is called by multiple threads
137
+ struct IndexIVFPQStats {
138
+ size_t nrefine; // nb of refines (IVFPQR)
139
+
140
+ size_t n_hamming_pass;
141
+ // nb of passed Hamming distance tests (for polysemous)
142
+
143
+ // timings measured with the CPU RTC
144
+ // on all threads
145
+ size_t search_cycles;
146
+ size_t refine_cycles; // only for IVFPQR
147
+
148
+ IndexIVFPQStats () {reset (); } // construction delegates to reset()
149
+ void reset (); // declared here, defined elsewhere
150
+ };
151
+
152
+ // global var that collects them all
153
+ extern IndexIVFPQStats indexIVFPQ_stats;
154
+
155
+
156
+
157
+
158
+ } // namespace faiss
159
+
160
+
161
+ #endif
@@ -0,0 +1,219 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #include <faiss/IndexIVFPQR.h>
11
+
12
+ #include <faiss/utils/Heap.h>
13
+ #include <faiss/utils/utils.h>
14
+ #include <faiss/utils/distances.h>
15
+
16
+ #include <faiss/impl/FaissAssert.h>
17
+
18
+
19
+ namespace faiss {
20
+
21
+ /*****************************************
22
+ * IndexIVFPQR implementation
23
+ ******************************************/
24
+
25
+ IndexIVFPQR::IndexIVFPQR ( // builds the IVFPQ base plus a second PQ (refine_pq) used for refinement
26
+ Index * quantizer, size_t d, size_t nlist,
27
+ size_t M, size_t nbits_per_idx,
28
+ size_t M_refine, size_t nbits_per_idx_refine):
29
+ IndexIVFPQ (quantizer, d, nlist, M, nbits_per_idx),
30
+ refine_pq (d, M_refine, nbits_per_idx_refine), // refinement PQ is constructed with the same dimension d
31
+ k_factor (4) // search_preassigned asks the IVFPQ level for k * k_factor candidates
32
+ {
33
+ by_residual = true; // set unconditionally after the base constructor has run
34
+ }
35
+
36
+ IndexIVFPQR::IndexIVFPQR (): // default constructor: base class and refine_pq are default-constructed
37
+ k_factor (1) // NOTE(review): 1 here vs 4 in the parameterized constructor; confirm this is intended
38
+ {
39
+ by_residual = true;
40
+ }
41
+
42
+
43
+
44
+ void IndexIVFPQR::reset() // resets the base index, then discards all refinement codes
45
+ {
46
+ IndexIVFPQ::reset();
47
+ refine_codes.clear();
48
+ }
49
+
50
+
51
+
52
+
53
+ void IndexIVFPQR::train_residual (idx_t n, const float *x) // trains the base PQ, then refine_pq on the residuals it reports
54
+ {
55
+
56
+ float * residual_2 = new float [n * d]; // one d-dimensional residual per training vector
57
+ ScopeDeleter <float> del(residual_2); // presumably releases the buffer when it goes out of scope
58
+
59
+ train_residual_o (n, x, residual_2); // base training; fills residual_2 (see IndexIVFPQ::train_residual_o)
60
+
61
+ if (verbose)
62
+ printf ("training %zdx%zd 2nd level PQ quantizer on %ld %dD-vectors\n", // NOTE(review): specifiers assume idx_t is long and d is int; confirm
63
+ refine_pq.M, refine_pq.ksub, n, d);
64
+
65
+ refine_pq.cp.max_points_per_centroid = 1000;
66
+ refine_pq.cp.verbose = verbose;
67
+
68
+ refine_pq.train (n, residual_2);
69
+
70
+ }
71
+
72
+
73
+ void IndexIVFPQR::add_with_ids (idx_t n, const float *x, const idx_t *xids) { // forwards to add_core
74
+ add_core (n, x, xids, nullptr); // nullptr: no precomputed list numbers
75
+ }
76
+
77
+ void IndexIVFPQR::add_core (idx_t n, const float *x, const idx_t *xids, // adds vectors to the base index and appends their refinement codes
78
+ const idx_t *precomputed_idx) {
79
+
80
+ float * residual_2 = new float [n * d]; // receives the residuals output by add_core_o
81
+ ScopeDeleter <float> del(residual_2); // presumably releases the buffer when it goes out of scope
82
+
83
+ idx_t n0 = ntotal; // ntotal as it was before this batch
84
+
85
+ add_core_o (n, x, xids, residual_2, precomputed_idx);
86
+
87
+ refine_codes.resize (ntotal * refine_pq.code_size); // NOTE(review): relies on add_core_o having advanced ntotal by n; confirm
88
+
89
+ refine_pq.compute_codes (
90
+ residual_2, &refine_codes[n0 * refine_pq.code_size], n); // new codes are written starting at slot n0
91
+
92
+
93
+ }
94
+ #define TIC t0 = get_cycles() // stores the current cycle count in a local named t0 that the user must declare
95
+ #define TOC get_cycles () - t0 // cycles elapsed since the last TIC; unparenthesized expression
96
+
97
+
98
+ void IndexIVFPQR::search_preassigned (idx_t n, const float *x, idx_t k, // IVFPQ search for a k * k_factor shortlist, then re-ranking with refine_pq
99
+ const idx_t *idx,
100
+ const float *L1_dis,
101
+ float *distances, idx_t *labels,
102
+ bool store_pairs,
103
+ const IVFSearchParameters *params
104
+ ) const
105
+ {
106
+ uint64_t t0; // used by the TIC / TOC macros
107
+ TIC;
108
+ size_t k_coarse = long(k * k_factor); // shortlist size requested from the IVFPQ level
109
+ idx_t *coarse_labels = new idx_t [k_coarse * n];
110
+ ScopeDeleter<idx_t> del1 (coarse_labels);
111
+ { // query with quantizer levels 1 and 2.
112
+ float *coarse_distances = new float [k_coarse * n]; // needed by the call below, not read afterwards
113
+ ScopeDeleter<float> del(coarse_distances);
114
+
115
+ IndexIVFPQ::search_preassigned (
116
+ n, x, k_coarse,
117
+ idx, L1_dis, coarse_distances, coarse_labels,
118
+ true, params); // store_pairs forced to true: labels are decoded below as (list_no << 32 | offset)
119
+ }
120
+
121
+
122
+ indexIVFPQ_stats.search_cycles += TOC;
123
+
124
+ TIC;
125
+
126
+ // 3rd level refinement
127
+ size_t n_refine = 0;
128
+ #pragma omp parallel reduction(+ : n_refine)
129
+ {
130
+ // tmp buffers
131
+ float *residual_1 = new float [2 * d]; // one allocation per thread holding both scratch vectors
132
+ ScopeDeleter<float> del (residual_1);
133
+ float *residual_2 = residual_1 + d; // second half of the same allocation
134
+ #pragma omp for
135
+ for (idx_t i = 0; i < n; i++) {
136
+ const float *xq = x + i * d;
137
+ const idx_t * shortlist = coarse_labels + k_coarse * i;
138
+ float * heap_sim = distances + k * i;
139
+ idx_t * heap_ids = labels + k * i;
140
+ maxheap_heapify (k, heap_sim, heap_ids); // presumably initializes the k-slot result heap for query i
141
+
142
+ for (int j = 0; j < k_coarse; j++) { // NOTE(review): int j is compared against size_t k_coarse
143
+ idx_t sl = shortlist[j];
144
+
145
+ if (sl == -1) continue; // entries equal to -1 are skipped
146
+
147
+ int list_no = sl >> 32; // upper 32 bits of the pair
148
+ int ofs = sl & 0xffffffff; // lower 32 bits of the pair, narrowed to int
149
+
150
+ assert (list_no >= 0 && list_no < nlist);
151
+ assert (ofs >= 0 && ofs < invlists->list_size (list_no));
152
+
153
+ // 1st level residual
154
+ quantizer->compute_residual (xq, residual_1, list_no);
155
+
156
+ // 2nd level residual
157
+ const uint8_t * l2code =
158
+ invlists->get_single_code (list_no, ofs);
159
+
160
+ pq.decode (l2code, residual_2);
161
+ for (int l = 0; l < d; l++)
162
+ residual_2[l] = residual_1[l] - residual_2[l]; // 1st level residual minus the decoded PQ code
163
+
164
+ // 3rd level residual's approximation
165
+ idx_t id = invlists->get_single_id (list_no, ofs);
166
+ assert (0 <= id && id < ntotal); // id is used directly as a slot index into refine_codes
167
+ refine_pq.decode (&refine_codes [id * refine_pq.code_size],
168
+ residual_1); // overwrites residual_1, which is no longer needed
169
+
170
+ float dis = fvec_L2sqr (residual_1, residual_2, d);
171
+
172
+ if (dis < heap_sim[0]) { // presumably heap_sim[0] is the largest distance currently kept
173
+ maxheap_pop (k, heap_sim, heap_ids);
174
+ idx_t id_or_pair = store_pairs ? sl : id; // honor the caller's store_pairs choice in the output
175
+ maxheap_push (k, heap_sim, heap_ids, dis, id_or_pair);
176
+ }
177
+ n_refine ++;
178
+ }
179
+ maxheap_reorder (k, heap_sim, heap_ids);
180
+ }
181
+ }
182
+ indexIVFPQ_stats.nrefine += n_refine; // global stats are updated after the parallel region has ended
183
+ indexIVFPQ_stats.refine_cycles += TOC;
184
+ }
185
+
186
+ void IndexIVFPQR::reconstruct_from_offset (int64_t list_no, int64_t offset, // base reconstruction plus the decoded refinement code
187
+ float* recons) const
188
+ {
189
+ IndexIVFPQ::reconstruct_from_offset (list_no, offset, recons);
190
+
191
+ idx_t id = invlists->get_single_id (list_no, offset);
192
+ assert (0 <= id && id < ntotal); // id is used directly as a slot index into refine_codes
193
+
194
+ std::vector<float> r3(d); // scratch for the decoded refinement vector
195
+ refine_pq.decode (&refine_codes [id * refine_pq.code_size], r3.data());
196
+ for (int i = 0; i < d; ++i) {
197
+ recons[i] += r3[i]; // add the refinement on top of the base reconstruction
198
+ }
199
+ }
200
+
201
+ void IndexIVFPQR::merge_from (IndexIVF &other_in, idx_t add_id) // merges another IndexIVFPQR into this one and empties its refine_codes
202
+ {
203
+ IndexIVFPQR *other = dynamic_cast<IndexIVFPQR *> (&other_in);
204
+ FAISS_THROW_IF_NOT(other); // other_in must actually be an IndexIVFPQR
205
+
206
+ IndexIVF::merge_from (other_in, add_id);
207
+
208
+ refine_codes.insert (refine_codes.end(), // NOTE(review): codes are appended regardless of add_id, while lookups index refine_codes by id; confirm callers keep these consistent
209
+ other->refine_codes.begin(),
210
+ other->refine_codes.end());
211
+ other->refine_codes.clear();
212
+ }
213
+
214
+ size_t IndexIVFPQR::remove_ids(const IDSelector& /*sel*/) { // removal is not supported for this index type
215
+ FAISS_THROW_MSG("not implemented");
216
+ return 0; // presumably unreachable once the macro above throws
217
+ }
218
+
219
+ } // namespace faiss
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #pragma once
11
+
12
+ #include <vector>
13
+
14
+ #include <faiss/IndexIVFPQ.h>
15
+
16
+
17
+ namespace faiss {
18
+
19
+
20
+
21
+ /** Index with an additional level of PQ refinement */
22
+ struct IndexIVFPQR: IndexIVFPQ {
23
+ ProductQuantizer refine_pq; ///< 3rd level quantizer
24
+ std::vector <uint8_t> refine_codes; ///< corresponding codes
25
+
26
+ /// factor between k requested in search and the k requested from the IVFPQ
27
+ float k_factor;
28
+
29
+ IndexIVFPQR (
30
+ Index * quantizer, size_t d, size_t nlist,
31
+ size_t M, size_t nbits_per_idx,
32
+ size_t M_refine, size_t nbits_per_idx_refine);
33
+
34
+ void reset() override;
35
+
36
+ size_t remove_ids(const IDSelector& sel) override;
37
+
38
+ /// trains the two product quantizers
39
+ void train_residual(idx_t n, const float* x) override;
40
+
41
+ void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
42
+
43
+ /// same as add_with_ids, but optionally use the precomputed list ids
44
+ void add_core (idx_t n, const float *x, const idx_t *xids,
45
+ const idx_t *precomputed_idx = nullptr);
46
+
47
+ void reconstruct_from_offset (int64_t list_no, int64_t offset,
48
+ float* recons) const override;
49
+
50
+ void merge_from (IndexIVF &other, idx_t add_id) override;
51
+
52
+
53
+ void search_preassigned (idx_t n, const float *x, idx_t k,
54
+ const idx_t *assign,
55
+ const float *centroid_dis,
56
+ float *distances, idx_t *labels,
57
+ bool store_pairs,
58
+ const IVFSearchParameters *params=nullptr
59
+ ) const override;
60
+
61
+ IndexIVFPQR(); // default constructor, declared last
62
+ };
63
+
64
+
65
+ } // namespace faiss
@@ -0,0 +1,331 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+
11
+ #include <faiss/IndexIVFSpectralHash.h>
12
+
13
+ #include <memory>
14
+ #include <algorithm>
15
+ #include <stdint.h>
16
+
17
+ #include <faiss/utils/hamming.h>
18
+ #include <faiss/utils/utils.h>
19
+ #include <faiss/impl/FaissAssert.h>
20
+ #include <faiss/impl/AuxIndexStructures.h>
21
+ #include <faiss/VectorTransform.h>
22
+
23
+ namespace faiss {
24
+
25
+
26
+ IndexIVFSpectralHash::IndexIVFSpectralHash ( // builds an untrained index that owns a random rotation from d to nbit dimensions
27
+ Index * quantizer, size_t d, size_t nlist,
28
+ int nbit, float period):
29
+ IndexIVF (quantizer, d, nlist, (nbit + 7) / 8, METRIC_L2), // (nbit + 7) / 8: nbit bits rounded up to whole bytes
30
+ nbit (nbit), period (period), threshold_type (Thresh_global)
31
+ {
32
+ FAISS_THROW_IF_NOT (code_size % 4 == 0); // only code sizes that are a multiple of 4 are accepted
33
+ RandomRotationMatrix *rr = new RandomRotationMatrix (d, nbit);
34
+ rr->init (1234); // fixed seed
35
+ vt = rr;
36
+ own_fields = true; // the destructor deletes vt when this flag is set
37
+ is_trained = false;
38
+ }
39
+
40
+ IndexIVFSpectralHash::IndexIVFSpectralHash(): // default constructor: no transform, nothing owned, zeroed parameters
41
+ IndexIVF(), vt(nullptr), own_fields(false),
42
+ nbit(0), period(0), threshold_type(Thresh_global)
43
+ {}
44
+
45
+ IndexIVFSpectralHash::~IndexIVFSpectralHash () // frees the transform only when this object owns it
46
+ {
47
+ if (own_fields) {
48
+ delete vt;
49
+ }
50
+ }
51
+
52
+ namespace {
53
+
54
+
55
+ float median (size_t n, float *x) { // median of x[0..n); sorts x in place as a side effect
56
+ std::sort(x, x + n);
57
+ if (n % 2 == 1) {
58
+ return x [n / 2]; // odd count: the middle element
59
+ } else {
60
+ return (x [n / 2 - 1] + x [n / 2]) / 2; // even count: mean of the two middle elements; NOTE(review): n == 0 would index x[n / 2 - 1] out of range
61
+ }
62
+ }
63
+
64
+ }
65
+
66
+
67
+ void IndexIVFSpectralHash::train_residual (idx_t n, const float *x) // trains vt if needed, then fills `trained` according to threshold_type
68
+ {
69
+ if (!vt->is_trained) {
70
+ vt->train (n, x);
71
+ }
72
+
73
+ if (threshold_type == Thresh_global) {
74
+ // nothing to do
75
+ return;
76
+ } else if (threshold_type == Thresh_centroid ||
77
+ threshold_type == Thresh_centroid_half) {
78
+ // convert all centroids with vt
79
+ std::vector<float> centroids (nlist * d);
80
+ quantizer->reconstruct_n (0, nlist, centroids.data());
81
+ trained.resize(nlist * nbit); // nbit thresholds per inverted list
82
+ vt->apply_noalloc (nlist, centroids.data(), trained.data());
83
+ if (threshold_type == Thresh_centroid_half) {
84
+ for (size_t i = 0; i < nlist * nbit; i++) {
85
+ trained[i] -= 0.25 * period; // shift every threshold down by a quarter period
86
+ }
87
+ }
88
+ return;
89
+ }
90
+ // otherwise train medians
91
+
92
+ // assign
93
+ std::unique_ptr<idx_t []> idx (new idx_t [n]);
94
+ quantizer->assign (n, x, idx.get());
95
+
96
+ std::vector<size_t> sizes(nlist + 1); // first used as per-list counts, then as offsets
97
+ for (size_t i = 0; i < n; i++) {
98
+ FAISS_THROW_IF_NOT (idx[i] >= 0);
99
+ sizes[idx[i]]++;
100
+ }
101
+
102
+ size_t ofs = 0;
103
+ for (int j = 0; j < nlist; j++) { // turn counts into start offsets (running sum excluding the current list)
104
+ size_t o0 = ofs;
105
+ ofs += sizes[j];
106
+ sizes[j] = o0;
107
+ }
108
+
109
+ // transform
110
+ std::unique_ptr<float []> xt (vt->apply (n, x));
111
+
112
+ // transpose + reorder
113
+ std::unique_ptr<float []> xo (new float[n * nbit]);
114
+
115
+ for (size_t i = 0; i < n; i++) {
116
+ size_t idest = sizes[idx[i]]++; // next free slot of the vector's list; afterwards sizes[j] is the end offset of list j
117
+ for (size_t j = 0; j < nbit; j++) {
118
+ xo[idest + n * j] = xt[i * nbit + j]; // component j of all vectors is stored contiguously, grouped by list
119
+ }
120
+ }
121
+
122
+ trained.resize (n * nbit); // NOTE(review): the loop below writes indices below nlist * nbit, but the size here is n * nbit; confirm
123
+ // compute medians
124
+ #pragma omp for
125
+ for (int i = 0; i < nlist; i++) { // NOTE(review): no enclosing omp parallel region is visible in this function
126
+ size_t i0 = i == 0 ? 0 : sizes[i - 1]; // start of list i = end of list i - 1
127
+ size_t i1 = sizes[i]; // end of list i
128
+ for (int j = 0; j < nbit; j++) {
129
+ float *xoi = xo.get() + i0 + n * j; // component j of the vectors assigned to list i
130
+ if (i0 == i1) { // nothing to train
131
+ trained[i * nbit + j] = 0.0;
132
+ } else if (i1 == i0 + 1) {
133
+ trained[i * nbit + j] = xoi[0]; // single vector: its value is the threshold
134
+ } else {
135
+ trained[i * nbit + j] = median(i1 - i0, xoi); // median() sorts this slice of xo in place
136
+ }
137
+ }
138
+ }
139
+ }
140
+
141
+
142
+ namespace {
143
+
144
+ void binarize_with_freq(size_t nbit, float freq, // writes nbit bits into codes: bit i is the parity of floor((x[i] - c[i]) * freq)
145
+ const float *x, const float *c,
146
+ uint8_t *codes)
147
+ {
148
+ memset (codes, 0, (nbit + 7) / 8); // clear exactly the bytes needed for nbit bits
149
+ for (size_t i = 0; i < nbit; i++) {
150
+ float xf = (x[i] - c[i]); // value relative to its threshold
151
+ int xi = int(floor(xf * freq));
152
+ int bit = xi & 1; // lowest bit of the floored value
153
+ codes[i >> 3] |= bit << (i & 7); // bit i goes to byte i / 8, bit position i % 8
154
+ }
155
+ }
156
+
157
+
158
+ };
159
+
160
+
161
+
162
+ void IndexIVFSpectralHash::encode_vectors(idx_t n, const float* x_in, // transforms n vectors with vt and binarizes each against its list's thresholds
163
+ const idx_t *list_nos,
164
+ uint8_t * codes,
165
+ bool include_listnos) const
166
+ {
167
+ FAISS_THROW_IF_NOT (is_trained);
168
+ float freq = 2.0 / period;
169
+
170
+ FAISS_THROW_IF_NOT_MSG (!include_listnos, "listnos encoding not supported");
171
+
172
+ // transform with vt
173
+ std::unique_ptr<float []> x (vt->apply (n, x_in));
174
+
175
+ #pragma omp parallel
176
+ {
177
+ std::vector<float> zero (nbit); // per-thread all-zero thresholds, used for Thresh_global
178
+
179
+ // each thread takes care of a subset of the vectors
180
+ #pragma omp for
181
+ for (size_t i = 0; i < n; i++) {
182
+ int64_t list_no = list_nos [i];
183
+
184
+ if (list_no >= 0) { // for negative list numbers the code slot is left untouched
185
+ const float *c;
186
+ if (threshold_type == Thresh_global) {
187
+ c = zero.data();
188
+ } else {
189
+ c = trained.data() + list_no * nbit; // thresholds of this vector's list
190
+ }
191
+ binarize_with_freq (nbit, freq,
192
+ x.get() + i * nbit, c,
193
+ codes + i * code_size) ;
194
+ }
195
+ }
196
+ }
197
+ }
198
+
199
+ namespace {
200
+
201
+
202
+ template<class HammingComputer>
203
+ struct IVFScanner: InvertedListScanner { // scans inverted lists of an IndexIVFSpectralHash using HammingComputer distances
204
+
205
+ // copied from index structure
206
+ const IndexIVFSpectralHash *index;
207
+ size_t code_size;
208
+ size_t nbit;
209
+ bool store_pairs;
210
+
211
+ float period, freq;
212
+ std::vector<float> q; // query after vt has been applied
213
+ std::vector<float> zero; // all-zero thresholds, used for Thresh_global
214
+ std::vector<uint8_t> qcode; // binarized query
215
+ HammingComputer hc; // declared after qcode, whose buffer it is constructed from
216
+
217
+ using idx_t = Index::idx_t;
218
+
219
+ IVFScanner (const IndexIVFSpectralHash * index,
220
+ bool store_pairs):
221
+ index (index),
222
+ code_size(index->code_size),
223
+ nbit(index->nbit),
224
+ store_pairs(store_pairs),
225
+ period(index->period), freq(2.0 / index->period),
226
+ q(nbit), zero(nbit), qcode(code_size),
227
+ hc(qcode.data(), code_size)
228
+ {
229
+ }
230
+
231
+
232
+ void set_query (const float *query) override { // transforms the query; for Thresh_global also binarizes it right away
233
+ FAISS_THROW_IF_NOT(query);
234
+ FAISS_THROW_IF_NOT(q.size() == nbit);
235
+ index->vt->apply_noalloc (1, query, q.data());
236
+
237
+ if (index->threshold_type ==
238
+ IndexIVFSpectralHash::Thresh_global) {
239
+ binarize_with_freq
240
+ (nbit, freq, q.data(), zero.data(), qcode.data());
241
+ hc.set (qcode.data(), code_size);
242
+ }
243
+ }
244
+
245
+ idx_t list_no; // list selected by the latest set_list call
246
+
247
+ void set_list (idx_t list_no, float /*coarse_dis*/) override { // records the list; for non-global thresholds re-binarizes the query for it
248
+ this->list_no = list_no;
249
+ if (index->threshold_type != IndexIVFSpectralHash::Thresh_global) {
250
+ const float *c = index->trained.data() + list_no * nbit; // thresholds of this list
251
+ binarize_with_freq (nbit, freq, q.data(), c, qcode.data());
252
+ hc.set (qcode.data(), code_size);
253
+ }
254
+ }
255
+
256
+ float distance_to_code (const uint8_t *code) const final {
257
+ return hc.hamming (code);
258
+ }
259
+
260
+ size_t scan_codes (size_t list_size, // pushes codes closer than simi[0] into the heap; returns the number of heap updates
261
+ const uint8_t *codes,
262
+ const idx_t *ids,
263
+ float *simi, idx_t *idxi,
264
+ size_t k) const override
265
+ {
266
+ size_t nup = 0;
267
+ for (size_t j = 0; j < list_size; j++) {
268
+
269
+ float dis = hc.hamming (codes);
270
+
271
+ if (dis < simi [0]) {
272
+ maxheap_pop (k, simi, idxi);
273
+ int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; // pair form: list number in the upper 32 bits, offset j below
274
+ maxheap_push (k, simi, idxi, dis, id);
275
+ nup++;
276
+ }
277
+ codes += code_size; // advance to the next stored code
278
+ }
279
+ return nup;
280
+ }
281
+
282
+ void scan_codes_range (size_t list_size, // adds every code with distance strictly below radius to res
283
+ const uint8_t *codes,
284
+ const idx_t *ids,
285
+ float radius,
286
+ RangeQueryResult & res) const override
287
+ {
288
+ for (size_t j = 0; j < list_size; j++) {
289
+ float dis = hc.hamming (codes);
290
+ if (dis < radius) {
291
+ int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
292
+ res.add (dis, id);
293
+ }
294
+ codes += code_size;
295
+ }
296
+ }
297
+
298
+
299
+ };
300
+
301
+ } // anonymous namespace
302
+
303
+ InvertedListScanner* IndexIVFSpectralHash::get_InvertedListScanner
304
+ (bool store_pairs) const
305
+ {
306
+ switch (code_size) { // pick the HammingComputer specialization matching code_size; the scanner is allocated with new
307
+ #define HANDLE_CODE_SIZE(cs) \
308
+ case cs: \
309
+ return new IVFScanner<HammingComputer ## cs> (this, store_pairs)
310
+ HANDLE_CODE_SIZE(4);
311
+ HANDLE_CODE_SIZE(8);
312
+ HANDLE_CODE_SIZE(16);
313
+ HANDLE_CODE_SIZE(20);
314
+ HANDLE_CODE_SIZE(32);
315
+ HANDLE_CODE_SIZE(64);
316
+ #undef HANDLE_CODE_SIZE
317
+ default: // sizes without a dedicated case fall back to the generic M8 / M4 computers
318
+ if (code_size % 8 == 0) {
319
+ return new IVFScanner<HammingComputerM8>(this, store_pairs);
320
+ } else if (code_size % 4 == 0) {
321
+ return new IVFScanner<HammingComputerM4>(this, store_pairs);
322
+ } else {
323
+ FAISS_THROW_MSG("not supported");
324
+ }
325
+ }
326
+
327
+ }
328
+
329
+
330
+
331
+ } // namespace faiss