faiss 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +103 -3
  4. data/ext/faiss/ext.cpp +99 -32
  5. data/ext/faiss/extconf.rb +12 -2
  6. data/lib/faiss/ext.bundle +0 -0
  7. data/lib/faiss/index.rb +3 -3
  8. data/lib/faiss/index_binary.rb +3 -3
  9. data/lib/faiss/kmeans.rb +1 -1
  10. data/lib/faiss/pca_matrix.rb +2 -2
  11. data/lib/faiss/product_quantizer.rb +3 -3
  12. data/lib/faiss/version.rb +1 -1
  13. data/vendor/faiss/AutoTune.cpp +719 -0
  14. data/vendor/faiss/AutoTune.h +212 -0
  15. data/vendor/faiss/Clustering.cpp +261 -0
  16. data/vendor/faiss/Clustering.h +101 -0
  17. data/vendor/faiss/IVFlib.cpp +339 -0
  18. data/vendor/faiss/IVFlib.h +132 -0
  19. data/vendor/faiss/Index.cpp +171 -0
  20. data/vendor/faiss/Index.h +261 -0
  21. data/vendor/faiss/Index2Layer.cpp +437 -0
  22. data/vendor/faiss/Index2Layer.h +85 -0
  23. data/vendor/faiss/IndexBinary.cpp +77 -0
  24. data/vendor/faiss/IndexBinary.h +163 -0
  25. data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
  26. data/vendor/faiss/IndexBinaryFlat.h +54 -0
  27. data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
  28. data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
  29. data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
  30. data/vendor/faiss/IndexBinaryHNSW.h +56 -0
  31. data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
  32. data/vendor/faiss/IndexBinaryIVF.h +211 -0
  33. data/vendor/faiss/IndexFlat.cpp +508 -0
  34. data/vendor/faiss/IndexFlat.h +175 -0
  35. data/vendor/faiss/IndexHNSW.cpp +1090 -0
  36. data/vendor/faiss/IndexHNSW.h +170 -0
  37. data/vendor/faiss/IndexIVF.cpp +909 -0
  38. data/vendor/faiss/IndexIVF.h +353 -0
  39. data/vendor/faiss/IndexIVFFlat.cpp +502 -0
  40. data/vendor/faiss/IndexIVFFlat.h +118 -0
  41. data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
  42. data/vendor/faiss/IndexIVFPQ.h +161 -0
  43. data/vendor/faiss/IndexIVFPQR.cpp +219 -0
  44. data/vendor/faiss/IndexIVFPQR.h +65 -0
  45. data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
  46. data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
  47. data/vendor/faiss/IndexLSH.cpp +225 -0
  48. data/vendor/faiss/IndexLSH.h +87 -0
  49. data/vendor/faiss/IndexLattice.cpp +143 -0
  50. data/vendor/faiss/IndexLattice.h +68 -0
  51. data/vendor/faiss/IndexPQ.cpp +1188 -0
  52. data/vendor/faiss/IndexPQ.h +199 -0
  53. data/vendor/faiss/IndexPreTransform.cpp +288 -0
  54. data/vendor/faiss/IndexPreTransform.h +91 -0
  55. data/vendor/faiss/IndexReplicas.cpp +123 -0
  56. data/vendor/faiss/IndexReplicas.h +76 -0
  57. data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
  58. data/vendor/faiss/IndexScalarQuantizer.h +127 -0
  59. data/vendor/faiss/IndexShards.cpp +317 -0
  60. data/vendor/faiss/IndexShards.h +100 -0
  61. data/vendor/faiss/InvertedLists.cpp +623 -0
  62. data/vendor/faiss/InvertedLists.h +334 -0
  63. data/vendor/faiss/LICENSE +21 -0
  64. data/vendor/faiss/MatrixStats.cpp +252 -0
  65. data/vendor/faiss/MatrixStats.h +62 -0
  66. data/vendor/faiss/MetaIndexes.cpp +351 -0
  67. data/vendor/faiss/MetaIndexes.h +126 -0
  68. data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
  69. data/vendor/faiss/OnDiskInvertedLists.h +127 -0
  70. data/vendor/faiss/VectorTransform.cpp +1157 -0
  71. data/vendor/faiss/VectorTransform.h +322 -0
  72. data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
  73. data/vendor/faiss/c_api/AutoTune_c.h +64 -0
  74. data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
  75. data/vendor/faiss/c_api/Clustering_c.h +117 -0
  76. data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
  77. data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
  78. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
  79. data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
  80. data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
  81. data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
  82. data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
  83. data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
  84. data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
  85. data/vendor/faiss/c_api/IndexShards_c.h +42 -0
  86. data/vendor/faiss/c_api/Index_c.cpp +105 -0
  87. data/vendor/faiss/c_api/Index_c.h +183 -0
  88. data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
  89. data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
  90. data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
  91. data/vendor/faiss/c_api/clone_index_c.h +32 -0
  92. data/vendor/faiss/c_api/error_c.h +42 -0
  93. data/vendor/faiss/c_api/error_impl.cpp +27 -0
  94. data/vendor/faiss/c_api/error_impl.h +16 -0
  95. data/vendor/faiss/c_api/faiss_c.h +58 -0
  96. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
  97. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
  98. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
  99. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
  100. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
  101. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
  102. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
  103. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
  104. data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
  105. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
  106. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
  107. data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
  108. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
  109. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
  110. data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
  111. data/vendor/faiss/c_api/index_factory_c.h +30 -0
  112. data/vendor/faiss/c_api/index_io_c.cpp +42 -0
  113. data/vendor/faiss/c_api/index_io_c.h +50 -0
  114. data/vendor/faiss/c_api/macros_impl.h +110 -0
  115. data/vendor/faiss/clone_index.cpp +147 -0
  116. data/vendor/faiss/clone_index.h +38 -0
  117. data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
  118. data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
  119. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
  120. data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
  121. data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
  122. data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
  123. data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
  124. data/vendor/faiss/gpu/GpuCloner.h +82 -0
  125. data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
  126. data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
  127. data/vendor/faiss/gpu/GpuDistance.h +52 -0
  128. data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
  129. data/vendor/faiss/gpu/GpuIndex.h +148 -0
  130. data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
  131. data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
  132. data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
  133. data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
  134. data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
  135. data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
  136. data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
  137. data/vendor/faiss/gpu/GpuResources.cpp +52 -0
  138. data/vendor/faiss/gpu/GpuResources.h +73 -0
  139. data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
  140. data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
  141. data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
  142. data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
  143. data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
  144. data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
  145. data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
  146. data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
  147. data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
  148. data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
  149. data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
  150. data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
  151. data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
  152. data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
  153. data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
  154. data/vendor/faiss/gpu/test/TestUtils.h +93 -0
  155. data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
  156. data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
  157. data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
  158. data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
  159. data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
  160. data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
  161. data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
  162. data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
  163. data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
  164. data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
  165. data/vendor/faiss/gpu/utils/Timer.h +52 -0
  166. data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
  167. data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
  168. data/vendor/faiss/impl/FaissAssert.h +95 -0
  169. data/vendor/faiss/impl/FaissException.cpp +66 -0
  170. data/vendor/faiss/impl/FaissException.h +71 -0
  171. data/vendor/faiss/impl/HNSW.cpp +818 -0
  172. data/vendor/faiss/impl/HNSW.h +275 -0
  173. data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
  174. data/vendor/faiss/impl/PolysemousTraining.h +158 -0
  175. data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
  176. data/vendor/faiss/impl/ProductQuantizer.h +242 -0
  177. data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
  178. data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
  179. data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
  180. data/vendor/faiss/impl/ThreadedIndex.h +80 -0
  181. data/vendor/faiss/impl/index_read.cpp +793 -0
  182. data/vendor/faiss/impl/index_write.cpp +558 -0
  183. data/vendor/faiss/impl/io.cpp +142 -0
  184. data/vendor/faiss/impl/io.h +98 -0
  185. data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
  186. data/vendor/faiss/impl/lattice_Zn.h +199 -0
  187. data/vendor/faiss/index_factory.cpp +392 -0
  188. data/vendor/faiss/index_factory.h +25 -0
  189. data/vendor/faiss/index_io.h +75 -0
  190. data/vendor/faiss/misc/test_blas.cpp +84 -0
  191. data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
  192. data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
  193. data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
  194. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
  195. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
  196. data/vendor/faiss/tests/test_merge.cpp +258 -0
  197. data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
  198. data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
  199. data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
  200. data/vendor/faiss/tests/test_params_override.cpp +231 -0
  201. data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
  202. data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
  203. data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
  204. data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
  205. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
  206. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
  207. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
  208. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
  209. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
  210. data/vendor/faiss/utils/Heap.cpp +122 -0
  211. data/vendor/faiss/utils/Heap.h +495 -0
  212. data/vendor/faiss/utils/WorkerThread.cpp +126 -0
  213. data/vendor/faiss/utils/WorkerThread.h +61 -0
  214. data/vendor/faiss/utils/distances.cpp +765 -0
  215. data/vendor/faiss/utils/distances.h +243 -0
  216. data/vendor/faiss/utils/distances_simd.cpp +809 -0
  217. data/vendor/faiss/utils/extra_distances.cpp +336 -0
  218. data/vendor/faiss/utils/extra_distances.h +54 -0
  219. data/vendor/faiss/utils/hamming-inl.h +472 -0
  220. data/vendor/faiss/utils/hamming.cpp +792 -0
  221. data/vendor/faiss/utils/hamming.h +220 -0
  222. data/vendor/faiss/utils/random.cpp +192 -0
  223. data/vendor/faiss/utils/random.h +60 -0
  224. data/vendor/faiss/utils/utils.cpp +783 -0
  225. data/vendor/faiss/utils/utils.h +181 -0
  226. metadata +216 -2
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #pragma once
11
+
12
+ #include <faiss/impl/HNSW.h>
13
+ #include <faiss/IndexBinaryFlat.h>
14
+ #include <faiss/utils/utils.h>
15
+
16
+
17
+ namespace faiss {
18
+
19
+
20
+ /** The HNSW index is a normal random-access index with a HNSW
21
+ * link structure built on top */
22
+
23
+ struct IndexBinaryHNSW : IndexBinary {
24
+ typedef HNSW::storage_idx_t storage_idx_t;
25
+
26
+ // the link strcuture
27
+ HNSW hnsw;
28
+
29
+ // the sequential storage
30
+ bool own_fields;
31
+ IndexBinary *storage;
32
+
33
+ explicit IndexBinaryHNSW();
34
+ explicit IndexBinaryHNSW(int d, int M = 32);
35
+ explicit IndexBinaryHNSW(IndexBinary *storage, int M = 32);
36
+
37
+ ~IndexBinaryHNSW() override;
38
+
39
+ DistanceComputer *get_distance_computer() const;
40
+
41
+ void add(idx_t n, const uint8_t *x) override;
42
+
43
+ /// Trains the storage if needed
44
+ void train(idx_t n, const uint8_t* x) override;
45
+
46
+ /// entry point for search
47
+ void search(idx_t n, const uint8_t *x, idx_t k,
48
+ int32_t *distances, idx_t *labels) const override;
49
+
50
+ void reconstruct(idx_t key, uint8_t* recons) const override;
51
+
52
+ void reset() override;
53
+ };
54
+
55
+
56
+ } // namespace faiss
@@ -0,0 +1,671 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // Copyright 2004-present Facebook. All Rights Reserved
9
+ // -*- c++ -*-
10
+
11
+ #include <faiss/IndexBinaryIVF.h>
12
+
13
+ #include <cstdio>
14
+ #include <memory>
15
+
16
+ #include <faiss/utils/hamming.h>
17
+ #include <faiss/utils/utils.h>
18
+
19
+ #include <faiss/impl/AuxIndexStructures.h>
20
+ #include <faiss/impl/FaissAssert.h>
21
+ #include <faiss/IndexFlat.h>
22
+
23
+
24
+ namespace faiss {
25
+
26
+ IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist)
27
+ : IndexBinary(d),
28
+ invlists(new ArrayInvertedLists(nlist, code_size)),
29
+ own_invlists(true),
30
+ nprobe(1),
31
+ max_codes(0),
32
+ maintain_direct_map(false),
33
+ quantizer(quantizer),
34
+ nlist(nlist),
35
+ own_fields(false),
36
+ clustering_index(nullptr)
37
+ {
38
+ FAISS_THROW_IF_NOT (d == quantizer->d);
39
+ is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
40
+
41
+ cp.niter = 10;
42
+ }
43
+
44
+ IndexBinaryIVF::IndexBinaryIVF()
45
+ : invlists(nullptr),
46
+ own_invlists(false),
47
+ nprobe(1),
48
+ max_codes(0),
49
+ maintain_direct_map(false),
50
+ quantizer(nullptr),
51
+ nlist(0),
52
+ own_fields(false),
53
+ clustering_index(nullptr)
54
+ {}
55
+
56
+ void IndexBinaryIVF::add(idx_t n, const uint8_t *x) {
57
+ add_with_ids(n, x, nullptr);
58
+ }
59
+
60
+ void IndexBinaryIVF::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) {
61
+ add_core(n, x, xids, nullptr);
62
+ }
63
+
64
+ void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const idx_t *xids,
65
+ const idx_t *precomputed_idx) {
66
+ FAISS_THROW_IF_NOT(is_trained);
67
+ assert(invlists);
68
+ FAISS_THROW_IF_NOT_MSG(!(maintain_direct_map && xids),
69
+ "cannot have direct map and add with ids");
70
+
71
+ const idx_t * idx;
72
+
73
+ std::unique_ptr<idx_t[]> scoped_idx;
74
+
75
+ if (precomputed_idx) {
76
+ idx = precomputed_idx;
77
+ } else {
78
+ scoped_idx.reset(new idx_t[n]);
79
+ quantizer->assign(n, x, scoped_idx.get());
80
+ idx = scoped_idx.get();
81
+ }
82
+
83
+ long n_add = 0;
84
+ for (size_t i = 0; i < n; i++) {
85
+ idx_t id = xids ? xids[i] : ntotal + i;
86
+ idx_t list_no = idx[i];
87
+
88
+ if (list_no < 0)
89
+ continue;
90
+ const uint8_t *xi = x + i * code_size;
91
+ size_t offset = invlists->add_entry(list_no, id, xi);
92
+
93
+ if (maintain_direct_map)
94
+ direct_map.push_back(list_no << 32 | offset);
95
+ n_add++;
96
+ }
97
+ if (verbose) {
98
+ printf("IndexBinaryIVF::add_with_ids: added %ld / %ld vectors\n",
99
+ n_add, n);
100
+ }
101
+ ntotal += n_add;
102
+ }
103
+
104
+ void IndexBinaryIVF::make_direct_map(bool new_maintain_direct_map) {
105
+ // nothing to do
106
+ if (new_maintain_direct_map == maintain_direct_map)
107
+ return;
108
+
109
+ if (new_maintain_direct_map) {
110
+ direct_map.resize(ntotal, -1);
111
+ for (size_t key = 0; key < nlist; key++) {
112
+ size_t list_size = invlists->list_size(key);
113
+ const idx_t *idlist = invlists->get_ids(key);
114
+
115
+ for (size_t ofs = 0; ofs < list_size; ofs++) {
116
+ FAISS_THROW_IF_NOT_MSG(0 <= idlist[ofs] && idlist[ofs] < ntotal,
117
+ "direct map supported only for seuquential ids");
118
+ direct_map[idlist[ofs]] = key << 32 | ofs;
119
+ }
120
+ }
121
+ } else {
122
+ direct_map.clear();
123
+ }
124
+ maintain_direct_map = new_maintain_direct_map;
125
+ }
126
+
127
+ void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k,
128
+ int32_t *distances, idx_t *labels) const {
129
+ std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
130
+ std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
131
+
132
+ double t0 = getmillisecs();
133
+ quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
134
+ indexIVF_stats.quantization_time += getmillisecs() - t0;
135
+
136
+ t0 = getmillisecs();
137
+ invlists->prefetch_lists(idx.get(), n * nprobe);
138
+
139
+ search_preassigned(n, x, k, idx.get(), coarse_dis.get(),
140
+ distances, labels, false);
141
+ indexIVF_stats.search_time += getmillisecs() - t0;
142
+ }
143
+
144
+ void IndexBinaryIVF::reconstruct(idx_t key, uint8_t *recons) const {
145
+ FAISS_THROW_IF_NOT_MSG(direct_map.size() == ntotal,
146
+ "direct map is not initialized");
147
+ idx_t list_no = direct_map[key] >> 32;
148
+ idx_t offset = direct_map[key] & 0xffffffff;
149
+ reconstruct_from_offset(list_no, offset, recons);
150
+ }
151
+
152
+ void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const {
153
+ FAISS_THROW_IF_NOT(ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
154
+
155
+ for (idx_t list_no = 0; list_no < nlist; list_no++) {
156
+ size_t list_size = invlists->list_size(list_no);
157
+ const Index::idx_t *idlist = invlists->get_ids(list_no);
158
+
159
+ for (idx_t offset = 0; offset < list_size; offset++) {
160
+ idx_t id = idlist[offset];
161
+ if (!(id >= i0 && id < i0 + ni)) {
162
+ continue;
163
+ }
164
+
165
+ uint8_t *reconstructed = recons + (id - i0) * d;
166
+ reconstruct_from_offset(list_no, offset, reconstructed);
167
+ }
168
+ }
169
+ }
170
+
171
+ void IndexBinaryIVF::search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k,
172
+ int32_t *distances, idx_t *labels,
173
+ uint8_t *recons) const {
174
+ std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
175
+ std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
176
+
177
+ quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
178
+
179
+ invlists->prefetch_lists(idx.get(), n * nprobe);
180
+
181
+ // search_preassigned() with `store_pairs` enabled to obtain the list_no
182
+ // and offset into `codes` for reconstruction
183
+ search_preassigned(n, x, k, idx.get(), coarse_dis.get(),
184
+ distances, labels, /* store_pairs */true);
185
+ for (idx_t i = 0; i < n; ++i) {
186
+ for (idx_t j = 0; j < k; ++j) {
187
+ idx_t ij = i * k + j;
188
+ idx_t key = labels[ij];
189
+ uint8_t *reconstructed = recons + ij * d;
190
+ if (key < 0) {
191
+ // Fill with NaNs
192
+ memset(reconstructed, -1, sizeof(*reconstructed) * d);
193
+ } else {
194
+ int list_no = key >> 32;
195
+ int offset = key & 0xffffffff;
196
+
197
+ // Update label to the actual id
198
+ labels[ij] = invlists->get_single_id(list_no, offset);
199
+
200
+ reconstruct_from_offset(list_no, offset, reconstructed);
201
+ }
202
+ }
203
+ }
204
+ }
205
+
206
+ void IndexBinaryIVF::reconstruct_from_offset(idx_t list_no, idx_t offset,
207
+ uint8_t *recons) const {
208
+ memcpy(recons, invlists->get_single_code(list_no, offset), code_size);
209
+ }
210
+
211
+ void IndexBinaryIVF::reset() {
212
+ direct_map.clear();
213
+ invlists->reset();
214
+ ntotal = 0;
215
+ }
216
+
217
+ size_t IndexBinaryIVF::remove_ids(const IDSelector& sel) {
218
+ FAISS_THROW_IF_NOT_MSG(!maintain_direct_map,
219
+ "direct map remove not implemented");
220
+
221
+ std::vector<idx_t> toremove(nlist);
222
+
223
+ #pragma omp parallel for
224
+ for (idx_t i = 0; i < nlist; i++) {
225
+ idx_t l0 = invlists->list_size (i), l = l0, j = 0;
226
+ const idx_t *idsi = invlists->get_ids(i);
227
+ while (j < l) {
228
+ if (sel.is_member(idsi[j])) {
229
+ l--;
230
+ invlists->update_entry(
231
+ i, j,
232
+ invlists->get_single_id(i, l),
233
+ invlists->get_single_code(i, l));
234
+ } else {
235
+ j++;
236
+ }
237
+ }
238
+ toremove[i] = l0 - l;
239
+ }
240
+ // this will not run well in parallel on ondisk because of possible shrinks
241
+ size_t nremove = 0;
242
+ for (idx_t i = 0; i < nlist; i++) {
243
+ if (toremove[i] > 0) {
244
+ nremove += toremove[i];
245
+ invlists->resize(
246
+ i, invlists->list_size(i) - toremove[i]);
247
+ }
248
+ }
249
+ ntotal -= nremove;
250
+ return nremove;
251
+ }
252
+
253
+ void IndexBinaryIVF::train(idx_t n, const uint8_t *x) {
254
+ if (verbose) {
255
+ printf("Training quantizer\n");
256
+ }
257
+
258
+ if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
259
+ if (verbose) {
260
+ printf("IVF quantizer does not need training.\n");
261
+ }
262
+ } else {
263
+ if (verbose) {
264
+ printf("Training quantizer on %ld vectors in %dD\n", n, d);
265
+ }
266
+
267
+ Clustering clus(d, nlist, cp);
268
+ quantizer->reset();
269
+
270
+ std::unique_ptr<float[]> x_f(new float[n * d]);
271
+ binary_to_real(n * d, x, x_f.get());
272
+
273
+ IndexFlatL2 index_tmp(d);
274
+
275
+ if (clustering_index && verbose) {
276
+ printf("using clustering_index of dimension %d to do the clustering\n",
277
+ clustering_index->d);
278
+ }
279
+
280
+ clus.train(n, x_f.get(), clustering_index ? *clustering_index : index_tmp);
281
+
282
+ std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
283
+ real_to_binary(d * clus.k, clus.centroids.data(), x_b.get());
284
+
285
+ quantizer->add(clus.k, x_b.get());
286
+ quantizer->is_trained = true;
287
+ }
288
+
289
+ is_trained = true;
290
+ }
291
+
292
+ void IndexBinaryIVF::merge_from(IndexBinaryIVF &other, idx_t add_id) {
293
+ // minimal sanity checks
294
+ FAISS_THROW_IF_NOT(other.d == d);
295
+ FAISS_THROW_IF_NOT(other.nlist == nlist);
296
+ FAISS_THROW_IF_NOT(other.code_size == code_size);
297
+ FAISS_THROW_IF_NOT_MSG((!maintain_direct_map &&
298
+ !other.maintain_direct_map),
299
+ "direct map copy not implemented");
300
+ FAISS_THROW_IF_NOT_MSG(typeid (*this) == typeid (other),
301
+ "can only merge indexes of the same type");
302
+
303
+ invlists->merge_from (other.invlists, add_id);
304
+
305
+ ntotal += other.ntotal;
306
+ other.ntotal = 0;
307
+ }
308
+
309
+ void IndexBinaryIVF::replace_invlists(InvertedLists *il, bool own) {
310
+ FAISS_THROW_IF_NOT(il->nlist == nlist &&
311
+ il->code_size == code_size);
312
+ if (own_invlists) {
313
+ delete invlists;
314
+ }
315
+ invlists = il;
316
+ own_invlists = own;
317
+ }
318
+
319
+
320
+ namespace {
321
+
322
+ using idx_t = Index::idx_t;
323
+
324
+
325
+ template<class HammingComputer, bool store_pairs>
326
+ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
327
+
328
+ HammingComputer hc;
329
+ size_t code_size;
330
+
331
+ IVFBinaryScannerL2 (size_t code_size): code_size (code_size)
332
+ {}
333
+
334
+ void set_query (const uint8_t *query_vector) override {
335
+ hc.set (query_vector, code_size);
336
+ }
337
+
338
+ idx_t list_no;
339
+ void set_list (idx_t list_no, uint8_t /* coarse_dis */) override {
340
+ this->list_no = list_no;
341
+ }
342
+
343
+ uint32_t distance_to_code (const uint8_t *code) const override {
344
+ return hc.hamming (code);
345
+ }
346
+
347
+ size_t scan_codes (size_t n,
348
+ const uint8_t *codes,
349
+ const idx_t *ids,
350
+ int32_t *simi, idx_t *idxi,
351
+ size_t k) const override
352
+ {
353
+ using C = CMax<int32_t, idx_t>;
354
+
355
+ size_t nup = 0;
356
+ for (size_t j = 0; j < n; j++) {
357
+ uint32_t dis = hc.hamming (codes);
358
+ if (dis < simi[0]) {
359
+ heap_pop<C> (k, simi, idxi);
360
+ idx_t id = store_pairs ? (list_no << 32 | j) : ids[j];
361
+ heap_push<C> (k, simi, idxi, dis, id);
362
+ nup++;
363
+ }
364
+ codes += code_size;
365
+ }
366
+ return nup;
367
+ }
368
+
369
+
370
+ };
371
+
372
+
373
+ template <bool store_pairs>
374
+ BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) {
375
+
376
+ switch (code_size) {
377
+ #define HANDLE_CS(cs) \
378
+ case cs: \
379
+ return new IVFBinaryScannerL2<HammingComputer ## cs, store_pairs> (cs);
380
+ HANDLE_CS(4);
381
+ HANDLE_CS(8);
382
+ HANDLE_CS(16);
383
+ HANDLE_CS(20);
384
+ HANDLE_CS(32);
385
+ HANDLE_CS(64);
386
+ #undef HANDLE_CS
387
+ default:
388
+ if (code_size % 8 == 0) {
389
+ return new IVFBinaryScannerL2<HammingComputerM8,
390
+ store_pairs> (code_size);
391
+ } else if (code_size % 4 == 0) {
392
+ return new IVFBinaryScannerL2<HammingComputerM4,
393
+ store_pairs> (code_size);
394
+ } else {
395
+ return new IVFBinaryScannerL2<HammingComputerDefault,
396
+ store_pairs> (code_size);
397
+ }
398
+ }
399
+ }
400
+
401
+
402
+ void search_knn_hamming_heap(const IndexBinaryIVF& ivf,
403
+ size_t n,
404
+ const uint8_t *x,
405
+ idx_t k,
406
+ const idx_t *keys,
407
+ const int32_t * coarse_dis,
408
+ int32_t *distances, idx_t *labels,
409
+ bool store_pairs,
410
+ const IVFSearchParameters *params)
411
+ {
412
+ long nprobe = params ? params->nprobe : ivf.nprobe;
413
+ long max_codes = params ? params->max_codes : ivf.max_codes;
414
+ MetricType metric_type = ivf.metric_type;
415
+
416
+ // almost verbatim copy from IndexIVF::search_preassigned
417
+
418
+ size_t nlistv = 0, ndis = 0, nheap = 0;
419
+ using HeapForIP = CMin<int32_t, idx_t>;
420
+ using HeapForL2 = CMax<int32_t, idx_t>;
421
+
422
+ #pragma omp parallel if(n > 1) reduction(+: nlistv, ndis, nheap)
423
+ {
424
+ std::unique_ptr<BinaryInvertedListScanner> scanner
425
+ (ivf.get_InvertedListScanner (store_pairs));
426
+
427
+ #pragma omp for
428
+ for (size_t i = 0; i < n; i++) {
429
+ const uint8_t *xi = x + i * ivf.code_size;
430
+ scanner->set_query(xi);
431
+
432
+ const idx_t * keysi = keys + i * nprobe;
433
+ int32_t * simi = distances + k * i;
434
+ idx_t * idxi = labels + k * i;
435
+
436
+ if (metric_type == METRIC_INNER_PRODUCT) {
437
+ heap_heapify<HeapForIP> (k, simi, idxi);
438
+ } else {
439
+ heap_heapify<HeapForL2> (k, simi, idxi);
440
+ }
441
+
442
+ size_t nscan = 0;
443
+
444
+ for (size_t ik = 0; ik < nprobe; ik++) {
445
+ idx_t key = keysi[ik]; /* select the list */
446
+ if (key < 0) {
447
+ // not enough centroids for multiprobe
448
+ continue;
449
+ }
450
+ FAISS_THROW_IF_NOT_FMT
451
+ (key < (idx_t) ivf.nlist,
452
+ "Invalid key=%ld at ik=%ld nlist=%ld\n",
453
+ key, ik, ivf.nlist);
454
+
455
+ scanner->set_list (key, coarse_dis[i * nprobe + ik]);
456
+
457
+ nlistv++;
458
+
459
+ size_t list_size = ivf.invlists->list_size(key);
460
+ InvertedLists::ScopedCodes scodes (ivf.invlists, key);
461
+ std::unique_ptr<InvertedLists::ScopedIds> sids;
462
+ const Index::idx_t * ids = nullptr;
463
+
464
+ if (!store_pairs) {
465
+ sids.reset (new InvertedLists::ScopedIds (ivf.invlists, key));
466
+ ids = sids->get();
467
+ }
468
+
469
+ nheap += scanner->scan_codes (list_size, scodes.get(),
470
+ ids, simi, idxi, k);
471
+
472
+ nscan += list_size;
473
+ if (max_codes && nscan >= max_codes)
474
+ break;
475
+ }
476
+
477
+ ndis += nscan;
478
+ if (metric_type == METRIC_INNER_PRODUCT) {
479
+ heap_reorder<HeapForIP> (k, simi, idxi);
480
+ } else {
481
+ heap_reorder<HeapForL2> (k, simi, idxi);
482
+ }
483
+
484
+ } // parallel for
485
+ } // parallel
486
+
487
+ indexIVF_stats.nq += n;
488
+ indexIVF_stats.nlist += nlistv;
489
+ indexIVF_stats.ndis += ndis;
490
+ indexIVF_stats.nheap_updates += nheap;
491
+
492
+ }
493
+
494
+ template<class HammingComputer, bool store_pairs>
495
+ void search_knn_hamming_count(const IndexBinaryIVF& ivf,
496
+ size_t nx,
497
+ const uint8_t *x,
498
+ const idx_t *keys,
499
+ int k,
500
+ int32_t *distances,
501
+ idx_t *labels,
502
+ const IVFSearchParameters *params) {
503
+ const int nBuckets = ivf.d + 1;
504
+ std::vector<int> all_counters(nx * nBuckets, 0);
505
+ std::unique_ptr<idx_t[]> all_ids_per_dis(new idx_t[nx * nBuckets * k]);
506
+
507
+ long nprobe = params ? params->nprobe : ivf.nprobe;
508
+ long max_codes = params ? params->max_codes : ivf.max_codes;
509
+
510
+ std::vector<HCounterState<HammingComputer>> cs;
511
+ for (size_t i = 0; i < nx; ++i) {
512
+ cs.push_back(HCounterState<HammingComputer>(
513
+ all_counters.data() + i * nBuckets,
514
+ all_ids_per_dis.get() + i * nBuckets * k,
515
+ x + i * ivf.code_size,
516
+ ivf.d,
517
+ k
518
+ ));
519
+ }
520
+
521
+ size_t nlistv = 0, ndis = 0;
522
+
523
+ #pragma omp parallel for reduction(+: nlistv, ndis)
524
+ for (size_t i = 0; i < nx; i++) {
525
+ const idx_t * keysi = keys + i * nprobe;
526
+ HCounterState<HammingComputer>& csi = cs[i];
527
+
528
+ size_t nscan = 0;
529
+
530
+ for (size_t ik = 0; ik < nprobe; ik++) {
531
+ idx_t key = keysi[ik]; /* select the list */
532
+ if (key < 0) {
533
+ // not enough centroids for multiprobe
534
+ continue;
535
+ }
536
+ FAISS_THROW_IF_NOT_FMT (
537
+ key < (idx_t) ivf.nlist,
538
+ "Invalid key=%ld at ik=%ld nlist=%ld\n",
539
+ key, ik, ivf.nlist);
540
+
541
+ nlistv++;
542
+ size_t list_size = ivf.invlists->list_size(key);
543
+ InvertedLists::ScopedCodes scodes (ivf.invlists, key);
544
+ const uint8_t *list_vecs = scodes.get();
545
+ const Index::idx_t *ids = store_pairs
546
+ ? nullptr
547
+ : ivf.invlists->get_ids(key);
548
+
549
+ for (size_t j = 0; j < list_size; j++) {
550
+ const uint8_t * yj = list_vecs + ivf.code_size * j;
551
+
552
+ idx_t id = store_pairs ? (key << 32 | j) : ids[j];
553
+ csi.update_counter(yj, id);
554
+ }
555
+ if (ids)
556
+ ivf.invlists->release_ids (key, ids);
557
+
558
+ nscan += list_size;
559
+ if (max_codes && nscan >= max_codes)
560
+ break;
561
+ }
562
+ ndis += nscan;
563
+
564
+ int nres = 0;
565
+ for (int b = 0; b < nBuckets && nres < k; b++) {
566
+ for (int l = 0; l < csi.counters[b] && nres < k; l++) {
567
+ labels[i * k + nres] = csi.ids_per_dis[b * k + l];
568
+ distances[i * k + nres] = b;
569
+ nres++;
570
+ }
571
+ }
572
+ while (nres < k) {
573
+ labels[i * k + nres] = -1;
574
+ distances[i * k + nres] = std::numeric_limits<int32_t>::max();
575
+ ++nres;
576
+ }
577
+ }
578
+
579
+ indexIVF_stats.nq += nx;
580
+ indexIVF_stats.nlist += nlistv;
581
+ indexIVF_stats.ndis += ndis;
582
+ }
583
+
584
+
585
+
586
+ template<bool store_pairs>
587
+ void search_knn_hamming_count_1 (
588
+ const IndexBinaryIVF& ivf,
589
+ size_t nx,
590
+ const uint8_t *x,
591
+ const idx_t *keys,
592
+ int k,
593
+ int32_t *distances,
594
+ idx_t *labels,
595
+ const IVFSearchParameters *params) {
596
+ switch (ivf.code_size) {
597
+ #define HANDLE_CS(cs) \
598
+ case cs: \
599
+ search_knn_hamming_count<HammingComputer ## cs, store_pairs>( \
600
+ ivf, nx, x, keys, k, distances, labels, params); \
601
+ break;
602
+ HANDLE_CS(4);
603
+ HANDLE_CS(8);
604
+ HANDLE_CS(16);
605
+ HANDLE_CS(20);
606
+ HANDLE_CS(32);
607
+ HANDLE_CS(64);
608
+ #undef HANDLE_CS
609
+ default:
610
+ if (ivf.code_size % 8 == 0) {
611
+ search_knn_hamming_count<HammingComputerM8, store_pairs>
612
+ (ivf, nx, x, keys, k, distances, labels, params);
613
+ } else if (ivf.code_size % 4 == 0) {
614
+ search_knn_hamming_count<HammingComputerM4, store_pairs>
615
+ (ivf, nx, x, keys, k, distances, labels, params);
616
+ } else {
617
+ search_knn_hamming_count<HammingComputerDefault, store_pairs>
618
+ (ivf, nx, x, keys, k, distances, labels, params);
619
+ }
620
+ break;
621
+ }
622
+
623
+ }
624
+
625
+ } // namespace
626
+
627
+ BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner
628
+ (bool store_pairs) const
629
+ {
630
+ if (store_pairs) {
631
+ return select_IVFBinaryScannerL2<true> (code_size);
632
+ } else {
633
+ return select_IVFBinaryScannerL2<false> (code_size);
634
+ }
635
+ }
636
+
637
+ void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
638
+ const idx_t *idx,
639
+ const int32_t * coarse_dis,
640
+ int32_t *distances, idx_t *labels,
641
+ bool store_pairs,
642
+ const IVFSearchParameters *params
643
+ ) const {
644
+
645
+ if (use_heap) {
646
+ search_knn_hamming_heap (*this, n, x, k, idx, coarse_dis,
647
+ distances, labels, store_pairs,
648
+ params);
649
+ } else {
650
+ if (store_pairs) {
651
+ search_knn_hamming_count_1<true>
652
+ (*this, n, x, idx, k, distances, labels, params);
653
+ } else {
654
+ search_knn_hamming_count_1<false>
655
+ (*this, n, x, idx, k, distances, labels, params);
656
+ }
657
+ }
658
+ }
659
+
660
+ IndexBinaryIVF::~IndexBinaryIVF() {
661
+ if (own_invlists) {
662
+ delete invlists;
663
+ }
664
+
665
+ if (own_fields) {
666
+ delete quantizer;
667
+ }
668
+ }
669
+
670
+
671
+ } // namespace faiss