faiss 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +103 -3
  4. data/ext/faiss/ext.cpp +99 -32
  5. data/ext/faiss/extconf.rb +12 -2
  6. data/lib/faiss/ext.bundle +0 -0
  7. data/lib/faiss/index.rb +3 -3
  8. data/lib/faiss/index_binary.rb +3 -3
  9. data/lib/faiss/kmeans.rb +1 -1
  10. data/lib/faiss/pca_matrix.rb +2 -2
  11. data/lib/faiss/product_quantizer.rb +3 -3
  12. data/lib/faiss/version.rb +1 -1
  13. data/vendor/faiss/AutoTune.cpp +719 -0
  14. data/vendor/faiss/AutoTune.h +212 -0
  15. data/vendor/faiss/Clustering.cpp +261 -0
  16. data/vendor/faiss/Clustering.h +101 -0
  17. data/vendor/faiss/IVFlib.cpp +339 -0
  18. data/vendor/faiss/IVFlib.h +132 -0
  19. data/vendor/faiss/Index.cpp +171 -0
  20. data/vendor/faiss/Index.h +261 -0
  21. data/vendor/faiss/Index2Layer.cpp +437 -0
  22. data/vendor/faiss/Index2Layer.h +85 -0
  23. data/vendor/faiss/IndexBinary.cpp +77 -0
  24. data/vendor/faiss/IndexBinary.h +163 -0
  25. data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
  26. data/vendor/faiss/IndexBinaryFlat.h +54 -0
  27. data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
  28. data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
  29. data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
  30. data/vendor/faiss/IndexBinaryHNSW.h +56 -0
  31. data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
  32. data/vendor/faiss/IndexBinaryIVF.h +211 -0
  33. data/vendor/faiss/IndexFlat.cpp +508 -0
  34. data/vendor/faiss/IndexFlat.h +175 -0
  35. data/vendor/faiss/IndexHNSW.cpp +1090 -0
  36. data/vendor/faiss/IndexHNSW.h +170 -0
  37. data/vendor/faiss/IndexIVF.cpp +909 -0
  38. data/vendor/faiss/IndexIVF.h +353 -0
  39. data/vendor/faiss/IndexIVFFlat.cpp +502 -0
  40. data/vendor/faiss/IndexIVFFlat.h +118 -0
  41. data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
  42. data/vendor/faiss/IndexIVFPQ.h +161 -0
  43. data/vendor/faiss/IndexIVFPQR.cpp +219 -0
  44. data/vendor/faiss/IndexIVFPQR.h +65 -0
  45. data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
  46. data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
  47. data/vendor/faiss/IndexLSH.cpp +225 -0
  48. data/vendor/faiss/IndexLSH.h +87 -0
  49. data/vendor/faiss/IndexLattice.cpp +143 -0
  50. data/vendor/faiss/IndexLattice.h +68 -0
  51. data/vendor/faiss/IndexPQ.cpp +1188 -0
  52. data/vendor/faiss/IndexPQ.h +199 -0
  53. data/vendor/faiss/IndexPreTransform.cpp +288 -0
  54. data/vendor/faiss/IndexPreTransform.h +91 -0
  55. data/vendor/faiss/IndexReplicas.cpp +123 -0
  56. data/vendor/faiss/IndexReplicas.h +76 -0
  57. data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
  58. data/vendor/faiss/IndexScalarQuantizer.h +127 -0
  59. data/vendor/faiss/IndexShards.cpp +317 -0
  60. data/vendor/faiss/IndexShards.h +100 -0
  61. data/vendor/faiss/InvertedLists.cpp +623 -0
  62. data/vendor/faiss/InvertedLists.h +334 -0
  63. data/vendor/faiss/LICENSE +21 -0
  64. data/vendor/faiss/MatrixStats.cpp +252 -0
  65. data/vendor/faiss/MatrixStats.h +62 -0
  66. data/vendor/faiss/MetaIndexes.cpp +351 -0
  67. data/vendor/faiss/MetaIndexes.h +126 -0
  68. data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
  69. data/vendor/faiss/OnDiskInvertedLists.h +127 -0
  70. data/vendor/faiss/VectorTransform.cpp +1157 -0
  71. data/vendor/faiss/VectorTransform.h +322 -0
  72. data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
  73. data/vendor/faiss/c_api/AutoTune_c.h +64 -0
  74. data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
  75. data/vendor/faiss/c_api/Clustering_c.h +117 -0
  76. data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
  77. data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
  78. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
  79. data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
  80. data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
  81. data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
  82. data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
  83. data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
  84. data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
  85. data/vendor/faiss/c_api/IndexShards_c.h +42 -0
  86. data/vendor/faiss/c_api/Index_c.cpp +105 -0
  87. data/vendor/faiss/c_api/Index_c.h +183 -0
  88. data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
  89. data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
  90. data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
  91. data/vendor/faiss/c_api/clone_index_c.h +32 -0
  92. data/vendor/faiss/c_api/error_c.h +42 -0
  93. data/vendor/faiss/c_api/error_impl.cpp +27 -0
  94. data/vendor/faiss/c_api/error_impl.h +16 -0
  95. data/vendor/faiss/c_api/faiss_c.h +58 -0
  96. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
  97. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
  98. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
  99. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
  100. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
  101. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
  102. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
  103. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
  104. data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
  105. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
  106. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
  107. data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
  108. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
  109. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
  110. data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
  111. data/vendor/faiss/c_api/index_factory_c.h +30 -0
  112. data/vendor/faiss/c_api/index_io_c.cpp +42 -0
  113. data/vendor/faiss/c_api/index_io_c.h +50 -0
  114. data/vendor/faiss/c_api/macros_impl.h +110 -0
  115. data/vendor/faiss/clone_index.cpp +147 -0
  116. data/vendor/faiss/clone_index.h +38 -0
  117. data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
  118. data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
  119. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
  120. data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
  121. data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
  122. data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
  123. data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
  124. data/vendor/faiss/gpu/GpuCloner.h +82 -0
  125. data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
  126. data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
  127. data/vendor/faiss/gpu/GpuDistance.h +52 -0
  128. data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
  129. data/vendor/faiss/gpu/GpuIndex.h +148 -0
  130. data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
  131. data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
  132. data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
  133. data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
  134. data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
  135. data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
  136. data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
  137. data/vendor/faiss/gpu/GpuResources.cpp +52 -0
  138. data/vendor/faiss/gpu/GpuResources.h +73 -0
  139. data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
  140. data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
  141. data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
  142. data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
  143. data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
  144. data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
  145. data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
  146. data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
  147. data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
  148. data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
  149. data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
  150. data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
  151. data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
  152. data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
  153. data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
  154. data/vendor/faiss/gpu/test/TestUtils.h +93 -0
  155. data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
  156. data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
  157. data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
  158. data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
  159. data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
  160. data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
  161. data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
  162. data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
  163. data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
  164. data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
  165. data/vendor/faiss/gpu/utils/Timer.h +52 -0
  166. data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
  167. data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
  168. data/vendor/faiss/impl/FaissAssert.h +95 -0
  169. data/vendor/faiss/impl/FaissException.cpp +66 -0
  170. data/vendor/faiss/impl/FaissException.h +71 -0
  171. data/vendor/faiss/impl/HNSW.cpp +818 -0
  172. data/vendor/faiss/impl/HNSW.h +275 -0
  173. data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
  174. data/vendor/faiss/impl/PolysemousTraining.h +158 -0
  175. data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
  176. data/vendor/faiss/impl/ProductQuantizer.h +242 -0
  177. data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
  178. data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
  179. data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
  180. data/vendor/faiss/impl/ThreadedIndex.h +80 -0
  181. data/vendor/faiss/impl/index_read.cpp +793 -0
  182. data/vendor/faiss/impl/index_write.cpp +558 -0
  183. data/vendor/faiss/impl/io.cpp +142 -0
  184. data/vendor/faiss/impl/io.h +98 -0
  185. data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
  186. data/vendor/faiss/impl/lattice_Zn.h +199 -0
  187. data/vendor/faiss/index_factory.cpp +392 -0
  188. data/vendor/faiss/index_factory.h +25 -0
  189. data/vendor/faiss/index_io.h +75 -0
  190. data/vendor/faiss/misc/test_blas.cpp +84 -0
  191. data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
  192. data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
  193. data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
  194. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
  195. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
  196. data/vendor/faiss/tests/test_merge.cpp +258 -0
  197. data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
  198. data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
  199. data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
  200. data/vendor/faiss/tests/test_params_override.cpp +231 -0
  201. data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
  202. data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
  203. data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
  204. data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
  205. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
  206. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
  207. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
  208. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
  209. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
  210. data/vendor/faiss/utils/Heap.cpp +122 -0
  211. data/vendor/faiss/utils/Heap.h +495 -0
  212. data/vendor/faiss/utils/WorkerThread.cpp +126 -0
  213. data/vendor/faiss/utils/WorkerThread.h +61 -0
  214. data/vendor/faiss/utils/distances.cpp +765 -0
  215. data/vendor/faiss/utils/distances.h +243 -0
  216. data/vendor/faiss/utils/distances_simd.cpp +809 -0
  217. data/vendor/faiss/utils/extra_distances.cpp +336 -0
  218. data/vendor/faiss/utils/extra_distances.h +54 -0
  219. data/vendor/faiss/utils/hamming-inl.h +472 -0
  220. data/vendor/faiss/utils/hamming.cpp +792 -0
  221. data/vendor/faiss/utils/hamming.h +220 -0
  222. data/vendor/faiss/utils/random.cpp +192 -0
  223. data/vendor/faiss/utils/random.h +60 -0
  224. data/vendor/faiss/utils/utils.cpp +783 -0
  225. data/vendor/faiss/utils/utils.h +181 -0
  226. metadata +216 -2
@@ -0,0 +1,334 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #ifndef FAISS_INVERTEDLISTS_IVF_H
11
+ #define FAISS_INVERTEDLISTS_IVF_H
12
+
13
+ /**
14
+ * Definition of inverted lists + a few common classes that implement
15
+ * the interface.
16
+ */
17
+
18
+ #include <vector>
19
+ #include <faiss/Index.h>
20
+
21
+
22
+ namespace faiss {
23
+
24
+ /** Table of inverted lists
25
+ * multithreading rules:
26
+ * - concurrent read accesses are allowed
27
+ * - concurrent update accesses are allowed
28
+ * - for resize and add_entries, only concurrent access to different lists
29
+ * are allowed
30
+ */
31
+ struct InvertedLists {
32
+ typedef Index::idx_t idx_t;
33
+
34
+ size_t nlist; ///< number of possible key values
35
+ size_t code_size; ///< code size per vector in bytes
36
+
37
+ InvertedLists (size_t nlist, size_t code_size);
38
+
39
+ /*************************
40
+ * Read only functions */
41
+
42
+ /// get the size of a list
43
+ virtual size_t list_size(size_t list_no) const = 0;
44
+
45
+ /** get the codes for an inverted list
46
+ * must be released by release_codes
47
+ *
48
+ * @return codes size list_size * code_size
49
+ */
50
+ virtual const uint8_t * get_codes (size_t list_no) const = 0;
51
+
52
+ /** get the ids for an inverted list
53
+ * must be released by release_ids
54
+ *
55
+ * @return ids size list_size
56
+ */
57
+ virtual const idx_t * get_ids (size_t list_no) const = 0;
58
+
59
+ /// release codes returned by get_codes (default implementation is nop)
60
+ virtual void release_codes (size_t list_no, const uint8_t *codes) const;
61
+
62
+ /// release ids returned by get_ids
63
+ virtual void release_ids (size_t list_no, const idx_t *ids) const;
64
+
65
+ /// @return a single id in an inverted list
66
+ virtual idx_t get_single_id (size_t list_no, size_t offset) const;
67
+
68
+ /// @return a single code in an inverted list
69
+ /// (should be deallocated with release_codes)
70
+ virtual const uint8_t * get_single_code (
71
+ size_t list_no, size_t offset) const;
72
+
73
+ /// prepare the following lists (default does nothing)
74
+ /// a list can be -1 hence the signed long
75
+ virtual void prefetch_lists (const idx_t *list_nos, int nlist) const;
76
+
77
+ /*************************
78
+ * writing functions */
79
+
80
+ /// add one entry to an inverted list
81
+ virtual size_t add_entry (size_t list_no, idx_t theid,
82
+ const uint8_t *code);
83
+
84
+ virtual size_t add_entries (
85
+ size_t list_no, size_t n_entry,
86
+ const idx_t* ids, const uint8_t *code) = 0;
87
+
88
+ virtual void update_entry (size_t list_no, size_t offset,
89
+ idx_t id, const uint8_t *code);
90
+
91
+ virtual void update_entries (size_t list_no, size_t offset, size_t n_entry,
92
+ const idx_t *ids, const uint8_t *code) = 0;
93
+
94
+ virtual void resize (size_t list_no, size_t new_size) = 0;
95
+
96
+ virtual void reset ();
97
+
98
+ /// move all entries from oivf (empty on output)
99
+ void merge_from (InvertedLists *oivf, size_t add_id);
100
+
101
+ virtual ~InvertedLists ();
102
+
103
+ /*************************
104
+ * statistics */
105
+
106
+ /// 1= perfectly balanced, >1: imbalanced
107
+ double imbalance_factor () const;
108
+
109
+ /// display some stats about the inverted lists
110
+ void print_stats () const;
111
+
112
+ /// sum up list sizes
113
+ size_t compute_ntotal () const;
114
+
115
+ /**************************************
116
+ * Scoped inverted lists (for automatic deallocation)
117
+ *
118
+ * instead of writing:
119
+ *
120
+ * uint8_t * codes = invlists->get_codes (10);
121
+ * ... use codes
122
+ * invlists->release_codes(10, codes)
123
+ *
124
+ * write:
125
+ *
126
+ * ScopedCodes codes (invlists, 10);
127
+ * ... use codes.get()
128
+ * // release called automatically when codes goes out of scope
129
+ *
130
+ * the following function call also works:
131
+ *
132
+ * foo (123, ScopedCodes (invlists, 10).get(), 456);
133
+ *
134
+ */
135
+
136
+ struct ScopedIds {
137
+ const InvertedLists *il;
138
+ const idx_t *ids;
139
+ size_t list_no;
140
+
141
+ ScopedIds (const InvertedLists *il, size_t list_no):
142
+ il (il), ids (il->get_ids (list_no)), list_no (list_no)
143
+ {}
144
+
145
+ const idx_t *get() {return ids; }
146
+
147
+ idx_t operator [] (size_t i) const {
148
+ return ids[i];
149
+ }
150
+
151
+ ~ScopedIds () {
152
+ il->release_ids (list_no, ids);
153
+ }
154
+ };
155
+
156
+ struct ScopedCodes {
157
+ const InvertedLists *il;
158
+ const uint8_t *codes;
159
+ size_t list_no;
160
+
161
+ ScopedCodes (const InvertedLists *il, size_t list_no):
162
+ il (il), codes (il->get_codes (list_no)), list_no (list_no)
163
+ {}
164
+
165
+ ScopedCodes (const InvertedLists *il, size_t list_no, size_t offset):
166
+ il (il), codes (il->get_single_code (list_no, offset)),
167
+ list_no (list_no)
168
+ {}
169
+
170
+ const uint8_t *get() {return codes; }
171
+
172
+ ~ScopedCodes () {
173
+ il->release_codes (list_no, codes);
174
+ }
175
+ };
176
+
177
+
178
+ };
179
+
180
+
181
+ /// simple (default) implementation as an array of inverted lists
182
+ struct ArrayInvertedLists: InvertedLists {
183
+ std::vector < std::vector<uint8_t> > codes; // binary codes, size nlist
184
+ std::vector < std::vector<idx_t> > ids; ///< Inverted lists for indexes
185
+
186
+ ArrayInvertedLists (size_t nlist, size_t code_size);
187
+
188
+ size_t list_size(size_t list_no) const override;
189
+ const uint8_t * get_codes (size_t list_no) const override;
190
+ const idx_t * get_ids (size_t list_no) const override;
191
+
192
+ size_t add_entries (
193
+ size_t list_no, size_t n_entry,
194
+ const idx_t* ids, const uint8_t *code) override;
195
+
196
+ void update_entries (size_t list_no, size_t offset, size_t n_entry,
197
+ const idx_t *ids, const uint8_t *code) override;
198
+
199
+ void resize (size_t list_no, size_t new_size) override;
200
+
201
+ virtual ~ArrayInvertedLists ();
202
+ };
203
+
204
+ /*****************************************************************
205
+ * Meta-inverted lists
206
+ *
207
+ * About terminology: the inverted lists are seen as a sparse matrix,
208
+ * that can be stacked horizontally, vertically and sliced.
209
+ *****************************************************************/
210
+
211
+ struct ReadOnlyInvertedLists: InvertedLists {
212
+
213
+ ReadOnlyInvertedLists (size_t nlist, size_t code_size):
214
+ InvertedLists (nlist, code_size) {}
215
+
216
+ size_t add_entries (
217
+ size_t list_no, size_t n_entry,
218
+ const idx_t* ids, const uint8_t *code) override;
219
+
220
+ void update_entries (size_t list_no, size_t offset, size_t n_entry,
221
+ const idx_t *ids, const uint8_t *code) override;
222
+
223
+ void resize (size_t list_no, size_t new_size) override;
224
+
225
+ };
226
+
227
+
228
+ /// Horizontal stack of inverted lists
229
+ struct HStackInvertedLists: ReadOnlyInvertedLists {
230
+
231
+ std::vector<const InvertedLists *>ils;
232
+
233
+ /// build InvertedLists by concatenating nil of them
234
+ HStackInvertedLists (int nil, const InvertedLists **ils);
235
+
236
+ size_t list_size(size_t list_no) const override;
237
+ const uint8_t * get_codes (size_t list_no) const override;
238
+ const idx_t * get_ids (size_t list_no) const override;
239
+
240
+ void prefetch_lists (const idx_t *list_nos, int nlist) const override;
241
+
242
+ void release_codes (size_t list_no, const uint8_t *codes) const override;
243
+ void release_ids (size_t list_no, const idx_t *ids) const override;
244
+
245
+ idx_t get_single_id (size_t list_no, size_t offset) const override;
246
+
247
+ const uint8_t * get_single_code (
248
+ size_t list_no, size_t offset) const override;
249
+
250
+ };
251
+
252
+ using ConcatenatedInvertedLists = HStackInvertedLists;
253
+
254
+
255
+ /// vertical slice of indexes in another InvertedLists
256
+ struct SliceInvertedLists: ReadOnlyInvertedLists {
257
+ const InvertedLists *il;
258
+ idx_t i0, i1;
259
+
260
+ SliceInvertedLists(const InvertedLists *il, idx_t i0, idx_t i1);
261
+
262
+ size_t list_size(size_t list_no) const override;
263
+ const uint8_t * get_codes (size_t list_no) const override;
264
+ const idx_t * get_ids (size_t list_no) const override;
265
+
266
+ void release_codes (size_t list_no, const uint8_t *codes) const override;
267
+ void release_ids (size_t list_no, const idx_t *ids) const override;
268
+
269
+ idx_t get_single_id (size_t list_no, size_t offset) const override;
270
+
271
+ const uint8_t * get_single_code (
272
+ size_t list_no, size_t offset) const override;
273
+
274
+ void prefetch_lists (const idx_t *list_nos, int nlist) const override;
275
+ };
276
+
277
+
278
+ struct VStackInvertedLists: ReadOnlyInvertedLists {
279
+ std::vector<const InvertedLists *>ils;
280
+ std::vector<idx_t> cumsz;
281
+
282
+ /// build InvertedLists by concatenating nil of them
283
+ VStackInvertedLists (int nil, const InvertedLists **ils);
284
+
285
+ size_t list_size(size_t list_no) const override;
286
+ const uint8_t * get_codes (size_t list_no) const override;
287
+ const idx_t * get_ids (size_t list_no) const override;
288
+
289
+ void release_codes (size_t list_no, const uint8_t *codes) const override;
290
+ void release_ids (size_t list_no, const idx_t *ids) const override;
291
+
292
+ idx_t get_single_id (size_t list_no, size_t offset) const override;
293
+
294
+ const uint8_t * get_single_code (
295
+ size_t list_no, size_t offset) const override;
296
+
297
+ void prefetch_lists (const idx_t *list_nos, int nlist) const override;
298
+
299
+ };
300
+
301
+
302
+ /** use the first inverted lists if they are non-empty otherwise use the second
303
+ *
304
+ * This is useful if il1 has a few inverted lists that are too long,
305
+ * and that il0 has replacement lists for those, with empty lists for
306
+ * the others. */
307
+ struct MaskedInvertedLists: ReadOnlyInvertedLists {
308
+
309
+ const InvertedLists *il0;
310
+ const InvertedLists *il1;
311
+
312
+ MaskedInvertedLists (const InvertedLists *il0,
313
+ const InvertedLists *il1);
314
+
315
+ size_t list_size(size_t list_no) const override;
316
+ const uint8_t * get_codes (size_t list_no) const override;
317
+ const idx_t * get_ids (size_t list_no) const override;
318
+
319
+ void release_codes (size_t list_no, const uint8_t *codes) const override;
320
+ void release_ids (size_t list_no, const idx_t *ids) const override;
321
+
322
+ idx_t get_single_id (size_t list_no, size_t offset) const override;
323
+
324
+ const uint8_t * get_single_code (
325
+ size_t list_no, size_t offset) const override;
326
+
327
+ void prefetch_lists (const idx_t *list_nos, int nlist) const override;
328
+
329
+ };
330
+
331
+ } // namespace faiss
332
+
333
+
334
+ #endif
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Facebook, Inc. and its affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,252 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+
11
+ #include <faiss/MatrixStats.h>
12
+
13
+
14
+ #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
15
+
16
+ #include <cmath>
17
+ #include <cstdio>
18
+ #include <faiss/utils/utils.h>
19
+
20
+ namespace faiss {
21
+
22
+ /*********************************************************************
23
+ * MatrixStats
24
+ *********************************************************************/
25
+
26
+ MatrixStats::PerDimStats::PerDimStats():
27
+ n(0), n_nan(0), n_inf(0), n0(0),
28
+ min(HUGE_VALF), max(-HUGE_VALF),
29
+ sum(0), sum2(0),
30
+ mean(NAN), stddev(NAN)
31
+ {}
32
+
33
+
34
+ void MatrixStats::PerDimStats::add (float x)
35
+ {
36
+ n++;
37
+ if (std::isnan(x)) {
38
+ n_nan++;
39
+ return;
40
+ }
41
+ if (!std::isfinite(x)) {
42
+ n_inf++;
43
+ return;
44
+ }
45
+ if (x == 0) n0++;
46
+ if (x < min) min = x;
47
+ if (x > max) max = x;
48
+ sum += x;
49
+ sum2 += (double)x * (double)x;
50
+ }
51
+
52
+ void MatrixStats::PerDimStats::compute_mean_std ()
53
+ {
54
+ n_valid = n - n_nan - n_inf;
55
+ mean = sum / n_valid;
56
+ double var = sum2 / n_valid - mean * mean;
57
+ if (var < 0) var = 0;
58
+ stddev = sqrt(var);
59
+ }
60
+
61
+
62
+ void MatrixStats::do_comment (const char *fmt, ...)
63
+ {
64
+ va_list ap;
65
+
66
+ /* Determine required size */
67
+ va_start(ap, fmt);
68
+ size_t size = vsnprintf(buf, nbuf, fmt, ap);
69
+ va_end(ap);
70
+
71
+ nbuf -= size;
72
+ buf += size;
73
+ }
74
+
75
+
76
+
77
+ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
78
+ n(n), d(d),
79
+ n_collision(0), n_valid(0), n0(0),
80
+ min_norm2(HUGE_VAL), max_norm2(0)
81
+ {
82
+ std::vector<char> comment_buf (10000);
83
+ buf = comment_buf.data ();
84
+ nbuf = comment_buf.size();
85
+
86
+ do_comment ("analyzing %ld vectors of size %ld\n", n, d);
87
+
88
+ if (d > 1024) {
89
+ do_comment (
90
+ "indexing this many dimensions is hard, "
91
+ "please consider dimensionality reduction (with PCAMatrix)\n");
92
+ }
93
+
94
+ size_t nbytes = sizeof (x[0]) * d;
95
+ per_dim_stats.resize (d);
96
+
97
+ for (size_t i = 0; i < n; i++) {
98
+ const float *xi = x + d * i;
99
+ double sum2 = 0;
100
+ for (size_t j = 0; j < d; j++) {
101
+ per_dim_stats[j].add (xi[j]);
102
+ sum2 += xi[j] * (double)xi[j];
103
+ }
104
+
105
+ if (std::isfinite (sum2)) {
106
+ n_valid++;
107
+ if (sum2 == 0) {
108
+ n0 ++;
109
+ } else {
110
+ if (sum2 < min_norm2) min_norm2 = sum2;
111
+ if (sum2 > max_norm2) max_norm2 = sum2;
112
+ }
113
+ }
114
+
115
+ { // check hash
116
+ uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
117
+ auto elt = occurrences.find (hash);
118
+ if (elt == occurrences.end()) {
119
+ Occurrence occ = {i, 1};
120
+ occurrences[hash] = occ;
121
+ } else {
122
+ if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
123
+ elt->second.count ++;
124
+ } else {
125
+ n_collision ++;
126
+ // we should use a list of collisions but overkill
127
+ }
128
+ }
129
+ }
130
+ }
131
+
132
+ // invalid vector stats
133
+ if (n_valid == n) {
134
+ do_comment ("no NaN or Infs in data\n");
135
+ } else {
136
+ do_comment ("%ld vectors contain NaN or Inf "
137
+ "(or have too large components), "
138
+ "expect bad results with indexing!\n", n - n_valid);
139
+ }
140
+
141
+ // copies in dataset
142
+ if (occurrences.size() == n) {
143
+ do_comment ("all vectors are distinct\n");
144
+ } else {
145
+ do_comment ("%ld vectors are distinct (%.2f%%)\n",
146
+ occurrences.size(),
147
+ occurrences.size() * 100.0 / n);
148
+
149
+ if (n_collision > 0) {
150
+ do_comment ("%ld collisions in hash table, "
151
+ "counts may be invalid\n", n_collision);
152
+ }
153
+
154
+ Occurrence max = {0, 0};
155
+ for (auto it = occurrences.begin();
156
+ it != occurrences.end(); ++it) {
157
+ if (it->second.count > max.count) {
158
+ max = it->second;
159
+ }
160
+ }
161
+ do_comment ("vector %ld has %ld copies\n", max.first, max.count);
162
+ }
163
+
164
+ { // norm stats
165
+ min_norm2 = sqrt (min_norm2);
166
+ max_norm2 = sqrt (max_norm2);
167
+ do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n",
168
+ min_norm2, max_norm2, n0);
169
+
170
+ if (max_norm2 < min_norm2 * 1.0001) {
171
+ do_comment ("vectors are normalized, inner product and "
172
+ "L2 search are equivalent\n");
173
+ }
174
+
175
+ if (max_norm2 > min_norm2 * 100) {
176
+ do_comment ("vectors have very large differences in norms, "
177
+ "is this normal?\n");
178
+ }
179
+ }
180
+
181
+ { // per dimension stats
182
+
183
+ double max_std = 0, min_std = HUGE_VAL;
184
+
185
+ size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
186
+
187
+ for (size_t j = 0; j < d; j++) {
188
+ PerDimStats &st = per_dim_stats[j];
189
+ st.compute_mean_std ();
190
+ n0 += st.n0;
191
+
192
+ if (st.max == st.min) {
193
+ n_0_range ++;
194
+ } else if (st.max < 1.001 * st.min) {
195
+ n_dangerous_range ++;
196
+ }
197
+
198
+ if (st.stddev > max_std) max_std = st.stddev;
199
+ if (st.stddev < min_std) min_std = st.stddev;
200
+ }
201
+
202
+
203
+
204
+ if (n0 == 0) {
205
+ do_comment ("matrix contains no 0s\n");
206
+ } else {
207
+ do_comment ("matrix contains %.2f %% 0 entries\n",
208
+ n0 * 100.0 / (n * d));
209
+ }
210
+
211
+ if (n_0_range == 0) {
212
+ do_comment ("no constant dimensions\n");
213
+ } else {
214
+ do_comment ("%ld dimensions are constant: they can be removed\n",
215
+ n_0_range);
216
+ }
217
+
218
+ if (n_dangerous_range == 0) {
219
+ do_comment ("no dimension has a too large mean\n");
220
+ } else {
221
+ do_comment ("%ld dimensions are too large "
222
+ "wrt. their variance, may lose precision "
223
+ "in IndexFlatL2 (use CenteringTransform)\n",
224
+ n_dangerous_range);
225
+ }
226
+
227
+ do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
228
+
229
+ size_t n_small_var = 0;
230
+
231
+ for (size_t j = 0; j < d; j++) {
232
+ const PerDimStats &st = per_dim_stats[j];
233
+ if (st.stddev < max_std * 1e-4) {
234
+ n_small_var++;
235
+ }
236
+ }
237
+
238
+ if (n_small_var > 0) {
239
+ do_comment ("%ld dimensions have negligible stddev wrt. "
240
+ "the largest dimension, they could be ignored",
241
+ n_small_var);
242
+ }
243
+
244
+ }
245
+ comments = comment_buf.data ();
246
+ buf = nullptr;
247
+ nbuf = 0;
248
+ }
249
+
250
+
251
+
252
+ } // namespace faiss