faiss 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +103 -3
  4. data/ext/faiss/ext.cpp +99 -32
  5. data/ext/faiss/extconf.rb +12 -2
  6. data/lib/faiss/ext.bundle +0 -0
  7. data/lib/faiss/index.rb +3 -3
  8. data/lib/faiss/index_binary.rb +3 -3
  9. data/lib/faiss/kmeans.rb +1 -1
  10. data/lib/faiss/pca_matrix.rb +2 -2
  11. data/lib/faiss/product_quantizer.rb +3 -3
  12. data/lib/faiss/version.rb +1 -1
  13. data/vendor/faiss/AutoTune.cpp +719 -0
  14. data/vendor/faiss/AutoTune.h +212 -0
  15. data/vendor/faiss/Clustering.cpp +261 -0
  16. data/vendor/faiss/Clustering.h +101 -0
  17. data/vendor/faiss/IVFlib.cpp +339 -0
  18. data/vendor/faiss/IVFlib.h +132 -0
  19. data/vendor/faiss/Index.cpp +171 -0
  20. data/vendor/faiss/Index.h +261 -0
  21. data/vendor/faiss/Index2Layer.cpp +437 -0
  22. data/vendor/faiss/Index2Layer.h +85 -0
  23. data/vendor/faiss/IndexBinary.cpp +77 -0
  24. data/vendor/faiss/IndexBinary.h +163 -0
  25. data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
  26. data/vendor/faiss/IndexBinaryFlat.h +54 -0
  27. data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
  28. data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
  29. data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
  30. data/vendor/faiss/IndexBinaryHNSW.h +56 -0
  31. data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
  32. data/vendor/faiss/IndexBinaryIVF.h +211 -0
  33. data/vendor/faiss/IndexFlat.cpp +508 -0
  34. data/vendor/faiss/IndexFlat.h +175 -0
  35. data/vendor/faiss/IndexHNSW.cpp +1090 -0
  36. data/vendor/faiss/IndexHNSW.h +170 -0
  37. data/vendor/faiss/IndexIVF.cpp +909 -0
  38. data/vendor/faiss/IndexIVF.h +353 -0
  39. data/vendor/faiss/IndexIVFFlat.cpp +502 -0
  40. data/vendor/faiss/IndexIVFFlat.h +118 -0
  41. data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
  42. data/vendor/faiss/IndexIVFPQ.h +161 -0
  43. data/vendor/faiss/IndexIVFPQR.cpp +219 -0
  44. data/vendor/faiss/IndexIVFPQR.h +65 -0
  45. data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
  46. data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
  47. data/vendor/faiss/IndexLSH.cpp +225 -0
  48. data/vendor/faiss/IndexLSH.h +87 -0
  49. data/vendor/faiss/IndexLattice.cpp +143 -0
  50. data/vendor/faiss/IndexLattice.h +68 -0
  51. data/vendor/faiss/IndexPQ.cpp +1188 -0
  52. data/vendor/faiss/IndexPQ.h +199 -0
  53. data/vendor/faiss/IndexPreTransform.cpp +288 -0
  54. data/vendor/faiss/IndexPreTransform.h +91 -0
  55. data/vendor/faiss/IndexReplicas.cpp +123 -0
  56. data/vendor/faiss/IndexReplicas.h +76 -0
  57. data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
  58. data/vendor/faiss/IndexScalarQuantizer.h +127 -0
  59. data/vendor/faiss/IndexShards.cpp +317 -0
  60. data/vendor/faiss/IndexShards.h +100 -0
  61. data/vendor/faiss/InvertedLists.cpp +623 -0
  62. data/vendor/faiss/InvertedLists.h +334 -0
  63. data/vendor/faiss/LICENSE +21 -0
  64. data/vendor/faiss/MatrixStats.cpp +252 -0
  65. data/vendor/faiss/MatrixStats.h +62 -0
  66. data/vendor/faiss/MetaIndexes.cpp +351 -0
  67. data/vendor/faiss/MetaIndexes.h +126 -0
  68. data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
  69. data/vendor/faiss/OnDiskInvertedLists.h +127 -0
  70. data/vendor/faiss/VectorTransform.cpp +1157 -0
  71. data/vendor/faiss/VectorTransform.h +322 -0
  72. data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
  73. data/vendor/faiss/c_api/AutoTune_c.h +64 -0
  74. data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
  75. data/vendor/faiss/c_api/Clustering_c.h +117 -0
  76. data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
  77. data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
  78. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
  79. data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
  80. data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
  81. data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
  82. data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
  83. data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
  84. data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
  85. data/vendor/faiss/c_api/IndexShards_c.h +42 -0
  86. data/vendor/faiss/c_api/Index_c.cpp +105 -0
  87. data/vendor/faiss/c_api/Index_c.h +183 -0
  88. data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
  89. data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
  90. data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
  91. data/vendor/faiss/c_api/clone_index_c.h +32 -0
  92. data/vendor/faiss/c_api/error_c.h +42 -0
  93. data/vendor/faiss/c_api/error_impl.cpp +27 -0
  94. data/vendor/faiss/c_api/error_impl.h +16 -0
  95. data/vendor/faiss/c_api/faiss_c.h +58 -0
  96. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
  97. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
  98. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
  99. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
  100. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
  101. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
  102. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
  103. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
  104. data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
  105. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
  106. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
  107. data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
  108. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
  109. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
  110. data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
  111. data/vendor/faiss/c_api/index_factory_c.h +30 -0
  112. data/vendor/faiss/c_api/index_io_c.cpp +42 -0
  113. data/vendor/faiss/c_api/index_io_c.h +50 -0
  114. data/vendor/faiss/c_api/macros_impl.h +110 -0
  115. data/vendor/faiss/clone_index.cpp +147 -0
  116. data/vendor/faiss/clone_index.h +38 -0
  117. data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
  118. data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
  119. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
  120. data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
  121. data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
  122. data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
  123. data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
  124. data/vendor/faiss/gpu/GpuCloner.h +82 -0
  125. data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
  126. data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
  127. data/vendor/faiss/gpu/GpuDistance.h +52 -0
  128. data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
  129. data/vendor/faiss/gpu/GpuIndex.h +148 -0
  130. data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
  131. data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
  132. data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
  133. data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
  134. data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
  135. data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
  136. data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
  137. data/vendor/faiss/gpu/GpuResources.cpp +52 -0
  138. data/vendor/faiss/gpu/GpuResources.h +73 -0
  139. data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
  140. data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
  141. data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
  142. data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
  143. data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
  144. data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
  145. data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
  146. data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
  147. data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
  148. data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
  149. data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
  150. data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
  151. data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
  152. data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
  153. data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
  154. data/vendor/faiss/gpu/test/TestUtils.h +93 -0
  155. data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
  156. data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
  157. data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
  158. data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
  159. data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
  160. data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
  161. data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
  162. data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
  163. data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
  164. data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
  165. data/vendor/faiss/gpu/utils/Timer.h +52 -0
  166. data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
  167. data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
  168. data/vendor/faiss/impl/FaissAssert.h +95 -0
  169. data/vendor/faiss/impl/FaissException.cpp +66 -0
  170. data/vendor/faiss/impl/FaissException.h +71 -0
  171. data/vendor/faiss/impl/HNSW.cpp +818 -0
  172. data/vendor/faiss/impl/HNSW.h +275 -0
  173. data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
  174. data/vendor/faiss/impl/PolysemousTraining.h +158 -0
  175. data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
  176. data/vendor/faiss/impl/ProductQuantizer.h +242 -0
  177. data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
  178. data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
  179. data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
  180. data/vendor/faiss/impl/ThreadedIndex.h +80 -0
  181. data/vendor/faiss/impl/index_read.cpp +793 -0
  182. data/vendor/faiss/impl/index_write.cpp +558 -0
  183. data/vendor/faiss/impl/io.cpp +142 -0
  184. data/vendor/faiss/impl/io.h +98 -0
  185. data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
  186. data/vendor/faiss/impl/lattice_Zn.h +199 -0
  187. data/vendor/faiss/index_factory.cpp +392 -0
  188. data/vendor/faiss/index_factory.h +25 -0
  189. data/vendor/faiss/index_io.h +75 -0
  190. data/vendor/faiss/misc/test_blas.cpp +84 -0
  191. data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
  192. data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
  193. data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
  194. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
  195. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
  196. data/vendor/faiss/tests/test_merge.cpp +258 -0
  197. data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
  198. data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
  199. data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
  200. data/vendor/faiss/tests/test_params_override.cpp +231 -0
  201. data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
  202. data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
  203. data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
  204. data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
  205. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
  206. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
  207. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
  208. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
  209. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
  210. data/vendor/faiss/utils/Heap.cpp +122 -0
  211. data/vendor/faiss/utils/Heap.h +495 -0
  212. data/vendor/faiss/utils/WorkerThread.cpp +126 -0
  213. data/vendor/faiss/utils/WorkerThread.h +61 -0
  214. data/vendor/faiss/utils/distances.cpp +765 -0
  215. data/vendor/faiss/utils/distances.h +243 -0
  216. data/vendor/faiss/utils/distances_simd.cpp +809 -0
  217. data/vendor/faiss/utils/extra_distances.cpp +336 -0
  218. data/vendor/faiss/utils/extra_distances.h +54 -0
  219. data/vendor/faiss/utils/hamming-inl.h +472 -0
  220. data/vendor/faiss/utils/hamming.cpp +792 -0
  221. data/vendor/faiss/utils/hamming.h +220 -0
  222. data/vendor/faiss/utils/random.cpp +192 -0
  223. data/vendor/faiss/utils/random.h +60 -0
  224. data/vendor/faiss/utils/utils.cpp +783 -0
  225. data/vendor/faiss/utils/utils.h +181 -0
  226. metadata +216 -2
@@ -0,0 +1,334 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ #ifndef FAISS_INVERTEDLISTS_IVF_H
11
+ #define FAISS_INVERTEDLISTS_IVF_H
12
+
13
+ /**
14
+ * Definition of inverted lists + a few common classes that implement
15
+ * the interface.
16
+ */
17
+
18
+ #include <vector>
19
+ #include <faiss/Index.h>
20
+
21
+
22
+ namespace faiss {
23
+
24
+ /** Table of inverted lists
25
+ * multithreading rules:
26
+ * - concurrent read accesses are allowed
27
+ * - concurrent update accesses are allowed
28
+ * - for resize and add_entries, only concurrent access to different lists
29
+ * are allowed
30
+ */
31
+ struct InvertedLists {
32
+ typedef Index::idx_t idx_t;
33
+
34
+ size_t nlist; ///< number of possible key values
35
+ size_t code_size; ///< code size per vector in bytes
36
+
37
+ InvertedLists (size_t nlist, size_t code_size);
38
+
39
+ /*************************
40
+ * Read only functions */
41
+
42
+ /// get the size of a list
43
+ virtual size_t list_size(size_t list_no) const = 0;
44
+
45
+ /** get the codes for an inverted list
46
+ * must be released by release_codes
47
+ *
48
+ * @return codes size list_size * code_size
49
+ */
50
+ virtual const uint8_t * get_codes (size_t list_no) const = 0;
51
+
52
+ /** get the ids for an inverted list
53
+ * must be released by release_ids
54
+ *
55
+ * @return ids size list_size
56
+ */
57
+ virtual const idx_t * get_ids (size_t list_no) const = 0;
58
+
59
+ /// release codes returned by get_codes (default implementation is nop
60
+ virtual void release_codes (size_t list_no, const uint8_t *codes) const;
61
+
62
+ /// release ids returned by get_ids
63
+ virtual void release_ids (size_t list_no, const idx_t *ids) const;
64
+
65
+ /// @return a single id in an inverted list
66
+ virtual idx_t get_single_id (size_t list_no, size_t offset) const;
67
+
68
+ /// @return a single code in an inverted list
69
+ /// (should be deallocated with release_codes)
70
+ virtual const uint8_t * get_single_code (
71
+ size_t list_no, size_t offset) const;
72
+
73
+ /// prepare the following lists (default does nothing)
74
+ /// a list can be -1 hence the signed long
75
+ virtual void prefetch_lists (const idx_t *list_nos, int nlist) const;
76
+
77
+ /*************************
78
+ * writing functions */
79
+
80
+ /// add one entry to an inverted list
81
+ virtual size_t add_entry (size_t list_no, idx_t theid,
82
+ const uint8_t *code);
83
+
84
+ virtual size_t add_entries (
85
+ size_t list_no, size_t n_entry,
86
+ const idx_t* ids, const uint8_t *code) = 0;
87
+
88
+ virtual void update_entry (size_t list_no, size_t offset,
89
+ idx_t id, const uint8_t *code);
90
+
91
+ virtual void update_entries (size_t list_no, size_t offset, size_t n_entry,
92
+ const idx_t *ids, const uint8_t *code) = 0;
93
+
94
+ virtual void resize (size_t list_no, size_t new_size) = 0;
95
+
96
+ virtual void reset ();
97
+
98
+ /// move all entries from oivf (empty on output)
99
+ void merge_from (InvertedLists *oivf, size_t add_id);
100
+
101
+ virtual ~InvertedLists ();
102
+
103
+ /*************************
104
+ * statistics */
105
+
106
+ /// 1= perfectly balanced, >1: imbalanced
107
+ double imbalance_factor () const;
108
+
109
+ /// display some stats about the inverted lists
110
+ void print_stats () const;
111
+
112
+ /// sum up list sizes
113
+ size_t compute_ntotal () const;
114
+
115
+ /**************************************
116
+ * Scoped inverted lists (for automatic deallocation)
117
+ *
118
+ * instead of writing:
119
+ *
120
+ * uint8_t * codes = invlists->get_codes (10);
121
+ * ... use codes
122
+ * invlists->release_codes(10, codes)
123
+ *
124
+ * write:
125
+ *
126
+ * ScopedCodes codes (invlists, 10);
127
+ * ... use codes.get()
128
+ * // release called automatically when codes goes out of scope
129
+ *
130
+ * the following function call also works:
131
+ *
132
+ * foo (123, ScopedCodes (invlists, 10).get(), 456);
133
+ *
134
+ */
135
+
136
+ struct ScopedIds {
137
+ const InvertedLists *il;
138
+ const idx_t *ids;
139
+ size_t list_no;
140
+
141
+ ScopedIds (const InvertedLists *il, size_t list_no):
142
+ il (il), ids (il->get_ids (list_no)), list_no (list_no)
143
+ {}
144
+
145
+ const idx_t *get() {return ids; }
146
+
147
+ idx_t operator [] (size_t i) const {
148
+ return ids[i];
149
+ }
150
+
151
+ ~ScopedIds () {
152
+ il->release_ids (list_no, ids);
153
+ }
154
+ };
155
+
156
+ struct ScopedCodes {
157
+ const InvertedLists *il;
158
+ const uint8_t *codes;
159
+ size_t list_no;
160
+
161
+ ScopedCodes (const InvertedLists *il, size_t list_no):
162
+ il (il), codes (il->get_codes (list_no)), list_no (list_no)
163
+ {}
164
+
165
+ ScopedCodes (const InvertedLists *il, size_t list_no, size_t offset):
166
+ il (il), codes (il->get_single_code (list_no, offset)),
167
+ list_no (list_no)
168
+ {}
169
+
170
+ const uint8_t *get() {return codes; }
171
+
172
+ ~ScopedCodes () {
173
+ il->release_codes (list_no, codes);
174
+ }
175
+ };
176
+
177
+
178
+ };
179
+
180
+
181
+ /// simple (default) implementation as an array of inverted lists
182
+ struct ArrayInvertedLists: InvertedLists {
183
+ std::vector < std::vector<uint8_t> > codes; // binary codes, size nlist
184
+ std::vector < std::vector<idx_t> > ids; ///< Inverted lists for indexes
185
+
186
+ ArrayInvertedLists (size_t nlist, size_t code_size);
187
+
188
+ size_t list_size(size_t list_no) const override;
189
+ const uint8_t * get_codes (size_t list_no) const override;
190
+ const idx_t * get_ids (size_t list_no) const override;
191
+
192
+ size_t add_entries (
193
+ size_t list_no, size_t n_entry,
194
+ const idx_t* ids, const uint8_t *code) override;
195
+
196
+ void update_entries (size_t list_no, size_t offset, size_t n_entry,
197
+ const idx_t *ids, const uint8_t *code) override;
198
+
199
+ void resize (size_t list_no, size_t new_size) override;
200
+
201
+ virtual ~ArrayInvertedLists ();
202
+ };
203
+
204
+ /*****************************************************************
205
+ * Meta-inverted lists
206
+ *
207
+ * About terminology: the inverted lists are seen as a sparse matrix,
208
+ * that can be stacked horizontally, vertically and sliced.
209
+ *****************************************************************/
210
+
211
+ struct ReadOnlyInvertedLists: InvertedLists {
212
+
213
+ ReadOnlyInvertedLists (size_t nlist, size_t code_size):
214
+ InvertedLists (nlist, code_size) {}
215
+
216
+ size_t add_entries (
217
+ size_t list_no, size_t n_entry,
218
+ const idx_t* ids, const uint8_t *code) override;
219
+
220
+ void update_entries (size_t list_no, size_t offset, size_t n_entry,
221
+ const idx_t *ids, const uint8_t *code) override;
222
+
223
+ void resize (size_t list_no, size_t new_size) override;
224
+
225
+ };
226
+
227
+
228
+ /// Horizontal stack of inverted lists
229
+ struct HStackInvertedLists: ReadOnlyInvertedLists {
230
+
231
+ std::vector<const InvertedLists *>ils;
232
+
233
+ /// build InvertedLists by concatenating nil of them
234
+ HStackInvertedLists (int nil, const InvertedLists **ils);
235
+
236
+ size_t list_size(size_t list_no) const override;
237
+ const uint8_t * get_codes (size_t list_no) const override;
238
+ const idx_t * get_ids (size_t list_no) const override;
239
+
240
+ void prefetch_lists (const idx_t *list_nos, int nlist) const override;
241
+
242
+ void release_codes (size_t list_no, const uint8_t *codes) const override;
243
+ void release_ids (size_t list_no, const idx_t *ids) const override;
244
+
245
+ idx_t get_single_id (size_t list_no, size_t offset) const override;
246
+
247
+ const uint8_t * get_single_code (
248
+ size_t list_no, size_t offset) const override;
249
+
250
+ };
251
+
252
+ using ConcatenatedInvertedLists = HStackInvertedLists;
253
+
254
+
255
+ /// vertical slice of indexes in another InvertedLists
256
+ struct SliceInvertedLists: ReadOnlyInvertedLists {
257
+ const InvertedLists *il;
258
+ idx_t i0, i1;
259
+
260
+ SliceInvertedLists(const InvertedLists *il, idx_t i0, idx_t i1);
261
+
262
+ size_t list_size(size_t list_no) const override;
263
+ const uint8_t * get_codes (size_t list_no) const override;
264
+ const idx_t * get_ids (size_t list_no) const override;
265
+
266
+ void release_codes (size_t list_no, const uint8_t *codes) const override;
267
+ void release_ids (size_t list_no, const idx_t *ids) const override;
268
+
269
+ idx_t get_single_id (size_t list_no, size_t offset) const override;
270
+
271
+ const uint8_t * get_single_code (
272
+ size_t list_no, size_t offset) const override;
273
+
274
+ void prefetch_lists (const idx_t *list_nos, int nlist) const override;
275
+ };
276
+
277
+
278
+ struct VStackInvertedLists: ReadOnlyInvertedLists {
279
+ std::vector<const InvertedLists *>ils;
280
+ std::vector<idx_t> cumsz;
281
+
282
+ /// build InvertedLists by concatenating nil of them
283
+ VStackInvertedLists (int nil, const InvertedLists **ils);
284
+
285
+ size_t list_size(size_t list_no) const override;
286
+ const uint8_t * get_codes (size_t list_no) const override;
287
+ const idx_t * get_ids (size_t list_no) const override;
288
+
289
+ void release_codes (size_t list_no, const uint8_t *codes) const override;
290
+ void release_ids (size_t list_no, const idx_t *ids) const override;
291
+
292
+ idx_t get_single_id (size_t list_no, size_t offset) const override;
293
+
294
+ const uint8_t * get_single_code (
295
+ size_t list_no, size_t offset) const override;
296
+
297
+ void prefetch_lists (const idx_t *list_nos, int nlist) const override;
298
+
299
+ };
300
+
301
+
302
+ /** use the first inverted lists if they are non-empty otherwise use the second
303
+ *
304
+ * This is useful if il1 has a few inverted lists that are too long,
305
+ * and that il0 has replacement lists for those, with empty lists for
306
+ * the others. */
307
+ struct MaskedInvertedLists: ReadOnlyInvertedLists {
308
+
309
+ const InvertedLists *il0;
310
+ const InvertedLists *il1;
311
+
312
+ MaskedInvertedLists (const InvertedLists *il0,
313
+ const InvertedLists *il1);
314
+
315
+ size_t list_size(size_t list_no) const override;
316
+ const uint8_t * get_codes (size_t list_no) const override;
317
+ const idx_t * get_ids (size_t list_no) const override;
318
+
319
+ void release_codes (size_t list_no, const uint8_t *codes) const override;
320
+ void release_ids (size_t list_no, const idx_t *ids) const override;
321
+
322
+ idx_t get_single_id (size_t list_no, size_t offset) const override;
323
+
324
+ const uint8_t * get_single_code (
325
+ size_t list_no, size_t offset) const override;
326
+
327
+ void prefetch_lists (const idx_t *list_nos, int nlist) const override;
328
+
329
+ };
330
+
331
+ } // namespace faiss
332
+
333
+
334
+ #endif
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Facebook, Inc. and its affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,252 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+
11
+ #include <faiss/MatrixStats.h>
12
+
13
+
14
+ #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
15
+
16
+ #include <cmath>
17
+ #include <cstdio>
18
+ #include <faiss/utils/utils.h>
19
+
20
+ namespace faiss {
21
+
22
+ /*********************************************************************
23
+ * MatrixStats
24
+ *********************************************************************/
25
+
26
+ MatrixStats::PerDimStats::PerDimStats():
27
+ n(0), n_nan(0), n_inf(0), n0(0),
28
+ min(HUGE_VALF), max(-HUGE_VALF),
29
+ sum(0), sum2(0),
30
+ mean(NAN), stddev(NAN)
31
+ {}
32
+
33
+
34
+ void MatrixStats::PerDimStats::add (float x)
35
+ {
36
+ n++;
37
+ if (std::isnan(x)) {
38
+ n_nan++;
39
+ return;
40
+ }
41
+ if (!std::isfinite(x)) {
42
+ n_inf++;
43
+ return;
44
+ }
45
+ if (x == 0) n0++;
46
+ if (x < min) min = x;
47
+ if (x > max) max = x;
48
+ sum += x;
49
+ sum2 += (double)x * (double)x;
50
+ }
51
+
52
+ void MatrixStats::PerDimStats::compute_mean_std ()
53
+ {
54
+ n_valid = n - n_nan - n_inf;
55
+ mean = sum / n_valid;
56
+ double var = sum2 / n_valid - mean * mean;
57
+ if (var < 0) var = 0;
58
+ stddev = sqrt(var);
59
+ }
60
+
61
+
62
+ void MatrixStats::do_comment (const char *fmt, ...)
63
+ {
64
+ va_list ap;
65
+
66
+ /* Determine required size */
67
+ va_start(ap, fmt);
68
+ size_t size = vsnprintf(buf, nbuf, fmt, ap);
69
+ va_end(ap);
70
+
71
+ nbuf -= size;
72
+ buf += size;
73
+ }
74
+
75
+
76
+
77
+ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
78
+ n(n), d(d),
79
+ n_collision(0), n_valid(0), n0(0),
80
+ min_norm2(HUGE_VAL), max_norm2(0)
81
+ {
82
+ std::vector<char> comment_buf (10000);
83
+ buf = comment_buf.data ();
84
+ nbuf = comment_buf.size();
85
+
86
+ do_comment ("analyzing %ld vectors of size %ld\n", n, d);
87
+
88
+ if (d > 1024) {
89
+ do_comment (
90
+ "indexing this many dimensions is hard, "
91
+ "please consider dimensionality reducution (with PCAMatrix)\n");
92
+ }
93
+
94
+ size_t nbytes = sizeof (x[0]) * d;
95
+ per_dim_stats.resize (d);
96
+
97
+ for (size_t i = 0; i < n; i++) {
98
+ const float *xi = x + d * i;
99
+ double sum2 = 0;
100
+ for (size_t j = 0; j < d; j++) {
101
+ per_dim_stats[j].add (xi[j]);
102
+ sum2 += xi[j] * (double)xi[j];
103
+ }
104
+
105
+ if (std::isfinite (sum2)) {
106
+ n_valid++;
107
+ if (sum2 == 0) {
108
+ n0 ++;
109
+ } else {
110
+ if (sum2 < min_norm2) min_norm2 = sum2;
111
+ if (sum2 > max_norm2) max_norm2 = sum2;
112
+ }
113
+ }
114
+
115
+ { // check hash
116
+ uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
117
+ auto elt = occurrences.find (hash);
118
+ if (elt == occurrences.end()) {
119
+ Occurrence occ = {i, 1};
120
+ occurrences[hash] = occ;
121
+ } else {
122
+ if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
123
+ elt->second.count ++;
124
+ } else {
125
+ n_collision ++;
126
+ // we should use a list of collisions but overkill
127
+ }
128
+ }
129
+ }
130
+ }
131
+
132
+ // invalid vecor stats
133
+ if (n_valid == n) {
134
+ do_comment ("no NaN or Infs in data\n");
135
+ } else {
136
+ do_comment ("%ld vectors contain NaN or Inf "
137
+ "(or have too large components), "
138
+ "expect bad results with indexing!\n", n - n_valid);
139
+ }
140
+
141
+ // copies in dataset
142
+ if (occurrences.size() == n) {
143
+ do_comment ("all vectors are distinct\n");
144
+ } else {
145
+ do_comment ("%ld vectors are distinct (%.2f%%)\n",
146
+ occurrences.size(),
147
+ occurrences.size() * 100.0 / n);
148
+
149
+ if (n_collision > 0) {
150
+ do_comment ("%ld collisions in hash table, "
151
+ "counts may be invalid\n", n_collision);
152
+ }
153
+
154
+ Occurrence max = {0, 0};
155
+ for (auto it = occurrences.begin();
156
+ it != occurrences.end(); ++it) {
157
+ if (it->second.count > max.count) {
158
+ max = it->second;
159
+ }
160
+ }
161
+ do_comment ("vector %ld has %ld copies\n", max.first, max.count);
162
+ }
163
+
164
+ { // norm stats
165
+ min_norm2 = sqrt (min_norm2);
166
+ max_norm2 = sqrt (max_norm2);
167
+ do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n",
168
+ min_norm2, max_norm2, n0);
169
+
170
+ if (max_norm2 < min_norm2 * 1.0001) {
171
+ do_comment ("vectors are normalized, inner product and "
172
+ "L2 search are equivalent\n");
173
+ }
174
+
175
+ if (max_norm2 > min_norm2 * 100) {
176
+ do_comment ("vectors have very large differences in norms, "
177
+ "is this normal?\n");
178
+ }
179
+ }
180
+
181
+ { // per dimension stats
182
+
183
+ double max_std = 0, min_std = HUGE_VAL;
184
+
185
+ size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
186
+
187
+ for (size_t j = 0; j < d; j++) {
188
+ PerDimStats &st = per_dim_stats[j];
189
+ st.compute_mean_std ();
190
+ n0 += st.n0;
191
+
192
+ if (st.max == st.min) {
193
+ n_0_range ++;
194
+ } else if (st.max < 1.001 * st.min) {
195
+ n_dangerous_range ++;
196
+ }
197
+
198
+ if (st.stddev > max_std) max_std = st.stddev;
199
+ if (st.stddev < min_std) min_std = st.stddev;
200
+ }
201
+
202
+
203
+
204
+ if (n0 == 0) {
205
+ do_comment ("matrix contains no 0s\n");
206
+ } else {
207
+ do_comment ("matrix contains %.2f %% 0 entries\n",
208
+ n0 * 100.0 / (n * d));
209
+ }
210
+
211
+ if (n_0_range == 0) {
212
+ do_comment ("no constant dimensions\n");
213
+ } else {
214
+ do_comment ("%ld dimensions are constant: they can be removed\n",
215
+ n_0_range);
216
+ }
217
+
218
+ if (n_dangerous_range == 0) {
219
+ do_comment ("no dimension has a too large mean\n");
220
+ } else {
221
+ do_comment ("%ld dimensions are too large "
222
+ "wrt. their variance, may loose precision "
223
+ "in IndexFlatL2 (use CenteringTransform)\n",
224
+ n_dangerous_range);
225
+ }
226
+
227
+ do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
228
+
229
+ size_t n_small_var = 0;
230
+
231
+ for (size_t j = 0; j < d; j++) {
232
+ const PerDimStats &st = per_dim_stats[j];
233
+ if (st.stddev < max_std * 1e-4) {
234
+ n_small_var++;
235
+ }
236
+ }
237
+
238
+ if (n_small_var > 0) {
239
+ do_comment ("%ld dimensions have negligible stddev wrt. "
240
+ "the largest dimension, they could be ignored",
241
+ n_small_var);
242
+ }
243
+
244
+ }
245
+ comments = comment_buf.data ();
246
+ buf = nullptr;
247
+ nbuf = 0;
248
+ }
249
+
250
+
251
+
252
+ } // namespace faiss