faiss 0.1.7 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (219) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -7
  4. data/ext/faiss/ext.cpp +1 -1
  5. data/ext/faiss/extconf.rb +8 -2
  6. data/ext/faiss/index.cpp +102 -69
  7. data/ext/faiss/index_binary.cpp +24 -30
  8. data/ext/faiss/kmeans.cpp +20 -16
  9. data/ext/faiss/numo.hpp +867 -0
  10. data/ext/faiss/pca_matrix.cpp +13 -14
  11. data/ext/faiss/product_quantizer.cpp +23 -24
  12. data/ext/faiss/utils.cpp +10 -37
  13. data/ext/faiss/utils.h +2 -13
  14. data/lib/faiss/version.rb +1 -1
  15. data/lib/faiss.rb +0 -5
  16. data/vendor/faiss/faiss/AutoTune.cpp +292 -291
  17. data/vendor/faiss/faiss/AutoTune.h +55 -56
  18. data/vendor/faiss/faiss/Clustering.cpp +334 -195
  19. data/vendor/faiss/faiss/Clustering.h +88 -35
  20. data/vendor/faiss/faiss/IVFlib.cpp +171 -195
  21. data/vendor/faiss/faiss/IVFlib.h +48 -51
  22. data/vendor/faiss/faiss/Index.cpp +85 -103
  23. data/vendor/faiss/faiss/Index.h +54 -48
  24. data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
  25. data/vendor/faiss/faiss/Index2Layer.h +22 -22
  26. data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
  27. data/vendor/faiss/faiss/IndexBinary.h +140 -132
  28. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
  29. data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
  30. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
  31. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
  32. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
  33. data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
  34. data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
  35. data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
  36. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
  37. data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
  38. data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
  39. data/vendor/faiss/faiss/IndexFlat.h +35 -46
  40. data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
  41. data/vendor/faiss/faiss/IndexHNSW.h +57 -41
  42. data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
  43. data/vendor/faiss/faiss/IndexIVF.h +146 -113
  44. data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
  45. data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
  46. data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
  47. data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
  48. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
  49. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
  50. data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
  51. data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
  52. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
  53. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
  54. data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
  55. data/vendor/faiss/faiss/IndexLSH.h +21 -26
  56. data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
  57. data/vendor/faiss/faiss/IndexLattice.h +11 -16
  58. data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
  59. data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
  60. data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
  61. data/vendor/faiss/faiss/IndexNSG.h +85 -0
  62. data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
  63. data/vendor/faiss/faiss/IndexPQ.h +64 -67
  64. data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
  65. data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
  66. data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
  67. data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
  68. data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
  69. data/vendor/faiss/faiss/IndexRefine.h +22 -23
  70. data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
  71. data/vendor/faiss/faiss/IndexReplicas.h +62 -56
  72. data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
  73. data/vendor/faiss/faiss/IndexResidual.h +152 -0
  74. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
  75. data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
  76. data/vendor/faiss/faiss/IndexShards.cpp +256 -240
  77. data/vendor/faiss/faiss/IndexShards.h +85 -73
  78. data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
  79. data/vendor/faiss/faiss/MatrixStats.h +7 -10
  80. data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
  81. data/vendor/faiss/faiss/MetaIndexes.h +40 -34
  82. data/vendor/faiss/faiss/MetricType.h +7 -7
  83. data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
  84. data/vendor/faiss/faiss/VectorTransform.h +61 -89
  85. data/vendor/faiss/faiss/clone_index.cpp +77 -73
  86. data/vendor/faiss/faiss/clone_index.h +4 -9
  87. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
  88. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
  89. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
  90. data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
  91. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
  92. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
  93. data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
  94. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
  95. data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
  96. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
  97. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
  98. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
  99. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
  100. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
  101. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
  102. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
  103. data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
  104. data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
  106. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
  107. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
  108. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
  109. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
  110. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
  111. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
  112. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
  113. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
  114. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
  115. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
  116. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
  117. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
  118. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
  119. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
  120. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
  121. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
  122. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
  123. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
  124. data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
  125. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
  126. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
  127. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
  128. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
  129. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
  130. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
  131. data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
  132. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
  133. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
  134. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
  135. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
  136. data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
  137. data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
  138. data/vendor/faiss/faiss/impl/FaissException.h +41 -29
  139. data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
  140. data/vendor/faiss/faiss/impl/HNSW.h +179 -200
  141. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
  142. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
  143. data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
  144. data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
  145. data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
  146. data/vendor/faiss/faiss/impl/NSG.h +199 -0
  147. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
  148. data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
  149. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
  150. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
  151. data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
  152. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
  153. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
  154. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
  155. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
  156. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
  157. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
  158. data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
  159. data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
  160. data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
  161. data/vendor/faiss/faiss/impl/io.cpp +75 -94
  162. data/vendor/faiss/faiss/impl/io.h +31 -41
  163. data/vendor/faiss/faiss/impl/io_macros.h +40 -29
  164. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
  165. data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
  166. data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
  167. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
  168. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
  169. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
  170. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
  171. data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
  172. data/vendor/faiss/faiss/index_factory.cpp +269 -218
  173. data/vendor/faiss/faiss/index_factory.h +6 -7
  174. data/vendor/faiss/faiss/index_io.h +23 -26
  175. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
  176. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
  177. data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
  178. data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
  179. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
  180. data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
  181. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
  182. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
  183. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
  184. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
  185. data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
  186. data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
  187. data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
  188. data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
  189. data/vendor/faiss/faiss/utils/Heap.h +186 -209
  190. data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
  191. data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
  192. data/vendor/faiss/faiss/utils/distances.cpp +301 -310
  193. data/vendor/faiss/faiss/utils/distances.h +133 -118
  194. data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
  195. data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
  196. data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
  197. data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
  198. data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
  199. data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
  200. data/vendor/faiss/faiss/utils/hamming.h +62 -85
  201. data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
  202. data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
  203. data/vendor/faiss/faiss/utils/partitioning.h +26 -21
  204. data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
  205. data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
  206. data/vendor/faiss/faiss/utils/random.cpp +39 -63
  207. data/vendor/faiss/faiss/utils/random.h +13 -16
  208. data/vendor/faiss/faiss/utils/simdlib.h +4 -2
  209. data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
  210. data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
  211. data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
  212. data/vendor/faiss/faiss/utils/utils.cpp +304 -287
  213. data/vendor/faiss/faiss/utils/utils.h +53 -48
  214. metadata +26 -12
  215. data/lib/faiss/index.rb +0 -20
  216. data/lib/faiss/index_binary.rb +0 -20
  217. data/lib/faiss/kmeans.rb +0 -15
  218. data/lib/faiss/pca_matrix.rb +0 -15
  219. data/lib/faiss/product_quantizer.rb +0 -22
@@ -18,82 +18,94 @@ namespace faiss {
18
18
  */
19
19
  template <typename IndexT>
20
20
  struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
21
- using idx_t = typename IndexT::idx_t;
22
- using component_t = typename IndexT::component_t;
23
- using distance_t = typename IndexT::distance_t;
24
-
25
- /**
26
- * The dimension that all sub-indices must share will be the dimension of the
27
- * first sub-index added
28
- *
29
- * @param threaded do we use one thread per sub_index or do
30
- * queries sequentially?
31
- * @param successive_ids should we shift the returned ids by
32
- * the size of each sub-index or return them
33
- * as they are?
34
- */
35
- explicit IndexShardsTemplate(bool threaded = false,
36
- bool successive_ids = true);
37
-
38
- /**
39
- * @param threaded do we use one thread per sub_index or do
40
- * queries sequentially?
41
- * @param successive_ids should we shift the returned ids by
42
- * the size of each sub-index or return them
43
- * as they are?
44
- */
45
- explicit IndexShardsTemplate(idx_t d,
46
- bool threaded = false,
47
- bool successive_ids = true);
48
-
49
- /// int version due to the implicit bool conversion ambiguity of int as
50
- /// dimension
51
- explicit IndexShardsTemplate(int d,
52
- bool threaded = false,
53
- bool successive_ids = true);
54
-
55
- /// Alias for addIndex()
56
- void add_shard(IndexT* index) { this->addIndex(index); }
57
-
58
- /// Alias for removeIndex()
59
- void remove_shard(IndexT* index) { this->removeIndex(index); }
60
-
61
- /// supported only for sub-indices that implement add_with_ids
62
- void add(idx_t n, const component_t* x) override;
63
-
64
- /**
65
- * Cases (successive_ids, xids):
66
- * - true, non-NULL ERROR: it makes no sense to pass in ids and
67
- * request them to be shifted
68
- * - true, NULL OK, but should be called only once (calls add()
69
- * on sub-indexes).
70
- * - false, non-NULL OK: will call add_with_ids with passed in xids
71
- * distributed evenly over shards
72
- * - false, NULL OK: will call add_with_ids on each sub-index,
73
- * starting at ntotal
74
- */
75
- void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
76
-
77
- void search(idx_t n, const component_t* x, idx_t k,
78
- distance_t* distances, idx_t* labels) const override;
79
-
80
- void train(idx_t n, const component_t* x) override;
81
-
82
- bool successive_ids;
83
-
84
- /// Synchronize the top-level index (IndexShards) with data in the sub-indices
85
- void syncWithSubIndexes();
86
-
87
- protected:
88
- /// Called just after an index is added
89
- void onAfterAddIndex(IndexT* index) override;
90
-
91
- /// Called just after an index is removed
92
- void onAfterRemoveIndex(IndexT* index) override;
21
+ using idx_t = typename IndexT::idx_t;
22
+ using component_t = typename IndexT::component_t;
23
+ using distance_t = typename IndexT::distance_t;
24
+
25
+ /**
26
+ * The dimension that all sub-indices must share will be the dimension of
27
+ * the first sub-index added
28
+ *
29
+ * @param threaded do we use one thread per sub_index or do
30
+ * queries sequentially?
31
+ * @param successive_ids should we shift the returned ids by
32
+ * the size of each sub-index or return them
33
+ * as they are?
34
+ */
35
+ explicit IndexShardsTemplate(
36
+ bool threaded = false,
37
+ bool successive_ids = true);
38
+
39
+ /**
40
+ * @param threaded do we use one thread per sub_index or do
41
+ * queries sequentially?
42
+ * @param successive_ids should we shift the returned ids by
43
+ * the size of each sub-index or return them
44
+ * as they are?
45
+ */
46
+ explicit IndexShardsTemplate(
47
+ idx_t d,
48
+ bool threaded = false,
49
+ bool successive_ids = true);
50
+
51
+ /// int version due to the implicit bool conversion ambiguity of int as
52
+ /// dimension
53
+ explicit IndexShardsTemplate(
54
+ int d,
55
+ bool threaded = false,
56
+ bool successive_ids = true);
57
+
58
+ /// Alias for addIndex()
59
+ void add_shard(IndexT* index) {
60
+ this->addIndex(index);
61
+ }
62
+
63
+ /// Alias for removeIndex()
64
+ void remove_shard(IndexT* index) {
65
+ this->removeIndex(index);
66
+ }
67
+
68
+ /// supported only for sub-indices that implement add_with_ids
69
+ void add(idx_t n, const component_t* x) override;
70
+
71
+ /**
72
+ * Cases (successive_ids, xids):
73
+ * - true, non-NULL ERROR: it makes no sense to pass in ids and
74
+ * request them to be shifted
75
+ * - true, NULL OK, but should be called only once (calls add()
76
+ * on sub-indexes).
77
+ * - false, non-NULL OK: will call add_with_ids with passed in xids
78
+ * distributed evenly over shards
79
+ * - false, NULL OK: will call add_with_ids on each sub-index,
80
+ * starting at ntotal
81
+ */
82
+ void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
83
+ override;
84
+
85
+ void search(
86
+ idx_t n,
87
+ const component_t* x,
88
+ idx_t k,
89
+ distance_t* distances,
90
+ idx_t* labels) const override;
91
+
92
+ void train(idx_t n, const component_t* x) override;
93
+
94
+ bool successive_ids;
95
+
96
+ /// Synchronize the top-level index (IndexShards) with data in the
97
+ /// sub-indices
98
+ void syncWithSubIndexes();
99
+
100
+ protected:
101
+ /// Called just after an index is added
102
+ void onAfterAddIndex(IndexT* index) override;
103
+
104
+ /// Called just after an index is removed
105
+ void onAfterRemoveIndex(IndexT* index) override;
93
106
  };
94
107
 
95
108
  using IndexShards = IndexShardsTemplate<Index>;
96
109
  using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
97
110
 
98
-
99
111
  } // namespace faiss
@@ -7,15 +7,13 @@
7
7
 
8
8
  // -*- c++ -*-
9
9
 
10
-
11
10
  #include <faiss/MatrixStats.h>
12
11
 
12
+ #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
13
13
 
14
- #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
15
-
14
+ #include <faiss/utils/utils.h>
16
15
  #include <cmath>
17
16
  #include <cstdio>
18
- #include <faiss/utils/utils.h>
19
17
 
20
18
  namespace faiss {
21
19
 
@@ -23,16 +21,19 @@ namespace faiss {
23
21
  * MatrixStats
24
22
  *********************************************************************/
25
23
 
26
- MatrixStats::PerDimStats::PerDimStats():
27
- n(0), n_nan(0), n_inf(0), n0(0),
28
- min(HUGE_VALF), max(-HUGE_VALF),
29
- sum(0), sum2(0),
30
- mean(NAN), stddev(NAN)
31
- {}
32
-
33
-
34
- void MatrixStats::PerDimStats::add (float x)
35
- {
24
+ MatrixStats::PerDimStats::PerDimStats()
25
+ : n(0),
26
+ n_nan(0),
27
+ n_inf(0),
28
+ n0(0),
29
+ min(HUGE_VALF),
30
+ max(-HUGE_VALF),
31
+ sum(0),
32
+ sum2(0),
33
+ mean(NAN),
34
+ stddev(NAN) {}
35
+
36
+ void MatrixStats::PerDimStats::add(float x) {
36
37
  n++;
37
38
  if (std::isnan(x)) {
38
39
  n_nan++;
@@ -42,25 +43,26 @@ void MatrixStats::PerDimStats::add (float x)
42
43
  n_inf++;
43
44
  return;
44
45
  }
45
- if (x == 0) n0++;
46
- if (x < min) min = x;
47
- if (x > max) max = x;
46
+ if (x == 0)
47
+ n0++;
48
+ if (x < min)
49
+ min = x;
50
+ if (x > max)
51
+ max = x;
48
52
  sum += x;
49
53
  sum2 += (double)x * (double)x;
50
54
  }
51
55
 
52
- void MatrixStats::PerDimStats::compute_mean_std ()
53
- {
56
+ void MatrixStats::PerDimStats::compute_mean_std() {
54
57
  n_valid = n - n_nan - n_inf;
55
58
  mean = sum / n_valid;
56
59
  double var = sum2 / n_valid - mean * mean;
57
- if (var < 0) var = 0;
60
+ if (var < 0)
61
+ var = 0;
58
62
  stddev = sqrt(var);
59
63
  }
60
64
 
61
-
62
- void MatrixStats::do_comment (const char *fmt, ...)
63
- {
65
+ void MatrixStats::do_comment(const char* fmt, ...) {
64
66
  va_list ap;
65
67
 
66
68
  /* Determine required size */
@@ -72,57 +74,60 @@ void MatrixStats::do_comment (const char *fmt, ...)
72
74
  buf += size;
73
75
  }
74
76
 
75
-
76
-
77
- MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
78
- n(n), d(d),
79
- n_collision(0), n_valid(0), n0(0),
80
- min_norm2(HUGE_VAL), max_norm2(0)
81
- {
82
- std::vector<char> comment_buf (10000);
83
- buf = comment_buf.data ();
77
+ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
78
+ : n(n),
79
+ d(d),
80
+ n_collision(0),
81
+ n_valid(0),
82
+ n0(0),
83
+ min_norm2(HUGE_VAL),
84
+ max_norm2(0) {
85
+ std::vector<char> comment_buf(10000);
86
+ buf = comment_buf.data();
84
87
  nbuf = comment_buf.size();
85
88
 
86
- do_comment ("analyzing %ld vectors of size %ld\n", n, d);
89
+ do_comment("analyzing %ld vectors of size %ld\n", n, d);
87
90
 
88
91
  if (d > 1024) {
89
- do_comment (
90
- "indexing this many dimensions is hard, "
91
- "please consider dimensionality reducution (with PCAMatrix)\n");
92
+ do_comment(
93
+ "indexing this many dimensions is hard, "
94
+ "please consider dimensionality reducution (with PCAMatrix)\n");
92
95
  }
93
96
 
94
- size_t nbytes = sizeof (x[0]) * d;
95
- per_dim_stats.resize (d);
97
+ size_t nbytes = sizeof(x[0]) * d;
98
+ per_dim_stats.resize(d);
96
99
 
97
100
  for (size_t i = 0; i < n; i++) {
98
- const float *xi = x + d * i;
101
+ const float* xi = x + d * i;
99
102
  double sum2 = 0;
100
103
  for (size_t j = 0; j < d; j++) {
101
- per_dim_stats[j].add (xi[j]);
104
+ per_dim_stats[j].add(xi[j]);
102
105
  sum2 += xi[j] * (double)xi[j];
103
106
  }
104
107
 
105
- if (std::isfinite (sum2)) {
108
+ if (std::isfinite(sum2)) {
106
109
  n_valid++;
107
110
  if (sum2 == 0) {
108
- n0 ++;
111
+ n0++;
109
112
  } else {
110
- if (sum2 < min_norm2) min_norm2 = sum2;
111
- if (sum2 > max_norm2) max_norm2 = sum2;
113
+ if (sum2 < min_norm2)
114
+ min_norm2 = sum2;
115
+ if (sum2 > max_norm2)
116
+ max_norm2 = sum2;
112
117
  }
113
118
  }
114
119
 
115
120
  { // check hash
116
121
  uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
117
- auto elt = occurrences.find (hash);
122
+ auto elt = occurrences.find(hash);
118
123
  if (elt == occurrences.end()) {
119
124
  Occurrence occ = {i, 1};
120
125
  occurrences[hash] = occ;
121
126
  } else {
122
- if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
123
- elt->second.count ++;
127
+ if (!memcmp(xi, x + elt->second.first * d, nbytes)) {
128
+ elt->second.count++;
124
129
  } else {
125
- n_collision ++;
130
+ n_collision++;
126
131
  // we should use a list of collisions but overkill
127
132
  }
128
133
  }
@@ -131,50 +136,59 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
131
136
 
132
137
  // invalid vecor stats
133
138
  if (n_valid == n) {
134
- do_comment ("no NaN or Infs in data\n");
139
+ do_comment("no NaN or Infs in data\n");
135
140
  } else {
136
- do_comment ("%ld vectors contain NaN or Inf "
137
- "(or have too large components), "
138
- "expect bad results with indexing!\n", n - n_valid);
141
+ do_comment(
142
+ "%ld vectors contain NaN or Inf "
143
+ "(or have too large components), "
144
+ "expect bad results with indexing!\n",
145
+ n - n_valid);
139
146
  }
140
147
 
141
148
  // copies in dataset
142
149
  if (occurrences.size() == n) {
143
- do_comment ("all vectors are distinct\n");
150
+ do_comment("all vectors are distinct\n");
144
151
  } else {
145
- do_comment ("%ld vectors are distinct (%.2f%%)\n",
146
- occurrences.size(),
147
- occurrences.size() * 100.0 / n);
152
+ do_comment(
153
+ "%ld vectors are distinct (%.2f%%)\n",
154
+ occurrences.size(),
155
+ occurrences.size() * 100.0 / n);
148
156
 
149
157
  if (n_collision > 0) {
150
- do_comment ("%ld collisions in hash table, "
151
- "counts may be invalid\n", n_collision);
158
+ do_comment(
159
+ "%ld collisions in hash table, "
160
+ "counts may be invalid\n",
161
+ n_collision);
152
162
  }
153
163
 
154
164
  Occurrence max = {0, 0};
155
- for (auto it = occurrences.begin();
156
- it != occurrences.end(); ++it) {
165
+ for (auto it = occurrences.begin(); it != occurrences.end(); ++it) {
157
166
  if (it->second.count > max.count) {
158
167
  max = it->second;
159
168
  }
160
169
  }
161
- do_comment ("vector %ld has %ld copies\n", max.first, max.count);
170
+ do_comment("vector %ld has %ld copies\n", max.first, max.count);
162
171
  }
163
172
 
164
173
  { // norm stats
165
- min_norm2 = sqrt (min_norm2);
166
- max_norm2 = sqrt (max_norm2);
167
- do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n",
168
- min_norm2, max_norm2, n0);
174
+ min_norm2 = sqrt(min_norm2);
175
+ max_norm2 = sqrt(max_norm2);
176
+ do_comment(
177
+ "range of L2 norms=[%g, %g] (%ld null vectors)\n",
178
+ min_norm2,
179
+ max_norm2,
180
+ n0);
169
181
 
170
182
  if (max_norm2 < min_norm2 * 1.0001) {
171
- do_comment ("vectors are normalized, inner product and "
172
- "L2 search are equivalent\n");
183
+ do_comment(
184
+ "vectors are normalized, inner product and "
185
+ "L2 search are equivalent\n");
173
186
  }
174
187
 
175
188
  if (max_norm2 > min_norm2 * 100) {
176
- do_comment ("vectors have very large differences in norms, "
177
- "is this normal?\n");
189
+ do_comment(
190
+ "vectors have very large differences in norms, "
191
+ "is this normal?\n");
178
192
  }
179
193
  }
180
194
 
@@ -185,68 +199,69 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
185
199
  size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
186
200
 
187
201
  for (size_t j = 0; j < d; j++) {
188
- PerDimStats &st = per_dim_stats[j];
189
- st.compute_mean_std ();
202
+ PerDimStats& st = per_dim_stats[j];
203
+ st.compute_mean_std();
190
204
  n0 += st.n0;
191
205
 
192
206
  if (st.max == st.min) {
193
- n_0_range ++;
207
+ n_0_range++;
194
208
  } else if (st.max < 1.001 * st.min) {
195
- n_dangerous_range ++;
209
+ n_dangerous_range++;
196
210
  }
197
211
 
198
- if (st.stddev > max_std) max_std = st.stddev;
199
- if (st.stddev < min_std) min_std = st.stddev;
212
+ if (st.stddev > max_std)
213
+ max_std = st.stddev;
214
+ if (st.stddev < min_std)
215
+ min_std = st.stddev;
200
216
  }
201
217
 
202
-
203
-
204
218
  if (n0 == 0) {
205
- do_comment ("matrix contains no 0s\n");
219
+ do_comment("matrix contains no 0s\n");
206
220
  } else {
207
- do_comment ("matrix contains %.2f %% 0 entries\n",
208
- n0 * 100.0 / (n * d));
221
+ do_comment(
222
+ "matrix contains %.2f %% 0 entries\n",
223
+ n0 * 100.0 / (n * d));
209
224
  }
210
225
 
211
226
  if (n_0_range == 0) {
212
- do_comment ("no constant dimensions\n");
227
+ do_comment("no constant dimensions\n");
213
228
  } else {
214
- do_comment ("%ld dimensions are constant: they can be removed\n",
215
- n_0_range);
229
+ do_comment(
230
+ "%ld dimensions are constant: they can be removed\n",
231
+ n_0_range);
216
232
  }
217
233
 
218
234
  if (n_dangerous_range == 0) {
219
- do_comment ("no dimension has a too large mean\n");
235
+ do_comment("no dimension has a too large mean\n");
220
236
  } else {
221
- do_comment ("%ld dimensions are too large "
222
- "wrt. their variance, may loose precision "
223
- "in IndexFlatL2 (use CenteringTransform)\n",
224
- n_dangerous_range);
237
+ do_comment(
238
+ "%ld dimensions are too large "
239
+ "wrt. their variance, may loose precision "
240
+ "in IndexFlatL2 (use CenteringTransform)\n",
241
+ n_dangerous_range);
225
242
  }
226
243
 
227
- do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
244
+ do_comment("stddevs per dimension are in [%g %g]\n", min_std, max_std);
228
245
 
229
246
  size_t n_small_var = 0;
230
247
 
231
248
  for (size_t j = 0; j < d; j++) {
232
- const PerDimStats &st = per_dim_stats[j];
249
+ const PerDimStats& st = per_dim_stats[j];
233
250
  if (st.stddev < max_std * 1e-4) {
234
251
  n_small_var++;
235
252
  }
236
253
  }
237
254
 
238
255
  if (n_small_var > 0) {
239
- do_comment ("%ld dimensions have negligible stddev wrt. "
240
- "the largest dimension, they could be ignored",
241
- n_small_var);
256
+ do_comment(
257
+ "%ld dimensions have negligible stddev wrt. "
258
+ "the largest dimension, they could be ignored",
259
+ n_small_var);
242
260
  }
243
-
244
261
  }
245
- comments = comment_buf.data ();
262
+ comments = comment_buf.data();
246
263
  buf = nullptr;
247
264
  nbuf = 0;
248
265
  }
249
266
 
250
-
251
-
252
267
  } // namespace faiss
@@ -9,22 +9,20 @@
9
9
 
10
10
  #pragma once
11
11
 
12
- #include <vector>
12
+ #include <stdint.h>
13
13
  #include <string>
14
14
  #include <unordered_map>
15
- #include <stdint.h>
16
-
15
+ #include <vector>
17
16
 
18
17
  namespace faiss {
19
18
 
20
-
21
19
  /** Reports some statistics on a dataset and comments on them.
22
20
  *
23
21
  * It is a class rather than a function so that all stats can also be
24
22
  * accessed from code */
25
23
 
26
24
  struct MatrixStats {
27
- MatrixStats (size_t n, size_t d, const float *x);
25
+ MatrixStats(size_t n, size_t d, const float* x);
28
26
  std::string comments;
29
27
 
30
28
  // raw statistics
@@ -42,8 +40,8 @@ struct MatrixStats {
42
40
  double mean, stddev;
43
41
 
44
42
  PerDimStats();
45
- void add (float x);
46
- void compute_mean_std ();
43
+ void add(float x);
44
+ void compute_mean_std();
47
45
  };
48
46
 
49
47
  std::vector<PerDimStats> per_dim_stats;
@@ -53,10 +51,9 @@ struct MatrixStats {
53
51
  };
54
52
  std::unordered_map<uint64_t, Occurrence> occurrences;
55
53
 
56
- char *buf;
54
+ char* buf;
57
55
  size_t nbuf;
58
- void do_comment (const char *fmt, ...);
59
-
56
+ void do_comment(const char* fmt, ...);
60
57
  };
61
58
 
62
59
  } // namespace faiss