faiss 0.1.7 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -7
  4. data/ext/faiss/ext.cpp +1 -1
  5. data/ext/faiss/extconf.rb +8 -2
  6. data/ext/faiss/index.cpp +102 -69
  7. data/ext/faiss/index_binary.cpp +24 -30
  8. data/ext/faiss/kmeans.cpp +20 -16
  9. data/ext/faiss/numo.hpp +867 -0
  10. data/ext/faiss/pca_matrix.cpp +13 -14
  11. data/ext/faiss/product_quantizer.cpp +23 -24
  12. data/ext/faiss/utils.cpp +10 -37
  13. data/ext/faiss/utils.h +2 -13
  14. data/lib/faiss/version.rb +1 -1
  15. data/lib/faiss.rb +0 -5
  16. data/vendor/faiss/faiss/AutoTune.cpp +292 -291
  17. data/vendor/faiss/faiss/AutoTune.h +55 -56
  18. data/vendor/faiss/faiss/Clustering.cpp +334 -195
  19. data/vendor/faiss/faiss/Clustering.h +88 -35
  20. data/vendor/faiss/faiss/IVFlib.cpp +171 -195
  21. data/vendor/faiss/faiss/IVFlib.h +48 -51
  22. data/vendor/faiss/faiss/Index.cpp +85 -103
  23. data/vendor/faiss/faiss/Index.h +54 -48
  24. data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
  25. data/vendor/faiss/faiss/Index2Layer.h +22 -22
  26. data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
  27. data/vendor/faiss/faiss/IndexBinary.h +140 -132
  28. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
  29. data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
  30. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
  31. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
  32. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
  33. data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
  34. data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
  35. data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
  36. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
  37. data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
  38. data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
  39. data/vendor/faiss/faiss/IndexFlat.h +35 -46
  40. data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
  41. data/vendor/faiss/faiss/IndexHNSW.h +57 -41
  42. data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
  43. data/vendor/faiss/faiss/IndexIVF.h +146 -113
  44. data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
  45. data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
  46. data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
  47. data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
  48. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
  49. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
  50. data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
  51. data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
  52. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
  53. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
  54. data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
  55. data/vendor/faiss/faiss/IndexLSH.h +21 -26
  56. data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
  57. data/vendor/faiss/faiss/IndexLattice.h +11 -16
  58. data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
  59. data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
  60. data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
  61. data/vendor/faiss/faiss/IndexNSG.h +85 -0
  62. data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
  63. data/vendor/faiss/faiss/IndexPQ.h +64 -67
  64. data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
  65. data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
  66. data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
  67. data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
  68. data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
  69. data/vendor/faiss/faiss/IndexRefine.h +22 -23
  70. data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
  71. data/vendor/faiss/faiss/IndexReplicas.h +62 -56
  72. data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
  73. data/vendor/faiss/faiss/IndexResidual.h +152 -0
  74. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
  75. data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
  76. data/vendor/faiss/faiss/IndexShards.cpp +256 -240
  77. data/vendor/faiss/faiss/IndexShards.h +85 -73
  78. data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
  79. data/vendor/faiss/faiss/MatrixStats.h +7 -10
  80. data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
  81. data/vendor/faiss/faiss/MetaIndexes.h +40 -34
  82. data/vendor/faiss/faiss/MetricType.h +7 -7
  83. data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
  84. data/vendor/faiss/faiss/VectorTransform.h +61 -89
  85. data/vendor/faiss/faiss/clone_index.cpp +77 -73
  86. data/vendor/faiss/faiss/clone_index.h +4 -9
  87. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
  88. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
  89. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
  90. data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
  91. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
  92. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
  93. data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
  94. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
  95. data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
  96. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
  97. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
  98. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
  99. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
  100. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
  101. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
  102. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
  103. data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
  104. data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
  106. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
  107. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
  108. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
  109. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
  110. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
  111. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
  112. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
  113. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
  114. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
  115. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
  116. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
  117. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
  118. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
  119. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
  120. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
  121. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
  122. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
  123. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
  124. data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
  125. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
  126. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
  127. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
  128. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
  129. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
  130. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
  131. data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
  132. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
  133. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
  134. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
  135. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
  136. data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
  137. data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
  138. data/vendor/faiss/faiss/impl/FaissException.h +41 -29
  139. data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
  140. data/vendor/faiss/faiss/impl/HNSW.h +179 -200
  141. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
  142. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
  143. data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
  144. data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
  145. data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
  146. data/vendor/faiss/faiss/impl/NSG.h +199 -0
  147. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
  148. data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
  149. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
  150. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
  151. data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
  152. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
  153. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
  154. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
  155. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
  156. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
  157. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
  158. data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
  159. data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
  160. data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
  161. data/vendor/faiss/faiss/impl/io.cpp +75 -94
  162. data/vendor/faiss/faiss/impl/io.h +31 -41
  163. data/vendor/faiss/faiss/impl/io_macros.h +40 -29
  164. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
  165. data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
  166. data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
  167. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
  168. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
  169. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
  170. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
  171. data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
  172. data/vendor/faiss/faiss/index_factory.cpp +269 -218
  173. data/vendor/faiss/faiss/index_factory.h +6 -7
  174. data/vendor/faiss/faiss/index_io.h +23 -26
  175. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
  176. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
  177. data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
  178. data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
  179. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
  180. data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
  181. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
  182. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
  183. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
  184. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
  185. data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
  186. data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
  187. data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
  188. data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
  189. data/vendor/faiss/faiss/utils/Heap.h +186 -209
  190. data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
  191. data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
  192. data/vendor/faiss/faiss/utils/distances.cpp +301 -310
  193. data/vendor/faiss/faiss/utils/distances.h +133 -118
  194. data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
  195. data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
  196. data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
  197. data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
  198. data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
  199. data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
  200. data/vendor/faiss/faiss/utils/hamming.h +62 -85
  201. data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
  202. data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
  203. data/vendor/faiss/faiss/utils/partitioning.h +26 -21
  204. data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
  205. data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
  206. data/vendor/faiss/faiss/utils/random.cpp +39 -63
  207. data/vendor/faiss/faiss/utils/random.h +13 -16
  208. data/vendor/faiss/faiss/utils/simdlib.h +4 -2
  209. data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
  210. data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
  211. data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
  212. data/vendor/faiss/faiss/utils/utils.cpp +304 -287
  213. data/vendor/faiss/faiss/utils/utils.h +53 -48
  214. metadata +26 -12
  215. data/lib/faiss/index.rb +0 -20
  216. data/lib/faiss/index_binary.rb +0 -20
  217. data/lib/faiss/kmeans.rb +0 -15
  218. data/lib/faiss/pca_matrix.rb +0 -15
  219. data/lib/faiss/product_quantizer.rb +0 -22
@@ -18,82 +18,94 @@ namespace faiss {
18
18
  */
19
19
  template <typename IndexT>
20
20
  struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
21
- using idx_t = typename IndexT::idx_t;
22
- using component_t = typename IndexT::component_t;
23
- using distance_t = typename IndexT::distance_t;
24
-
25
- /**
26
- * The dimension that all sub-indices must share will be the dimension of the
27
- * first sub-index added
28
- *
29
- * @param threaded do we use one thread per sub_index or do
30
- * queries sequentially?
31
- * @param successive_ids should we shift the returned ids by
32
- * the size of each sub-index or return them
33
- * as they are?
34
- */
35
- explicit IndexShardsTemplate(bool threaded = false,
36
- bool successive_ids = true);
37
-
38
- /**
39
- * @param threaded do we use one thread per sub_index or do
40
- * queries sequentially?
41
- * @param successive_ids should we shift the returned ids by
42
- * the size of each sub-index or return them
43
- * as they are?
44
- */
45
- explicit IndexShardsTemplate(idx_t d,
46
- bool threaded = false,
47
- bool successive_ids = true);
48
-
49
- /// int version due to the implicit bool conversion ambiguity of int as
50
- /// dimension
51
- explicit IndexShardsTemplate(int d,
52
- bool threaded = false,
53
- bool successive_ids = true);
54
-
55
- /// Alias for addIndex()
56
- void add_shard(IndexT* index) { this->addIndex(index); }
57
-
58
- /// Alias for removeIndex()
59
- void remove_shard(IndexT* index) { this->removeIndex(index); }
60
-
61
- /// supported only for sub-indices that implement add_with_ids
62
- void add(idx_t n, const component_t* x) override;
63
-
64
- /**
65
- * Cases (successive_ids, xids):
66
- * - true, non-NULL ERROR: it makes no sense to pass in ids and
67
- * request them to be shifted
68
- * - true, NULL OK, but should be called only once (calls add()
69
- * on sub-indexes).
70
- * - false, non-NULL OK: will call add_with_ids with passed in xids
71
- * distributed evenly over shards
72
- * - false, NULL OK: will call add_with_ids on each sub-index,
73
- * starting at ntotal
74
- */
75
- void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
76
-
77
- void search(idx_t n, const component_t* x, idx_t k,
78
- distance_t* distances, idx_t* labels) const override;
79
-
80
- void train(idx_t n, const component_t* x) override;
81
-
82
- bool successive_ids;
83
-
84
- /// Synchronize the top-level index (IndexShards) with data in the sub-indices
85
- void syncWithSubIndexes();
86
-
87
- protected:
88
- /// Called just after an index is added
89
- void onAfterAddIndex(IndexT* index) override;
90
-
91
- /// Called just after an index is removed
92
- void onAfterRemoveIndex(IndexT* index) override;
21
+ using idx_t = typename IndexT::idx_t;
22
+ using component_t = typename IndexT::component_t;
23
+ using distance_t = typename IndexT::distance_t;
24
+
25
+ /**
26
+ * The dimension that all sub-indices must share will be the dimension of
27
+ * the first sub-index added
28
+ *
29
+ * @param threaded do we use one thread per sub_index or do
30
+ * queries sequentially?
31
+ * @param successive_ids should we shift the returned ids by
32
+ * the size of each sub-index or return them
33
+ * as they are?
34
+ */
35
+ explicit IndexShardsTemplate(
36
+ bool threaded = false,
37
+ bool successive_ids = true);
38
+
39
+ /**
40
+ * @param threaded do we use one thread per sub_index or do
41
+ * queries sequentially?
42
+ * @param successive_ids should we shift the returned ids by
43
+ * the size of each sub-index or return them
44
+ * as they are?
45
+ */
46
+ explicit IndexShardsTemplate(
47
+ idx_t d,
48
+ bool threaded = false,
49
+ bool successive_ids = true);
50
+
51
+ /// int version due to the implicit bool conversion ambiguity of int as
52
+ /// dimension
53
+ explicit IndexShardsTemplate(
54
+ int d,
55
+ bool threaded = false,
56
+ bool successive_ids = true);
57
+
58
+ /// Alias for addIndex()
59
+ void add_shard(IndexT* index) {
60
+ this->addIndex(index);
61
+ }
62
+
63
+ /// Alias for removeIndex()
64
+ void remove_shard(IndexT* index) {
65
+ this->removeIndex(index);
66
+ }
67
+
68
+ /// supported only for sub-indices that implement add_with_ids
69
+ void add(idx_t n, const component_t* x) override;
70
+
71
+ /**
72
+ * Cases (successive_ids, xids):
73
+ * - true, non-NULL ERROR: it makes no sense to pass in ids and
74
+ * request them to be shifted
75
+ * - true, NULL OK, but should be called only once (calls add()
76
+ * on sub-indexes).
77
+ * - false, non-NULL OK: will call add_with_ids with passed in xids
78
+ * distributed evenly over shards
79
+ * - false, NULL OK: will call add_with_ids on each sub-index,
80
+ * starting at ntotal
81
+ */
82
+ void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
83
+ override;
84
+
85
+ void search(
86
+ idx_t n,
87
+ const component_t* x,
88
+ idx_t k,
89
+ distance_t* distances,
90
+ idx_t* labels) const override;
91
+
92
+ void train(idx_t n, const component_t* x) override;
93
+
94
+ bool successive_ids;
95
+
96
+ /// Synchronize the top-level index (IndexShards) with data in the
97
+ /// sub-indices
98
+ void syncWithSubIndexes();
99
+
100
+ protected:
101
+ /// Called just after an index is added
102
+ void onAfterAddIndex(IndexT* index) override;
103
+
104
+ /// Called just after an index is removed
105
+ void onAfterRemoveIndex(IndexT* index) override;
93
106
  };
94
107
 
95
108
  using IndexShards = IndexShardsTemplate<Index>;
96
109
  using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
97
110
 
98
-
99
111
  } // namespace faiss
@@ -7,15 +7,13 @@
7
7
 
8
8
  // -*- c++ -*-
9
9
 
10
-
11
10
  #include <faiss/MatrixStats.h>
12
11
 
12
+ #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
13
13
 
14
- #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
15
-
14
+ #include <faiss/utils/utils.h>
16
15
  #include <cmath>
17
16
  #include <cstdio>
18
- #include <faiss/utils/utils.h>
19
17
 
20
18
  namespace faiss {
21
19
 
@@ -23,16 +21,19 @@ namespace faiss {
23
21
  * MatrixStats
24
22
  *********************************************************************/
25
23
 
26
- MatrixStats::PerDimStats::PerDimStats():
27
- n(0), n_nan(0), n_inf(0), n0(0),
28
- min(HUGE_VALF), max(-HUGE_VALF),
29
- sum(0), sum2(0),
30
- mean(NAN), stddev(NAN)
31
- {}
32
-
33
-
34
- void MatrixStats::PerDimStats::add (float x)
35
- {
24
+ MatrixStats::PerDimStats::PerDimStats()
25
+ : n(0),
26
+ n_nan(0),
27
+ n_inf(0),
28
+ n0(0),
29
+ min(HUGE_VALF),
30
+ max(-HUGE_VALF),
31
+ sum(0),
32
+ sum2(0),
33
+ mean(NAN),
34
+ stddev(NAN) {}
35
+
36
+ void MatrixStats::PerDimStats::add(float x) {
36
37
  n++;
37
38
  if (std::isnan(x)) {
38
39
  n_nan++;
@@ -42,25 +43,26 @@ void MatrixStats::PerDimStats::add (float x)
42
43
  n_inf++;
43
44
  return;
44
45
  }
45
- if (x == 0) n0++;
46
- if (x < min) min = x;
47
- if (x > max) max = x;
46
+ if (x == 0)
47
+ n0++;
48
+ if (x < min)
49
+ min = x;
50
+ if (x > max)
51
+ max = x;
48
52
  sum += x;
49
53
  sum2 += (double)x * (double)x;
50
54
  }
51
55
 
52
- void MatrixStats::PerDimStats::compute_mean_std ()
53
- {
56
+ void MatrixStats::PerDimStats::compute_mean_std() {
54
57
  n_valid = n - n_nan - n_inf;
55
58
  mean = sum / n_valid;
56
59
  double var = sum2 / n_valid - mean * mean;
57
- if (var < 0) var = 0;
60
+ if (var < 0)
61
+ var = 0;
58
62
  stddev = sqrt(var);
59
63
  }
60
64
 
61
-
62
- void MatrixStats::do_comment (const char *fmt, ...)
63
- {
65
+ void MatrixStats::do_comment(const char* fmt, ...) {
64
66
  va_list ap;
65
67
 
66
68
  /* Determine required size */
@@ -72,57 +74,60 @@ void MatrixStats::do_comment (const char *fmt, ...)
72
74
  buf += size;
73
75
  }
74
76
 
75
-
76
-
77
- MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
78
- n(n), d(d),
79
- n_collision(0), n_valid(0), n0(0),
80
- min_norm2(HUGE_VAL), max_norm2(0)
81
- {
82
- std::vector<char> comment_buf (10000);
83
- buf = comment_buf.data ();
77
+ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
78
+ : n(n),
79
+ d(d),
80
+ n_collision(0),
81
+ n_valid(0),
82
+ n0(0),
83
+ min_norm2(HUGE_VAL),
84
+ max_norm2(0) {
85
+ std::vector<char> comment_buf(10000);
86
+ buf = comment_buf.data();
84
87
  nbuf = comment_buf.size();
85
88
 
86
- do_comment ("analyzing %ld vectors of size %ld\n", n, d);
89
+ do_comment("analyzing %ld vectors of size %ld\n", n, d);
87
90
 
88
91
  if (d > 1024) {
89
- do_comment (
90
- "indexing this many dimensions is hard, "
91
- "please consider dimensionality reducution (with PCAMatrix)\n");
92
+ do_comment(
93
+ "indexing this many dimensions is hard, "
94
+ "please consider dimensionality reducution (with PCAMatrix)\n");
92
95
  }
93
96
 
94
- size_t nbytes = sizeof (x[0]) * d;
95
- per_dim_stats.resize (d);
97
+ size_t nbytes = sizeof(x[0]) * d;
98
+ per_dim_stats.resize(d);
96
99
 
97
100
  for (size_t i = 0; i < n; i++) {
98
- const float *xi = x + d * i;
101
+ const float* xi = x + d * i;
99
102
  double sum2 = 0;
100
103
  for (size_t j = 0; j < d; j++) {
101
- per_dim_stats[j].add (xi[j]);
104
+ per_dim_stats[j].add(xi[j]);
102
105
  sum2 += xi[j] * (double)xi[j];
103
106
  }
104
107
 
105
- if (std::isfinite (sum2)) {
108
+ if (std::isfinite(sum2)) {
106
109
  n_valid++;
107
110
  if (sum2 == 0) {
108
- n0 ++;
111
+ n0++;
109
112
  } else {
110
- if (sum2 < min_norm2) min_norm2 = sum2;
111
- if (sum2 > max_norm2) max_norm2 = sum2;
113
+ if (sum2 < min_norm2)
114
+ min_norm2 = sum2;
115
+ if (sum2 > max_norm2)
116
+ max_norm2 = sum2;
112
117
  }
113
118
  }
114
119
 
115
120
  { // check hash
116
121
  uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
117
- auto elt = occurrences.find (hash);
122
+ auto elt = occurrences.find(hash);
118
123
  if (elt == occurrences.end()) {
119
124
  Occurrence occ = {i, 1};
120
125
  occurrences[hash] = occ;
121
126
  } else {
122
- if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
123
- elt->second.count ++;
127
+ if (!memcmp(xi, x + elt->second.first * d, nbytes)) {
128
+ elt->second.count++;
124
129
  } else {
125
- n_collision ++;
130
+ n_collision++;
126
131
  // we should use a list of collisions but overkill
127
132
  }
128
133
  }
@@ -131,50 +136,59 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
131
136
 
132
137
  // invalid vecor stats
133
138
  if (n_valid == n) {
134
- do_comment ("no NaN or Infs in data\n");
139
+ do_comment("no NaN or Infs in data\n");
135
140
  } else {
136
- do_comment ("%ld vectors contain NaN or Inf "
137
- "(or have too large components), "
138
- "expect bad results with indexing!\n", n - n_valid);
141
+ do_comment(
142
+ "%ld vectors contain NaN or Inf "
143
+ "(or have too large components), "
144
+ "expect bad results with indexing!\n",
145
+ n - n_valid);
139
146
  }
140
147
 
141
148
  // copies in dataset
142
149
  if (occurrences.size() == n) {
143
- do_comment ("all vectors are distinct\n");
150
+ do_comment("all vectors are distinct\n");
144
151
  } else {
145
- do_comment ("%ld vectors are distinct (%.2f%%)\n",
146
- occurrences.size(),
147
- occurrences.size() * 100.0 / n);
152
+ do_comment(
153
+ "%ld vectors are distinct (%.2f%%)\n",
154
+ occurrences.size(),
155
+ occurrences.size() * 100.0 / n);
148
156
 
149
157
  if (n_collision > 0) {
150
- do_comment ("%ld collisions in hash table, "
151
- "counts may be invalid\n", n_collision);
158
+ do_comment(
159
+ "%ld collisions in hash table, "
160
+ "counts may be invalid\n",
161
+ n_collision);
152
162
  }
153
163
 
154
164
  Occurrence max = {0, 0};
155
- for (auto it = occurrences.begin();
156
- it != occurrences.end(); ++it) {
165
+ for (auto it = occurrences.begin(); it != occurrences.end(); ++it) {
157
166
  if (it->second.count > max.count) {
158
167
  max = it->second;
159
168
  }
160
169
  }
161
- do_comment ("vector %ld has %ld copies\n", max.first, max.count);
170
+ do_comment("vector %ld has %ld copies\n", max.first, max.count);
162
171
  }
163
172
 
164
173
  { // norm stats
165
- min_norm2 = sqrt (min_norm2);
166
- max_norm2 = sqrt (max_norm2);
167
- do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n",
168
- min_norm2, max_norm2, n0);
174
+ min_norm2 = sqrt(min_norm2);
175
+ max_norm2 = sqrt(max_norm2);
176
+ do_comment(
177
+ "range of L2 norms=[%g, %g] (%ld null vectors)\n",
178
+ min_norm2,
179
+ max_norm2,
180
+ n0);
169
181
 
170
182
  if (max_norm2 < min_norm2 * 1.0001) {
171
- do_comment ("vectors are normalized, inner product and "
172
- "L2 search are equivalent\n");
183
+ do_comment(
184
+ "vectors are normalized, inner product and "
185
+ "L2 search are equivalent\n");
173
186
  }
174
187
 
175
188
  if (max_norm2 > min_norm2 * 100) {
176
- do_comment ("vectors have very large differences in norms, "
177
- "is this normal?\n");
189
+ do_comment(
190
+ "vectors have very large differences in norms, "
191
+ "is this normal?\n");
178
192
  }
179
193
  }
180
194
 
@@ -185,68 +199,69 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
185
199
  size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
186
200
 
187
201
  for (size_t j = 0; j < d; j++) {
188
- PerDimStats &st = per_dim_stats[j];
189
- st.compute_mean_std ();
202
+ PerDimStats& st = per_dim_stats[j];
203
+ st.compute_mean_std();
190
204
  n0 += st.n0;
191
205
 
192
206
  if (st.max == st.min) {
193
- n_0_range ++;
207
+ n_0_range++;
194
208
  } else if (st.max < 1.001 * st.min) {
195
- n_dangerous_range ++;
209
+ n_dangerous_range++;
196
210
  }
197
211
 
198
- if (st.stddev > max_std) max_std = st.stddev;
199
- if (st.stddev < min_std) min_std = st.stddev;
212
+ if (st.stddev > max_std)
213
+ max_std = st.stddev;
214
+ if (st.stddev < min_std)
215
+ min_std = st.stddev;
200
216
  }
201
217
 
202
-
203
-
204
218
  if (n0 == 0) {
205
- do_comment ("matrix contains no 0s\n");
219
+ do_comment("matrix contains no 0s\n");
206
220
  } else {
207
- do_comment ("matrix contains %.2f %% 0 entries\n",
208
- n0 * 100.0 / (n * d));
221
+ do_comment(
222
+ "matrix contains %.2f %% 0 entries\n",
223
+ n0 * 100.0 / (n * d));
209
224
  }
210
225
 
211
226
  if (n_0_range == 0) {
212
- do_comment ("no constant dimensions\n");
227
+ do_comment("no constant dimensions\n");
213
228
  } else {
214
- do_comment ("%ld dimensions are constant: they can be removed\n",
215
- n_0_range);
229
+ do_comment(
230
+ "%ld dimensions are constant: they can be removed\n",
231
+ n_0_range);
216
232
  }
217
233
 
218
234
  if (n_dangerous_range == 0) {
219
- do_comment ("no dimension has a too large mean\n");
235
+ do_comment("no dimension has a too large mean\n");
220
236
  } else {
221
- do_comment ("%ld dimensions are too large "
222
- "wrt. their variance, may loose precision "
223
- "in IndexFlatL2 (use CenteringTransform)\n",
224
- n_dangerous_range);
237
+ do_comment(
238
+ "%ld dimensions are too large "
239
+ "wrt. their variance, may loose precision "
240
+ "in IndexFlatL2 (use CenteringTransform)\n",
241
+ n_dangerous_range);
225
242
  }
226
243
 
227
- do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
244
+ do_comment("stddevs per dimension are in [%g %g]\n", min_std, max_std);
228
245
 
229
246
  size_t n_small_var = 0;
230
247
 
231
248
  for (size_t j = 0; j < d; j++) {
232
- const PerDimStats &st = per_dim_stats[j];
249
+ const PerDimStats& st = per_dim_stats[j];
233
250
  if (st.stddev < max_std * 1e-4) {
234
251
  n_small_var++;
235
252
  }
236
253
  }
237
254
 
238
255
  if (n_small_var > 0) {
239
- do_comment ("%ld dimensions have negligible stddev wrt. "
240
- "the largest dimension, they could be ignored",
241
- n_small_var);
256
+ do_comment(
257
+ "%ld dimensions have negligible stddev wrt. "
258
+ "the largest dimension, they could be ignored",
259
+ n_small_var);
242
260
  }
243
-
244
261
  }
245
- comments = comment_buf.data ();
262
+ comments = comment_buf.data();
246
263
  buf = nullptr;
247
264
  nbuf = 0;
248
265
  }
249
266
 
250
-
251
-
252
267
  } // namespace faiss
@@ -9,22 +9,20 @@
9
9
 
10
10
  #pragma once
11
11
 
12
- #include <vector>
12
+ #include <stdint.h>
13
13
  #include <string>
14
14
  #include <unordered_map>
15
- #include <stdint.h>
16
-
15
+ #include <vector>
17
16
 
18
17
  namespace faiss {
19
18
 
20
-
21
19
  /** Reports some statistics on a dataset and comments on them.
22
20
  *
23
21
  * It is a class rather than a function so that all stats can also be
24
22
  * accessed from code */
25
23
 
26
24
  struct MatrixStats {
27
- MatrixStats (size_t n, size_t d, const float *x);
25
+ MatrixStats(size_t n, size_t d, const float* x);
28
26
  std::string comments;
29
27
 
30
28
  // raw statistics
@@ -42,8 +40,8 @@ struct MatrixStats {
42
40
  double mean, stddev;
43
41
 
44
42
  PerDimStats();
45
- void add (float x);
46
- void compute_mean_std ();
43
+ void add(float x);
44
+ void compute_mean_std();
47
45
  };
48
46
 
49
47
  std::vector<PerDimStats> per_dim_stats;
@@ -53,10 +51,9 @@ struct MatrixStats {
53
51
  };
54
52
  std::unordered_map<uint64_t, Occurrence> occurrences;
55
53
 
56
- char *buf;
54
+ char* buf;
57
55
  size_t nbuf;
58
- void do_comment (const char *fmt, ...);
59
-
56
+ void do_comment(const char* fmt, ...);
60
57
  };
61
58
 
62
59
  } // namespace faiss