faiss 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -7
  5. data/ext/faiss/extconf.rb +6 -3
  6. data/ext/faiss/numo.hpp +4 -4
  7. data/ext/faiss/utils.cpp +1 -1
  8. data/ext/faiss/utils.h +1 -1
  9. data/lib/faiss/version.rb +1 -1
  10. data/vendor/faiss/faiss/AutoTune.cpp +292 -291
  11. data/vendor/faiss/faiss/AutoTune.h +55 -56
  12. data/vendor/faiss/faiss/Clustering.cpp +365 -194
  13. data/vendor/faiss/faiss/Clustering.h +102 -35
  14. data/vendor/faiss/faiss/IVFlib.cpp +171 -195
  15. data/vendor/faiss/faiss/IVFlib.h +48 -51
  16. data/vendor/faiss/faiss/Index.cpp +85 -103
  17. data/vendor/faiss/faiss/Index.h +54 -48
  18. data/vendor/faiss/faiss/Index2Layer.cpp +126 -224
  19. data/vendor/faiss/faiss/Index2Layer.h +22 -36
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
  21. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +195 -0
  22. data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
  23. data/vendor/faiss/faiss/IndexBinary.h +140 -132
  24. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
  25. data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
  26. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
  27. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
  28. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
  29. data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
  30. data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
  31. data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
  32. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
  33. data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
  34. data/vendor/faiss/faiss/IndexFlat.cpp +115 -176
  35. data/vendor/faiss/faiss/IndexFlat.h +42 -59
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
  39. data/vendor/faiss/faiss/IndexHNSW.h +57 -41
  40. data/vendor/faiss/faiss/IndexIVF.cpp +545 -453
  41. data/vendor/faiss/faiss/IndexIVF.h +169 -118
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
  44. data/vendor/faiss/faiss/IndexIVFFlat.cpp +247 -252
  45. data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
  46. data/vendor/faiss/faiss/IndexIVFPQ.cpp +459 -517
  47. data/vendor/faiss/faiss/IndexIVFPQ.h +75 -67
  48. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
  49. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
  50. data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
  51. data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
  52. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +163 -150
  53. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +38 -25
  54. data/vendor/faiss/faiss/IndexLSH.cpp +66 -113
  55. data/vendor/faiss/faiss/IndexLSH.h +20 -38
  56. data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
  57. data/vendor/faiss/faiss/IndexLattice.h +11 -16
  58. data/vendor/faiss/faiss/IndexNNDescent.cpp +229 -0
  59. data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
  60. data/vendor/faiss/faiss/IndexNSG.cpp +301 -0
  61. data/vendor/faiss/faiss/IndexNSG.h +85 -0
  62. data/vendor/faiss/faiss/IndexPQ.cpp +387 -495
  63. data/vendor/faiss/faiss/IndexPQ.h +64 -82
  64. data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
  65. data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
  66. data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
  67. data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
  68. data/vendor/faiss/faiss/IndexRefine.cpp +139 -127
  69. data/vendor/faiss/faiss/IndexRefine.h +32 -23
  70. data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
  71. data/vendor/faiss/faiss/IndexReplicas.h +62 -56
  72. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +111 -172
  73. data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -59
  74. data/vendor/faiss/faiss/IndexShards.cpp +256 -240
  75. data/vendor/faiss/faiss/IndexShards.h +85 -73
  76. data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
  77. data/vendor/faiss/faiss/MatrixStats.h +7 -10
  78. data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
  79. data/vendor/faiss/faiss/MetaIndexes.h +40 -34
  80. data/vendor/faiss/faiss/MetricType.h +7 -7
  81. data/vendor/faiss/faiss/VectorTransform.cpp +654 -475
  82. data/vendor/faiss/faiss/VectorTransform.h +64 -89
  83. data/vendor/faiss/faiss/clone_index.cpp +78 -73
  84. data/vendor/faiss/faiss/clone_index.h +4 -9
  85. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
  86. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
  87. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +198 -171
  88. data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
  89. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
  90. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
  91. data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
  92. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
  93. data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
  94. data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
  95. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
  96. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
  97. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
  98. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
  99. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
  100. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
  101. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
  102. data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
  106. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
  107. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
  108. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
  109. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
  110. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
  111. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
  112. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
  113. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
  114. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
  115. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
  116. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
  117. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
  118. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
  119. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
  120. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
  121. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
  122. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
  123. data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
  124. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
  125. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
  126. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
  127. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
  128. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
  129. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
  130. data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
  131. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +503 -0
  132. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +175 -0
  133. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
  134. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
  135. data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
  136. data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
  137. data/vendor/faiss/faiss/impl/FaissException.h +41 -29
  138. data/vendor/faiss/faiss/impl/HNSW.cpp +606 -617
  139. data/vendor/faiss/faiss/impl/HNSW.h +179 -200
  140. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +855 -0
  141. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +244 -0
  142. data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
  143. data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
  144. data/vendor/faiss/faiss/impl/NSG.cpp +679 -0
  145. data/vendor/faiss/faiss/impl/NSG.h +199 -0
  146. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
  147. data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
  148. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
  149. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
  150. data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
  151. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +758 -0
  152. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +188 -0
  153. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
  154. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +647 -707
  155. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
  156. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
  157. data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
  158. data/vendor/faiss/faiss/impl/index_read.cpp +631 -480
  159. data/vendor/faiss/faiss/impl/index_write.cpp +547 -407
  160. data/vendor/faiss/faiss/impl/io.cpp +76 -95
  161. data/vendor/faiss/faiss/impl/io.h +31 -41
  162. data/vendor/faiss/faiss/impl/io_macros.h +60 -29
  163. data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
  164. data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
  165. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
  166. data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
  167. data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
  168. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
  169. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
  170. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
  171. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
  172. data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
  173. data/vendor/faiss/faiss/index_factory.cpp +619 -397
  174. data/vendor/faiss/faiss/index_factory.h +8 -6
  175. data/vendor/faiss/faiss/index_io.h +23 -26
  176. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
  177. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
  178. data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
  179. data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
  180. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
  181. data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
  182. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
  183. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
  184. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
  185. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
  186. data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
  187. data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
  188. data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
  189. data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
  190. data/vendor/faiss/faiss/utils/Heap.h +186 -209
  191. data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
  192. data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
  193. data/vendor/faiss/faiss/utils/distances.cpp +305 -312
  194. data/vendor/faiss/faiss/utils/distances.h +170 -122
  195. data/vendor/faiss/faiss/utils/distances_simd.cpp +498 -508
  196. data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
  197. data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
  198. data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
  199. data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
  200. data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
  201. data/vendor/faiss/faiss/utils/hamming.h +62 -85
  202. data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
  203. data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
  204. data/vendor/faiss/faiss/utils/partitioning.h +26 -21
  205. data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
  206. data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
  207. data/vendor/faiss/faiss/utils/random.cpp +39 -63
  208. data/vendor/faiss/faiss/utils/random.h +13 -16
  209. data/vendor/faiss/faiss/utils/simdlib.h +4 -2
  210. data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
  211. data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
  212. data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
  213. data/vendor/faiss/faiss/utils/utils.cpp +304 -287
  214. data/vendor/faiss/faiss/utils/utils.h +54 -49
  215. metadata +29 -4
@@ -18,82 +18,94 @@ namespace faiss {
18
18
  */
19
19
  template <typename IndexT>
20
20
  struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
21
- using idx_t = typename IndexT::idx_t;
22
- using component_t = typename IndexT::component_t;
23
- using distance_t = typename IndexT::distance_t;
24
-
25
- /**
26
- * The dimension that all sub-indices must share will be the dimension of the
27
- * first sub-index added
28
- *
29
- * @param threaded do we use one thread per sub_index or do
30
- * queries sequentially?
31
- * @param successive_ids should we shift the returned ids by
32
- * the size of each sub-index or return them
33
- * as they are?
34
- */
35
- explicit IndexShardsTemplate(bool threaded = false,
36
- bool successive_ids = true);
37
-
38
- /**
39
- * @param threaded do we use one thread per sub_index or do
40
- * queries sequentially?
41
- * @param successive_ids should we shift the returned ids by
42
- * the size of each sub-index or return them
43
- * as they are?
44
- */
45
- explicit IndexShardsTemplate(idx_t d,
46
- bool threaded = false,
47
- bool successive_ids = true);
48
-
49
- /// int version due to the implicit bool conversion ambiguity of int as
50
- /// dimension
51
- explicit IndexShardsTemplate(int d,
52
- bool threaded = false,
53
- bool successive_ids = true);
54
-
55
- /// Alias for addIndex()
56
- void add_shard(IndexT* index) { this->addIndex(index); }
57
-
58
- /// Alias for removeIndex()
59
- void remove_shard(IndexT* index) { this->removeIndex(index); }
60
-
61
- /// supported only for sub-indices that implement add_with_ids
62
- void add(idx_t n, const component_t* x) override;
63
-
64
- /**
65
- * Cases (successive_ids, xids):
66
- * - true, non-NULL ERROR: it makes no sense to pass in ids and
67
- * request them to be shifted
68
- * - true, NULL OK, but should be called only once (calls add()
69
- * on sub-indexes).
70
- * - false, non-NULL OK: will call add_with_ids with passed in xids
71
- * distributed evenly over shards
72
- * - false, NULL OK: will call add_with_ids on each sub-index,
73
- * starting at ntotal
74
- */
75
- void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
76
-
77
- void search(idx_t n, const component_t* x, idx_t k,
78
- distance_t* distances, idx_t* labels) const override;
79
-
80
- void train(idx_t n, const component_t* x) override;
81
-
82
- bool successive_ids;
83
-
84
- /// Synchronize the top-level index (IndexShards) with data in the sub-indices
85
- void syncWithSubIndexes();
86
-
87
- protected:
88
- /// Called just after an index is added
89
- void onAfterAddIndex(IndexT* index) override;
90
-
91
- /// Called just after an index is removed
92
- void onAfterRemoveIndex(IndexT* index) override;
21
+ using idx_t = typename IndexT::idx_t;
22
+ using component_t = typename IndexT::component_t;
23
+ using distance_t = typename IndexT::distance_t;
24
+
25
+ /**
26
+ * The dimension that all sub-indices must share will be the dimension of
27
+ * the first sub-index added
28
+ *
29
+ * @param threaded do we use one thread per sub_index or do
30
+ * queries sequentially?
31
+ * @param successive_ids should we shift the returned ids by
32
+ * the size of each sub-index or return them
33
+ * as they are?
34
+ */
35
+ explicit IndexShardsTemplate(
36
+ bool threaded = false,
37
+ bool successive_ids = true);
38
+
39
+ /**
40
+ * @param threaded do we use one thread per sub_index or do
41
+ * queries sequentially?
42
+ * @param successive_ids should we shift the returned ids by
43
+ * the size of each sub-index or return them
44
+ * as they are?
45
+ */
46
+ explicit IndexShardsTemplate(
47
+ idx_t d,
48
+ bool threaded = false,
49
+ bool successive_ids = true);
50
+
51
+ /// int version due to the implicit bool conversion ambiguity of int as
52
+ /// dimension
53
+ explicit IndexShardsTemplate(
54
+ int d,
55
+ bool threaded = false,
56
+ bool successive_ids = true);
57
+
58
+ /// Alias for addIndex()
59
+ void add_shard(IndexT* index) {
60
+ this->addIndex(index);
61
+ }
62
+
63
+ /// Alias for removeIndex()
64
+ void remove_shard(IndexT* index) {
65
+ this->removeIndex(index);
66
+ }
67
+
68
+ /// supported only for sub-indices that implement add_with_ids
69
+ void add(idx_t n, const component_t* x) override;
70
+
71
+ /**
72
+ * Cases (successive_ids, xids):
73
+ * - true, non-NULL ERROR: it makes no sense to pass in ids and
74
+ * request them to be shifted
75
+ * - true, NULL OK, but should be called only once (calls add()
76
+ * on sub-indexes).
77
+ * - false, non-NULL OK: will call add_with_ids with passed in xids
78
+ * distributed evenly over shards
79
+ * - false, NULL OK: will call add_with_ids on each sub-index,
80
+ * starting at ntotal
81
+ */
82
+ void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
83
+ override;
84
+
85
+ void search(
86
+ idx_t n,
87
+ const component_t* x,
88
+ idx_t k,
89
+ distance_t* distances,
90
+ idx_t* labels) const override;
91
+
92
+ void train(idx_t n, const component_t* x) override;
93
+
94
+ bool successive_ids;
95
+
96
+ /// Synchronize the top-level index (IndexShards) with data in the
97
+ /// sub-indices
98
+ void syncWithSubIndexes();
99
+
100
+ protected:
101
+ /// Called just after an index is added
102
+ void onAfterAddIndex(IndexT* index) override;
103
+
104
+ /// Called just after an index is removed
105
+ void onAfterRemoveIndex(IndexT* index) override;
93
106
  };
94
107
 
95
108
  using IndexShards = IndexShardsTemplate<Index>;
96
109
  using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
97
110
 
98
-
99
111
  } // namespace faiss
@@ -7,15 +7,13 @@
7
7
 
8
8
  // -*- c++ -*-
9
9
 
10
-
11
10
  #include <faiss/MatrixStats.h>
12
11
 
12
+ #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
13
13
 
14
- #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
15
-
14
+ #include <faiss/utils/utils.h>
16
15
  #include <cmath>
17
16
  #include <cstdio>
18
- #include <faiss/utils/utils.h>
19
17
 
20
18
  namespace faiss {
21
19
 
@@ -23,16 +21,19 @@ namespace faiss {
23
21
  * MatrixStats
24
22
  *********************************************************************/
25
23
 
26
- MatrixStats::PerDimStats::PerDimStats():
27
- n(0), n_nan(0), n_inf(0), n0(0),
28
- min(HUGE_VALF), max(-HUGE_VALF),
29
- sum(0), sum2(0),
30
- mean(NAN), stddev(NAN)
31
- {}
32
-
33
-
34
- void MatrixStats::PerDimStats::add (float x)
35
- {
24
+ MatrixStats::PerDimStats::PerDimStats()
25
+ : n(0),
26
+ n_nan(0),
27
+ n_inf(0),
28
+ n0(0),
29
+ min(HUGE_VALF),
30
+ max(-HUGE_VALF),
31
+ sum(0),
32
+ sum2(0),
33
+ mean(NAN),
34
+ stddev(NAN) {}
35
+
36
+ void MatrixStats::PerDimStats::add(float x) {
36
37
  n++;
37
38
  if (std::isnan(x)) {
38
39
  n_nan++;
@@ -42,25 +43,26 @@ void MatrixStats::PerDimStats::add (float x)
42
43
  n_inf++;
43
44
  return;
44
45
  }
45
- if (x == 0) n0++;
46
- if (x < min) min = x;
47
- if (x > max) max = x;
46
+ if (x == 0)
47
+ n0++;
48
+ if (x < min)
49
+ min = x;
50
+ if (x > max)
51
+ max = x;
48
52
  sum += x;
49
53
  sum2 += (double)x * (double)x;
50
54
  }
51
55
 
52
- void MatrixStats::PerDimStats::compute_mean_std ()
53
- {
56
+ void MatrixStats::PerDimStats::compute_mean_std() {
54
57
  n_valid = n - n_nan - n_inf;
55
58
  mean = sum / n_valid;
56
59
  double var = sum2 / n_valid - mean * mean;
57
- if (var < 0) var = 0;
60
+ if (var < 0)
61
+ var = 0;
58
62
  stddev = sqrt(var);
59
63
  }
60
64
 
61
-
62
- void MatrixStats::do_comment (const char *fmt, ...)
63
- {
65
+ void MatrixStats::do_comment(const char* fmt, ...) {
64
66
  va_list ap;
65
67
 
66
68
  /* Determine required size */
@@ -72,57 +74,60 @@ void MatrixStats::do_comment (const char *fmt, ...)
72
74
  buf += size;
73
75
  }
74
76
 
75
-
76
-
77
- MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
78
- n(n), d(d),
79
- n_collision(0), n_valid(0), n0(0),
80
- min_norm2(HUGE_VAL), max_norm2(0)
81
- {
82
- std::vector<char> comment_buf (10000);
83
- buf = comment_buf.data ();
77
+ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
78
+ : n(n),
79
+ d(d),
80
+ n_collision(0),
81
+ n_valid(0),
82
+ n0(0),
83
+ min_norm2(HUGE_VAL),
84
+ max_norm2(0) {
85
+ std::vector<char> comment_buf(10000);
86
+ buf = comment_buf.data();
84
87
  nbuf = comment_buf.size();
85
88
 
86
- do_comment ("analyzing %ld vectors of size %ld\n", n, d);
89
+ do_comment("analyzing %ld vectors of size %ld\n", n, d);
87
90
 
88
91
  if (d > 1024) {
89
- do_comment (
90
- "indexing this many dimensions is hard, "
91
- "please consider dimensionality reducution (with PCAMatrix)\n");
92
+ do_comment(
93
+ "indexing this many dimensions is hard, "
94
+ "please consider dimensionality reducution (with PCAMatrix)\n");
92
95
  }
93
96
 
94
- size_t nbytes = sizeof (x[0]) * d;
95
- per_dim_stats.resize (d);
97
+ size_t nbytes = sizeof(x[0]) * d;
98
+ per_dim_stats.resize(d);
96
99
 
97
100
  for (size_t i = 0; i < n; i++) {
98
- const float *xi = x + d * i;
101
+ const float* xi = x + d * i;
99
102
  double sum2 = 0;
100
103
  for (size_t j = 0; j < d; j++) {
101
- per_dim_stats[j].add (xi[j]);
104
+ per_dim_stats[j].add(xi[j]);
102
105
  sum2 += xi[j] * (double)xi[j];
103
106
  }
104
107
 
105
- if (std::isfinite (sum2)) {
108
+ if (std::isfinite(sum2)) {
106
109
  n_valid++;
107
110
  if (sum2 == 0) {
108
- n0 ++;
111
+ n0++;
109
112
  } else {
110
- if (sum2 < min_norm2) min_norm2 = sum2;
111
- if (sum2 > max_norm2) max_norm2 = sum2;
113
+ if (sum2 < min_norm2)
114
+ min_norm2 = sum2;
115
+ if (sum2 > max_norm2)
116
+ max_norm2 = sum2;
112
117
  }
113
118
  }
114
119
 
115
120
  { // check hash
116
121
  uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
117
- auto elt = occurrences.find (hash);
122
+ auto elt = occurrences.find(hash);
118
123
  if (elt == occurrences.end()) {
119
124
  Occurrence occ = {i, 1};
120
125
  occurrences[hash] = occ;
121
126
  } else {
122
- if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
123
- elt->second.count ++;
127
+ if (!memcmp(xi, x + elt->second.first * d, nbytes)) {
128
+ elt->second.count++;
124
129
  } else {
125
- n_collision ++;
130
+ n_collision++;
126
131
  // we should use a list of collisions but overkill
127
132
  }
128
133
  }
@@ -131,50 +136,59 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
131
136
 
132
137
  // invalid vecor stats
133
138
  if (n_valid == n) {
134
- do_comment ("no NaN or Infs in data\n");
139
+ do_comment("no NaN or Infs in data\n");
135
140
  } else {
136
- do_comment ("%ld vectors contain NaN or Inf "
137
- "(or have too large components), "
138
- "expect bad results with indexing!\n", n - n_valid);
141
+ do_comment(
142
+ "%ld vectors contain NaN or Inf "
143
+ "(or have too large components), "
144
+ "expect bad results with indexing!\n",
145
+ n - n_valid);
139
146
  }
140
147
 
141
148
  // copies in dataset
142
149
  if (occurrences.size() == n) {
143
- do_comment ("all vectors are distinct\n");
150
+ do_comment("all vectors are distinct\n");
144
151
  } else {
145
- do_comment ("%ld vectors are distinct (%.2f%%)\n",
146
- occurrences.size(),
147
- occurrences.size() * 100.0 / n);
152
+ do_comment(
153
+ "%ld vectors are distinct (%.2f%%)\n",
154
+ occurrences.size(),
155
+ occurrences.size() * 100.0 / n);
148
156
 
149
157
  if (n_collision > 0) {
150
- do_comment ("%ld collisions in hash table, "
151
- "counts may be invalid\n", n_collision);
158
+ do_comment(
159
+ "%ld collisions in hash table, "
160
+ "counts may be invalid\n",
161
+ n_collision);
152
162
  }
153
163
 
154
164
  Occurrence max = {0, 0};
155
- for (auto it = occurrences.begin();
156
- it != occurrences.end(); ++it) {
165
+ for (auto it = occurrences.begin(); it != occurrences.end(); ++it) {
157
166
  if (it->second.count > max.count) {
158
167
  max = it->second;
159
168
  }
160
169
  }
161
- do_comment ("vector %ld has %ld copies\n", max.first, max.count);
170
+ do_comment("vector %ld has %ld copies\n", max.first, max.count);
162
171
  }
163
172
 
164
173
  { // norm stats
165
- min_norm2 = sqrt (min_norm2);
166
- max_norm2 = sqrt (max_norm2);
167
- do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n",
168
- min_norm2, max_norm2, n0);
174
+ min_norm2 = sqrt(min_norm2);
175
+ max_norm2 = sqrt(max_norm2);
176
+ do_comment(
177
+ "range of L2 norms=[%g, %g] (%ld null vectors)\n",
178
+ min_norm2,
179
+ max_norm2,
180
+ n0);
169
181
 
170
182
  if (max_norm2 < min_norm2 * 1.0001) {
171
- do_comment ("vectors are normalized, inner product and "
172
- "L2 search are equivalent\n");
183
+ do_comment(
184
+ "vectors are normalized, inner product and "
185
+ "L2 search are equivalent\n");
173
186
  }
174
187
 
175
188
  if (max_norm2 > min_norm2 * 100) {
176
- do_comment ("vectors have very large differences in norms, "
177
- "is this normal?\n");
189
+ do_comment(
190
+ "vectors have very large differences in norms, "
191
+ "is this normal?\n");
178
192
  }
179
193
  }
180
194
 
@@ -185,68 +199,69 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
185
199
  size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
186
200
 
187
201
  for (size_t j = 0; j < d; j++) {
188
- PerDimStats &st = per_dim_stats[j];
189
- st.compute_mean_std ();
202
+ PerDimStats& st = per_dim_stats[j];
203
+ st.compute_mean_std();
190
204
  n0 += st.n0;
191
205
 
192
206
  if (st.max == st.min) {
193
- n_0_range ++;
207
+ n_0_range++;
194
208
  } else if (st.max < 1.001 * st.min) {
195
- n_dangerous_range ++;
209
+ n_dangerous_range++;
196
210
  }
197
211
 
198
- if (st.stddev > max_std) max_std = st.stddev;
199
- if (st.stddev < min_std) min_std = st.stddev;
212
+ if (st.stddev > max_std)
213
+ max_std = st.stddev;
214
+ if (st.stddev < min_std)
215
+ min_std = st.stddev;
200
216
  }
201
217
 
202
-
203
-
204
218
  if (n0 == 0) {
205
- do_comment ("matrix contains no 0s\n");
219
+ do_comment("matrix contains no 0s\n");
206
220
  } else {
207
- do_comment ("matrix contains %.2f %% 0 entries\n",
208
- n0 * 100.0 / (n * d));
221
+ do_comment(
222
+ "matrix contains %.2f %% 0 entries\n",
223
+ n0 * 100.0 / (n * d));
209
224
  }
210
225
 
211
226
  if (n_0_range == 0) {
212
- do_comment ("no constant dimensions\n");
227
+ do_comment("no constant dimensions\n");
213
228
  } else {
214
- do_comment ("%ld dimensions are constant: they can be removed\n",
215
- n_0_range);
229
+ do_comment(
230
+ "%ld dimensions are constant: they can be removed\n",
231
+ n_0_range);
216
232
  }
217
233
 
218
234
  if (n_dangerous_range == 0) {
219
- do_comment ("no dimension has a too large mean\n");
235
+ do_comment("no dimension has a too large mean\n");
220
236
  } else {
221
- do_comment ("%ld dimensions are too large "
222
- "wrt. their variance, may loose precision "
223
- "in IndexFlatL2 (use CenteringTransform)\n",
224
- n_dangerous_range);
237
+ do_comment(
238
+ "%ld dimensions are too large "
239
+ "wrt. their variance, may loose precision "
240
+ "in IndexFlatL2 (use CenteringTransform)\n",
241
+ n_dangerous_range);
225
242
  }
226
243
 
227
- do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
244
+ do_comment("stddevs per dimension are in [%g %g]\n", min_std, max_std);
228
245
 
229
246
  size_t n_small_var = 0;
230
247
 
231
248
  for (size_t j = 0; j < d; j++) {
232
- const PerDimStats &st = per_dim_stats[j];
249
+ const PerDimStats& st = per_dim_stats[j];
233
250
  if (st.stddev < max_std * 1e-4) {
234
251
  n_small_var++;
235
252
  }
236
253
  }
237
254
 
238
255
  if (n_small_var > 0) {
239
- do_comment ("%ld dimensions have negligible stddev wrt. "
240
- "the largest dimension, they could be ignored",
241
- n_small_var);
256
+ do_comment(
257
+ "%ld dimensions have negligible stddev wrt. "
258
+ "the largest dimension, they could be ignored",
259
+ n_small_var);
242
260
  }
243
-
244
261
  }
245
- comments = comment_buf.data ();
262
+ comments = comment_buf.data();
246
263
  buf = nullptr;
247
264
  nbuf = 0;
248
265
  }
249
266
 
250
-
251
-
252
267
  } // namespace faiss
@@ -9,22 +9,20 @@
9
9
 
10
10
  #pragma once
11
11
 
12
- #include <vector>
12
+ #include <stdint.h>
13
13
  #include <string>
14
14
  #include <unordered_map>
15
- #include <stdint.h>
16
-
15
+ #include <vector>
17
16
 
18
17
  namespace faiss {
19
18
 
20
-
21
19
  /** Reports some statistics on a dataset and comments on them.
22
20
  *
23
21
  * It is a class rather than a function so that all stats can also be
24
22
  * accessed from code */
25
23
 
26
24
  struct MatrixStats {
27
- MatrixStats (size_t n, size_t d, const float *x);
25
+ MatrixStats(size_t n, size_t d, const float* x);
28
26
  std::string comments;
29
27
 
30
28
  // raw statistics
@@ -42,8 +40,8 @@ struct MatrixStats {
42
40
  double mean, stddev;
43
41
 
44
42
  PerDimStats();
45
- void add (float x);
46
- void compute_mean_std ();
43
+ void add(float x);
44
+ void compute_mean_std();
47
45
  };
48
46
 
49
47
  std::vector<PerDimStats> per_dim_stats;
@@ -53,10 +51,9 @@ struct MatrixStats {
53
51
  };
54
52
  std::unordered_map<uint64_t, Occurrence> occurrences;
55
53
 
56
- char *buf;
54
+ char* buf;
57
55
  size_t nbuf;
58
- void do_comment (const char *fmt, ...);
59
-
56
+ void do_comment(const char* fmt, ...);
60
57
  };
61
58
 
62
59
  } // namespace faiss