faiss 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +1 -1
- data/README.md +7 -7
- data/ext/faiss/extconf.rb +6 -3
- data/ext/faiss/numo.hpp +4 -4
- data/ext/faiss/utils.cpp +1 -1
- data/ext/faiss/utils.h +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +365 -194
- data/vendor/faiss/faiss/Clustering.h +102 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +126 -224
- data/vendor/faiss/faiss/Index2Layer.h +22 -36
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +195 -0
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +115 -176
- data/vendor/faiss/faiss/IndexFlat.h +42 -59
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +545 -453
- data/vendor/faiss/faiss/IndexIVF.h +169 -118
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +247 -252
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +459 -517
- data/vendor/faiss/faiss/IndexIVFPQ.h +75 -67
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +163 -150
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +38 -25
- data/vendor/faiss/faiss/IndexLSH.cpp +66 -113
- data/vendor/faiss/faiss/IndexLSH.h +20 -38
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +229 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +301 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +387 -495
- data/vendor/faiss/faiss/IndexPQ.h +64 -82
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +139 -127
- data/vendor/faiss/faiss/IndexRefine.h +32 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +111 -172
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -59
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +654 -475
- data/vendor/faiss/faiss/VectorTransform.h +64 -89
- data/vendor/faiss/faiss/clone_index.cpp +78 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +198 -171
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +503 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +175 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +606 -617
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +855 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +244 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +679 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +758 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +188 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +647 -707
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +631 -480
- data/vendor/faiss/faiss/impl/index_write.cpp +547 -407
- data/vendor/faiss/faiss/impl/io.cpp +76 -95
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +60 -29
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
- data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +619 -397
- data/vendor/faiss/faiss/index_factory.h +8 -6
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +305 -312
- data/vendor/faiss/faiss/utils/distances.h +170 -122
- data/vendor/faiss/faiss/utils/distances_simd.cpp +498 -508
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +54 -49
- metadata +29 -4
|
@@ -18,82 +18,94 @@ namespace faiss {
|
|
|
18
18
|
*/
|
|
19
19
|
template <typename IndexT>
|
|
20
20
|
struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
21
|
+
using idx_t = typename IndexT::idx_t;
|
|
22
|
+
using component_t = typename IndexT::component_t;
|
|
23
|
+
using distance_t = typename IndexT::distance_t;
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* The dimension that all sub-indices must share will be the dimension of
|
|
27
|
+
* the first sub-index added
|
|
28
|
+
*
|
|
29
|
+
* @param threaded do we use one thread per sub_index or do
|
|
30
|
+
* queries sequentially?
|
|
31
|
+
* @param successive_ids should we shift the returned ids by
|
|
32
|
+
* the size of each sub-index or return them
|
|
33
|
+
* as they are?
|
|
34
|
+
*/
|
|
35
|
+
explicit IndexShardsTemplate(
|
|
36
|
+
bool threaded = false,
|
|
37
|
+
bool successive_ids = true);
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* @param threaded do we use one thread per sub_index or do
|
|
41
|
+
* queries sequentially?
|
|
42
|
+
* @param successive_ids should we shift the returned ids by
|
|
43
|
+
* the size of each sub-index or return them
|
|
44
|
+
* as they are?
|
|
45
|
+
*/
|
|
46
|
+
explicit IndexShardsTemplate(
|
|
47
|
+
idx_t d,
|
|
48
|
+
bool threaded = false,
|
|
49
|
+
bool successive_ids = true);
|
|
50
|
+
|
|
51
|
+
/// int version due to the implicit bool conversion ambiguity of int as
|
|
52
|
+
/// dimension
|
|
53
|
+
explicit IndexShardsTemplate(
|
|
54
|
+
int d,
|
|
55
|
+
bool threaded = false,
|
|
56
|
+
bool successive_ids = true);
|
|
57
|
+
|
|
58
|
+
/// Alias for addIndex()
|
|
59
|
+
void add_shard(IndexT* index) {
|
|
60
|
+
this->addIndex(index);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/// Alias for removeIndex()
|
|
64
|
+
void remove_shard(IndexT* index) {
|
|
65
|
+
this->removeIndex(index);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
/// supported only for sub-indices that implement add_with_ids
|
|
69
|
+
void add(idx_t n, const component_t* x) override;
|
|
70
|
+
|
|
71
|
+
/**
|
|
72
|
+
* Cases (successive_ids, xids):
|
|
73
|
+
* - true, non-NULL ERROR: it makes no sense to pass in ids and
|
|
74
|
+
* request them to be shifted
|
|
75
|
+
* - true, NULL OK, but should be called only once (calls add()
|
|
76
|
+
* on sub-indexes).
|
|
77
|
+
* - false, non-NULL OK: will call add_with_ids with passed in xids
|
|
78
|
+
* distributed evenly over shards
|
|
79
|
+
* - false, NULL OK: will call add_with_ids on each sub-index,
|
|
80
|
+
* starting at ntotal
|
|
81
|
+
*/
|
|
82
|
+
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
|
|
83
|
+
override;
|
|
84
|
+
|
|
85
|
+
void search(
|
|
86
|
+
idx_t n,
|
|
87
|
+
const component_t* x,
|
|
88
|
+
idx_t k,
|
|
89
|
+
distance_t* distances,
|
|
90
|
+
idx_t* labels) const override;
|
|
91
|
+
|
|
92
|
+
void train(idx_t n, const component_t* x) override;
|
|
93
|
+
|
|
94
|
+
bool successive_ids;
|
|
95
|
+
|
|
96
|
+
/// Synchronize the top-level index (IndexShards) with data in the
|
|
97
|
+
/// sub-indices
|
|
98
|
+
void syncWithSubIndexes();
|
|
99
|
+
|
|
100
|
+
protected:
|
|
101
|
+
/// Called just after an index is added
|
|
102
|
+
void onAfterAddIndex(IndexT* index) override;
|
|
103
|
+
|
|
104
|
+
/// Called just after an index is removed
|
|
105
|
+
void onAfterRemoveIndex(IndexT* index) override;
|
|
93
106
|
};
|
|
94
107
|
|
|
95
108
|
using IndexShards = IndexShardsTemplate<Index>;
|
|
96
109
|
using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
|
|
97
110
|
|
|
98
|
-
|
|
99
111
|
} // namespace faiss
|
|
@@ -7,15 +7,13 @@
|
|
|
7
7
|
|
|
8
8
|
// -*- c++ -*-
|
|
9
9
|
|
|
10
|
-
|
|
11
10
|
#include <faiss/MatrixStats.h>
|
|
12
11
|
|
|
12
|
+
#include <stdarg.h> /* va_list, va_start, va_arg, va_end */
|
|
13
13
|
|
|
14
|
-
#include <
|
|
15
|
-
|
|
14
|
+
#include <faiss/utils/utils.h>
|
|
16
15
|
#include <cmath>
|
|
17
16
|
#include <cstdio>
|
|
18
|
-
#include <faiss/utils/utils.h>
|
|
19
17
|
|
|
20
18
|
namespace faiss {
|
|
21
19
|
|
|
@@ -23,16 +21,19 @@ namespace faiss {
|
|
|
23
21
|
* MatrixStats
|
|
24
22
|
*********************************************************************/
|
|
25
23
|
|
|
26
|
-
MatrixStats::PerDimStats::PerDimStats()
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
24
|
+
MatrixStats::PerDimStats::PerDimStats()
|
|
25
|
+
: n(0),
|
|
26
|
+
n_nan(0),
|
|
27
|
+
n_inf(0),
|
|
28
|
+
n0(0),
|
|
29
|
+
min(HUGE_VALF),
|
|
30
|
+
max(-HUGE_VALF),
|
|
31
|
+
sum(0),
|
|
32
|
+
sum2(0),
|
|
33
|
+
mean(NAN),
|
|
34
|
+
stddev(NAN) {}
|
|
35
|
+
|
|
36
|
+
void MatrixStats::PerDimStats::add(float x) {
|
|
36
37
|
n++;
|
|
37
38
|
if (std::isnan(x)) {
|
|
38
39
|
n_nan++;
|
|
@@ -42,25 +43,26 @@ void MatrixStats::PerDimStats::add (float x)
|
|
|
42
43
|
n_inf++;
|
|
43
44
|
return;
|
|
44
45
|
}
|
|
45
|
-
if (x == 0)
|
|
46
|
-
|
|
47
|
-
if (x
|
|
46
|
+
if (x == 0)
|
|
47
|
+
n0++;
|
|
48
|
+
if (x < min)
|
|
49
|
+
min = x;
|
|
50
|
+
if (x > max)
|
|
51
|
+
max = x;
|
|
48
52
|
sum += x;
|
|
49
53
|
sum2 += (double)x * (double)x;
|
|
50
54
|
}
|
|
51
55
|
|
|
52
|
-
void MatrixStats::PerDimStats::compute_mean_std
|
|
53
|
-
{
|
|
56
|
+
void MatrixStats::PerDimStats::compute_mean_std() {
|
|
54
57
|
n_valid = n - n_nan - n_inf;
|
|
55
58
|
mean = sum / n_valid;
|
|
56
59
|
double var = sum2 / n_valid - mean * mean;
|
|
57
|
-
if (var < 0)
|
|
60
|
+
if (var < 0)
|
|
61
|
+
var = 0;
|
|
58
62
|
stddev = sqrt(var);
|
|
59
63
|
}
|
|
60
64
|
|
|
61
|
-
|
|
62
|
-
void MatrixStats::do_comment (const char *fmt, ...)
|
|
63
|
-
{
|
|
65
|
+
void MatrixStats::do_comment(const char* fmt, ...) {
|
|
64
66
|
va_list ap;
|
|
65
67
|
|
|
66
68
|
/* Determine required size */
|
|
@@ -72,57 +74,60 @@ void MatrixStats::do_comment (const char *fmt, ...)
|
|
|
72
74
|
buf += size;
|
|
73
75
|
}
|
|
74
76
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
77
|
+
MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
|
|
78
|
+
: n(n),
|
|
79
|
+
d(d),
|
|
80
|
+
n_collision(0),
|
|
81
|
+
n_valid(0),
|
|
82
|
+
n0(0),
|
|
83
|
+
min_norm2(HUGE_VAL),
|
|
84
|
+
max_norm2(0) {
|
|
85
|
+
std::vector<char> comment_buf(10000);
|
|
86
|
+
buf = comment_buf.data();
|
|
84
87
|
nbuf = comment_buf.size();
|
|
85
88
|
|
|
86
|
-
do_comment
|
|
89
|
+
do_comment("analyzing %ld vectors of size %ld\n", n, d);
|
|
87
90
|
|
|
88
91
|
if (d > 1024) {
|
|
89
|
-
do_comment
|
|
90
|
-
|
|
91
|
-
|
|
92
|
+
do_comment(
|
|
93
|
+
"indexing this many dimensions is hard, "
|
|
94
|
+
"please consider dimensionality reducution (with PCAMatrix)\n");
|
|
92
95
|
}
|
|
93
96
|
|
|
94
|
-
size_t nbytes = sizeof
|
|
95
|
-
per_dim_stats.resize
|
|
97
|
+
size_t nbytes = sizeof(x[0]) * d;
|
|
98
|
+
per_dim_stats.resize(d);
|
|
96
99
|
|
|
97
100
|
for (size_t i = 0; i < n; i++) {
|
|
98
|
-
const float
|
|
101
|
+
const float* xi = x + d * i;
|
|
99
102
|
double sum2 = 0;
|
|
100
103
|
for (size_t j = 0; j < d; j++) {
|
|
101
|
-
per_dim_stats[j].add
|
|
104
|
+
per_dim_stats[j].add(xi[j]);
|
|
102
105
|
sum2 += xi[j] * (double)xi[j];
|
|
103
106
|
}
|
|
104
107
|
|
|
105
|
-
if (std::isfinite
|
|
108
|
+
if (std::isfinite(sum2)) {
|
|
106
109
|
n_valid++;
|
|
107
110
|
if (sum2 == 0) {
|
|
108
|
-
n0
|
|
111
|
+
n0++;
|
|
109
112
|
} else {
|
|
110
|
-
if (sum2 < min_norm2)
|
|
111
|
-
|
|
113
|
+
if (sum2 < min_norm2)
|
|
114
|
+
min_norm2 = sum2;
|
|
115
|
+
if (sum2 > max_norm2)
|
|
116
|
+
max_norm2 = sum2;
|
|
112
117
|
}
|
|
113
118
|
}
|
|
114
119
|
|
|
115
120
|
{ // check hash
|
|
116
121
|
uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
|
|
117
|
-
auto elt = occurrences.find
|
|
122
|
+
auto elt = occurrences.find(hash);
|
|
118
123
|
if (elt == occurrences.end()) {
|
|
119
124
|
Occurrence occ = {i, 1};
|
|
120
125
|
occurrences[hash] = occ;
|
|
121
126
|
} else {
|
|
122
|
-
if (!memcmp
|
|
123
|
-
elt->second.count
|
|
127
|
+
if (!memcmp(xi, x + elt->second.first * d, nbytes)) {
|
|
128
|
+
elt->second.count++;
|
|
124
129
|
} else {
|
|
125
|
-
n_collision
|
|
130
|
+
n_collision++;
|
|
126
131
|
// we should use a list of collisions but overkill
|
|
127
132
|
}
|
|
128
133
|
}
|
|
@@ -131,50 +136,59 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
|
|
|
131
136
|
|
|
132
137
|
// invalid vector stats
|
|
133
138
|
if (n_valid == n) {
|
|
134
|
-
do_comment
|
|
139
|
+
do_comment("no NaN or Infs in data\n");
|
|
135
140
|
} else {
|
|
136
|
-
do_comment
|
|
137
|
-
|
|
138
|
-
|
|
141
|
+
do_comment(
|
|
142
|
+
"%ld vectors contain NaN or Inf "
|
|
143
|
+
"(or have too large components), "
|
|
144
|
+
"expect bad results with indexing!\n",
|
|
145
|
+
n - n_valid);
|
|
139
146
|
}
|
|
140
147
|
|
|
141
148
|
// copies in dataset
|
|
142
149
|
if (occurrences.size() == n) {
|
|
143
|
-
do_comment
|
|
150
|
+
do_comment("all vectors are distinct\n");
|
|
144
151
|
} else {
|
|
145
|
-
do_comment
|
|
146
|
-
|
|
147
|
-
|
|
152
|
+
do_comment(
|
|
153
|
+
"%ld vectors are distinct (%.2f%%)\n",
|
|
154
|
+
occurrences.size(),
|
|
155
|
+
occurrences.size() * 100.0 / n);
|
|
148
156
|
|
|
149
157
|
if (n_collision > 0) {
|
|
150
|
-
do_comment
|
|
151
|
-
|
|
158
|
+
do_comment(
|
|
159
|
+
"%ld collisions in hash table, "
|
|
160
|
+
"counts may be invalid\n",
|
|
161
|
+
n_collision);
|
|
152
162
|
}
|
|
153
163
|
|
|
154
164
|
Occurrence max = {0, 0};
|
|
155
|
-
for (auto it = occurrences.begin();
|
|
156
|
-
it != occurrences.end(); ++it) {
|
|
165
|
+
for (auto it = occurrences.begin(); it != occurrences.end(); ++it) {
|
|
157
166
|
if (it->second.count > max.count) {
|
|
158
167
|
max = it->second;
|
|
159
168
|
}
|
|
160
169
|
}
|
|
161
|
-
do_comment
|
|
170
|
+
do_comment("vector %ld has %ld copies\n", max.first, max.count);
|
|
162
171
|
}
|
|
163
172
|
|
|
164
173
|
{ // norm stats
|
|
165
|
-
min_norm2 = sqrt
|
|
166
|
-
max_norm2 = sqrt
|
|
167
|
-
do_comment
|
|
168
|
-
|
|
174
|
+
min_norm2 = sqrt(min_norm2);
|
|
175
|
+
max_norm2 = sqrt(max_norm2);
|
|
176
|
+
do_comment(
|
|
177
|
+
"range of L2 norms=[%g, %g] (%ld null vectors)\n",
|
|
178
|
+
min_norm2,
|
|
179
|
+
max_norm2,
|
|
180
|
+
n0);
|
|
169
181
|
|
|
170
182
|
if (max_norm2 < min_norm2 * 1.0001) {
|
|
171
|
-
do_comment
|
|
172
|
-
|
|
183
|
+
do_comment(
|
|
184
|
+
"vectors are normalized, inner product and "
|
|
185
|
+
"L2 search are equivalent\n");
|
|
173
186
|
}
|
|
174
187
|
|
|
175
188
|
if (max_norm2 > min_norm2 * 100) {
|
|
176
|
-
do_comment
|
|
177
|
-
|
|
189
|
+
do_comment(
|
|
190
|
+
"vectors have very large differences in norms, "
|
|
191
|
+
"is this normal?\n");
|
|
178
192
|
}
|
|
179
193
|
}
|
|
180
194
|
|
|
@@ -185,68 +199,69 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
|
|
|
185
199
|
size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
|
|
186
200
|
|
|
187
201
|
for (size_t j = 0; j < d; j++) {
|
|
188
|
-
PerDimStats
|
|
189
|
-
st.compute_mean_std
|
|
202
|
+
PerDimStats& st = per_dim_stats[j];
|
|
203
|
+
st.compute_mean_std();
|
|
190
204
|
n0 += st.n0;
|
|
191
205
|
|
|
192
206
|
if (st.max == st.min) {
|
|
193
|
-
n_0_range
|
|
207
|
+
n_0_range++;
|
|
194
208
|
} else if (st.max < 1.001 * st.min) {
|
|
195
|
-
n_dangerous_range
|
|
209
|
+
n_dangerous_range++;
|
|
196
210
|
}
|
|
197
211
|
|
|
198
|
-
if (st.stddev > max_std)
|
|
199
|
-
|
|
212
|
+
if (st.stddev > max_std)
|
|
213
|
+
max_std = st.stddev;
|
|
214
|
+
if (st.stddev < min_std)
|
|
215
|
+
min_std = st.stddev;
|
|
200
216
|
}
|
|
201
217
|
|
|
202
|
-
|
|
203
|
-
|
|
204
218
|
if (n0 == 0) {
|
|
205
|
-
do_comment
|
|
219
|
+
do_comment("matrix contains no 0s\n");
|
|
206
220
|
} else {
|
|
207
|
-
do_comment
|
|
208
|
-
|
|
221
|
+
do_comment(
|
|
222
|
+
"matrix contains %.2f %% 0 entries\n",
|
|
223
|
+
n0 * 100.0 / (n * d));
|
|
209
224
|
}
|
|
210
225
|
|
|
211
226
|
if (n_0_range == 0) {
|
|
212
|
-
do_comment
|
|
227
|
+
do_comment("no constant dimensions\n");
|
|
213
228
|
} else {
|
|
214
|
-
do_comment
|
|
215
|
-
|
|
229
|
+
do_comment(
|
|
230
|
+
"%ld dimensions are constant: they can be removed\n",
|
|
231
|
+
n_0_range);
|
|
216
232
|
}
|
|
217
233
|
|
|
218
234
|
if (n_dangerous_range == 0) {
|
|
219
|
-
do_comment
|
|
235
|
+
do_comment("no dimension has a too large mean\n");
|
|
220
236
|
} else {
|
|
221
|
-
do_comment
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
237
|
+
do_comment(
|
|
238
|
+
"%ld dimensions are too large "
|
|
239
|
+
"wrt. their variance, may loose precision "
|
|
240
|
+
"in IndexFlatL2 (use CenteringTransform)\n",
|
|
241
|
+
n_dangerous_range);
|
|
225
242
|
}
|
|
226
243
|
|
|
227
|
-
do_comment
|
|
244
|
+
do_comment("stddevs per dimension are in [%g %g]\n", min_std, max_std);
|
|
228
245
|
|
|
229
246
|
size_t n_small_var = 0;
|
|
230
247
|
|
|
231
248
|
for (size_t j = 0; j < d; j++) {
|
|
232
|
-
const PerDimStats
|
|
249
|
+
const PerDimStats& st = per_dim_stats[j];
|
|
233
250
|
if (st.stddev < max_std * 1e-4) {
|
|
234
251
|
n_small_var++;
|
|
235
252
|
}
|
|
236
253
|
}
|
|
237
254
|
|
|
238
255
|
if (n_small_var > 0) {
|
|
239
|
-
do_comment
|
|
240
|
-
|
|
241
|
-
|
|
256
|
+
do_comment(
|
|
257
|
+
"%ld dimensions have negligible stddev wrt. "
|
|
258
|
+
"the largest dimension, they could be ignored",
|
|
259
|
+
n_small_var);
|
|
242
260
|
}
|
|
243
|
-
|
|
244
261
|
}
|
|
245
|
-
comments = comment_buf.data
|
|
262
|
+
comments = comment_buf.data();
|
|
246
263
|
buf = nullptr;
|
|
247
264
|
nbuf = 0;
|
|
248
265
|
}
|
|
249
266
|
|
|
250
|
-
|
|
251
|
-
|
|
252
267
|
} // namespace faiss
|
|
@@ -9,22 +9,20 @@
|
|
|
9
9
|
|
|
10
10
|
#pragma once
|
|
11
11
|
|
|
12
|
-
#include <
|
|
12
|
+
#include <stdint.h>
|
|
13
13
|
#include <string>
|
|
14
14
|
#include <unordered_map>
|
|
15
|
-
#include <
|
|
16
|
-
|
|
15
|
+
#include <vector>
|
|
17
16
|
|
|
18
17
|
namespace faiss {
|
|
19
18
|
|
|
20
|
-
|
|
21
19
|
/** Reports some statistics on a dataset and comments on them.
|
|
22
20
|
*
|
|
23
21
|
* It is a class rather than a function so that all stats can also be
|
|
24
22
|
* accessed from code */
|
|
25
23
|
|
|
26
24
|
struct MatrixStats {
|
|
27
|
-
MatrixStats
|
|
25
|
+
MatrixStats(size_t n, size_t d, const float* x);
|
|
28
26
|
std::string comments;
|
|
29
27
|
|
|
30
28
|
// raw statistics
|
|
@@ -42,8 +40,8 @@ struct MatrixStats {
|
|
|
42
40
|
double mean, stddev;
|
|
43
41
|
|
|
44
42
|
PerDimStats();
|
|
45
|
-
void add
|
|
46
|
-
void compute_mean_std
|
|
43
|
+
void add(float x);
|
|
44
|
+
void compute_mean_std();
|
|
47
45
|
};
|
|
48
46
|
|
|
49
47
|
std::vector<PerDimStats> per_dim_stats;
|
|
@@ -53,10 +51,9 @@ struct MatrixStats {
|
|
|
53
51
|
};
|
|
54
52
|
std::unordered_map<uint64_t, Occurrence> occurrences;
|
|
55
53
|
|
|
56
|
-
char
|
|
54
|
+
char* buf;
|
|
57
55
|
size_t nbuf;
|
|
58
|
-
void do_comment
|
|
59
|
-
|
|
56
|
+
void do_comment(const char* fmt, ...);
|
|
60
57
|
};
|
|
61
58
|
|
|
62
59
|
} // namespace faiss
|