faiss 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +103 -3
- data/ext/faiss/ext.cpp +99 -32
- data/ext/faiss/extconf.rb +12 -2
- data/lib/faiss/ext.bundle +0 -0
- data/lib/faiss/index.rb +3 -3
- data/lib/faiss/index_binary.rb +3 -3
- data/lib/faiss/kmeans.rb +1 -1
- data/lib/faiss/pca_matrix.rb +2 -2
- data/lib/faiss/product_quantizer.rb +3 -3
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/AutoTune.cpp +719 -0
- data/vendor/faiss/AutoTune.h +212 -0
- data/vendor/faiss/Clustering.cpp +261 -0
- data/vendor/faiss/Clustering.h +101 -0
- data/vendor/faiss/IVFlib.cpp +339 -0
- data/vendor/faiss/IVFlib.h +132 -0
- data/vendor/faiss/Index.cpp +171 -0
- data/vendor/faiss/Index.h +261 -0
- data/vendor/faiss/Index2Layer.cpp +437 -0
- data/vendor/faiss/Index2Layer.h +85 -0
- data/vendor/faiss/IndexBinary.cpp +77 -0
- data/vendor/faiss/IndexBinary.h +163 -0
- data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
- data/vendor/faiss/IndexBinaryFlat.h +54 -0
- data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
- data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
- data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
- data/vendor/faiss/IndexBinaryHNSW.h +56 -0
- data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
- data/vendor/faiss/IndexBinaryIVF.h +211 -0
- data/vendor/faiss/IndexFlat.cpp +508 -0
- data/vendor/faiss/IndexFlat.h +175 -0
- data/vendor/faiss/IndexHNSW.cpp +1090 -0
- data/vendor/faiss/IndexHNSW.h +170 -0
- data/vendor/faiss/IndexIVF.cpp +909 -0
- data/vendor/faiss/IndexIVF.h +353 -0
- data/vendor/faiss/IndexIVFFlat.cpp +502 -0
- data/vendor/faiss/IndexIVFFlat.h +118 -0
- data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
- data/vendor/faiss/IndexIVFPQ.h +161 -0
- data/vendor/faiss/IndexIVFPQR.cpp +219 -0
- data/vendor/faiss/IndexIVFPQR.h +65 -0
- data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
- data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
- data/vendor/faiss/IndexLSH.cpp +225 -0
- data/vendor/faiss/IndexLSH.h +87 -0
- data/vendor/faiss/IndexLattice.cpp +143 -0
- data/vendor/faiss/IndexLattice.h +68 -0
- data/vendor/faiss/IndexPQ.cpp +1188 -0
- data/vendor/faiss/IndexPQ.h +199 -0
- data/vendor/faiss/IndexPreTransform.cpp +288 -0
- data/vendor/faiss/IndexPreTransform.h +91 -0
- data/vendor/faiss/IndexReplicas.cpp +123 -0
- data/vendor/faiss/IndexReplicas.h +76 -0
- data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
- data/vendor/faiss/IndexScalarQuantizer.h +127 -0
- data/vendor/faiss/IndexShards.cpp +317 -0
- data/vendor/faiss/IndexShards.h +100 -0
- data/vendor/faiss/InvertedLists.cpp +623 -0
- data/vendor/faiss/InvertedLists.h +334 -0
- data/vendor/faiss/LICENSE +21 -0
- data/vendor/faiss/MatrixStats.cpp +252 -0
- data/vendor/faiss/MatrixStats.h +62 -0
- data/vendor/faiss/MetaIndexes.cpp +351 -0
- data/vendor/faiss/MetaIndexes.h +126 -0
- data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
- data/vendor/faiss/OnDiskInvertedLists.h +127 -0
- data/vendor/faiss/VectorTransform.cpp +1157 -0
- data/vendor/faiss/VectorTransform.h +322 -0
- data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
- data/vendor/faiss/c_api/AutoTune_c.h +64 -0
- data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
- data/vendor/faiss/c_api/Clustering_c.h +117 -0
- data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
- data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
- data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
- data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
- data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
- data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
- data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
- data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
- data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
- data/vendor/faiss/c_api/IndexShards_c.h +42 -0
- data/vendor/faiss/c_api/Index_c.cpp +105 -0
- data/vendor/faiss/c_api/Index_c.h +183 -0
- data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
- data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
- data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
- data/vendor/faiss/c_api/clone_index_c.h +32 -0
- data/vendor/faiss/c_api/error_c.h +42 -0
- data/vendor/faiss/c_api/error_impl.cpp +27 -0
- data/vendor/faiss/c_api/error_impl.h +16 -0
- data/vendor/faiss/c_api/faiss_c.h +58 -0
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
- data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
- data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
- data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
- data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
- data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
- data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
- data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
- data/vendor/faiss/c_api/index_factory_c.h +30 -0
- data/vendor/faiss/c_api/index_io_c.cpp +42 -0
- data/vendor/faiss/c_api/index_io_c.h +50 -0
- data/vendor/faiss/c_api/macros_impl.h +110 -0
- data/vendor/faiss/clone_index.cpp +147 -0
- data/vendor/faiss/clone_index.h +38 -0
- data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
- data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
- data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
- data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
- data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
- data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
- data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
- data/vendor/faiss/gpu/GpuCloner.h +82 -0
- data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
- data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
- data/vendor/faiss/gpu/GpuDistance.h +52 -0
- data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
- data/vendor/faiss/gpu/GpuIndex.h +148 -0
- data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
- data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
- data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
- data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
- data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
- data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
- data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
- data/vendor/faiss/gpu/GpuResources.cpp +52 -0
- data/vendor/faiss/gpu/GpuResources.h +73 -0
- data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
- data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
- data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
- data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
- data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
- data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
- data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
- data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
- data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
- data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
- data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
- data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
- data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
- data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
- data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
- data/vendor/faiss/gpu/test/TestUtils.h +93 -0
- data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
- data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
- data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
- data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
- data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
- data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
- data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
- data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
- data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
- data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
- data/vendor/faiss/gpu/utils/Timer.h +52 -0
- data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
- data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
- data/vendor/faiss/impl/FaissAssert.h +95 -0
- data/vendor/faiss/impl/FaissException.cpp +66 -0
- data/vendor/faiss/impl/FaissException.h +71 -0
- data/vendor/faiss/impl/HNSW.cpp +818 -0
- data/vendor/faiss/impl/HNSW.h +275 -0
- data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
- data/vendor/faiss/impl/PolysemousTraining.h +158 -0
- data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
- data/vendor/faiss/impl/ProductQuantizer.h +242 -0
- data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
- data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
- data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
- data/vendor/faiss/impl/ThreadedIndex.h +80 -0
- data/vendor/faiss/impl/index_read.cpp +793 -0
- data/vendor/faiss/impl/index_write.cpp +558 -0
- data/vendor/faiss/impl/io.cpp +142 -0
- data/vendor/faiss/impl/io.h +98 -0
- data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
- data/vendor/faiss/impl/lattice_Zn.h +199 -0
- data/vendor/faiss/index_factory.cpp +392 -0
- data/vendor/faiss/index_factory.h +25 -0
- data/vendor/faiss/index_io.h +75 -0
- data/vendor/faiss/misc/test_blas.cpp +84 -0
- data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
- data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
- data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
- data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
- data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
- data/vendor/faiss/tests/test_merge.cpp +258 -0
- data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
- data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
- data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
- data/vendor/faiss/tests/test_params_override.cpp +231 -0
- data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
- data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
- data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
- data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
- data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
- data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
- data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
- data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
- data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
- data/vendor/faiss/utils/Heap.cpp +122 -0
- data/vendor/faiss/utils/Heap.h +495 -0
- data/vendor/faiss/utils/WorkerThread.cpp +126 -0
- data/vendor/faiss/utils/WorkerThread.h +61 -0
- data/vendor/faiss/utils/distances.cpp +765 -0
- data/vendor/faiss/utils/distances.h +243 -0
- data/vendor/faiss/utils/distances_simd.cpp +809 -0
- data/vendor/faiss/utils/extra_distances.cpp +336 -0
- data/vendor/faiss/utils/extra_distances.h +54 -0
- data/vendor/faiss/utils/hamming-inl.h +472 -0
- data/vendor/faiss/utils/hamming.cpp +792 -0
- data/vendor/faiss/utils/hamming.h +220 -0
- data/vendor/faiss/utils/random.cpp +192 -0
- data/vendor/faiss/utils/random.h +60 -0
- data/vendor/faiss/utils/utils.cpp +783 -0
- data/vendor/faiss/utils/utils.h +181 -0
- metadata +216 -2
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// -*- c++ -*-
|
|
9
|
+
|
|
10
|
+
#ifndef FAISS_INVERTEDLISTS_IVF_H
|
|
11
|
+
#define FAISS_INVERTEDLISTS_IVF_H
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Definition of inverted lists + a few common classes that implement
|
|
15
|
+
* the interface.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
#include <vector>
|
|
19
|
+
#include <faiss/Index.h>
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
namespace faiss {
|
|
23
|
+
|
|
24
|
+
/** Table of inverted lists
|
|
25
|
+
* multithreading rules:
|
|
26
|
+
* - concurrent read accesses are allowed
|
|
27
|
+
* - concurrent update accesses are allowed
|
|
28
|
+
* - for resize and add_entries, only concurrent access to different lists
|
|
29
|
+
* are allowed
|
|
30
|
+
*/
|
|
31
|
+
struct InvertedLists {
|
|
32
|
+
typedef Index::idx_t idx_t;
|
|
33
|
+
|
|
34
|
+
size_t nlist; ///< number of possible key values
|
|
35
|
+
size_t code_size; ///< code size per vector in bytes
|
|
36
|
+
|
|
37
|
+
InvertedLists (size_t nlist, size_t code_size);
|
|
38
|
+
|
|
39
|
+
/*************************
|
|
40
|
+
* Read only functions */
|
|
41
|
+
|
|
42
|
+
/// get the size of a list
|
|
43
|
+
virtual size_t list_size(size_t list_no) const = 0;
|
|
44
|
+
|
|
45
|
+
/** get the codes for an inverted list
|
|
46
|
+
* must be released by release_codes
|
|
47
|
+
*
|
|
48
|
+
* @return codes size list_size * code_size
|
|
49
|
+
*/
|
|
50
|
+
virtual const uint8_t * get_codes (size_t list_no) const = 0;
|
|
51
|
+
|
|
52
|
+
/** get the ids for an inverted list
|
|
53
|
+
* must be released by release_ids
|
|
54
|
+
*
|
|
55
|
+
* @return ids size list_size
|
|
56
|
+
*/
|
|
57
|
+
virtual const idx_t * get_ids (size_t list_no) const = 0;
|
|
58
|
+
|
|
59
|
+
/// release codes returned by get_codes (default implementation is nop
|
|
60
|
+
virtual void release_codes (size_t list_no, const uint8_t *codes) const;
|
|
61
|
+
|
|
62
|
+
/// release ids returned by get_ids
|
|
63
|
+
virtual void release_ids (size_t list_no, const idx_t *ids) const;
|
|
64
|
+
|
|
65
|
+
/// @return a single id in an inverted list
|
|
66
|
+
virtual idx_t get_single_id (size_t list_no, size_t offset) const;
|
|
67
|
+
|
|
68
|
+
/// @return a single code in an inverted list
|
|
69
|
+
/// (should be deallocated with release_codes)
|
|
70
|
+
virtual const uint8_t * get_single_code (
|
|
71
|
+
size_t list_no, size_t offset) const;
|
|
72
|
+
|
|
73
|
+
/// prepare the following lists (default does nothing)
|
|
74
|
+
/// a list can be -1 hence the signed long
|
|
75
|
+
virtual void prefetch_lists (const idx_t *list_nos, int nlist) const;
|
|
76
|
+
|
|
77
|
+
/*************************
|
|
78
|
+
* writing functions */
|
|
79
|
+
|
|
80
|
+
/// add one entry to an inverted list
|
|
81
|
+
virtual size_t add_entry (size_t list_no, idx_t theid,
|
|
82
|
+
const uint8_t *code);
|
|
83
|
+
|
|
84
|
+
virtual size_t add_entries (
|
|
85
|
+
size_t list_no, size_t n_entry,
|
|
86
|
+
const idx_t* ids, const uint8_t *code) = 0;
|
|
87
|
+
|
|
88
|
+
virtual void update_entry (size_t list_no, size_t offset,
|
|
89
|
+
idx_t id, const uint8_t *code);
|
|
90
|
+
|
|
91
|
+
virtual void update_entries (size_t list_no, size_t offset, size_t n_entry,
|
|
92
|
+
const idx_t *ids, const uint8_t *code) = 0;
|
|
93
|
+
|
|
94
|
+
virtual void resize (size_t list_no, size_t new_size) = 0;
|
|
95
|
+
|
|
96
|
+
virtual void reset ();
|
|
97
|
+
|
|
98
|
+
/// move all entries from oivf (empty on output)
|
|
99
|
+
void merge_from (InvertedLists *oivf, size_t add_id);
|
|
100
|
+
|
|
101
|
+
virtual ~InvertedLists ();
|
|
102
|
+
|
|
103
|
+
/*************************
|
|
104
|
+
* statistics */
|
|
105
|
+
|
|
106
|
+
/// 1= perfectly balanced, >1: imbalanced
|
|
107
|
+
double imbalance_factor () const;
|
|
108
|
+
|
|
109
|
+
/// display some stats about the inverted lists
|
|
110
|
+
void print_stats () const;
|
|
111
|
+
|
|
112
|
+
/// sum up list sizes
|
|
113
|
+
size_t compute_ntotal () const;
|
|
114
|
+
|
|
115
|
+
/**************************************
|
|
116
|
+
* Scoped inverted lists (for automatic deallocation)
|
|
117
|
+
*
|
|
118
|
+
* instead of writing:
|
|
119
|
+
*
|
|
120
|
+
* uint8_t * codes = invlists->get_codes (10);
|
|
121
|
+
* ... use codes
|
|
122
|
+
* invlists->release_codes(10, codes)
|
|
123
|
+
*
|
|
124
|
+
* write:
|
|
125
|
+
*
|
|
126
|
+
* ScopedCodes codes (invlists, 10);
|
|
127
|
+
* ... use codes.get()
|
|
128
|
+
* // release called automatically when codes goes out of scope
|
|
129
|
+
*
|
|
130
|
+
* the following function call also works:
|
|
131
|
+
*
|
|
132
|
+
* foo (123, ScopedCodes (invlists, 10).get(), 456);
|
|
133
|
+
*
|
|
134
|
+
*/
|
|
135
|
+
|
|
136
|
+
struct ScopedIds {
|
|
137
|
+
const InvertedLists *il;
|
|
138
|
+
const idx_t *ids;
|
|
139
|
+
size_t list_no;
|
|
140
|
+
|
|
141
|
+
ScopedIds (const InvertedLists *il, size_t list_no):
|
|
142
|
+
il (il), ids (il->get_ids (list_no)), list_no (list_no)
|
|
143
|
+
{}
|
|
144
|
+
|
|
145
|
+
const idx_t *get() {return ids; }
|
|
146
|
+
|
|
147
|
+
idx_t operator [] (size_t i) const {
|
|
148
|
+
return ids[i];
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
~ScopedIds () {
|
|
152
|
+
il->release_ids (list_no, ids);
|
|
153
|
+
}
|
|
154
|
+
};
|
|
155
|
+
|
|
156
|
+
struct ScopedCodes {
|
|
157
|
+
const InvertedLists *il;
|
|
158
|
+
const uint8_t *codes;
|
|
159
|
+
size_t list_no;
|
|
160
|
+
|
|
161
|
+
ScopedCodes (const InvertedLists *il, size_t list_no):
|
|
162
|
+
il (il), codes (il->get_codes (list_no)), list_no (list_no)
|
|
163
|
+
{}
|
|
164
|
+
|
|
165
|
+
ScopedCodes (const InvertedLists *il, size_t list_no, size_t offset):
|
|
166
|
+
il (il), codes (il->get_single_code (list_no, offset)),
|
|
167
|
+
list_no (list_no)
|
|
168
|
+
{}
|
|
169
|
+
|
|
170
|
+
const uint8_t *get() {return codes; }
|
|
171
|
+
|
|
172
|
+
~ScopedCodes () {
|
|
173
|
+
il->release_codes (list_no, codes);
|
|
174
|
+
}
|
|
175
|
+
};
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
};
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
/// simple (default) implementation as an array of inverted lists
|
|
182
|
+
struct ArrayInvertedLists: InvertedLists {
|
|
183
|
+
std::vector < std::vector<uint8_t> > codes; // binary codes, size nlist
|
|
184
|
+
std::vector < std::vector<idx_t> > ids; ///< Inverted lists for indexes
|
|
185
|
+
|
|
186
|
+
ArrayInvertedLists (size_t nlist, size_t code_size);
|
|
187
|
+
|
|
188
|
+
size_t list_size(size_t list_no) const override;
|
|
189
|
+
const uint8_t * get_codes (size_t list_no) const override;
|
|
190
|
+
const idx_t * get_ids (size_t list_no) const override;
|
|
191
|
+
|
|
192
|
+
size_t add_entries (
|
|
193
|
+
size_t list_no, size_t n_entry,
|
|
194
|
+
const idx_t* ids, const uint8_t *code) override;
|
|
195
|
+
|
|
196
|
+
void update_entries (size_t list_no, size_t offset, size_t n_entry,
|
|
197
|
+
const idx_t *ids, const uint8_t *code) override;
|
|
198
|
+
|
|
199
|
+
void resize (size_t list_no, size_t new_size) override;
|
|
200
|
+
|
|
201
|
+
virtual ~ArrayInvertedLists ();
|
|
202
|
+
};
|
|
203
|
+
|
|
204
|
+
/*****************************************************************
|
|
205
|
+
* Meta-inverted lists
|
|
206
|
+
*
|
|
207
|
+
* About terminology: the inverted lists are seen as a sparse matrix,
|
|
208
|
+
* that can be stacked horizontally, vertically and sliced.
|
|
209
|
+
*****************************************************************/
|
|
210
|
+
|
|
211
|
+
struct ReadOnlyInvertedLists: InvertedLists {
|
|
212
|
+
|
|
213
|
+
ReadOnlyInvertedLists (size_t nlist, size_t code_size):
|
|
214
|
+
InvertedLists (nlist, code_size) {}
|
|
215
|
+
|
|
216
|
+
size_t add_entries (
|
|
217
|
+
size_t list_no, size_t n_entry,
|
|
218
|
+
const idx_t* ids, const uint8_t *code) override;
|
|
219
|
+
|
|
220
|
+
void update_entries (size_t list_no, size_t offset, size_t n_entry,
|
|
221
|
+
const idx_t *ids, const uint8_t *code) override;
|
|
222
|
+
|
|
223
|
+
void resize (size_t list_no, size_t new_size) override;
|
|
224
|
+
|
|
225
|
+
};
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
/// Horizontal stack of inverted lists
|
|
229
|
+
struct HStackInvertedLists: ReadOnlyInvertedLists {
|
|
230
|
+
|
|
231
|
+
std::vector<const InvertedLists *>ils;
|
|
232
|
+
|
|
233
|
+
/// build InvertedLists by concatenating nil of them
|
|
234
|
+
HStackInvertedLists (int nil, const InvertedLists **ils);
|
|
235
|
+
|
|
236
|
+
size_t list_size(size_t list_no) const override;
|
|
237
|
+
const uint8_t * get_codes (size_t list_no) const override;
|
|
238
|
+
const idx_t * get_ids (size_t list_no) const override;
|
|
239
|
+
|
|
240
|
+
void prefetch_lists (const idx_t *list_nos, int nlist) const override;
|
|
241
|
+
|
|
242
|
+
void release_codes (size_t list_no, const uint8_t *codes) const override;
|
|
243
|
+
void release_ids (size_t list_no, const idx_t *ids) const override;
|
|
244
|
+
|
|
245
|
+
idx_t get_single_id (size_t list_no, size_t offset) const override;
|
|
246
|
+
|
|
247
|
+
const uint8_t * get_single_code (
|
|
248
|
+
size_t list_no, size_t offset) const override;
|
|
249
|
+
|
|
250
|
+
};
|
|
251
|
+
|
|
252
|
+
using ConcatenatedInvertedLists = HStackInvertedLists;
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
/// vertical slice of indexes in another InvertedLists
|
|
256
|
+
struct SliceInvertedLists: ReadOnlyInvertedLists {
|
|
257
|
+
const InvertedLists *il;
|
|
258
|
+
idx_t i0, i1;
|
|
259
|
+
|
|
260
|
+
SliceInvertedLists(const InvertedLists *il, idx_t i0, idx_t i1);
|
|
261
|
+
|
|
262
|
+
size_t list_size(size_t list_no) const override;
|
|
263
|
+
const uint8_t * get_codes (size_t list_no) const override;
|
|
264
|
+
const idx_t * get_ids (size_t list_no) const override;
|
|
265
|
+
|
|
266
|
+
void release_codes (size_t list_no, const uint8_t *codes) const override;
|
|
267
|
+
void release_ids (size_t list_no, const idx_t *ids) const override;
|
|
268
|
+
|
|
269
|
+
idx_t get_single_id (size_t list_no, size_t offset) const override;
|
|
270
|
+
|
|
271
|
+
const uint8_t * get_single_code (
|
|
272
|
+
size_t list_no, size_t offset) const override;
|
|
273
|
+
|
|
274
|
+
void prefetch_lists (const idx_t *list_nos, int nlist) const override;
|
|
275
|
+
};
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
struct VStackInvertedLists: ReadOnlyInvertedLists {
|
|
279
|
+
std::vector<const InvertedLists *>ils;
|
|
280
|
+
std::vector<idx_t> cumsz;
|
|
281
|
+
|
|
282
|
+
/// build InvertedLists by concatenating nil of them
|
|
283
|
+
VStackInvertedLists (int nil, const InvertedLists **ils);
|
|
284
|
+
|
|
285
|
+
size_t list_size(size_t list_no) const override;
|
|
286
|
+
const uint8_t * get_codes (size_t list_no) const override;
|
|
287
|
+
const idx_t * get_ids (size_t list_no) const override;
|
|
288
|
+
|
|
289
|
+
void release_codes (size_t list_no, const uint8_t *codes) const override;
|
|
290
|
+
void release_ids (size_t list_no, const idx_t *ids) const override;
|
|
291
|
+
|
|
292
|
+
idx_t get_single_id (size_t list_no, size_t offset) const override;
|
|
293
|
+
|
|
294
|
+
const uint8_t * get_single_code (
|
|
295
|
+
size_t list_no, size_t offset) const override;
|
|
296
|
+
|
|
297
|
+
void prefetch_lists (const idx_t *list_nos, int nlist) const override;
|
|
298
|
+
|
|
299
|
+
};
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
/** use the first inverted lists if they are non-empty otherwise use the second
|
|
303
|
+
*
|
|
304
|
+
* This is useful if il1 has a few inverted lists that are too long,
|
|
305
|
+
* and that il0 has replacement lists for those, with empty lists for
|
|
306
|
+
* the others. */
|
|
307
|
+
struct MaskedInvertedLists: ReadOnlyInvertedLists {
|
|
308
|
+
|
|
309
|
+
const InvertedLists *il0;
|
|
310
|
+
const InvertedLists *il1;
|
|
311
|
+
|
|
312
|
+
MaskedInvertedLists (const InvertedLists *il0,
|
|
313
|
+
const InvertedLists *il1);
|
|
314
|
+
|
|
315
|
+
size_t list_size(size_t list_no) const override;
|
|
316
|
+
const uint8_t * get_codes (size_t list_no) const override;
|
|
317
|
+
const idx_t * get_ids (size_t list_no) const override;
|
|
318
|
+
|
|
319
|
+
void release_codes (size_t list_no, const uint8_t *codes) const override;
|
|
320
|
+
void release_ids (size_t list_no, const idx_t *ids) const override;
|
|
321
|
+
|
|
322
|
+
idx_t get_single_id (size_t list_no, size_t offset) const override;
|
|
323
|
+
|
|
324
|
+
const uint8_t * get_single_code (
|
|
325
|
+
size_t list_no, size_t offset) const override;
|
|
326
|
+
|
|
327
|
+
void prefetch_lists (const idx_t *list_nos, int nlist) const override;
|
|
328
|
+
|
|
329
|
+
};
|
|
330
|
+
|
|
331
|
+
} // namespace faiss
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
#endif
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Facebook, Inc. and its affiliates.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// -*- c++ -*-
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
#include <faiss/MatrixStats.h>
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
#include <stdarg.h> /* va_list, va_start, va_arg, va_end */
|
|
15
|
+
|
|
16
|
+
#include <cmath>
|
|
17
|
+
#include <cstdio>
|
|
18
|
+
#include <faiss/utils/utils.h>
|
|
19
|
+
|
|
20
|
+
namespace faiss {
|
|
21
|
+
|
|
22
|
+
/*********************************************************************
|
|
23
|
+
* MatrixStats
|
|
24
|
+
*********************************************************************/
|
|
25
|
+
|
|
26
|
+
/// Start with empty statistics: all counters and sums at zero, min/max at
/// +/-HUGE_VALF so that the first value passed to add() replaces them, and
/// mean/stddev at NaN until compute_mean_std() is called.
MatrixStats::PerDimStats::PerDimStats()
    : n(0),
      n_nan(0),
      n_inf(0),
      n0(0),
      min(HUGE_VALF),
      max(-HUGE_VALF),
      sum(0),
      sum2(0),
      mean(NAN),
      stddev(NAN)
{
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
void MatrixStats::PerDimStats::add (float x)
|
|
35
|
+
{
|
|
36
|
+
n++;
|
|
37
|
+
if (std::isnan(x)) {
|
|
38
|
+
n_nan++;
|
|
39
|
+
return;
|
|
40
|
+
}
|
|
41
|
+
if (!std::isfinite(x)) {
|
|
42
|
+
n_inf++;
|
|
43
|
+
return;
|
|
44
|
+
}
|
|
45
|
+
if (x == 0) n0++;
|
|
46
|
+
if (x < min) min = x;
|
|
47
|
+
if (x > max) max = x;
|
|
48
|
+
sum += x;
|
|
49
|
+
sum2 += (double)x * (double)x;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
void MatrixStats::PerDimStats::compute_mean_std ()
|
|
53
|
+
{
|
|
54
|
+
n_valid = n - n_nan - n_inf;
|
|
55
|
+
mean = sum / n_valid;
|
|
56
|
+
double var = sum2 / n_valid - mean * mean;
|
|
57
|
+
if (var < 0) var = 0;
|
|
58
|
+
stddev = sqrt(var);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
void MatrixStats::do_comment (const char *fmt, ...)
|
|
63
|
+
{
|
|
64
|
+
va_list ap;
|
|
65
|
+
|
|
66
|
+
/* Determine required size */
|
|
67
|
+
va_start(ap, fmt);
|
|
68
|
+
size_t size = vsnprintf(buf, nbuf, fmt, ap);
|
|
69
|
+
va_end(ap);
|
|
70
|
+
|
|
71
|
+
nbuf -= size;
|
|
72
|
+
buf += size;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
/// Analyze a matrix of n vectors of dimension d and collect statistics.
///
/// Gathers per-dimension statistics, squared-norm range, counts of invalid
/// (NaN / Inf) and null vectors, and duplicate detection through a hash of
/// each vector's bytes. Human-readable remarks are accumulated with
/// do_comment() in a temporary buffer and stored in `comments` at the end.
///
/// @param n  number of vectors
/// @param d  dimension of the vectors
/// @param x  vector data; read as n consecutive rows of d floats
MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
    n(n), d(d),
    n_collision(0), n_valid(0), n0(0),
    min_norm2(HUGE_VAL), max_norm2(0)
{
    // do_comment() writes through buf / nbuf, which point into this
    // temporary buffer for the duration of the constructor
    std::vector<char> comment_buf (10000);
    buf = comment_buf.data ();
    nbuf = comment_buf.size();

    do_comment ("analyzing %zu vectors of size %zu\n", n, d);

    if (d > 1024) {
        do_comment (
            "indexing this many dimensions is hard, "
            "please consider dimensionality reduction (with PCAMatrix)\n");
    }

    size_t nbytes = sizeof (x[0]) * d;
    per_dim_stats.resize (d);

    for (size_t i = 0; i < n; i++) {
        const float *xi = x + d * i;
        double sum2 = 0;
        for (size_t j = 0; j < d; j++) {
            per_dim_stats[j].add (xi[j]);
            sum2 += xi[j] * (double)xi[j];
        }

        // a NaN or Inf component makes the squared norm non-finite
        if (std::isfinite (sum2)) {
            n_valid++;
            if (sum2 == 0) {
                n0 ++;
            } else {
                if (sum2 < min_norm2) min_norm2 = sum2;
                if (sum2 > max_norm2) max_norm2 = sum2;
            }
        }

        { // check hash
            uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
            auto elt = occurrences.find (hash);
            if (elt == occurrences.end()) {
                Occurrence occ = {i, 1};
                occurrences[hash] = occ;
            } else {
                // same hash: it is a true duplicate only if the bytes match
                if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
                    elt->second.count ++;
                } else {
                    n_collision ++;
                    // we should use a list of collisions but overkill
                }
            }
        }
    }

    // invalid vector stats
    if (n_valid == n) {
        do_comment ("no NaN or Infs in data\n");
    } else {
        do_comment ("%zu vectors contain NaN or Inf "
                    "(or have too large components), "
                    "expect bad results with indexing!\n",
                    static_cast<size_t>(n - n_valid));
    }

    // copies in dataset
    if (occurrences.size() == n) {
        do_comment ("all vectors are distinct\n");
    } else {
        do_comment ("%zu vectors are distinct (%.2f%%)\n",
                    static_cast<size_t>(occurrences.size()),
                    occurrences.size() * 100.0 / n);

        if (n_collision > 0) {
            do_comment ("%zu collisions in hash table, "
                        "counts may be invalid\n",
                        static_cast<size_t>(n_collision));
        }

        // find the vector with the largest number of copies
        Occurrence max = {0, 0};
        for (const auto &entry : occurrences) {
            if (entry.second.count > max.count) {
                max = entry.second;
            }
        }
        do_comment ("vector %zu has %zu copies\n",
                    static_cast<size_t>(max.first),
                    static_cast<size_t>(max.count));
    }

    { // norm stats
        // min_norm2 / max_norm2 are only updated by finite non-null
        // vectors; if there was none they still hold HUGE_VAL / 0 and say
        // nothing about the data
        const bool have_norm_range = min_norm2 <= max_norm2;

        // from here on the two members hold norms, not squared norms
        min_norm2 = sqrt (min_norm2);
        max_norm2 = sqrt (max_norm2);
        do_comment ("range of L2 norms=[%g, %g] (%zu null vectors)\n",
                    min_norm2, max_norm2, static_cast<size_t>(n0));

        // without the guard, 0 < inf would report empty or all-null data
        // as normalized
        if (have_norm_range && max_norm2 < min_norm2 * 1.0001) {
            do_comment ("vectors are normalized, inner product and "
                        "L2 search are equivalent\n");
        }

        if (max_norm2 > min_norm2 * 100) {
            do_comment ("vectors have very large differences in norms, "
                        "is this normal?\n");
        }
    }

    { // per dimension stats

        double max_std = 0, min_std = HUGE_VAL;

        // n_zero_entries counts zero components over the whole matrix
        // (distinct from the member n0, which counts null vectors)
        size_t n_dangerous_range = 0, n_0_range = 0, n_zero_entries = 0;

        for (size_t j = 0; j < d; j++) {
            PerDimStats &st = per_dim_stats[j];
            st.compute_mean_std ();
            n_zero_entries += st.n0;

            if (st.max == st.min) {
                n_0_range ++;
            } else if (st.max < 1.001 * st.min) {
                n_dangerous_range ++;
            }

            if (st.stddev > max_std) max_std = st.stddev;
            if (st.stddev < min_std) min_std = st.stddev;
        }

        if (n_zero_entries == 0) {
            do_comment ("matrix contains no 0s\n");
        } else {
            do_comment ("matrix contains %.2f %% 0 entries\n",
                        n_zero_entries * 100.0 / (n * d));
        }

        if (n_0_range == 0) {
            do_comment ("no constant dimensions\n");
        } else {
            do_comment ("%zu dimensions are constant: they can be removed\n",
                        n_0_range);
        }

        if (n_dangerous_range == 0) {
            do_comment ("no dimension has a too large mean\n");
        } else {
            do_comment ("%zu dimensions are too large "
                        "wrt. their variance, may loose precision "
                        "in IndexFlatL2 (use CenteringTransform)\n",
                        n_dangerous_range);
        }

        do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);

        size_t n_small_var = 0;

        for (size_t j = 0; j < d; j++) {
            const PerDimStats &st = per_dim_stats[j];
            if (st.stddev < max_std * 1e-4) {
                n_small_var++;
            }
        }

        if (n_small_var > 0) {
            do_comment ("%zu dimensions have negligible stddev wrt. "
                        "the largest dimension, they could be ignored",
                        n_small_var);
        }

    }

    // copy the NUL-terminated text out of the temporary buffer, then
    // detach buf / nbuf from it since it is destroyed on return
    comments = comment_buf.data ();
    buf = nullptr;
    nbuf = 0;
}
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
} // namespace faiss
|