faiss 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +103 -3
- data/ext/faiss/ext.cpp +99 -32
- data/ext/faiss/extconf.rb +12 -2
- data/lib/faiss/ext.bundle +0 -0
- data/lib/faiss/index.rb +3 -3
- data/lib/faiss/index_binary.rb +3 -3
- data/lib/faiss/kmeans.rb +1 -1
- data/lib/faiss/pca_matrix.rb +2 -2
- data/lib/faiss/product_quantizer.rb +3 -3
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/AutoTune.cpp +719 -0
- data/vendor/faiss/AutoTune.h +212 -0
- data/vendor/faiss/Clustering.cpp +261 -0
- data/vendor/faiss/Clustering.h +101 -0
- data/vendor/faiss/IVFlib.cpp +339 -0
- data/vendor/faiss/IVFlib.h +132 -0
- data/vendor/faiss/Index.cpp +171 -0
- data/vendor/faiss/Index.h +261 -0
- data/vendor/faiss/Index2Layer.cpp +437 -0
- data/vendor/faiss/Index2Layer.h +85 -0
- data/vendor/faiss/IndexBinary.cpp +77 -0
- data/vendor/faiss/IndexBinary.h +163 -0
- data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
- data/vendor/faiss/IndexBinaryFlat.h +54 -0
- data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
- data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
- data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
- data/vendor/faiss/IndexBinaryHNSW.h +56 -0
- data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
- data/vendor/faiss/IndexBinaryIVF.h +211 -0
- data/vendor/faiss/IndexFlat.cpp +508 -0
- data/vendor/faiss/IndexFlat.h +175 -0
- data/vendor/faiss/IndexHNSW.cpp +1090 -0
- data/vendor/faiss/IndexHNSW.h +170 -0
- data/vendor/faiss/IndexIVF.cpp +909 -0
- data/vendor/faiss/IndexIVF.h +353 -0
- data/vendor/faiss/IndexIVFFlat.cpp +502 -0
- data/vendor/faiss/IndexIVFFlat.h +118 -0
- data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
- data/vendor/faiss/IndexIVFPQ.h +161 -0
- data/vendor/faiss/IndexIVFPQR.cpp +219 -0
- data/vendor/faiss/IndexIVFPQR.h +65 -0
- data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
- data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
- data/vendor/faiss/IndexLSH.cpp +225 -0
- data/vendor/faiss/IndexLSH.h +87 -0
- data/vendor/faiss/IndexLattice.cpp +143 -0
- data/vendor/faiss/IndexLattice.h +68 -0
- data/vendor/faiss/IndexPQ.cpp +1188 -0
- data/vendor/faiss/IndexPQ.h +199 -0
- data/vendor/faiss/IndexPreTransform.cpp +288 -0
- data/vendor/faiss/IndexPreTransform.h +91 -0
- data/vendor/faiss/IndexReplicas.cpp +123 -0
- data/vendor/faiss/IndexReplicas.h +76 -0
- data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
- data/vendor/faiss/IndexScalarQuantizer.h +127 -0
- data/vendor/faiss/IndexShards.cpp +317 -0
- data/vendor/faiss/IndexShards.h +100 -0
- data/vendor/faiss/InvertedLists.cpp +623 -0
- data/vendor/faiss/InvertedLists.h +334 -0
- data/vendor/faiss/LICENSE +21 -0
- data/vendor/faiss/MatrixStats.cpp +252 -0
- data/vendor/faiss/MatrixStats.h +62 -0
- data/vendor/faiss/MetaIndexes.cpp +351 -0
- data/vendor/faiss/MetaIndexes.h +126 -0
- data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
- data/vendor/faiss/OnDiskInvertedLists.h +127 -0
- data/vendor/faiss/VectorTransform.cpp +1157 -0
- data/vendor/faiss/VectorTransform.h +322 -0
- data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
- data/vendor/faiss/c_api/AutoTune_c.h +64 -0
- data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
- data/vendor/faiss/c_api/Clustering_c.h +117 -0
- data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
- data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
- data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
- data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
- data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
- data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
- data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
- data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
- data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
- data/vendor/faiss/c_api/IndexShards_c.h +42 -0
- data/vendor/faiss/c_api/Index_c.cpp +105 -0
- data/vendor/faiss/c_api/Index_c.h +183 -0
- data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
- data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
- data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
- data/vendor/faiss/c_api/clone_index_c.h +32 -0
- data/vendor/faiss/c_api/error_c.h +42 -0
- data/vendor/faiss/c_api/error_impl.cpp +27 -0
- data/vendor/faiss/c_api/error_impl.h +16 -0
- data/vendor/faiss/c_api/faiss_c.h +58 -0
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
- data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
- data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
- data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
- data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
- data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
- data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
- data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
- data/vendor/faiss/c_api/index_factory_c.h +30 -0
- data/vendor/faiss/c_api/index_io_c.cpp +42 -0
- data/vendor/faiss/c_api/index_io_c.h +50 -0
- data/vendor/faiss/c_api/macros_impl.h +110 -0
- data/vendor/faiss/clone_index.cpp +147 -0
- data/vendor/faiss/clone_index.h +38 -0
- data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
- data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
- data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
- data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
- data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
- data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
- data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
- data/vendor/faiss/gpu/GpuCloner.h +82 -0
- data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
- data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
- data/vendor/faiss/gpu/GpuDistance.h +52 -0
- data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
- data/vendor/faiss/gpu/GpuIndex.h +148 -0
- data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
- data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
- data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
- data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
- data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
- data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
- data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
- data/vendor/faiss/gpu/GpuResources.cpp +52 -0
- data/vendor/faiss/gpu/GpuResources.h +73 -0
- data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
- data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
- data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
- data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
- data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
- data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
- data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
- data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
- data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
- data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
- data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
- data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
- data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
- data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
- data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
- data/vendor/faiss/gpu/test/TestUtils.h +93 -0
- data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
- data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
- data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
- data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
- data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
- data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
- data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
- data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
- data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
- data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
- data/vendor/faiss/gpu/utils/Timer.h +52 -0
- data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
- data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
- data/vendor/faiss/impl/FaissAssert.h +95 -0
- data/vendor/faiss/impl/FaissException.cpp +66 -0
- data/vendor/faiss/impl/FaissException.h +71 -0
- data/vendor/faiss/impl/HNSW.cpp +818 -0
- data/vendor/faiss/impl/HNSW.h +275 -0
- data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
- data/vendor/faiss/impl/PolysemousTraining.h +158 -0
- data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
- data/vendor/faiss/impl/ProductQuantizer.h +242 -0
- data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
- data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
- data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
- data/vendor/faiss/impl/ThreadedIndex.h +80 -0
- data/vendor/faiss/impl/index_read.cpp +793 -0
- data/vendor/faiss/impl/index_write.cpp +558 -0
- data/vendor/faiss/impl/io.cpp +142 -0
- data/vendor/faiss/impl/io.h +98 -0
- data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
- data/vendor/faiss/impl/lattice_Zn.h +199 -0
- data/vendor/faiss/index_factory.cpp +392 -0
- data/vendor/faiss/index_factory.h +25 -0
- data/vendor/faiss/index_io.h +75 -0
- data/vendor/faiss/misc/test_blas.cpp +84 -0
- data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
- data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
- data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
- data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
- data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
- data/vendor/faiss/tests/test_merge.cpp +258 -0
- data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
- data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
- data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
- data/vendor/faiss/tests/test_params_override.cpp +231 -0
- data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
- data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
- data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
- data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
- data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
- data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
- data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
- data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
- data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
- data/vendor/faiss/utils/Heap.cpp +122 -0
- data/vendor/faiss/utils/Heap.h +495 -0
- data/vendor/faiss/utils/WorkerThread.cpp +126 -0
- data/vendor/faiss/utils/WorkerThread.h +61 -0
- data/vendor/faiss/utils/distances.cpp +765 -0
- data/vendor/faiss/utils/distances.h +243 -0
- data/vendor/faiss/utils/distances_simd.cpp +809 -0
- data/vendor/faiss/utils/extra_distances.cpp +336 -0
- data/vendor/faiss/utils/extra_distances.h +54 -0
- data/vendor/faiss/utils/hamming-inl.h +472 -0
- data/vendor/faiss/utils/hamming.cpp +792 -0
- data/vendor/faiss/utils/hamming.h +220 -0
- data/vendor/faiss/utils/random.cpp +192 -0
- data/vendor/faiss/utils/random.h +60 -0
- data/vendor/faiss/utils/utils.cpp +783 -0
- data/vendor/faiss/utils/utils.h +181 -0
- metadata +216 -2
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// -*- c++ -*-
|
|
9
|
+
|
|
10
|
+
#ifndef FAISS_INDEX_IVF_H
|
|
11
|
+
#define FAISS_INDEX_IVF_H
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
#include <vector>
|
|
15
|
+
#include <stdint.h>
|
|
16
|
+
|
|
17
|
+
#include <faiss/Index.h>
|
|
18
|
+
#include <faiss/InvertedLists.h>
|
|
19
|
+
#include <faiss/Clustering.h>
|
|
20
|
+
#include <faiss/utils/Heap.h>
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
namespace faiss {
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
/** Encapsulates a quantizer object for the IndexIVF
|
|
27
|
+
*
|
|
28
|
+
* The class isolates the fields that are independent of the storage
|
|
29
|
+
* of the lists (especially training)
|
|
30
|
+
*/
|
|
31
|
+
struct Level1Quantizer {
|
|
32
|
+
Index * quantizer; ///< quantizer that maps vectors to inverted lists
|
|
33
|
+
size_t nlist; ///< number of possible key values
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* = 0: use the quantizer as index in a kmeans training
|
|
38
|
+
* = 1: just pass on the training set to the train() of the quantizer
|
|
39
|
+
* = 2: kmeans training on a flat index + add the centroids to the quantizer
|
|
40
|
+
*/
|
|
41
|
+
char quantizer_trains_alone;
|
|
42
|
+
bool own_fields; ///< whether object owns the quantizer
|
|
43
|
+
|
|
44
|
+
ClusteringParameters cp; ///< to override default clustering params
|
|
45
|
+
Index *clustering_index; ///< to override index used during clustering
|
|
46
|
+
|
|
47
|
+
/// Trains the quantizer and calls train_residual to train sub-quantizers
|
|
48
|
+
void train_q1 (size_t n, const float *x, bool verbose,
|
|
49
|
+
MetricType metric_type);
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
/// compute the number of bytes required to store list ids
|
|
53
|
+
size_t coarse_code_size () const;
|
|
54
|
+
void encode_listno (Index::idx_t list_no, uint8_t *code) const;
|
|
55
|
+
Index::idx_t decode_listno (const uint8_t *code) const;
|
|
56
|
+
|
|
57
|
+
Level1Quantizer (Index * quantizer, size_t nlist);
|
|
58
|
+
|
|
59
|
+
Level1Quantizer ();
|
|
60
|
+
|
|
61
|
+
~Level1Quantizer ();
|
|
62
|
+
|
|
63
|
+
};
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
/// Search-time parameters for an IVF index. A pointer to this struct is
/// accepted by IndexIVF::search_preassigned to override the object's own
/// search parameters.
struct IVFSearchParameters {
|
|
68
|
+
size_t nprobe; ///< number of probes at query time
|
|
69
|
+
size_t max_codes; ///< max nb of codes to visit to do a query
|
|
70
|
+
/// virtual destructor: makes the type polymorphic
virtual ~IVFSearchParameters () {}
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
struct InvertedListScanner;
|
|
76
|
+
|
|
77
|
+
/** Index based on an inverted file (IVF)
|
|
78
|
+
*
|
|
79
|
+
* In the inverted file, the quantizer (an Index instance) provides a
|
|
80
|
+
* quantization index for each vector to be added. The quantization
|
|
81
|
+
* index maps to a list (aka inverted list or posting list), where the
|
|
82
|
+
* id of the vector is stored.
|
|
83
|
+
*
|
|
84
|
+
* The inverted list object is required only after training. If none is
|
|
85
|
+
* set externally, an ArrayInvertedLists is used automatically.
|
|
86
|
+
*
|
|
87
|
+
* At search time, the vector to be searched is also quantized, and
|
|
88
|
+
* only the list corresponding to the quantization index is
|
|
89
|
+
* searched. This speeds up the search by making it
|
|
90
|
+
* non-exhaustive. This can be relaxed using multi-probe search: a few
|
|
91
|
+
* (nprobe) quantization indices are selected and several inverted
|
|
92
|
+
* lists are visited.
|
|
93
|
+
*
|
|
94
|
+
* Sub-classes implement a post-filtering of the index that refines
|
|
95
|
+
* the distance estimation from the query to database vectors.
|
|
96
|
+
*/
|
|
97
|
+
struct IndexIVF: Index, Level1Quantizer {
|
|
98
|
+
/// Access to the actual data
|
|
99
|
+
InvertedLists *invlists;
|
|
100
|
+
bool own_invlists;
|
|
101
|
+
|
|
102
|
+
size_t code_size; ///< code size per vector in bytes
|
|
103
|
+
|
|
104
|
+
size_t nprobe; ///< number of probes at query time
|
|
105
|
+
size_t max_codes; ///< max nb of codes to visit to do a query
|
|
106
|
+
|
|
107
|
+
/** Parallel mode determines how queries are parallelized with OpenMP
|
|
108
|
+
*
|
|
109
|
+
* 0 (default): parallelize over queries
|
|
110
|
+
* 1: parallelize over inverted lists
|
|
111
|
+
* 2: parallelize over both
|
|
112
|
+
*/
|
|
113
|
+
int parallel_mode;
|
|
114
|
+
|
|
115
|
+
/// map for direct access to the elements. Enables reconstruct().
|
|
116
|
+
bool maintain_direct_map;
|
|
117
|
+
std::vector <idx_t> direct_map;
|
|
118
|
+
|
|
119
|
+
/** The Inverted file takes a quantizer (an Index) on input,
|
|
120
|
+
* which implements the function mapping a vector to a list
|
|
121
|
+
* identifier. The pointer is borrowed: the quantizer should not
|
|
122
|
+
* be deleted while the IndexIVF is in use.
|
|
123
|
+
*/
|
|
124
|
+
IndexIVF (Index * quantizer, size_t d,
|
|
125
|
+
size_t nlist, size_t code_size,
|
|
126
|
+
MetricType metric = METRIC_L2);
|
|
127
|
+
|
|
128
|
+
void reset() override;
|
|
129
|
+
|
|
130
|
+
/// Trains the quantizer and calls train_residual to train sub-quantizers
|
|
131
|
+
void train(idx_t n, const float* x) override;
|
|
132
|
+
|
|
133
|
+
/// Calls add_with_ids with NULL ids
|
|
134
|
+
void add(idx_t n, const float* x) override;
|
|
135
|
+
|
|
136
|
+
/// default implementation that calls encode_vectors
|
|
137
|
+
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
|
|
138
|
+
|
|
139
|
+
/** Encodes a set of vectors as they would appear in the inverted lists
|
|
140
|
+
*
|
|
141
|
+
* @param list_nos inverted list ids as returned by the
|
|
142
|
+
* quantizer (size n). -1s are ignored.
|
|
143
|
+
* @param codes output codes, size n * code_size
|
|
144
|
+
* @param include_listno
|
|
145
|
+
* include the list ids in the code (in this case add
|
|
146
|
+
* ceil(log2(nlist) / 8) bytes, i.e. coarse_code_size(), to the code size)
|
|
147
|
+
*/
|
|
148
|
+
virtual void encode_vectors(idx_t n, const float* x,
|
|
149
|
+
const idx_t *list_nos,
|
|
150
|
+
uint8_t * codes,
|
|
151
|
+
bool include_listno = false) const = 0;
|
|
152
|
+
|
|
153
|
+
/// Sub-classes that encode the residuals can train their encoders here
|
|
154
|
+
/// does nothing by default
|
|
155
|
+
virtual void train_residual (idx_t n, const float *x);
|
|
156
|
+
|
|
157
|
+
/** search a set of vectors, that are pre-quantized by the IVF
|
|
158
|
+
* quantizer. Fill in the corresponding heaps with the query
|
|
159
|
+
* results. The default implementation uses InvertedListScanners
|
|
160
|
+
* to do the search.
|
|
161
|
+
*
|
|
162
|
+
* @param n nb of vectors to query
|
|
163
|
+
* @param x query vectors, size n * d
|
|
164
|
+
* @param assign coarse quantization indices, size n * nprobe
|
|
165
|
+
* @param centroid_dis
|
|
166
|
+
* distances to coarse centroids, size n * nprobe
|
|
167
|
+
* @param distances
|
|
168
|
+
* output distances, size n * k
|
|
169
|
+
* @param labels output labels, size n * k
|
|
170
|
+
* @param store_pairs store inv list index + inv list offset
|
|
171
|
+
* in the upper/lower 32 bits of the result,
|
|
172
|
+
* instead of ids (used for reranking).
|
|
173
|
+
* @param params used to override the object's search parameters
|
|
174
|
+
*/
|
|
175
|
+
virtual void search_preassigned (idx_t n, const float *x, idx_t k,
|
|
176
|
+
const idx_t *assign,
|
|
177
|
+
const float *centroid_dis,
|
|
178
|
+
float *distances, idx_t *labels,
|
|
179
|
+
bool store_pairs,
|
|
180
|
+
const IVFSearchParameters *params=nullptr
|
|
181
|
+
) const;
|
|
182
|
+
|
|
183
|
+
/** assign the vectors, then call search_preassigned */
|
|
184
|
+
void search (idx_t n, const float *x, idx_t k,
|
|
185
|
+
float *distances, idx_t *labels) const override;
|
|
186
|
+
|
|
187
|
+
void range_search (idx_t n, const float* x, float radius,
|
|
188
|
+
RangeSearchResult* result) const override;
|
|
189
|
+
|
|
190
|
+
void range_search_preassigned(idx_t nx, const float *x, float radius,
|
|
191
|
+
const idx_t *keys, const float *coarse_dis,
|
|
192
|
+
RangeSearchResult *result) const;
|
|
193
|
+
|
|
194
|
+
/// get a scanner for this index (store_pairs means ignore labels)
|
|
195
|
+
virtual InvertedListScanner *get_InvertedListScanner (
|
|
196
|
+
bool store_pairs=false) const;
|
|
197
|
+
|
|
198
|
+
void reconstruct (idx_t key, float* recons) const override;
|
|
199
|
+
|
|
200
|
+
/** Reconstruct a subset of the indexed vectors.
|
|
201
|
+
*
|
|
202
|
+
* Overrides default implementation to bypass reconstruct() which requires
|
|
203
|
+
* direct_map to be maintained.
|
|
204
|
+
*
|
|
205
|
+
* @param i0 first vector to reconstruct
|
|
206
|
+
* @param ni nb of vectors to reconstruct
|
|
207
|
+
* @param recons output array of reconstructed vectors, size ni * d
|
|
208
|
+
*/
|
|
209
|
+
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
|
|
210
|
+
|
|
211
|
+
/** Similar to search, but also reconstructs the stored vectors (or an
|
|
212
|
+
* approximation in the case of lossy coding) for the search results.
|
|
213
|
+
*
|
|
214
|
+
* Overrides default implementation to avoid having to maintain direct_map
|
|
215
|
+
* and instead fetch the code offsets through the `store_pairs` flag in
|
|
216
|
+
* search_preassigned().
|
|
217
|
+
*
|
|
218
|
+
* @param recons reconstructed vectors size (n, k, d)
|
|
219
|
+
*/
|
|
220
|
+
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
|
|
221
|
+
float *distances, idx_t *labels,
|
|
222
|
+
float *recons) const override;
|
|
223
|
+
|
|
224
|
+
/** Reconstruct a vector given the location in terms of (inv list index +
|
|
225
|
+
* inv list offset) instead of the id.
|
|
226
|
+
*
|
|
227
|
+
* Useful for reconstructing when the direct_map is not maintained and
|
|
228
|
+
* the inv list offset is computed by search_preassigned() with
|
|
229
|
+
* `store_pairs` set.
|
|
230
|
+
*/
|
|
231
|
+
virtual void reconstruct_from_offset (int64_t list_no, int64_t offset,
|
|
232
|
+
float* recons) const;
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
/// Dataset manipulation functions
|
|
236
|
+
|
|
237
|
+
size_t remove_ids(const IDSelector& sel) override;
|
|
238
|
+
|
|
239
|
+
/** check that the two indexes are compatible (ie, they are
|
|
240
|
+
* trained in the same way and have the same
|
|
241
|
+
* parameters). Otherwise throw. */
|
|
242
|
+
void check_compatible_for_merge (const IndexIVF &other) const;
|
|
243
|
+
|
|
244
|
+
/** moves the entries from another dataset to self. On output,
|
|
245
|
+
* other is empty. add_id is added to all moved ids (for
|
|
246
|
+
* sequential ids, this would be this->ntotal) */
|
|
247
|
+
virtual void merge_from (IndexIVF &other, idx_t add_id);
|
|
248
|
+
|
|
249
|
+
/** copy a subset of the entries index to the other index
|
|
250
|
+
*
|
|
251
|
+
* if subset_type == 0: copies ids in [a1, a2)
|
|
252
|
+
* if subset_type == 1: copies ids if id % a1 == a2
|
|
253
|
+
* if subset_type == 2: copies inverted lists such that a1
|
|
254
|
+
* elements are left before and a2 elements are after
|
|
255
|
+
*/
|
|
256
|
+
virtual void copy_subset_to (IndexIVF & other, int subset_type,
|
|
257
|
+
idx_t a1, idx_t a2) const;
|
|
258
|
+
|
|
259
|
+
~IndexIVF() override;
|
|
260
|
+
|
|
261
|
+
size_t get_list_size (size_t list_no) const
|
|
262
|
+
{ return invlists->list_size(list_no); }
|
|
263
|
+
|
|
264
|
+
/** initialize a direct map
|
|
265
|
+
*
|
|
266
|
+
* @param new_maintain_direct_map if true, create a direct map,
|
|
267
|
+
* else clear it
|
|
268
|
+
*/
|
|
269
|
+
void make_direct_map (bool new_maintain_direct_map=true);
|
|
270
|
+
|
|
271
|
+
/// replace the inverted lists, old one is deallocated if own_invlists
|
|
272
|
+
void replace_invlists (InvertedLists *il, bool own=false);
|
|
273
|
+
|
|
274
|
+
/* The standalone codec interface (except sa_decode that is specific) */
|
|
275
|
+
size_t sa_code_size () const override;
|
|
276
|
+
|
|
277
|
+
void sa_encode (idx_t n, const float *x,
|
|
278
|
+
uint8_t *bytes) const override;
|
|
279
|
+
|
|
280
|
+
IndexIVF ();
|
|
281
|
+
};
|
|
282
|
+
|
|
283
|
+
struct RangeQueryResult;
|
|
284
|
+
|
|
285
|
+
/** Object that handles a query. The inverted lists to scan are
|
|
286
|
+
* provided externally. The object has a lot of state, but
|
|
287
|
+
* distance_to_code and scan_codes can be called in multiple
|
|
288
|
+
* threads */
|
|
289
|
+
struct InvertedListScanner {
|
|
290
|
+
|
|
291
|
+
using idx_t = Index::idx_t;
|
|
292
|
+
|
|
293
|
+
/// from now on we handle this query.
|
|
294
|
+
virtual void set_query (const float *query_vector) = 0;
|
|
295
|
+
|
|
296
|
+
/// following codes come from this inverted list
|
|
297
|
+
virtual void set_list (idx_t list_no, float coarse_dis) = 0;
|
|
298
|
+
|
|
299
|
+
/// compute a single query-to-code distance
|
|
300
|
+
virtual float distance_to_code (const uint8_t *code) const = 0;
|
|
301
|
+
|
|
302
|
+
/** scan a set of codes, compute distances to current query and
|
|
303
|
+
* update heap of results if necessary.
|
|
304
|
+
*
|
|
305
|
+
* @param n number of codes to scan
|
|
306
|
+
* @param codes codes to scan (n * code_size)
|
|
307
|
+
* @param ids corresponding ids (ignored if store_pairs)
|
|
308
|
+
* @param distances heap distances (size k)
|
|
309
|
+
* @param labels heap labels (size k)
|
|
310
|
+
* @param k heap size
|
|
311
|
+
* @return number of heap updates performed
|
|
312
|
+
*/
|
|
313
|
+
virtual size_t scan_codes (size_t n,
|
|
314
|
+
const uint8_t *codes,
|
|
315
|
+
const idx_t *ids,
|
|
316
|
+
float *distances, idx_t *labels,
|
|
317
|
+
size_t k) const = 0;
|
|
318
|
+
|
|
319
|
+
/** scan a set of codes, compute distances to current query and
|
|
320
|
+
* update results if distances are below radius
|
|
321
|
+
*
|
|
322
|
+
* (default implementation fails) */
|
|
323
|
+
virtual void scan_codes_range (size_t n,
|
|
324
|
+
const uint8_t *codes,
|
|
325
|
+
const idx_t *ids,
|
|
326
|
+
float radius,
|
|
327
|
+
RangeQueryResult &result) const;
|
|
328
|
+
|
|
329
|
+
virtual ~InvertedListScanner () {}
|
|
330
|
+
|
|
331
|
+
};
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
/// Counters and timings describing IVF searches. The constructor calls
/// reset(), which is defined outside this header.
struct IndexIVFStats {
|
|
335
|
+
size_t nq; // nb of queries run
|
|
336
|
+
size_t nlist; // nb of inverted lists scanned
|
|
337
|
+
size_t ndis; // nb of distances computed
|
|
338
|
+
size_t nheap_updates; // nb of times the heap was updated
|
|
339
|
+
double quantization_time; // time spent quantizing vectors (in ms)
|
|
340
|
+
double search_time; // time spent searching lists (in ms)
|
|
341
|
+
|
|
342
|
+
// every new object starts from the state established by reset()
IndexIVFStats () {reset (); }
|
|
343
|
+
void reset ();
|
|
344
|
+
};
|
|
345
|
+
|
|
346
|
+
// global var that collects them all
|
|
347
|
+
extern IndexIVFStats indexIVF_stats;
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
} // namespace faiss
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
#endif
|
|
@@ -0,0 +1,502 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// -*- c++ -*-
|
|
9
|
+
|
|
10
|
+
#include <faiss/IndexIVFFlat.h>
|
|
11
|
+
|
|
12
|
+
#include <cstdio>
|
|
13
|
+
|
|
14
|
+
#include <faiss/IndexFlat.h>
|
|
15
|
+
|
|
16
|
+
#include <faiss/utils/distances.h>
|
|
17
|
+
#include <faiss/utils/utils.h>
|
|
18
|
+
#include <faiss/impl/FaissAssert.h>
|
|
19
|
+
#include <faiss/impl/AuxIndexStructures.h>
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
namespace faiss {
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
/*****************************************
|
|
26
|
+
* IndexIVFFlat implementation
|
|
27
|
+
******************************************/
|
|
28
|
+
|
|
29
|
+
/// Builds an IVF index whose per-vector code is the raw vector itself,
/// i.e. d floats (d * sizeof(float) bytes) per stored entry.
IndexIVFFlat::IndexIVFFlat (Index * quantizer,
                            size_t d, size_t nlist, MetricType metric)
    : IndexIVF (quantizer, d, nlist, sizeof(float) * d, metric)
{
    // the code is the uncompressed vector: one float per dimension
    this->code_size = sizeof(float) * d;
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
/// Adds n vectors with the given ids. No precomputed list assignment is
/// passed on, so add_core asks the quantizer for one.
void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const idx_t *xids)
{
    const int64_t *no_precomputed_lists = nullptr;
    add_core (n, x, xids, no_precomputed_lists);
}
|
|
41
|
+
|
|
42
|
+
/**
 * Adds n vectors to the inverted lists, storing each raw float vector as
 * its code.
 *
 * @param n               number of vectors in x
 * @param x               vectors to add, read as n consecutive groups of d floats
 * @param xids            ids stored with the vectors; when null the ids are
 *                        ntotal, ntotal + 1, ...
 * @param precomputed_idx inverted-list number of each vector; when null,
 *                        quantizer->assign() computes them. Vectors with a
 *                        negative list number are not stored.
 *
 * Throws (via the FAISS_THROW_* macros) when the index is not trained, when
 * no inverted lists are set, or when explicit ids are combined with a
 * maintained direct map. ntotal is advanced by n, including skipped vectors.
 */
void IndexIVFFlat::add_core (idx_t n, const float * x, const int64_t *xids,
                             const int64_t *precomputed_idx)
{
    FAISS_THROW_IF_NOT (is_trained);
    // checked in every build type, not only when assertions are enabled
    FAISS_THROW_IF_NOT_MSG (invlists, "inverted lists are not set");
    FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids),
                            "cannot have direct map and add with ids");

    // Owns the quantizer's assignment when the caller did not supply one;
    // the storage is released on every exit path, exceptions included.
    std::vector<int64_t> assigned;
    const int64_t * idx = precomputed_idx;
    if (!idx) {
        assigned.resize (n);
        quantizer->assign (n, x, assigned.data());
        idx = assigned.data();
    }

    int64_t n_add = 0;
    // signed index: n is signed, so the comparison must not go through size_t
    for (idx_t i = 0; i < n; i++) {
        const int64_t list_no = idx [i];
        if (list_no < 0) {
            // NOTE(review): a skipped vector gets no direct_map entry although
            // ntotal still advances below -- confirm the direct map is never
            // combined with negative list numbers.
            continue;
        }

        const int64_t id = xids ? xids[i] : ntotal + i;
        const float *xi = x + i * d;
        const size_t offset = invlists->add_entry (
                 list_no, id, reinterpret_cast<const uint8_t*> (xi));

        if (maintain_direct_map) {
            // list number in the upper 32 bits, offset within the list below
            direct_map.push_back (list_no << 32 | offset);
        }
        n_add++;
    }

    if (verbose) {
        // %lld with explicit casts: int64_t is not `long` on every platform
        printf("IndexIVFFlat::add_core: added %lld / %lld vectors\n",
               static_cast<long long> (n_add), static_cast<long long> (n));
    }
    ntotal += n;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Writes the codes of n vectors into `codes`.
 *
 * Without list numbers the output is a plain copy of x (code_size bytes per
 * vector). With list numbers each output record is coarse_code_size() bytes
 * produced by encode_listno followed by the code_size bytes of the vector;
 * a record whose list number is negative is zero-filled instead.
 */
void IndexIVFFlat::encode_vectors(idx_t n, const float* x,
                                  const idx_t * list_nos,
                                  uint8_t * codes,
                                  bool include_listnos) const
{
    if (!include_listnos) {
        memcpy (codes, x, code_size * n);
        return;
    }

    const size_t prefix = coarse_code_size ();
    const size_t record = code_size + prefix;
    for (size_t i = 0; i < n; i++) {
        uint8_t *out = codes + i * record;
        const int64_t list_no = list_nos [i];
        if (list_no < 0) {
            // no list for this vector: blank the whole record
            memset (out, 0, record);
            continue;
        }
        encode_listno (list_no, out);
        memcpy (out + prefix, x + i * d, code_size);
    }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Decodes n records from `bytes` into x. Each record is coarse_code_size()
 * bytes (skipped here) followed by code_size bytes that are copied verbatim
 * into the next d floats of x.
 */
void IndexIVFFlat::sa_decode (idx_t n, const uint8_t *bytes,
                              float *x) const
{
    const size_t prefix = coarse_code_size ();
    const size_t record = code_size + prefix;

    const uint8_t *in = bytes;
    float *out = x;
    for (size_t i = 0; i < n; i++, in += record, out += d) {
        memcpy (out, in + prefix, code_size);
    }
}
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
namespace {
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
template<MetricType metric, class C>
|
|
123
|
+
struct IVFFlatScanner: InvertedListScanner {
|
|
124
|
+
size_t d;
|
|
125
|
+
bool store_pairs;
|
|
126
|
+
|
|
127
|
+
IVFFlatScanner(size_t d, bool store_pairs):
|
|
128
|
+
d(d), store_pairs(store_pairs) {}
|
|
129
|
+
|
|
130
|
+
const float *xi;
|
|
131
|
+
void set_query (const float *query) override {
|
|
132
|
+
this->xi = query;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
idx_t list_no;
|
|
136
|
+
void set_list (idx_t list_no, float /* coarse_dis */) override {
|
|
137
|
+
this->list_no = list_no;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
float distance_to_code (const uint8_t *code) const override {
|
|
141
|
+
const float *yj = (float*)code;
|
|
142
|
+
float dis = metric == METRIC_INNER_PRODUCT ?
|
|
143
|
+
fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
|
|
144
|
+
return dis;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
size_t scan_codes (size_t list_size,
|
|
148
|
+
const uint8_t *codes,
|
|
149
|
+
const idx_t *ids,
|
|
150
|
+
float *simi, idx_t *idxi,
|
|
151
|
+
size_t k) const override
|
|
152
|
+
{
|
|
153
|
+
const float *list_vecs = (const float*)codes;
|
|
154
|
+
size_t nup = 0;
|
|
155
|
+
for (size_t j = 0; j < list_size; j++) {
|
|
156
|
+
const float * yj = list_vecs + d * j;
|
|
157
|
+
float dis = metric == METRIC_INNER_PRODUCT ?
|
|
158
|
+
fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
|
|
159
|
+
if (C::cmp (simi[0], dis)) {
|
|
160
|
+
heap_pop<C> (k, simi, idxi);
|
|
161
|
+
int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
|
|
162
|
+
heap_push<C> (k, simi, idxi, dis, id);
|
|
163
|
+
nup++;
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
return nup;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
void scan_codes_range (size_t list_size,
|
|
170
|
+
const uint8_t *codes,
|
|
171
|
+
const idx_t *ids,
|
|
172
|
+
float radius,
|
|
173
|
+
RangeQueryResult & res) const override
|
|
174
|
+
{
|
|
175
|
+
const float *list_vecs = (const float*)codes;
|
|
176
|
+
for (size_t j = 0; j < list_size; j++) {
|
|
177
|
+
const float * yj = list_vecs + d * j;
|
|
178
|
+
float dis = metric == METRIC_INNER_PRODUCT ?
|
|
179
|
+
fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
|
|
180
|
+
if (C::cmp (radius, dis)) {
|
|
181
|
+
int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
|
|
182
|
+
res.add (dis, id);
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
};
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
} // anonymous namespace
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
/** Allocate a scanner matching this index's metric type.
 *
 * The scanner is created with `new` and returned as a raw pointer.
 * Throws through FAISS_THROW_MSG when the metric is neither inner product
 * nor L2.
 */
InvertedListScanner* IndexIVFFlat::get_InvertedListScanner
    (bool store_pairs) const
{
    using InnerProductScanner =
        IVFFlatScanner<METRIC_INNER_PRODUCT, CMin<float, int64_t> >;
    using L2Scanner =
        IVFFlatScanner<METRIC_L2, CMax<float, int64_t> >;

    if (metric_type == METRIC_INNER_PRODUCT) {
        return new InnerProductScanner (d, store_pairs);
    }
    if (metric_type == METRIC_L2) {
        return new L2Scanner (d, store_pairs);
    }
    FAISS_THROW_MSG("metric type not supported");
    return nullptr;
}
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
|
|
213
|
+
{
|
|
214
|
+
|
|
215
|
+
FAISS_THROW_IF_NOT (maintain_direct_map);
|
|
216
|
+
FAISS_THROW_IF_NOT (is_trained);
|
|
217
|
+
std::vector<idx_t> assign (n);
|
|
218
|
+
quantizer->assign (n, x, assign.data());
|
|
219
|
+
|
|
220
|
+
for (size_t i = 0; i < n; i++) {
|
|
221
|
+
idx_t id = new_ids[i];
|
|
222
|
+
FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal,
|
|
223
|
+
"id to update out of range");
|
|
224
|
+
{ // remove old one
|
|
225
|
+
int64_t dm = direct_map[id];
|
|
226
|
+
int64_t ofs = dm & 0xffffffff;
|
|
227
|
+
int64_t il = dm >> 32;
|
|
228
|
+
size_t l = invlists->list_size (il);
|
|
229
|
+
if (ofs != l - 1) { // move l - 1 to ofs
|
|
230
|
+
int64_t id2 = invlists->get_single_id (il, l - 1);
|
|
231
|
+
direct_map[id2] = (il << 32) | ofs;
|
|
232
|
+
invlists->update_entry (il, ofs, id2,
|
|
233
|
+
invlists->get_single_code (il, l - 1));
|
|
234
|
+
}
|
|
235
|
+
invlists->resize (il, l - 1);
|
|
236
|
+
}
|
|
237
|
+
{ // insert new one
|
|
238
|
+
int64_t il = assign[i];
|
|
239
|
+
size_t l = invlists->list_size (il);
|
|
240
|
+
int64_t dm = (il << 32) | l;
|
|
241
|
+
direct_map[id] = dm;
|
|
242
|
+
invlists->add_entry (il, id, (const uint8_t*)(x + i * d));
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
void IndexIVFFlat::reconstruct_from_offset (int64_t list_no, int64_t offset,
|
|
249
|
+
float* recons) const
|
|
250
|
+
{
|
|
251
|
+
memcpy (recons, invlists->get_single_code (list_no, offset), code_size);
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
/*****************************************
|
|
255
|
+
* IndexIVFFlatDedup implementation
|
|
256
|
+
******************************************/
|
|
257
|
+
|
|
258
|
+
/// Construct a deduplicating IVF-flat index; every argument is forwarded
/// unchanged to the IndexIVFFlat constructor.
IndexIVFFlatDedup::IndexIVFFlatDedup (
            Index * quantizer, size_t d, size_t nlist_,
            MetricType metric_type)
    : IndexIVFFlat (quantizer, d, nlist_, metric_type)
{
}
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
/** Train on the input with byte-identical vectors filtered out.
 *
 * Vectors are hashed over code_size bytes; a vector is dropped when the
 * vector last recorded under the same hash is byte-identical. On a hash
 * collision between different vectors the newer one replaces the recorded
 * entry, so duplicates of the older vector may still get through.
 * The surviving vectors are passed to IndexIVFFlat::train.
 *
 * @param n  number of input vectors
 * @param x  input vectors, d floats each
 */
void IndexIVFFlatDedup::train(idx_t n, const float* x)
{
    std::unordered_map<uint64_t, idx_t> map;
    float * x2 = new float [n * d];
    ScopeDeleter<float> del (x2);

    int64_t n2 = 0;
    for (int64_t i = 0; i < n; i++) {
        const float *xi = x + i * d;
        const uint64_t hash =
            hash_bytes(reinterpret_cast<const uint8_t *>(xi), code_size);

        // single lookup instead of count() followed by operator[]
        const auto seen = map.find(hash);
        if (seen != map.end() &&
            !memcmp (x2 + seen->second * d, xi, code_size)) {
            // is duplicate, skip
            continue;
        }
        map [hash] = n2;
        memcpy (x2 + n2 * d, xi, code_size);
        n2 ++;
    }
    if (verbose) {
        // %lld with explicit casts: the counters are 64-bit integers, which
        // are not `long` on every platform.
        printf ("IndexIVFFlatDedup::train: train on %lld points after dedup "
                "(was %lld points)\n",
                static_cast<long long>(n2), static_cast<long long>(n));
    }
    IndexIVFFlat::train (n2, x2);
}
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
/** Add vectors, storing each distinct vector only once per inverted list.
 *
 * Every vector is assigned to a list by the quantizer; vectors with a
 * negative assignment are skipped. If the list already holds a
 * byte-identical vector, the new id is recorded in `instances` as an
 * equivalent of the stored id; otherwise the vector is appended.
 * ntotal is increased by the number of vectors that were not skipped,
 * duplicates included.
 *
 * @param na    number of vectors
 * @param x     vectors to add, d floats each
 * @param xids  ids to use, or null to number them from ntotal
 *
 * Throws (FAISS_THROW_*) if the index is not trained or a direct map is
 * maintained.
 */
void IndexIVFFlatDedup::add_with_ids(
           idx_t na, const float* x, const idx_t* xids)
{

    FAISS_THROW_IF_NOT (is_trained);
    assert (invlists);
    FAISS_THROW_IF_NOT_MSG (
           !maintain_direct_map,
           "IVFFlatDedup not implemented with direct_map");
    int64_t * idx = new int64_t [na];
    ScopeDeleter<int64_t> del (idx);
    quantizer->assign (na, x, idx);

    int64_t n_add = 0, n_dup = 0;
    // TODO make a omp loop with this
    for (idx_t i = 0; i < na; i++) {
        const idx_t id = xids ? xids[i] : ntotal + i;
        const int64_t list_no = idx [i];

        if (list_no < 0) {
            continue;
        }
        const float *xi = x + i * d;

        // search the list for an entry holding the same vector bytes
        InvertedLists::ScopedCodes codes (invlists, list_no);

        const int64_t n = invlists->list_size (list_no);
        int64_t offset = -1;
        for (int64_t o = 0; o < n; o++) {
            if (!memcmp (codes.get() + o * code_size,
                         xi, code_size)) {
                offset = o;
                break;
            }
        }

        if (offset == -1) { // not found
            invlists->add_entry (list_no, id, (const uint8_t*) xi);
        } else {
            // mark equivalence
            const idx_t id2 = invlists->get_single_id (list_no, offset);
            std::pair<idx_t, idx_t> pair (id2, id);
            instances.insert (pair);
            n_dup ++;
        }
        n_add++;
    }
    if (verbose) {
        // %lld with explicit casts: the counters are 64-bit integers, which
        // are not `long` on every platform.
        printf("IndexIVFFlat::add_with_ids: added %lld / %lld vectors"
               " (out of which %lld are duplicates)\n",
               static_cast<long long>(n_add),
               static_cast<long long>(na),
               static_cast<long long>(n_dup));
    }
    ntotal += n_add;
}
|
|
347
|
+
|
|
348
|
+
/** Search with precomputed list assignments, then expand duplicates.
 *
 * The base-class search fills `distances` / `labels` (k results per query).
 * Afterwards, from the first result that has equivalents in `instances`,
 * each result is followed by its equivalent ids at the same distance;
 * results pushed past position k are dropped.
 *
 * Throws (FAISS_THROW_IF_NOT_MSG) when store_pairs is requested.
 */
void IndexIVFFlatDedup::search_preassigned (
           idx_t n, const float *x, idx_t k,
           const idx_t *assign,
           const float *centroid_dis,
           float *distances, idx_t *labels,
           bool store_pairs,
           const IVFSearchParameters *params) const
{
    FAISS_THROW_IF_NOT_MSG (
           !store_pairs, "store_pairs not supported in IVFDedup");

    IndexIVFFlat::search_preassigned (n, x, k, assign, centroid_dis,
                                      distances, labels, false,
                                      params);

    // scratch rows reused for every query
    std::vector <idx_t> expanded_labels (k);
    std::vector <float> expanded_dis (k);

    for (int64_t q = 0; q < n; q++) {
        idx_t *row_labels = labels + q * k;
        float *row_dis = distances + q * k;

        // position of the first result that has recorded equivalents
        int64_t first_dup = 0;
        while (first_dup < k &&
               instances.find (row_labels[first_dup]) == instances.end ()) {
            first_dup++;
        }
        if (first_dup >= k) {
            continue;    // nothing to expand for this query
        }

        // `rd` reads the original results, `wr` writes the expanded row;
        // wr never falls behind rd, so rd stays below k.
        int64_t wr = first_dup;
        for (int64_t rd = first_dup; wr < k; rd++) {
            const float dis = row_dis[rd];
            expanded_labels[wr] = row_labels[rd];
            expanded_dis[wr] = dis;
            wr ++;

            auto range = instances.equal_range (row_labels[rd]);
            for (auto it = range.first; wr < k && it != range.second; ++it) {
                expanded_labels[wr] = it->second;
                expanded_dis[wr] = dis;
                wr++;
            }
        }

        memcpy (row_labels + first_dup, expanded_labels.data() + first_dup,
                sizeof(row_labels[0]) * (k - first_dup));
        memcpy (row_dis + first_dup, expanded_dis.data() + first_dup,
                sizeof(expanded_dis[0]) * (k - first_dup));
    }

}
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
size_t IndexIVFFlatDedup::remove_ids(const IDSelector& sel)
|
|
404
|
+
{
|
|
405
|
+
std::unordered_map<idx_t, idx_t> replace;
|
|
406
|
+
std::vector<std::pair<idx_t, idx_t> > toadd;
|
|
407
|
+
for (auto it = instances.begin(); it != instances.end(); ) {
|
|
408
|
+
if (sel.is_member(it->first)) {
|
|
409
|
+
// then we erase this entry
|
|
410
|
+
if (!sel.is_member(it->second)) {
|
|
411
|
+
// if the second is not erased
|
|
412
|
+
if (replace.count(it->first) == 0) {
|
|
413
|
+
replace[it->first] = it->second;
|
|
414
|
+
} else { // remember we should add an element
|
|
415
|
+
std::pair<idx_t, idx_t> new_entry (
|
|
416
|
+
replace[it->first], it->second);
|
|
417
|
+
toadd.push_back(new_entry);
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
it = instances.erase(it);
|
|
421
|
+
} else {
|
|
422
|
+
if (sel.is_member(it->second)) {
|
|
423
|
+
it = instances.erase(it);
|
|
424
|
+
} else {
|
|
425
|
+
++it;
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
instances.insert (toadd.begin(), toadd.end());
|
|
431
|
+
|
|
432
|
+
// mostly copied from IndexIVF.cpp
|
|
433
|
+
|
|
434
|
+
FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
|
|
435
|
+
"direct map remove not implemented");
|
|
436
|
+
|
|
437
|
+
std::vector<int64_t> toremove(nlist);
|
|
438
|
+
|
|
439
|
+
#pragma omp parallel for
|
|
440
|
+
for (int64_t i = 0; i < nlist; i++) {
|
|
441
|
+
int64_t l0 = invlists->list_size (i), l = l0, j = 0;
|
|
442
|
+
InvertedLists::ScopedIds idsi (invlists, i);
|
|
443
|
+
while (j < l) {
|
|
444
|
+
if (sel.is_member (idsi[j])) {
|
|
445
|
+
if (replace.count(idsi[j]) == 0) {
|
|
446
|
+
l--;
|
|
447
|
+
invlists->update_entry (
|
|
448
|
+
i, j,
|
|
449
|
+
invlists->get_single_id (i, l),
|
|
450
|
+
InvertedLists::ScopedCodes (invlists, i, l).get());
|
|
451
|
+
} else {
|
|
452
|
+
invlists->update_entry (
|
|
453
|
+
i, j,
|
|
454
|
+
replace[idsi[j]],
|
|
455
|
+
InvertedLists::ScopedCodes (invlists, i, j).get());
|
|
456
|
+
j++;
|
|
457
|
+
}
|
|
458
|
+
} else {
|
|
459
|
+
j++;
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
toremove[i] = l0 - l;
|
|
463
|
+
}
|
|
464
|
+
// this will not run well in parallel on ondisk because of possible shrinks
|
|
465
|
+
int64_t nremove = 0;
|
|
466
|
+
for (int64_t i = 0; i < nlist; i++) {
|
|
467
|
+
if (toremove[i] > 0) {
|
|
468
|
+
nremove += toremove[i];
|
|
469
|
+
invlists->resize(
|
|
470
|
+
i, invlists->list_size(i) - toremove[i]);
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
ntotal -= nremove;
|
|
474
|
+
return nremove;
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
/// Range search is not available on the deduplicating index: every call
/// throws via FAISS_THROW_MSG and all arguments are ignored.
void IndexIVFFlatDedup::range_search(
    idx_t,
    const float*,
    float,
    RangeSearchResult*) const
{
    FAISS_THROW_MSG ("not implemented");
}
|
|
486
|
+
|
|
487
|
+
void IndexIVFFlatDedup::update_vectors (int , idx_t *, const float *)
|
|
488
|
+
{
|
|
489
|
+
FAISS_THROW_MSG ("not implemented");
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
void IndexIVFFlatDedup::reconstruct_from_offset (
|
|
494
|
+
int64_t , int64_t , float* ) const
|
|
495
|
+
{
|
|
496
|
+
FAISS_THROW_MSG ("not implemented");
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
} // namespace faiss
|