faiss 0.1.3 → 0.1.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +36 -33
- data/vendor/faiss/faiss/AutoTune.h +6 -3
- data/vendor/faiss/faiss/Clustering.cpp +16 -12
- data/vendor/faiss/faiss/Index.cpp +3 -4
- data/vendor/faiss/faiss/Index.h +3 -3
- data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
- data/vendor/faiss/faiss/IndexBinary.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
- data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
- data/vendor/faiss/faiss/IndexFlat.h +0 -51
- data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
- data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
- data/vendor/faiss/faiss/IndexIVF.h +22 -15
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
- data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
- data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
- data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
- data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
- data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
- data/vendor/faiss/faiss/IndexRefine.h +73 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
- data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
- data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
- data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
- data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
- data/vendor/faiss/faiss/impl/io.cpp +33 -2
- data/vendor/faiss/faiss/impl/io.h +7 -2
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
- data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
- data/vendor/faiss/faiss/index_factory.cpp +112 -7
- data/vendor/faiss/faiss/index_io.h +1 -48
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
- data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
- data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
- data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
- data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
- data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
- data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
- data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
- data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
- data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
- data/vendor/faiss/faiss/utils/Heap.h +61 -50
- data/vendor/faiss/faiss/utils/distances.cpp +164 -319
- data/vendor/faiss/faiss/utils/distances.h +28 -20
- data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
- data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
- data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
- data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
- data/vendor/faiss/faiss/utils/hamming.h +2 -7
- data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
- data/vendor/faiss/faiss/utils/partitioning.h +69 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
- data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
- data/vendor/faiss/faiss/utils/simdlib.h +31 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
- metadata +43 -141
- data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
- data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
- data/vendor/faiss/c_api/AutoTune_c.h +0 -66
- data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
- data/vendor/faiss/c_api/Clustering_c.h +0 -123
- data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
- data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
- data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
- data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
- data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
- data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
- data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
- data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
- data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
- data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
- data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
- data/vendor/faiss/c_api/IndexShards_c.h +0 -39
- data/vendor/faiss/c_api/Index_c.cpp +0 -105
- data/vendor/faiss/c_api/Index_c.h +0 -183
- data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
- data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
- data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
- data/vendor/faiss/c_api/clone_index_c.h +0 -32
- data/vendor/faiss/c_api/error_c.h +0 -42
- data/vendor/faiss/c_api/error_impl.cpp +0 -27
- data/vendor/faiss/c_api/error_impl.h +0 -16
- data/vendor/faiss/c_api/faiss_c.h +0 -58
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
- data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
- data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
- data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
- data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
- data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
- data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
- data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
- data/vendor/faiss/c_api/index_factory_c.h +0 -30
- data/vendor/faiss/c_api/index_io_c.cpp +0 -42
- data/vendor/faiss/c_api/index_io_c.h +0 -50
- data/vendor/faiss/c_api/macros_impl.h +0 -110
- data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
- data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
- data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
- data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
- data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
- data/vendor/faiss/misc/test_blas.cpp +0 -87
- data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
- data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
- data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
- data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
- data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
- data/vendor/faiss/tests/test_merge.cpp +0 -260
- data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
- data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
- data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
- data/vendor/faiss/tests/test_params_override.cpp +0 -236
- data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
- data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
- data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
- data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
- data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
- data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
- data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
- data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
- data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
```diff
--- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp
+++ data/vendor/faiss/faiss/IndexScalarQuantizer.cpp
@@ -192,7 +192,7 @@ void IndexIVFScalarQuantizer::encode_vectors(idx_t n, const float* x,
     size_t coarse_size = include_listnos ? coarse_code_size () : 0;
     memset(codes, 0, (code_size + coarse_size) * n);
 
-    #pragma omp parallel if(n >
+    #pragma omp parallel if(n > 1000)
     {
         std::vector<float> residual (d);
 
@@ -222,7 +222,7 @@ void IndexIVFScalarQuantizer::sa_decode (idx_t n, const uint8_t *codes,
     std::unique_ptr<ScalarQuantizer::Quantizer> squant (sq.select_quantizer ());
     size_t coarse_size = coarse_code_size ();
 
-    #pragma omp parallel if(n >
+    #pragma omp parallel if(n > 1000)
     {
         std::vector<float> residual (d);
```
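Both hunks put a batch-size gate on the OpenMP parallel region, so small encode/decode calls no longer pay thread start-up costs. A minimal standalone sketch of the pattern (illustrative only, not faiss code):

```cpp
#include <omp.h>
#include <cstdio>

// The if() clause makes OpenMP execute the region serially when the
// condition is false, mirroring the if(n > 1000) gate in the diff above.
void process_batch(int n) {
    #pragma omp parallel if(n > 1000)
    {
        #pragma omp for
        for (int i = 0; i < n; i++) {
            // per-item encode/decode work would go here
        }
    }
    printf("n=%d done\n", n);
}

int main() {
    process_batch(10);      // below the threshold: runs on one thread
    process_batch(100000);  // above the threshold: uses the thread team
    return 0;
}
```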
```diff
--- data/vendor/faiss/faiss/IndexScalarQuantizer.h
+++ data/vendor/faiss/faiss/IndexScalarQuantizer.h
@@ -82,7 +82,7 @@ struct IndexScalarQuantizer: Index {
 
 
 /** An IVF implementation where the components of the residuals are
- * encoded with a scalar
+ * encoded with a scalar quantizer. All distance computations
  * are asymmetric, so the encoded vectors are decoded and approximate
  * distances are computed.
  */
```
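The completed comment describes asymmetric distance computation: the query stays in full float precision and only the stored vectors go through the quantizer. A rough sketch of the idea, assuming a uniform 8-bit quantizer for illustration (faiss's actual ScalarQuantizer supports several encodings):

```cpp
#include <cstdint>

// Assumed-for-illustration uniform 8-bit scalar quantizer on [vmin, vmax].
struct Uniform8 {
    float vmin;
    float vmax;
    float decode(uint8_t code) const {
        return vmin + (vmax - vmin) * (code + 0.5f) / 256.0f;
    }
};

// Asymmetric squared-L2: x is the raw float query, codes is the stored
// vector; only the database side carries quantization error.
float asymmetric_l2_sqr(const float* x, const uint8_t* codes,
                        const Uniform8& q, int d) {
    float sum = 0.0f;
    for (int i = 0; i < d; i++) {
        float diff = x[i] - q.decode(codes[i]);
        sum += diff * diff;
    }
    return sum;
}
```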
```diff
--- data/vendor/faiss/faiss/gpu/GpuIndex.h
+++ data/vendor/faiss/faiss/gpu/GpuIndex.h
@@ -36,9 +36,12 @@ class GpuIndex : public faiss::Index {
            float metricArg,
            GpuIndexConfig config);
 
-
-
-
+  /// Returns the device that this index is resident on
+  int getDevice() const;
+
+  /// Returns a reference to our GpuResources object that manages memory, stream
+  /// and handle resources on the GPU
+  std::shared_ptr<GpuResources> getResources();
 
   /// Set the minimum data size for searches (in MiB) for which we use
   /// CPU -> GPU paging
@@ -50,7 +53,7 @@ class GpuIndex : public faiss::Index {
   /// `x` can be resident on the CPU or any GPU; copies are performed
   /// as needed
   /// Handles paged adds if the add set is too large; calls addInternal_
-  void add(
+  void add(Index::idx_t, const float* x) override;
 
   /// `x` and `ids` can be resident on the CPU or any GPU; copies are
   /// performed as needed
@@ -59,6 +62,13 @@ class GpuIndex : public faiss::Index {
               const float* x,
               const Index::idx_t* ids) override;
 
+  /// `x` and `labels` can be resident on the CPU or any GPU; copies are
+  /// performed as needed
+  void assign(Index::idx_t n,
+              const float* x,
+              Index::idx_t* labels,
+              Index::idx_t k = 1) const override;
+
   /// `x`, `distances` and `labels` can be resident on the CPU or any
   /// GPU; copies are performed as needed
   void search(Index::idx_t n,
@@ -136,11 +146,8 @@ private:
   /// Manages streams, cuBLAS handles and scratch memory for devices
   std::shared_ptr<GpuResources> resources_;
 
-  ///
-  const
-
-  /// The memory space of our primary storage on the GPU
-  const MemorySpace memorySpace_;
+  /// Our configuration options
+  const GpuIndexConfig config_;
 
   /// Size above which we page copies from the CPU to GPU
   size_t minPagedSize_;
```
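With these hunks, every GpuIndex now reports its device and resources, and assign() is handled on the GPU side. A speculative usage sketch against a GpuIndexFlatL2, assuming a CUDA-enabled faiss build (constructor arguments as in faiss's GPU API):

```cpp
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/gpu/GpuIndexFlat.h>
#include <vector>

int main() {
    faiss::gpu::StandardGpuResources res;

    int d = 64;
    faiss::gpu::GpuIndexFlatL2 index(&res, d);

    std::vector<float> xb(1000 * d, 0.5f);
    index.add(1000, xb.data());

    // New in this release: query the device and resources of any GpuIndex.
    int device = index.getDevice();
    std::shared_ptr<faiss::gpu::GpuResources> resources = index.getResources();

    // assign() is now overridden GPU-side; per the diff, x and labels can be
    // resident on the CPU or any GPU, and copies are performed as needed.
    std::vector<faiss::Index::idx_t> labels(5);
    index.assign(5, xb.data(), labels.data(), 1);

    (void)device; (void)resources;
    return 0;
}
```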
```diff
--- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h
+++ data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h
@@ -38,6 +38,13 @@ class GpuIndexBinaryFlat : public IndexBinary {
 
   ~GpuIndexBinaryFlat() override;
 
+  /// Returns the device that this index is resident on
+  int getDevice() const;
+
+  /// Returns a reference to our GpuResources object that manages memory, stream
+  /// and handle resources on the GPU
+  std::shared_ptr<GpuResources> getResources();
+
   /// Initialize ourselves from the given CPU index; will overwrite
   /// all data in ourselves
   void copyFrom(const faiss::IndexBinaryFlat* index);
@@ -80,7 +87,7 @@ class GpuIndexBinaryFlat : public IndexBinary {
   std::shared_ptr<GpuResources> resources_;
 
   /// Configuration options
-  GpuIndexBinaryFlatConfig
+  const GpuIndexBinaryFlatConfig binaryFlatConfig_;
 
   /// Holds our GPU data containing the list of vectors
   std::unique_ptr<BinaryFlatIndex> data_;
```
```diff
--- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h
+++ data/vendor/faiss/faiss/gpu/GpuIndexFlat.h
@@ -21,7 +21,7 @@ struct IndexFlatIP;
 
 namespace faiss { namespace gpu {
 
-
+class FlatIndex;
 
 struct GpuIndexFlatConfig : public GpuIndexConfig {
   inline GpuIndexFlatConfig()
@@ -87,27 +87,27 @@ class GpuIndexFlat : public GpuIndex {
   void train(Index::idx_t n, const float* x) override;
 
   /// Overrides to avoid excessive copies
-  void add(
+  void add(Index::idx_t, const float* x) override;
 
   /// Reconstruction methods; prefer the batch reconstruct as it will
   /// be more efficient
-  void reconstruct(
+  void reconstruct(Index::idx_t key, float* out) const override;
 
   /// Batch reconstruction method
-  void reconstruct_n(
-
+  void reconstruct_n(Index::idx_t i0,
+                     Index::idx_t num,
                      float* out) const override;
 
   /// Compute residual
   void compute_residual(const float* x,
                         float* residual,
-
+                        Index::idx_t key) const override;
 
   /// Compute residual (batch mode)
-  void compute_residual_n(
+  void compute_residual_n(Index::idx_t n,
                           const float* xs,
                           float* residuals,
-  const
+                          const Index::idx_t* keys) const override;
 
   /// For internal access
   inline FlatIndex* getGpuData() { return data_.get(); }
@@ -126,11 +126,11 @@ class GpuIndexFlat : public GpuIndex {
              const float* x,
              int k,
              float* distances,
-
+             Index::idx_t* labels) const override;
 
  protected:
-  /// Our
-  const GpuIndexFlatConfig
+  /// Our configuration options
+  const GpuIndexFlatConfig flatConfig_;
 
   /// Holds our GPU data containing the list of vectors
   std::unique_ptr<FlatIndex> data_;
```
```diff
--- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h
+++ data/vendor/faiss/faiss/gpu/GpuIndexIVF.h
@@ -56,6 +56,22 @@ class GpuIndexIVF : public GpuIndex {
   /// Returns the number of inverted lists we're managing
   int getNumLists() const;
 
+  /// Returns the number of vectors present in a particular inverted list
+  virtual int getListLength(int listId) const = 0;
+
+  /// Return the encoded vector data contained in a particular inverted list,
+  /// for debugging purposes.
+  /// If gpuFormat is true, the data is returned as it is encoded in the
+  /// GPU-side representation.
+  /// Otherwise, it is converted to the CPU format.
+  /// compliant format, while the native GPU format may differ.
+  virtual std::vector<uint8_t>
+  getListVectorData(int listId, bool gpuFormat = false) const = 0;
+
+  /// Return the vector indices contained in a particular inverted list, for
+  /// debugging purposes.
+  virtual std::vector<Index::idx_t> getListIndices(int listId) const = 0;
+
   /// Return the quantizer we're using
   GpuIndexFlat* getQuantizer();
 
@@ -67,7 +83,7 @@ class GpuIndexIVF : public GpuIndex {
 
  protected:
   bool addImplRequiresIDs_() const override;
-  void trainQuantizer_(
+  void trainQuantizer_(Index::idx_t n, const float* x);
 
  public:
   /// Exposing this like the CPU version for manipulation
@@ -83,7 +99,8 @@ class GpuIndexIVF : public GpuIndex {
   GpuIndexFlat* quantizer;
 
  protected:
-
+  /// Our configuration options
+  const GpuIndexIVFConfig ivfConfig_;
 };
 
 } } // namespace
```
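These three pure-virtual methods give every GPU IVF index a CPU-visible view of its inverted lists. A speculative debugging sketch using GpuIndexIVFFlat, whose overrides appear in the next hunks (constructor arguments and the random fill are assumptions for illustration):

```cpp
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/gpu/GpuIndexIVFFlat.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

int main() {
    faiss::gpu::StandardGpuResources res;
    int d = 32, nlist = 16;
    faiss::gpu::GpuIndexIVFFlat index(&res, d, nlist, faiss::METRIC_L2);

    std::vector<float> xb(5000 * d);
    for (size_t i = 0; i < xb.size(); i++) {
        xb[i] = (float)rand() / RAND_MAX;
    }
    index.train(5000, xb.data());
    index.add(5000, xb.data());

    // Walk every inverted list using the new debugging accessors.
    for (int list = 0; list < index.getNumLists(); ++list) {
        int len = index.getListLength(list);

        // Vector ids stored in this list.
        std::vector<faiss::Index::idx_t> ids = index.getListIndices(list);

        // Encoded vectors, converted to the CPU layout by default;
        // pass gpuFormat = true for the raw GPU-side encoding.
        std::vector<uint8_t> codes = index.getListVectorData(list);

        printf("list %d: %d vectors, %zu code bytes\n",
               list, len, codes.size());
        (void)ids;
    }
    return 0;
}
```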
```diff
--- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h
+++ data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h
@@ -19,6 +19,13 @@ class IVFFlat;
 class GpuIndexFlat;
 
 struct GpuIndexIVFFlatConfig : public GpuIndexIVFConfig {
+  inline GpuIndexIVFFlatConfig()
+      : interleavedLayout(true) {
+  }
+
+  /// Use the alternative memory layout for the IVF lists
+  /// (currently the default)
+  bool interleavedLayout;
 };
 
 /// Wrapper around the GPU implementation that looks like
@@ -56,10 +63,28 @@ class GpuIndexIVFFlat : public GpuIndexIVF {
   /// to exactly the amount needed. Returns space reclaimed in bytes
   size_t reclaimMemory();
 
+  /// Clears out all inverted lists, but retains the coarse centroid information
   void reset() override;
 
+  /// Trains the coarse quantizer based on the given vector data
   void train(Index::idx_t n, const float* x) override;
 
+  /// Returns the number of vectors present in a particular inverted list
+  int getListLength(int listId) const override;
+
+  /// Return the encoded vector data contained in a particular inverted list,
+  /// for debugging purposes.
+  /// If gpuFormat is true, the data is returned as it is encoded in the
+  /// GPU-side representation.
+  /// Otherwise, it is converted to the CPU format.
+  /// compliant format, while the native GPU format may differ.
+  std::vector<uint8_t>
+  getListVectorData(int listId, bool gpuFormat = false) const override;
+
+  /// Return the vector indices contained in a particular inverted list, for
+  /// debugging purposes.
+  std::vector<Index::idx_t> getListIndices(int listId) const override;
+
  protected:
   /// Called from GpuIndex for add/add_with_ids
   void addImpl_(int n,
@@ -73,8 +98,9 @@ class GpuIndexIVFFlat : public GpuIndexIVF {
                 float* distances,
                 Index::idx_t* labels) const override;
 
-
-
+ protected:
+  /// Our configuration options
+  const GpuIndexIVFFlatConfig ivfFlatConfig_;
 
   /// Desired inverted list memory reservation
   size_t reserveMemoryVecs_;
```
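interleavedLayout is new in GpuIndexIVFFlatConfig and now defaults to true (for IVFPQ, in the next group of hunks, it exists but defaults to false). A short sketch of opting back out of it, assuming the constructor signature from faiss's GPU API:

```cpp
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/gpu/GpuIndexIVFFlat.h>

int main() {
    faiss::gpu::StandardGpuResources res;

    faiss::gpu::GpuIndexIVFFlatConfig config;
    // interleavedLayout defaults to true in this release; revert to the
    // previous per-list layout if needed (e.g., to compare performance).
    config.interleavedLayout = false;

    faiss::gpu::GpuIndexIVFFlat index(&res, 64, 1024,
                                      faiss::METRIC_L2, config);
    return 0;
}
```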
```diff
--- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h
+++ data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h
@@ -23,7 +23,7 @@ struct GpuIndexIVFPQConfig : public GpuIndexIVFConfig {
   inline GpuIndexIVFPQConfig()
       : useFloat16LookupTables(false),
         usePrecomputedTables(false),
-
+        interleavedLayout(false),
         useMMCodeDistance(false) {
   }
 
@@ -38,7 +38,7 @@ struct GpuIndexIVFPQConfig : public GpuIndexIVFConfig {
 
   /// Use the alternative memory layout for the IVF lists
   /// WARNING: this is a feature under development, do not use!
-  bool
+  bool interleavedLayout;
 
   /// Use GEMM-backed computation of PQ code distances for the no precomputed
   /// table version of IVFPQ.
@@ -108,19 +108,24 @@ class GpuIndexIVFPQ : public GpuIndexIVF {
   /// product centroid information
   void reset() override;
 
+  /// Trains the coarse and product quantizer based on the given vector data
   void train(Index::idx_t n, const float* x) override;
 
-  ///
-
-  int getListLength(int listId) const;
+  /// Returns the number of vectors present in a particular inverted list
+  int getListLength(int listId) const override;
 
-  ///
-  ///
-
+  /// Return the encoded vector data contained in a particular inverted list,
+  /// for debugging purposes.
+  /// If gpuFormat is true, the data is returned as it is encoded in the
+  /// GPU-side representation.
+  /// Otherwise, it is converted to the CPU format.
+  /// compliant format, while the native GPU format may differ.
+  std::vector<uint8_t>
+  getListVectorData(int listId, bool gpuFormat = false) const override;
 
-  ///
-  ///
-  std::vector<
+  /// Return the vector indices contained in a particular inverted list, for
+  /// debugging purposes.
+  std::vector<Index::idx_t> getListIndices(int listId) const override;
 
  protected:
   /// Called from GpuIndex for add/add_with_ids
@@ -135,13 +140,18 @@ class GpuIndexIVFPQ : public GpuIndexIVF {
                 float* distances,
                 Index::idx_t* labels) const override;
 
-
+  /// Throws errors if configuration settings are improper
   void verifySettings_() const;
 
+  /// Trains the PQ quantizer based on the given vector data
   void trainResidualQuantizer_(Index::idx_t n, const float* x);
 
-
-
+ protected:
+  /// Our configuration options that we were initialized with
+  const GpuIndexIVFPQConfig ivfpqConfig_;
+
+  /// Runtime override: whether or not we use precomputed tables
+  bool usePrecomputedTables_;
 
   /// Number of sub-quantizers per encoded vector
   int subQuantizers_;
```
```diff
--- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h
+++ data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h
@@ -18,6 +18,13 @@ class IVFFlat;
 class GpuIndexFlat;
 
 struct GpuIndexIVFScalarQuantizerConfig : public GpuIndexIVFConfig {
+  inline GpuIndexIVFScalarQuantizerConfig()
+      : interleavedLayout(true) {
+  }
+
+  /// Use the alternative memory layout for the IVF lists
+  /// (currently the default)
+  bool interleavedLayout;
 };
 
 /// Wrapper around the GPU implementation that looks like
@@ -61,10 +68,29 @@ class GpuIndexIVFScalarQuantizer : public GpuIndexIVF {
   /// to exactly the amount needed. Returns space reclaimed in bytes
   size_t reclaimMemory();
 
+  /// Clears out all inverted lists, but retains the coarse and scalar quantizer
+  /// information
   void reset() override;
 
+  /// Trains the coarse and scalar quantizer based on the given vector data
   void train(Index::idx_t n, const float* x) override;
 
+  /// Returns the number of vectors present in a particular inverted list
+  int getListLength(int listId) const override;
+
+  /// Return the encoded vector data contained in a particular inverted list,
+  /// for debugging purposes.
+  /// If gpuFormat is true, the data is returned as it is encoded in the
+  /// GPU-side representation.
+  /// Otherwise, it is converted to the CPU format.
+  /// compliant format, while the native GPU format may differ.
+  std::vector<uint8_t>
+  getListVectorData(int listId, bool gpuFormat = false) const override;
+
+  /// Return the vector indices contained in a particular inverted list, for
+  /// debugging purposes.
+  std::vector<Index::idx_t> getListIndices(int listId) const override;
+
  protected:
   /// Called from GpuIndex for add/add_with_ids
   void addImpl_(int n,
@@ -88,8 +114,9 @@ class GpuIndexIVFScalarQuantizer : public GpuIndexIVF {
   /// Exposed like the CPU version
   bool by_residual;
 
-
-
+ protected:
+  /// Our configuration options
+  const GpuIndexIVFScalarQuantizerConfig ivfSQConfig_;
 
   /// Desired inverted list memory reservation
   size_t reserveMemoryVecs_;
```
```diff
--- data/vendor/faiss/faiss/gpu/GpuResources.h
+++ data/vendor/faiss/faiss/gpu/GpuResources.h
@@ -198,6 +198,10 @@ class GpuResources {
   /// given device
   virtual cudaStream_t getDefaultStream(int device) = 0;
 
+  /// Overrides the default stream for a device to the user-supplied stream. The
+  /// resources object does not own this stream (i.e., it will not destroy it).
+  virtual void setDefaultStream(int device, cudaStream_t stream) = 0;
+
   /// Returns the set of alternative streams that we use for the given device
   virtual std::vector<cudaStream_t> getAlternateStreams(int device) = 0;
```
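Because the stream is caller-owned, the typical pattern is create / set / destroy on the application side. A minimal sketch against StandardGpuResources, which implements this interface in the hunks below:

```cpp
#include <faiss/gpu/StandardGpuResources.h>
#include <cuda_runtime.h>

int main() {
    faiss::gpu::StandardGpuResources res;

    // A stream owned by the application, not by faiss.
    cudaStream_t myStream;
    cudaStreamCreate(&myStream);

    // All subsequent faiss work on device 0 is ordered on myStream.
    res.setDefaultStream(0, myStream);

    // ... build and search indexes here ...

    // faiss will not destroy the stream; the application must.
    cudaStreamDestroy(myStream);
    return 0;
}
```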
```diff
--- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp
+++ data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp
@@ -101,12 +101,8 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
   for (auto& entry : defaultStreams_) {
     DeviceScope scope(entry.first);
 
-
-
-    // The user did not specify this stream, thus we are the ones
-    // who have created it
-    CUDA_VERIFY(cudaStreamDestroy(entry.second));
-    }
+    // We created these streams, so are responsible for destroying them
+    CUDA_VERIFY(cudaStreamDestroy(entry.second));
   }
 
   for (auto& entry : alternateStreams_) {
@@ -210,16 +206,47 @@ StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
 
 void
 StandardGpuResourcesImpl::setDefaultStream(int device, cudaStream_t stream) {
-
-
-
-
-  it
+  if (isInitialized(device)) {
+    // A new series of calls may not be ordered with what was the previous
+    // stream, so if the stream being specified is different, then we need to
+    // ensure ordering between the two (new stream waits on old).
+    auto it = userDefaultStreams_.find(device);
+    cudaStream_t prevStream = nullptr;
+
+    if (it != userDefaultStreams_.end()) {
+      prevStream = it->second;
+    } else {
+      FAISS_ASSERT(defaultStreams_.count(device));
+      prevStream = defaultStreams_[device];
+    }
+
+    if (prevStream != stream) {
+      streamWait({stream}, {prevStream});
+    }
   }
 
   userDefaultStreams_[device] = stream;
 }
 
+void
+StandardGpuResourcesImpl::revertDefaultStream(int device) {
+  if (isInitialized(device)) {
+    auto it = userDefaultStreams_.find(device);
+
+    if (it != userDefaultStreams_.end()) {
+      // There was a user stream set that we need to synchronize against
+      cudaStream_t prevStream = userDefaultStreams_[device];
+
+      FAISS_ASSERT(defaultStreams_.count(device));
+      cudaStream_t newStream = defaultStreams_[device];
+
+      streamWait({newStream}, {prevStream});
+    }
+  }
+
+  userDefaultStreams_.erase(device);
+}
+
 void
 StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
   for (int dev = 0; dev < getNumDevices(); ++dev) {
@@ -274,14 +301,8 @@ StandardGpuResourcesImpl::initializeForDevice(int device) {
 
   // Create streams
   cudaStream_t defaultStream = 0;
-
-
-    // We already have a stream provided by the user
-    defaultStream = it->second;
-  } else {
-    CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
-                                          cudaStreamNonBlocking));
-  }
+  CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
+                                        cudaStreamNonBlocking));
 
   defaultStreams_[device] = defaultStream;
 
@@ -308,15 +329,14 @@ StandardGpuResourcesImpl::initializeForDevice(int device) {
   FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
   blasHandles_[device] = blasHandle;
 
-  //
-
-  //
-  if
-
-  }
-#endif
+  // For CUDA 10 on V100, enabling tensor core usage would enable automatic
+  // rounding down of inputs to f16 (though accumulate in f32) which results in
+  // unacceptable loss of precision in general.
+  // For CUDA 11 / A100, only enable tensor core support if it doesn't result in
+  // a loss of precision.
 #if CUDA_VERSION >= 11000
-  cublasSetMathMode(blasHandle,
+  cublasSetMathMode(blasHandle,
+                    CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
 #endif
 
   FAISS_ASSERT(allocs_.count(device) == 0);
@@ -341,6 +361,14 @@ StandardGpuResourcesImpl::getBlasHandle(int device) {
 cudaStream_t
 StandardGpuResourcesImpl::getDefaultStream(int device) {
   initializeForDevice(device);
+
+  auto it = userDefaultStreams_.find(device);
+  if (it != userDefaultStreams_.end()) {
+    // There is a user override stream set
+    return it->second;
+  }
+
+  // Otherwise, our base default stream
   return defaultStreams_[device];
 }
 
@@ -539,6 +567,11 @@ StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
   res_->setDefaultStream(device, stream);
 }
 
+void
+StandardGpuResources::revertDefaultStream(int device) {
+  res_->revertDefaultStream(device);
+}
+
 void
 StandardGpuResources::setDefaultNullStreamAllDevices() {
   res_->setDefaultNullStreamAllDevices();
```