faiss 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +9 -2
- data/ext/faiss/index.cpp +1 -1
- data/ext/faiss/index_binary.cpp +2 -2
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +7 -7
- data/vendor/faiss/faiss/AutoTune.h +1 -2
- data/vendor/faiss/faiss/Clustering.cpp +39 -22
- data/vendor/faiss/faiss/Clustering.h +40 -21
- data/vendor/faiss/faiss/IVFlib.cpp +26 -12
- data/vendor/faiss/faiss/Index.cpp +1 -1
- data/vendor/faiss/faiss/Index.h +40 -10
- data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
- data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
- data/vendor/faiss/faiss/IndexBinary.h +8 -19
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +107 -188
- data/vendor/faiss/faiss/IndexFastScan.cpp +95 -146
- data/vendor/faiss/faiss/IndexFastScan.h +9 -8
- data/vendor/faiss/faiss/IndexFlat.cpp +206 -10
- data/vendor/faiss/faiss/IndexFlat.h +20 -1
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +170 -5
- data/vendor/faiss/faiss/IndexFlatCodes.h +23 -4
- data/vendor/faiss/faiss/IndexHNSW.cpp +231 -382
- data/vendor/faiss/faiss/IndexHNSW.h +62 -49
- data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
- data/vendor/faiss/faiss/IndexIDMap.h +24 -2
- data/vendor/faiss/faiss/IndexIVF.cpp +162 -56
- data/vendor/faiss/faiss/IndexIVF.h +46 -6
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +33 -26
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +6 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +502 -401
- data/vendor/faiss/faiss/IndexIVFFastScan.h +63 -26
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
- data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +79 -125
- data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +39 -52
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
- data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
- data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
- data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
- data/vendor/faiss/faiss/IndexLattice.h +3 -22
- data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -33
- data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
- data/vendor/faiss/faiss/IndexNSG.h +11 -11
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
- data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
- data/vendor/faiss/faiss/IndexPQ.h +1 -4
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRefine.cpp +54 -24
- data/vendor/faiss/faiss/IndexRefine.h +7 -0
- data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +25 -17
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
- data/vendor/faiss/faiss/IndexShards.cpp +21 -29
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
- data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
- data/vendor/faiss/faiss/MatrixStats.h +21 -9
- data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
- data/vendor/faiss/faiss/MetricType.h +7 -2
- data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
- data/vendor/faiss/faiss/VectorTransform.h +7 -7
- data/vendor/faiss/faiss/clone_index.cpp +15 -10
- data/vendor/faiss/faiss/clone_index.h +3 -0
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
- data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
- data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +123 -8
- data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +13 -0
- data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +30 -12
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +14 -9
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +20 -3
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
- data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +142 -17
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +7 -1
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +332 -40
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
- data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +26 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +10 -3
- data/vendor/faiss/faiss/impl/DistanceComputer.h +70 -1
- data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
- data/vendor/faiss/faiss/impl/FaissException.h +13 -34
- data/vendor/faiss/faiss/impl/HNSW.cpp +605 -186
- data/vendor/faiss/faiss/impl/HNSW.h +52 -30
- data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +11 -9
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +42 -27
- data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
- data/vendor/faiss/faiss/impl/NSG.h +1 -1
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -22
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +6 -2
- data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
- data/vendor/faiss/faiss/impl/ResultHandler.h +347 -172
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +1104 -147
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -8
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +285 -42
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
- data/vendor/faiss/faiss/impl/index_read.cpp +74 -34
- data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
- data/vendor/faiss/faiss/impl/index_write.cpp +88 -51
- data/vendor/faiss/faiss/impl/io.cpp +23 -15
- data/vendor/faiss/faiss/impl/io.h +4 -4
- data/vendor/faiss/faiss/impl/io_macros.h +6 -0
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
- data/vendor/faiss/faiss/impl/platform_macros.h +40 -1
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +14 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +487 -49
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +481 -225
- data/vendor/faiss/faiss/index_factory.cpp +41 -20
- data/vendor/faiss/faiss/index_io.h +12 -5
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +10 -2
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +73 -17
- data/vendor/faiss/faiss/invlists/InvertedLists.h +26 -8
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +24 -9
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +4 -4
- data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
- data/vendor/faiss/faiss/utils/Heap.h +105 -0
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
- data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
- data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
- data/vendor/faiss/faiss/utils/bf16.h +36 -0
- data/vendor/faiss/faiss/utils/distances.cpp +147 -123
- data/vendor/faiss/faiss/utils/distances.h +86 -9
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
- data/vendor/faiss/faiss/utils/distances_simd.cpp +1589 -243
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
- data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
- data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
- data/vendor/faiss/faiss/utils/fp16.h +2 -0
- data/vendor/faiss/faiss/utils/hamming.cpp +163 -111
- data/vendor/faiss/faiss/utils/hamming.h +58 -0
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +19 -88
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +58 -0
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
- data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
- data/vendor/faiss/faiss/utils/prefetch.h +77 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
- data/vendor/faiss/faiss/utils/random.cpp +43 -0
- data/vendor/faiss/faiss/utils/random.h +25 -0
- data/vendor/faiss/faiss/utils/simdlib.h +10 -1
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
- data/vendor/faiss/faiss/utils/simdlib_neon.h +77 -79
- data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
- data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
- data/vendor/faiss/faiss/utils/sorting.h +27 -0
- data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
- data/vendor/faiss/faiss/utils/utils.cpp +120 -7
- data/vendor/faiss/faiss/utils/utils.h +60 -20
- metadata +23 -4
- data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
|
@@ -4,6 +4,29 @@
|
|
|
4
4
|
* This source code is licensed under the MIT license found in the
|
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
|
6
6
|
*/
|
|
7
|
+
/*
|
|
8
|
+
* Copyright (c) 2023, NVIDIA CORPORATION.
|
|
9
|
+
*
|
|
10
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
11
|
+
* you may not use this file except in compliance with the License.
|
|
12
|
+
* You may obtain a copy of the License at
|
|
13
|
+
*
|
|
14
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
15
|
+
*
|
|
16
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
17
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
18
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
19
|
+
* See the License for the specific language governing permissions and
|
|
20
|
+
* limitations under the License.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
#if defined USE_NVIDIA_RAFT
|
|
24
|
+
#include <raft/core/device_resources.hpp>
|
|
25
|
+
#include <rmm/mr/device/managed_memory_resource.hpp>
|
|
26
|
+
#include <rmm/mr/device/per_device_resource.hpp>
|
|
27
|
+
#include <rmm/mr/host/pinned_memory_resource.hpp>
|
|
28
|
+
#include <memory>
|
|
29
|
+
#endif
|
|
7
30
|
|
|
8
31
|
#include <faiss/gpu/StandardGpuResources.h>
|
|
9
32
|
#include <faiss/gpu/utils/DeviceUtils.h>
|
|
@@ -66,7 +89,12 @@ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
|
|
|
66
89
|
//
|
|
67
90
|
|
|
68
91
|
StandardGpuResourcesImpl::StandardGpuResourcesImpl()
|
|
69
|
-
: pinnedMemAlloc_(nullptr),
|
|
92
|
+
:
|
|
93
|
+
#if defined USE_NVIDIA_RAFT
|
|
94
|
+
mmr_(new rmm::mr::managed_memory_resource),
|
|
95
|
+
pmr_(new rmm::mr::pinned_memory_resource),
|
|
96
|
+
#endif
|
|
97
|
+
pinnedMemAlloc_(nullptr),
|
|
70
98
|
pinnedMemAllocSize_(0),
|
|
71
99
|
// let the adjustment function determine the memory size for us by
|
|
72
100
|
// passing in a huge value that will then be adjusted
|
|
@@ -74,7 +102,8 @@ StandardGpuResourcesImpl::StandardGpuResourcesImpl()
|
|
|
74
102
|
-1,
|
|
75
103
|
std::numeric_limits<size_t>::max())),
|
|
76
104
|
pinnedMemSize_(kDefaultPinnedMemoryAllocation),
|
|
77
|
-
allocLogging_(false) {}
|
|
105
|
+
allocLogging_(false) {
|
|
106
|
+
}
|
|
78
107
|
|
|
79
108
|
StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
|
|
80
109
|
// The temporary memory allocator has allocated memory through us, so clean
|
|
@@ -129,6 +158,9 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
|
|
|
129
158
|
}
|
|
130
159
|
|
|
131
160
|
if (pinnedMemAlloc_) {
|
|
161
|
+
#if defined USE_NVIDIA_RAFT
|
|
162
|
+
pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
|
|
163
|
+
#else
|
|
132
164
|
auto err = cudaFreeHost(pinnedMemAlloc_);
|
|
133
165
|
FAISS_ASSERT_FMT(
|
|
134
166
|
err == cudaSuccess,
|
|
@@ -136,6 +168,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
|
|
|
136
168
|
pinnedMemAlloc_,
|
|
137
169
|
(int)err,
|
|
138
170
|
cudaGetErrorString(err));
|
|
171
|
+
#endif
|
|
139
172
|
}
|
|
140
173
|
}
|
|
141
174
|
|
|
@@ -187,11 +220,11 @@ void StandardGpuResourcesImpl::setTempMemory(size_t size) {
|
|
|
187
220
|
p.second.reset();
|
|
188
221
|
|
|
189
222
|
// Allocate new
|
|
190
|
-
p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
|
|
223
|
+
p.second = std::make_unique<StackDeviceMemory>(
|
|
191
224
|
this,
|
|
192
225
|
p.first,
|
|
193
226
|
// adjust for this specific device
|
|
194
|
-
getDefaultTempMemForGPU(device, tempMemSize_)));
|
|
227
|
+
getDefaultTempMemForGPU(device, tempMemSize_));
|
|
195
228
|
}
|
|
196
229
|
}
|
|
197
230
|
}
|
|
@@ -224,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream(
|
|
|
224
257
|
if (prevStream != stream) {
|
|
225
258
|
streamWait({stream}, {prevStream});
|
|
226
259
|
}
|
|
260
|
+
#if defined USE_NVIDIA_RAFT
|
|
261
|
+
// delete the raft handle for this device, which will be initialized
|
|
262
|
+
// with the updated stream during any subsequent calls to getRaftHandle
|
|
263
|
+
auto it2 = raftHandles_.find(device);
|
|
264
|
+
if (it2 != raftHandles_.end()) {
|
|
265
|
+
raftHandles_.erase(it2);
|
|
266
|
+
}
|
|
267
|
+
#endif
|
|
227
268
|
}
|
|
228
269
|
|
|
229
270
|
userDefaultStreams_[device] = stream;
|
|
@@ -242,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
|
|
|
242
283
|
|
|
243
284
|
streamWait({newStream}, {prevStream});
|
|
244
285
|
}
|
|
286
|
+
#if defined USE_NVIDIA_RAFT
|
|
287
|
+
// delete the raft handle for this device, which will be initialized
|
|
288
|
+
// with the updated stream during any subsequent calls to getRaftHandle
|
|
289
|
+
auto it2 = raftHandles_.find(device);
|
|
290
|
+
if (it2 != raftHandles_.end()) {
|
|
291
|
+
raftHandles_.erase(it2);
|
|
292
|
+
}
|
|
293
|
+
#endif
|
|
245
294
|
}
|
|
246
295
|
|
|
247
296
|
userDefaultStreams_.erase(device);
|
|
@@ -274,6 +323,19 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
|
|
|
274
323
|
// If this is the first device that we're initializing, create our
|
|
275
324
|
// pinned memory allocation
|
|
276
325
|
if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
|
|
326
|
+
#if defined USE_NVIDIA_RAFT
|
|
327
|
+
// If this is the first device that we're initializing, create our
|
|
328
|
+
// pinned memory allocation
|
|
329
|
+
if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
|
|
330
|
+
try {
|
|
331
|
+
pinnedMemAlloc_ = pmr_->allocate(pinnedMemSize_);
|
|
332
|
+
} catch (const std::bad_alloc& rmm_ex) {
|
|
333
|
+
FAISS_THROW_MSG("CUDA memory allocation error");
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
pinnedMemAllocSize_ = pinnedMemSize_;
|
|
337
|
+
}
|
|
338
|
+
#else
|
|
277
339
|
auto err = cudaHostAlloc(
|
|
278
340
|
&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
|
|
279
341
|
|
|
@@ -286,6 +348,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
|
|
|
286
348
|
cudaGetErrorString(err));
|
|
287
349
|
|
|
288
350
|
pinnedMemAllocSize_ = pinnedMemSize_;
|
|
351
|
+
#endif
|
|
289
352
|
}
|
|
290
353
|
|
|
291
354
|
// Make sure that device properties for all devices are cached
|
|
@@ -300,19 +363,32 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
|
|
|
300
363
|
prop.major,
|
|
301
364
|
prop.minor);
|
|
302
365
|
|
|
366
|
+
#if USE_AMD_ROCM
|
|
367
|
+
// Our code is pre-built with and expects warpSize == 32 or 64, validate
|
|
368
|
+
// that
|
|
369
|
+
FAISS_ASSERT_FMT(
|
|
370
|
+
prop.warpSize == 32 || prop.warpSize == 64,
|
|
371
|
+
"Device id %d does not have expected warpSize of 32 or 64",
|
|
372
|
+
device);
|
|
373
|
+
#else
|
|
303
374
|
// Our code is pre-built with and expects warpSize == 32, validate that
|
|
304
375
|
FAISS_ASSERT_FMT(
|
|
305
376
|
prop.warpSize == 32,
|
|
306
377
|
"Device id %d does not have expected warpSize of 32",
|
|
307
378
|
device);
|
|
379
|
+
#endif
|
|
308
380
|
|
|
309
381
|
// Create streams
|
|
310
|
-
cudaStream_t defaultStream = 0;
|
|
382
|
+
cudaStream_t defaultStream = nullptr;
|
|
311
383
|
CUDA_VERIFY(
|
|
312
384
|
cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
|
|
313
385
|
|
|
314
386
|
defaultStreams_[device] = defaultStream;
|
|
315
387
|
|
|
388
|
+
#if defined USE_NVIDIA_RAFT
|
|
389
|
+
raftHandles_.emplace(std::make_pair(device, defaultStream));
|
|
390
|
+
#endif
|
|
391
|
+
|
|
316
392
|
cudaStream_t asyncCopyStream = 0;
|
|
317
393
|
CUDA_VERIFY(
|
|
318
394
|
cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
|
|
@@ -321,7 +397,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
|
|
|
321
397
|
|
|
322
398
|
std::vector<cudaStream_t> deviceStreams;
|
|
323
399
|
for (int j = 0; j < kNumStreams; ++j) {
|
|
324
|
-
cudaStream_t stream = 0;
|
|
400
|
+
cudaStream_t stream = nullptr;
|
|
325
401
|
CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
|
326
402
|
|
|
327
403
|
deviceStreams.push_back(stream);
|
|
@@ -330,7 +406,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
|
|
|
330
406
|
alternateStreams_[device] = std::move(deviceStreams);
|
|
331
407
|
|
|
332
408
|
// Create cuBLAS handle
|
|
333
|
-
cublasHandle_t blasHandle = 0;
|
|
409
|
+
cublasHandle_t blasHandle = nullptr;
|
|
334
410
|
auto blasStatus = cublasCreate(&blasHandle);
|
|
335
411
|
FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
|
|
336
412
|
blasHandles_[device] = blasHandle;
|
|
@@ -348,11 +424,11 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
|
|
|
348
424
|
allocs_[device] = std::unordered_map<void*, AllocRequest>();
|
|
349
425
|
|
|
350
426
|
FAISS_ASSERT(tempMemory_.count(device) == 0);
|
|
351
|
-
auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
|
|
427
|
+
auto mem = std::make_unique<StackDeviceMemory>(
|
|
352
428
|
this,
|
|
353
429
|
device,
|
|
354
430
|
// adjust for this specific device
|
|
355
|
-
getDefaultTempMemForGPU(device, tempMemSize_)));
|
|
431
|
+
getDefaultTempMemForGPU(device, tempMemSize_));
|
|
356
432
|
|
|
357
433
|
tempMemory_.emplace(device, std::move(mem));
|
|
358
434
|
}
|
|
@@ -375,6 +451,25 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
|
|
|
375
451
|
return defaultStreams_[device];
|
|
376
452
|
}
|
|
377
453
|
|
|
454
|
+
#if defined USE_NVIDIA_RAFT
|
|
455
|
+
raft::device_resources& StandardGpuResourcesImpl::getRaftHandle(int device) {
|
|
456
|
+
initializeForDevice(device);
|
|
457
|
+
|
|
458
|
+
auto it = raftHandles_.find(device);
|
|
459
|
+
if (it == raftHandles_.end()) {
|
|
460
|
+
// Make sure we are using the stream the user may have already assigned
|
|
461
|
+
// to the current GpuResources
|
|
462
|
+
raftHandles_.emplace(device, getDefaultStream(device));
|
|
463
|
+
|
|
464
|
+
// Initialize cublas handle
|
|
465
|
+
raftHandles_[device].get_cublas_handle();
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
// Otherwise, our base default handle
|
|
469
|
+
return raftHandles_[device];
|
|
470
|
+
}
|
|
471
|
+
#endif
|
|
472
|
+
|
|
378
473
|
std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
|
|
379
474
|
int device) {
|
|
380
475
|
initializeForDevice(device);
|
|
@@ -406,8 +501,6 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
|
|
|
406
501
|
void* p = nullptr;
|
|
407
502
|
|
|
408
503
|
if (adjReq.space == MemorySpace::Temporary) {
|
|
409
|
-
// If we don't have enough space in our temporary memory manager, we
|
|
410
|
-
// need to allocate this request separately
|
|
411
504
|
auto& tempMem = tempMemory_[adjReq.device];
|
|
412
505
|
|
|
413
506
|
if (adjReq.size > tempMem->getSizeAvailable()) {
|
|
@@ -428,15 +521,25 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
|
|
|
428
521
|
|
|
429
522
|
// Otherwise, we can handle this locally
|
|
430
523
|
p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
|
|
431
|
-
|
|
432
524
|
} else if (adjReq.space == MemorySpace::Device) {
|
|
525
|
+
#if defined USE_NVIDIA_RAFT
|
|
526
|
+
try {
|
|
527
|
+
rmm::mr::device_memory_resource* current_mr =
|
|
528
|
+
rmm::mr::get_per_device_resource(
|
|
529
|
+
rmm::cuda_device_id{adjReq.device});
|
|
530
|
+
p = current_mr->allocate_async(adjReq.size, adjReq.stream);
|
|
531
|
+
adjReq.mr = current_mr;
|
|
532
|
+
} catch (const std::bad_alloc& rmm_ex) {
|
|
533
|
+
FAISS_THROW_MSG("CUDA memory allocation error");
|
|
534
|
+
}
|
|
535
|
+
#else
|
|
433
536
|
auto err = cudaMalloc(&p, adjReq.size);
|
|
434
537
|
|
|
435
538
|
// Throw if we fail to allocate
|
|
436
539
|
if (err != cudaSuccess) {
|
|
437
540
|
// FIXME: as of CUDA 11, a memory allocation error appears to be
|
|
438
|
-
// presented via cudaGetLastError as well, and needs to be cleared.
|
|
439
|
-
// Just call the function to clear it
|
|
541
|
+
// presented via cudaGetLastError as well, and needs to be
|
|
542
|
+
// cleared. Just call the function to clear it
|
|
440
543
|
cudaGetLastError();
|
|
441
544
|
|
|
442
545
|
std::stringstream ss;
|
|
@@ -451,7 +554,20 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
|
|
|
451
554
|
|
|
452
555
|
FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
|
|
453
556
|
}
|
|
557
|
+
#endif
|
|
454
558
|
} else if (adjReq.space == MemorySpace::Unified) {
|
|
559
|
+
#if defined USE_NVIDIA_RAFT
|
|
560
|
+
try {
|
|
561
|
+
// for now, use our own managed MR to do Unified Memory allocations.
|
|
562
|
+
// TODO: change this to use the current device resource once RMM has
|
|
563
|
+
// a way to retrieve a "guaranteed" managed memory resource for a
|
|
564
|
+
// device.
|
|
565
|
+
p = mmr_->allocate_async(adjReq.size, adjReq.stream);
|
|
566
|
+
adjReq.mr = mmr_.get();
|
|
567
|
+
} catch (const std::bad_alloc& rmm_ex) {
|
|
568
|
+
FAISS_THROW_MSG("CUDA memory allocation error");
|
|
569
|
+
}
|
|
570
|
+
#else
|
|
455
571
|
auto err = cudaMallocManaged(&p, adjReq.size);
|
|
456
572
|
|
|
457
573
|
if (err != cudaSuccess) {
|
|
@@ -472,6 +588,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
|
|
|
472
588
|
|
|
473
589
|
FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
|
|
474
590
|
}
|
|
591
|
+
#endif
|
|
475
592
|
} else {
|
|
476
593
|
FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
|
|
477
594
|
}
|
|
@@ -505,10 +622,12 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
|
|
|
505
622
|
|
|
506
623
|
if (req.space == MemorySpace::Temporary) {
|
|
507
624
|
tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
|
|
508
|
-
|
|
509
625
|
} else if (
|
|
510
626
|
req.space == MemorySpace::Device ||
|
|
511
627
|
req.space == MemorySpace::Unified) {
|
|
628
|
+
#if defined USE_NVIDIA_RAFT
|
|
629
|
+
req.mr->deallocate_async(p, req.size, req.stream);
|
|
630
|
+
#else
|
|
512
631
|
auto err = cudaFree(p);
|
|
513
632
|
FAISS_ASSERT_FMT(
|
|
514
633
|
err == cudaSuccess,
|
|
@@ -516,7 +635,7 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
|
|
|
516
635
|
p,
|
|
517
636
|
(int)err,
|
|
518
637
|
cudaGetErrorString(err));
|
|
519
|
-
|
|
638
|
+
#endif
|
|
520
639
|
} else {
|
|
521
640
|
FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
|
|
522
641
|
}
|
|
@@ -561,7 +680,7 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
|
|
|
561
680
|
StandardGpuResources::StandardGpuResources()
|
|
562
681
|
: res_(new StandardGpuResourcesImpl) {}
|
|
563
682
|
|
|
564
|
-
StandardGpuResources::~StandardGpuResources() {}
|
|
683
|
+
StandardGpuResources::~StandardGpuResources() = default;
|
|
565
684
|
|
|
566
685
|
std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
|
|
567
686
|
return res_;
|
|
@@ -600,6 +719,12 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) {
|
|
|
600
719
|
return res_->getDefaultStream(device);
|
|
601
720
|
}
|
|
602
721
|
|
|
722
|
+
#if defined USE_NVIDIA_RAFT
|
|
723
|
+
raft::device_resources& StandardGpuResources::getRaftHandle(int device) {
|
|
724
|
+
return res_->getRaftHandle(device);
|
|
725
|
+
}
|
|
726
|
+
#endif
|
|
727
|
+
|
|
603
728
|
size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
|
|
604
729
|
return res_->getTempMemoryAvailable(device);
|
|
605
730
|
}
|
|
@@ -4,9 +4,29 @@
|
|
|
4
4
|
* This source code is licensed under the MIT license found in the
|
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
|
6
6
|
*/
|
|
7
|
+
/*
|
|
8
|
+
* Copyright (c) 2023, NVIDIA CORPORATION.
|
|
9
|
+
*
|
|
10
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
11
|
+
* you may not use this file except in compliance with the License.
|
|
12
|
+
* You may obtain a copy of the License at
|
|
13
|
+
*
|
|
14
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
15
|
+
*
|
|
16
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
17
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
18
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
19
|
+
* See the License for the specific language governing permissions and
|
|
20
|
+
* limitations under the License.
|
|
21
|
+
*/
|
|
7
22
|
|
|
8
23
|
#pragma once
|
|
9
24
|
|
|
25
|
+
#if defined USE_NVIDIA_RAFT
|
|
26
|
+
#include <raft/core/device_resources.hpp>
|
|
27
|
+
#include <rmm/mr/host/pinned_memory_resource.hpp>
|
|
28
|
+
#endif
|
|
29
|
+
|
|
10
30
|
#include <faiss/gpu/GpuResources.h>
|
|
11
31
|
#include <faiss/gpu/utils/DeviceUtils.h>
|
|
12
32
|
#include <faiss/gpu/utils/StackDeviceMemory.h>
|
|
@@ -15,6 +35,7 @@
|
|
|
15
35
|
#include <unordered_map>
|
|
16
36
|
#include <vector>
|
|
17
37
|
|
|
38
|
+
#pragma GCC visibility push(default)
|
|
18
39
|
namespace faiss {
|
|
19
40
|
namespace gpu {
|
|
20
41
|
|
|
@@ -58,6 +79,12 @@ class StandardGpuResourcesImpl : public GpuResources {
|
|
|
58
79
|
/// this stream upon exit from an index or other Faiss GPU call.
|
|
59
80
|
cudaStream_t getDefaultStream(int device) override;
|
|
60
81
|
|
|
82
|
+
#if defined USE_NVIDIA_RAFT
|
|
83
|
+
/// Returns the raft handle for the given device which can be used to
|
|
84
|
+
/// make calls to other raft primitives.
|
|
85
|
+
raft::device_resources& getRaftHandle(int device) override;
|
|
86
|
+
#endif
|
|
87
|
+
|
|
61
88
|
/// Called to change the work ordering streams to the null stream
|
|
62
89
|
/// for all devices
|
|
63
90
|
void setDefaultNullStreamAllDevices();
|
|
@@ -92,7 +119,7 @@ class StandardGpuResourcesImpl : public GpuResources {
|
|
|
92
119
|
|
|
93
120
|
cudaStream_t getAsyncCopyStream(int device) override;
|
|
94
121
|
|
|
95
|
-
|
|
122
|
+
protected:
|
|
96
123
|
/// Have GPU resources been initialized for this device yet?
|
|
97
124
|
bool isInitialized(int device) const;
|
|
98
125
|
|
|
@@ -100,7 +127,7 @@ class StandardGpuResourcesImpl : public GpuResources {
|
|
|
100
127
|
/// memory size
|
|
101
128
|
static size_t getDefaultTempMemForGPU(int device, size_t requested);
|
|
102
129
|
|
|
103
|
-
|
|
130
|
+
protected:
|
|
104
131
|
/// Set of currently outstanding memory allocations per device
|
|
105
132
|
/// device -> (alloc request, allocated ptr)
|
|
106
133
|
std::unordered_map<int, std::unordered_map<void*, AllocRequest>> allocs_;
|
|
@@ -124,6 +151,27 @@ class StandardGpuResourcesImpl : public GpuResources {
|
|
|
124
151
|
/// cuBLAS handle for each device
|
|
125
152
|
std::unordered_map<int, cublasHandle_t> blasHandles_;
|
|
126
153
|
|
|
154
|
+
#if defined USE_NVIDIA_RAFT
|
|
155
|
+
/// raft handle for each device
|
|
156
|
+
std::unordered_map<int, raft::device_resources> raftHandles_;
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* FIXME: Integrating these in a separate code path for now. Ultimately,
|
|
160
|
+
* it would be nice if we use a simple memory resource abstraction
|
|
161
|
+
* in FAISS so we could plug in whether to use RMM's memory resources
|
|
162
|
+
* or the default.
|
|
163
|
+
*
|
|
164
|
+
* There's enough duplicated logic that it doesn't *seem* to make sense
|
|
165
|
+
* to create a subclass only for the RMM memory resources.
|
|
166
|
+
*/
|
|
167
|
+
|
|
168
|
+
// managed_memory_resource
|
|
169
|
+
std::unique_ptr<rmm::mr::device_memory_resource> mmr_;
|
|
170
|
+
|
|
171
|
+
// pinned_memory_resource
|
|
172
|
+
std::unique_ptr<rmm::mr::host_memory_resource> pmr_;
|
|
173
|
+
#endif
|
|
174
|
+
|
|
127
175
|
/// Pinned memory allocation for use with this GPU
|
|
128
176
|
void* pinnedMemAlloc_;
|
|
129
177
|
size_t pinnedMemAllocSize_;
|
|
@@ -183,10 +231,15 @@ class StandardGpuResources : public GpuResourcesProvider {
|
|
|
183
231
|
/// Export a description of memory used for Python
|
|
184
232
|
std::map<int, std::map<std::string, std::pair<int, size_t>>> getMemoryInfo()
|
|
185
233
|
const;
|
|
186
|
-
|
|
187
234
|
/// Returns the current default stream
|
|
188
235
|
cudaStream_t getDefaultStream(int device);
|
|
189
236
|
|
|
237
|
+
#if defined USE_NVIDIA_RAFT
|
|
238
|
+
/// Returns the raft handle for the given device which can be used to
|
|
239
|
+
/// make calls to other raft primitives.
|
|
240
|
+
raft::device_resources& getRaftHandle(int device);
|
|
241
|
+
#endif
|
|
242
|
+
|
|
190
243
|
/// Returns the current amount of temp memory available
|
|
191
244
|
size_t getTempMemoryAvailable(int device) const;
|
|
192
245
|
|
|
@@ -203,3 +256,4 @@ class StandardGpuResources : public GpuResourcesProvider {
|
|
|
203
256
|
|
|
204
257
|
} // namespace gpu
|
|
205
258
|
} // namespace faiss
|
|
259
|
+
#pragma GCC visibility pop
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
8
|
#include <faiss/gpu/impl/InterleavedCodes.h>
|
|
9
|
+
#include <faiss/gpu/utils/DeviceUtils.h>
|
|
9
10
|
#include <faiss/gpu/utils/StaticUtils.h>
|
|
10
11
|
#include <faiss/impl/FaissAssert.h>
|
|
11
12
|
|
|
@@ -166,15 +167,16 @@ void unpackInterleavedWord(
|
|
|
166
167
|
int numVecs,
|
|
167
168
|
int dims,
|
|
168
169
|
int bitsPerCode) {
|
|
169
|
-
int
|
|
170
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
171
|
+
int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
|
|
170
172
|
int wordsPerBlock = wordsPerDimBlock * dims;
|
|
171
|
-
int numBlocks = utils::divUp(numVecs,
|
|
173
|
+
int numBlocks = utils::divUp(numVecs, warpSize);
|
|
172
174
|
|
|
173
175
|
#pragma omp parallel for
|
|
174
176
|
for (int i = 0; i < numVecs; ++i) {
|
|
175
|
-
int block = i /
|
|
177
|
+
int block = i / warpSize;
|
|
176
178
|
FAISS_ASSERT(block < numBlocks);
|
|
177
|
-
int lane = i %
|
|
179
|
+
int lane = i % warpSize;
|
|
178
180
|
|
|
179
181
|
for (int j = 0; j < dims; ++j) {
|
|
180
182
|
int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
|
|
@@ -188,9 +190,10 @@ std::vector<uint8_t> unpackInterleaved(
|
|
|
188
190
|
int numVecs,
|
|
189
191
|
int dims,
|
|
190
192
|
int bitsPerCode) {
|
|
191
|
-
int
|
|
193
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
194
|
+
int bytesPerDimBlock = warpSize * bitsPerCode / 8;
|
|
192
195
|
int bytesPerBlock = bytesPerDimBlock * dims;
|
|
193
|
-
int numBlocks = utils::divUp(numVecs,
|
|
196
|
+
int numBlocks = utils::divUp(numVecs, warpSize);
|
|
194
197
|
size_t totalSize = (size_t)bytesPerBlock * numBlocks;
|
|
195
198
|
FAISS_ASSERT(data.size() == totalSize);
|
|
196
199
|
|
|
@@ -217,8 +220,8 @@ std::vector<uint8_t> unpackInterleaved(
|
|
|
217
220
|
} else if (bitsPerCode == 4) {
|
|
218
221
|
#pragma omp parallel for
|
|
219
222
|
for (int i = 0; i < numVecs; ++i) {
|
|
220
|
-
int block = i /
|
|
221
|
-
int lane = i %
|
|
223
|
+
int block = i / warpSize;
|
|
224
|
+
int lane = i % warpSize;
|
|
222
225
|
|
|
223
226
|
int word = lane / 2;
|
|
224
227
|
int subWord = lane % 2;
|
|
@@ -235,8 +238,8 @@ std::vector<uint8_t> unpackInterleaved(
|
|
|
235
238
|
} else if (bitsPerCode == 5) {
|
|
236
239
|
#pragma omp parallel for
|
|
237
240
|
for (int i = 0; i < numVecs; ++i) {
|
|
238
|
-
int block = i /
|
|
239
|
-
int blockVector = i %
|
|
241
|
+
int block = i / warpSize;
|
|
242
|
+
int blockVector = i % warpSize;
|
|
240
243
|
|
|
241
244
|
for (int j = 0; j < dims; ++j) {
|
|
242
245
|
uint8_t* dimBlock =
|
|
@@ -257,8 +260,8 @@ std::vector<uint8_t> unpackInterleaved(
|
|
|
257
260
|
} else if (bitsPerCode == 6) {
|
|
258
261
|
#pragma omp parallel for
|
|
259
262
|
for (int i = 0; i < numVecs; ++i) {
|
|
260
|
-
int block = i /
|
|
261
|
-
int blockVector = i %
|
|
263
|
+
int block = i / warpSize;
|
|
264
|
+
int blockVector = i % warpSize;
|
|
262
265
|
|
|
263
266
|
for (int j = 0; j < dims; ++j) {
|
|
264
267
|
uint8_t* dimBlock =
|
|
@@ -442,17 +445,18 @@ void packInterleavedWord(
|
|
|
442
445
|
int numVecs,
|
|
443
446
|
int dims,
|
|
444
447
|
int bitsPerCode) {
|
|
445
|
-
int
|
|
448
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
449
|
+
int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
|
|
446
450
|
int wordsPerBlock = wordsPerDimBlock * dims;
|
|
447
|
-
int numBlocks = utils::divUp(numVecs,
|
|
451
|
+
int numBlocks = utils::divUp(numVecs, warpSize);
|
|
448
452
|
|
|
449
453
|
// We're guaranteed that all other slots not filled by the vectors present
|
|
450
454
|
// are initialized to zero (from the vector constructor in packInterleaved)
|
|
451
455
|
#pragma omp parallel for
|
|
452
456
|
for (int i = 0; i < numVecs; ++i) {
|
|
453
|
-
int block = i /
|
|
457
|
+
int block = i / warpSize;
|
|
454
458
|
FAISS_ASSERT(block < numBlocks);
|
|
455
|
-
int lane = i %
|
|
459
|
+
int lane = i % warpSize;
|
|
456
460
|
|
|
457
461
|
for (int j = 0; j < dims; ++j) {
|
|
458
462
|
int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
|
|
@@ -466,9 +470,10 @@ std::vector<uint8_t> packInterleaved(
|
|
|
466
470
|
int numVecs,
|
|
467
471
|
int dims,
|
|
468
472
|
int bitsPerCode) {
|
|
469
|
-
int
|
|
473
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
474
|
+
int bytesPerDimBlock = warpSize * bitsPerCode / 8;
|
|
470
475
|
int bytesPerBlock = bytesPerDimBlock * dims;
|
|
471
|
-
int numBlocks = utils::divUp(numVecs,
|
|
476
|
+
int numBlocks = utils::divUp(numVecs, warpSize);
|
|
472
477
|
size_t totalSize = (size_t)bytesPerBlock * numBlocks;
|
|
473
478
|
|
|
474
479
|
// bit codes padded to whole bytes
|
|
@@ -499,7 +504,7 @@ std::vector<uint8_t> packInterleaved(
|
|
|
499
504
|
for (int i = 0; i < numBlocks; ++i) {
|
|
500
505
|
for (int j = 0; j < dims; ++j) {
|
|
501
506
|
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
|
502
|
-
int loVec = i *
|
|
507
|
+
int loVec = i * warpSize + k * 2;
|
|
503
508
|
int hiVec = loVec + 1;
|
|
504
509
|
|
|
505
510
|
uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
|
|
@@ -516,7 +521,7 @@ std::vector<uint8_t> packInterleaved(
|
|
|
516
521
|
for (int j = 0; j < dims; ++j) {
|
|
517
522
|
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
|
518
523
|
// What input vectors we are pulling from
|
|
519
|
-
int loVec = i *
|
|
524
|
+
int loVec = i * warpSize + (k * 8) / 5;
|
|
520
525
|
int hiVec = loVec + 1;
|
|
521
526
|
int hiVec2 = hiVec + 1;
|
|
522
527
|
|
|
@@ -536,7 +541,7 @@ std::vector<uint8_t> packInterleaved(
|
|
|
536
541
|
for (int j = 0; j < dims; ++j) {
|
|
537
542
|
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
|
538
543
|
// What input vectors we are pulling from
|
|
539
|
-
int loVec = i *
|
|
544
|
+
int loVec = i * warpSize + (k * 8) / 6;
|
|
540
545
|
int hiVec = loVec + 1;
|
|
541
546
|
|
|
542
547
|
uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
#include <vector>
|
|
18
18
|
|
|
19
19
|
#include <cuda_profiler_api.h>
|
|
20
|
+
#include <faiss/impl/AuxIndexStructures.h>
|
|
20
21
|
|
|
21
22
|
DEFINE_int32(num, 10000, "# of vecs");
|
|
22
23
|
DEFINE_int32(k, 100, "# of clusters");
|
|
@@ -34,6 +35,7 @@ DEFINE_int64(
|
|
|
34
35
|
"minimum size to use CPU -> GPU paged copies");
|
|
35
36
|
DEFINE_int64(pinned_mem, -1, "pinned memory allocation to use");
|
|
36
37
|
DEFINE_int32(max_points, -1, "max points per centroid");
|
|
38
|
+
DEFINE_double(timeout, 0, "timeout in seconds");
|
|
37
39
|
|
|
38
40
|
using namespace faiss::gpu;
|
|
39
41
|
|
|
@@ -42,7 +44,7 @@ int main(int argc, char** argv) {
|
|
|
42
44
|
|
|
43
45
|
cudaProfilerStop();
|
|
44
46
|
|
|
45
|
-
auto seed = FLAGS_seed != -
|
|
47
|
+
auto seed = FLAGS_seed != -1 ? FLAGS_seed : time(nullptr);
|
|
46
48
|
printf("using seed %ld\n", seed);
|
|
47
49
|
|
|
48
50
|
std::vector<float> vecs((size_t)FLAGS_num * FLAGS_dim);
|
|
@@ -99,10 +101,14 @@ int main(int argc, char** argv) {
|
|
|
99
101
|
cp.max_points_per_centroid = FLAGS_max_points;
|
|
100
102
|
}
|
|
101
103
|
|
|
104
|
+
auto tc = new faiss::TimeoutCallback();
|
|
105
|
+
faiss::InterruptCallback::instance.reset(tc);
|
|
106
|
+
|
|
102
107
|
faiss::Clustering kmeans(FLAGS_dim, FLAGS_k, cp);
|
|
103
108
|
|
|
104
109
|
// Time k-means
|
|
105
110
|
{
|
|
111
|
+
tc->set_timeout(FLAGS_timeout);
|
|
106
112
|
CpuTimer timer;
|
|
107
113
|
|
|
108
114
|
kmeans.train(FLAGS_num, vecs.data(), *(gpuIndex.getIndex()));
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
#include <faiss/gpu/impl/InterleavedCodes.h>
|
|
9
9
|
#include <faiss/gpu/test/TestUtils.h>
|
|
10
|
+
#include <faiss/gpu/utils/DeviceUtils.h>
|
|
10
11
|
#include <faiss/gpu/utils/StaticUtils.h>
|
|
11
12
|
#include <gtest/gtest.h>
|
|
12
13
|
#include <cmath>
|
|
@@ -119,8 +120,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
|
|
|
119
120
|
std::cout << bitsPerCode << " " << dims << " " << numVecs
|
|
120
121
|
<< "\n";
|
|
121
122
|
|
|
122
|
-
int
|
|
123
|
-
int
|
|
123
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
124
|
+
int blocks = utils::divUp(numVecs, warpSize);
|
|
125
|
+
int bytesPerDimBlock = warpSize * bitsPerCode / 8;
|
|
124
126
|
int bytesPerBlock = bytesPerDimBlock * dims;
|
|
125
127
|
int size = blocks * bytesPerBlock;
|
|
126
128
|
|
|
@@ -132,9 +134,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
|
|
|
132
134
|
|
|
133
135
|
for (int i = 0; i < blocks; ++i) {
|
|
134
136
|
for (int j = 0; j < dims; ++j) {
|
|
135
|
-
for (int k = 0; k <
|
|
137
|
+
for (int k = 0; k < warpSize; ++k) {
|
|
136
138
|
for (int l = 0; l < bytesPerCode; ++l) {
|
|
137
|
-
int vec = i *
|
|
139
|
+
int vec = i * warpSize + k;
|
|
138
140
|
if (vec < numVecs) {
|
|
139
141
|
data[i * bytesPerBlock +
|
|
140
142
|
j * bytesPerDimBlock +
|
|
@@ -148,7 +150,8 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
|
|
|
148
150
|
for (int i = 0; i < blocks; ++i) {
|
|
149
151
|
for (int j = 0; j < dims; ++j) {
|
|
150
152
|
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
|
151
|
-
int loVec =
|
|
153
|
+
int loVec =
|
|
154
|
+
i * warpSize + (k * 8) / bitsPerCode;
|
|
152
155
|
int hiVec = loVec + 1;
|
|
153
156
|
int hiVec2 = hiVec + 1;
|
|
154
157
|
|