faiss 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (192)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/faiss/extconf.rb +1 -1
  4. data/lib/faiss/version.rb +1 -1
  5. data/vendor/faiss/benchs/bench_6bit_codec.cpp +80 -0
  6. data/vendor/faiss/c_api/AutoTune_c.h +2 -0
  7. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -6
  8. data/vendor/faiss/c_api/IndexShards_c.h +1 -4
  9. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +4 -2
  10. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +1 -1
  11. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +1 -1
  12. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +1 -1
  13. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +1 -1
  14. data/vendor/faiss/demos/demo_imi_flat.cpp +5 -2
  15. data/vendor/faiss/demos/demo_imi_pq.cpp +6 -2
  16. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +7 -2
  17. data/vendor/faiss/{AutoTune.cpp → faiss/AutoTune.cpp} +9 -9
  18. data/vendor/faiss/{AutoTune.h → faiss/AutoTune.h} +0 -0
  19. data/vendor/faiss/{Clustering.cpp → faiss/Clustering.cpp} +13 -12
  20. data/vendor/faiss/{Clustering.h → faiss/Clustering.h} +0 -0
  21. data/vendor/faiss/{DirectMap.cpp → faiss/DirectMap.cpp} +0 -0
  22. data/vendor/faiss/{DirectMap.h → faiss/DirectMap.h} +0 -0
  23. data/vendor/faiss/{IVFlib.cpp → faiss/IVFlib.cpp} +86 -11
  24. data/vendor/faiss/{IVFlib.h → faiss/IVFlib.h} +26 -8
  25. data/vendor/faiss/{Index.cpp → faiss/Index.cpp} +0 -0
  26. data/vendor/faiss/{Index.h → faiss/Index.h} +1 -1
  27. data/vendor/faiss/{Index2Layer.cpp → faiss/Index2Layer.cpp} +12 -11
  28. data/vendor/faiss/{Index2Layer.h → faiss/Index2Layer.h} +0 -0
  29. data/vendor/faiss/{IndexBinary.cpp → faiss/IndexBinary.cpp} +2 -1
  30. data/vendor/faiss/{IndexBinary.h → faiss/IndexBinary.h} +0 -0
  31. data/vendor/faiss/{IndexBinaryFlat.cpp → faiss/IndexBinaryFlat.cpp} +0 -0
  32. data/vendor/faiss/{IndexBinaryFlat.h → faiss/IndexBinaryFlat.h} +0 -0
  33. data/vendor/faiss/{IndexBinaryFromFloat.cpp → faiss/IndexBinaryFromFloat.cpp} +1 -0
  34. data/vendor/faiss/{IndexBinaryFromFloat.h → faiss/IndexBinaryFromFloat.h} +0 -0
  35. data/vendor/faiss/{IndexBinaryHNSW.cpp → faiss/IndexBinaryHNSW.cpp} +1 -2
  36. data/vendor/faiss/{IndexBinaryHNSW.h → faiss/IndexBinaryHNSW.h} +0 -0
  37. data/vendor/faiss/{IndexBinaryHash.cpp → faiss/IndexBinaryHash.cpp} +16 -7
  38. data/vendor/faiss/{IndexBinaryHash.h → faiss/IndexBinaryHash.h} +2 -1
  39. data/vendor/faiss/{IndexBinaryIVF.cpp → faiss/IndexBinaryIVF.cpp} +10 -16
  40. data/vendor/faiss/{IndexBinaryIVF.h → faiss/IndexBinaryIVF.h} +1 -1
  41. data/vendor/faiss/{IndexFlat.cpp → faiss/IndexFlat.cpp} +0 -0
  42. data/vendor/faiss/{IndexFlat.h → faiss/IndexFlat.h} +0 -0
  43. data/vendor/faiss/{IndexHNSW.cpp → faiss/IndexHNSW.cpp} +63 -32
  44. data/vendor/faiss/{IndexHNSW.h → faiss/IndexHNSW.h} +0 -0
  45. data/vendor/faiss/{IndexIVF.cpp → faiss/IndexIVF.cpp} +129 -46
  46. data/vendor/faiss/{IndexIVF.h → faiss/IndexIVF.h} +7 -3
  47. data/vendor/faiss/{IndexIVFFlat.cpp → faiss/IndexIVFFlat.cpp} +6 -5
  48. data/vendor/faiss/{IndexIVFFlat.h → faiss/IndexIVFFlat.h} +0 -0
  49. data/vendor/faiss/{IndexIVFPQ.cpp → faiss/IndexIVFPQ.cpp} +9 -8
  50. data/vendor/faiss/{IndexIVFPQ.h → faiss/IndexIVFPQ.h} +4 -2
  51. data/vendor/faiss/{IndexIVFPQR.cpp → faiss/IndexIVFPQR.cpp} +3 -1
  52. data/vendor/faiss/{IndexIVFPQR.h → faiss/IndexIVFPQR.h} +0 -0
  53. data/vendor/faiss/{IndexIVFSpectralHash.cpp → faiss/IndexIVFSpectralHash.cpp} +1 -1
  54. data/vendor/faiss/{IndexIVFSpectralHash.h → faiss/IndexIVFSpectralHash.h} +0 -0
  55. data/vendor/faiss/{IndexLSH.cpp → faiss/IndexLSH.cpp} +0 -0
  56. data/vendor/faiss/{IndexLSH.h → faiss/IndexLSH.h} +0 -0
  57. data/vendor/faiss/{IndexLattice.cpp → faiss/IndexLattice.cpp} +0 -0
  58. data/vendor/faiss/{IndexLattice.h → faiss/IndexLattice.h} +0 -0
  59. data/vendor/faiss/{IndexPQ.cpp → faiss/IndexPQ.cpp} +6 -6
  60. data/vendor/faiss/{IndexPQ.h → faiss/IndexPQ.h} +3 -1
  61. data/vendor/faiss/{IndexPreTransform.cpp → faiss/IndexPreTransform.cpp} +0 -0
  62. data/vendor/faiss/{IndexPreTransform.h → faiss/IndexPreTransform.h} +0 -0
  63. data/vendor/faiss/{IndexReplicas.cpp → faiss/IndexReplicas.cpp} +102 -10
  64. data/vendor/faiss/{IndexReplicas.h → faiss/IndexReplicas.h} +6 -0
  65. data/vendor/faiss/{IndexScalarQuantizer.cpp → faiss/IndexScalarQuantizer.cpp} +3 -3
  66. data/vendor/faiss/{IndexScalarQuantizer.h → faiss/IndexScalarQuantizer.h} +0 -0
  67. data/vendor/faiss/{IndexShards.cpp → faiss/IndexShards.cpp} +37 -12
  68. data/vendor/faiss/{IndexShards.h → faiss/IndexShards.h} +3 -4
  69. data/vendor/faiss/{InvertedLists.cpp → faiss/InvertedLists.cpp} +2 -2
  70. data/vendor/faiss/{InvertedLists.h → faiss/InvertedLists.h} +1 -0
  71. data/vendor/faiss/{MatrixStats.cpp → faiss/MatrixStats.cpp} +0 -0
  72. data/vendor/faiss/{MatrixStats.h → faiss/MatrixStats.h} +0 -0
  73. data/vendor/faiss/{MetaIndexes.cpp → faiss/MetaIndexes.cpp} +5 -3
  74. data/vendor/faiss/{MetaIndexes.h → faiss/MetaIndexes.h} +0 -0
  75. data/vendor/faiss/{MetricType.h → faiss/MetricType.h} +0 -0
  76. data/vendor/faiss/{OnDiskInvertedLists.cpp → faiss/OnDiskInvertedLists.cpp} +141 -3
  77. data/vendor/faiss/{OnDiskInvertedLists.h → faiss/OnDiskInvertedLists.h} +27 -7
  78. data/vendor/faiss/{VectorTransform.cpp → faiss/VectorTransform.cpp} +4 -3
  79. data/vendor/faiss/{VectorTransform.h → faiss/VectorTransform.h} +0 -0
  80. data/vendor/faiss/{clone_index.cpp → faiss/clone_index.cpp} +0 -0
  81. data/vendor/faiss/{clone_index.h → faiss/clone_index.h} +0 -0
  82. data/vendor/faiss/{gpu → faiss/gpu}/GpuAutoTune.cpp +0 -0
  83. data/vendor/faiss/{gpu → faiss/gpu}/GpuAutoTune.h +0 -0
  84. data/vendor/faiss/{gpu → faiss/gpu}/GpuCloner.cpp +14 -14
  85. data/vendor/faiss/{gpu → faiss/gpu}/GpuCloner.h +6 -7
  86. data/vendor/faiss/{gpu → faiss/gpu}/GpuClonerOptions.cpp +0 -0
  87. data/vendor/faiss/{gpu → faiss/gpu}/GpuClonerOptions.h +0 -0
  88. data/vendor/faiss/{gpu → faiss/gpu}/GpuDistance.h +12 -4
  89. data/vendor/faiss/{gpu → faiss/gpu}/GpuFaissAssert.h +0 -0
  90. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndex.h +3 -9
  91. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexBinaryFlat.h +7 -7
  92. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexFlat.h +35 -10
  93. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexIVF.h +1 -2
  94. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexIVFFlat.h +4 -3
  95. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexIVFPQ.h +21 -4
  96. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexIVFScalarQuantizer.h +4 -3
  97. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndicesOptions.h +0 -0
  98. data/vendor/faiss/faiss/gpu/GpuResources.cpp +200 -0
  99. data/vendor/faiss/faiss/gpu/GpuResources.h +264 -0
  100. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +572 -0
  101. data/vendor/faiss/{gpu → faiss/gpu}/StandardGpuResources.h +83 -15
  102. data/vendor/faiss/{gpu → faiss/gpu}/impl/RemapIndices.cpp +0 -0
  103. data/vendor/faiss/{gpu → faiss/gpu}/impl/RemapIndices.h +0 -0
  104. data/vendor/faiss/{gpu → faiss/gpu}/perf/IndexWrapper-inl.h +1 -1
  105. data/vendor/faiss/{gpu → faiss/gpu}/perf/IndexWrapper.h +1 -1
  106. data/vendor/faiss/{gpu → faiss/gpu}/perf/PerfClustering.cpp +1 -1
  107. data/vendor/faiss/{gpu → faiss/gpu}/perf/PerfIVFPQAdd.cpp +0 -0
  108. data/vendor/faiss/{gpu → faiss/gpu}/perf/WriteIndex.cpp +0 -0
  109. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexBinaryFlat.cpp +0 -0
  110. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexFlat.cpp +1 -1
  111. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexIVFFlat.cpp +0 -0
  112. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexIVFPQ.cpp +141 -52
  113. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuMemoryException.cpp +0 -0
  114. data/vendor/faiss/{gpu → faiss/gpu}/test/TestUtils.cpp +4 -2
  115. data/vendor/faiss/{gpu → faiss/gpu}/test/TestUtils.h +0 -0
  116. data/vendor/faiss/{gpu → faiss/gpu}/test/demo_ivfpq_indexing_gpu.cpp +7 -5
  117. data/vendor/faiss/{gpu → faiss/gpu}/utils/DeviceUtils.h +1 -1
  118. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +213 -0
  119. data/vendor/faiss/{gpu → faiss/gpu}/utils/StackDeviceMemory.h +25 -40
  120. data/vendor/faiss/{gpu → faiss/gpu}/utils/StaticUtils.h +0 -0
  121. data/vendor/faiss/{gpu → faiss/gpu}/utils/Timer.cpp +0 -0
  122. data/vendor/faiss/{gpu → faiss/gpu}/utils/Timer.h +0 -0
  123. data/vendor/faiss/{impl → faiss/impl}/AuxIndexStructures.cpp +1 -0
  124. data/vendor/faiss/{impl → faiss/impl}/AuxIndexStructures.h +3 -1
  125. data/vendor/faiss/{impl → faiss/impl}/FaissAssert.h +1 -0
  126. data/vendor/faiss/{impl → faiss/impl}/FaissException.cpp +26 -0
  127. data/vendor/faiss/{impl → faiss/impl}/FaissException.h +4 -0
  128. data/vendor/faiss/{impl → faiss/impl}/HNSW.cpp +26 -26
  129. data/vendor/faiss/{impl → faiss/impl}/HNSW.h +19 -11
  130. data/vendor/faiss/{impl → faiss/impl}/PolysemousTraining.cpp +1 -1
  131. data/vendor/faiss/{impl → faiss/impl}/PolysemousTraining.h +1 -1
  132. data/vendor/faiss/{impl → faiss/impl}/ProductQuantizer-inl.h +0 -1
  133. data/vendor/faiss/{impl → faiss/impl}/ProductQuantizer.cpp +9 -9
  134. data/vendor/faiss/{impl → faiss/impl}/ProductQuantizer.h +0 -0
  135. data/vendor/faiss/{impl → faiss/impl}/ScalarQuantizer.cpp +63 -39
  136. data/vendor/faiss/{impl → faiss/impl}/ScalarQuantizer.h +1 -1
  137. data/vendor/faiss/{impl → faiss/impl}/ThreadedIndex-inl.h +0 -0
  138. data/vendor/faiss/{impl → faiss/impl}/ThreadedIndex.h +0 -0
  139. data/vendor/faiss/{impl → faiss/impl}/index_read.cpp +99 -116
  140. data/vendor/faiss/{impl → faiss/impl}/index_write.cpp +15 -50
  141. data/vendor/faiss/{impl → faiss/impl}/io.cpp +15 -10
  142. data/vendor/faiss/{impl → faiss/impl}/io.h +22 -8
  143. data/vendor/faiss/faiss/impl/io_macros.h +57 -0
  144. data/vendor/faiss/{impl → faiss/impl}/lattice_Zn.cpp +52 -36
  145. data/vendor/faiss/{impl → faiss/impl}/lattice_Zn.h +3 -3
  146. data/vendor/faiss/faiss/impl/platform_macros.h +24 -0
  147. data/vendor/faiss/{index_factory.cpp → faiss/index_factory.cpp} +33 -12
  148. data/vendor/faiss/{index_factory.h → faiss/index_factory.h} +0 -0
  149. data/vendor/faiss/{index_io.h → faiss/index_io.h} +55 -1
  150. data/vendor/faiss/faiss/python/python_callbacks.cpp +112 -0
  151. data/vendor/faiss/faiss/python/python_callbacks.h +45 -0
  152. data/vendor/faiss/{utils → faiss/utils}/Heap.cpp +5 -5
  153. data/vendor/faiss/{utils → faiss/utils}/Heap.h +1 -3
  154. data/vendor/faiss/{utils → faiss/utils}/WorkerThread.cpp +0 -0
  155. data/vendor/faiss/{utils → faiss/utils}/WorkerThread.h +0 -0
  156. data/vendor/faiss/{utils → faiss/utils}/distances.cpp +28 -13
  157. data/vendor/faiss/{utils → faiss/utils}/distances.h +2 -1
  158. data/vendor/faiss/{utils → faiss/utils}/distances_simd.cpp +5 -5
  159. data/vendor/faiss/{utils → faiss/utils}/extra_distances.cpp +8 -7
  160. data/vendor/faiss/{utils → faiss/utils}/extra_distances.h +0 -0
  161. data/vendor/faiss/{utils → faiss/utils}/hamming-inl.h +1 -3
  162. data/vendor/faiss/{utils → faiss/utils}/hamming.cpp +8 -7
  163. data/vendor/faiss/{utils → faiss/utils}/hamming.h +7 -1
  164. data/vendor/faiss/{utils → faiss/utils}/random.cpp +5 -5
  165. data/vendor/faiss/{utils → faiss/utils}/random.h +0 -0
  166. data/vendor/faiss/{utils → faiss/utils}/utils.cpp +27 -28
  167. data/vendor/faiss/{utils → faiss/utils}/utils.h +4 -0
  168. data/vendor/faiss/misc/test_blas.cpp +4 -1
  169. data/vendor/faiss/tests/test_binary_flat.cpp +0 -2
  170. data/vendor/faiss/tests/test_dealloc_invlists.cpp +6 -1
  171. data/vendor/faiss/tests/test_ivfpq_codec.cpp +4 -1
  172. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +6 -4
  173. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +12 -5
  174. data/vendor/faiss/tests/test_merge.cpp +6 -3
  175. data/vendor/faiss/tests/test_ondisk_ivf.cpp +7 -2
  176. data/vendor/faiss/tests/test_pairs_decoding.cpp +5 -1
  177. data/vendor/faiss/tests/test_params_override.cpp +7 -2
  178. data/vendor/faiss/tests/test_sliding_ivf.cpp +10 -4
  179. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +14 -8
  180. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +11 -7
  181. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +12 -7
  182. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +6 -3
  183. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +7 -3
  184. metadata +154 -153
  185. data/vendor/faiss/gpu/GpuResources.cpp +0 -52
  186. data/vendor/faiss/gpu/GpuResources.h +0 -73
  187. data/vendor/faiss/gpu/StandardGpuResources.cpp +0 -303
  188. data/vendor/faiss/gpu/utils/DeviceMemory.cpp +0 -77
  189. data/vendor/faiss/gpu/utils/DeviceMemory.h +0 -71
  190. data/vendor/faiss/gpu/utils/MemorySpace.cpp +0 -89
  191. data/vendor/faiss/gpu/utils/MemorySpace.h +0 -44
  192. data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +0 -239
@@ -0,0 +1,572 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+
9
+ #include <faiss/gpu/StandardGpuResources.h>
10
+ #include <faiss/gpu/utils/DeviceUtils.h>
11
+ #include <faiss/gpu/utils/StaticUtils.h>
12
+ #include <faiss/impl/FaissAssert.h>
13
+ #include <limits>
14
+ #include <iostream>
15
+ #include <sstream>
16
+
17
+ namespace faiss { namespace gpu {
18
+
19
+ namespace {
20
+
21
+ // How many streams per device we allocate by default (for multi-streaming)
22
+ constexpr int kNumStreams = 2;
23
+
24
+ // Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
25
+ constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
26
+
27
+ // Default temporary memory allocation for <= 4 GiB memory GPUs
28
+ constexpr size_t k4GiBTempMem = (size_t) 512 * 1024 * 1024;
29
+
30
+ // Default temporary memory allocation for <= 8 GiB memory GPUs
31
+ constexpr size_t k8GiBTempMem = (size_t) 1024 * 1024 * 1024;
32
+
33
+ // Maximum temporary memory allocation for all GPUs
34
+ constexpr size_t kMaxTempMem = (size_t) 1536 * 1024 * 1024;
35
+
36
+ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
37
+ // Produce a sorted list of all outstanding allocations by type
38
+ std::unordered_map<AllocType, std::pair<int, size_t>> stats;
39
+
40
+ for (auto& entry : map) {
41
+ auto& a = entry.second;
42
+
43
+ auto it = stats.find(a.type);
44
+ if (it != stats.end()) {
45
+ stats[a.type].first++;
46
+ stats[a.type].second += a.size;
47
+ } else {
48
+ stats[a.type] = std::make_pair(1, a.size);
49
+ }
50
+ }
51
+
52
+ std::stringstream ss;
53
+ for (auto& entry : stats) {
54
+ ss << "Alloc type " << allocTypeToString(entry.first) << ": "
55
+ << entry.second.first << " allocations, "
56
+ << entry.second.second << " bytes\n";
57
+ }
58
+
59
+ return ss.str();
60
+ }
61
+
62
+ }
63
+
64
+ //
65
+ // StandardGpuResourcesImpl
66
+ //
67
+
68
// Constructs resources with everything lazily initialized per device; no
// CUDA calls are made here.  Per-device state is created on first use in
// initializeForDevice().
StandardGpuResourcesImpl::StandardGpuResourcesImpl() :
    pinnedMemAlloc_(nullptr),
    pinnedMemAllocSize_(0),
    // let the adjustment function determine the memory size for us by passing
    // in a huge value that will then be adjusted
    tempMemSize_(getDefaultTempMemForGPU(-1,
                                         std::numeric_limits<size_t>::max())),
    pinnedMemSize_(kDefaultPinnedMemoryAllocation),
    allocLogging_(false) {
}
78
+
79
// Tears down all per-device state.  Order matters: the temp-memory stacks
// allocated through allocMemory() must be released before we verify that the
// allocation tables are empty, and streams/handles must be destroyed with the
// owning device current (DeviceScope).
StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
  // The temporary memory allocator has allocated memory through us, so clean
  // that up before we finish fully de-initializing ourselves
  tempMemory_.clear();

  // Make sure all allocations have been freed
  bool allocError = false;

  for (auto& entry : allocs_) {
    auto& map = entry.second;

    if (!map.empty()) {
      // Report leaks per device before asserting, so the user sees what is
      // still outstanding
      std::cerr
        << "StandardGpuResources destroyed with allocations outstanding:\n"
        << "Device " << entry.first << " outstanding allocations:\n";
      std::cerr << allocsToString(map);
      allocError = true;
    }
  }

  FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up");

  for (auto& entry : defaultStreams_) {
    DeviceScope scope(entry.first);

    // Only destroy streams we created; user-provided default streams are
    // owned by the caller
    auto it = userDefaultStreams_.find(entry.first);
    if (it == userDefaultStreams_.end()) {
      // The user did not specify this stream, thus we are the ones
      // who have created it
      CUDA_VERIFY(cudaStreamDestroy(entry.second));
    }
  }

  for (auto& entry : alternateStreams_) {
    DeviceScope scope(entry.first);

    for (auto stream : entry.second) {
      CUDA_VERIFY(cudaStreamDestroy(stream));
    }
  }

  for (auto& entry : asyncCopyStreams_) {
    DeviceScope scope(entry.first);

    CUDA_VERIFY(cudaStreamDestroy(entry.second));
  }

  for (auto& entry : blasHandles_) {
    DeviceScope scope(entry.first);

    auto blasStatus = cublasDestroy(entry.second);
    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
  }

  // Release the (single, shared across devices) pinned staging buffer, if it
  // was ever allocated
  if (pinnedMemAlloc_) {
    auto err = cudaFreeHost(pinnedMemAlloc_);
    FAISS_ASSERT_FMT(err == cudaSuccess,
                     "Failed to cudaFreeHost pointer %p (error %d %s)",
                     pinnedMemAlloc_, (int) err, cudaGetErrorString(err));
  }
}
140
+
141
+ size_t
142
+ StandardGpuResourcesImpl::getDefaultTempMemForGPU(int device,
143
+ size_t requested) {
144
+ auto totalMem = device != -1 ?
145
+ getDeviceProperties(device).totalGlobalMem :
146
+ std::numeric_limits<size_t>::max();
147
+
148
+ if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) {
149
+ // If the GPU has <= 4 GiB of memory, reserve 512 MiB
150
+
151
+ if (requested > k4GiBTempMem) {
152
+ return k4GiBTempMem;
153
+ }
154
+ } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) {
155
+ // If the GPU has <= 8 GiB of memory, reserve 1 GiB
156
+
157
+ if (requested > k8GiBTempMem) {
158
+ return k8GiBTempMem;
159
+ }
160
+ } else {
161
+ // Never use more than 1.5 GiB
162
+ if (requested > kMaxTempMem) {
163
+ return kMaxTempMem;
164
+ }
165
+ }
166
+
167
+ // use whatever lower limit the user requested
168
+ return requested;
169
+ }
170
+
171
// Disables the temp-memory cache entirely; equivalent to setTempMemory(0).
// Every temporary request will then fall through to cudaMalloc.
void
StandardGpuResourcesImpl::noTempMemory() {
  setTempMemory(0);
}
175
+
176
+ void
177
+ StandardGpuResourcesImpl::setTempMemory(size_t size) {
178
+ if (tempMemSize_ != size) {
179
+ // adjust based on general limits
180
+ tempMemSize_ = getDefaultTempMemForGPU(-1, size);
181
+
182
+ // We need to re-initialize memory resources for all current devices that
183
+ // have been initialized.
184
+ // This should be safe to do, even if we are currently running work, because
185
+ // the cudaFree call that this implies will force-synchronize all GPUs with
186
+ // the CPU
187
+ for (auto& p : tempMemory_) {
188
+ int device = p.first;
189
+ // Free the existing memory first
190
+ p.second.reset();
191
+
192
+ // Allocate new
193
+ p.second = std::unique_ptr<StackDeviceMemory>(
194
+ new StackDeviceMemory(this,
195
+ p.first,
196
+ // adjust for this specific device
197
+ getDefaultTempMemForGPU(device, tempMemSize_)));
198
+ }
199
+ }
200
+ }
201
+
202
+ void
203
+ StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
204
+ // Should not call this after devices have been initialized
205
+ FAISS_ASSERT(defaultStreams_.size() == 0);
206
+ FAISS_ASSERT(!pinnedMemAlloc_);
207
+
208
+ pinnedMemSize_ = size;
209
+ }
210
+
211
+ void
212
+ StandardGpuResourcesImpl::setDefaultStream(int device, cudaStream_t stream) {
213
+ auto it = defaultStreams_.find(device);
214
+ if (it != defaultStreams_.end()) {
215
+ // Replace this stream with the user stream
216
+ CUDA_VERIFY(cudaStreamDestroy(it->second));
217
+ it->second = stream;
218
+ }
219
+
220
+ userDefaultStreams_[device] = stream;
221
+ }
222
+
223
+ void
224
+ StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
225
+ for (int dev = 0; dev < getNumDevices(); ++dev) {
226
+ setDefaultStream(dev, nullptr);
227
+ }
228
+ }
229
+
230
// When enabled, every allocation/deallocation is echoed to stdout (see
// allocMemory / deallocMemory).
void
StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
  allocLogging_ = enable;
}
234
+
235
+ bool
236
+ StandardGpuResourcesImpl::isInitialized(int device) const {
237
+ // Use default streams as a marker for whether or not a certain
238
+ // device has been initialized
239
+ return defaultStreams_.count(device) != 0;
240
+ }
241
+
242
// Lazily creates all per-device state the first time `device` is touched:
// default/alternate/async-copy streams, the cuBLAS handle, the allocation
// tracking table, and the temp-memory stack.  The process-wide pinned host
// buffer is allocated on first use of any device.  Idempotent.
void
StandardGpuResourcesImpl::initializeForDevice(int device) {
  if (isInitialized(device)) {
    return;
  }

  // If this is the first device that we're initializing, create our
  // pinned memory allocation
  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
    auto err =
      cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);

    FAISS_THROW_IF_NOT_FMT(
      err == cudaSuccess,
      "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
      "async copy buffer (error %d %s)",
      pinnedMemSize_, (int) err, cudaGetErrorString(err));

    pinnedMemAllocSize_ = pinnedMemSize_;
  }

  FAISS_ASSERT(device < getNumDevices());
  // Make `device` current for all CUDA calls below
  DeviceScope scope(device);

  // Make sure that device properties for all devices are cached
  auto& prop = getDeviceProperties(device);

  // Also check to make sure we meet our minimum compute capability (3.0)
  FAISS_ASSERT_FMT(prop.major >= 3,
                   "Device id %d with CC %d.%d not supported, "
                   "need 3.0+ compute capability",
                   device, prop.major, prop.minor);

  // Create streams
  cudaStream_t defaultStream = 0;
  auto it = userDefaultStreams_.find(device);
  if (it != userDefaultStreams_.end()) {
    // We already have a stream provided by the user
    defaultStream = it->second;
  } else {
    CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
                                          cudaStreamNonBlocking));
  }

  defaultStreams_[device] = defaultStream;

  // Dedicated stream for asynchronous host <-> device copies
  cudaStream_t asyncCopyStream = 0;
  CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
                                        cudaStreamNonBlocking));

  asyncCopyStreams_[device] = asyncCopyStream;

  // kNumStreams additional streams for multi-streamed execution
  std::vector<cudaStream_t> deviceStreams;
  for (int j = 0; j < kNumStreams; ++j) {
    cudaStream_t stream = 0;
    CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
                                          cudaStreamNonBlocking));

    deviceStreams.push_back(stream);
  }

  alternateStreams_[device] = std::move(deviceStreams);

  // Create cuBLAS handle
  cublasHandle_t blasHandle = 0;
  auto blasStatus = cublasCreate(&blasHandle);
  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
  blasHandles_[device] = blasHandle;

  // Enable tensor core support if available
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 11000
  // This flag was deprecated in CUDA 11
  if (getTensorCoreSupport(device)) {
    cublasSetMathMode(blasHandle, CUBLAS_TENSOR_OP_MATH);
  }
#endif
#if CUDA_VERSION >= 11000
  // NOTE(review): on CUDA 11+ this forbids reduced-precision reductions in
  // cuBLAS — presumably for numerical reproducibility; confirm intent.
  cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
#endif

  // Fresh (empty) allocation table for this device
  FAISS_ASSERT(allocs_.count(device) == 0);
  allocs_[device] = std::unordered_map<void*, AllocRequest>();

  // Temp-memory stack, sized for this specific device
  FAISS_ASSERT(tempMemory_.count(device) == 0);
  auto mem = std::unique_ptr<StackDeviceMemory>(
    new StackDeviceMemory(this,
                          device,
                          // adjust for this specific device
                          getDefaultTempMemForGPU(device, tempMemSize_)));

  tempMemory_.emplace(device, std::move(mem));
}
334
+
335
// Returns the cuBLAS handle for `device`, lazily initializing the device.
cublasHandle_t
StandardGpuResourcesImpl::getBlasHandle(int device) {
  initializeForDevice(device);
  return blasHandles_[device];
}
340
+
341
// Returns the default stream for `device` (user-supplied if one was set via
// setDefaultStream, otherwise one we created), lazily initializing the device.
cudaStream_t
StandardGpuResourcesImpl::getDefaultStream(int device) {
  initializeForDevice(device);
  return defaultStreams_[device];
}
346
+
347
// Returns the kNumStreams alternate execution streams for `device`, lazily
// initializing the device.  Returned by value (a copy of the handles).
std::vector<cudaStream_t>
StandardGpuResourcesImpl::getAlternateStreams(int device) {
  initializeForDevice(device);
  return alternateStreams_[device];
}
352
+
353
// Returns the pinned host staging buffer and its size.  May be
// {nullptr, 0} if no device has been initialized yet or pinned memory was
// set to zero.
std::pair<void*, size_t>
StandardGpuResourcesImpl::getPinnedMemory() {
  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
}
357
+
358
// Returns the dedicated host <-> device copy stream for `device`, lazily
// initializing the device.
cudaStream_t
StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
  initializeForDevice(device);
  return asyncCopyStreams_[device];
}
363
+
364
+ void*
365
+ StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
366
+ initializeForDevice(req.device);
367
+
368
+ // We don't allocate a placeholder for zero-sized allocations
369
+ if (req.size == 0) {
370
+ return nullptr;
371
+ }
372
+
373
+ // Make sure that the allocation is a multiple of 16 bytes for alignment
374
+ // purposes
375
+ auto adjReq = req;
376
+ adjReq.size = utils::roundUp(adjReq.size, (size_t) 16);
377
+
378
+ void* p = nullptr;
379
+
380
+ if (allocLogging_) {
381
+ std::cout << "StandardGpuResources: alloc " << adjReq.toString() << "\n";
382
+ }
383
+
384
+ if (adjReq.space == MemorySpace::Temporary) {
385
+ // If we don't have enough space in our temporary memory manager, we need
386
+ // to allocate this request separately
387
+ auto& tempMem = tempMemory_[adjReq.device];
388
+
389
+ if (adjReq.size > tempMem->getSizeAvailable()) {
390
+ // We need to allocate this ourselves
391
+ AllocRequest newReq = adjReq;
392
+ newReq.space = MemorySpace::Device;
393
+ newReq.type = AllocType::TemporaryMemoryOverflow;
394
+
395
+ return allocMemory(newReq);
396
+ }
397
+
398
+ // Otherwise, we can handle this locally
399
+ p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
400
+
401
+ } else if (adjReq.space == MemorySpace::Device) {
402
+ auto err = cudaMalloc(&p, adjReq.size);
403
+
404
+ // Throw if we fail to allocate
405
+ if (err != cudaSuccess) {
406
+ auto& map = allocs_[req.device];
407
+
408
+ std::stringstream ss;
409
+ ss << "Failed to cudaMalloc " << adjReq.size << " bytes "
410
+ << "on device " << adjReq.device << " (error "
411
+ << (int) err << " " << cudaGetErrorString(err)
412
+ << "\nOutstanding allocations:\n" << allocsToString(map);
413
+ auto str = ss.str();
414
+
415
+ FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
416
+ }
417
+ } else if (adjReq.space == MemorySpace::Unified) {
418
+ auto err = cudaMallocManaged(&p, adjReq.size);
419
+
420
+ if (err != cudaSuccess) {
421
+ auto& map = allocs_[req.device];
422
+
423
+ std::stringstream ss;
424
+ ss << "Failed to cudaMallocManaged " << adjReq.size << " bytes "
425
+ << "(error " << (int) err << " " << cudaGetErrorString(err)
426
+ << "\nOutstanding allocations:\n" << allocsToString(map);
427
+ auto str = ss.str();
428
+
429
+ FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
430
+ }
431
+ } else {
432
+ FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) adjReq.space);
433
+ }
434
+
435
+ allocs_[adjReq.device][p] = adjReq;
436
+
437
+ return p;
438
+ }
439
+
440
+ void
441
+ StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
442
+ FAISS_ASSERT(isInitialized(device));
443
+
444
+ if (!p) {
445
+ return;
446
+ }
447
+
448
+ auto& a = allocs_[device];
449
+ auto it = a.find(p);
450
+ FAISS_ASSERT(it != a.end());
451
+
452
+ auto& req = it->second;
453
+
454
+ if (allocLogging_) {
455
+ std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
456
+ }
457
+
458
+ if (req.space == MemorySpace::Temporary) {
459
+ tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
460
+
461
+ } else if (req.space == MemorySpace::Device ||
462
+ req.space == MemorySpace::Unified) {
463
+ auto err = cudaFree(p);
464
+ FAISS_ASSERT_FMT(err == cudaSuccess,
465
+ "Failed to cudaFree pointer %p (error %d %s)",
466
+ p, (int) err, cudaGetErrorString(err));
467
+
468
+ } else {
469
+ FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) req.space);
470
+ }
471
+
472
+ a.erase(it);
473
+ }
474
+
475
// Returns the number of bytes currently free in `device`'s temp-memory
// stack.  The device must already have been initialized (const method, so it
// cannot initialize lazily).
size_t
StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
  FAISS_ASSERT(isInitialized(device));

  auto it = tempMemory_.find(device);
  FAISS_ASSERT(it != tempMemory_.end());

  return it->second->getSizeAvailable();
}
484
+
485
+ std::map<int, std::map<std::string, std::pair<int, size_t>>>
486
+ StandardGpuResourcesImpl::getMemoryInfo() const {
487
+ using AT = std::map<std::string, std::pair<int, size_t>>;
488
+
489
+ std::map<int, AT> out;
490
+
491
+ for (auto& entry : allocs_) {
492
+ AT outDevice;
493
+
494
+ for (auto& a : entry.second) {
495
+ auto& v = outDevice[allocTypeToString(a.second.type)];
496
+ v.first++;
497
+ v.second += a.second.size;
498
+ }
499
+
500
+ out[entry.first] = std::move(outDevice);
501
+ }
502
+
503
+ return out;
504
+ }
505
+
506
+ //
507
+ // StandardGpuResources
508
+ //
509
+
510
// StandardGpuResources is a thin, shared-ownership facade over
// StandardGpuResourcesImpl: it holds the impl in a shared_ptr (res_) and
// every method below simply delegates to it.
StandardGpuResources::StandardGpuResources()
  : res_(new StandardGpuResourcesImpl) {
}

StandardGpuResources::~StandardGpuResources() {
}

// Exposes the underlying GpuResources object for sharing between indexes.
std::shared_ptr<GpuResources>
StandardGpuResources::getResources() {
  return res_;
}

void
StandardGpuResources::noTempMemory() {
  res_->noTempMemory();
}

void
StandardGpuResources::setTempMemory(size_t size) {
  res_->setTempMemory(size);
}

void
StandardGpuResources::setPinnedMemory(size_t size) {
  res_->setPinnedMemory(size);
}

void
StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
  res_->setDefaultStream(device, stream);
}

void
StandardGpuResources::setDefaultNullStreamAllDevices() {
  res_->setDefaultNullStreamAllDevices();
}

std::map<int, std::map<std::string, std::pair<int, size_t>>>
StandardGpuResources::getMemoryInfo() const {
  return res_->getMemoryInfo();
}

cudaStream_t
StandardGpuResources::getDefaultStream(int device) {
  return res_->getDefaultStream(device);
}

size_t
StandardGpuResources::getTempMemoryAvailable(int device) const {
  return res_->getTempMemoryAvailable(device);
}

void
StandardGpuResources::syncDefaultStreamCurrentDevice() {
  res_->syncDefaultStreamCurrentDevice();
}

void
StandardGpuResources::setLogMemoryAllocations(bool enable) {
  res_->setLogMemoryAllocations(enable);
}
571
+
572
+ } } // namespace