faiss 0.1.2 → 0.1.3

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (192)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/faiss/extconf.rb +1 -1
  4. data/lib/faiss/version.rb +1 -1
  5. data/vendor/faiss/benchs/bench_6bit_codec.cpp +80 -0
  6. data/vendor/faiss/c_api/AutoTune_c.h +2 -0
  7. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -6
  8. data/vendor/faiss/c_api/IndexShards_c.h +1 -4
  9. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +4 -2
  10. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +1 -1
  11. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +1 -1
  12. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +1 -1
  13. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +1 -1
  14. data/vendor/faiss/demos/demo_imi_flat.cpp +5 -2
  15. data/vendor/faiss/demos/demo_imi_pq.cpp +6 -2
  16. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +7 -2
  17. data/vendor/faiss/{AutoTune.cpp → faiss/AutoTune.cpp} +9 -9
  18. data/vendor/faiss/{AutoTune.h → faiss/AutoTune.h} +0 -0
  19. data/vendor/faiss/{Clustering.cpp → faiss/Clustering.cpp} +13 -12
  20. data/vendor/faiss/{Clustering.h → faiss/Clustering.h} +0 -0
  21. data/vendor/faiss/{DirectMap.cpp → faiss/DirectMap.cpp} +0 -0
  22. data/vendor/faiss/{DirectMap.h → faiss/DirectMap.h} +0 -0
  23. data/vendor/faiss/{IVFlib.cpp → faiss/IVFlib.cpp} +86 -11
  24. data/vendor/faiss/{IVFlib.h → faiss/IVFlib.h} +26 -8
  25. data/vendor/faiss/{Index.cpp → faiss/Index.cpp} +0 -0
  26. data/vendor/faiss/{Index.h → faiss/Index.h} +1 -1
  27. data/vendor/faiss/{Index2Layer.cpp → faiss/Index2Layer.cpp} +12 -11
  28. data/vendor/faiss/{Index2Layer.h → faiss/Index2Layer.h} +0 -0
  29. data/vendor/faiss/{IndexBinary.cpp → faiss/IndexBinary.cpp} +2 -1
  30. data/vendor/faiss/{IndexBinary.h → faiss/IndexBinary.h} +0 -0
  31. data/vendor/faiss/{IndexBinaryFlat.cpp → faiss/IndexBinaryFlat.cpp} +0 -0
  32. data/vendor/faiss/{IndexBinaryFlat.h → faiss/IndexBinaryFlat.h} +0 -0
  33. data/vendor/faiss/{IndexBinaryFromFloat.cpp → faiss/IndexBinaryFromFloat.cpp} +1 -0
  34. data/vendor/faiss/{IndexBinaryFromFloat.h → faiss/IndexBinaryFromFloat.h} +0 -0
  35. data/vendor/faiss/{IndexBinaryHNSW.cpp → faiss/IndexBinaryHNSW.cpp} +1 -2
  36. data/vendor/faiss/{IndexBinaryHNSW.h → faiss/IndexBinaryHNSW.h} +0 -0
  37. data/vendor/faiss/{IndexBinaryHash.cpp → faiss/IndexBinaryHash.cpp} +16 -7
  38. data/vendor/faiss/{IndexBinaryHash.h → faiss/IndexBinaryHash.h} +2 -1
  39. data/vendor/faiss/{IndexBinaryIVF.cpp → faiss/IndexBinaryIVF.cpp} +10 -16
  40. data/vendor/faiss/{IndexBinaryIVF.h → faiss/IndexBinaryIVF.h} +1 -1
  41. data/vendor/faiss/{IndexFlat.cpp → faiss/IndexFlat.cpp} +0 -0
  42. data/vendor/faiss/{IndexFlat.h → faiss/IndexFlat.h} +0 -0
  43. data/vendor/faiss/{IndexHNSW.cpp → faiss/IndexHNSW.cpp} +63 -32
  44. data/vendor/faiss/{IndexHNSW.h → faiss/IndexHNSW.h} +0 -0
  45. data/vendor/faiss/{IndexIVF.cpp → faiss/IndexIVF.cpp} +129 -46
  46. data/vendor/faiss/{IndexIVF.h → faiss/IndexIVF.h} +7 -3
  47. data/vendor/faiss/{IndexIVFFlat.cpp → faiss/IndexIVFFlat.cpp} +6 -5
  48. data/vendor/faiss/{IndexIVFFlat.h → faiss/IndexIVFFlat.h} +0 -0
  49. data/vendor/faiss/{IndexIVFPQ.cpp → faiss/IndexIVFPQ.cpp} +9 -8
  50. data/vendor/faiss/{IndexIVFPQ.h → faiss/IndexIVFPQ.h} +4 -2
  51. data/vendor/faiss/{IndexIVFPQR.cpp → faiss/IndexIVFPQR.cpp} +3 -1
  52. data/vendor/faiss/{IndexIVFPQR.h → faiss/IndexIVFPQR.h} +0 -0
  53. data/vendor/faiss/{IndexIVFSpectralHash.cpp → faiss/IndexIVFSpectralHash.cpp} +1 -1
  54. data/vendor/faiss/{IndexIVFSpectralHash.h → faiss/IndexIVFSpectralHash.h} +0 -0
  55. data/vendor/faiss/{IndexLSH.cpp → faiss/IndexLSH.cpp} +0 -0
  56. data/vendor/faiss/{IndexLSH.h → faiss/IndexLSH.h} +0 -0
  57. data/vendor/faiss/{IndexLattice.cpp → faiss/IndexLattice.cpp} +0 -0
  58. data/vendor/faiss/{IndexLattice.h → faiss/IndexLattice.h} +0 -0
  59. data/vendor/faiss/{IndexPQ.cpp → faiss/IndexPQ.cpp} +6 -6
  60. data/vendor/faiss/{IndexPQ.h → faiss/IndexPQ.h} +3 -1
  61. data/vendor/faiss/{IndexPreTransform.cpp → faiss/IndexPreTransform.cpp} +0 -0
  62. data/vendor/faiss/{IndexPreTransform.h → faiss/IndexPreTransform.h} +0 -0
  63. data/vendor/faiss/{IndexReplicas.cpp → faiss/IndexReplicas.cpp} +102 -10
  64. data/vendor/faiss/{IndexReplicas.h → faiss/IndexReplicas.h} +6 -0
  65. data/vendor/faiss/{IndexScalarQuantizer.cpp → faiss/IndexScalarQuantizer.cpp} +3 -3
  66. data/vendor/faiss/{IndexScalarQuantizer.h → faiss/IndexScalarQuantizer.h} +0 -0
  67. data/vendor/faiss/{IndexShards.cpp → faiss/IndexShards.cpp} +37 -12
  68. data/vendor/faiss/{IndexShards.h → faiss/IndexShards.h} +3 -4
  69. data/vendor/faiss/{InvertedLists.cpp → faiss/InvertedLists.cpp} +2 -2
  70. data/vendor/faiss/{InvertedLists.h → faiss/InvertedLists.h} +1 -0
  71. data/vendor/faiss/{MatrixStats.cpp → faiss/MatrixStats.cpp} +0 -0
  72. data/vendor/faiss/{MatrixStats.h → faiss/MatrixStats.h} +0 -0
  73. data/vendor/faiss/{MetaIndexes.cpp → faiss/MetaIndexes.cpp} +5 -3
  74. data/vendor/faiss/{MetaIndexes.h → faiss/MetaIndexes.h} +0 -0
  75. data/vendor/faiss/{MetricType.h → faiss/MetricType.h} +0 -0
  76. data/vendor/faiss/{OnDiskInvertedLists.cpp → faiss/OnDiskInvertedLists.cpp} +141 -3
  77. data/vendor/faiss/{OnDiskInvertedLists.h → faiss/OnDiskInvertedLists.h} +27 -7
  78. data/vendor/faiss/{VectorTransform.cpp → faiss/VectorTransform.cpp} +4 -3
  79. data/vendor/faiss/{VectorTransform.h → faiss/VectorTransform.h} +0 -0
  80. data/vendor/faiss/{clone_index.cpp → faiss/clone_index.cpp} +0 -0
  81. data/vendor/faiss/{clone_index.h → faiss/clone_index.h} +0 -0
  82. data/vendor/faiss/{gpu → faiss/gpu}/GpuAutoTune.cpp +0 -0
  83. data/vendor/faiss/{gpu → faiss/gpu}/GpuAutoTune.h +0 -0
  84. data/vendor/faiss/{gpu → faiss/gpu}/GpuCloner.cpp +14 -14
  85. data/vendor/faiss/{gpu → faiss/gpu}/GpuCloner.h +6 -7
  86. data/vendor/faiss/{gpu → faiss/gpu}/GpuClonerOptions.cpp +0 -0
  87. data/vendor/faiss/{gpu → faiss/gpu}/GpuClonerOptions.h +0 -0
  88. data/vendor/faiss/{gpu → faiss/gpu}/GpuDistance.h +12 -4
  89. data/vendor/faiss/{gpu → faiss/gpu}/GpuFaissAssert.h +0 -0
  90. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndex.h +3 -9
  91. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexBinaryFlat.h +7 -7
  92. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexFlat.h +35 -10
  93. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexIVF.h +1 -2
  94. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexIVFFlat.h +4 -3
  95. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexIVFPQ.h +21 -4
  96. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndexIVFScalarQuantizer.h +4 -3
  97. data/vendor/faiss/{gpu → faiss/gpu}/GpuIndicesOptions.h +0 -0
  98. data/vendor/faiss/faiss/gpu/GpuResources.cpp +200 -0
  99. data/vendor/faiss/faiss/gpu/GpuResources.h +264 -0
  100. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +572 -0
  101. data/vendor/faiss/{gpu → faiss/gpu}/StandardGpuResources.h +83 -15
  102. data/vendor/faiss/{gpu → faiss/gpu}/impl/RemapIndices.cpp +0 -0
  103. data/vendor/faiss/{gpu → faiss/gpu}/impl/RemapIndices.h +0 -0
  104. data/vendor/faiss/{gpu → faiss/gpu}/perf/IndexWrapper-inl.h +1 -1
  105. data/vendor/faiss/{gpu → faiss/gpu}/perf/IndexWrapper.h +1 -1
  106. data/vendor/faiss/{gpu → faiss/gpu}/perf/PerfClustering.cpp +1 -1
  107. data/vendor/faiss/{gpu → faiss/gpu}/perf/PerfIVFPQAdd.cpp +0 -0
  108. data/vendor/faiss/{gpu → faiss/gpu}/perf/WriteIndex.cpp +0 -0
  109. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexBinaryFlat.cpp +0 -0
  110. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexFlat.cpp +1 -1
  111. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexIVFFlat.cpp +0 -0
  112. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexIVFPQ.cpp +141 -52
  113. data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuMemoryException.cpp +0 -0
  114. data/vendor/faiss/{gpu → faiss/gpu}/test/TestUtils.cpp +4 -2
  115. data/vendor/faiss/{gpu → faiss/gpu}/test/TestUtils.h +0 -0
  116. data/vendor/faiss/{gpu → faiss/gpu}/test/demo_ivfpq_indexing_gpu.cpp +7 -5
  117. data/vendor/faiss/{gpu → faiss/gpu}/utils/DeviceUtils.h +1 -1
  118. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +213 -0
  119. data/vendor/faiss/{gpu → faiss/gpu}/utils/StackDeviceMemory.h +25 -40
  120. data/vendor/faiss/{gpu → faiss/gpu}/utils/StaticUtils.h +0 -0
  121. data/vendor/faiss/{gpu → faiss/gpu}/utils/Timer.cpp +0 -0
  122. data/vendor/faiss/{gpu → faiss/gpu}/utils/Timer.h +0 -0
  123. data/vendor/faiss/{impl → faiss/impl}/AuxIndexStructures.cpp +1 -0
  124. data/vendor/faiss/{impl → faiss/impl}/AuxIndexStructures.h +3 -1
  125. data/vendor/faiss/{impl → faiss/impl}/FaissAssert.h +1 -0
  126. data/vendor/faiss/{impl → faiss/impl}/FaissException.cpp +26 -0
  127. data/vendor/faiss/{impl → faiss/impl}/FaissException.h +4 -0
  128. data/vendor/faiss/{impl → faiss/impl}/HNSW.cpp +26 -26
  129. data/vendor/faiss/{impl → faiss/impl}/HNSW.h +19 -11
  130. data/vendor/faiss/{impl → faiss/impl}/PolysemousTraining.cpp +1 -1
  131. data/vendor/faiss/{impl → faiss/impl}/PolysemousTraining.h +1 -1
  132. data/vendor/faiss/{impl → faiss/impl}/ProductQuantizer-inl.h +0 -1
  133. data/vendor/faiss/{impl → faiss/impl}/ProductQuantizer.cpp +9 -9
  134. data/vendor/faiss/{impl → faiss/impl}/ProductQuantizer.h +0 -0
  135. data/vendor/faiss/{impl → faiss/impl}/ScalarQuantizer.cpp +63 -39
  136. data/vendor/faiss/{impl → faiss/impl}/ScalarQuantizer.h +1 -1
  137. data/vendor/faiss/{impl → faiss/impl}/ThreadedIndex-inl.h +0 -0
  138. data/vendor/faiss/{impl → faiss/impl}/ThreadedIndex.h +0 -0
  139. data/vendor/faiss/{impl → faiss/impl}/index_read.cpp +99 -116
  140. data/vendor/faiss/{impl → faiss/impl}/index_write.cpp +15 -50
  141. data/vendor/faiss/{impl → faiss/impl}/io.cpp +15 -10
  142. data/vendor/faiss/{impl → faiss/impl}/io.h +22 -8
  143. data/vendor/faiss/faiss/impl/io_macros.h +57 -0
  144. data/vendor/faiss/{impl → faiss/impl}/lattice_Zn.cpp +52 -36
  145. data/vendor/faiss/{impl → faiss/impl}/lattice_Zn.h +3 -3
  146. data/vendor/faiss/faiss/impl/platform_macros.h +24 -0
  147. data/vendor/faiss/{index_factory.cpp → faiss/index_factory.cpp} +33 -12
  148. data/vendor/faiss/{index_factory.h → faiss/index_factory.h} +0 -0
  149. data/vendor/faiss/{index_io.h → faiss/index_io.h} +55 -1
  150. data/vendor/faiss/faiss/python/python_callbacks.cpp +112 -0
  151. data/vendor/faiss/faiss/python/python_callbacks.h +45 -0
  152. data/vendor/faiss/{utils → faiss/utils}/Heap.cpp +5 -5
  153. data/vendor/faiss/{utils → faiss/utils}/Heap.h +1 -3
  154. data/vendor/faiss/{utils → faiss/utils}/WorkerThread.cpp +0 -0
  155. data/vendor/faiss/{utils → faiss/utils}/WorkerThread.h +0 -0
  156. data/vendor/faiss/{utils → faiss/utils}/distances.cpp +28 -13
  157. data/vendor/faiss/{utils → faiss/utils}/distances.h +2 -1
  158. data/vendor/faiss/{utils → faiss/utils}/distances_simd.cpp +5 -5
  159. data/vendor/faiss/{utils → faiss/utils}/extra_distances.cpp +8 -7
  160. data/vendor/faiss/{utils → faiss/utils}/extra_distances.h +0 -0
  161. data/vendor/faiss/{utils → faiss/utils}/hamming-inl.h +1 -3
  162. data/vendor/faiss/{utils → faiss/utils}/hamming.cpp +8 -7
  163. data/vendor/faiss/{utils → faiss/utils}/hamming.h +7 -1
  164. data/vendor/faiss/{utils → faiss/utils}/random.cpp +5 -5
  165. data/vendor/faiss/{utils → faiss/utils}/random.h +0 -0
  166. data/vendor/faiss/{utils → faiss/utils}/utils.cpp +27 -28
  167. data/vendor/faiss/{utils → faiss/utils}/utils.h +4 -0
  168. data/vendor/faiss/misc/test_blas.cpp +4 -1
  169. data/vendor/faiss/tests/test_binary_flat.cpp +0 -2
  170. data/vendor/faiss/tests/test_dealloc_invlists.cpp +6 -1
  171. data/vendor/faiss/tests/test_ivfpq_codec.cpp +4 -1
  172. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +6 -4
  173. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +12 -5
  174. data/vendor/faiss/tests/test_merge.cpp +6 -3
  175. data/vendor/faiss/tests/test_ondisk_ivf.cpp +7 -2
  176. data/vendor/faiss/tests/test_pairs_decoding.cpp +5 -1
  177. data/vendor/faiss/tests/test_params_override.cpp +7 -2
  178. data/vendor/faiss/tests/test_sliding_ivf.cpp +10 -4
  179. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +14 -8
  180. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +11 -7
  181. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +12 -7
  182. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +6 -3
  183. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +7 -3
  184. metadata +154 -153
  185. data/vendor/faiss/gpu/GpuResources.cpp +0 -52
  186. data/vendor/faiss/gpu/GpuResources.h +0 -73
  187. data/vendor/faiss/gpu/StandardGpuResources.cpp +0 -303
  188. data/vendor/faiss/gpu/utils/DeviceMemory.cpp +0 -77
  189. data/vendor/faiss/gpu/utils/DeviceMemory.h +0 -71
  190. data/vendor/faiss/gpu/utils/MemorySpace.cpp +0 -89
  191. data/vendor/faiss/gpu/utils/MemorySpace.h +0 -44
  192. data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +0 -239
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp
@@ -0,0 +1,572 @@
+ /**
+  * Copyright (c) Facebook, Inc. and its affiliates.
+  *
+  * This source code is licensed under the MIT license found in the
+  * LICENSE file in the root directory of this source tree.
+  */
+
+
+ #include <faiss/gpu/StandardGpuResources.h>
+ #include <faiss/gpu/utils/DeviceUtils.h>
+ #include <faiss/gpu/utils/StaticUtils.h>
+ #include <faiss/impl/FaissAssert.h>
+ #include <limits>
+ #include <iostream>
+ #include <sstream>
+
+ namespace faiss { namespace gpu {
+
+ namespace {
+
+ // How many streams per device we allocate by default (for multi-streaming)
+ constexpr int kNumStreams = 2;
+
+ // Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
+ constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
+
+ // Default temporary memory allocation for <= 4 GiB memory GPUs
+ constexpr size_t k4GiBTempMem = (size_t) 512 * 1024 * 1024;
+
+ // Default temporary memory allocation for <= 8 GiB memory GPUs
+ constexpr size_t k8GiBTempMem = (size_t) 1024 * 1024 * 1024;
+
+ // Maximum temporary memory allocation for all GPUs
+ constexpr size_t kMaxTempMem = (size_t) 1536 * 1024 * 1024;
+
+ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
+   // Produce a sorted list of all outstanding allocations by type
+   std::unordered_map<AllocType, std::pair<int, size_t>> stats;
+
+   for (auto& entry : map) {
+     auto& a = entry.second;
+
+     auto it = stats.find(a.type);
+     if (it != stats.end()) {
+       stats[a.type].first++;
+       stats[a.type].second += a.size;
+     } else {
+       stats[a.type] = std::make_pair(1, a.size);
+     }
+   }
+
+   std::stringstream ss;
+   for (auto& entry : stats) {
+     ss << "Alloc type " << allocTypeToString(entry.first) << ": "
+        << entry.second.first << " allocations, "
+        << entry.second.second << " bytes\n";
+   }
+
+   return ss.str();
+ }
+
+ }
+
+ //
+ // StandardGpuResourcesImpl
+ //
+
+ StandardGpuResourcesImpl::StandardGpuResourcesImpl() :
+     pinnedMemAlloc_(nullptr),
+     pinnedMemAllocSize_(0),
+     // let the adjustment function determine the memory size for us by passing
+     // in a huge value that will then be adjusted
+     tempMemSize_(getDefaultTempMemForGPU(-1,
+                                          std::numeric_limits<size_t>::max())),
+     pinnedMemSize_(kDefaultPinnedMemoryAllocation),
+     allocLogging_(false) {
+ }
+
+ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
+   // The temporary memory allocator has allocated memory through us, so clean
+   // that up before we finish fully de-initializing ourselves
+   tempMemory_.clear();
+
+   // Make sure all allocations have been freed
+   bool allocError = false;
+
+   for (auto& entry : allocs_) {
+     auto& map = entry.second;
+
+     if (!map.empty()) {
+       std::cerr
+         << "StandardGpuResources destroyed with allocations outstanding:\n"
+         << "Device " << entry.first << " outstanding allocations:\n";
+       std::cerr << allocsToString(map);
+       allocError = true;
+     }
+   }
+
+   FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up");
+
+   for (auto& entry : defaultStreams_) {
+     DeviceScope scope(entry.first);
+
+     auto it = userDefaultStreams_.find(entry.first);
+     if (it == userDefaultStreams_.end()) {
+       // The user did not specify this stream, thus we are the ones
+       // who have created it
+       CUDA_VERIFY(cudaStreamDestroy(entry.second));
+     }
+   }
+
+   for (auto& entry : alternateStreams_) {
+     DeviceScope scope(entry.first);
+
+     for (auto stream : entry.second) {
+       CUDA_VERIFY(cudaStreamDestroy(stream));
+     }
+   }
+
+   for (auto& entry : asyncCopyStreams_) {
+     DeviceScope scope(entry.first);
+
+     CUDA_VERIFY(cudaStreamDestroy(entry.second));
+   }
+
+   for (auto& entry : blasHandles_) {
+     DeviceScope scope(entry.first);
+
+     auto blasStatus = cublasDestroy(entry.second);
+     FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+   }
+
+   if (pinnedMemAlloc_) {
+     auto err = cudaFreeHost(pinnedMemAlloc_);
+     FAISS_ASSERT_FMT(err == cudaSuccess,
+                      "Failed to cudaFreeHost pointer %p (error %d %s)",
+                      pinnedMemAlloc_, (int) err, cudaGetErrorString(err));
+   }
+ }
+
+ size_t
+ StandardGpuResourcesImpl::getDefaultTempMemForGPU(int device,
+                                                   size_t requested) {
+   auto totalMem = device != -1 ?
+     getDeviceProperties(device).totalGlobalMem :
+     std::numeric_limits<size_t>::max();
+
+   if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) {
+     // If the GPU has <= 4 GiB of memory, reserve 512 MiB
+
+     if (requested > k4GiBTempMem) {
+       return k4GiBTempMem;
+     }
+   } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) {
+     // If the GPU has <= 8 GiB of memory, reserve 1 GiB
+
+     if (requested > k8GiBTempMem) {
+       return k8GiBTempMem;
+     }
+   } else {
+     // Never use more than 1.5 GiB
+     if (requested > kMaxTempMem) {
+       return kMaxTempMem;
+     }
+   }
+
+   // use whatever lower limit the user requested
+   return requested;
+ }
+
+ void
+ StandardGpuResourcesImpl::noTempMemory() {
+   setTempMemory(0);
+ }
+
+ void
+ StandardGpuResourcesImpl::setTempMemory(size_t size) {
+   if (tempMemSize_ != size) {
+     // adjust based on general limits
+     tempMemSize_ = getDefaultTempMemForGPU(-1, size);
+
+     // We need to re-initialize memory resources for all current devices that
+     // have been initialized.
+     // This should be safe to do, even if we are currently running work, because
+     // the cudaFree call that this implies will force-synchronize all GPUs with
+     // the CPU
+     for (auto& p : tempMemory_) {
+       int device = p.first;
+       // Free the existing memory first
+       p.second.reset();
+
+       // Allocate new
+       p.second = std::unique_ptr<StackDeviceMemory>(
+         new StackDeviceMemory(this,
+                               p.first,
+                               // adjust for this specific device
+                               getDefaultTempMemForGPU(device, tempMemSize_)));
+     }
+   }
+ }
+
+ void
+ StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
+   // Should not call this after devices have been initialized
+   FAISS_ASSERT(defaultStreams_.size() == 0);
+   FAISS_ASSERT(!pinnedMemAlloc_);
+
+   pinnedMemSize_ = size;
+ }
+
+ void
+ StandardGpuResourcesImpl::setDefaultStream(int device, cudaStream_t stream) {
+   auto it = defaultStreams_.find(device);
+   if (it != defaultStreams_.end()) {
+     // Replace this stream with the user stream
+     CUDA_VERIFY(cudaStreamDestroy(it->second));
+     it->second = stream;
+   }
+
+   userDefaultStreams_[device] = stream;
+ }
+
+ void
+ StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
+   for (int dev = 0; dev < getNumDevices(); ++dev) {
+     setDefaultStream(dev, nullptr);
+   }
+ }
+
+ void
+ StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
+   allocLogging_ = enable;
+ }
+
+ bool
+ StandardGpuResourcesImpl::isInitialized(int device) const {
+   // Use default streams as a marker for whether or not a certain
+   // device has been initialized
+   return defaultStreams_.count(device) != 0;
+ }
+
+ void
+ StandardGpuResourcesImpl::initializeForDevice(int device) {
+   if (isInitialized(device)) {
+     return;
+   }
+
+   // If this is the first device that we're initializing, create our
+   // pinned memory allocation
+   if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+     auto err =
+       cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+
+     FAISS_THROW_IF_NOT_FMT(
+       err == cudaSuccess,
+       "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
+       "async copy buffer (error %d %s)",
+       pinnedMemSize_, (int) err, cudaGetErrorString(err));
+
+     pinnedMemAllocSize_ = pinnedMemSize_;
+   }
+
+   FAISS_ASSERT(device < getNumDevices());
+   DeviceScope scope(device);
+
+   // Make sure that device properties for all devices are cached
+   auto& prop = getDeviceProperties(device);
+
+   // Also check to make sure we meet our minimum compute capability (3.0)
+   FAISS_ASSERT_FMT(prop.major >= 3,
+                    "Device id %d with CC %d.%d not supported, "
+                    "need 3.0+ compute capability",
+                    device, prop.major, prop.minor);
+
+   // Create streams
+   cudaStream_t defaultStream = 0;
+   auto it = userDefaultStreams_.find(device);
+   if (it != userDefaultStreams_.end()) {
+     // We already have a stream provided by the user
+     defaultStream = it->second;
+   } else {
+     CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
+                                           cudaStreamNonBlocking));
+   }
+
+   defaultStreams_[device] = defaultStream;
+
+   cudaStream_t asyncCopyStream = 0;
+   CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
+                                         cudaStreamNonBlocking));
+
+   asyncCopyStreams_[device] = asyncCopyStream;
+
+   std::vector<cudaStream_t> deviceStreams;
+   for (int j = 0; j < kNumStreams; ++j) {
+     cudaStream_t stream = 0;
+     CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
+                                           cudaStreamNonBlocking));
+
+     deviceStreams.push_back(stream);
+   }
+
+   alternateStreams_[device] = std::move(deviceStreams);
+
+   // Create cuBLAS handle
+   cublasHandle_t blasHandle = 0;
+   auto blasStatus = cublasCreate(&blasHandle);
+   FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+   blasHandles_[device] = blasHandle;
+
+   // Enable tensor core support if available
+ #if CUDA_VERSION >= 9000 && CUDA_VERSION < 11000
+   // This flag was deprecated in CUDA 11
+   if (getTensorCoreSupport(device)) {
+     cublasSetMathMode(blasHandle, CUBLAS_TENSOR_OP_MATH);
+   }
+ #endif
+ #if CUDA_VERSION >= 11000
+   cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+ #endif
+
+   FAISS_ASSERT(allocs_.count(device) == 0);
+   allocs_[device] = std::unordered_map<void*, AllocRequest>();
+
+   FAISS_ASSERT(tempMemory_.count(device) == 0);
+   auto mem = std::unique_ptr<StackDeviceMemory>(
+     new StackDeviceMemory(this,
+                           device,
+                           // adjust for this specific device
+                           getDefaultTempMemForGPU(device, tempMemSize_)));
+
+   tempMemory_.emplace(device, std::move(mem));
+ }
+
+ cublasHandle_t
+ StandardGpuResourcesImpl::getBlasHandle(int device) {
+   initializeForDevice(device);
+   return blasHandles_[device];
+ }
+
+ cudaStream_t
+ StandardGpuResourcesImpl::getDefaultStream(int device) {
+   initializeForDevice(device);
+   return defaultStreams_[device];
+ }
+
+ std::vector<cudaStream_t>
+ StandardGpuResourcesImpl::getAlternateStreams(int device) {
+   initializeForDevice(device);
+   return alternateStreams_[device];
+ }
+
+ std::pair<void*, size_t>
+ StandardGpuResourcesImpl::getPinnedMemory() {
+   return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
+ }
+
+ cudaStream_t
+ StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
+   initializeForDevice(device);
+   return asyncCopyStreams_[device];
+ }
+
+ void*
+ StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
+   initializeForDevice(req.device);
+
+   // We don't allocate a placeholder for zero-sized allocations
+   if (req.size == 0) {
+     return nullptr;
+   }
+
+   // Make sure that the allocation is a multiple of 16 bytes for alignment
+   // purposes
+   auto adjReq = req;
+   adjReq.size = utils::roundUp(adjReq.size, (size_t) 16);
+
+   void* p = nullptr;
+
+   if (allocLogging_) {
+     std::cout << "StandardGpuResources: alloc " << adjReq.toString() << "\n";
+   }
+
+   if (adjReq.space == MemorySpace::Temporary) {
+     // If we don't have enough space in our temporary memory manager, we need
+     // to allocate this request separately
+     auto& tempMem = tempMemory_[adjReq.device];
+
+     if (adjReq.size > tempMem->getSizeAvailable()) {
+       // We need to allocate this ourselves
+       AllocRequest newReq = adjReq;
+       newReq.space = MemorySpace::Device;
+       newReq.type = AllocType::TemporaryMemoryOverflow;
+
+       return allocMemory(newReq);
+     }
+
+     // Otherwise, we can handle this locally
+     p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+   } else if (adjReq.space == MemorySpace::Device) {
+     auto err = cudaMalloc(&p, adjReq.size);
+
+     // Throw if we fail to allocate
+     if (err != cudaSuccess) {
+       auto& map = allocs_[req.device];
+
+       std::stringstream ss;
+       ss << "Failed to cudaMalloc " << adjReq.size << " bytes "
+          << "on device " << adjReq.device << " (error "
+          << (int) err << " " << cudaGetErrorString(err)
+          << "\nOutstanding allocations:\n" << allocsToString(map);
+       auto str = ss.str();
+
+       FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+     }
+   } else if (adjReq.space == MemorySpace::Unified) {
+     auto err = cudaMallocManaged(&p, adjReq.size);
+
+     if (err != cudaSuccess) {
+       auto& map = allocs_[req.device];
+
+       std::stringstream ss;
+       ss << "Failed to cudaMallocManaged " << adjReq.size << " bytes "
+          << "(error " << (int) err << " " << cudaGetErrorString(err)
+          << "\nOutstanding allocations:\n" << allocsToString(map);
+       auto str = ss.str();
+
+       FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+     }
+   } else {
+     FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) adjReq.space);
+   }
+
+   allocs_[adjReq.device][p] = adjReq;
+
+   return p;
+ }
+
+ void
+ StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
+   FAISS_ASSERT(isInitialized(device));
+
+   if (!p) {
+     return;
+   }
+
+   auto& a = allocs_[device];
+   auto it = a.find(p);
+   FAISS_ASSERT(it != a.end());
+
+   auto& req = it->second;
+
+   if (allocLogging_) {
+     std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
+   }
+
+   if (req.space == MemorySpace::Temporary) {
+     tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
+
+   } else if (req.space == MemorySpace::Device ||
+              req.space == MemorySpace::Unified) {
+     auto err = cudaFree(p);
+     FAISS_ASSERT_FMT(err == cudaSuccess,
+                      "Failed to cudaFree pointer %p (error %d %s)",
+                      p, (int) err, cudaGetErrorString(err));
+
+   } else {
+     FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) req.space);
+   }
+
+   a.erase(it);
+ }
+
+ size_t
+ StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
+   FAISS_ASSERT(isInitialized(device));
+
+   auto it = tempMemory_.find(device);
+   FAISS_ASSERT(it != tempMemory_.end());
+
+   return it->second->getSizeAvailable();
+ }
+
+ std::map<int, std::map<std::string, std::pair<int, size_t>>>
+ StandardGpuResourcesImpl::getMemoryInfo() const {
+   using AT = std::map<std::string, std::pair<int, size_t>>;
+
+   std::map<int, AT> out;
+
+   for (auto& entry : allocs_) {
+     AT outDevice;
+
+     for (auto& a : entry.second) {
+       auto& v = outDevice[allocTypeToString(a.second.type)];
+       v.first++;
+       v.second += a.second.size;
+     }
+
+     out[entry.first] = std::move(outDevice);
+   }
+
+   return out;
+ }
+
+ //
+ // StandardGpuResources
+ //
+
+ StandardGpuResources::StandardGpuResources()
+   : res_(new StandardGpuResourcesImpl) {
+ }
+
+ StandardGpuResources::~StandardGpuResources() {
+ }
+
+ std::shared_ptr<GpuResources>
+ StandardGpuResources::getResources() {
+   return res_;
+ }
+
+ void
+ StandardGpuResources::noTempMemory() {
+   res_->noTempMemory();
+ }
+
+ void
+ StandardGpuResources::setTempMemory(size_t size) {
+   res_->setTempMemory(size);
+ }
+
+ void
+ StandardGpuResources::setPinnedMemory(size_t size) {
+   res_->setPinnedMemory(size);
+ }
+
+ void
+ StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
+   res_->setDefaultStream(device, stream);
+ }
+
+ void
+ StandardGpuResources::setDefaultNullStreamAllDevices() {
+   res_->setDefaultNullStreamAllDevices();
+ }
+
+ std::map<int, std::map<std::string, std::pair<int, size_t>>>
+ StandardGpuResources::getMemoryInfo() const {
+   return res_->getMemoryInfo();
+ }
+
+ cudaStream_t
+ StandardGpuResources::getDefaultStream(int device) {
+   return res_->getDefaultStream(device);
+ }
+
+ size_t
+ StandardGpuResources::getTempMemoryAvailable(int device) const {
+   return res_->getTempMemoryAvailable(device);
+ }
+
+ void
+ StandardGpuResources::syncDefaultStreamCurrentDevice() {
+   res_->syncDefaultStreamCurrentDevice();
+ }
+
+ void
+ StandardGpuResources::setLogMemoryAllocations(bool enable) {
+   res_->setLogMemoryAllocations(enable);
+ }
+
+ } } // namespace
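
The rewritten StandardGpuResources above is the resource object that the GPU indexes in this release consume. For orientation, a minimal usage sketch, not part of the diff: the dimensions, database size, and the 256 MiB cap below are hypothetical. On a GPU with more than 8 GiB of memory, getDefaultTempMemForGPU() would otherwise cap the scratch allocation at kMaxTempMem (1.5 GiB); setTempMemory() lowers it, and requests that do not fit in the scratch stack fall back to separate TemporaryMemoryOverflow cudaMalloc calls, as allocMemory() shows.

    #include <faiss/gpu/StandardGpuResources.h>
    #include <faiss/gpu/GpuIndexFlat.h>

    #include <random>
    #include <vector>

    int main() {
      int d = 64;      // vector dimensionality (hypothetical)
      int nb = 10000;  // database size (hypothetical)

      // One resources object can back several indexes; each device is
      // initialized lazily via initializeForDevice() on first use.
      faiss::gpu::StandardGpuResources res;

      // Lower the temporary (scratch) allocation to 256 MiB.
      res.setTempMemory((size_t) 256 * 1024 * 1024);

      // Brute-force L2 index on GPU 0.
      faiss::gpu::GpuIndexFlatL2 index(&res, d);

      // Fill the database with random vectors.
      std::vector<float> xb((size_t) d * nb);
      std::mt19937 rng(123);
      std::uniform_real_distribution<float> dist;
      for (auto& x : xb) {
        x = dist(rng);
      }
      index.add(nb, xb.data());

      // Query the first vector against the database.
      int k = 4;
      std::vector<float> distances(k);
      std::vector<faiss::Index::idx_t> labels(k);
      index.search(1, xb.data(), k, distances.data(), labels.data());

      return 0;
    }

res.getMemoryInfo() can then be inspected to see the per-device, per-AllocType allocation counts and byte totals that this version starts tracking.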