RubyGems - faiss - Versions diffs - 0.1.2 → 0.1.3 - Mend

faiss 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (192)

data/vendor/faiss/{gpu → faiss/gpu}/StandardGpuResources.h RENAMED

@@ -11,18 +11,20 @@
 #include <faiss/gpu/GpuResources.h>
 #include <faiss/gpu/utils/StackDeviceMemory.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
+#include <functional>
+#include <map>
 #include <unordered_map>
 #include <vector>
 namespace faiss { namespace gpu {
-/// Default implementation of GpuResources that allocates a cuBLAS
-/// stream and 2 streams for use, as well as temporary memory
-class StandardGpuResources : public GpuResources {
+/// Standard implementation of the GpuResources object that provides for a
+/// temporary memory manager
+class StandardGpuResourcesImpl : public GpuResources {
  public:
-  StandardGpuResources();
+  StandardGpuResourcesImpl();
-  ~StandardGpuResources() override;
+  ~StandardGpuResourcesImpl() override;
   /// Disable allocation of temporary memory; all temporary memory
   /// requests will call cudaMalloc / cudaFree at the point of use
@@ -46,9 +48,9 @@ class StandardGpuResources : public GpuResources {
   /// for all devices
   void setDefaultNullStreamAllDevices();
-  /// Enable or disable the warning about not having enough temporary memory
-  /// when cudaMalloc gets called
-  void setCudaMallocWarning(bool b);
+  /// If enabled, will print every GPU memory allocation and deallocation to
+  /// standard output
+  void setLogMemoryAllocations(bool enable);
  public:
   /// Internal system calls
@@ -62,7 +64,17 @@ class StandardGpuResources : public GpuResources {
   std::vector<cudaStream_t> getAlternateStreams(int device) override;
-  DeviceMemory& getMemoryManager(int device) override;
+  /// Allocate non-temporary GPU memory
+  void* allocMemory(const AllocRequest& req) override;
+  /// Returns a previous allocation
+  void deallocMemory(int device, void* in) override;
+  size_t getTempMemoryAvailable(int device) const override;
+  /// Export a description of memory used for Python
+  std::map<int, std::map<std::string, std::pair<int, size_t>>>
+  getMemoryInfo() const;
   std::pair<void*, size_t> getPinnedMemory() override;
@@ -77,6 +89,13 @@ class StandardGpuResources : public GpuResources {
   static size_t getDefaultTempMemForGPU(int device, size_t requested);
  private:
+  /// Set of currently outstanding memory allocations per device
+  /// device -> (allocated ptr -> alloc request)
+  std::unordered_map<int, std::unordered_map<void*, AllocRequest>> allocs_;
+  /// Temporary memory provider, per each device
+  std::unordered_map<int, std::unique_ptr<StackDeviceMemory>> tempMemory_;
   /// Our default stream that work is ordered on, one per each device
   std::unordered_map<int, cudaStream_t> defaultStreams_;
@@ -85,7 +104,7 @@ class StandardGpuResources : public GpuResources {
   std::unordered_map<int, cudaStream_t> userDefaultStreams_;
   /// Other streams we can use, per each device
-  std::unordered_map<int, std::vector<cudaStream_t> > alternateStreams_;
+  std::unordered_map<int, std::vector<cudaStream_t>> alternateStreams_;
   /// Async copy stream to use for GPU <-> CPU pinned memory copies
   std::unordered_map<int, cudaStream_t> asyncCopyStreams_;
@@ -93,9 +112,6 @@ class StandardGpuResources : public GpuResources {
   /// cuBLAS handle for each device
   std::unordered_map<int, cublasHandle_t> blasHandles_;
-  /// Temporary memory provider, per each device
-  std::unordered_map<int, std::unique_ptr<StackDeviceMemory> > memory_;
   /// Pinned memory allocation for use with this GPU
   void* pinnedMemAlloc_;
   size_t pinnedMemAllocSize_;
@@ -107,8 +123,60 @@ class StandardGpuResources : public GpuResources {
   /// Amount of pinned memory we should allocate
   size_t pinnedMemSize_;
-  /// Whether or not a warning upon cudaMalloc is generated
-  bool cudaMallocWarning_;
+  /// Whether or not we log every GPU memory allocation and deallocation
+  bool allocLogging_;
+};
+/// Default implementation of GpuResources that allocates a cuBLAS
+/// stream and 2 streams for use, as well as temporary memory
+class StandardGpuResources : public GpuResourcesProvider {
+ public:
+  StandardGpuResources();
+  ~StandardGpuResources() override;
+  std::shared_ptr<GpuResources> getResources() override;
+  /// Disable allocation of temporary memory; all temporary memory
+  /// requests will call cudaMalloc / cudaFree at the point of use
+  void noTempMemory();
+  /// Specify that we wish to use a certain fixed size of memory on
+  /// all devices as temporary memory. This is the upper bound for the GPU
+  /// memory that we will reserve. We will never go above 1.5 GiB on any GPU;
+  /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that.
+  /// To avoid any temporary memory allocation, pass 0.
+  void setTempMemory(size_t size);
+  /// Set amount of pinned memory to allocate, for async GPU <-> CPU
+  /// transfers
+  void setPinnedMemory(size_t size);
+  /// Called to change the stream for work ordering
+  void setDefaultStream(int device, cudaStream_t stream);
+  /// Called to change the work ordering streams to the null stream
+  /// for all devices
+  void setDefaultNullStreamAllDevices();
+  /// Export a description of memory used for Python
+  std::map<int, std::map<std::string, std::pair<int, size_t>>>
+  getMemoryInfo() const;
+  /// Returns the current default stream
+  cudaStream_t getDefaultStream(int device);
+  /// Returns the current amount of temp memory available
+  size_t getTempMemoryAvailable(int device) const;
+  /// Synchronize our default stream with the CPU
+  void syncDefaultStreamCurrentDevice();
+  /// If enabled, will print every GPU memory allocation and deallocation to
+  /// standard output
+  void setLogMemoryAllocations(bool enable);
+ private:
+  std::shared_ptr<StandardGpuResourcesImpl> res_;
 };
 } } // namespace

data/vendor/faiss/{gpu → faiss/gpu}/impl/RemapIndices.cpp RENAMED

File without changes

data/vendor/faiss/{gpu → faiss/gpu}/impl/RemapIndices.h RENAMED

File without changes

data/vendor/faiss/{gpu → faiss/gpu}/perf/IndexWrapper-inl.h RENAMED

@@ -13,7 +13,7 @@ namespace faiss { namespace gpu {
 template <typename GpuIndex>
 IndexWrapper<GpuIndex>::IndexWrapper(
   int numGpus,
-  std::function<std::unique_ptr<GpuIndex>(GpuResources*, int)> init) {
+  std::function<std::unique_ptr<GpuIndex>(GpuResourcesProvider*, int)> init) {
   FAISS_ASSERT(numGpus <= faiss::gpu::getNumDevices());
   for (int i = 0; i < numGpus; ++i) {
     auto res = std::unique_ptr<faiss::gpu::StandardGpuResources>(

data/vendor/faiss/{gpu → faiss/gpu}/perf/IndexWrapper.h RENAMED

@@ -27,7 +27,7 @@ struct IndexWrapper {
   IndexWrapper(
     int numGpus,
-    std::function<std::unique_ptr<GpuIndex>(GpuResources*, int)> init);
+    std::function<std::unique_ptr<GpuIndex>(GpuResourcesProvider*, int)> init);
   faiss::Index* getIndex();
   void runOnIndices(std::function<void(GpuIndex*)> f);

data/vendor/faiss/{gpu → faiss/gpu}/perf/PerfClustering.cpp RENAMED

@@ -53,7 +53,7 @@ int main(int argc, char** argv) {
   printf("transposed storage %s\n", FLAGS_transposed ? "enabled" : "disabled");
   printf("verbose %s\n", FLAGS_verbose ? "enabled" : "disabled");
-  auto initFn = [](faiss::gpu::GpuResources* res, int dev) ->
+  auto initFn = [](faiss::gpu::GpuResourcesProvider* res, int dev) ->
     std::unique_ptr<faiss::gpu::GpuIndexFlat> {
     if (FLAGS_pinned_mem >= 0) {
       ((faiss::gpu::StandardGpuResources*) res)->setPinnedMemory(

data/vendor/faiss/{gpu → faiss/gpu}/perf/PerfIVFPQAdd.cpp RENAMED

File without changes

data/vendor/faiss/{gpu → faiss/gpu}/perf/WriteIndex.cpp RENAMED

File without changes

data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexBinaryFlat.cpp RENAMED

File without changes

data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexFlat.cpp RENAMED

@@ -277,7 +277,7 @@ TEST(TestGpuIndexFlat, CopyFrom) {
   int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
   faiss::gpu::GpuIndexFlatConfig config;
-  config.device = 0;
+  config.device = device;
   config.useFloat16 = false;
   config.storeTransposed = false;

data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexIVFFlat.cpp RENAMED

File without changes

data/vendor/faiss/{gpu → faiss/gpu}/test/TestGpuIndexIVFPQ.cpp RENAMED

@@ -51,7 +51,7 @@ struct Options {
     //   support non-multiple of 8 subcodes for IVFPQ.
     bitsPerCode = 8;
     nprobe = std::min(faiss::gpu::randVal(40, 1000), numCentroids);
-    numQuery = faiss::gpu::randVal(1, 8);
+    numQuery = faiss::gpu::randVal(4, 8);
     // Due to the approximate nature of the query and of floating point
     // differences between GPU and CPU, to stay within our error bounds, only
@@ -91,7 +91,7 @@ struct Options {
   }
   float getCompareEpsilon() const {
-    return 0.03f;
+    return 0.035f;
   }
   float getPctMaxDiff1() const {
@@ -131,12 +131,12 @@ TEST(TestGpuIndexIVFPQ, Query_L2) {
     cpuIndex.train(opt.numTrain, trainVecs.data());
     cpuIndex.add(opt.numAdd, addVecs.data());
+    // Use the default temporary memory management to test the memory manager
     faiss::gpu::StandardGpuResources res;
-    res.noTempMemory();
     faiss::gpu::GpuIndexIVFPQConfig config;
     config.device = opt.device;
-    config.usePrecomputedTables = opt.usePrecomputed;
+    config.usePrecomputedTables = (tries % 2 == 0);
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
@@ -151,6 +151,93 @@ TEST(TestGpuIndexIVFPQ, Query_L2) {
   }
 }
+void testMMCodeDistance(faiss::MetricType mt) {
+  // Explicitly test the code distance via batch matrix multiplication route
+  // (even for dimension sizes that would otherwise be handled by the
+  // specialized route), via enabling `useMMCodeDistance`
+  for (int tries = 0; tries < 2; ++tries) {
+    Options opt;
+    std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
+    std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+    faiss::IndexFlat coarseQuantizer(opt.dim, mt);
+    faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids,
+                               opt.codes, opt.bitsPerCode);
+    cpuIndex.nprobe = opt.nprobe;
+    cpuIndex.train(opt.numTrain, trainVecs.data());
+    cpuIndex.add(opt.numAdd, addVecs.data());
+    // Use the default temporary memory management to test the memory manager
+    faiss::gpu::StandardGpuResources res;
+    faiss::gpu::GpuIndexIVFPQConfig config;
+    config.device = opt.device;
+    config.usePrecomputedTables = false;
+    config.useMMCodeDistance = true;
+    config.indicesOptions = opt.indicesOpt;
+    // Make sure that the float16 version works as well
+    config.useFloat16LookupTables = (tries % 2 == 0);
+    config.flatConfig.useFloat16 = (tries % 2 == 1);
+    faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
+    gpuIndex.setNumProbes(opt.nprobe);
+    faiss::gpu::compareIndices(cpuIndex, gpuIndex,
+                               opt.numQuery, opt.dim, opt.k, opt.toString(),
+                               opt.getCompareEpsilon(),
+                               opt.getPctMaxDiff1(),
+                               opt.getPctMaxDiffN());
+  }
+  // These sizes are not specialized, they will fall back to the MM version
+  for (int dimPerSubQ : {7, 11}) {
+    Options opt;
+    opt.codes = 12;
+    opt.dim = dimPerSubQ * opt.codes;
+    std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
+    std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+    faiss::IndexFlat coarseQuantizer(opt.dim, mt);
+    faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids,
+                               opt.codes, opt.bitsPerCode);
+    cpuIndex.nprobe = opt.nprobe;
+    cpuIndex.train(opt.numTrain, trainVecs.data());
+    cpuIndex.add(opt.numAdd, addVecs.data());
+    // Use the default temporary memory management to test the memory manager
+    faiss::gpu::StandardGpuResources res;
+    faiss::gpu::GpuIndexIVFPQConfig config;
+    config.device = opt.device;
+    config.usePrecomputedTables = false;
+    config.indicesOptions = opt.indicesOpt;
+    // Make sure that the float16 version works as well
+    config.useFloat16LookupTables = (dimPerSubQ == 7);
+    faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
+    gpuIndex.setNumProbes(opt.nprobe);
+    faiss::gpu::compareIndices(cpuIndex, gpuIndex,
+                               opt.numQuery, opt.dim, opt.k, opt.toString(),
+                               opt.getCompareEpsilon(),
+                               opt.getPctMaxDiff1(),
+                               opt.getPctMaxDiffN());
+  }
+}
+TEST(TestGpuIndexIVFPQ, Query_L2_MMCodeDistance) {
+  testMMCodeDistance(faiss::MetricType::METRIC_L2);
+}
+TEST(TestGpuIndexIVFPQ, Query_IP_MMCodeDistance) {
+  testMMCodeDistance(faiss::MetricType::METRIC_INNER_PRODUCT);
+}
 TEST(TestGpuIndexIVFPQ, Query_IP) {
   for (int tries = 0; tries < 2; ++tries) {
     Options opt;
@@ -167,8 +254,8 @@ TEST(TestGpuIndexIVFPQ, Query_IP) {
     cpuIndex.train(opt.numTrain, trainVecs.data());
     cpuIndex.add(opt.numAdd, addVecs.data());
+    // Use the default temporary memory management to test the memory manager
     faiss::gpu::StandardGpuResources res;
-    res.noTempMemory();
     faiss::gpu::GpuIndexIVFPQConfig config;
     config.device = opt.device;
@@ -199,8 +286,8 @@ TEST(TestGpuIndexIVFPQ, Float16Coarse) {
   cpuIndex.nprobe = opt.nprobe;
   cpuIndex.train(opt.numTrain, trainVecs.data());
+  // Use the default temporary memory management to test the memory manager
   faiss::gpu::StandardGpuResources res;
-  res.noTempMemory();
   faiss::gpu::GpuIndexIVFPQConfig config;
   config.device = opt.device;
@@ -235,8 +322,8 @@ TEST(TestGpuIndexIVFPQ, Add_L2) {
     cpuIndex.nprobe = opt.nprobe;
     cpuIndex.train(opt.numTrain, trainVecs.data());
+    // Use the default temporary memory management to test the memory manager
     faiss::gpu::StandardGpuResources res;
-    res.noTempMemory();
     faiss::gpu::GpuIndexIVFPQConfig config;
     config.device = opt.device;
@@ -272,8 +359,8 @@ TEST(TestGpuIndexIVFPQ, Add_IP) {
     cpuIndex.nprobe = opt.nprobe;
     cpuIndex.train(opt.numTrain, trainVecs.data());
+    // Use the default temporary memory management to test the memory manager
     faiss::gpu::StandardGpuResources res;
-    res.noTempMemory();
     faiss::gpu::GpuIndexIVFPQConfig config;
     config.device = opt.device;
@@ -296,54 +383,56 @@ TEST(TestGpuIndexIVFPQ, Add_IP) {
 }
 TEST(TestGpuIndexIVFPQ, CopyTo) {
-  Options opt;
-  std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
-  std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+  for (int tries = 0; tries < 2; ++tries) {
+    Options opt;
+    std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
+    std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
-  faiss::gpu::StandardGpuResources res;
-  res.noTempMemory();
+    // Use the default temporary memory management to test the memory manager
+    faiss::gpu::StandardGpuResources res;
-  faiss::gpu::GpuIndexIVFPQConfig config;
-  config.device = opt.device;
-  config.usePrecomputedTables = opt.usePrecomputed;
-  config.indicesOptions = opt.indicesOpt;
-  config.useFloat16LookupTables = opt.useFloat16;
+    faiss::gpu::GpuIndexIVFPQConfig config;
+    config.device = opt.device;
+    config.usePrecomputedTables = (tries % 2 == 0);
+    config.indicesOptions = opt.indicesOpt;
+    config.useFloat16LookupTables = opt.useFloat16;
-  faiss::gpu::GpuIndexIVFPQ gpuIndex(&res,
-                                     opt.dim,
-                                     opt.numCentroids,
-                                     opt.codes,
-                                     opt.bitsPerCode,
-                                     faiss::METRIC_L2,
-                                     config);
-  gpuIndex.setNumProbes(opt.nprobe);
-  gpuIndex.train(opt.numTrain, trainVecs.data());
-  gpuIndex.add(opt.numAdd, addVecs.data());
+    faiss::gpu::GpuIndexIVFPQ gpuIndex(&res,
+                                       opt.dim,
+                                       opt.numCentroids,
+                                       opt.codes,
+                                       opt.bitsPerCode,
+                                       faiss::METRIC_L2,
+                                       config);
+    gpuIndex.setNumProbes(opt.nprobe);
+    gpuIndex.train(opt.numTrain, trainVecs.data());
+    gpuIndex.add(opt.numAdd, addVecs.data());
-  // Use garbage values to see if we overwrite them
-  faiss::IndexFlatL2 cpuQuantizer(1);
-  faiss::IndexIVFPQ cpuIndex(&cpuQuantizer, 1, 1, 1, 1);
+    // Use garbage values to see if we overwrite them
+    faiss::IndexFlatL2 cpuQuantizer(1);
+    faiss::IndexIVFPQ cpuIndex(&cpuQuantizer, 1, 1, 1, 1);
-  gpuIndex.copyTo(&cpuIndex);
+    gpuIndex.copyTo(&cpuIndex);
-  EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal);
-  EXPECT_EQ(gpuIndex.ntotal, opt.numAdd);
+    EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal);
+    EXPECT_EQ(gpuIndex.ntotal, opt.numAdd);
-  EXPECT_EQ(cpuIndex.d, gpuIndex.d);
-  EXPECT_EQ(cpuIndex.d, opt.dim);
-  EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists());
-  EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes());
-  EXPECT_EQ(cpuIndex.pq.M, gpuIndex.getNumSubQuantizers());
-  EXPECT_EQ(gpuIndex.getNumSubQuantizers(), opt.codes);
-  EXPECT_EQ(cpuIndex.pq.nbits, gpuIndex.getBitsPerCode());
-  EXPECT_EQ(gpuIndex.getBitsPerCode(), opt.bitsPerCode);
+    EXPECT_EQ(cpuIndex.d, gpuIndex.d);
+    EXPECT_EQ(cpuIndex.d, opt.dim);
+    EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists());
+    EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes());
+    EXPECT_EQ(cpuIndex.pq.M, gpuIndex.getNumSubQuantizers());
+    EXPECT_EQ(gpuIndex.getNumSubQuantizers(), opt.codes);
+    EXPECT_EQ(cpuIndex.pq.nbits, gpuIndex.getBitsPerCode());
+    EXPECT_EQ(gpuIndex.getBitsPerCode(), opt.bitsPerCode);
-  // Query both objects; results should be equivalent
-  faiss::gpu::compareIndices(cpuIndex, gpuIndex,
-                             opt.numQuery, opt.dim, opt.k, opt.toString(),
-                             opt.getCompareEpsilon(),
-                             opt.getPctMaxDiff1(),
-                             opt.getPctMaxDiffN());
+    // Query both objects; results should be equivalent
+    faiss::gpu::compareIndices(cpuIndex, gpuIndex,
+                               opt.numQuery, opt.dim, opt.k, opt.toString(),
+                               opt.getCompareEpsilon(),
+                               opt.getPctMaxDiff1(),
+                               opt.getPctMaxDiffN());
+  }
 }
 TEST(TestGpuIndexIVFPQ, CopyFrom) {
@@ -358,9 +447,8 @@ TEST(TestGpuIndexIVFPQ, CopyFrom) {
   cpuIndex.train(opt.numTrain, trainVecs.data());
   cpuIndex.add(opt.numAdd, addVecs.data());
-  // Use garbage values to see if we overwrite them
+  // Use the default temporary memory management to test the memory manager
   faiss::gpu::StandardGpuResources res;
-  res.noTempMemory();
   faiss::gpu::GpuIndexIVFPQConfig config;
   config.device = opt.device;
@@ -368,6 +456,7 @@ TEST(TestGpuIndexIVFPQ, CopyFrom) {
   config.indicesOptions = opt.indicesOpt;
   config.useFloat16LookupTables = opt.useFloat16;
+  // Use garbage values to see if we overwrite them
   faiss::gpu::GpuIndexIVFPQ
     gpuIndex(&res, 1, 1, 1, 1, faiss::METRIC_L2, config);
   gpuIndex.setNumProbes(1);
@@ -401,8 +490,8 @@ TEST(TestGpuIndexIVFPQ, QueryNaN) {
   std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
   std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+  // Use the default temporary memory management to test the memory manager
   faiss::gpu::StandardGpuResources res;
-  res.noTempMemory();
   faiss::gpu::GpuIndexIVFPQConfig config;
   config.device = opt.device;
@@ -447,8 +536,8 @@ TEST(TestGpuIndexIVFPQ, QueryNaN) {
 TEST(TestGpuIndexIVFPQ, AddNaN) {
   Options opt;
+  // Use the default temporary memory management to test the memory manager
   faiss::gpu::StandardGpuResources res;
-  res.noTempMemory();
   faiss::gpu::GpuIndexIVFPQConfig config;
   config.device = opt.device;