RubyGems - faiss - Versions diffs - 0.1.2 → 0.1.3 - Mend

faiss 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (192) hide show

data/vendor/faiss/{impl → faiss/impl}/io.h RENAMED

@@ -29,7 +29,7 @@ struct IOReader {
     // name that can be used in error messages
     std::string name;
-    // fread
+    // fread. Returns number of items read or 0 in case of EOF.
     virtual size_t operator()(
          void *ptr, size_t size, size_t nitems) = 0;
@@ -43,7 +43,7 @@ struct IOWriter {
     // name that can be used in error messages
     std::string name;
-    // fwrite
+    // fwrite. Returns number of items written.
     virtual size_t operator()(
          const void *ptr, size_t size, size_t nitems) = 0;
@@ -97,6 +97,10 @@ struct FileIOWriter: IOWriter {
 /*******************************************************
  * Buffered reader + writer
+ *
+ * They attempt to read and write only buffers of size bsz to the
+ * underlying reader or writer. This is done by splitting or merging
+ * the read/write functions.
  *******************************************************/
@@ -105,24 +109,32 @@ struct FileIOWriter: IOWriter {
 struct BufferedIOReader: IOReader {
     IOReader *reader;
-    size_t bsz, totsz, ofs;
+    size_t bsz;
+    size_t ofs;    ///< offset in input stream
+    size_t ofs2;   ///< number of bytes returned to caller
     size_t b0, b1; ///< range of available bytes in the buffer
     std::vector<char> buffer;
-    BufferedIOReader(IOReader *reader, size_t bsz,
-                     size_t totsz=(size_t)(-1));
+    /**
+     * @param bsz    buffer size (bytes). Reads will be done in batches of
+     *               this size
+     */
+    explicit BufferedIOReader(IOReader *reader, size_t bsz = 1024 * 1024);
     size_t operator()(void *ptr, size_t size, size_t nitems) override;
 };
 struct BufferedIOWriter: IOWriter {
     IOWriter *writer;
-    size_t bsz, ofs;
-    size_t b0; ///< amount of data in buffer
+    size_t bsz;
+    size_t ofs;
+    size_t ofs2;     ///< number of bytes received from caller
+    size_t b0;       ///< amount of data in buffer
     std::vector<char> buffer;
-    BufferedIOWriter(IOWriter *writer, size_t bsz);
+    explicit BufferedIOWriter(IOWriter *writer, size_t bsz = 1024 * 1024);
     size_t operator()(const void *ptr, size_t size, size_t nitems) override;
@@ -132,5 +144,7 @@ struct BufferedIOWriter: IOWriter {
 /// cast a 4-character string to a uint32_t that can be written and read easily
 uint32_t fourcc (const char sx[4]);
+uint32_t fourcc (const std::string & sx);
 } // namespace faiss

data/vendor/faiss/faiss/impl/io_macros.h ADDED

@@ -0,0 +1,57 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+/*************************************************************
+ * I/O macros
+ *
+ * We use macros so that we have a line number to report in
+ * abort(). This makes debugging a lot easier. The IOReader or IOWriter
+ * is always called f and thus is not passed in as a macro parameter.
+ **************************************************************/
+// Read n items of type *ptr through the reader f and fail through
+// FAISS_THROW_IF_NOT_FMT (reporting the reader name and strerror(errno))
+// unless exactly n items were returned. ret and n are size_t values, so
+// they are printed with the unsigned %zu conversion.
+#define READANDCHECK(ptr, n) {                                  \
+        size_t ret = (*f)(ptr, sizeof(*(ptr)), n);              \
+        FAISS_THROW_IF_NOT_FMT(ret == (n),                      \
+            "read error in %s: %zu != %zu (%s)",                \
+            f->name.c_str(), ret, size_t(n), strerror(errno));  \
+    }
+// Read one object into x.
+#define READ1(x)  READANDCHECK(&(x), 1)
+// Read a size_t element count followed by that many elements into vec.
+// Fails if the stored count is 2^40 or more; presumably a sanity bound
+// against corrupt input (TODO confirm the intended limit).
+#define READVECTOR(vec)                                          \
+  {                                                              \
+    size_t size;                                                 \
+    READANDCHECK(&size, 1);                                      \
+    FAISS_THROW_IF_NOT(size >= 0 && size < (uint64_t{1} << 40)); \
+    (vec).resize(size);                                          \
+    READANDCHECK((vec).data(), size);                            \
+  }
+// Read a size_t length followed by that many bytes into the string s
+// (same length-prefixed layout that WRITEVECTOR produces). The length is
+// bounded like in READVECTOR before s is resized.
+#define READSTRING(s) {                                          \
+        size_t size;                                             \
+        READANDCHECK (&size, 1);                                 \
+        FAISS_THROW_IF_NOT (size < (uint64_t{1} << 40));         \
+        (s).resize (size);                                       \
+        READANDCHECK (&(s)[0], size);                            \
+    }
+// Write n items of type *ptr through the writer f and fail through
+// FAISS_THROW_IF_NOT_FMT (reporting the writer name and strerror(errno))
+// unless exactly n items were accepted. ret and n are size_t values, so
+// they are printed with the unsigned %zu conversion.
+#define WRITEANDCHECK(ptr, n) {                                 \
+        size_t ret = (*f)(ptr, sizeof(*(ptr)), n);              \
+        FAISS_THROW_IF_NOT_FMT(ret == (n),                      \
+            "write error in %s: %zu != %zu (%s)",               \
+            f->name.c_str(), ret, size_t(n), strerror(errno));  \
+    }
+// Write one object x.
+#define WRITE1(x) WRITEANDCHECK(&(x), 1)
+// Write (vec).size() as a size_t, then the (vec).data() contents, both
+// through WRITEANDCHECK.
+#define WRITEVECTOR(vec) {                      \
+        size_t size = (vec).size ();            \
+        WRITEANDCHECK (&size, 1);               \
+        WRITEANDCHECK ((vec).data (), size);    \
+    }

data/vendor/faiss/{impl → faiss/impl}/lattice_Zn.cpp RENAMED

@@ -21,6 +21,22 @@
 #include <faiss/utils/distances.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+// MSVC does not provide __builtin_ctzll / __builtin_clzll; emulate them
+// with the bit-scan intrinsics. As with the GCC builtins, the result is
+// not defined for x == 0.
+static inline int __builtin_ctzll(uint64_t x) {
+    unsigned long ret;
+    _BitScanForward64(&ret, x);
+    return (int)ret;
+}
+static inline int __builtin_clzll(uint64_t x) {
+    // Use _BitScanReverse64 rather than __lzcnt64: on CPUs without the
+    // LZCNT instruction the latter is executed as BSR and yields the
+    // index of the highest set bit instead of the leading-zero count.
+    unsigned long ret;
+    _BitScanReverse64(&ret, x);
+    return 63 - (int)ret;
+}
+#endif // _MSC_VER
 namespace faiss {
 /********************************************
@@ -102,7 +118,7 @@ int decode_comb_1 (uint64_t *n, int k1, int r) {
 }
 // optimized version for < 64 bits
-long repeats_encode_64 (
+uint64_t repeats_encode_64 (
      const std::vector<Repeat> & repeats,
      int dim, const float *c)
 {
@@ -115,12 +131,12 @@ long repeats_encode_64 (
         uint64_t tosee = ~coded;
         for(;;) {
             // directly jump to next available slot.
-            int i = __builtin_ctzl(tosee);
-            tosee &= ~(1UL << i) ;
+            int i = __builtin_ctzll(tosee);
+            tosee &= ~(uint64_t{1} << i) ;
             if (c[i] == r->val) {
                 code_comb += comb(rank, occ + 1);
                 occ++;
-                coded |= 1UL << i;
+                coded |= uint64_t{1} << i;
                 if (occ == r->n) break;
             }
             rank++;
@@ -148,13 +164,13 @@ void repeats_decode_64(
         int occ = 0;
         int rank = nfree;
         int next_rank = decode_comb_1 (&code_comb, r->n, rank);
-        uint64_t tosee = ((1UL << dim) - 1) ^ decoded;
+        uint64_t tosee = ((uint64_t{1} << dim) - 1) ^ decoded;
         for(;;) {
-            int i = 63 - __builtin_clzl(tosee);
-            tosee &= ~(1UL << i);
+            int i = 63 - __builtin_clzll(tosee);
+            tosee &= ~(uint64_t{1} << i);
             rank--;
             if (rank == next_rank) {
-                decoded |= 1UL << i;
+                decoded |= uint64_t{1} << i;
                 c[i] = r->val;
                 occ++;
                 if (occ == r->n) break;
@@ -190,9 +206,9 @@ Repeats::Repeats (int dim, const float *c): dim(dim)
 }
-long Repeats::count () const
+uint64_t Repeats::count () const
 {
-    long accu = 1;
+    uint64_t accu = 1;
     int remain = dim;
     for (int i = 0; i < repeats.size(); i++) {
         accu *= comb(remain, repeats[i].n);
@@ -204,7 +220,7 @@ long Repeats::count () const
 // version with a bool vector that works for > 64 dim
-long Repeats::encode(const float *c) const
+uint64_t Repeats::encode(const float *c) const
 {
     if (dim < 64) {
         return repeats_encode_64 (repeats, dim, c);
@@ -306,20 +322,20 @@ void EnumeratedVectors::decode_multi(size_t n, const uint64_t * codes,
 // For each of the nq queries in xq, keep in labels/distances the index and
 // value of the largest inner product against the nc decoded codes
 // (labels start at -1, distances at -1e20).
 void EnumeratedVectors::find_nn (
                   size_t nc, const uint64_t * codes,
                   size_t nq, const float *xq,
-                  long *labels, float *distances)
+                  int64_t *labels, float *distances)
 {
-    for (long i = 0; i < nq; i++) {
+    for (size_t i = 0; i < nq; i++) {
         distances[i] = -1e20;
         labels[i] = -1;
     }
-    float c[dim];
-    for(long i = 0; i < nc; i++) {
+    std::vector<float> c(dim);
+    for(size_t i = 0; i < nc; i++) {
         uint64_t code = codes[i]; // i-th code; codes[nc] is one past the end
-        decode(code, c);
-        for (long j = 0; j < nq; j++) {
+        decode(code, c.data());
+        for (size_t j = 0; j < nq; j++) {
             const float *x = xq + j * dim;
-            float dis = fvec_inner_product(x, c, dim);
+            float dis = fvec_inner_product(x, c.data(), dim);
             if (dis > distances[j]) {
                 distances[j] = dis;
                 labels[j] = i;
@@ -341,9 +357,9 @@ ZnSphereSearch::ZnSphereSearch(int dim, int r2): dimS(dim), r2(r2) {
 }
 float ZnSphereSearch::search(const float *x, float *c) const {
-    float tmp[dimS * 2];
-    int tmp_int[dimS];
-    return search(x, c, tmp, tmp_int);
+    std::vector<float> tmp(dimS * 2);
+    std::vector<int> tmp_int(dimS);
+    return search(x, c, tmp.data(), tmp_int.data());
 }
 float ZnSphereSearch::search(const float *x, float *c,
@@ -430,19 +446,19 @@ ZnSphereCodec::ZnSphereCodec(int dim, int r2):
 }
 uint64_t ZnSphereCodec::search_and_encode(const float *x) const {
-    float tmp[dim * 2];
-    int tmp_int[dim];
+    std::vector<float> tmp(dim * 2);
+    std::vector<int> tmp_int(dim);
     int ano; // atom number
-    float c[dim];
-    search(x, c, tmp, tmp_int, &ano);
+    std::vector<float> c(dim);
+    search(x, c.data(), tmp.data(), tmp_int.data(), &ano);
     uint64_t signs = 0;
-    float cabs[dim];
+    std::vector<float> cabs(dim);
     int nnz = 0;
     for (int i = 0; i < dim; i++) {
         cabs[i] = fabs(c[i]);
         if (c[i] != 0) {
             if (c[i] < 0) {
-                signs |= 1UL << nnz;
+                signs |= uint64_t{1} << nnz;
             }
             nnz ++;
         }
@@ -450,7 +466,7 @@ uint64_t ZnSphereCodec::search_and_encode(const float *x) const {
     const CodeSegment &cs = code_segments[ano];
     assert(nnz == cs.signbits);
     uint64_t code = cs.c0 + signs;
-    code += cs.encode(cabs) << cs.signbits;
+    code += cs.encode(cabs.data()) << cs.signbits;
     return code;
 }
@@ -560,13 +576,13 @@ ZnSphereCodecRec::ZnSphereCodecRec(int dim, int r2):
         std::vector<float> &cache = decode_cache[r2sub];
         int dimsub = (1 << cache_level);
         cache.resize (nvi * dimsub);
-        float c[dim];
+        std::vector<float> c(dim);
         uint64_t code0 = get_nv_cum(cache_level + 1, r2,
                                  r2 - r2sub);
         for (int i = 0; i < nvi; i++) {
-            decode(i + code0, c);
-            memcpy(&cache[i * dimsub], c + dim - dimsub,
-                   dimsub * sizeof(*c));
+            decode(i + code0, c.data());
+            memcpy(&cache[i * dimsub], c.data() + dim - dimsub,
+                   dimsub * sizeof(*c.data()));
         }
     }
     decode_cache_ld = cache_level;
@@ -581,8 +597,8 @@ uint64_t ZnSphereCodecRec::encode(const float *c) const
 uint64_t ZnSphereCodecRec::encode_centroid(const float *c) const
 {
-    uint64_t codes[dim];
-    int norm2s[dim];
+    std::vector<uint64_t> codes(dim);
+    std::vector<int> norm2s(dim);
     for(int i = 0; i < dim; i++) {
         if (c[i] == 0) {
             codes[i] = 0;
@@ -617,8 +633,8 @@ uint64_t ZnSphereCodecRec::encode_centroid(const float *c) const
 void ZnSphereCodecRec::decode(uint64_t code, float *c) const
 {
-    uint64_t codes[dim];
-    int norm2s[dim];
+    std::vector<uint64_t> codes(dim);
+    std::vector<int> norm2s(dim);
     codes[0] = code;
     norm2s[0] = r2;

data/vendor/faiss/{impl → faiss/impl}/lattice_Zn.h RENAMED

@@ -80,7 +80,7 @@ struct EnumeratedVectors {
     // (decodes and computes distances)
     void find_nn (size_t n, const uint64_t * codes,
                   size_t nq, const float *xq,
-                  long *idx, float *dis);
+                  int64_t *idx, float *dis);
     virtual ~EnumeratedVectors() {}
@@ -103,9 +103,9 @@ struct Repeats {
     Repeats(int dim = 0, const float *c = nullptr);
     // count number of possible codes for this atom
-    long count() const;
+    uint64_t count() const;
-    long encode(const float *c) const;
+    uint64_t encode(const float *c) const;
     void decode(uint64_t code, float *c) const;
 };

data/vendor/faiss/faiss/impl/platform_macros.h ADDED

@@ -0,0 +1,24 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#ifdef _MSC_VER
+// Under MSVC, FAISS_API expands to dllexport when FAISS_MAIN_LIB is
+// defined and to dllimport otherwise.
+#ifdef FAISS_MAIN_LIB
+#define FAISS_API __declspec(dllexport)
+#else // FAISS_MAIN_LIB
+#define FAISS_API __declspec(dllimport)
+#endif // FAISS_MAIN_LIB
+// Map __PRETTY_FUNCTION__ onto MSVC's __FUNCSIG__.
+#define __PRETTY_FUNCTION__ __FUNCSIG__
+#else
+// Other compilers: FAISS_API expands to nothing.
+#define FAISS_API
+#endif // _MSC_VER

data/vendor/faiss/{index_factory.cpp → faiss/index_factory.cpp} RENAMED

@@ -13,9 +13,9 @@
 #include <faiss/AutoTune.h>
+#include <cinttypes>
 #include <cmath>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/utils.h>
 #include <faiss/utils/random.h>
@@ -38,6 +38,7 @@
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexBinaryHNSW.h>
 #include <faiss/IndexBinaryIVF.h>
+#include <faiss/IndexBinaryHash.h>
 namespace faiss {
@@ -81,14 +82,14 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
     ScopeDeleter1<Index> del_coarse_quantizer, del_index;
-    char description[strlen(description_in) + 1];
+    std::string description(description_in);
     char *ptr;
-    memcpy (description, description_in, strlen(description_in) + 1);
     int64_t ncentroids = -1;
     bool use_2layer = false;
+    int hnsw_M = -1;
-    for (char *tok = strtok_r (description, " ,", &ptr);
+    for (char *tok = strtok_r (&description[0], " ,", &ptr);
          tok;
          tok = strtok_r (nullptr, " ,", &ptr)) {
         int d_out, opq_M, nbit, M, M2, pq_m, ncent, r2;
@@ -138,12 +139,11 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
         // coarse quantizers
         } else if (!coarse_quantizer &&
-                   sscanf (tok, "IVF%ld_HNSW%d", &ncentroids, &M) == 2) {
-            FAISS_THROW_IF_NOT (metric == METRIC_L2);
+                   sscanf (tok, "IVF%" PRId64 "_HNSW%d", &ncentroids, &M) == 2) {
             coarse_quantizer_1 = new IndexHNSWFlat (d, M);
         } else if (!coarse_quantizer &&
-                   sscanf (tok, "IVF%ld", &ncentroids) == 1) {
+                   sscanf (tok, "IVF%" PRId64, &ncentroids) == 1) {
             if (metric == METRIC_L2) {
                 coarse_quantizer_1 = new IndexFlatL2 (d);
             } else {
@@ -164,7 +164,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
             use_2layer = true;
         } else if (!coarse_quantizer &&
-                   sscanf (tok, "Residual%ld", &ncentroids) == 1) {
+                   sscanf (tok, "Residual%" PRId64, &ncentroids) == 1) {
             coarse_quantizer_1 = new IndexFlatL2 (d);
             use_2layer = true;
@@ -186,6 +186,8 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                 del_coarse_quantizer.release ();
                 index_ivf->own_fields = true;
                 index_1 = index_ivf;
+            } else if (hnsw_M > 0) {
+                index_1 = new IndexHNSWFlat (d, hnsw_M, metric);
             } else {
                 FAISS_THROW_IF_NOT_MSG (stok != "FlatDedup",
                                         "dedup supported only for IVFFlat");
@@ -209,6 +211,8 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                 del_coarse_quantizer.release ();
                 index_ivf->own_fields = true;
                 index_1 = index_ivf;
+            } else if (hnsw_M > 0) {
+                index_1 = new IndexHNSWSQ(d, qt, hnsw_M, metric);
             } else {
                 index_1 = new IndexScalarQuantizer (d, qt, metric);
             }
@@ -248,6 +252,11 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                     index_2l->q1.own_fields = true;
                     index_1 = index_2l;
                 }
+            } else if (hnsw_M > 0) {
+                IndexHNSWPQ *ipq = new IndexHNSWPQ(d, M, hnsw_M);
+                dynamic_cast<IndexPQ*>(ipq->storage)->do_polysemous_training =
+                    do_polysemous_training;
+                index_1 = ipq;
             } else {
                 IndexPQ *index_pq = new IndexPQ (d, M, nbit, metric);
                 index_pq->do_polysemous_training = do_polysemous_training;
@@ -272,13 +281,14 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
         } else if (!index &&
                    sscanf (tok, "HNSW%d_PQ%d", &M, &pq_m) == 2) {
             index_1 = new IndexHNSWPQ (d, pq_m, M);
-        } else if (!index &&
-                   sscanf (tok, "HNSW%d", &M) == 1) {
-            index_1 = new IndexHNSWFlat (d, M);
         } else if (!index &&
                    sscanf (tok, "HNSW%d_SQ%d", &M, &pq_m) == 2 &&
                    pq_m == 8) {
             index_1 = new IndexHNSWSQ (d, ScalarQuantizer::QT_8bit, M);
+        } else if (!index &&
+                   sscanf (tok, "HNSW%d", &M) == 1) {
+            hnsw_M = M;
+            // here it is unclear what we want: HNSW flat or HNSWx,Y ?
         } else if (!index && (stok == "LSH" || stok == "LSHr" ||
                               stok == "LSHrt" || stok == "LSHt")) {
             bool rotate_data = strstr(tok, "r") != nullptr;
@@ -318,6 +328,11 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
         }
     }
+    if (!index && hnsw_M > 0) {
+        index = new IndexHNSWFlat (d, hnsw_M, metric);
+        del_index.set (index);
+    }
     FAISS_THROW_IF_NOT_FMT(index, "description %s did not generate an index",
                     description_in);
@@ -355,7 +370,7 @@ IndexBinary *index_binary_factory(int d, const char *description)
     IndexBinary *index = nullptr;
     int ncentroids = -1;
-    int M;
+    int M, nhash, b;
     if (sscanf(description, "BIVF%d_HNSW%d", &ncentroids, &M) == 2) {
         IndexBinaryIVF *index_ivf = new IndexBinaryIVF(
@@ -375,6 +390,12 @@ IndexBinary *index_binary_factory(int d, const char *description)
         IndexBinaryHNSW *index_hnsw = new IndexBinaryHNSW(d, M);
         index = index_hnsw;
+    } else if (sscanf(description, "BHash%dx%d", &nhash, &b) == 2) {
+        index = new IndexBinaryMultiHash (d, nhash, b);
+    } else if (sscanf(description, "BHash%d", &b) == 1) {
+        index = new IndexBinaryHash (d, b);
     } else if (std::string(description) == "BFlat") {
         index = new IndexBinaryFlat(d);