RubyGems - faiss - Versions diffs - 0.1.0 → 0.1.1 - Mend

faiss 0.1.0 → 0.1.1

Files changed (226) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +103 -3
data/ext/faiss/ext.cpp +99 -32
data/ext/faiss/extconf.rb +12 -2
data/lib/faiss/ext.bundle +0 -0
data/lib/faiss/index.rb +3 -3
data/lib/faiss/index_binary.rb +3 -3
data/lib/faiss/kmeans.rb +1 -1
data/lib/faiss/pca_matrix.rb +2 -2
data/lib/faiss/product_quantizer.rb +3 -3
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/AutoTune.cpp +719 -0
data/vendor/faiss/AutoTune.h +212 -0
data/vendor/faiss/Clustering.cpp +261 -0
data/vendor/faiss/Clustering.h +101 -0
data/vendor/faiss/IVFlib.cpp +339 -0
data/vendor/faiss/IVFlib.h +132 -0
data/vendor/faiss/Index.cpp +171 -0
data/vendor/faiss/Index.h +261 -0
data/vendor/faiss/Index2Layer.cpp +437 -0
data/vendor/faiss/Index2Layer.h +85 -0
data/vendor/faiss/IndexBinary.cpp +77 -0
data/vendor/faiss/IndexBinary.h +163 -0
data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
data/vendor/faiss/IndexBinaryFlat.h +54 -0
data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
data/vendor/faiss/IndexBinaryHNSW.h +56 -0
data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
data/vendor/faiss/IndexBinaryIVF.h +211 -0
data/vendor/faiss/IndexFlat.cpp +508 -0
data/vendor/faiss/IndexFlat.h +175 -0
data/vendor/faiss/IndexHNSW.cpp +1090 -0
data/vendor/faiss/IndexHNSW.h +170 -0
data/vendor/faiss/IndexIVF.cpp +909 -0
data/vendor/faiss/IndexIVF.h +353 -0
data/vendor/faiss/IndexIVFFlat.cpp +502 -0
data/vendor/faiss/IndexIVFFlat.h +118 -0
data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
data/vendor/faiss/IndexIVFPQ.h +161 -0
data/vendor/faiss/IndexIVFPQR.cpp +219 -0
data/vendor/faiss/IndexIVFPQR.h +65 -0
data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
data/vendor/faiss/IndexLSH.cpp +225 -0
data/vendor/faiss/IndexLSH.h +87 -0
data/vendor/faiss/IndexLattice.cpp +143 -0
data/vendor/faiss/IndexLattice.h +68 -0
data/vendor/faiss/IndexPQ.cpp +1188 -0
data/vendor/faiss/IndexPQ.h +199 -0
data/vendor/faiss/IndexPreTransform.cpp +288 -0
data/vendor/faiss/IndexPreTransform.h +91 -0
data/vendor/faiss/IndexReplicas.cpp +123 -0
data/vendor/faiss/IndexReplicas.h +76 -0
data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
data/vendor/faiss/IndexScalarQuantizer.h +127 -0
data/vendor/faiss/IndexShards.cpp +317 -0
data/vendor/faiss/IndexShards.h +100 -0
data/vendor/faiss/InvertedLists.cpp +623 -0
data/vendor/faiss/InvertedLists.h +334 -0
data/vendor/faiss/LICENSE +21 -0
data/vendor/faiss/MatrixStats.cpp +252 -0
data/vendor/faiss/MatrixStats.h +62 -0
data/vendor/faiss/MetaIndexes.cpp +351 -0
data/vendor/faiss/MetaIndexes.h +126 -0
data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
data/vendor/faiss/OnDiskInvertedLists.h +127 -0
data/vendor/faiss/VectorTransform.cpp +1157 -0
data/vendor/faiss/VectorTransform.h +322 -0
data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
data/vendor/faiss/c_api/AutoTune_c.h +64 -0
data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
data/vendor/faiss/c_api/Clustering_c.h +117 -0
data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
data/vendor/faiss/c_api/IndexShards_c.h +42 -0
data/vendor/faiss/c_api/Index_c.cpp +105 -0
data/vendor/faiss/c_api/Index_c.h +183 -0
data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
data/vendor/faiss/c_api/clone_index_c.h +32 -0
data/vendor/faiss/c_api/error_c.h +42 -0
data/vendor/faiss/c_api/error_impl.cpp +27 -0
data/vendor/faiss/c_api/error_impl.h +16 -0
data/vendor/faiss/c_api/faiss_c.h +58 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
data/vendor/faiss/c_api/index_factory_c.h +30 -0
data/vendor/faiss/c_api/index_io_c.cpp +42 -0
data/vendor/faiss/c_api/index_io_c.h +50 -0
data/vendor/faiss/c_api/macros_impl.h +110 -0
data/vendor/faiss/clone_index.cpp +147 -0
data/vendor/faiss/clone_index.h +38 -0
data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
data/vendor/faiss/gpu/GpuCloner.h +82 -0
data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
data/vendor/faiss/gpu/GpuDistance.h +52 -0
data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
data/vendor/faiss/gpu/GpuIndex.h +148 -0
data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
data/vendor/faiss/gpu/GpuResources.cpp +52 -0
data/vendor/faiss/gpu/GpuResources.h +73 -0
data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
data/vendor/faiss/gpu/test/TestUtils.h +93 -0
data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
data/vendor/faiss/gpu/utils/Timer.h +52 -0
data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
data/vendor/faiss/impl/FaissAssert.h +95 -0
data/vendor/faiss/impl/FaissException.cpp +66 -0
data/vendor/faiss/impl/FaissException.h +71 -0
data/vendor/faiss/impl/HNSW.cpp +818 -0
data/vendor/faiss/impl/HNSW.h +275 -0
data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
data/vendor/faiss/impl/PolysemousTraining.h +158 -0
data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
data/vendor/faiss/impl/ProductQuantizer.h +242 -0
data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
data/vendor/faiss/impl/ThreadedIndex.h +80 -0
data/vendor/faiss/impl/index_read.cpp +793 -0
data/vendor/faiss/impl/index_write.cpp +558 -0
data/vendor/faiss/impl/io.cpp +142 -0
data/vendor/faiss/impl/io.h +98 -0
data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
data/vendor/faiss/impl/lattice_Zn.h +199 -0
data/vendor/faiss/index_factory.cpp +392 -0
data/vendor/faiss/index_factory.h +25 -0
data/vendor/faiss/index_io.h +75 -0
data/vendor/faiss/misc/test_blas.cpp +84 -0
data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
data/vendor/faiss/tests/test_merge.cpp +258 -0
data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
data/vendor/faiss/tests/test_params_override.cpp +231 -0
data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
data/vendor/faiss/utils/Heap.cpp +122 -0
data/vendor/faiss/utils/Heap.h +495 -0
data/vendor/faiss/utils/WorkerThread.cpp +126 -0
data/vendor/faiss/utils/WorkerThread.h +61 -0
data/vendor/faiss/utils/distances.cpp +765 -0
data/vendor/faiss/utils/distances.h +243 -0
data/vendor/faiss/utils/distances_simd.cpp +809 -0
data/vendor/faiss/utils/extra_distances.cpp +336 -0
data/vendor/faiss/utils/extra_distances.h +54 -0
data/vendor/faiss/utils/hamming-inl.h +472 -0
data/vendor/faiss/utils/hamming.cpp +792 -0
data/vendor/faiss/utils/hamming.h +220 -0
data/vendor/faiss/utils/random.cpp +192 -0
data/vendor/faiss/utils/random.h +60 -0
data/vendor/faiss/utils/utils.cpp +783 -0
data/vendor/faiss/utils/utils.h +181 -0
metadata +216 -2

data/vendor/faiss/IndexIVF.h ADDED Viewed

@@ -0,0 +1,353 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#ifndef FAISS_INDEX_IVF_H
+#define FAISS_INDEX_IVF_H
+#include <vector>
+#include <stdint.h>
+#include <faiss/Index.h>
+#include <faiss/InvertedLists.h>
+#include <faiss/Clustering.h>
+#include <faiss/utils/Heap.h>
+namespace faiss {
+/** Encapsulates a quantizer object for the IndexIVF
+ *
+ * The class isolates the fields that are independent of the storage
+ * of the lists (especially training)
+ *
+ * It holds the quantizer pointer, the number of lists and the knobs
+ * (quantizer_trains_alone, cp, clustering_index) that steer training.
+ */
+struct Level1Quantizer {
+    Index * quantizer;        ///< quantizer that maps vectors to inverted lists
+    size_t nlist;             ///< number of possible key values
+    /**
+     * Selects how the quantizer gets trained:
+     * = 0: use the quantizer as index in a kmeans training
+     * = 1: just pass on the training set to the train() of the quantizer
+     * = 2: kmeans training on a flat index + add the centroids to the quantizer
+     */
+    char quantizer_trains_alone;
+    bool own_fields;          ///< whether object owns the quantizer
+    ClusteringParameters cp; ///< to override default clustering params
+    Index *clustering_index; ///< to override index used during clustering
+    /// Trains the quantizer. NOTE(review): original wording also mentioned train_residual, which is declared on IndexIVF, not here
+    void train_q1 (size_t n, const float *x, bool verbose,
+                   MetricType metric_type);
+    /// compute the number of bytes required to store list ids
+    size_t coarse_code_size () const;
+    void encode_listno (Index::idx_t list_no, uint8_t *code) const;
+    Index::idx_t decode_listno (const uint8_t *code) const;
+    Level1Quantizer (Index * quantizer, size_t nlist);
+    Level1Quantizer ();
+    ~Level1Quantizer ();
+};
+struct IVFSearchParameters {
+    size_t nprobe;            ///< number of probes (quantization indices selected) at query time
+    size_t max_codes;         ///< max nb of codes to visit to do a query
+    virtual ~IVFSearchParameters () {}
+};
+struct InvertedListScanner;
+/** Index based on an inverted file (IVF)
+ *
+ * In the inverted file, the quantizer (an Index instance) provides a
+ * quantization index for each vector to be added. The quantization
+ * index maps to a list (aka inverted list or posting list), where the
+ * id of the vector is stored.
+ *
+ * The inverted list object is required only after training. If none is
+ * set externally, an ArrayInvertedLists is used automatically.
+ *
+ * At search time, the vector to be searched is also quantized, and
+ * only the list corresponding to the quantization index is
+ * searched. This speeds up the search by making it
+ * non-exhaustive. This can be relaxed using multi-probe search: a few
+ * (nprobe) quantization indices are selected and several inverted
+ * lists are visited.
+ *
+ * Sub-classes implement a post-filtering of the index that refines
+ * the distance estimation from the query to database vectors.
+ */
+struct IndexIVF: Index, Level1Quantizer {
+    /// Access to the actual data
+    InvertedLists *invlists;
+    bool own_invlists;
+    size_t code_size;              ///< code size per vector in bytes
+    size_t nprobe;            ///< number of probes at query time
+    size_t max_codes;         ///< max nb of codes to visit to do a query
+    /** Parallel mode determines how queries are parallelized with OpenMP
+     *
+     * 0 (default): parallelize over queries
+     * 1: parallelize over inverted lists
+     * 2: parallelize over both
+     */
+    int parallel_mode;
+    /// map for direct access to the elements. Enables reconstruct().
+    bool maintain_direct_map;
+    std::vector <idx_t> direct_map;
+    /** The Inverted file takes a quantizer (an Index) on input,
+     * which implements the function mapping a vector to a list
+     * identifier. The pointer is borrowed: the quantizer should not
+     * be deleted while the IndexIVF is in use.
+     */
+    IndexIVF (Index * quantizer, size_t d,
+              size_t nlist, size_t code_size,
+              MetricType metric = METRIC_L2);
+    void reset() override;
+    /// Trains the quantizer and calls train_residual to train sub-quantizers
+    void train(idx_t n, const float* x) override;
+    /// Calls add_with_ids with NULL ids
+    void add(idx_t n, const float* x) override;
+    /// default implementation that calls encode_vectors
+    void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
+    /** Encodes a set of vectors as they would appear in the inverted lists
+     *
+     * @param list_nos   inverted list ids as returned by the
+     *                   quantizer (size n). -1s are ignored.
+     * @param codes      output codes, size n * code_size
+     * @param include_listno
+     *                   include the list ids in the code (in this case add
+     *                   ceil(log8(nlist)) to the code size)
+     */
+    virtual void encode_vectors(idx_t n, const float* x,
+                                const idx_t *list_nos,
+                                uint8_t * codes,
+                                bool include_listno = false) const = 0;
+    /// Sub-classes that encode the residuals can train their encoders here
+    /// does nothing by default
+    virtual void train_residual (idx_t n, const float *x);
+    /** search a set of vectors, that are pre-quantized by the IVF
+     *  quantizer. Fill in the corresponding heaps with the query
+     *  results. The default implementation uses InvertedListScanners
+     *  to do the search.
+     *
+     * @param n      nb of vectors to query
+     * @param x      query vectors, size n * d
+     * @param assign coarse quantization indices, size n * nprobe
+     * @param centroid_dis
+     *               distances to coarse centroids, size n * nprobe
+     * @param distances
+     *               output distances, size n * k
+     * @param labels output labels, size n * k
+     * @param store_pairs store inv list index + inv list offset
+     *                     in upper/lower 32 bit of result,
+     *                     instead of ids (used for reranking).
+     * @param params used to override the object's search parameters
+     */
+    virtual void search_preassigned (idx_t n, const float *x, idx_t k,
+                                     const idx_t *assign,
+                                     const float *centroid_dis,
+                                     float *distances, idx_t *labels,
+                                     bool store_pairs,
+                                     const IVFSearchParameters *params=nullptr
+                                     ) const;
+    /** assign the vectors, then call search_preassigned */
+    void search (idx_t n, const float *x, idx_t k,
+                 float *distances, idx_t *labels) const override;
+    void range_search (idx_t n, const float* x, float radius,
+                       RangeSearchResult* result) const override;
+    void range_search_preassigned(idx_t nx, const float *x, float radius,
+                                  const idx_t *keys, const float *coarse_dis,
+                                  RangeSearchResult *result) const;
+    /// get a scanner for this index (store_pairs means ignore labels)
+    virtual InvertedListScanner *get_InvertedListScanner (
+        bool store_pairs=false) const;
+    void reconstruct (idx_t key, float* recons) const override;
+    /** Reconstruct a subset of the indexed vectors.
+     *
+     * Overrides default implementation to bypass reconstruct() which requires
+     * direct_map to be maintained.
+     *
+     * @param i0     first vector to reconstruct
+     * @param ni     nb of vectors to reconstruct
+     * @param recons output array of reconstructed vectors, size ni * d
+     */
+    void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
+    /** Similar to search, but also reconstructs the stored vectors (or an
+     * approximation in the case of lossy coding) for the search results.
+     *
+     * Overrides default implementation to avoid having to maintain direct_map
+     * and instead fetch the code offsets through the `store_pairs` flag in
+     * search_preassigned().
+     *
+     * @param recons      reconstructed vectors size (n, k, d)
+     */
+    void search_and_reconstruct (idx_t n, const float *x, idx_t k,
+                                 float *distances, idx_t *labels,
+                                 float *recons) const override;
+    /** Reconstruct a vector given the location in terms of (inv list index +
+     * inv list offset) instead of the id.
+     *
+     * Useful for reconstructing when the direct_map is not maintained and
+     * the inv list offset is computed by search_preassigned() with
+     * `store_pairs` set.
+     */
+    virtual void reconstruct_from_offset (int64_t list_no, int64_t offset,
+                                          float* recons) const;
+    /// Dataset manipulation functions
+    size_t remove_ids(const IDSelector& sel) override;
+    /** check that the two indexes are compatible (ie, they are
+     * trained in the same way and have the same
+     * parameters). Otherwise throw. */
+    void check_compatible_for_merge (const IndexIVF &other) const;
+    /** moves the entries from another dataset to self. On output,
+     * other is empty. add_id is added to all moved ids (for
+     * sequential ids, this would be this->ntotal) */
+    virtual void merge_from (IndexIVF &other, idx_t add_id);
+    /** copy a subset of the entries index to the other index
+     *
+     * if subset_type == 0: copies ids in [a1, a2)
+     * if subset_type == 1: copies ids if id % a1 == a2
+     * if subset_type == 2: copies inverted lists such that a1
+     *                      elements are left before and a2 elements are after
+     */
+    virtual void copy_subset_to (IndexIVF & other, int subset_type,
+                                 idx_t a1, idx_t a2) const;
+    ~IndexIVF() override;
+    size_t get_list_size (size_t list_no) const
+    { return invlists->list_size(list_no); }
+    /** initialize a direct map
+     *
+     * @param new_maintain_direct_map    if true, create a direct map,
+     *                                   else clear it
+     */
+    void make_direct_map (bool new_maintain_direct_map=true);
+    /// replace the inverted lists, old one is deallocated if own_invlists
+    void replace_invlists (InvertedLists *il, bool own=false);
+    /* The standalone codec interface (except sa_decode that is specific) */
+    size_t sa_code_size () const override;
+    void sa_encode (idx_t n, const float *x,
+                          uint8_t *bytes) const override;
+    IndexIVF ();
+};
+struct RangeQueryResult;
+/** Object that handles a query. The inverted lists to scan are
+ * provided externally. The object has a lot of state, but
+ * distance_to_code and scan_codes can be called in multiple
+ * threads */
+struct InvertedListScanner {
+    using idx_t = Index::idx_t;
+    /// from now on we handle this query.
+    virtual void set_query (const float *query_vector) = 0;
+    /// following codes come from this inverted list
+    virtual void set_list (idx_t list_no, float coarse_dis) = 0;
+    /// compute a single query-to-code distance
+    virtual float distance_to_code (const uint8_t *code) const = 0;
+    /** scan a set of codes, compute distances to current query and
+     * update heap of results if necessary.
+     *
+     * @param n      number of codes to scan
+     * @param codes  codes to scan (n * code_size)
+     * @param ids        corresponding ids (ignored if store_pairs)
+     * @param distances  heap distances (size k)
+     * @param labels     heap labels (size k)
+     * @param k          heap size
+     * @return number of heap updates performed
+     */
+    virtual size_t scan_codes (size_t n,
+                               const uint8_t *codes,
+                               const idx_t *ids,
+                               float *distances, idx_t *labels,
+                               size_t k) const = 0;
+    /** scan a set of codes, compute distances to current query and
+     * update results if distances are below radius
+     *
+     * @param n       number of codes to scan
+     * @param codes   codes to scan (n * code_size)
+     * @param ids     corresponding ids (ignored if store_pairs)
+     * @param radius  search radius the distances are compared against
+     * @param result  object that receives the qualifying results
+     *
+     * (default implementation fails) */
+    virtual void scan_codes_range (size_t n,
+                                   const uint8_t *codes,
+                                   const idx_t *ids,
+                                   float radius,
+                                   RangeQueryResult &result) const;
+    virtual ~InvertedListScanner () {}
+};
+struct IndexIVFStats {
+    size_t nq;       // nb of queries run
+    size_t nlist;    // nb of inverted lists scanned
+    size_t ndis;     // nb of distances computed
+    size_t nheap_updates; // nb of times the heap was updated
+    double quantization_time; // time spent quantizing vectors (in ms)
+    double search_time;       // time spent searching lists (in ms)
+    IndexIVFStats () {reset (); }   // counters start from reset()
+    void reset ();
+};
+// global var that collects them all
+extern IndexIVFStats indexIVF_stats;
+} // namespace faiss
+#endif

data/vendor/faiss/IndexIVFFlat.cpp ADDED Viewed

@@ -0,0 +1,502 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#include <faiss/IndexIVFFlat.h>
+#include <cstdio>
+#include <faiss/IndexFlat.h>
+#include <faiss/utils/distances.h>
+#include <faiss/utils/utils.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/AuxIndexStructures.h>
+namespace faiss {
+/*****************************************
+ * IndexIVFFlat implementation
+ ******************************************/
+/// Builds a flat IVF index: the code stored for each vector is the raw
+/// float vector itself, i.e. sizeof(float) * d bytes.
+IndexIVFFlat::IndexIVFFlat (Index * quantizer,
+                            size_t d, size_t nlist, MetricType metric)
+    : IndexIVF (quantizer, d, nlist, sizeof(float) * d, metric)
+{
+    // set explicitly here as well, mirroring the value handed to IndexIVF
+    code_size = sizeof(float) * d;
+}
+/// Adds n vectors with optional ids by delegating to add_core, letting
+/// add_core run the quantizer (no precomputed list assignments).
+void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const idx_t *xids)
+{
+    const int64_t *no_precomputed_idx = nullptr;
+    add_core (n, x, xids, no_precomputed_idx);
+}
+/** Adds n vectors to the inverted lists.
+ *
+ * @param n                nb of vectors to add
+ * @param x                vectors to add, size n * d
+ * @param xids             ids to store, or nullptr to use sequential ids
+ *                         starting at ntotal (must be nullptr when a
+ *                         direct map is maintained)
+ * @param precomputed_idx  list number per vector (size n), or nullptr to
+ *                         let the quantizer assign them; vectors with a
+ *                         negative list number are not stored
+ */
+void IndexIVFFlat::add_core (idx_t n, const float * x, const int64_t *xids,
+                             const int64_t *precomputed_idx)
+{
+    FAISS_THROW_IF_NOT (is_trained);
+    assert (invlists);
+    FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids),
+                            "cannot have direct map and add with ids");
+    // buffer is only filled when the quantizer has to be run here
+    std::vector<int64_t> assigned;
+    const int64_t * idx = precomputed_idx;
+    if (!idx) {
+        assigned.resize (n);
+        quantizer->assign (n, x, assigned.data());
+        idx = assigned.data();
+    }
+    int64_t n_add = 0;
+    for (idx_t i = 0; i < n; i++) {
+        int64_t id = xids ? xids[i] : ntotal + i;
+        int64_t list_no = idx [i];
+        if (list_no < 0) {
+            // The vector is not stored, but its sequential id is still
+            // consumed (ntotal grows by n below). Push a -1 placeholder so
+            // that position == id keeps holding for the following entries.
+            if (maintain_direct_map)
+                direct_map.push_back (-1);
+            continue;
+        }
+        const float *xi = x + i * d;
+        size_t offset = invlists->add_entry (
+              list_no, id, (const uint8_t*) xi);
+        if (maintain_direct_map)
+            direct_map.push_back (list_no << 32 | offset);
+        n_add++;
+    }
+    if (verbose) {
+        // int64_t is not `long` on every platform: print through long long
+        printf("IndexIVFFlat::add_core: added %lld / %lld vectors\n",
+               (long long) n_add, (long long) n);
+    }
+    ntotal += n;
+}
+/** Writes the codes of n vectors into `codes`.
+ *
+ * Without list numbers the codes are the raw vectors. With list numbers
+ * each record is the encoded list number followed by the raw vector, and
+ * records whose list number is negative are zero-filled.
+ */
+void IndexIVFFlat::encode_vectors(idx_t n, const float* x,
+                                  const idx_t * list_nos,
+                                  uint8_t * codes,
+                                  bool include_listnos) const
+{
+    if (!include_listnos) {
+        // plain case: a single bulk copy of all vectors
+        memcpy (codes, x, code_size * n);
+        return;
+    }
+    const size_t prefix_size = coarse_code_size ();
+    const size_t record_size = code_size + prefix_size;
+    uint8_t *record = codes;
+    const float *vec = x;
+    for (size_t i = 0; i < n; i++, record += record_size, vec += d) {
+        const int64_t list_no = list_nos [i];
+        if (list_no < 0) {
+            // unassigned vector: blank out the whole record
+            memset (record, 0, record_size);
+            continue;
+        }
+        encode_listno (list_no, record);
+        memcpy (record + prefix_size, vec, code_size);
+    }
+}
+/// Decodes n records of (coarse code + raw vector) from `bytes` into `x`,
+/// dropping the coarse-code prefix of each record.
+void IndexIVFFlat::sa_decode (idx_t n, const uint8_t *bytes,
+                                      float *x) const
+{
+    const size_t prefix_size = coarse_code_size ();
+    const size_t record_size = code_size + prefix_size;
+    size_t i = 0;
+    while (i < n) {
+        const uint8_t *payload = bytes + i * record_size + prefix_size;
+        memcpy (x + i * d, payload, code_size);
+        i++;
+    }
+}
+namespace {
+/** Scanner for inverted lists whose codes are raw float vectors.
+ *
+ * @tparam metric  METRIC_INNER_PRODUCT selects the inner product; any
+ *                 other value selects the squared L2 distance
+ * @tparam C       comparator used against the heap top / the radius
+ */
+template<MetricType metric, class C>
+struct IVFFlatScanner: InvertedListScanner {
+    size_t d;
+    bool store_pairs;
+    IVFFlatScanner(size_t d, bool store_pairs):
+        d(d), store_pairs(store_pairs) {}
+    /// current query; borrowed pointer, set by set_query
+    const float *xi = nullptr;
+    void set_query (const float *query) override {
+        this->xi = query;
+    }
+    /// list being scanned; only used to build labels when store_pairs is set
+    idx_t list_no = 0;
+    void set_list (idx_t list_no, float /* coarse_dis */) override {
+        this->list_no = list_no;
+    }
+    /// distance between the current query and one stored vector
+    float query_distance (const float *yj) const {
+        return metric == METRIC_INNER_PRODUCT ?
+            fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
+    }
+    /// label for entry j: (list_no, j) packed in upper/lower 32 bits when
+    /// store_pairs is set, the stored id otherwise
+    int64_t result_label (const idx_t *ids, size_t j) const {
+        return store_pairs ? (list_no << 32 | j) : ids[j];
+    }
+    float distance_to_code (const uint8_t *code) const override {
+        // the code is the vector itself; keep the pointee const
+        return query_distance (reinterpret_cast<const float*>(code));
+    }
+    /// scans one list and updates the k-sized heap (simi, idxi);
+    /// returns the number of heap updates
+    size_t scan_codes (size_t list_size,
+                       const uint8_t *codes,
+                       const idx_t *ids,
+                       float *simi, idx_t *idxi,
+                       size_t k) const override
+    {
+        const float *list_vecs = reinterpret_cast<const float*>(codes);
+        size_t nup = 0;
+        for (size_t j = 0; j < list_size; j++) {
+            float dis = query_distance (list_vecs + d * j);
+            if (C::cmp (simi[0], dis)) {
+                // candidate beats the heap top: replace it
+                heap_pop<C> (k, simi, idxi);
+                heap_push<C> (k, simi, idxi, dis, result_label (ids, j));
+                nup++;
+            }
+        }
+        return nup;
+    }
+    /// scans one list and reports every entry that passes C::cmp(radius, dis)
+    void scan_codes_range (size_t list_size,
+                           const uint8_t *codes,
+                           const idx_t *ids,
+                           float radius,
+                           RangeQueryResult & res) const override
+    {
+        const float *list_vecs = reinterpret_cast<const float*>(codes);
+        for (size_t j = 0; j < list_size; j++) {
+            float dis = query_distance (list_vecs + d * j);
+            if (C::cmp (radius, dis)) {
+                res.add (dis, result_label (ids, j));
+            }
+        }
+    }
+};
+} // anonymous namespace
+/// Allocates the scanner matching metric_type (inner product or L2);
+/// throws for any other metric. The result is created with `new`.
+InvertedListScanner* IndexIVFFlat::get_InvertedListScanner
+     (bool store_pairs) const
+{
+    switch (metric_type) {
+    case METRIC_INNER_PRODUCT:
+        return new IVFFlatScanner<
+            METRIC_INNER_PRODUCT, CMin<float, int64_t> > (d, store_pairs);
+    case METRIC_L2:
+        return new IVFFlatScanner<
+            METRIC_L2, CMax<float, int64_t> >(d, store_pairs);
+    default:
+        FAISS_THROW_MSG("metric type not supported");
+    }
+    return nullptr;
+}
+/** Replaces the stored vectors of existing ids.
+ *
+ * Requires a maintained direct map. For each id the old entry is removed
+ * from its inverted list (the last entry of that list is moved into the
+ * hole) and the new vector is appended to the list chosen by the quantizer.
+ *
+ * @param n        nb of vectors to update
+ * @param new_ids  ids to update, size n; each must be in [0, ntotal)
+ * @param x        replacement vectors, size n * d
+ */
+void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
+{
+    FAISS_THROW_IF_NOT (maintain_direct_map);
+    FAISS_THROW_IF_NOT (is_trained);
+    std::vector<idx_t> assign (n);
+    quantizer->assign (n, x, assign.data());
+    for (int i = 0; i < n; i++) {
+        idx_t id = new_ids[i];
+        FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal,
+                                "id to update out of range");
+        // Validate everything before touching the lists, so a rejected
+        // update never leaves the old entry removed without a replacement.
+        const int64_t new_il = assign[i];
+        FAISS_THROW_IF_NOT_MSG (new_il >= 0,
+                                "quantizer could not assign updated vector");
+        FAISS_THROW_IF_NOT_MSG ((size_t) id < direct_map.size() &&
+                                direct_map[id] >= 0,
+                                "id has no valid entry in the direct map");
+        { // remove old one
+            int64_t dm = direct_map[id];
+            int64_t ofs = dm & 0xffffffff;
+            int64_t il = dm >> 32;
+            size_t l = invlists->list_size (il);
+            if (ofs != l - 1) { // move l - 1 to ofs
+                int64_t id2 = invlists->get_single_id (il, l - 1);
+                direct_map[id2] = (il << 32) | ofs;
+                invlists->update_entry (il, ofs, id2,
+                                        invlists->get_single_code (il, l - 1));
+            }
+            invlists->resize (il, l - 1);
+        }
+        { // insert new one
+            size_t l = invlists->list_size (new_il);
+            int64_t dm = (new_il << 32) | l;
+            direct_map[id] = dm;
+            // widen before multiplying: i * d must not be computed in int
+            invlists->add_entry (new_il, id,
+                                 (const uint8_t*)(x + (size_t) i * d));
+        }
+    }
+}
+/// Copies the code stored at (list_no, offset) into recons; for this index
+/// the code is code_size bytes of raw vector data.
+void IndexIVFFlat::reconstruct_from_offset (int64_t list_no, int64_t offset,
+                                            float* recons) const
+{
+    const void *stored = invlists->get_single_code (list_no, offset);
+    memcpy (recons, stored, code_size);
+}
+/*****************************************
+ * IndexIVFFlatDedup implementation
+ ******************************************/
+/// Forwards every argument to the IndexIVFFlat constructor; nothing else
+/// is set up here.
+IndexIVFFlatDedup::IndexIVFFlatDedup (
+            Index * quantizer, size_t d, size_t nlist_,
+            MetricType metric_type)
+    : IndexIVFFlat (quantizer, d, nlist_, metric_type)
+{
+}
+/** Trains on a de-duplicated copy of the training set.
+ *
+ * Vectors are hashed byte-wise; a vector is dropped when the vector last
+ * recorded under the same hash is byte-identical to it. The remaining
+ * vectors are passed to IndexIVFFlat::train.
+ *
+ * @param n  nb of training vectors
+ * @param x  training vectors, size n * d
+ */
+void IndexIVFFlatDedup::train(idx_t n, const float* x)
+{
+    std::unordered_map<uint64_t, idx_t> map;
+    float * x2 = new float [n * d];
+    ScopeDeleter<float> del (x2);
+    int64_t n2 = 0;
+    for (int64_t i = 0; i < n; i++) {
+        const float *xi = x + i * d;
+        uint64_t hash = hash_bytes((uint8_t *)xi, code_size);
+        // single lookup instead of count() followed by operator[]
+        auto seen = map.find (hash);
+        if (seen != map.end() &&
+            !memcmp (x2 + seen->second * d, xi, code_size)) {
+            // is duplicate, skip
+            continue;
+        }
+        // new vector (or hash collision): remember where it was kept
+        map [hash] = n2;
+        memcpy (x2 + n2 * d, xi, code_size);
+        n2 ++;
+    }
+    if (verbose) {
+        // int64_t is not `long` on every platform: print through long long
+        printf ("IndexIVFFlatDedup::train: train on %lld points after dedup "
+                "(was %lld points)\n", (long long) n2, (long long) n);
+    }
+    IndexIVFFlat::train (n2, x2);
+}
+// Add na vectors, storing each distinct vector only once per inverted list.
+//   na:   number of vectors in x
+//   x:    na vectors of d floats each
+//   xids: ids of the vectors; when null, vector i gets id ntotal + i
+// Requires is_trained and throws when maintain_direct_map is set.
+// A vector whose code_size bytes equal an entry already stored in its
+// list is not stored again: the pair (id of the stored entry, new id) is
+// recorded in `instances`. Vectors with a negative list number from the
+// quantizer are skipped. ntotal grows by the number of vectors that were
+// stored or recorded as duplicates.
+void IndexIVFFlatDedup::add_with_ids(
+           idx_t na, const float* x, const idx_t* xids)
+{
+    FAISS_THROW_IF_NOT (is_trained);
+    assert (invlists);
+    FAISS_THROW_IF_NOT_MSG (
+           !maintain_direct_map,
+           "IVFFlatDedup not implemented with direct_map");
+    int64_t * idx = new int64_t [na];
+    ScopeDeleter<int64_t> del (idx);
+    quantizer->assign (na, x, idx);
+    int64_t n_add = 0, n_dup = 0;
+    // TODO make a omp loop with this
+    // signed index, as in train(), so the comparison with na is not
+    // a mixed signed/unsigned one
+    for (int64_t i = 0; i < na; i++) {
+        idx_t id = xids ? xids[i] : ntotal + i;
+        int64_t list_no = idx [i];
+        if (list_no < 0) {
+            continue;
+        }
+        const float *xi = x + i * d;
+        // search the list for an entry holding exactly the same bytes
+        InvertedLists::ScopedCodes codes (invlists, list_no);
+        int64_t n = invlists->list_size (list_no);
+        int64_t offset = -1;
+        for (int64_t o = 0; o < n; o++) {
+            if (!memcmp (codes.get() + o * code_size,
+                         xi, code_size)) {
+                offset = o;
+                break;
+            }
+        }
+        if (offset == -1) { // not found
+            invlists->add_entry (list_no, id, (const uint8_t*) xi);
+        } else {
+            // mark equivalence
+            idx_t id2 = invlists->get_single_id (list_no, offset);
+            std::pair<idx_t, idx_t> pair (id2, id);
+            instances.insert (pair);
+            n_dup ++;
+        }
+        n_add++;
+    }
+    if (verbose) {
+        // int64_t is not `long` on every platform, so "%ld" is not a
+        // portable match; print through long long instead
+        printf("IndexIVFFlat::add_with_ids: added %lld / %lld vectors"
+               " (out of which %lld are duplicates)\n",
+               (long long) n_add, (long long) na, (long long) n_dup);
+    }
+    ntotal += n_add;
+}
+// Run IndexIVFFlat::search_preassigned, then expand the results so that
+// ids recorded in `instances` as duplicates of a returned id also appear.
+// For each query, from the first result that has duplicates onward, every
+// result is followed by its duplicates (with the same distance) until the
+// k output slots are full; results that no longer fit are dropped.
+// Throws when store_pairs is set.
+void IndexIVFFlatDedup::search_preassigned (
+           idx_t n, const float *x, idx_t k,
+           const idx_t *assign,
+           const float *centroid_dis,
+           float *distances, idx_t *labels,
+           bool store_pairs,
+           const IVFSearchParameters *params) const
+{
+    FAISS_THROW_IF_NOT_MSG (
+           !store_pairs, "store_pairs not supported in IVFDedup");
+    IndexIVFFlat::search_preassigned (n, x, k, assign, centroid_dis,
+                                      distances, labels, false,
+                                      params);
+    // scratch rows reused for every query
+    std::vector <idx_t> expanded_ids (k);
+    std::vector <float> expanded_dis (k);
+    for (int64_t q = 0; q < n; q++) {
+        idx_t *row_ids = labels + q * k;
+        float *row_dis = distances + q * k;
+        // position of the first result that has recorded duplicates
+        int64_t first_dup = 0;
+        while (first_dup < k &&
+               instances.find (row_ids[first_dup]) == instances.end ()) {
+            first_dup++;
+        }
+        if (first_dup >= k) {
+            // nothing to expand for this query
+            continue;
+        }
+        // wr: next slot to fill in the scratch rows
+        // rd: next original result to consume
+        int64_t wr = first_dup;
+        int64_t rd = first_dup;
+        while (wr < k) {
+            auto range = instances.equal_range (row_ids[rd]);
+            float dis = row_dis[rd];
+            expanded_ids[wr] = row_ids[rd];
+            expanded_dis[wr] = dis;
+            wr++;
+            auto it = range.first;
+            while (wr < k && it != range.second) {
+                expanded_ids[wr] = it->second;
+                expanded_dis[wr] = dis;
+                wr++;
+                ++it;
+            }
+            rd++;
+        }
+        // write the expanded tail back over the original results
+        memcpy (row_ids + first_dup, expanded_ids.data() + first_dup,
+                sizeof(row_ids[0]) * (k - first_dup));
+        memcpy (row_dis + first_dup, expanded_dis.data() + first_dup,
+                sizeof(expanded_dis[0]) * (k - first_dup));
+    }
+}
+// Remove every id selected by sel and return the number of inverted-list
+// entries that were removed (ntotal is decreased by the same amount).
+// When a stored id is removed but one of the duplicates recorded for it in
+// `instances` is kept, the stored entry is not deleted: its id is replaced
+// by that duplicate, and the other kept duplicates are re-attached to it.
+// Throws when maintain_direct_map is set.
+size_t IndexIVFFlatDedup::remove_ids(const IDSelector& sel)
+{
+    // checked before anything is modified, so that a failure leaves
+    // `instances` untouched
+    FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
+                    "direct map remove not implemented");
+    // removed stored id -> kept duplicate that takes over its entry
+    std::unordered_map<idx_t, idx_t> replace;
+    std::vector<std::pair<idx_t, idx_t> > toadd;
+    for (auto it = instances.begin(); it != instances.end(); ) {
+        if (sel.is_member(it->first)) {
+            // then we erase this entry
+            if (!sel.is_member(it->second)) {
+                // if the second is not erased
+                auto rep = replace.find (it->first);
+                if (rep == replace.end ()) {
+                    replace[it->first] = it->second;
+                } else { // remember we should add an element
+                    std::pair<idx_t, idx_t> new_entry (
+                          rep->second, it->second);
+                    toadd.push_back(new_entry);
+                }
+            }
+            it = instances.erase(it);
+        } else {
+            if (sel.is_member(it->second)) {
+                it = instances.erase(it);
+            } else {
+                ++it;
+            }
+        }
+    }
+    instances.insert (toadd.begin(), toadd.end());
+    // mostly copied from IndexIVF.cpp
+    std::vector<int64_t> toremove(nlist);
+    // The parallel loop below only looks entries up. It goes through a
+    // const reference and find(), because the non-const operator[] of
+    // unordered_map is not a read-only access for concurrent callers.
+    const std::unordered_map<idx_t, idx_t> & replace_ro = replace;
+#pragma omp parallel for
+    for (int64_t i = 0; i < nlist; i++) {
+        int64_t l0 = invlists->list_size (i), l = l0, j = 0;
+        InvertedLists::ScopedIds idsi (invlists, i);
+        while (j < l) {
+            if (sel.is_member (idsi[j])) {
+                auto rep = replace_ro.find (idsi[j]);
+                if (rep == replace_ro.end ()) {
+                    // no surviving duplicate: overwrite slot j with the
+                    // last live entry and shrink the live range; j is
+                    // not advanced so the moved entry is examined too
+                    l--;
+                    invlists->update_entry (
+                        i, j,
+                        invlists->get_single_id (i, l),
+                        InvertedLists::ScopedCodes (invlists, i, l).get());
+                } else {
+                    // keep the code, only swap in the surviving id
+                    invlists->update_entry (
+                        i, j,
+                        rep->second,
+                        InvertedLists::ScopedCodes (invlists, i, j).get());
+                    j++;
+                }
+            } else {
+                j++;
+            }
+        }
+        toremove[i] = l0 - l;
+    }
+    // this will not run well in parallel on ondisk because of possible shrinks
+    int64_t nremove = 0;
+    for (int64_t i = 0; i < nlist; i++) {
+        if (toremove[i] > 0) {
+            nremove += toremove[i];
+            invlists->resize(
+                i, invlists->list_size(i) - toremove[i]);
+        }
+    }
+    ntotal -= nremove;
+    return nremove;
+}
+// Not supported by this class: every argument is ignored and the call
+// unconditionally raises through FAISS_THROW_MSG ("not implemented").
+void IndexIVFFlatDedup::range_search(
+        idx_t, const float*, float, RangeSearchResult*) const
+{
+    FAISS_THROW_MSG ("not implemented");
+}
+// Not supported by this class: every argument is ignored and the call
+// unconditionally raises through FAISS_THROW_MSG ("not implemented").
+void IndexIVFFlatDedup::update_vectors (
+         int, idx_t *, const float *)
+{
+    FAISS_THROW_MSG ("not implemented");
+}
+// Not supported by this class: every argument is ignored and the call
+// unconditionally raises through FAISS_THROW_MSG ("not implemented").
+void IndexIVFFlatDedup::reconstruct_from_offset (int64_t, int64_t, float*) const
+{
+    FAISS_THROW_MSG ("not implemented");
+}
+} // namespace faiss