hnswlib 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/build.yml +20 -0
- data/.gitignore +18 -0
- data/.rspec +3 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +10 -0
- data/LICENSE.txt +176 -0
- data/README.md +56 -0
- data/Rakefile +17 -0
- data/ext/hnswlib/extconf.rb +11 -0
- data/ext/hnswlib/hnswlibext.cpp +29 -0
- data/ext/hnswlib/hnswlibext.hpp +420 -0
- data/ext/hnswlib/src/LICENSE +201 -0
- data/ext/hnswlib/src/bruteforce.h +152 -0
- data/ext/hnswlib/src/hnswalg.h +1192 -0
- data/ext/hnswlib/src/hnswlib.h +108 -0
- data/ext/hnswlib/src/space_ip.h +282 -0
- data/ext/hnswlib/src/space_l2.h +281 -0
- data/ext/hnswlib/src/visited_list_pool.h +78 -0
- data/hnswlib.gemspec +35 -0
- data/lib/hnswlib.rb +154 -0
- data/lib/hnswlib/version.rb +9 -0
- metadata +69 -0
| @@ -0,0 +1,108 @@ | |
| 1 | 
            +
            #pragma once
         | 
| 2 | 
            +
            #ifndef NO_MANUAL_VECTORIZATION
         | 
| 3 | 
            +
            #ifdef __SSE__
         | 
| 4 | 
            +
            #define USE_SSE
         | 
| 5 | 
            +
            #ifdef __AVX__
         | 
| 6 | 
            +
            #define USE_AVX
         | 
| 7 | 
            +
            #endif
         | 
| 8 | 
            +
            #endif
         | 
| 9 | 
            +
            #endif
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            #if defined(USE_AVX) || defined(USE_SSE)
         | 
| 12 | 
            +
            #ifdef _MSC_VER
         | 
| 13 | 
            +
            #include <intrin.h>
         | 
| 14 | 
            +
            #include <stdexcept>
         | 
| 15 | 
            +
            #else
         | 
| 16 | 
            +
            #include <x86intrin.h>
         | 
| 17 | 
            +
            #endif
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            #if defined(__GNUC__)
         | 
| 20 | 
            +
            #define PORTABLE_ALIGN32 __attribute__((aligned(32)))
         | 
| 21 | 
            +
            #else
         | 
| 22 | 
            +
            #define PORTABLE_ALIGN32 __declspec(align(32))
         | 
| 23 | 
            +
            #endif
         | 
| 24 | 
            +
            #endif
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            #include <queue>
         | 
| 27 | 
            +
            #include <vector>
         | 
| 28 | 
            +
            #include <iostream>
         | 
| 29 | 
            +
            #include <string.h>
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            namespace hnswlib {
         | 
| 32 | 
            +
                // Type of the label supplied to addPoint() and paired with each distance in search results.
                using labeltype = size_t;
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                /**
                 * Comparison functor for pair-like values: returns true when the `first`
                 * member of the left operand is strictly greater than that of the right
                 * operand. The `second` member never takes part in the comparison.
                 *
                 * The call operator is const-qualified so the functor can be invoked
                 * through const objects and const references.
                 *
                 * @tparam T type exposing a `first` member comparable with operator>.
                 */
                template <typename T>
                class pairGreater {
                public:
                    bool operator()(const T& p1, const T& p2) const {
                        return p1.first > p2.first;
                    }
                };
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                /**
                 * Writes the in-memory bytes of `podRef` to `out`.
                 *
                 * Exactly sizeof(T) bytes are handed to the stream as they are laid out in
                 * memory; no conversion is applied and the stream state is not inspected.
                 *
                 * @param out    destination stream.
                 * @param podRef object whose bytes are written.
                 */
                template<typename T>
                static void writeBinaryPOD(std::ostream &out, const T &podRef) {
                    const char *rawBytes = reinterpret_cast<const char *>(&podRef);
                    out.write(rawBytes, sizeof(T));
                }
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                /**
                 * Fills `podRef` with the next sizeof(T) bytes taken from `in`.
                 *
                 * The bytes are copied straight into the object's storage; no conversion is
                 * applied and the stream state is not inspected afterwards.
                 *
                 * @param in     source stream.
                 * @param podRef object that receives the bytes.
                 */
                template<typename T>
                static void readBinaryPOD(std::istream &in, T &podRef) {
                    char *rawBytes = reinterpret_cast<char *>(&podRef);
                    in.read(rawBytes, sizeof(T));
                }
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                /**
                 * Pointer to a distance function. The function receives three opaque
                 * pointers and returns a value of type MTYPE.
                 */
                template<typename MTYPE>
                using DISTFUNC = MTYPE (*)(const void *, const void *, const void *);

                /**
                 * Abstract description of a space: it reports a data size and hands out
                 * a distance function together with the opaque parameter pointer.
                 *
                 * @tparam MTYPE value type returned by the distance function.
                 */
                template<typename MTYPE>
                class SpaceInterface {
                public:
                    //virtual void search(void *);

                    /// Data size reported by the concrete space.
                    virtual size_t get_data_size() = 0;

                    /// Distance function provided by the concrete space.
                    virtual DISTFUNC<MTYPE> get_dist_func() = 0;

                    /// Opaque parameter pointer provided by the concrete space.
                    virtual void *get_dist_func_param() = 0;

                    virtual ~SpaceInterface() = default;
                };
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                template<typename dist_t>
         | 
| 70 | 
            +
                class AlgorithmInterface {
         | 
| 71 | 
            +
                public:
         | 
| 72 | 
            +
                    virtual void addPoint(const void *datapoint, labeltype label)=0;
         | 
| 73 | 
            +
                    virtual std::priority_queue<std::pair<dist_t, labeltype >> searchKnn(const void *, size_t) const = 0;
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                    // Return k nearest neighbor in the order of closer fist
         | 
| 76 | 
            +
                    virtual std::vector<std::pair<dist_t, labeltype>>
         | 
| 77 | 
            +
                        searchKnnCloserFirst(const void* query_data, size_t k) const;
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                    virtual void saveIndex(const std::string &location)=0;
         | 
| 80 | 
            +
                    virtual ~AlgorithmInterface(){
         | 
| 81 | 
            +
                    }
         | 
| 82 | 
            +
                };
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                template<typename dist_t>
         | 
| 85 | 
            +
                std::vector<std::pair<dist_t, labeltype>>
         | 
| 86 | 
            +
                AlgorithmInterface<dist_t>::searchKnnCloserFirst(const void* query_data, size_t k) const {
         | 
| 87 | 
            +
                    std::vector<std::pair<dist_t, labeltype>> result;
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                    // here searchKnn returns the result in the order of further first
         | 
| 90 | 
            +
                    auto ret = searchKnn(query_data, k);
         | 
| 91 | 
            +
                    {
         | 
| 92 | 
            +
                        size_t sz = ret.size();
         | 
| 93 | 
            +
                        result.resize(sz);
         | 
| 94 | 
            +
                        while (!ret.empty()) {
         | 
| 95 | 
            +
                            result[--sz] = ret.top();
         | 
| 96 | 
            +
                            ret.pop();
         | 
| 97 | 
            +
                        }
         | 
| 98 | 
            +
                    }
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                    return result;
         | 
| 101 | 
            +
                }
         | 
| 102 | 
            +
             | 
| 103 | 
            +
            }
         | 
| 104 | 
            +
             | 
| 105 | 
            +
            #include "space_l2.h"
         | 
| 106 | 
            +
            #include "space_ip.h"
         | 
| 107 | 
            +
            #include "bruteforce.h"
         | 
| 108 | 
            +
            #include "hnswalg.h"
         | 
| @@ -0,0 +1,282 @@ | |
| 1 | 
            +
            #pragma once
         | 
| 2 | 
            +
            #include "hnswlib.h"
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            namespace hnswlib {
         | 
| 5 | 
            +
             | 
| 6 | 
            +
                /**
                 * Scalar inner-product distance between two float arrays.
                 *
                 * @param pVect1  first array, read as `const float *`.
                 * @param pVect2  second array, read as `const float *`.
                 * @param qty_ptr points to a `size_t` holding the number of elements to use.
                 * @return 1.0f minus the sum of the element-wise products (1.0f when the
                 *         element count is zero).
                 */
                static float
                InnerProduct(const void *pVect1, const void *pVect2, const void *qty_ptr) {
                    const float *lhs = static_cast<const float *>(pVect1);
                    const float *rhs = static_cast<const float *>(pVect2);
                    const size_t qty = *static_cast<const size_t *>(qty_ptr);

                    float res = 0;
                    // The index has the same type as qty: a narrower `unsigned` counter would
                    // wrap around before reaching a bound larger than UINT_MAX.
                    for (size_t i = 0; i < qty; i++) {
                        res += lhs[i] * rhs[i];
                    }
                    return 1.0f - res;
                }
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            #if defined(USE_AVX)
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            // Favor using AVX if available.
         | 
| 20 | 
            +
                static float
         | 
| 21 | 
            +
                InnerProductSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 22 | 
            +
                    float PORTABLE_ALIGN32 TmpRes[8];
         | 
| 23 | 
            +
                    float *pVect1 = (float *) pVect1v;
         | 
| 24 | 
            +
                    float *pVect2 = (float *) pVect2v;
         | 
| 25 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                    size_t qty16 = qty / 16;
         | 
| 28 | 
            +
                    size_t qty4 = qty / 4;
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                    const float *pEnd1 = pVect1 + 16 * qty16;
         | 
| 31 | 
            +
                    const float *pEnd2 = pVect1 + 4 * qty4;
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                    __m256 sum256 = _mm256_set1_ps(0);
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    while (pVect1 < pEnd1) {
         | 
| 36 | 
            +
                        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                        __m256 v1 = _mm256_loadu_ps(pVect1);
         | 
| 39 | 
            +
                        pVect1 += 8;
         | 
| 40 | 
            +
                        __m256 v2 = _mm256_loadu_ps(pVect2);
         | 
| 41 | 
            +
                        pVect2 += 8;
         | 
| 42 | 
            +
                        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                        v1 = _mm256_loadu_ps(pVect1);
         | 
| 45 | 
            +
                        pVect1 += 8;
         | 
| 46 | 
            +
                        v2 = _mm256_loadu_ps(pVect2);
         | 
| 47 | 
            +
                        pVect2 += 8;
         | 
| 48 | 
            +
                        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
         | 
| 49 | 
            +
                    }
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                    __m128 v1, v2;
         | 
| 52 | 
            +
                    __m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                    while (pVect1 < pEnd2) {
         | 
| 55 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 56 | 
            +
                        pVect1 += 4;
         | 
| 57 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 58 | 
            +
                        pVect2 += 4;
         | 
| 59 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 60 | 
            +
                    }
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                    _mm_store_ps(TmpRes, sum_prod);
         | 
| 63 | 
            +
                    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];;
         | 
| 64 | 
            +
                    return 1.0f - sum;
         | 
| 65 | 
            +
            }
         | 
| 66 | 
            +
             | 
| 67 | 
            +
            #elif defined(USE_SSE)
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                static float
         | 
| 70 | 
            +
                InnerProductSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 71 | 
            +
                    float PORTABLE_ALIGN32 TmpRes[8];
         | 
| 72 | 
            +
                    float *pVect1 = (float *) pVect1v;
         | 
| 73 | 
            +
                    float *pVect2 = (float *) pVect2v;
         | 
| 74 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                    size_t qty16 = qty / 16;
         | 
| 77 | 
            +
                    size_t qty4 = qty / 4;
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                    const float *pEnd1 = pVect1 + 16 * qty16;
         | 
| 80 | 
            +
                    const float *pEnd2 = pVect1 + 4 * qty4;
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                    __m128 v1, v2;
         | 
| 83 | 
            +
                    __m128 sum_prod = _mm_set1_ps(0);
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                    while (pVect1 < pEnd1) {
         | 
| 86 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 87 | 
            +
                        pVect1 += 4;
         | 
| 88 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 89 | 
            +
                        pVect2 += 4;
         | 
| 90 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 93 | 
            +
                        pVect1 += 4;
         | 
| 94 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 95 | 
            +
                        pVect2 += 4;
         | 
| 96 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 99 | 
            +
                        pVect1 += 4;
         | 
| 100 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 101 | 
            +
                        pVect2 += 4;
         | 
| 102 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 105 | 
            +
                        pVect1 += 4;
         | 
| 106 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 107 | 
            +
                        pVect2 += 4;
         | 
| 108 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 109 | 
            +
                    }
         | 
| 110 | 
            +
             | 
| 111 | 
            +
                    while (pVect1 < pEnd2) {
         | 
| 112 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 113 | 
            +
                        pVect1 += 4;
         | 
| 114 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 115 | 
            +
                        pVect2 += 4;
         | 
| 116 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 117 | 
            +
                    }
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                    _mm_store_ps(TmpRes, sum_prod);
         | 
| 120 | 
            +
                    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                    return 1.0f - sum;
         | 
| 123 | 
            +
                }
         | 
| 124 | 
            +
             | 
| 125 | 
            +
            #endif
         | 
| 126 | 
            +
             | 
| 127 | 
            +
            #if defined(USE_AVX)
         | 
| 128 | 
            +
             | 
| 129 | 
            +
                static float
         | 
| 130 | 
            +
                InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 131 | 
            +
                    float PORTABLE_ALIGN32 TmpRes[8];
         | 
| 132 | 
            +
                    float *pVect1 = (float *) pVect1v;
         | 
| 133 | 
            +
                    float *pVect2 = (float *) pVect2v;
         | 
| 134 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                    size_t qty16 = qty / 16;
         | 
| 137 | 
            +
             | 
| 138 | 
            +
             | 
| 139 | 
            +
                    const float *pEnd1 = pVect1 + 16 * qty16;
         | 
| 140 | 
            +
             | 
| 141 | 
            +
                    __m256 sum256 = _mm256_set1_ps(0);
         | 
| 142 | 
            +
             | 
| 143 | 
            +
                    while (pVect1 < pEnd1) {
         | 
| 144 | 
            +
                        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                        __m256 v1 = _mm256_loadu_ps(pVect1);
         | 
| 147 | 
            +
                        pVect1 += 8;
         | 
| 148 | 
            +
                        __m256 v2 = _mm256_loadu_ps(pVect2);
         | 
| 149 | 
            +
                        pVect2 += 8;
         | 
| 150 | 
            +
                        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
         | 
| 151 | 
            +
             | 
| 152 | 
            +
                        v1 = _mm256_loadu_ps(pVect1);
         | 
| 153 | 
            +
                        pVect1 += 8;
         | 
| 154 | 
            +
                        v2 = _mm256_loadu_ps(pVect2);
         | 
| 155 | 
            +
                        pVect2 += 8;
         | 
| 156 | 
            +
                        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
         | 
| 157 | 
            +
                    }
         | 
| 158 | 
            +
             | 
| 159 | 
            +
                    _mm256_store_ps(TmpRes, sum256);
         | 
| 160 | 
            +
                    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
         | 
| 161 | 
            +
             | 
| 162 | 
            +
                    return 1.0f - sum;
         | 
| 163 | 
            +
                }
         | 
| 164 | 
            +
             | 
| 165 | 
            +
            #elif defined(USE_SSE)
         | 
| 166 | 
            +
             | 
| 167 | 
            +
                  static float
         | 
| 168 | 
            +
                  InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 169 | 
            +
                    float PORTABLE_ALIGN32 TmpRes[8];
         | 
| 170 | 
            +
                    float *pVect1 = (float *) pVect1v;
         | 
| 171 | 
            +
                    float *pVect2 = (float *) pVect2v;
         | 
| 172 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 173 | 
            +
             | 
| 174 | 
            +
                    size_t qty16 = qty / 16;
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                    const float *pEnd1 = pVect1 + 16 * qty16;
         | 
| 177 | 
            +
             | 
| 178 | 
            +
                    __m128 v1, v2;
         | 
| 179 | 
            +
                    __m128 sum_prod = _mm_set1_ps(0);
         | 
| 180 | 
            +
             | 
| 181 | 
            +
                    while (pVect1 < pEnd1) {
         | 
| 182 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 183 | 
            +
                        pVect1 += 4;
         | 
| 184 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 185 | 
            +
                        pVect2 += 4;
         | 
| 186 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 187 | 
            +
             | 
| 188 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 189 | 
            +
                        pVect1 += 4;
         | 
| 190 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 191 | 
            +
                        pVect2 += 4;
         | 
| 192 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 193 | 
            +
             | 
| 194 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 195 | 
            +
                        pVect1 += 4;
         | 
| 196 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 197 | 
            +
                        pVect2 += 4;
         | 
| 198 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 199 | 
            +
             | 
| 200 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 201 | 
            +
                        pVect1 += 4;
         | 
| 202 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 203 | 
            +
                        pVect2 += 4;
         | 
| 204 | 
            +
                        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         | 
| 205 | 
            +
                    }
         | 
| 206 | 
            +
                    _mm_store_ps(TmpRes, sum_prod);
         | 
| 207 | 
            +
                    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                    return 1.0f - sum;
         | 
| 210 | 
            +
                }
         | 
| 211 | 
            +
             | 
| 212 | 
            +
            #endif
         | 
| 213 | 
            +
             | 
| 214 | 
            +
            #if defined(USE_SSE) || defined(USE_AVX)
         | 
| 215 | 
            +
                static float
         | 
| 216 | 
            +
                InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 217 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 218 | 
            +
                    size_t qty16 = qty >> 4 << 4;
         | 
| 219 | 
            +
                    float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
         | 
| 220 | 
            +
                    float *pVect1 = (float *) pVect1v + qty16;
         | 
| 221 | 
            +
                    float *pVect2 = (float *) pVect2v + qty16;
         | 
| 222 | 
            +
             | 
| 223 | 
            +
                    size_t qty_left = qty - qty16;
         | 
| 224 | 
            +
                    float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
         | 
| 225 | 
            +
                    return res + res_tail - 1.0f;
         | 
| 226 | 
            +
                }
         | 
| 227 | 
            +
             | 
| 228 | 
            +
                static float
         | 
| 229 | 
            +
                InnerProductSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 230 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 231 | 
            +
                    size_t qty4 = qty >> 2 << 2;
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                    float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
         | 
| 234 | 
            +
                    size_t qty_left = qty - qty4;
         | 
| 235 | 
            +
             | 
| 236 | 
            +
                    float *pVect1 = (float *) pVect1v + qty4;
         | 
| 237 | 
            +
                    float *pVect2 = (float *) pVect2v + qty4;
         | 
| 238 | 
            +
                    float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
         | 
| 239 | 
            +
             | 
| 240 | 
            +
                    return res + res_tail - 1.0f;
         | 
| 241 | 
            +
                }
         | 
| 242 | 
            +
            #endif
         | 
| 243 | 
            +
             | 
| 244 | 
            +
                /**
                 * SpaceInterface implementation for float vectors compared with the
                 * inner-product distance (1 - dot product).
                 *
                 * The constructor picks the distance routine once, based on the dimension
                 * and on the SIMD support detected at compile time.
                 */
                class InnerProductSpace : public SpaceInterface<float> {

                    DISTFUNC<float> fstdistfunc_;  // selected distance routine
                    size_t data_size_;             // dim * sizeof(float)
                    size_t dim_;                   // element count; its address is handed out as the parameter

                public:
                    /**
                     * @param dim number of float components per vector.
                     */
                    InnerProductSpace(size_t dim)
                        : fstdistfunc_(InnerProduct), data_size_(dim * sizeof(float)), dim_(dim) {
                #if defined(USE_AVX) || defined(USE_SSE)
                        // Whole blocks of 16 or 4 use the plain SIMD kernels. Other dimensions
                        // use a SIMD kernel plus a scalar tail when large enough, and stay on
                        // the scalar routine otherwise.
                        if (dim % 16 == 0)
                            fstdistfunc_ = InnerProductSIMD16Ext;
                        else if (dim % 4 == 0)
                            fstdistfunc_ = InnerProductSIMD4Ext;
                        else if (dim > 16)
                            fstdistfunc_ = InnerProductSIMD16ExtResiduals;
                        else if (dim > 4)
                            fstdistfunc_ = InnerProductSIMD4ExtResiduals;
                #endif
                    }

                    /// Returns dim * sizeof(float).
                    size_t get_data_size() override {
                        return data_size_;
                    }

                    /// Returns the distance routine chosen in the constructor.
                    DISTFUNC<float> get_dist_func() override {
                        return fstdistfunc_;
                    }

                    /// Returns the address of the stored dimension; the distance routines
                    /// read it back as a `size_t`. The pointer refers to a member of this object.
                    void *get_dist_func_param() override {
                        return &dim_;
                    }

                    ~InnerProductSpace() override {}
                };
         | 
| 280 | 
            +
             | 
| 281 | 
            +
             | 
| 282 | 
            +
            }
         | 
| @@ -0,0 +1,281 @@ | |
| 1 | 
            +
            #pragma once
         | 
| 2 | 
            +
            #include "hnswlib.h"
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            namespace hnswlib {
         | 
| 5 | 
            +
             | 
| 6 | 
            +
                /**
                 * Scalar squared Euclidean distance between two float arrays.
                 *
                 * @param pVect1v first array, read as `const float *`.
                 * @param pVect2v second array, read as `const float *`.
                 * @param qty_ptr points to a `size_t` holding the number of elements to use.
                 * @return sum of the squared element-wise differences (0 for zero elements).
                 */
                static float
                L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
                    const float *a = static_cast<const float *>(pVect1v);
                    const float *b = static_cast<const float *>(pVect2v);
                    const size_t count = *static_cast<const size_t *>(qty_ptr);

                    float acc = 0;
                    for (size_t idx = 0; idx != count; ++idx) {
                        const float delta = a[idx] - b[idx];
                        acc += delta * delta;
                    }
                    return acc;
                }
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            #if defined(USE_AVX)
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                // Favor using AVX if available.
         | 
| 25 | 
            +
                static float
         | 
| 26 | 
            +
                L2SqrSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 27 | 
            +
                    float *pVect1 = (float *) pVect1v;
         | 
| 28 | 
            +
                    float *pVect2 = (float *) pVect2v;
         | 
| 29 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 30 | 
            +
                    float PORTABLE_ALIGN32 TmpRes[8];
         | 
| 31 | 
            +
                    size_t qty16 = qty >> 4;
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                    const float *pEnd1 = pVect1 + (qty16 << 4);
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    __m256 diff, v1, v2;
         | 
| 36 | 
            +
                    __m256 sum = _mm256_set1_ps(0);
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                    while (pVect1 < pEnd1) {
         | 
| 39 | 
            +
                        v1 = _mm256_loadu_ps(pVect1);
         | 
| 40 | 
            +
                        pVect1 += 8;
         | 
| 41 | 
            +
                        v2 = _mm256_loadu_ps(pVect2);
         | 
| 42 | 
            +
                        pVect2 += 8;
         | 
| 43 | 
            +
                        diff = _mm256_sub_ps(v1, v2);
         | 
| 44 | 
            +
                        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                        v1 = _mm256_loadu_ps(pVect1);
         | 
| 47 | 
            +
                        pVect1 += 8;
         | 
| 48 | 
            +
                        v2 = _mm256_loadu_ps(pVect2);
         | 
| 49 | 
            +
                        pVect2 += 8;
         | 
| 50 | 
            +
                        diff = _mm256_sub_ps(v1, v2);
         | 
| 51 | 
            +
                        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
         | 
| 52 | 
            +
                    }
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                    _mm256_store_ps(TmpRes, sum);
         | 
| 55 | 
            +
                    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
         | 
| 56 | 
            +
                }
         | 
| 57 | 
            +
             | 
| 58 | 
            +
            #elif defined(USE_SSE)
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                static float
         | 
| 61 | 
            +
                L2SqrSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 62 | 
            +
                    float *pVect1 = (float *) pVect1v;
         | 
| 63 | 
            +
                    float *pVect2 = (float *) pVect2v;
         | 
| 64 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 65 | 
            +
                    float PORTABLE_ALIGN32 TmpRes[8];
         | 
| 66 | 
            +
                    size_t qty16 = qty >> 4;
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                    const float *pEnd1 = pVect1 + (qty16 << 4);
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                    __m128 diff, v1, v2;
         | 
| 71 | 
            +
                    __m128 sum = _mm_set1_ps(0);
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                    while (pVect1 < pEnd1) {
         | 
| 74 | 
            +
                        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
         | 
| 75 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 76 | 
            +
                        pVect1 += 4;
         | 
| 77 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 78 | 
            +
                        pVect2 += 4;
         | 
| 79 | 
            +
                        diff = _mm_sub_ps(v1, v2);
         | 
| 80 | 
            +
                        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 83 | 
            +
                        pVect1 += 4;
         | 
| 84 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 85 | 
            +
                        pVect2 += 4;
         | 
| 86 | 
            +
                        diff = _mm_sub_ps(v1, v2);
         | 
| 87 | 
            +
                        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 90 | 
            +
                        pVect1 += 4;
         | 
| 91 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 92 | 
            +
                        pVect2 += 4;
         | 
| 93 | 
            +
                        diff = _mm_sub_ps(v1, v2);
         | 
| 94 | 
            +
                        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 97 | 
            +
                        pVect1 += 4;
         | 
| 98 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 99 | 
            +
                        pVect2 += 4;
         | 
| 100 | 
            +
                        diff = _mm_sub_ps(v1, v2);
         | 
| 101 | 
            +
                        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
         | 
| 102 | 
            +
                    }
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                    _mm_store_ps(TmpRes, sum);
         | 
| 105 | 
            +
                    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
         | 
| 106 | 
            +
                }
         | 
| 107 | 
            +
            #endif
         | 
| 108 | 
            +
             | 
| 109 | 
            +
            #if defined(USE_SSE) || defined(USE_AVX)
         | 
| 110 | 
            +
                static float
         | 
| 111 | 
            +
                L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 112 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 113 | 
            +
                    size_t qty16 = qty >> 4 << 4;
         | 
| 114 | 
            +
                    float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16);
         | 
| 115 | 
            +
                    float *pVect1 = (float *) pVect1v + qty16;
         | 
| 116 | 
            +
                    float *pVect2 = (float *) pVect2v + qty16;
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                    size_t qty_left = qty - qty16;
         | 
| 119 | 
            +
                    float res_tail = L2Sqr(pVect1, pVect2, &qty_left);
         | 
| 120 | 
            +
                    return (res + res_tail);
         | 
| 121 | 
            +
                }
         | 
| 122 | 
            +
            #endif
         | 
| 123 | 
            +
             | 
| 124 | 
            +
             | 
| 125 | 
            +
            #ifdef USE_SSE
         | 
| 126 | 
            +
                static float
         | 
| 127 | 
            +
                L2SqrSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 128 | 
            +
                    float PORTABLE_ALIGN32 TmpRes[8];
         | 
| 129 | 
            +
                    float *pVect1 = (float *) pVect1v;
         | 
| 130 | 
            +
                    float *pVect2 = (float *) pVect2v;
         | 
| 131 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 132 | 
            +
             | 
| 133 | 
            +
             | 
| 134 | 
            +
                    size_t qty4 = qty >> 2;
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                    const float *pEnd1 = pVect1 + (qty4 << 2);
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                    __m128 diff, v1, v2;
         | 
| 139 | 
            +
                    __m128 sum = _mm_set1_ps(0);
         | 
| 140 | 
            +
             | 
| 141 | 
            +
                    while (pVect1 < pEnd1) {
         | 
| 142 | 
            +
                        v1 = _mm_loadu_ps(pVect1);
         | 
| 143 | 
            +
                        pVect1 += 4;
         | 
| 144 | 
            +
                        v2 = _mm_loadu_ps(pVect2);
         | 
| 145 | 
            +
                        pVect2 += 4;
         | 
| 146 | 
            +
                        diff = _mm_sub_ps(v1, v2);
         | 
| 147 | 
            +
                        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
         | 
| 148 | 
            +
                    }
         | 
| 149 | 
            +
                    _mm_store_ps(TmpRes, sum);
         | 
| 150 | 
            +
                    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
         | 
| 151 | 
            +
                }
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                static float
         | 
| 154 | 
            +
                L2SqrSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         | 
| 155 | 
            +
                    size_t qty = *((size_t *) qty_ptr);
         | 
| 156 | 
            +
                    size_t qty4 = qty >> 2 << 2;
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                    float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4);
         | 
| 159 | 
            +
                    size_t qty_left = qty - qty4;
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                    float *pVect1 = (float *) pVect1v + qty4;
         | 
| 162 | 
            +
                    float *pVect2 = (float *) pVect2v + qty4;
         | 
| 163 | 
            +
                    float res_tail = L2Sqr(pVect1, pVect2, &qty_left);
         | 
| 164 | 
            +
             | 
| 165 | 
            +
                    return (res + res_tail);
         | 
| 166 | 
            +
                }
         | 
| 167 | 
            +
            #endif
         | 
| 168 | 
            +
             | 
| 169 | 
            +
                /// Space descriptor for squared-L2 distance over float vectors.
                ///
                /// Stores the dimensionality, the byte size of one vector
                /// (dim * sizeof(float)) and the distance function chosen at construction.
                class L2Space : public SpaceInterface<float> {

                    DISTFUNC<float> fstdistfunc_;
                    size_t data_size_;
                    size_t dim_;
                public:
                    /// Picks the distance function for the given dimensionality.
                    /// The scalar L2Sqr is the default. When USE_SSE or USE_AVX is defined,
                    /// the first matching rule below replaces it:
                    ///   dim divisible by 16 -> L2SqrSIMD16Ext
                    ///   dim divisible by 4  -> L2SqrSIMD4Ext
                    ///   dim greater than 16 -> L2SqrSIMD16ExtResiduals
                    ///   dim greater than 4  -> L2SqrSIMD4ExtResiduals
                    L2Space(size_t dim)
                        : fstdistfunc_(L2Sqr),
                          data_size_(dim * sizeof(float)),
                          dim_(dim) {
                    #if defined(USE_SSE) || defined(USE_AVX)
                        if (dim % 16 == 0) {
                            fstdistfunc_ = L2SqrSIMD16Ext;
                        } else if (dim % 4 == 0) {
                            fstdistfunc_ = L2SqrSIMD4Ext;
                        } else if (dim > 16) {
                            fstdistfunc_ = L2SqrSIMD16ExtResiduals;
                        } else if (dim > 4) {
                            fstdistfunc_ = L2SqrSIMD4ExtResiduals;
                        }
                    #endif
                    }

                    /// Size in bytes of one stored vector.
                    size_t get_data_size() {
                        return data_size_;
                    }

                    /// Distance function selected by the constructor.
                    DISTFUNC<float> get_dist_func() {
                        return fstdistfunc_;
                    }

                    /// Pointer to the stored dimensionality; the distance functions read
                    /// their element count through this pointer as a size_t.
                    void *get_dist_func_param() {
                        return &dim_;
                    }

                    ~L2Space() {}
                };
         | 
| 205 | 
            +
             | 
| 206 | 
            +
                /// Squared Euclidean distance between two byte vectors, processed in groups
                /// of four elements.
                ///
                /// Only the first (qty / 4) * 4 bytes are read; a trailing partial group is
                /// ignored. Each byte is treated as an unsigned char and the differences
                /// are computed in int.
                ///
                /// @param pVect1   first vector, interpreted as unsigned char
                /// @param pVect2   second vector, interpreted as unsigned char
                /// @param qty_ptr  points to a size_t holding the number of elements
                /// @return sum of squared element-wise differences over the processed prefix
                static int
                L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {
                    const unsigned char *lhs = static_cast<const unsigned char *>(pVect1);
                    const unsigned char *rhs = static_cast<const unsigned char *>(pVect2);
                    const size_t groups = *static_cast<const size_t *>(qty_ptr) >> 2;

                    int acc = 0;
                    for (size_t g = 0; g < groups; ++g, lhs += 4, rhs += 4) {
                        const int d0 = lhs[0] - rhs[0];
                        const int d1 = lhs[1] - rhs[1];
                        const int d2 = lhs[2] - rhs[2];
                        const int d3 = lhs[3] - rhs[3];
                        acc += d0 * d0;
                        acc += d1 * d1;
                        acc += d2 * d2;
                        acc += d3 * d3;
                    }
                    return acc;
                }
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                /// Squared Euclidean distance between two byte vectors, one element at a time.
                ///
                /// Each byte is treated as an unsigned char and the differences are
                /// computed in int.
                ///
                /// @param pVect1   first vector, interpreted as unsigned char
                /// @param pVect2   second vector, interpreted as unsigned char
                /// @param qty_ptr  points to a size_t holding the number of elements
                /// @return sum of squared element-wise differences over all qty elements
                static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) {
                    const unsigned char* lhs = static_cast<const unsigned char*>(pVect1);
                    const unsigned char* rhs = static_cast<const unsigned char*>(pVect2);
                    const size_t count = *static_cast<const size_t*>(qty_ptr);

                    int acc = 0;
                    for (size_t idx = 0; idx != count; ++idx) {
                        const int delta = lhs[idx] - rhs[idx];
                        acc += delta * delta;
                    }
                    return acc;
                }
         | 
| 247 | 
            +
             | 
| 248 | 
            +
                /// Space descriptor for squared-L2 distance over unsigned-char vectors with
                /// an int result.
                ///
                /// Stores the dimensionality, the byte size of one vector
                /// (dim * sizeof(unsigned char)) and the distance function chosen at
                /// construction.
                class L2SpaceI : public SpaceInterface<int> {

                    DISTFUNC<int> fstdistfunc_;
                    size_t data_size_;
                    size_t dim_;
                public:
                    /// Uses L2SqrI4x when dim is divisible by 4, and L2SqrI otherwise.
                    L2SpaceI(size_t dim)
                        : fstdistfunc_(dim % 4 == 0 ? L2SqrI4x : L2SqrI),
                          data_size_(dim * sizeof(unsigned char)),
                          dim_(dim) {}

                    /// Size in bytes of one stored vector.
                    size_t get_data_size() {
                        return data_size_;
                    }

                    /// Distance function selected by the constructor.
                    DISTFUNC<int> get_dist_func() {
                        return fstdistfunc_;
                    }

                    /// Pointer to the stored dimensionality; the distance functions read
                    /// their element count through this pointer as a size_t.
                    void *get_dist_func_param() {
                        return &dim_;
                    }

                    ~L2SpaceI() {}
                };
         | 
| 279 | 
            +
             | 
| 280 | 
            +
             | 
| 281 | 
            +
            }
         |