hnswlib 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/build.yml +20 -0
- data/.gitignore +18 -0
- data/.rspec +3 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +10 -0
- data/LICENSE.txt +176 -0
- data/README.md +56 -0
- data/Rakefile +17 -0
- data/ext/hnswlib/extconf.rb +11 -0
- data/ext/hnswlib/hnswlibext.cpp +29 -0
- data/ext/hnswlib/hnswlibext.hpp +420 -0
- data/ext/hnswlib/src/LICENSE +201 -0
- data/ext/hnswlib/src/bruteforce.h +152 -0
- data/ext/hnswlib/src/hnswalg.h +1192 -0
- data/ext/hnswlib/src/hnswlib.h +108 -0
- data/ext/hnswlib/src/space_ip.h +282 -0
- data/ext/hnswlib/src/space_l2.h +281 -0
- data/ext/hnswlib/src/visited_list_pool.h +78 -0
- data/hnswlib.gemspec +35 -0
- data/lib/hnswlib.rb +154 -0
- data/lib/hnswlib/version.rb +9 -0
- metadata +69 -0
data/ext/hnswlib/src/hnswlib.h
@@ -0,0 +1,108 @@

#pragma once
#ifndef NO_MANUAL_VECTORIZATION
#ifdef __SSE__
#define USE_SSE
#ifdef __AVX__
#define USE_AVX
#endif
#endif
#endif

#if defined(USE_AVX) || defined(USE_SSE)
#ifdef _MSC_VER
#include <intrin.h>
#include <stdexcept>
#else
#include <x86intrin.h>
#endif

#if defined(__GNUC__)
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
#else
#define PORTABLE_ALIGN32 __declspec(align(32))
#endif
#endif

#include <queue>
#include <vector>
#include <iostream>
#include <string.h>

namespace hnswlib {
    typedef size_t labeltype;

    template <typename T>
    class pairGreater {
    public:
        bool operator()(const T& p1, const T& p2) {
            return p1.first > p2.first;
        }
    };

    template<typename T>
    static void writeBinaryPOD(std::ostream &out, const T &podRef) {
        out.write((char *) &podRef, sizeof(T));
    }

    template<typename T>
    static void readBinaryPOD(std::istream &in, T &podRef) {
        in.read((char *) &podRef, sizeof(T));
    }

    template<typename MTYPE>
    using DISTFUNC = MTYPE(*)(const void *, const void *, const void *);


    template<typename MTYPE>
    class SpaceInterface {
    public:
        //virtual void search(void *);
        virtual size_t get_data_size() = 0;

        virtual DISTFUNC<MTYPE> get_dist_func() = 0;

        virtual void *get_dist_func_param() = 0;

        virtual ~SpaceInterface() {}
    };

    template<typename dist_t>
    class AlgorithmInterface {
    public:
        virtual void addPoint(const void *datapoint, labeltype label)=0;
        virtual std::priority_queue<std::pair<dist_t, labeltype >> searchKnn(const void *, size_t) const = 0;

        // Return k nearest neighbor in the order of closer fist
        virtual std::vector<std::pair<dist_t, labeltype>>
            searchKnnCloserFirst(const void* query_data, size_t k) const;

        virtual void saveIndex(const std::string &location)=0;
        virtual ~AlgorithmInterface(){
        }
    };

    template<typename dist_t>
    std::vector<std::pair<dist_t, labeltype>>
    AlgorithmInterface<dist_t>::searchKnnCloserFirst(const void* query_data, size_t k) const {
        std::vector<std::pair<dist_t, labeltype>> result;

        // here searchKnn returns the result in the order of further first
        auto ret = searchKnn(query_data, k);
        {
            size_t sz = ret.size();
            result.resize(sz);
            while (!ret.empty()) {
                result[--sz] = ret.top();
                ret.pop();
            }
        }

        return result;
    }

}

#include "space_l2.h"
#include "space_ip.h"
#include "bruteforce.h"
#include "hnswalg.h"
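For orientation only (not part of the released gem): the header above fixes the DISTFUNC calling convention that every space implements — a plain function pointer taking two vectors plus an opaque parameter pointer, which for the bundled spaces is the dimensionality. Below is a minimal standalone sketch of that convention with a hypothetical ToyL2Sqr kernel standing in for the bundled ones.

```cpp
// Standalone sketch (not shipped in the gem): mirrors the DISTFUNC calling
// convention declared in hnswlib.h. The third argument is an opaque parameter
// pointer; here it points at the dimensionality, as in L2Space/InnerProductSpace.
#include <cstddef>
#include <iostream>

template<typename MTYPE>
using DISTFUNC = MTYPE(*)(const void *, const void *, const void *);

// Hypothetical kernel with the same signature as the functions in space_l2.h.
static float ToyL2Sqr(const void *a, const void *b, const void *qty_ptr) {
    std::size_t qty = *((const std::size_t *) qty_ptr);
    const float *x = (const float *) a;
    const float *y = (const float *) b;
    float res = 0;
    for (std::size_t i = 0; i < qty; i++) {
        float t = x[i] - y[i];
        res += t * t;
    }
    return res;
}

int main() {
    std::size_t dim = 4;               // plays the role of get_dist_func_param()
    DISTFUNC<float> dist = ToyL2Sqr;   // plays the role of get_dist_func()
    float p[4] = {1, 2, 3, 4};
    float q[4] = {1, 2, 3, 6};
    std::cout << dist(p, q, &dim) << "\n";  // prints 4
    return 0;
}
```

A caller such as an index implementation is expected to fetch get_dist_func() and get_dist_func_param() once and then invoke the returned pointer on every comparison.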
data/ext/hnswlib/src/space_ip.h
@@ -0,0 +1,282 @@

#pragma once
#include "hnswlib.h"

namespace hnswlib {

    static float
    InnerProduct(const void *pVect1, const void *pVect2, const void *qty_ptr) {
        size_t qty = *((size_t *) qty_ptr);
        float res = 0;
        for (unsigned i = 0; i < qty; i++) {
            res += ((float *) pVect1)[i] * ((float *) pVect2)[i];
        }
        return (1.0f - res);

    }

#if defined(USE_AVX)

    // Favor using AVX if available.
    static float
    InnerProductSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        float PORTABLE_ALIGN32 TmpRes[8];
        float *pVect1 = (float *) pVect1v;
        float *pVect2 = (float *) pVect2v;
        size_t qty = *((size_t *) qty_ptr);

        size_t qty16 = qty / 16;
        size_t qty4 = qty / 4;

        const float *pEnd1 = pVect1 + 16 * qty16;
        const float *pEnd2 = pVect1 + 4 * qty4;

        __m256 sum256 = _mm256_set1_ps(0);

        while (pVect1 < pEnd1) {
            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);

            __m256 v1 = _mm256_loadu_ps(pVect1);
            pVect1 += 8;
            __m256 v2 = _mm256_loadu_ps(pVect2);
            pVect2 += 8;
            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));

            v1 = _mm256_loadu_ps(pVect1);
            pVect1 += 8;
            v2 = _mm256_loadu_ps(pVect2);
            pVect2 += 8;
            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
        }

        __m128 v1, v2;
        __m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));

        while (pVect1 < pEnd2) {
            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
        }

        _mm_store_ps(TmpRes, sum_prod);
        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];;
        return 1.0f - sum;
    }

#elif defined(USE_SSE)

    static float
    InnerProductSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        float PORTABLE_ALIGN32 TmpRes[8];
        float *pVect1 = (float *) pVect1v;
        float *pVect2 = (float *) pVect2v;
        size_t qty = *((size_t *) qty_ptr);

        size_t qty16 = qty / 16;
        size_t qty4 = qty / 4;

        const float *pEnd1 = pVect1 + 16 * qty16;
        const float *pEnd2 = pVect1 + 4 * qty4;

        __m128 v1, v2;
        __m128 sum_prod = _mm_set1_ps(0);

        while (pVect1 < pEnd1) {
            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
        }

        while (pVect1 < pEnd2) {
            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
        }

        _mm_store_ps(TmpRes, sum_prod);
        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];

        return 1.0f - sum;
    }

#endif

#if defined(USE_AVX)

    static float
    InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        float PORTABLE_ALIGN32 TmpRes[8];
        float *pVect1 = (float *) pVect1v;
        float *pVect2 = (float *) pVect2v;
        size_t qty = *((size_t *) qty_ptr);

        size_t qty16 = qty / 16;


        const float *pEnd1 = pVect1 + 16 * qty16;

        __m256 sum256 = _mm256_set1_ps(0);

        while (pVect1 < pEnd1) {
            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);

            __m256 v1 = _mm256_loadu_ps(pVect1);
            pVect1 += 8;
            __m256 v2 = _mm256_loadu_ps(pVect2);
            pVect2 += 8;
            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));

            v1 = _mm256_loadu_ps(pVect1);
            pVect1 += 8;
            v2 = _mm256_loadu_ps(pVect2);
            pVect2 += 8;
            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
        }

        _mm256_store_ps(TmpRes, sum256);
        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];

        return 1.0f - sum;
    }

#elif defined(USE_SSE)

    static float
    InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        float PORTABLE_ALIGN32 TmpRes[8];
        float *pVect1 = (float *) pVect1v;
        float *pVect2 = (float *) pVect2v;
        size_t qty = *((size_t *) qty_ptr);

        size_t qty16 = qty / 16;

        const float *pEnd1 = pVect1 + 16 * qty16;

        __m128 v1, v2;
        __m128 sum_prod = _mm_set1_ps(0);

        while (pVect1 < pEnd1) {
            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
        }
        _mm_store_ps(TmpRes, sum_prod);
        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];

        return 1.0f - sum;
    }

#endif

#if defined(USE_SSE) || defined(USE_AVX)
    static float
    InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        size_t qty = *((size_t *) qty_ptr);
        size_t qty16 = qty >> 4 << 4;
        float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
        float *pVect1 = (float *) pVect1v + qty16;
        float *pVect2 = (float *) pVect2v + qty16;

        size_t qty_left = qty - qty16;
        float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
        return res + res_tail - 1.0f;
    }

    static float
    InnerProductSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        size_t qty = *((size_t *) qty_ptr);
        size_t qty4 = qty >> 2 << 2;

        float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
        size_t qty_left = qty - qty4;

        float *pVect1 = (float *) pVect1v + qty4;
        float *pVect2 = (float *) pVect2v + qty4;
        float res_tail = InnerProduct(pVect1, pVect2, &qty_left);

        return res + res_tail - 1.0f;
    }
#endif

    class InnerProductSpace : public SpaceInterface<float> {

        DISTFUNC<float> fstdistfunc_;
        size_t data_size_;
        size_t dim_;
    public:
        InnerProductSpace(size_t dim) {
            fstdistfunc_ = InnerProduct;
#if defined(USE_AVX) || defined(USE_SSE)
            if (dim % 16 == 0)
                fstdistfunc_ = InnerProductSIMD16Ext;
            else if (dim % 4 == 0)
                fstdistfunc_ = InnerProductSIMD4Ext;
            else if (dim > 16)
                fstdistfunc_ = InnerProductSIMD16ExtResiduals;
            else if (dim > 4)
                fstdistfunc_ = InnerProductSIMD4ExtResiduals;
#endif
            dim_ = dim;
            data_size_ = dim * sizeof(float);
        }

        size_t get_data_size() {
            return data_size_;
        }

        DISTFUNC<float> get_dist_func() {
            return fstdistfunc_;
        }

        void *get_dist_func_param() {
            return &dim_;
        }

        ~InnerProductSpace() {}
    };


}
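For orientation only (not part of the released gem): InnerProductSpace reports distance as 1 - <a, b>, so identical unit-length vectors sit at distance 0, and the SIMD variants above only accelerate that same formula. A standalone sketch of the scalar computation:

```cpp
// Standalone sketch (not shipped in the gem): the reference computation that
// InnerProduct() in space_ip.h performs, written without the void*/SIMD plumbing.
#include <cstddef>
#include <iostream>

static float ip_distance(const float *a, const float *b, std::size_t dim) {
    float dot = 0.0f;
    for (std::size_t i = 0; i < dim; i++) dot += a[i] * b[i];
    return 1.0f - dot;   // smaller distance means larger inner product
}

int main() {
    float a[4] = {0.5f, 0.5f, 0.5f, 0.5f};   // unit-length vector
    float b[4] = {0.5f, 0.5f, 0.5f, 0.5f};
    std::cout << ip_distance(a, b, 4) << "\n";  // identical unit vectors -> 0
    return 0;
}
```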
data/ext/hnswlib/src/space_l2.h
@@ -0,0 +1,281 @@

#pragma once
#include "hnswlib.h"

namespace hnswlib {

    static float
    L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        float *pVect1 = (float *) pVect1v;
        float *pVect2 = (float *) pVect2v;
        size_t qty = *((size_t *) qty_ptr);

        float res = 0;
        for (size_t i = 0; i < qty; i++) {
            float t = *pVect1 - *pVect2;
            pVect1++;
            pVect2++;
            res += t * t;
        }
        return (res);
    }

#if defined(USE_AVX)

    // Favor using AVX if available.
    static float
    L2SqrSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        float *pVect1 = (float *) pVect1v;
        float *pVect2 = (float *) pVect2v;
        size_t qty = *((size_t *) qty_ptr);
        float PORTABLE_ALIGN32 TmpRes[8];
        size_t qty16 = qty >> 4;

        const float *pEnd1 = pVect1 + (qty16 << 4);

        __m256 diff, v1, v2;
        __m256 sum = _mm256_set1_ps(0);

        while (pVect1 < pEnd1) {
            v1 = _mm256_loadu_ps(pVect1);
            pVect1 += 8;
            v2 = _mm256_loadu_ps(pVect2);
            pVect2 += 8;
            diff = _mm256_sub_ps(v1, v2);
            sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));

            v1 = _mm256_loadu_ps(pVect1);
            pVect1 += 8;
            v2 = _mm256_loadu_ps(pVect2);
            pVect2 += 8;
            diff = _mm256_sub_ps(v1, v2);
            sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
        }

        _mm256_store_ps(TmpRes, sum);
        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
    }

#elif defined(USE_SSE)

    static float
    L2SqrSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        float *pVect1 = (float *) pVect1v;
        float *pVect2 = (float *) pVect2v;
        size_t qty = *((size_t *) qty_ptr);
        float PORTABLE_ALIGN32 TmpRes[8];
        size_t qty16 = qty >> 4;

        const float *pEnd1 = pVect1 + (qty16 << 4);

        __m128 diff, v1, v2;
        __m128 sum = _mm_set1_ps(0);

        while (pVect1 < pEnd1) {
            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            diff = _mm_sub_ps(v1, v2);
            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));

            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            diff = _mm_sub_ps(v1, v2);
            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));

            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            diff = _mm_sub_ps(v1, v2);
            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));

            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            diff = _mm_sub_ps(v1, v2);
            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
        }

        _mm_store_ps(TmpRes, sum);
        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
    }
#endif

#if defined(USE_SSE) || defined(USE_AVX)
    static float
    L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        size_t qty = *((size_t *) qty_ptr);
        size_t qty16 = qty >> 4 << 4;
        float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16);
        float *pVect1 = (float *) pVect1v + qty16;
        float *pVect2 = (float *) pVect2v + qty16;

        size_t qty_left = qty - qty16;
        float res_tail = L2Sqr(pVect1, pVect2, &qty_left);
        return (res + res_tail);
    }
#endif


#ifdef USE_SSE
    static float
    L2SqrSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        float PORTABLE_ALIGN32 TmpRes[8];
        float *pVect1 = (float *) pVect1v;
        float *pVect2 = (float *) pVect2v;
        size_t qty = *((size_t *) qty_ptr);


        size_t qty4 = qty >> 2;

        const float *pEnd1 = pVect1 + (qty4 << 2);

        __m128 diff, v1, v2;
        __m128 sum = _mm_set1_ps(0);

        while (pVect1 < pEnd1) {
            v1 = _mm_loadu_ps(pVect1);
            pVect1 += 4;
            v2 = _mm_loadu_ps(pVect2);
            pVect2 += 4;
            diff = _mm_sub_ps(v1, v2);
            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
        }
        _mm_store_ps(TmpRes, sum);
        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
    }

    static float
    L2SqrSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        size_t qty = *((size_t *) qty_ptr);
        size_t qty4 = qty >> 2 << 2;

        float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4);
        size_t qty_left = qty - qty4;

        float *pVect1 = (float *) pVect1v + qty4;
        float *pVect2 = (float *) pVect2v + qty4;
        float res_tail = L2Sqr(pVect1, pVect2, &qty_left);

        return (res + res_tail);
    }
#endif

    class L2Space : public SpaceInterface<float> {

        DISTFUNC<float> fstdistfunc_;
        size_t data_size_;
        size_t dim_;
    public:
        L2Space(size_t dim) {
            fstdistfunc_ = L2Sqr;
#if defined(USE_SSE) || defined(USE_AVX)
            if (dim % 16 == 0)
                fstdistfunc_ = L2SqrSIMD16Ext;
            else if (dim % 4 == 0)
                fstdistfunc_ = L2SqrSIMD4Ext;
            else if (dim > 16)
                fstdistfunc_ = L2SqrSIMD16ExtResiduals;
            else if (dim > 4)
                fstdistfunc_ = L2SqrSIMD4ExtResiduals;
#endif
            dim_ = dim;
            data_size_ = dim * sizeof(float);
        }

        size_t get_data_size() {
            return data_size_;
        }

        DISTFUNC<float> get_dist_func() {
            return fstdistfunc_;
        }

        void *get_dist_func_param() {
            return &dim_;
        }

        ~L2Space() {}
    };

    static int
    L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {

        size_t qty = *((size_t *) qty_ptr);
        int res = 0;
        unsigned char *a = (unsigned char *) pVect1;
        unsigned char *b = (unsigned char *) pVect2;

        qty = qty >> 2;
        for (size_t i = 0; i < qty; i++) {

            res += ((*a) - (*b)) * ((*a) - (*b));
            a++;
            b++;
            res += ((*a) - (*b)) * ((*a) - (*b));
            a++;
            b++;
            res += ((*a) - (*b)) * ((*a) - (*b));
            a++;
            b++;
            res += ((*a) - (*b)) * ((*a) - (*b));
            a++;
            b++;
        }
        return (res);
    }

    static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) {
        size_t qty = *((size_t*)qty_ptr);
        int res = 0;
        unsigned char* a = (unsigned char*)pVect1;
        unsigned char* b = (unsigned char*)pVect2;

        for(size_t i = 0; i < qty; i++)
        {
            res += ((*a) - (*b)) * ((*a) - (*b));
            a++;
            b++;
        }
        return (res);
    }

    class L2SpaceI : public SpaceInterface<int> {

        DISTFUNC<int> fstdistfunc_;
        size_t data_size_;
        size_t dim_;
    public:
        L2SpaceI(size_t dim) {
            if(dim % 4 == 0) {
                fstdistfunc_ = L2SqrI4x;
            }
            else {
                fstdistfunc_ = L2SqrI;
            }
            dim_ = dim;
            data_size_ = dim * sizeof(unsigned char);
        }

        size_t get_data_size() {
            return data_size_;
        }

        DISTFUNC<int> get_dist_func() {
            return fstdistfunc_;
        }

        void *get_dist_func_param() {
            return &dim_;
        }

        ~L2SpaceI() {}
    };


}
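For orientation only (not part of the released gem): both L2Space and InnerProductSpace pick their kernel from the vector dimensionality — a multiple of 16 or 4 gets a pure SIMD loop, other dimensions above 16 (or above 4) get a SIMD loop plus a scalar residual pass, and very small dimensions fall back to the scalar function. A standalone sketch of that dispatch order, using the compiler's __SSE__/__AVX__ macros directly where the gem's hnswlib.h would define USE_SSE/USE_AVX:

```cpp
// Standalone sketch (not shipped in the gem): the kernel-selection rule from the
// L2Space constructor in space_l2.h, written as a plain function that returns the
// name of the kernel it would pick.
#include <cstddef>
#include <initializer_list>
#include <iostream>
#include <string>

static std::string pick_l2_kernel(std::size_t dim) {
    std::string kernel = "L2Sqr (scalar)";
#if defined(__SSE__) || defined(__AVX__)          // stands in for USE_SSE/USE_AVX
    if (dim % 16 == 0)      kernel = "L2SqrSIMD16Ext";
    else if (dim % 4 == 0)  kernel = "L2SqrSIMD4Ext";
    else if (dim > 16)      kernel = "L2SqrSIMD16ExtResiduals";
    else if (dim > 4)       kernel = "L2SqrSIMD4ExtResiduals";
#endif
    return kernel;
}

int main() {
    // 3 -> scalar, 8 -> SIMD4, 18 -> SIMD16 + residuals, 32 -> SIMD16
    for (std::size_t dim : {3, 8, 18, 32}) {
        std::cout << dim << " -> " << pick_l2_kernel(dim) << "\n";
    }
    return 0;
}
```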