umappp 0.1.5 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -4
- data/ext/umappp/umappp.cpp +41 -43
- data/lib/umappp/version.rb +1 -1
- data/lib/umappp.rb +5 -4
- data/vendor/aarand/aarand.hpp +141 -28
- data/vendor/annoy/annoylib.h +1 -1
- data/vendor/hnswlib/bruteforce.h +142 -127
- data/vendor/hnswlib/hnswalg.h +1018 -939
- data/vendor/hnswlib/hnswlib.h +149 -58
- data/vendor/hnswlib/space_ip.h +322 -229
- data/vendor/hnswlib/space_l2.h +283 -240
- data/vendor/hnswlib/visited_list_pool.h +54 -55
- data/vendor/irlba/irlba.hpp +12 -27
- data/vendor/irlba/lanczos.hpp +30 -31
- data/vendor/irlba/parallel.hpp +37 -38
- data/vendor/irlba/utils.hpp +12 -23
- data/vendor/irlba/wrappers.hpp +239 -70
- data/vendor/kmeans/Details.hpp +1 -1
- data/vendor/kmeans/HartiganWong.hpp +28 -2
- data/vendor/kmeans/InitializeKmeansPP.hpp +29 -1
- data/vendor/kmeans/Kmeans.hpp +25 -2
- data/vendor/kmeans/Lloyd.hpp +29 -2
- data/vendor/kmeans/MiniBatch.hpp +48 -8
- data/vendor/knncolle/Annoy/Annoy.hpp +3 -0
- data/vendor/knncolle/Hnsw/Hnsw.hpp +3 -0
- data/vendor/knncolle/Kmknn/Kmknn.hpp +11 -1
- data/vendor/knncolle/utils/find_nearest_neighbors.hpp +8 -6
- data/vendor/umappp/Umap.hpp +85 -43
- data/vendor/umappp/optimize_layout.hpp +410 -133
- data/vendor/umappp/spectral_init.hpp +4 -1
- metadata +6 -6
data/vendor/hnswlib/space_ip.h
CHANGED
@@ -3,280 +3,373 @@
|
|
3
3
|
|
4
4
|
namespace hnswlib {
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
}
|
13
|
-
return (1.0f - res);
|
14
|
-
|
6
|
+
static float
|
7
|
+
InnerProduct(const void *pVect1, const void *pVect2, const void *qty_ptr) {
|
8
|
+
size_t qty = *((size_t *) qty_ptr);
|
9
|
+
float res = 0;
|
10
|
+
for (unsigned i = 0; i < qty; i++) {
|
11
|
+
res += ((float *) pVect1)[i] * ((float *) pVect2)[i];
|
15
12
|
}
|
13
|
+
return res;
|
14
|
+
}
|
15
|
+
|
16
|
+
static float
|
17
|
+
InnerProductDistance(const void *pVect1, const void *pVect2, const void *qty_ptr) {
|
18
|
+
return 1.0f - InnerProduct(pVect1, pVect2, qty_ptr);
|
19
|
+
}
|
16
20
|
|
17
21
|
#if defined(USE_AVX)
|
18
22
|
|
19
23
|
// Favor using AVX if available.
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
24
|
+
static float
|
25
|
+
InnerProductSIMD4ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
26
|
+
float PORTABLE_ALIGN32 TmpRes[8];
|
27
|
+
float *pVect1 = (float *) pVect1v;
|
28
|
+
float *pVect2 = (float *) pVect2v;
|
29
|
+
size_t qty = *((size_t *) qty_ptr);
|
30
|
+
|
31
|
+
size_t qty16 = qty / 16;
|
32
|
+
size_t qty4 = qty / 4;
|
33
|
+
|
34
|
+
const float *pEnd1 = pVect1 + 16 * qty16;
|
35
|
+
const float *pEnd2 = pVect1 + 4 * qty4;
|
36
|
+
|
37
|
+
__m256 sum256 = _mm256_set1_ps(0);
|
38
|
+
|
39
|
+
while (pVect1 < pEnd1) {
|
40
|
+
//_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
|
41
|
+
|
42
|
+
__m256 v1 = _mm256_loadu_ps(pVect1);
|
43
|
+
pVect1 += 8;
|
44
|
+
__m256 v2 = _mm256_loadu_ps(pVect2);
|
45
|
+
pVect2 += 8;
|
46
|
+
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
|
47
|
+
|
48
|
+
v1 = _mm256_loadu_ps(pVect1);
|
49
|
+
pVect1 += 8;
|
50
|
+
v2 = _mm256_loadu_ps(pVect2);
|
51
|
+
pVect2 += 8;
|
52
|
+
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
|
53
|
+
}
|
50
54
|
|
51
|
-
|
52
|
-
|
55
|
+
__m128 v1, v2;
|
56
|
+
__m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
|
53
57
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
58
|
+
while (pVect1 < pEnd2) {
|
59
|
+
v1 = _mm_loadu_ps(pVect1);
|
60
|
+
pVect1 += 4;
|
61
|
+
v2 = _mm_loadu_ps(pVect2);
|
62
|
+
pVect2 += 4;
|
63
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
64
|
+
}
|
61
65
|
|
62
|
-
|
63
|
-
|
64
|
-
|
66
|
+
_mm_store_ps(TmpRes, sum_prod);
|
67
|
+
float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
|
68
|
+
return sum;
|
65
69
|
}
|
66
70
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
float PORTABLE_ALIGN32 TmpRes[8];
|
72
|
-
float *pVect1 = (float *) pVect1v;
|
73
|
-
float *pVect2 = (float *) pVect2v;
|
74
|
-
size_t qty = *((size_t *) qty_ptr);
|
75
|
-
|
76
|
-
size_t qty16 = qty / 16;
|
77
|
-
size_t qty4 = qty / 4;
|
78
|
-
|
79
|
-
const float *pEnd1 = pVect1 + 16 * qty16;
|
80
|
-
const float *pEnd2 = pVect1 + 4 * qty4;
|
81
|
-
|
82
|
-
__m128 v1, v2;
|
83
|
-
__m128 sum_prod = _mm_set1_ps(0);
|
84
|
-
|
85
|
-
while (pVect1 < pEnd1) {
|
86
|
-
v1 = _mm_loadu_ps(pVect1);
|
87
|
-
pVect1 += 4;
|
88
|
-
v2 = _mm_loadu_ps(pVect2);
|
89
|
-
pVect2 += 4;
|
90
|
-
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
91
|
-
|
92
|
-
v1 = _mm_loadu_ps(pVect1);
|
93
|
-
pVect1 += 4;
|
94
|
-
v2 = _mm_loadu_ps(pVect2);
|
95
|
-
pVect2 += 4;
|
96
|
-
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
97
|
-
|
98
|
-
v1 = _mm_loadu_ps(pVect1);
|
99
|
-
pVect1 += 4;
|
100
|
-
v2 = _mm_loadu_ps(pVect2);
|
101
|
-
pVect2 += 4;
|
102
|
-
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
103
|
-
|
104
|
-
v1 = _mm_loadu_ps(pVect1);
|
105
|
-
pVect1 += 4;
|
106
|
-
v2 = _mm_loadu_ps(pVect2);
|
107
|
-
pVect2 += 4;
|
108
|
-
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
109
|
-
}
|
71
|
+
static float
|
72
|
+
InnerProductDistanceSIMD4ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
73
|
+
return 1.0f - InnerProductSIMD4ExtAVX(pVect1v, pVect2v, qty_ptr);
|
74
|
+
}
|
110
75
|
|
111
|
-
|
112
|
-
v1 = _mm_loadu_ps(pVect1);
|
113
|
-
pVect1 += 4;
|
114
|
-
v2 = _mm_loadu_ps(pVect2);
|
115
|
-
pVect2 += 4;
|
116
|
-
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
117
|
-
}
|
76
|
+
#endif
|
118
77
|
|
119
|
-
|
120
|
-
|
78
|
+
#if defined(USE_SSE)
|
79
|
+
|
80
|
+
static float
|
81
|
+
InnerProductSIMD4ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
82
|
+
float PORTABLE_ALIGN32 TmpRes[8];
|
83
|
+
float *pVect1 = (float *) pVect1v;
|
84
|
+
float *pVect2 = (float *) pVect2v;
|
85
|
+
size_t qty = *((size_t *) qty_ptr);
|
86
|
+
|
87
|
+
size_t qty16 = qty / 16;
|
88
|
+
size_t qty4 = qty / 4;
|
89
|
+
|
90
|
+
const float *pEnd1 = pVect1 + 16 * qty16;
|
91
|
+
const float *pEnd2 = pVect1 + 4 * qty4;
|
92
|
+
|
93
|
+
__m128 v1, v2;
|
94
|
+
__m128 sum_prod = _mm_set1_ps(0);
|
95
|
+
|
96
|
+
while (pVect1 < pEnd1) {
|
97
|
+
v1 = _mm_loadu_ps(pVect1);
|
98
|
+
pVect1 += 4;
|
99
|
+
v2 = _mm_loadu_ps(pVect2);
|
100
|
+
pVect2 += 4;
|
101
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
102
|
+
|
103
|
+
v1 = _mm_loadu_ps(pVect1);
|
104
|
+
pVect1 += 4;
|
105
|
+
v2 = _mm_loadu_ps(pVect2);
|
106
|
+
pVect2 += 4;
|
107
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
108
|
+
|
109
|
+
v1 = _mm_loadu_ps(pVect1);
|
110
|
+
pVect1 += 4;
|
111
|
+
v2 = _mm_loadu_ps(pVect2);
|
112
|
+
pVect2 += 4;
|
113
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
114
|
+
|
115
|
+
v1 = _mm_loadu_ps(pVect1);
|
116
|
+
pVect1 += 4;
|
117
|
+
v2 = _mm_loadu_ps(pVect2);
|
118
|
+
pVect2 += 4;
|
119
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
120
|
+
}
|
121
121
|
|
122
|
-
|
122
|
+
while (pVect1 < pEnd2) {
|
123
|
+
v1 = _mm_loadu_ps(pVect1);
|
124
|
+
pVect1 += 4;
|
125
|
+
v2 = _mm_loadu_ps(pVect2);
|
126
|
+
pVect2 += 4;
|
127
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
123
128
|
}
|
124
129
|
|
125
|
-
|
130
|
+
_mm_store_ps(TmpRes, sum_prod);
|
131
|
+
float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
|
126
132
|
|
127
|
-
|
133
|
+
return sum;
|
134
|
+
}
|
128
135
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
float *pVect2 = (float *) pVect2v;
|
134
|
-
size_t qty = *((size_t *) qty_ptr);
|
136
|
+
static float
|
137
|
+
InnerProductDistanceSIMD4ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
138
|
+
return 1.0f - InnerProductSIMD4ExtSSE(pVect1v, pVect2v, qty_ptr);
|
139
|
+
}
|
135
140
|
|
136
|
-
|
141
|
+
#endif
|
137
142
|
|
138
143
|
|
139
|
-
|
144
|
+
#if defined(USE_AVX512)
|
140
145
|
|
141
|
-
|
146
|
+
static float
|
147
|
+
InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
148
|
+
float PORTABLE_ALIGN64 TmpRes[16];
|
149
|
+
float *pVect1 = (float *) pVect1v;
|
150
|
+
float *pVect2 = (float *) pVect2v;
|
151
|
+
size_t qty = *((size_t *) qty_ptr);
|
142
152
|
|
143
|
-
|
144
|
-
//_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
|
153
|
+
size_t qty16 = qty / 16;
|
145
154
|
|
146
|
-
__m256 v1 = _mm256_loadu_ps(pVect1);
|
147
|
-
pVect1 += 8;
|
148
|
-
__m256 v2 = _mm256_loadu_ps(pVect2);
|
149
|
-
pVect2 += 8;
|
150
|
-
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
|
151
155
|
|
152
|
-
|
153
|
-
pVect1 += 8;
|
154
|
-
v2 = _mm256_loadu_ps(pVect2);
|
155
|
-
pVect2 += 8;
|
156
|
-
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
|
157
|
-
}
|
156
|
+
const float *pEnd1 = pVect1 + 16 * qty16;
|
158
157
|
|
159
|
-
|
160
|
-
float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
|
158
|
+
__m512 sum512 = _mm512_set1_ps(0);
|
161
159
|
|
162
|
-
|
160
|
+
while (pVect1 < pEnd1) {
|
161
|
+
//_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
|
162
|
+
|
163
|
+
__m512 v1 = _mm512_loadu_ps(pVect1);
|
164
|
+
pVect1 += 16;
|
165
|
+
__m512 v2 = _mm512_loadu_ps(pVect2);
|
166
|
+
pVect2 += 16;
|
167
|
+
sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
|
163
168
|
}
|
164
169
|
|
165
|
-
|
166
|
-
|
167
|
-
static float
|
168
|
-
InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
169
|
-
float PORTABLE_ALIGN32 TmpRes[8];
|
170
|
-
float *pVect1 = (float *) pVect1v;
|
171
|
-
float *pVect2 = (float *) pVect2v;
|
172
|
-
size_t qty = *((size_t *) qty_ptr);
|
173
|
-
|
174
|
-
size_t qty16 = qty / 16;
|
175
|
-
|
176
|
-
const float *pEnd1 = pVect1 + 16 * qty16;
|
177
|
-
|
178
|
-
__m128 v1, v2;
|
179
|
-
__m128 sum_prod = _mm_set1_ps(0);
|
180
|
-
|
181
|
-
while (pVect1 < pEnd1) {
|
182
|
-
v1 = _mm_loadu_ps(pVect1);
|
183
|
-
pVect1 += 4;
|
184
|
-
v2 = _mm_loadu_ps(pVect2);
|
185
|
-
pVect2 += 4;
|
186
|
-
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
187
|
-
|
188
|
-
v1 = _mm_loadu_ps(pVect1);
|
189
|
-
pVect1 += 4;
|
190
|
-
v2 = _mm_loadu_ps(pVect2);
|
191
|
-
pVect2 += 4;
|
192
|
-
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
193
|
-
|
194
|
-
v1 = _mm_loadu_ps(pVect1);
|
195
|
-
pVect1 += 4;
|
196
|
-
v2 = _mm_loadu_ps(pVect2);
|
197
|
-
pVect2 += 4;
|
198
|
-
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
199
|
-
|
200
|
-
v1 = _mm_loadu_ps(pVect1);
|
201
|
-
pVect1 += 4;
|
202
|
-
v2 = _mm_loadu_ps(pVect2);
|
203
|
-
pVect2 += 4;
|
204
|
-
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
205
|
-
}
|
206
|
-
_mm_store_ps(TmpRes, sum_prod);
|
207
|
-
float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
|
170
|
+
_mm512_store_ps(TmpRes, sum512);
|
171
|
+
float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
|
208
172
|
|
209
|
-
|
210
|
-
|
173
|
+
return sum;
|
174
|
+
}
|
175
|
+
|
176
|
+
static float
|
177
|
+
InnerProductDistanceSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
178
|
+
return 1.0f - InnerProductSIMD16ExtAVX512(pVect1v, pVect2v, qty_ptr);
|
179
|
+
}
|
211
180
|
|
212
181
|
#endif
|
213
182
|
|
214
|
-
#if defined(
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
183
|
+
#if defined(USE_AVX)
|
184
|
+
|
185
|
+
static float
|
186
|
+
InnerProductSIMD16ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
187
|
+
float PORTABLE_ALIGN32 TmpRes[8];
|
188
|
+
float *pVect1 = (float *) pVect1v;
|
189
|
+
float *pVect2 = (float *) pVect2v;
|
190
|
+
size_t qty = *((size_t *) qty_ptr);
|
191
|
+
|
192
|
+
size_t qty16 = qty / 16;
|
193
|
+
|
194
|
+
|
195
|
+
const float *pEnd1 = pVect1 + 16 * qty16;
|
196
|
+
|
197
|
+
__m256 sum256 = _mm256_set1_ps(0);
|
198
|
+
|
199
|
+
while (pVect1 < pEnd1) {
|
200
|
+
//_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
|
201
|
+
|
202
|
+
__m256 v1 = _mm256_loadu_ps(pVect1);
|
203
|
+
pVect1 += 8;
|
204
|
+
__m256 v2 = _mm256_loadu_ps(pVect2);
|
205
|
+
pVect2 += 8;
|
206
|
+
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
|
207
|
+
|
208
|
+
v1 = _mm256_loadu_ps(pVect1);
|
209
|
+
pVect1 += 8;
|
210
|
+
v2 = _mm256_loadu_ps(pVect2);
|
211
|
+
pVect2 += 8;
|
212
|
+
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
|
226
213
|
}
|
227
214
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
215
|
+
_mm256_store_ps(TmpRes, sum256);
|
216
|
+
float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
|
217
|
+
|
218
|
+
return sum;
|
219
|
+
}
|
232
220
|
|
233
|
-
|
234
|
-
|
221
|
+
static float
|
222
|
+
InnerProductDistanceSIMD16ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
223
|
+
return 1.0f - InnerProductSIMD16ExtAVX(pVect1v, pVect2v, qty_ptr);
|
224
|
+
}
|
235
225
|
|
236
|
-
|
237
|
-
float *pVect2 = (float *) pVect2v + qty4;
|
238
|
-
float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
|
226
|
+
#endif
|
239
227
|
|
240
|
-
|
228
|
+
#if defined(USE_SSE)
|
229
|
+
|
230
|
+
static float
|
231
|
+
InnerProductSIMD16ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
232
|
+
float PORTABLE_ALIGN32 TmpRes[8];
|
233
|
+
float *pVect1 = (float *) pVect1v;
|
234
|
+
float *pVect2 = (float *) pVect2v;
|
235
|
+
size_t qty = *((size_t *) qty_ptr);
|
236
|
+
|
237
|
+
size_t qty16 = qty / 16;
|
238
|
+
|
239
|
+
const float *pEnd1 = pVect1 + 16 * qty16;
|
240
|
+
|
241
|
+
__m128 v1, v2;
|
242
|
+
__m128 sum_prod = _mm_set1_ps(0);
|
243
|
+
|
244
|
+
while (pVect1 < pEnd1) {
|
245
|
+
v1 = _mm_loadu_ps(pVect1);
|
246
|
+
pVect1 += 4;
|
247
|
+
v2 = _mm_loadu_ps(pVect2);
|
248
|
+
pVect2 += 4;
|
249
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
250
|
+
|
251
|
+
v1 = _mm_loadu_ps(pVect1);
|
252
|
+
pVect1 += 4;
|
253
|
+
v2 = _mm_loadu_ps(pVect2);
|
254
|
+
pVect2 += 4;
|
255
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
256
|
+
|
257
|
+
v1 = _mm_loadu_ps(pVect1);
|
258
|
+
pVect1 += 4;
|
259
|
+
v2 = _mm_loadu_ps(pVect2);
|
260
|
+
pVect2 += 4;
|
261
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
262
|
+
|
263
|
+
v1 = _mm_loadu_ps(pVect1);
|
264
|
+
pVect1 += 4;
|
265
|
+
v2 = _mm_loadu_ps(pVect2);
|
266
|
+
pVect2 += 4;
|
267
|
+
sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
|
241
268
|
}
|
269
|
+
_mm_store_ps(TmpRes, sum_prod);
|
270
|
+
float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
|
271
|
+
|
272
|
+
return sum;
|
273
|
+
}
|
274
|
+
|
275
|
+
static float
|
276
|
+
InnerProductDistanceSIMD16ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
277
|
+
return 1.0f - InnerProductSIMD16ExtSSE(pVect1v, pVect2v, qty_ptr);
|
278
|
+
}
|
279
|
+
|
280
|
+
#endif
|
281
|
+
|
282
|
+
#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
|
283
|
+
static DISTFUNC<float> InnerProductSIMD16Ext = InnerProductSIMD16ExtSSE;
|
284
|
+
static DISTFUNC<float> InnerProductSIMD4Ext = InnerProductSIMD4ExtSSE;
|
285
|
+
static DISTFUNC<float> InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtSSE;
|
286
|
+
static DISTFUNC<float> InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtSSE;
|
287
|
+
|
288
|
+
static float
|
289
|
+
InnerProductDistanceSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
290
|
+
size_t qty = *((size_t *) qty_ptr);
|
291
|
+
size_t qty16 = qty >> 4 << 4;
|
292
|
+
float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
|
293
|
+
float *pVect1 = (float *) pVect1v + qty16;
|
294
|
+
float *pVect2 = (float *) pVect2v + qty16;
|
295
|
+
|
296
|
+
size_t qty_left = qty - qty16;
|
297
|
+
float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
|
298
|
+
return 1.0f - (res + res_tail);
|
299
|
+
}
|
300
|
+
|
301
|
+
static float
|
302
|
+
InnerProductDistanceSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
|
303
|
+
size_t qty = *((size_t *) qty_ptr);
|
304
|
+
size_t qty4 = qty >> 2 << 2;
|
305
|
+
|
306
|
+
float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
|
307
|
+
size_t qty_left = qty - qty4;
|
308
|
+
|
309
|
+
float *pVect1 = (float *) pVect1v + qty4;
|
310
|
+
float *pVect2 = (float *) pVect2v + qty4;
|
311
|
+
float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
|
312
|
+
|
313
|
+
return 1.0f - (res + res_tail);
|
314
|
+
}
|
242
315
|
#endif
|
243
316
|
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
317
|
+
class InnerProductSpace : public SpaceInterface<float> {
|
318
|
+
DISTFUNC<float> fstdistfunc_;
|
319
|
+
size_t data_size_;
|
320
|
+
size_t dim_;
|
321
|
+
|
322
|
+
public:
|
323
|
+
InnerProductSpace(size_t dim) {
|
324
|
+
fstdistfunc_ = InnerProductDistance;
|
325
|
+
#if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512)
|
326
|
+
#if defined(USE_AVX512)
|
327
|
+
if (AVX512Capable()) {
|
328
|
+
InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX512;
|
329
|
+
InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX512;
|
330
|
+
} else if (AVXCapable()) {
|
331
|
+
InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
|
332
|
+
InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
|
333
|
+
}
|
334
|
+
#elif defined(USE_AVX)
|
335
|
+
if (AVXCapable()) {
|
336
|
+
InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
|
337
|
+
InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
|
338
|
+
}
|
261
339
|
#endif
|
262
|
-
|
263
|
-
|
340
|
+
#if defined(USE_AVX)
|
341
|
+
if (AVXCapable()) {
|
342
|
+
InnerProductSIMD4Ext = InnerProductSIMD4ExtAVX;
|
343
|
+
InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtAVX;
|
264
344
|
}
|
345
|
+
#endif
|
265
346
|
|
266
|
-
|
267
|
-
|
268
|
-
|
347
|
+
if (dim % 16 == 0)
|
348
|
+
fstdistfunc_ = InnerProductDistanceSIMD16Ext;
|
349
|
+
else if (dim % 4 == 0)
|
350
|
+
fstdistfunc_ = InnerProductDistanceSIMD4Ext;
|
351
|
+
else if (dim > 16)
|
352
|
+
fstdistfunc_ = InnerProductDistanceSIMD16ExtResiduals;
|
353
|
+
else if (dim > 4)
|
354
|
+
fstdistfunc_ = InnerProductDistanceSIMD4ExtResiduals;
|
355
|
+
#endif
|
356
|
+
dim_ = dim;
|
357
|
+
data_size_ = dim * sizeof(float);
|
358
|
+
}
|
269
359
|
|
270
|
-
|
271
|
-
|
272
|
-
|
360
|
+
size_t get_data_size() {
|
361
|
+
return data_size_;
|
362
|
+
}
|
273
363
|
|
274
|
-
|
275
|
-
|
276
|
-
|
364
|
+
DISTFUNC<float> get_dist_func() {
|
365
|
+
return fstdistfunc_;
|
366
|
+
}
|
277
367
|
|
278
|
-
|
279
|
-
|
368
|
+
void *get_dist_func_param() {
|
369
|
+
return &dim_;
|
370
|
+
}
|
280
371
|
|
372
|
+
~InnerProductSpace() {}
|
373
|
+
};
|
281
374
|
|
282
|
-
}
|
375
|
+
} // namespace hnswlib
|