umappp 0.1.6 → 0.2.1
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -16
- data/ext/umappp/numo.hpp +957 -833
- data/ext/umappp/umappp.cpp +39 -45
- data/lib/umappp/version.rb +1 -1
- data/lib/umappp.rb +5 -4
- data/vendor/aarand/aarand.hpp +141 -28
- data/vendor/annoy/annoylib.h +1 -1
- data/vendor/hnswlib/bruteforce.h +142 -127
- data/vendor/hnswlib/hnswalg.h +1018 -939
- data/vendor/hnswlib/hnswlib.h +149 -58
- data/vendor/hnswlib/space_ip.h +322 -229
- data/vendor/hnswlib/space_l2.h +283 -240
- data/vendor/hnswlib/visited_list_pool.h +54 -55
- data/vendor/irlba/irlba.hpp +12 -27
- data/vendor/irlba/lanczos.hpp +30 -31
- data/vendor/irlba/parallel.hpp +37 -38
- data/vendor/irlba/utils.hpp +12 -23
- data/vendor/irlba/wrappers.hpp +239 -70
- data/vendor/kmeans/Details.hpp +1 -1
- data/vendor/kmeans/HartiganWong.hpp +28 -2
- data/vendor/kmeans/InitializeKmeansPP.hpp +29 -1
- data/vendor/kmeans/Kmeans.hpp +25 -2
- data/vendor/kmeans/Lloyd.hpp +29 -2
- data/vendor/kmeans/MiniBatch.hpp +48 -8
- data/vendor/knncolle/Annoy/Annoy.hpp +3 -0
- data/vendor/knncolle/Hnsw/Hnsw.hpp +3 -0
- data/vendor/knncolle/Kmknn/Kmknn.hpp +11 -1
- data/vendor/knncolle/utils/find_nearest_neighbors.hpp +8 -6
- data/vendor/umappp/Umap.hpp +85 -43
- data/vendor/umappp/optimize_layout.hpp +410 -133
- data/vendor/umappp/spectral_init.hpp +4 -1
- metadata +7 -10
data/vendor/hnswlib/space_ip.h
CHANGED
@@ -3,280 +3,373 @@
namespace hnswlib {

static float
InnerProduct(const void *pVect1, const void *pVect2, const void *qty_ptr) {
    size_t qty = *((size_t *) qty_ptr);
    float res = 0;
    for (unsigned i = 0; i < qty; i++) {
        res += ((float *) pVect1)[i] * ((float *) pVect2)[i];
    }
    return res;
}

static float
InnerProductDistance(const void *pVect1, const void *pVect2, const void *qty_ptr) {
    return 1.0f - InnerProduct(pVect1, pVect2, qty_ptr);
}

#if defined(USE_AVX)

// Favor using AVX if available.
static float
InnerProductSIMD4ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    float PORTABLE_ALIGN32 TmpRes[8];
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);

    size_t qty16 = qty / 16;
    size_t qty4 = qty / 4;

    const float *pEnd1 = pVect1 + 16 * qty16;
    const float *pEnd2 = pVect1 + 4 * qty4;

    __m256 sum256 = _mm256_set1_ps(0);

    while (pVect1 < pEnd1) {
        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);

        __m256 v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        __m256 v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));

        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
    }

    __m128 v1, v2;
    __m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));

    while (pVect1 < pEnd2) {
        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
    }

    _mm_store_ps(TmpRes, sum_prod);
    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
    return sum;
}

static float
InnerProductDistanceSIMD4ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    return 1.0f - InnerProductSIMD4ExtAVX(pVect1v, pVect2v, qty_ptr);
}

#endif

#if defined(USE_SSE)

static float
InnerProductSIMD4ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    float PORTABLE_ALIGN32 TmpRes[8];
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);

    size_t qty16 = qty / 16;
    size_t qty4 = qty / 4;

    const float *pEnd1 = pVect1 + 16 * qty16;
    const float *pEnd2 = pVect1 + 4 * qty4;

    __m128 v1, v2;
    __m128 sum_prod = _mm_set1_ps(0);

    while (pVect1 < pEnd1) {
        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
    }

    while (pVect1 < pEnd2) {
        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
    }

    _mm_store_ps(TmpRes, sum_prod);
    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];

    return sum;
}

static float
InnerProductDistanceSIMD4ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    return 1.0f - InnerProductSIMD4ExtSSE(pVect1v, pVect2v, qty_ptr);
}

#endif


#if defined(USE_AVX512)

static float
InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    float PORTABLE_ALIGN64 TmpRes[16];
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);

    size_t qty16 = qty / 16;

    const float *pEnd1 = pVect1 + 16 * qty16;

    __m512 sum512 = _mm512_set1_ps(0);

    while (pVect1 < pEnd1) {
        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);

        __m512 v1 = _mm512_loadu_ps(pVect1);
        pVect1 += 16;
        __m512 v2 = _mm512_loadu_ps(pVect2);
        pVect2 += 16;
        sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
    }

    _mm512_store_ps(TmpRes, sum512);
    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];

    return sum;
}

static float
InnerProductDistanceSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    return 1.0f - InnerProductSIMD16ExtAVX512(pVect1v, pVect2v, qty_ptr);
}

#endif

#if defined(USE_AVX)

static float
InnerProductSIMD16ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    float PORTABLE_ALIGN32 TmpRes[8];
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);

    size_t qty16 = qty / 16;

    const float *pEnd1 = pVect1 + 16 * qty16;

    __m256 sum256 = _mm256_set1_ps(0);

    while (pVect1 < pEnd1) {
        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);

        __m256 v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        __m256 v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));

        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
    }

    _mm256_store_ps(TmpRes, sum256);
    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];

    return sum;
}

static float
InnerProductDistanceSIMD16ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    return 1.0f - InnerProductSIMD16ExtAVX(pVect1v, pVect2v, qty_ptr);
}

#endif

#if defined(USE_SSE)

static float
InnerProductSIMD16ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    float PORTABLE_ALIGN32 TmpRes[8];
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);

    size_t qty16 = qty / 16;

    const float *pEnd1 = pVect1 + 16 * qty16;

    __m128 v1, v2;
    __m128 sum_prod = _mm_set1_ps(0);

    while (pVect1 < pEnd1) {
        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

        v1 = _mm_loadu_ps(pVect1);
        pVect1 += 4;
        v2 = _mm_loadu_ps(pVect2);
        pVect2 += 4;
        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
    }
    _mm_store_ps(TmpRes, sum_prod);
    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];

    return sum;
}

static float
InnerProductDistanceSIMD16ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    return 1.0f - InnerProductSIMD16ExtSSE(pVect1v, pVect2v, qty_ptr);
}

#endif

#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
static DISTFUNC<float> InnerProductSIMD16Ext = InnerProductSIMD16ExtSSE;
static DISTFUNC<float> InnerProductSIMD4Ext = InnerProductSIMD4ExtSSE;
static DISTFUNC<float> InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtSSE;
static DISTFUNC<float> InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtSSE;

static float
InnerProductDistanceSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    size_t qty = *((size_t *) qty_ptr);
    size_t qty16 = qty >> 4 << 4;
    float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
    float *pVect1 = (float *) pVect1v + qty16;
    float *pVect2 = (float *) pVect2v + qty16;

    size_t qty_left = qty - qty16;
    float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
    return 1.0f - (res + res_tail);
}

static float
InnerProductDistanceSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    size_t qty = *((size_t *) qty_ptr);
    size_t qty4 = qty >> 2 << 2;

    float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
    size_t qty_left = qty - qty4;

    float *pVect1 = (float *) pVect1v + qty4;
    float *pVect2 = (float *) pVect2v + qty4;
    float res_tail = InnerProduct(pVect1, pVect2, &qty_left);

    return 1.0f - (res + res_tail);
}
#endif

class InnerProductSpace : public SpaceInterface<float> {
    DISTFUNC<float> fstdistfunc_;
    size_t data_size_;
    size_t dim_;

 public:
    InnerProductSpace(size_t dim) {
        fstdistfunc_ = InnerProductDistance;
#if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512)
    #if defined(USE_AVX512)
        if (AVX512Capable()) {
            InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX512;
            InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX512;
        } else if (AVXCapable()) {
            InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
            InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
        }
    #elif defined(USE_AVX)
        if (AVXCapable()) {
            InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
            InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
        }
    #endif
    #if defined(USE_AVX)
        if (AVXCapable()) {
            InnerProductSIMD4Ext = InnerProductSIMD4ExtAVX;
            InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtAVX;
        }
    #endif

        if (dim % 16 == 0)
            fstdistfunc_ = InnerProductDistanceSIMD16Ext;
        else if (dim % 4 == 0)
            fstdistfunc_ = InnerProductDistanceSIMD4Ext;
        else if (dim > 16)
            fstdistfunc_ = InnerProductDistanceSIMD16ExtResiduals;
        else if (dim > 4)
            fstdistfunc_ = InnerProductDistanceSIMD4ExtResiduals;
#endif
        dim_ = dim;
        data_size_ = dim * sizeof(float);
    }

    size_t get_data_size() {
        return data_size_;
    }

    DISTFUNC<float> get_dist_func() {
        return fstdistfunc_;
    }

    void *get_dist_func_param() {
        return &dim_;
    }

    ~InnerProductSpace() {}
};

} // namespace hnswlib
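For context, the rewritten space_ip.h selects an inner-product kernel once at construction time (AVX512, then AVX, then SSE, with a scalar fallback) and exposes it through get_dist_func() and get_dist_func_param(). The following is a minimal sketch of how a caller might invoke the selected kernel directly; the dimension, the example vectors, and the include path are illustrative, and the hnswlib index classes (not shown in this diff) do essentially the same thing internally.

#include <cstdio>
#include <vector>
#include "hnswlib.h"  // vendored header; pulls in space_ip.h

int main() {
    const size_t dim = 16;                      // multiple of 16, so the SIMD16 kernel is selected
    std::vector<float> a(dim, 0.5f), b(dim, 0.25f);

    hnswlib::InnerProductSpace space(dim);      // runtime dispatch: AVX512 -> AVX -> SSE -> scalar
    hnswlib::DISTFUNC<float> dist = space.get_dist_func();
    void *param = space.get_dist_func_param();  // pointer to the dimension

    float d = dist(a.data(), b.data(), param);  // inner-product distance = 1 - <a, b>
    std::printf("distance = %f\n", d);          // 1 - 16 * 0.5 * 0.25 = -1.0
    return 0;
}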