umappp 0.1.5 → 0.2.0
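
This update pulls in a newer copy of the vendored hnswlib inner-product space (`space_ip.h`). Following the upstream hnswlib refactor, the kernels now return the raw inner product and separate `InnerProductDistance*` wrappers compute the distance `1.0f - <a, b>`; AVX512 kernels are added alongside the AVX and SSE ones; and the implementation is selected at runtime through `DISTFUNC<float>` function pointers, which default to the SSE kernels and are upgraded in the `InnerProductSpace` constructor when `AVXCapable()` or `AVX512Capable()` report support, replacing the old compile-time `#elif` selection.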

@@ -3,280 +3,373 @@
 
 namespace hnswlib {
 
-static float
-InnerProduct(const void *pVect1, const void *pVect2, const void *qty_ptr) {
-    size_t qty = *((size_t *) qty_ptr);
-    float res = 0;
-    for (unsigned i = 0; i < qty; i++) {
-        res += ((float *) pVect1)[i] * ((float *) pVect2)[i];
-    }
-    return (1.0f - res);
-
+static float
+InnerProduct(const void *pVect1, const void *pVect2, const void *qty_ptr) {
+    size_t qty = *((size_t *) qty_ptr);
+    float res = 0;
+    for (unsigned i = 0; i < qty; i++) {
+        res += ((float *) pVect1)[i] * ((float *) pVect2)[i];
     }
+    return res;
+}
+
+static float
+InnerProductDistance(const void *pVect1, const void *pVect2, const void *qty_ptr) {
+    return 1.0f - InnerProduct(pVect1, pVect2, qty_ptr);
+}
 
 #if defined(USE_AVX)
 
 // Favor using AVX if available.
-static float
-InnerProductSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-    float PORTABLE_ALIGN32 TmpRes[8];
-    float *pVect1 = (float *) pVect1v;
-    float *pVect2 = (float *) pVect2v;
-    size_t qty = *((size_t *) qty_ptr);
-
-    size_t qty16 = qty / 16;
-    size_t qty4 = qty / 4;
-
-    const float *pEnd1 = pVect1 + 16 * qty16;
-    const float *pEnd2 = pVect1 + 4 * qty4;
-
-    __m256 sum256 = _mm256_set1_ps(0);
-
-    while (pVect1 < pEnd1) {
-        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
-
-        __m256 v1 = _mm256_loadu_ps(pVect1);
-        pVect1 += 8;
-        __m256 v2 = _mm256_loadu_ps(pVect2);
-        pVect2 += 8;
-        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
-
-        v1 = _mm256_loadu_ps(pVect1);
-        pVect1 += 8;
-        v2 = _mm256_loadu_ps(pVect2);
-        pVect2 += 8;
-        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
-    }
+static float
+InnerProductSIMD4ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    float PORTABLE_ALIGN32 TmpRes[8];
+    float *pVect1 = (float *) pVect1v;
+    float *pVect2 = (float *) pVect2v;
+    size_t qty = *((size_t *) qty_ptr);
+
+    size_t qty16 = qty / 16;
+    size_t qty4 = qty / 4;
+
+    const float *pEnd1 = pVect1 + 16 * qty16;
+    const float *pEnd2 = pVect1 + 4 * qty4;
+
+    __m256 sum256 = _mm256_set1_ps(0);
+
+    while (pVect1 < pEnd1) {
+        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+
+        __m256 v1 = _mm256_loadu_ps(pVect1);
+        pVect1 += 8;
+        __m256 v2 = _mm256_loadu_ps(pVect2);
+        pVect2 += 8;
+        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
+
+        v1 = _mm256_loadu_ps(pVect1);
+        pVect1 += 8;
+        v2 = _mm256_loadu_ps(pVect2);
+        pVect2 += 8;
+        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
+    }
 
-    __m128 v1, v2;
-    __m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
+    __m128 v1, v2;
+    __m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
 
-    while (pVect1 < pEnd2) {
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-    }
+    while (pVect1 < pEnd2) {
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+    }
 
-    _mm_store_ps(TmpRes, sum_prod);
-    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];;
-    return 1.0f - sum;
+    _mm_store_ps(TmpRes, sum_prod);
+    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+    return sum;
 }
 
-#elif defined(USE_SSE)
-
-static float
-InnerProductSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-    float PORTABLE_ALIGN32 TmpRes[8];
-    float *pVect1 = (float *) pVect1v;
-    float *pVect2 = (float *) pVect2v;
-    size_t qty = *((size_t *) qty_ptr);
-
-    size_t qty16 = qty / 16;
-    size_t qty4 = qty / 4;
-
-    const float *pEnd1 = pVect1 + 16 * qty16;
-    const float *pEnd2 = pVect1 + 4 * qty4;
-
-    __m128 v1, v2;
-    __m128 sum_prod = _mm_set1_ps(0);
-
-    while (pVect1 < pEnd1) {
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-    }
+static float
+InnerProductDistanceSIMD4ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    return 1.0f - InnerProductSIMD4ExtAVX(pVect1v, pVect2v, qty_ptr);
+}
 
-    while (pVect1 < pEnd2) {
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-    }
+#endif
 
-    _mm_store_ps(TmpRes, sum_prod);
-    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+#if defined(USE_SSE)
+
+static float
+InnerProductSIMD4ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    float PORTABLE_ALIGN32 TmpRes[8];
+    float *pVect1 = (float *) pVect1v;
+    float *pVect2 = (float *) pVect2v;
+    size_t qty = *((size_t *) qty_ptr);
+
+    size_t qty16 = qty / 16;
+    size_t qty4 = qty / 4;
+
+    const float *pEnd1 = pVect1 + 16 * qty16;
+    const float *pEnd2 = pVect1 + 4 * qty4;
+
+    __m128 v1, v2;
+    __m128 sum_prod = _mm_set1_ps(0);
+
+    while (pVect1 < pEnd1) {
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+    }
 
-    return 1.0f - sum;
+    while (pVect1 < pEnd2) {
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
     }
 
-#endif
+    _mm_store_ps(TmpRes, sum_prod);
+    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
 
-#if defined(USE_AVX)
+    return sum;
+}
 
-static float
-InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-    float PORTABLE_ALIGN32 TmpRes[8];
-    float *pVect1 = (float *) pVect1v;
-    float *pVect2 = (float *) pVect2v;
-    size_t qty = *((size_t *) qty_ptr);
+static float
+InnerProductDistanceSIMD4ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    return 1.0f - InnerProductSIMD4ExtSSE(pVect1v, pVect2v, qty_ptr);
+}
 
-    size_t qty16 = qty / 16;
+#endif
 
 
-    const float *pEnd1 = pVect1 + 16 * qty16;
+#if defined(USE_AVX512)
 
-    __m256 sum256 = _mm256_set1_ps(0);
+static float
+InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    float PORTABLE_ALIGN64 TmpRes[16];
+    float *pVect1 = (float *) pVect1v;
+    float *pVect2 = (float *) pVect2v;
+    size_t qty = *((size_t *) qty_ptr);
 
-    while (pVect1 < pEnd1) {
-        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+    size_t qty16 = qty / 16;
 
-        __m256 v1 = _mm256_loadu_ps(pVect1);
-        pVect1 += 8;
-        __m256 v2 = _mm256_loadu_ps(pVect2);
-        pVect2 += 8;
-        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
 
-        v1 = _mm256_loadu_ps(pVect1);
-        pVect1 += 8;
-        v2 = _mm256_loadu_ps(pVect2);
-        pVect2 += 8;
-        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
-    }
+    const float *pEnd1 = pVect1 + 16 * qty16;
 
-    _mm256_store_ps(TmpRes, sum256);
-    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
+    __m512 sum512 = _mm512_set1_ps(0);
 
-    return 1.0f - sum;
+    while (pVect1 < pEnd1) {
+        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+
+        __m512 v1 = _mm512_loadu_ps(pVect1);
+        pVect1 += 16;
+        __m512 v2 = _mm512_loadu_ps(pVect2);
+        pVect2 += 16;
+        sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
     }
 
-#elif defined(USE_SSE)
-
-static float
-InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-    float PORTABLE_ALIGN32 TmpRes[8];
-    float *pVect1 = (float *) pVect1v;
-    float *pVect2 = (float *) pVect2v;
-    size_t qty = *((size_t *) qty_ptr);
-
-    size_t qty16 = qty / 16;
-
-    const float *pEnd1 = pVect1 + 16 * qty16;
-
-    __m128 v1, v2;
-    __m128 sum_prod = _mm_set1_ps(0);
-
-    while (pVect1 < pEnd1) {
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-    }
-    _mm_store_ps(TmpRes, sum_prod);
-    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+    _mm512_store_ps(TmpRes, sum512);
+    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
 
-    return 1.0f - sum;
-}
+    return sum;
+}
+
+static float
+InnerProductDistanceSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    return 1.0f - InnerProductSIMD16ExtAVX512(pVect1v, pVect2v, qty_ptr);
+}
 
 #endif
 
-#if defined(USE_SSE) || defined(USE_AVX)
-static float
-InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-    size_t qty = *((size_t *) qty_ptr);
-    size_t qty16 = qty >> 4 << 4;
-    float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
-    float *pVect1 = (float *) pVect1v + qty16;
-    float *pVect2 = (float *) pVect2v + qty16;
-
-    size_t qty_left = qty - qty16;
-    float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
-    return res + res_tail - 1.0f;
+#if defined(USE_AVX)
+
+static float
+InnerProductSIMD16ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    float PORTABLE_ALIGN32 TmpRes[8];
+    float *pVect1 = (float *) pVect1v;
+    float *pVect2 = (float *) pVect2v;
+    size_t qty = *((size_t *) qty_ptr);
+
+    size_t qty16 = qty / 16;
+
+
+    const float *pEnd1 = pVect1 + 16 * qty16;
+
+    __m256 sum256 = _mm256_set1_ps(0);
+
+    while (pVect1 < pEnd1) {
+        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+
+        __m256 v1 = _mm256_loadu_ps(pVect1);
+        pVect1 += 8;
+        __m256 v2 = _mm256_loadu_ps(pVect2);
+        pVect2 += 8;
+        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
+
+        v1 = _mm256_loadu_ps(pVect1);
+        pVect1 += 8;
+        v2 = _mm256_loadu_ps(pVect2);
+        pVect2 += 8;
+        sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
     }
 
-static float
-InnerProductSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-    size_t qty = *((size_t *) qty_ptr);
-    size_t qty4 = qty >> 2 << 2;
+    _mm256_store_ps(TmpRes, sum256);
+    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
+
+    return sum;
+}
 
-    float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
-    size_t qty_left = qty - qty4;
+static float
+InnerProductDistanceSIMD16ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    return 1.0f - InnerProductSIMD16ExtAVX(pVect1v, pVect2v, qty_ptr);
+}
 
-    float *pVect1 = (float *) pVect1v + qty4;
-    float *pVect2 = (float *) pVect2v + qty4;
-    float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
+#endif
 
-    return res + res_tail - 1.0f;
+#if defined(USE_SSE)
+
+static float
+InnerProductSIMD16ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    float PORTABLE_ALIGN32 TmpRes[8];
+    float *pVect1 = (float *) pVect1v;
+    float *pVect2 = (float *) pVect2v;
+    size_t qty = *((size_t *) qty_ptr);
+
+    size_t qty16 = qty / 16;
+
+    const float *pEnd1 = pVect1 + 16 * qty16;
+
+    __m128 v1, v2;
+    __m128 sum_prod = _mm_set1_ps(0);
+
+    while (pVect1 < pEnd1) {
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+        v1 = _mm_loadu_ps(pVect1);
+        pVect1 += 4;
+        v2 = _mm_loadu_ps(pVect2);
+        pVect2 += 4;
+        sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
     }
+    _mm_store_ps(TmpRes, sum_prod);
+    float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+
+    return sum;
+}
+
+static float
+InnerProductDistanceSIMD16ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    return 1.0f - InnerProductSIMD16ExtSSE(pVect1v, pVect2v, qty_ptr);
+}
+
+#endif
+
+#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
+static DISTFUNC<float> InnerProductSIMD16Ext = InnerProductSIMD16ExtSSE;
+static DISTFUNC<float> InnerProductSIMD4Ext = InnerProductSIMD4ExtSSE;
+static DISTFUNC<float> InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtSSE;
+static DISTFUNC<float> InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtSSE;
+
+static float
+InnerProductDistanceSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    size_t qty = *((size_t *) qty_ptr);
+    size_t qty16 = qty >> 4 << 4;
+    float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
+    float *pVect1 = (float *) pVect1v + qty16;
+    float *pVect2 = (float *) pVect2v + qty16;
+
+    size_t qty_left = qty - qty16;
+    float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
+    return 1.0f - (res + res_tail);
+}
+
+static float
+InnerProductDistanceSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+    size_t qty = *((size_t *) qty_ptr);
+    size_t qty4 = qty >> 2 << 2;
+
+    float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
+    size_t qty_left = qty - qty4;
+
+    float *pVect1 = (float *) pVect1v + qty4;
+    float *pVect2 = (float *) pVect2v + qty4;
+    float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
+
+    return 1.0f - (res + res_tail);
+}
 #endif
 
-class InnerProductSpace : public SpaceInterface<float> {
-
-    DISTFUNC<float> fstdistfunc_;
-    size_t data_size_;
-    size_t dim_;
- public:
-    InnerProductSpace(size_t dim) {
-        fstdistfunc_ = InnerProduct;
-#if defined(USE_AVX) || defined(USE_SSE)
-        if (dim % 16 == 0)
-            fstdistfunc_ = InnerProductSIMD16Ext;
-        else if (dim % 4 == 0)
-            fstdistfunc_ = InnerProductSIMD4Ext;
-        else if (dim > 16)
-            fstdistfunc_ = InnerProductSIMD16ExtResiduals;
-        else if (dim > 4)
-            fstdistfunc_ = InnerProductSIMD4ExtResiduals;
+class InnerProductSpace : public SpaceInterface<float> {
+    DISTFUNC<float> fstdistfunc_;
+    size_t data_size_;
+    size_t dim_;
+
+ public:
+    InnerProductSpace(size_t dim) {
+        fstdistfunc_ = InnerProductDistance;
+#if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512)
+    #if defined(USE_AVX512)
+        if (AVX512Capable()) {
+            InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX512;
+            InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX512;
+        } else if (AVXCapable()) {
+            InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
+            InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
+        }
+    #elif defined(USE_AVX)
+        if (AVXCapable()) {
+            InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
+            InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
+        }
     #endif
-        dim_ = dim;
-        data_size_ = dim * sizeof(float);
+    #if defined(USE_AVX)
+        if (AVXCapable()) {
+            InnerProductSIMD4Ext = InnerProductSIMD4ExtAVX;
+            InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtAVX;
         }
+    #endif
 
-    size_t get_data_size() {
-        return data_size_;
-    }
+        if (dim % 16 == 0)
+            fstdistfunc_ = InnerProductDistanceSIMD16Ext;
+        else if (dim % 4 == 0)
+            fstdistfunc_ = InnerProductDistanceSIMD4Ext;
+        else if (dim > 16)
+            fstdistfunc_ = InnerProductDistanceSIMD16ExtResiduals;
+        else if (dim > 4)
+            fstdistfunc_ = InnerProductDistanceSIMD4ExtResiduals;
+#endif
+        dim_ = dim;
+        data_size_ = dim * sizeof(float);
+    }
 
-    DISTFUNC<float> get_dist_func() {
-        return fstdistfunc_;
-    }
+    size_t get_data_size() {
+        return data_size_;
+    }
 
-    void *get_dist_func_param() {
-        return &dim_;
-    }
+    DISTFUNC<float> get_dist_func() {
+        return fstdistfunc_;
+    }
 
-    ~InnerProductSpace() {}
-};
+    void *get_dist_func_param() {
+        return &dim_;
+    }
 
+    ~InnerProductSpace() {}
+};
 
-}
+}  // namespace hnswlib
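
For reference, the arithmetic being reorganized above is small enough to sketch outside of hnswlib. The following is an illustrative stand-in only (plain scalar loops play the role of the SIMD kernels; all names are hypothetical, not hnswlib's API): a raw inner-product kernel, the thin `1.0f - ip` distance wrapper that now lives in the `InnerProductDistance*` functions, and the residuals trick used when the dimension is not a multiple of the SIMD width.

// ip_sketch.cpp -- illustrative stand-in, not hnswlib code.
#include <cstddef>
#include <cstdio>

// Plays the role of a wide kernel such as InnerProductSIMD16Ext:
// assumes n is a multiple of 16 and returns the raw dot product.
static float inner_product_wide(const float *a, const float *b, std::size_t n) {
    float sum = 0.0f;
    for (std::size_t i = 0; i < n; ++i) sum += a[i] * b[i];
    return sum;
}

// Plays the role of the scalar InnerProduct(): raw dot product, any n.
static float inner_product(const float *a, const float *b, std::size_t n) {
    float sum = 0.0f;
    for (std::size_t i = 0; i < n; ++i) sum += a[i] * b[i];
    return sum;
}

// Plays the role of InnerProductDistance(): distance is a thin wrapper.
static float ip_distance(const float *a, const float *b, std::size_t n) {
    return 1.0f - inner_product(a, b, n);
}

// Plays the role of InnerProductDistanceSIMD16ExtResiduals(): run the wide
// kernel on the largest prefix whose length is a multiple of 16, finish the
// remainder with the scalar loop, and subtract from 1 exactly once.
static float ip_distance_residuals(const float *a, const float *b, std::size_t n) {
    std::size_t n16 = n >> 4 << 4;  // round n down to a multiple of 16
    float head = inner_product_wide(a, b, n16);
    float tail = inner_product(a + n16, b + n16, n - n16);
    return 1.0f - (head + tail);
}

int main() {
    float a[20], b[20];
    for (int i = 0; i < 20; ++i) { a[i] = 0.1f; b[i] = 0.2f; }
    // 20 = 16 + 4: the wide kernel covers 16 elements, the tail covers 4.
    // Both calls agree: 1 - 20 * 0.02 = 0.6.
    std::printf("%.6f vs %.6f\n", ip_distance_residuals(a, b, 20), ip_distance(a, b, 20));
    return 0;
}

Note that the removed residual helpers returned `res + res_tail - 1.0f` because each old kernel already returned `1.0f - sum`; with kernels returning the raw sum, the new helpers apply `1.0f - (res + res_tail)` once. The two forms are mathematically equivalent, but the refactored version is easier to follow and lets the same kernels back both the raw inner product and the distance.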