umappp 0.1.6 → 0.2.1

This diff reflects the changes between the two publicly released package versions as they appear in their public registries. The single hunk below appears to cover the package's bundled copy of hnswlib's inner-product distance header (upstream `space_ip.h`).
@@ -3,280 +3,373 @@
 
 namespace hnswlib {
 
-    static float
-    InnerProduct(const void *pVect1, const void *pVect2, const void *qty_ptr) {
-        size_t qty = *((size_t *) qty_ptr);
-        float res = 0;
-        for (unsigned i = 0; i < qty; i++) {
-            res += ((float *) pVect1)[i] * ((float *) pVect2)[i];
-        }
-        return (1.0f - res);
-
+    static float
+    InnerProduct(const void *pVect1, const void *pVect2, const void *qty_ptr) {
+        size_t qty = *((size_t *) qty_ptr);
+        float res = 0;
+        for (unsigned i = 0; i < qty; i++) {
+            res += ((float *) pVect1)[i] * ((float *) pVect2)[i];
         }
+        return res;
+    }
+
+    static float
+    InnerProductDistance(const void *pVect1, const void *pVect2, const void *qty_ptr) {
+        return 1.0f - InnerProduct(pVect1, pVect2, qty_ptr);
+    }
 
 #if defined(USE_AVX)
 
 // Favor using AVX if available.
-    static float
-    InnerProductSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-        float PORTABLE_ALIGN32 TmpRes[8];
-        float *pVect1 = (float *) pVect1v;
-        float *pVect2 = (float *) pVect2v;
-        size_t qty = *((size_t *) qty_ptr);
-
-        size_t qty16 = qty / 16;
-        size_t qty4 = qty / 4;
-
-        const float *pEnd1 = pVect1 + 16 * qty16;
-        const float *pEnd2 = pVect1 + 4 * qty4;
-
-        __m256 sum256 = _mm256_set1_ps(0);
-
-        while (pVect1 < pEnd1) {
-            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
-
-            __m256 v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            __m256 v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
-
-            v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
-        }
+    static float
+    InnerProductSIMD4ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float PORTABLE_ALIGN32 TmpRes[8];
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
+        size_t qty = *((size_t *) qty_ptr);
+
+        size_t qty16 = qty / 16;
+        size_t qty4 = qty / 4;
+
+        const float *pEnd1 = pVect1 + 16 * qty16;
+        const float *pEnd2 = pVect1 + 4 * qty4;
+
+        __m256 sum256 = _mm256_set1_ps(0);
+
+        while (pVect1 < pEnd1) {
+            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+
+            __m256 v1 = _mm256_loadu_ps(pVect1);
+            pVect1 += 8;
+            __m256 v2 = _mm256_loadu_ps(pVect2);
+            pVect2 += 8;
+            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
+
+            v1 = _mm256_loadu_ps(pVect1);
+            pVect1 += 8;
+            v2 = _mm256_loadu_ps(pVect2);
+            pVect2 += 8;
+            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
+        }
 
-        __m128 v1, v2;
-        __m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
+        __m128 v1, v2;
+        __m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
 
-        while (pVect1 < pEnd2) {
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-        }
+        while (pVect1 < pEnd2) {
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+        }
 
-        _mm_store_ps(TmpRes, sum_prod);
-        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];;
-        return 1.0f - sum;
+        _mm_store_ps(TmpRes, sum_prod);
+        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+        return sum;
     }
 
-#elif defined(USE_SSE)
-
-    static float
-    InnerProductSIMD4Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-        float PORTABLE_ALIGN32 TmpRes[8];
-        float *pVect1 = (float *) pVect1v;
-        float *pVect2 = (float *) pVect2v;
-        size_t qty = *((size_t *) qty_ptr);
-
-        size_t qty16 = qty / 16;
-        size_t qty4 = qty / 4;
-
-        const float *pEnd1 = pVect1 + 16 * qty16;
-        const float *pEnd2 = pVect1 + 4 * qty4;
-
-        __m128 v1, v2;
-        __m128 sum_prod = _mm_set1_ps(0);
-
-        while (pVect1 < pEnd1) {
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-        }
+    static float
+    InnerProductDistanceSIMD4ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        return 1.0f - InnerProductSIMD4ExtAVX(pVect1v, pVect2v, qty_ptr);
+    }
 
-        while (pVect1 < pEnd2) {
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-        }
+#endif
 
-        _mm_store_ps(TmpRes, sum_prod);
-        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+#if defined(USE_SSE)
+
+    static float
+    InnerProductSIMD4ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float PORTABLE_ALIGN32 TmpRes[8];
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
+        size_t qty = *((size_t *) qty_ptr);
+
+        size_t qty16 = qty / 16;
+        size_t qty4 = qty / 4;
+
+        const float *pEnd1 = pVect1 + 16 * qty16;
+        const float *pEnd2 = pVect1 + 4 * qty4;
+
+        __m128 v1, v2;
+        __m128 sum_prod = _mm_set1_ps(0);
+
+        while (pVect1 < pEnd1) {
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+        }
 
-        return 1.0f - sum;
+        while (pVect1 < pEnd2) {
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         }
 
-#endif
+        _mm_store_ps(TmpRes, sum_prod);
+        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
 
-#if defined(USE_AVX)
+        return sum;
+    }
 
-    static float
-    InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-        float PORTABLE_ALIGN32 TmpRes[8];
-        float *pVect1 = (float *) pVect1v;
-        float *pVect2 = (float *) pVect2v;
-        size_t qty = *((size_t *) qty_ptr);
+    static float
+    InnerProductDistanceSIMD4ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        return 1.0f - InnerProductSIMD4ExtSSE(pVect1v, pVect2v, qty_ptr);
+    }
 
-        size_t qty16 = qty / 16;
+#endif
 
 
-        const float *pEnd1 = pVect1 + 16 * qty16;
+#if defined(USE_AVX512)
 
-        __m256 sum256 = _mm256_set1_ps(0);
+    static float
+    InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float PORTABLE_ALIGN64 TmpRes[16];
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
+        size_t qty = *((size_t *) qty_ptr);
 
-        while (pVect1 < pEnd1) {
-            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+        size_t qty16 = qty / 16;
 
-            __m256 v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            __m256 v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
 
-            v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
-        }
+        const float *pEnd1 = pVect1 + 16 * qty16;
 
-        _mm256_store_ps(TmpRes, sum256);
-        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
+        __m512 sum512 = _mm512_set1_ps(0);
 
-        return 1.0f - sum;
+        while (pVect1 < pEnd1) {
+            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+
+            __m512 v1 = _mm512_loadu_ps(pVect1);
+            pVect1 += 16;
+            __m512 v2 = _mm512_loadu_ps(pVect2);
+            pVect2 += 16;
+            sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
         }
 
-#elif defined(USE_SSE)
-
-    static float
-    InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-        float PORTABLE_ALIGN32 TmpRes[8];
-        float *pVect1 = (float *) pVect1v;
-        float *pVect2 = (float *) pVect2v;
-        size_t qty = *((size_t *) qty_ptr);
-
-        size_t qty16 = qty / 16;
-
-        const float *pEnd1 = pVect1 + 16 * qty16;
-
-        __m128 v1, v2;
-        __m128 sum_prod = _mm_set1_ps(0);
-
-        while (pVect1 < pEnd1) {
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
-        }
-        _mm_store_ps(TmpRes, sum_prod);
-        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+        _mm512_store_ps(TmpRes, sum512);
+        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
 
-        return 1.0f - sum;
-    }
+        return sum;
+    }
+
+    static float
+    InnerProductDistanceSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        return 1.0f - InnerProductSIMD16ExtAVX512(pVect1v, pVect2v, qty_ptr);
+    }
 
 #endif
 
-#if defined(USE_SSE) || defined(USE_AVX)
-    static float
-    InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-        size_t qty = *((size_t *) qty_ptr);
-        size_t qty16 = qty >> 4 << 4;
-        float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
-        float *pVect1 = (float *) pVect1v + qty16;
-        float *pVect2 = (float *) pVect2v + qty16;
-
-        size_t qty_left = qty - qty16;
-        float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
-        return res + res_tail - 1.0f;
+#if defined(USE_AVX)
+
+    static float
+    InnerProductSIMD16ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float PORTABLE_ALIGN32 TmpRes[8];
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
+        size_t qty = *((size_t *) qty_ptr);
+
+        size_t qty16 = qty / 16;
+
+
+        const float *pEnd1 = pVect1 + 16 * qty16;
+
+        __m256 sum256 = _mm256_set1_ps(0);
+
+        while (pVect1 < pEnd1) {
+            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+
+            __m256 v1 = _mm256_loadu_ps(pVect1);
+            pVect1 += 8;
+            __m256 v2 = _mm256_loadu_ps(pVect2);
+            pVect2 += 8;
+            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
+
+            v1 = _mm256_loadu_ps(pVect1);
+            pVect1 += 8;
+            v2 = _mm256_loadu_ps(pVect2);
+            pVect2 += 8;
+            sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
         }
 
-    static float
-    InnerProductSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
-        size_t qty = *((size_t *) qty_ptr);
-        size_t qty4 = qty >> 2 << 2;
+        _mm256_store_ps(TmpRes, sum256);
+        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
+
+        return sum;
+    }
 
-        float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
-        size_t qty_left = qty - qty4;
+    static float
+    InnerProductDistanceSIMD16ExtAVX(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        return 1.0f - InnerProductSIMD16ExtAVX(pVect1v, pVect2v, qty_ptr);
+    }
 
-        float *pVect1 = (float *) pVect1v + qty4;
-        float *pVect2 = (float *) pVect2v + qty4;
-        float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
+#endif
 
-        return res + res_tail - 1.0f;
+#if defined(USE_SSE)
+
+    static float
+    InnerProductSIMD16ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float PORTABLE_ALIGN32 TmpRes[8];
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
+        size_t qty = *((size_t *) qty_ptr);
+
+        size_t qty16 = qty / 16;
+
+        const float *pEnd1 = pVect1 + 16 * qty16;
+
+        __m128 v1, v2;
+        __m128 sum_prod = _mm_set1_ps(0);
+
+        while (pVect1 < pEnd1) {
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+
+            v1 = _mm_loadu_ps(pVect1);
+            pVect1 += 4;
+            v2 = _mm_loadu_ps(pVect2);
+            pVect2 += 4;
+            sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
         }
+        _mm_store_ps(TmpRes, sum_prod);
+        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+
+        return sum;
+    }
+
+    static float
+    InnerProductDistanceSIMD16ExtSSE(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        return 1.0f - InnerProductSIMD16ExtSSE(pVect1v, pVect2v, qty_ptr);
+    }
+
+#endif
+
+#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
+    static DISTFUNC<float> InnerProductSIMD16Ext = InnerProductSIMD16ExtSSE;
+    static DISTFUNC<float> InnerProductSIMD4Ext = InnerProductSIMD4ExtSSE;
+    static DISTFUNC<float> InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtSSE;
+    static DISTFUNC<float> InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtSSE;
+
+    static float
+    InnerProductDistanceSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        size_t qty = *((size_t *) qty_ptr);
+        size_t qty16 = qty >> 4 << 4;
+        float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
+        float *pVect1 = (float *) pVect1v + qty16;
+        float *pVect2 = (float *) pVect2v + qty16;
+
+        size_t qty_left = qty - qty16;
+        float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
+        return 1.0f - (res + res_tail);
+    }
+
+    static float
+    InnerProductDistanceSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        size_t qty = *((size_t *) qty_ptr);
+        size_t qty4 = qty >> 2 << 2;
+
+        float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
+        size_t qty_left = qty - qty4;
+
+        float *pVect1 = (float *) pVect1v + qty4;
+        float *pVect2 = (float *) pVect2v + qty4;
+        float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
+
+        return 1.0f - (res + res_tail);
+    }
 #endif
 
-    class InnerProductSpace : public SpaceInterface<float> {
-
-        DISTFUNC<float> fstdistfunc_;
-        size_t data_size_;
-        size_t dim_;
-    public:
-        InnerProductSpace(size_t dim) {
-            fstdistfunc_ = InnerProduct;
-#if defined(USE_AVX) || defined(USE_SSE)
-            if (dim % 16 == 0)
-                fstdistfunc_ = InnerProductSIMD16Ext;
-            else if (dim % 4 == 0)
-                fstdistfunc_ = InnerProductSIMD4Ext;
-            else if (dim > 16)
-                fstdistfunc_ = InnerProductSIMD16ExtResiduals;
-            else if (dim > 4)
-                fstdistfunc_ = InnerProductSIMD4ExtResiduals;
+    class InnerProductSpace : public SpaceInterface<float> {
+        DISTFUNC<float> fstdistfunc_;
+        size_t data_size_;
+        size_t dim_;
+
+    public:
+        InnerProductSpace(size_t dim) {
+            fstdistfunc_ = InnerProductDistance;
+#if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512)
+#if defined(USE_AVX512)
+            if (AVX512Capable()) {
+                InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX512;
+                InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX512;
+            } else if (AVXCapable()) {
+                InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
+                InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
+            }
+#elif defined(USE_AVX)
+            if (AVXCapable()) {
+                InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
+                InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
+            }
 #endif
-            dim_ = dim;
-            data_size_ = dim * sizeof(float);
+#if defined(USE_AVX)
+            if (AVXCapable()) {
+                InnerProductSIMD4Ext = InnerProductSIMD4ExtAVX;
+                InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtAVX;
             }
+#endif
 
-        size_t get_data_size() {
-            return data_size_;
-        }
+            if (dim % 16 == 0)
+                fstdistfunc_ = InnerProductDistanceSIMD16Ext;
+            else if (dim % 4 == 0)
+                fstdistfunc_ = InnerProductDistanceSIMD4Ext;
+            else if (dim > 16)
+                fstdistfunc_ = InnerProductDistanceSIMD16ExtResiduals;
+            else if (dim > 4)
+                fstdistfunc_ = InnerProductDistanceSIMD4ExtResiduals;
+#endif
+            dim_ = dim;
+            data_size_ = dim * sizeof(float);
+        }
 
-        DISTFUNC<float> get_dist_func() {
-            return fstdistfunc_;
-        }
+        size_t get_data_size() {
+            return data_size_;
+        }
 
-        void *get_dist_func_param() {
-            return &dim_;
-        }
+        DISTFUNC<float> get_dist_func() {
+            return fstdistfunc_;
+        }
 
-        ~InnerProductSpace() {}
-    };
+        void *get_dist_func_param() {
+            return &dim_;
+        }
 
+        ~InnerProductSpace() {}
+    };
 
-}
+-}
++} // namespace hnswlib
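
Taken together, the hunk does three things: the scalar kernel and the SIMD kernels now return the raw inner product while new `InnerProductDistance*` wrappers return 1 - <x, y>; an AVX512 kernel (`InnerProductSIMD16ExtAVX512`) is added; and `InnerProductSIMD16Ext`/`InnerProductSIMD4Ext` become `DISTFUNC<float>` function pointers, defaulting to the SSE kernels, that `InnerProductSpace` rebinds at runtime via `AVX512Capable()`/`AVXCapable()` rather than committing to one kernel at compile time. The sketch below shows how a caller exercises whichever kernel the space selected, through the `SpaceInterface` accessors unchanged by this diff. It is a minimal illustration, not part of the package: the `hnswlib/hnswlib.h` include path is an assumption (the vendored layout may differ) and the data values are made up.

    #include <cstddef>
    #include <vector>
    #include "hnswlib/hnswlib.h"  // assumed include path for the bundled copy

    int main() {
        std::size_t dim = 4;  // dim % 4 == 0, so the SIMD4 distance kernel is chosen
        hnswlib::InnerProductSpace space(dim);

        // The space hands back the kernel picked in its constructor, plus the
        // parameter blob it expects (a pointer to the dimension).
        hnswlib::DISTFUNC<float> dist = space.get_dist_func();
        void *param = space.get_dist_func_param();

        std::vector<float> a{0.5f, 0.5f, 0.5f, 0.5f};
        std::vector<float> b{0.5f, 0.5f, 0.5f, 0.5f};

        // After this update the kernels return the raw dot product and the
        // *Distance wrappers return 1 - <a, b>; here <a, b> = 1, so d == 0.
        float d = dist(a.data(), b.data(), param);
        return (d == 0.0f) ? 0 : 1;
    }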