faiss 0.3.0 → 0.3.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +9 -2
- data/ext/faiss/index.cpp +1 -1
- data/ext/faiss/index_binary.cpp +2 -2
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +7 -7
- data/vendor/faiss/faiss/AutoTune.h +0 -1
- data/vendor/faiss/faiss/Clustering.cpp +4 -18
- data/vendor/faiss/faiss/Clustering.h +31 -21
- data/vendor/faiss/faiss/IVFlib.cpp +22 -11
- data/vendor/faiss/faiss/Index.cpp +1 -1
- data/vendor/faiss/faiss/Index.h +20 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
- data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
- data/vendor/faiss/faiss/IndexBinary.h +8 -19
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +106 -187
- data/vendor/faiss/faiss/IndexFastScan.cpp +90 -159
- data/vendor/faiss/faiss/IndexFastScan.h +9 -8
- data/vendor/faiss/faiss/IndexFlat.cpp +195 -3
- data/vendor/faiss/faiss/IndexFlat.h +20 -1
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +3 -1
- data/vendor/faiss/faiss/IndexHNSW.cpp +112 -316
- data/vendor/faiss/faiss/IndexHNSW.h +12 -48
- data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
- data/vendor/faiss/faiss/IndexIDMap.h +24 -2
- data/vendor/faiss/faiss/IndexIVF.cpp +159 -53
- data/vendor/faiss/faiss/IndexIVF.h +37 -5
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +18 -26
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +433 -405
- data/vendor/faiss/faiss/IndexIVFFastScan.h +56 -26
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
- data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +78 -122
- data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +18 -50
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
- data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
- data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -4
- data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
- data/vendor/faiss/faiss/IndexNSG.h +10 -10
- data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
- data/vendor/faiss/faiss/IndexPQ.h +1 -4
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
- data/vendor/faiss/faiss/IndexRefine.cpp +49 -19
- data/vendor/faiss/faiss/IndexRefine.h +7 -0
- data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +22 -16
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
- data/vendor/faiss/faiss/IndexShards.cpp +21 -29
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
- data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
- data/vendor/faiss/faiss/MatrixStats.h +21 -9
- data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
- data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
- data/vendor/faiss/faiss/VectorTransform.h +7 -7
- data/vendor/faiss/faiss/clone_index.cpp +15 -10
- data/vendor/faiss/faiss/clone_index.h +3 -0
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +87 -4
- data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +7 -0
- data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
- data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +8 -9
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +18 -3
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
- data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +117 -17
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +1 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +267 -40
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
- data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +1 -2
- data/vendor/faiss/faiss/impl/DistanceComputer.h +24 -1
- data/vendor/faiss/faiss/impl/FaissException.h +13 -34
- data/vendor/faiss/faiss/impl/HNSW.cpp +321 -70
- data/vendor/faiss/faiss/impl/HNSW.h +9 -8
- data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/impl/NNDescent.cpp +29 -19
- data/vendor/faiss/faiss/impl/NSG.h +1 -1
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +24 -22
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
- data/vendor/faiss/faiss/impl/ResultHandler.h +232 -176
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +444 -104
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -8
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +280 -42
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
- data/vendor/faiss/faiss/impl/index_read.cpp +45 -19
- data/vendor/faiss/faiss/impl/index_write.cpp +60 -41
- data/vendor/faiss/faiss/impl/io.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
- data/vendor/faiss/faiss/impl/platform_macros.h +18 -1
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +3 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +40 -49
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +374 -202
- data/vendor/faiss/faiss/index_factory.cpp +10 -7
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +27 -9
- data/vendor/faiss/faiss/invlists/InvertedLists.h +12 -3
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
- data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
- data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
- data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
- data/vendor/faiss/faiss/utils/distances.cpp +128 -74
- data/vendor/faiss/faiss/utils/distances.h +81 -4
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
- data/vendor/faiss/faiss/utils/distances_simd.cpp +428 -70
- data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
- data/vendor/faiss/faiss/utils/fp16.h +2 -0
- data/vendor/faiss/faiss/utils/hamming.cpp +162 -110
- data/vendor/faiss/faiss/utils/hamming.h +58 -0
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +15 -87
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +57 -0
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
- data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
- data/vendor/faiss/faiss/utils/prefetch.h +77 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
- data/vendor/faiss/faiss/utils/simdlib_neon.h +72 -77
- data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
- data/vendor/faiss/faiss/utils/sorting.h +27 -0
- data/vendor/faiss/faiss/utils/utils.cpp +112 -6
- data/vendor/faiss/faiss/utils/utils.h +57 -20
- metadata +10 -3
@@ -62,7 +62,7 @@ void kernel(
         const float* const __restrict y,
         const float* const __restrict y_transposed,
         const size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* __restrict y_norms,
         const size_t i) {
     const size_t ny_p =
@@ -73,7 +73,7 @@ void kernel(
 
     // prefetch the next point
 #if defined(__AVX2__)
-    _mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA);
+    _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
 #endif
 
     // load a single point from x
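
Note on the cast: _mm_prefetch is declared with a const char* first argument in MSVC's and some older GCC intrinsics headers, so passing a float* expression is non-portable without an explicit cast. A minimal standalone sketch of the portable form (names are illustrative, not from the diff):

    #include <cstddef>
    #include <immintrin.h>

    void prefetch_next_point(const float* p, size_t dim) {
        // cast to const char* so the call matches the intrinsic's
        // declared signature on all major compilers
        _mm_prefetch((const char*)(p + dim), _MM_HINT_NTA);
    }
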
@@ -226,7 +226,7 @@ void exhaustive_L2sqr_fused_cmax(
         const float* const __restrict y,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* __restrict y_norms) {
     // BLAS does not like empty matrices
     if (nx == 0 || ny == 0) {
@@ -270,7 +270,7 @@ void exhaustive_L2sqr_fused_cmax(
                 x, y, y_transposed.data(), ny, res, y_norms, i);
     }
 
-    // Does nothing for SingleBestResultHandler, but
+    // Does nothing for Top1BlockResultHandler, but
     // keeping the call for the consistency.
     res.end_multiple();
     InterruptCallback::check();
@@ -284,7 +284,7 @@ bool exhaustive_L2sqr_fused_cmax_simdlib(
         size_t d,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
        const float* y_norms) {
     // Process only cases with certain dimensionalities.
     // An acceptable dimensionality value is limited by the number of
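
These three signature changes are all the same rename: the vendored faiss sources now call the single-best collector Top1BlockResultHandler. Conceptually, a top-1 handler keeps only the nearest neighbor per query instead of a full k-best heap. A minimal sketch of the idea (a simplified illustration, not faiss's actual templated implementation):

    #include <cstdint>
    #include <limits>

    struct Top1Sketch {
        float best_dis = std::numeric_limits<float>::max();
        int64_t best_id = -1;

        // keep a candidate only if it beats the current best
        void add_result(float dis, int64_t id) {
            if (dis < best_dis) {
                best_dis = dis;
                best_id = id;
            }
        }
    };
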
@@ -223,6 +223,76 @@ float fvec_L2sqr(const float* x, const float* y, size_t d) {
 }
 FAISS_PRAGMA_IMPRECISE_FUNCTION_END
 
+/// Special version of inner product that computes 4 distances
+/// between x and yi
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+void fvec_inner_product_batch_4(
+        const float* __restrict x,
+        const float* __restrict y0,
+        const float* __restrict y1,
+        const float* __restrict y2,
+        const float* __restrict y3,
+        const size_t d,
+        float& dis0,
+        float& dis1,
+        float& dis2,
+        float& dis3) {
+    float d0 = 0;
+    float d1 = 0;
+    float d2 = 0;
+    float d3 = 0;
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; ++i) {
+        d0 += x[i] * y0[i];
+        d1 += x[i] * y1[i];
+        d2 += x[i] * y2[i];
+        d3 += x[i] * y3[i];
+    }
+
+    dis0 = d0;
+    dis1 = d1;
+    dis2 = d2;
+    dis3 = d3;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+/// Special version of L2sqr that computes 4 distances
+/// between x and yi, which is performance oriented.
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+void fvec_L2sqr_batch_4(
+        const float* x,
+        const float* y0,
+        const float* y1,
+        const float* y2,
+        const float* y3,
+        const size_t d,
+        float& dis0,
+        float& dis1,
+        float& dis2,
+        float& dis3) {
+    float d0 = 0;
+    float d1 = 0;
+    float d2 = 0;
+    float d3 = 0;
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; ++i) {
+        const float q0 = x[i] - y0[i];
+        const float q1 = x[i] - y1[i];
+        const float q2 = x[i] - y2[i];
+        const float q3 = x[i] - y3[i];
+        d0 += q0 * q0;
+        d1 += q1 * q1;
+        d2 += q2 * q2;
+        d3 += q3 * q3;
+    }
+
+    dis0 = d0;
+    dis1 = d1;
+    dis2 = d2;
+    dis3 = d3;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
 /*********************************************************
  * SSE and AVX implementations
  */
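
The batch-of-4 kernels are an instruction-level-parallelism trick: four independent accumulators break the dependency chain of a single running sum, and FAISS_PRAGMA_IMPRECISE_LOOP licenses the compiler to reassociate and vectorize the loop. A hypothetical call site, computing one query q against four rows of a row-major d-dimensional table db (variable names assumed):

    float d0, d1, d2, d3;
    fvec_L2sqr_batch_4(
            q,
            db + 0 * d, db + 1 * d, db + 2 * d, db + 3 * d,
            d,
            d0, d1, d2, d3);
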
@@ -236,8 +306,10 @@ static inline __m128 masked_read(int d, const float* x) {
     switch (d) {
         case 3:
             buf[2] = x[2];
+            [[fallthrough]];
         case 2:
             buf[1] = x[1];
+            [[fallthrough]];
         case 1:
             buf[0] = x[0];
     }
@@ -247,6 +319,33 @@ static inline __m128 masked_read(int d, const float* x) {
 
 namespace {
 
+/// helper function
+inline float horizontal_sum(const __m128 v) {
+    // say, v is [x0, x1, x2, x3]
+
+    // v0 is [x2, x3, ..., ...]
+    const __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 3, 2));
+    // v1 is [x0 + x2, x1 + x3, ..., ...]
+    const __m128 v1 = _mm_add_ps(v, v0);
+    // v2 is [x1 + x3, ..., ..., ...]
+    __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1));
+    // v3 is [x0 + x1 + x2 + x3, ..., ..., ...]
+    const __m128 v3 = _mm_add_ps(v1, v2);
+    // return v3[0]
+    return _mm_cvtss_f32(v3);
+}
+
+#ifdef __AVX2__
+/// helper function for AVX2
+inline float horizontal_sum(const __m256 v) {
+    // add high and low parts
+    const __m128 v0 =
+            _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
+    // perform horizontal sum on v0
+    return horizontal_sum(v0);
+}
+#endif
+
 /// Function that does a component-wise operation between x and y
 /// to compute L2 distances. ElementOp can then be used in the fvec_op_ny
 /// functions below
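
The shuffle/add ladder is the usual replacement for chained _mm_hadd_ps, which decodes to several micro-ops on most x86 cores. A scalar reference for what the SSE overload computes, with the same pairing order (assuming lane order [x0, x1, x2, x3]):

    // (x0 + x2) + (x1 + x3), matching the shuffle/add pairing above
    float horizontal_sum_ref(const float v[4]) {
        return (v[0] + v[2]) + (v[1] + v[3]);
    }
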
@@ -260,6 +359,13 @@ struct ElementOpL2 {
         __m128 tmp = _mm_sub_ps(x, y);
         return _mm_mul_ps(tmp, tmp);
     }
+
+#ifdef __AVX2__
+    static __m256 op(__m256 x, __m256 y) {
+        __m256 tmp = _mm256_sub_ps(x, y);
+        return _mm256_mul_ps(tmp, tmp);
+    }
+#endif
 };
 
 /// Function that does a component-wise operation between x and y
@@ -272,6 +378,12 @@ struct ElementOpIP {
     static __m128 op(__m128 x, __m128 y) {
         return _mm_mul_ps(x, y);
     }
+
+#ifdef __AVX2__
+    static __m256 op(__m256 x, __m256 y) {
+        return _mm256_mul_ps(x, y);
+    }
+#endif
 };
 
 template <class ElementOp>
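
With the AVX2 overloads in place, ElementOpL2::op and ElementOpIP::op can be applied to either __m128 or __m256 operands, so the templated fvec_op_ny_D* kernels and their AVX2 specializations share one element-wise definition. A simplified sketch of how a kernel consumes the functor (illustrative only, not the exact faiss code):

    template <class ElementOp>
    float distance_D4_sketch(const float* x, const float* y) {
        // per-lane partials (products or squared differences), then reduce
        __m128 accu = ElementOp::op(_mm_loadu_ps(x), _mm_loadu_ps(y));
        return horizontal_sum(accu);
    }
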
@@ -314,6 +426,131 @@ void fvec_op_ny_D2(float* dis, const float* x, const float* y, size_t ny) {
     }
 }
 
+#ifdef __AVX2__
+
+template <>
+void fvec_op_ny_D2<ElementOpIP>(
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t ny) {
+    const size_t ny8 = ny / 8;
+    size_t i = 0;
+
+    if (ny8 > 0) {
+        // process 8 D2-vectors per loop.
+        _mm_prefetch((const char*)y, _MM_HINT_T0);
+        _mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
+
+        const __m256 m0 = _mm256_set1_ps(x[0]);
+        const __m256 m1 = _mm256_set1_ps(x[1]);
+
+        for (i = 0; i < ny8 * 8; i += 8) {
+            _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
+
+            // load 8x2 matrix and transpose it in registers.
+            // the typical bottleneck is memory access, so
+            // let's trade instructions for the bandwidth.
+
+            __m256 v0;
+            __m256 v1;
+
+            transpose_8x2(
+                    _mm256_loadu_ps(y + 0 * 8),
+                    _mm256_loadu_ps(y + 1 * 8),
+                    v0,
+                    v1);
+
+            // compute distances
+            __m256 distances = _mm256_mul_ps(m0, v0);
+            distances = _mm256_fmadd_ps(m1, v1, distances);
+
+            // store
+            _mm256_storeu_ps(dis + i, distances);
+
+            y += 16;
+        }
+    }
+
+    if (i < ny) {
+        // process leftovers
+        float x0 = x[0];
+        float x1 = x[1];
+
+        for (; i < ny; i++) {
+            float distance = x0 * y[0] + x1 * y[1];
+            y += 2;
+            dis[i] = distance;
+        }
+    }
+}
+
+template <>
+void fvec_op_ny_D2<ElementOpL2>(
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t ny) {
+    const size_t ny8 = ny / 8;
+    size_t i = 0;
+
+    if (ny8 > 0) {
+        // process 8 D2-vectors per loop.
+        _mm_prefetch((const char*)y, _MM_HINT_T0);
+        _mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
+
+        const __m256 m0 = _mm256_set1_ps(x[0]);
+        const __m256 m1 = _mm256_set1_ps(x[1]);
+
+        for (i = 0; i < ny8 * 8; i += 8) {
+            _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
+
+            // load 8x2 matrix and transpose it in registers.
+            // the typical bottleneck is memory access, so
+            // let's trade instructions for the bandwidth.
+
+            __m256 v0;
+            __m256 v1;
+
+            transpose_8x2(
+                    _mm256_loadu_ps(y + 0 * 8),
+                    _mm256_loadu_ps(y + 1 * 8),
+                    v0,
+                    v1);
+
+            // compute differences
+            const __m256 d0 = _mm256_sub_ps(m0, v0);
+            const __m256 d1 = _mm256_sub_ps(m1, v1);
+
+            // compute squares of differences
+            __m256 distances = _mm256_mul_ps(d0, d0);
+            distances = _mm256_fmadd_ps(d1, d1, distances);
+
+            // store
+            _mm256_storeu_ps(dis + i, distances);
+
+            y += 16;
+        }
+    }
+
+    if (i < ny) {
+        // process leftovers
+        float x0 = x[0];
+        float x1 = x[1];
+
+        for (; i < ny; i++) {
+            float sub0 = x0 - y[0];
+            float sub1 = x1 - y[1];
+            float distance = sub0 * sub0 + sub1 * sub1;
+
+            y += 2;
+            dis[i] = distance;
+        }
+    }
+}
+
+#endif
+
 template <class ElementOp>
 void fvec_op_ny_D4(float* dis, const float* x, const float* y, size_t ny) {
     __m128 x0 = _mm_loadu_ps(x);
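
The new D2 specializations turn a horizontal problem into a vertical one: transpose_8x2 deinterleaves eight packed (c0, c1) pairs into one register of eight first components and one of eight second components, after which eight results fall out of a multiply plus an FMA, with no per-vector reduction. A scalar model of the inner-product variant (assumed semantics of the kernel above):

    #include <cstddef>

    // dis[i] = <x, y_i> for ny vectors of dimension 2
    void fvec_ip_ny_D2_ref(float* dis, const float* x, const float* y, size_t ny) {
        for (size_t i = 0; i < ny; i++, y += 2) {
            dis[i] = x[0] * y[0] + x[1] * y[1];
        }
    }
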
@@ -321,17 +558,12 @@ void fvec_op_ny_D4(float* dis, const float* x, const float* y, size_t ny) {
     for (size_t i = 0; i < ny; i++) {
         __m128 accu = ElementOp::op(x0, _mm_loadu_ps(y));
         y += 4;
-        accu = _mm_hadd_ps(accu, accu);
-        accu = _mm_hadd_ps(accu, accu);
-        dis[i] = _mm_cvtss_f32(accu);
+        dis[i] = horizontal_sum(accu);
     }
 }
 
 #ifdef __AVX2__
 
-// Specialized versions for AVX2 for any CPUs that support gather/scatter.
-// Todo: implement fvec_op_ny_Dxxx in the same way.
-
 template <>
 void fvec_op_ny_D4<ElementOpIP>(
         float* dis,
@@ -343,16 +575,9 @@ void fvec_op_ny_D4<ElementOpIP>(
 
     if (ny8 > 0) {
         // process 8 D4-vectors per loop.
-        _mm_prefetch(y, _MM_HINT_NTA);
-        _mm_prefetch(y + 16, _MM_HINT_NTA);
-
-        // m0 = (x[0], x[0], x[0], x[0], x[0], x[0], x[0], x[0])
         const __m256 m0 = _mm256_set1_ps(x[0]);
-        // m1 = (x[1], x[1], x[1], x[1], x[1], x[1], x[1], x[1])
         const __m256 m1 = _mm256_set1_ps(x[1]);
-        // m2 = (x[2], x[2], x[2], x[2], x[2], x[2], x[2], x[2])
         const __m256 m2 = _mm256_set1_ps(x[2]);
-        // m3 = (x[3], x[3], x[3], x[3], x[3], x[3], x[3], x[3])
         const __m256 m3 = _mm256_set1_ps(x[3]);
 
         for (i = 0; i < ny8 * 8; i += 8) {
@@ -395,9 +620,7 @@
         for (; i < ny; i++) {
             __m128 accu = ElementOpIP::op(x0, _mm_loadu_ps(y));
             y += 4;
-            accu = _mm_hadd_ps(accu, accu);
-            accu = _mm_hadd_ps(accu, accu);
-            dis[i] = _mm_cvtss_f32(accu);
+            dis[i] = horizontal_sum(accu);
         }
     }
 }
@@ -413,16 +636,9 @@ void fvec_op_ny_D4<ElementOpL2>(
 
     if (ny8 > 0) {
         // process 8 D4-vectors per loop.
-        _mm_prefetch(y, _MM_HINT_NTA);
-        _mm_prefetch(y + 16, _MM_HINT_NTA);
-
-        // m0 = (x[0], x[0], x[0], x[0], x[0], x[0], x[0], x[0])
         const __m256 m0 = _mm256_set1_ps(x[0]);
-        // m1 = (x[1], x[1], x[1], x[1], x[1], x[1], x[1], x[1])
         const __m256 m1 = _mm256_set1_ps(x[1]);
-        // m2 = (x[2], x[2], x[2], x[2], x[2], x[2], x[2], x[2])
         const __m256 m2 = _mm256_set1_ps(x[2]);
-        // m3 = (x[3], x[3], x[3], x[3], x[3], x[3], x[3], x[3])
         const __m256 m3 = _mm256_set1_ps(x[3]);
 
         for (i = 0; i < ny8 * 8; i += 8) {
@@ -471,9 +687,7 @@
         for (; i < ny; i++) {
             __m128 accu = ElementOpL2::op(x0, _mm_loadu_ps(y));
             y += 4;
-            accu = _mm_hadd_ps(accu, accu);
-            accu = _mm_hadd_ps(accu, accu);
-            dis[i] = _mm_cvtss_f32(accu);
+            dis[i] = horizontal_sum(accu);
         }
     }
 }
@@ -496,6 +710,182 @@ void fvec_op_ny_D8(float* dis, const float* x, const float* y, size_t ny) {
     }
 }
 
+#ifdef __AVX2__
+
+template <>
+void fvec_op_ny_D8<ElementOpIP>(
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t ny) {
+    const size_t ny8 = ny / 8;
+    size_t i = 0;
+
+    if (ny8 > 0) {
+        // process 8 D8-vectors per loop.
+        const __m256 m0 = _mm256_set1_ps(x[0]);
+        const __m256 m1 = _mm256_set1_ps(x[1]);
+        const __m256 m2 = _mm256_set1_ps(x[2]);
+        const __m256 m3 = _mm256_set1_ps(x[3]);
+        const __m256 m4 = _mm256_set1_ps(x[4]);
+        const __m256 m5 = _mm256_set1_ps(x[5]);
+        const __m256 m6 = _mm256_set1_ps(x[6]);
+        const __m256 m7 = _mm256_set1_ps(x[7]);
+
+        for (i = 0; i < ny8 * 8; i += 8) {
+            // load 8x8 matrix and transpose it in registers.
+            // the typical bottleneck is memory access, so
+            // let's trade instructions for the bandwidth.
+
+            __m256 v0;
+            __m256 v1;
+            __m256 v2;
+            __m256 v3;
+            __m256 v4;
+            __m256 v5;
+            __m256 v6;
+            __m256 v7;
+
+            transpose_8x8(
+                    _mm256_loadu_ps(y + 0 * 8),
+                    _mm256_loadu_ps(y + 1 * 8),
+                    _mm256_loadu_ps(y + 2 * 8),
+                    _mm256_loadu_ps(y + 3 * 8),
+                    _mm256_loadu_ps(y + 4 * 8),
+                    _mm256_loadu_ps(y + 5 * 8),
+                    _mm256_loadu_ps(y + 6 * 8),
+                    _mm256_loadu_ps(y + 7 * 8),
+                    v0,
+                    v1,
+                    v2,
+                    v3,
+                    v4,
+                    v5,
+                    v6,
+                    v7);
+
+            // compute distances
+            __m256 distances = _mm256_mul_ps(m0, v0);
+            distances = _mm256_fmadd_ps(m1, v1, distances);
+            distances = _mm256_fmadd_ps(m2, v2, distances);
+            distances = _mm256_fmadd_ps(m3, v3, distances);
+            distances = _mm256_fmadd_ps(m4, v4, distances);
+            distances = _mm256_fmadd_ps(m5, v5, distances);
+            distances = _mm256_fmadd_ps(m6, v6, distances);
+            distances = _mm256_fmadd_ps(m7, v7, distances);
+
+            // store
+            _mm256_storeu_ps(dis + i, distances);
+
+            y += 64;
+        }
+    }
+
+    if (i < ny) {
+        // process leftovers
+        __m256 x0 = _mm256_loadu_ps(x);
+
+        for (; i < ny; i++) {
+            __m256 accu = ElementOpIP::op(x0, _mm256_loadu_ps(y));
+            y += 8;
+            dis[i] = horizontal_sum(accu);
+        }
+    }
+}
+
+template <>
+void fvec_op_ny_D8<ElementOpL2>(
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t ny) {
+    const size_t ny8 = ny / 8;
+    size_t i = 0;
+
+    if (ny8 > 0) {
+        // process 8 D8-vectors per loop.
+        const __m256 m0 = _mm256_set1_ps(x[0]);
+        const __m256 m1 = _mm256_set1_ps(x[1]);
+        const __m256 m2 = _mm256_set1_ps(x[2]);
+        const __m256 m3 = _mm256_set1_ps(x[3]);
+        const __m256 m4 = _mm256_set1_ps(x[4]);
+        const __m256 m5 = _mm256_set1_ps(x[5]);
+        const __m256 m6 = _mm256_set1_ps(x[6]);
+        const __m256 m7 = _mm256_set1_ps(x[7]);
+
+        for (i = 0; i < ny8 * 8; i += 8) {
+            // load 8x8 matrix and transpose it in registers.
+            // the typical bottleneck is memory access, so
+            // let's trade instructions for the bandwidth.
+
+            __m256 v0;
+            __m256 v1;
+            __m256 v2;
+            __m256 v3;
+            __m256 v4;
+            __m256 v5;
+            __m256 v6;
+            __m256 v7;
+
+            transpose_8x8(
+                    _mm256_loadu_ps(y + 0 * 8),
+                    _mm256_loadu_ps(y + 1 * 8),
+                    _mm256_loadu_ps(y + 2 * 8),
+                    _mm256_loadu_ps(y + 3 * 8),
+                    _mm256_loadu_ps(y + 4 * 8),
+                    _mm256_loadu_ps(y + 5 * 8),
+                    _mm256_loadu_ps(y + 6 * 8),
+                    _mm256_loadu_ps(y + 7 * 8),
+                    v0,
+                    v1,
+                    v2,
+                    v3,
+                    v4,
+                    v5,
+                    v6,
+                    v7);
+
+            // compute differences
+            const __m256 d0 = _mm256_sub_ps(m0, v0);
+            const __m256 d1 = _mm256_sub_ps(m1, v1);
+            const __m256 d2 = _mm256_sub_ps(m2, v2);
+            const __m256 d3 = _mm256_sub_ps(m3, v3);
+            const __m256 d4 = _mm256_sub_ps(m4, v4);
+            const __m256 d5 = _mm256_sub_ps(m5, v5);
+            const __m256 d6 = _mm256_sub_ps(m6, v6);
+            const __m256 d7 = _mm256_sub_ps(m7, v7);
+
+            // compute squares of differences
+            __m256 distances = _mm256_mul_ps(d0, d0);
+            distances = _mm256_fmadd_ps(d1, d1, distances);
+            distances = _mm256_fmadd_ps(d2, d2, distances);
+            distances = _mm256_fmadd_ps(d3, d3, distances);
+            distances = _mm256_fmadd_ps(d4, d4, distances);
+            distances = _mm256_fmadd_ps(d5, d5, distances);
+            distances = _mm256_fmadd_ps(d6, d6, distances);
+            distances = _mm256_fmadd_ps(d7, d7, distances);
+
+            // store
+            _mm256_storeu_ps(dis + i, distances);
+
+            y += 64;
+        }
+    }
+
+    if (i < ny) {
+        // process leftovers
+        __m256 x0 = _mm256_loadu_ps(x);
+
+        for (; i < ny; i++) {
+            __m256 accu = ElementOpL2::op(x0, _mm256_loadu_ps(y));
+            y += 8;
+            dis[i] = horizontal_sum(accu);
+        }
+    }
+}
+
+#endif
+
 template <class ElementOp>
 void fvec_op_ny_D12(float* dis, const float* x, const float* y, size_t ny) {
     __m128 x0 = _mm_loadu_ps(x);
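
The D8 case is where the register transpose pays off most: transpose_8x8 turns eight rows of eight floats into eight column registers, so each iteration finishes eight complete distances with one multiply and seven FMAs, and a horizontal reduction survives only in the leftover loop. A scalar model of what one pass computes (assumed semantics of the kernel above):

    #include <cstddef>

    // dis[i] = <x, y_i> for ny vectors of dimension 8 (IP variant)
    void fvec_ip_ny_D8_ref(float* dis, const float* x, const float* y, size_t ny) {
        for (size_t i = 0; i < ny; i++, y += 8) {
            float accu = 0;
            for (size_t k = 0; k < 8; k++) {
                accu += x[k] * y[k];
            }
            dis[i] = accu;
        }
    }
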
@@ -509,9 +899,7 @@ void fvec_op_ny_D12(float* dis, const float* x, const float* y, size_t ny) {
         y += 4;
         accu = _mm_add_ps(accu, ElementOp::op(x2, _mm_loadu_ps(y)));
         y += 4;
-        accu = _mm_hadd_ps(accu, accu);
-        accu = _mm_hadd_ps(accu, accu);
-        dis[i] = _mm_cvtss_f32(accu);
+        dis[i] = horizontal_sum(accu);
     }
 }
 
@@ -581,7 +969,6 @@ void fvec_L2sqr_ny_y_transposed_D(
 
     // squared length of x
     float x_sqlen = 0;
-    ;
     for (size_t j = 0; j < DIM; j++) {
         x_sqlen += x[j] * x[j];
     }
@@ -697,8 +1084,8 @@ size_t fvec_L2sqr_ny_nearest_D2(
     // process 8 D2-vectors per loop.
     const size_t ny8 = ny / 8;
     if (ny8 > 0) {
-        _mm_prefetch(y, _MM_HINT_T0);
-        _mm_prefetch(y + 16, _MM_HINT_T0);
+        _mm_prefetch((const char*)y, _MM_HINT_T0);
+        _mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
 
         // track min distance and the closest vector independently
         // for each of 8 AVX2 components.
@@ -713,7 +1100,7 @@
         const __m256 m1 = _mm256_set1_ps(x[1]);
 
         for (; i < ny8 * 8; i += 8) {
-            _mm_prefetch(y + 32, _MM_HINT_T0);
+            _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
 
             __m256 v0;
             __m256 v1;
@@ -892,10 +1279,7 @@ size_t fvec_L2sqr_ny_nearest_D4(
         for (; i < ny; i++) {
             __m128 accu = ElementOpL2::op(x0, _mm_loadu_ps(y));
             y += 4;
-            accu = _mm_hadd_ps(accu, accu);
-            accu = _mm_hadd_ps(accu, accu);
-
-            const auto distance = _mm_cvtss_f32(accu);
+            const float distance = horizontal_sum(accu);
 
             if (current_min_distance > distance) {
                 current_min_distance = distance;
@@ -1031,23 +1415,9 @@ size_t fvec_L2sqr_ny_nearest_D8(
         __m256 x0 = _mm256_loadu_ps(x);
 
         for (; i < ny; i++) {
-            __m256 sub = _mm256_sub_ps(x0, _mm256_loadu_ps(y));
-            __m256 accu = _mm256_mul_ps(sub, sub);
+            __m256 accu = ElementOpL2::op(x0, _mm256_loadu_ps(y));
             y += 8;
-
-            // horitontal sum
-            const __m256 h0 = _mm256_hadd_ps(accu, accu);
-            const __m256 h1 = _mm256_hadd_ps(h0, h0);
-
-            // extract high and low __m128 regs from __m256
-            const __m128 h2 = _mm256_extractf128_ps(h1, 1);
-            const __m128 h3 = _mm256_castps256_ps128(h1);
-
-            // get a final hsum into all 4 regs
-            const __m128 h4 = _mm_add_ss(h2, h3);
-
-            // extract f[0] from __m128
-            const float distance = _mm_cvtss_f32(h4);
+            const float distance = horizontal_sum(accu);
 
             if (current_min_distance > distance) {
                 current_min_distance = distance;
@@ -1260,21 +1630,6 @@ size_t fvec_L2sqr_ny_nearest_y_transposed(
 
 #ifdef USE_AVX
 
-// reads 0 <= d < 8 floats as __m256
-static inline __m256 masked_read_8(int d, const float* x) {
-    assert(0 <= d && d < 8);
-    if (d < 4) {
-        __m256 res = _mm256_setzero_ps();
-        res = _mm256_insertf128_ps(res, masked_read(d, x), 0);
-        return res;
-    } else {
-        __m256 res = _mm256_setzero_ps();
-        res = _mm256_insertf128_ps(res, _mm_loadu_ps(x), 0);
-        res = _mm256_insertf128_ps(res, masked_read(d - 4, x + 4), 1);
-        return res;
-    }
-}
-
 float fvec_L1(const float* x, const float* y, size_t d) {
     __m256 msum1 = _mm256_setzero_ps();
     __m256 signmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffffUL));
@@ -1493,7 +1848,7 @@ void fvec_inner_products_ny(
  * heavily optimized table computations
  ***************************************************************************/
 
-static inline void fvec_madd_ref(
+[[maybe_unused]] static inline void fvec_madd_ref(
         size_t n,
         const float* a,
         float bf,
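
[[maybe_unused]] (C++17) keeps these reference and SSE fallbacks compiling cleanly when the build dispatches to another implementation, instead of tripping -Wunused-function. In general form (illustrative sketch):

    // compiled but possibly never referenced under some SIMD configurations
    [[maybe_unused]] static inline void fallback_impl() {}
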
@@ -1560,7 +1915,7 @@ static inline void fvec_madd_avx2(
 
 #ifdef __SSE3__
 
-static inline void fvec_madd_sse(
+[[maybe_unused]] static inline void fvec_madd_sse(
         size_t n,
         const float* a,
         float bf,
@@ -1807,10 +2162,13 @@ void pq2_8cents_table(
     switch (nout) {
         case 4:
             ip3.storeu(out + 3 * ldo);
+            [[fallthrough]];
         case 3:
             ip2.storeu(out + 2 * ldo);
+            [[fallthrough]];
        case 2:
             ip1.storeu(out + 1 * ldo);
+            [[fallthrough]];
         case 1:
             ip0.storeu(out);
     }