faiss 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/AutoTune.h +1 -1
  5. data/vendor/faiss/faiss/Clustering.cpp +35 -4
  6. data/vendor/faiss/faiss/Clustering.h +10 -1
  7. data/vendor/faiss/faiss/IVFlib.cpp +4 -1
  8. data/vendor/faiss/faiss/Index.h +21 -6
  9. data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
  10. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -1
  11. data/vendor/faiss/faiss/IndexFastScan.cpp +22 -4
  12. data/vendor/faiss/faiss/IndexFlat.cpp +11 -7
  13. data/vendor/faiss/faiss/IndexFlatCodes.cpp +159 -5
  14. data/vendor/faiss/faiss/IndexFlatCodes.h +20 -3
  15. data/vendor/faiss/faiss/IndexHNSW.cpp +143 -90
  16. data/vendor/faiss/faiss/IndexHNSW.h +52 -3
  17. data/vendor/faiss/faiss/IndexIVF.cpp +3 -3
  18. data/vendor/faiss/faiss/IndexIVF.h +9 -1
  19. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +15 -0
  20. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -0
  21. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +130 -57
  22. data/vendor/faiss/faiss/IndexIVFFastScan.h +14 -7
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +1 -3
  24. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +21 -2
  25. data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
  26. data/vendor/faiss/faiss/IndexLattice.h +3 -22
  27. data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -29
  28. data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
  29. data/vendor/faiss/faiss/IndexNSG.h +1 -1
  30. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
  31. data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
  32. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  33. data/vendor/faiss/faiss/IndexRefine.cpp +5 -5
  34. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +3 -1
  35. data/vendor/faiss/faiss/MetricType.h +7 -2
  36. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
  37. data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
  38. data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
  39. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
  40. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +36 -4
  41. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -0
  42. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
  43. data/vendor/faiss/faiss/gpu/GpuIndex.h +2 -8
  44. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +6 -0
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -0
  47. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +25 -0
  48. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
  49. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +6 -0
  50. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
  51. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +65 -0
  52. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
  53. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
  54. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
  55. data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
  56. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +25 -0
  57. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +9 -1
  58. data/vendor/faiss/faiss/impl/DistanceComputer.h +46 -0
  59. data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
  60. data/vendor/faiss/faiss/impl/HNSW.cpp +358 -190
  61. data/vendor/faiss/faiss/impl/HNSW.h +43 -22
  62. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +8 -8
  63. data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
  64. data/vendor/faiss/faiss/impl/NNDescent.cpp +13 -8
  65. data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +1 -0
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +5 -1
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +151 -32
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +719 -102
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -0
  71. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +5 -0
  72. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
  73. data/vendor/faiss/faiss/impl/index_read.cpp +29 -15
  74. data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
  75. data/vendor/faiss/faiss/impl/index_write.cpp +28 -10
  76. data/vendor/faiss/faiss/impl/io.cpp +13 -5
  77. data/vendor/faiss/faiss/impl/io.h +4 -4
  78. data/vendor/faiss/faiss/impl/io_macros.h +6 -0
  79. data/vendor/faiss/faiss/impl/platform_macros.h +22 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +11 -0
  81. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +1 -1
  82. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +448 -1
  83. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +5 -5
  84. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  85. data/vendor/faiss/faiss/impl/simd_result_handlers.h +143 -59
  86. data/vendor/faiss/faiss/index_factory.cpp +31 -13
  87. data/vendor/faiss/faiss/index_io.h +12 -5
  88. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
  89. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
  90. data/vendor/faiss/faiss/invlists/DirectMap.cpp +9 -1
  91. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +55 -17
  92. data/vendor/faiss/faiss/invlists/InvertedLists.h +18 -9
  93. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +21 -6
  94. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
  95. data/vendor/faiss/faiss/python/python_callbacks.cpp +3 -3
  96. data/vendor/faiss/faiss/utils/Heap.h +105 -0
  97. data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
  98. data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
  99. data/vendor/faiss/faiss/utils/bf16.h +36 -0
  100. data/vendor/faiss/faiss/utils/distances.cpp +58 -88
  101. data/vendor/faiss/faiss/utils/distances.h +5 -5
  102. data/vendor/faiss/faiss/utils/distances_simd.cpp +997 -9
  103. data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
  104. data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
  105. data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
  106. data/vendor/faiss/faiss/utils/hamming.cpp +1 -1
  107. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +4 -1
  108. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +2 -1
  109. data/vendor/faiss/faiss/utils/random.cpp +43 -0
  110. data/vendor/faiss/faiss/utils/random.h +25 -0
  111. data/vendor/faiss/faiss/utils/simdlib.h +10 -1
  112. data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
  113. data/vendor/faiss/faiss/utils/simdlib_neon.h +5 -2
  114. data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
  115. data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
  116. data/vendor/faiss/faiss/utils/utils.cpp +10 -3
  117. data/vendor/faiss/faiss/utils/utils.h +3 -0
  118. metadata +16 -4
  119. data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
@@ -23,7 +23,9 @@
23
23
  #include <immintrin.h>
24
24
  #endif
25
25
 
26
- #ifdef __AVX2__
26
+ #if defined(__AVX512F__)
27
+ #include <faiss/utils/transpose/transpose-avx512-inl.h>
28
+ #elif defined(__AVX2__)
27
29
  #include <faiss/utils/transpose/transpose-avx2-inl.h>
28
30
  #endif
29
31
 
@@ -346,6 +348,14 @@ inline float horizontal_sum(const __m256 v) {
346
348
  }
347
349
  #endif
348
350
 
351
+ #ifdef __AVX512F__
352
+ /// helper function for AVX512
353
+ inline float horizontal_sum(const __m512 v) {
354
+ // performs better than adding the high and low parts
355
+ return _mm512_reduce_add_ps(v);
356
+ }
357
+ #endif
358
+
349
359
  /// Function that does a component-wise operation between x and y
350
360
  /// to compute L2 distances. ElementOp can then be used in the fvec_op_ny
351
361
  /// functions below
@@ -366,6 +376,13 @@ struct ElementOpL2 {
366
376
  return _mm256_mul_ps(tmp, tmp);
367
377
  }
368
378
  #endif
379
+
380
+ #ifdef __AVX512F__
381
+ static __m512 op(__m512 x, __m512 y) {
382
+ __m512 tmp = _mm512_sub_ps(x, y);
383
+ return _mm512_mul_ps(tmp, tmp);
384
+ }
385
+ #endif
369
386
  };
370
387
 
371
388
  /// Function that does a component-wise operation between x and y
@@ -384,6 +401,12 @@ struct ElementOpIP {
384
401
  return _mm256_mul_ps(x, y);
385
402
  }
386
403
  #endif
404
+
405
+ #ifdef __AVX512F__
406
+ static __m512 op(__m512 x, __m512 y) {
407
+ return _mm512_mul_ps(x, y);
408
+ }
409
+ #endif
387
410
  };
388
411
 
389
412
  template <class ElementOp>
@@ -426,7 +449,130 @@ void fvec_op_ny_D2(float* dis, const float* x, const float* y, size_t ny) {
426
449
  }
427
450
  }
428
451
 
429
- #ifdef __AVX2__
452
+ #if defined(__AVX512F__)
453
+
454
+ template <>
455
+ void fvec_op_ny_D2<ElementOpIP>(
456
+ float* dis,
457
+ const float* x,
458
+ const float* y,
459
+ size_t ny) {
460
+ const size_t ny16 = ny / 16;
461
+ size_t i = 0;
462
+
463
+ if (ny16 > 0) {
464
+ // process 16 D2-vectors per loop.
465
+ _mm_prefetch((const char*)y, _MM_HINT_T0);
466
+ _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
467
+
468
+ const __m512 m0 = _mm512_set1_ps(x[0]);
469
+ const __m512 m1 = _mm512_set1_ps(x[1]);
470
+
471
+ for (i = 0; i < ny16 * 16; i += 16) {
472
+ _mm_prefetch((const char*)(y + 64), _MM_HINT_T0);
473
+
474
+ // load 16x2 matrix and transpose it in registers.
475
+ // the typical bottleneck is memory access, so
476
+ // let's trade instructions for the bandwidth.
477
+
478
+ __m512 v0;
479
+ __m512 v1;
480
+
481
+ transpose_16x2(
482
+ _mm512_loadu_ps(y + 0 * 16),
483
+ _mm512_loadu_ps(y + 1 * 16),
484
+ v0,
485
+ v1);
486
+
487
+ // compute distances (dot product)
488
+ __m512 distances = _mm512_mul_ps(m0, v0);
489
+ distances = _mm512_fmadd_ps(m1, v1, distances);
490
+
491
+ // store
492
+ _mm512_storeu_ps(dis + i, distances);
493
+
494
+ y += 32; // move to the next set of 16x2 elements
495
+ }
496
+ }
497
+
498
+ if (i < ny) {
499
+ // process leftovers
500
+ float x0 = x[0];
501
+ float x1 = x[1];
502
+
503
+ for (; i < ny; i++) {
504
+ float distance = x0 * y[0] + x1 * y[1];
505
+ y += 2;
506
+ dis[i] = distance;
507
+ }
508
+ }
509
+ }
510
+
511
+ template <>
512
+ void fvec_op_ny_D2<ElementOpL2>(
513
+ float* dis,
514
+ const float* x,
515
+ const float* y,
516
+ size_t ny) {
517
+ const size_t ny16 = ny / 16;
518
+ size_t i = 0;
519
+
520
+ if (ny16 > 0) {
521
+ // process 16 D2-vectors per loop.
522
+ _mm_prefetch((const char*)y, _MM_HINT_T0);
523
+ _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
524
+
525
+ const __m512 m0 = _mm512_set1_ps(x[0]);
526
+ const __m512 m1 = _mm512_set1_ps(x[1]);
527
+
528
+ for (i = 0; i < ny16 * 16; i += 16) {
529
+ _mm_prefetch((const char*)(y + 64), _MM_HINT_T0);
530
+
531
+ // load 16x2 matrix and transpose it in registers.
532
+ // the typical bottleneck is memory access, so
533
+ // let's trade instructions for the bandwidth.
534
+
535
+ __m512 v0;
536
+ __m512 v1;
537
+
538
+ transpose_16x2(
539
+ _mm512_loadu_ps(y + 0 * 16),
540
+ _mm512_loadu_ps(y + 1 * 16),
541
+ v0,
542
+ v1);
543
+
544
+ // compute differences
545
+ const __m512 d0 = _mm512_sub_ps(m0, v0);
546
+ const __m512 d1 = _mm512_sub_ps(m1, v1);
547
+
548
+ // compute squares of differences
549
+ __m512 distances = _mm512_mul_ps(d0, d0);
550
+ distances = _mm512_fmadd_ps(d1, d1, distances);
551
+
552
+ // store
553
+ _mm512_storeu_ps(dis + i, distances);
554
+
555
+ y += 32; // move to the next set of 16x2 elements
556
+ }
557
+ }
558
+
559
+ if (i < ny) {
560
+ // process leftovers
561
+ float x0 = x[0];
562
+ float x1 = x[1];
563
+
564
+ for (; i < ny; i++) {
565
+ float sub0 = x0 - y[0];
566
+ float sub1 = x1 - y[1];
567
+ float distance = sub0 * sub0 + sub1 * sub1;
568
+
569
+ y += 2;
570
+ dis[i] = distance;
571
+ }
572
+ }
573
+ }
574
+
575
+ #elif defined(__AVX2__)
430
576
 
431
577
  template <>
432
578
  void fvec_op_ny_D2<ElementOpIP>(
@@ -562,7 +708,137 @@ void fvec_op_ny_D4(float* dis, const float* x, const float* y, size_t ny) {
562
708
  }
563
709
  }
564
710
 
565
- #ifdef __AVX2__
711
+ #if defined(__AVX512F__)
712
+
713
+ template <>
714
+ void fvec_op_ny_D4<ElementOpIP>(
715
+ float* dis,
716
+ const float* x,
717
+ const float* y,
718
+ size_t ny) {
719
+ const size_t ny16 = ny / 16;
720
+ size_t i = 0;
721
+
722
+ if (ny16 > 0) {
723
+ // process 16 D4-vectors per loop.
724
+ const __m512 m0 = _mm512_set1_ps(x[0]);
725
+ const __m512 m1 = _mm512_set1_ps(x[1]);
726
+ const __m512 m2 = _mm512_set1_ps(x[2]);
727
+ const __m512 m3 = _mm512_set1_ps(x[3]);
728
+
729
+ for (i = 0; i < ny16 * 16; i += 16) {
730
+ // load 16x4 matrix and transpose it in registers.
731
+ // the typical bottleneck is memory access, so
732
+ // let's trade instructions for the bandwidth.
733
+
734
+ __m512 v0;
735
+ __m512 v1;
736
+ __m512 v2;
737
+ __m512 v3;
738
+
739
+ transpose_16x4(
740
+ _mm512_loadu_ps(y + 0 * 16),
741
+ _mm512_loadu_ps(y + 1 * 16),
742
+ _mm512_loadu_ps(y + 2 * 16),
743
+ _mm512_loadu_ps(y + 3 * 16),
744
+ v0,
745
+ v1,
746
+ v2,
747
+ v3);
748
+
749
+ // compute distances
750
+ __m512 distances = _mm512_mul_ps(m0, v0);
751
+ distances = _mm512_fmadd_ps(m1, v1, distances);
752
+ distances = _mm512_fmadd_ps(m2, v2, distances);
753
+ distances = _mm512_fmadd_ps(m3, v3, distances);
754
+
755
+ // store
756
+ _mm512_storeu_ps(dis + i, distances);
757
+
758
+ y += 64; // move to the next set of 16x4 elements
759
+ }
760
+ }
761
+
762
+ if (i < ny) {
763
+ // process leftovers
764
+ __m128 x0 = _mm_loadu_ps(x);
765
+
766
+ for (; i < ny; i++) {
767
+ __m128 accu = ElementOpIP::op(x0, _mm_loadu_ps(y));
768
+ y += 4;
769
+ dis[i] = horizontal_sum(accu);
770
+ }
771
+ }
772
+ }
773
+
774
+ template <>
775
+ void fvec_op_ny_D4<ElementOpL2>(
776
+ float* dis,
777
+ const float* x,
778
+ const float* y,
779
+ size_t ny) {
780
+ const size_t ny16 = ny / 16;
781
+ size_t i = 0;
782
+
783
+ if (ny16 > 0) {
784
+ // process 16 D4-vectors per loop.
785
+ const __m512 m0 = _mm512_set1_ps(x[0]);
786
+ const __m512 m1 = _mm512_set1_ps(x[1]);
787
+ const __m512 m2 = _mm512_set1_ps(x[2]);
788
+ const __m512 m3 = _mm512_set1_ps(x[3]);
789
+
790
+ for (i = 0; i < ny16 * 16; i += 16) {
791
+ // load 16x4 matrix and transpose it in registers.
792
+ // the typical bottleneck is memory access, so
793
+ // let's trade instructions for the bandwidth.
794
+
795
+ __m512 v0;
796
+ __m512 v1;
797
+ __m512 v2;
798
+ __m512 v3;
799
+
800
+ transpose_16x4(
801
+ _mm512_loadu_ps(y + 0 * 16),
802
+ _mm512_loadu_ps(y + 1 * 16),
803
+ _mm512_loadu_ps(y + 2 * 16),
804
+ _mm512_loadu_ps(y + 3 * 16),
805
+ v0,
806
+ v1,
807
+ v2,
808
+ v3);
809
+
810
+ // compute differences
811
+ const __m512 d0 = _mm512_sub_ps(m0, v0);
812
+ const __m512 d1 = _mm512_sub_ps(m1, v1);
813
+ const __m512 d2 = _mm512_sub_ps(m2, v2);
814
+ const __m512 d3 = _mm512_sub_ps(m3, v3);
815
+
816
+ // compute squares of differences
817
+ __m512 distances = _mm512_mul_ps(d0, d0);
818
+ distances = _mm512_fmadd_ps(d1, d1, distances);
819
+ distances = _mm512_fmadd_ps(d2, d2, distances);
820
+ distances = _mm512_fmadd_ps(d3, d3, distances);
821
+
822
+ // store
823
+ _mm512_storeu_ps(dis + i, distances);
824
+
825
+ y += 64; // move to the next set of 16x4 elements
826
+ }
827
+ }
828
+
829
+ if (i < ny) {
830
+ // process leftovers
831
+ __m128 x0 = _mm_loadu_ps(x);
832
+
833
+ for (; i < ny; i++) {
834
+ __m128 accu = ElementOpL2::op(x0, _mm_loadu_ps(y));
835
+ y += 4;
836
+ dis[i] = horizontal_sum(accu);
837
+ }
838
+ }
839
+ }
840
+
841
+ #elif defined(__AVX2__)
566
842
 
567
843
  template <>
568
844
  void fvec_op_ny_D4<ElementOpIP>(
@@ -710,7 +986,181 @@ void fvec_op_ny_D8(float* dis, const float* x, const float* y, size_t ny) {
710
986
  }
711
987
  }
712
988
 
713
- #ifdef __AVX2__
989
+ #if defined(__AVX512F__)
990
+
991
+ template <>
992
+ void fvec_op_ny_D8<ElementOpIP>(
993
+ float* dis,
994
+ const float* x,
995
+ const float* y,
996
+ size_t ny) {
997
+ const size_t ny16 = ny / 16;
998
+ size_t i = 0;
999
+
1000
+ if (ny16 > 0) {
1001
+ // process 16 D8-vectors per loop.
1002
+ const __m512 m0 = _mm512_set1_ps(x[0]);
1003
+ const __m512 m1 = _mm512_set1_ps(x[1]);
1004
+ const __m512 m2 = _mm512_set1_ps(x[2]);
1005
+ const __m512 m3 = _mm512_set1_ps(x[3]);
1006
+ const __m512 m4 = _mm512_set1_ps(x[4]);
1007
+ const __m512 m5 = _mm512_set1_ps(x[5]);
1008
+ const __m512 m6 = _mm512_set1_ps(x[6]);
1009
+ const __m512 m7 = _mm512_set1_ps(x[7]);
1010
+
1011
+ for (i = 0; i < ny16 * 16; i += 16) {
1012
+ // load 16x8 matrix and transpose it in registers.
1013
+ // the typical bottleneck is memory access, so
1014
+ // let's trade instructions for the bandwidth.
1015
+
1016
+ __m512 v0;
1017
+ __m512 v1;
1018
+ __m512 v2;
1019
+ __m512 v3;
1020
+ __m512 v4;
1021
+ __m512 v5;
1022
+ __m512 v6;
1023
+ __m512 v7;
1024
+
1025
+ transpose_16x8(
1026
+ _mm512_loadu_ps(y + 0 * 16),
1027
+ _mm512_loadu_ps(y + 1 * 16),
1028
+ _mm512_loadu_ps(y + 2 * 16),
1029
+ _mm512_loadu_ps(y + 3 * 16),
1030
+ _mm512_loadu_ps(y + 4 * 16),
1031
+ _mm512_loadu_ps(y + 5 * 16),
1032
+ _mm512_loadu_ps(y + 6 * 16),
1033
+ _mm512_loadu_ps(y + 7 * 16),
1034
+ v0,
1035
+ v1,
1036
+ v2,
1037
+ v3,
1038
+ v4,
1039
+ v5,
1040
+ v6,
1041
+ v7);
1042
+
1043
+ // compute distances
1044
+ __m512 distances = _mm512_mul_ps(m0, v0);
1045
+ distances = _mm512_fmadd_ps(m1, v1, distances);
1046
+ distances = _mm512_fmadd_ps(m2, v2, distances);
1047
+ distances = _mm512_fmadd_ps(m3, v3, distances);
1048
+ distances = _mm512_fmadd_ps(m4, v4, distances);
1049
+ distances = _mm512_fmadd_ps(m5, v5, distances);
1050
+ distances = _mm512_fmadd_ps(m6, v6, distances);
1051
+ distances = _mm512_fmadd_ps(m7, v7, distances);
1052
+
1053
+ // store
1054
+ _mm512_storeu_ps(dis + i, distances);
1055
+
1056
+ y += 128; // 16 floats * 8 rows
1057
+ }
1058
+ }
1059
+
1060
+ if (i < ny) {
1061
+ // process leftovers
1062
+ __m256 x0 = _mm256_loadu_ps(x);
1063
+
1064
+ for (; i < ny; i++) {
1065
+ __m256 accu = ElementOpIP::op(x0, _mm256_loadu_ps(y));
1066
+ y += 8;
1067
+ dis[i] = horizontal_sum(accu);
1068
+ }
1069
+ }
1070
+ }
1071
+
1072
+ template <>
1073
+ void fvec_op_ny_D8<ElementOpL2>(
1074
+ float* dis,
1075
+ const float* x,
1076
+ const float* y,
1077
+ size_t ny) {
1078
+ const size_t ny16 = ny / 16;
1079
+ size_t i = 0;
1080
+
1081
+ if (ny16 > 0) {
1082
+ // process 16 D8-vectors per loop.
1083
+ const __m512 m0 = _mm512_set1_ps(x[0]);
1084
+ const __m512 m1 = _mm512_set1_ps(x[1]);
1085
+ const __m512 m2 = _mm512_set1_ps(x[2]);
1086
+ const __m512 m3 = _mm512_set1_ps(x[3]);
1087
+ const __m512 m4 = _mm512_set1_ps(x[4]);
1088
+ const __m512 m5 = _mm512_set1_ps(x[5]);
1089
+ const __m512 m6 = _mm512_set1_ps(x[6]);
1090
+ const __m512 m7 = _mm512_set1_ps(x[7]);
1091
+
1092
+ for (i = 0; i < ny16 * 16; i += 16) {
1093
+ // load 16x8 matrix and transpose it in registers.
1094
+ // the typical bottleneck is memory access, so
1095
+ // let's trade instructions for the bandwidth.
1096
+
1097
+ __m512 v0;
1098
+ __m512 v1;
1099
+ __m512 v2;
1100
+ __m512 v3;
1101
+ __m512 v4;
1102
+ __m512 v5;
1103
+ __m512 v6;
1104
+ __m512 v7;
1105
+
1106
+ transpose_16x8(
1107
+ _mm512_loadu_ps(y + 0 * 16),
1108
+ _mm512_loadu_ps(y + 1 * 16),
1109
+ _mm512_loadu_ps(y + 2 * 16),
1110
+ _mm512_loadu_ps(y + 3 * 16),
1111
+ _mm512_loadu_ps(y + 4 * 16),
1112
+ _mm512_loadu_ps(y + 5 * 16),
1113
+ _mm512_loadu_ps(y + 6 * 16),
1114
+ _mm512_loadu_ps(y + 7 * 16),
1115
+ v0,
1116
+ v1,
1117
+ v2,
1118
+ v3,
1119
+ v4,
1120
+ v5,
1121
+ v6,
1122
+ v7);
1123
+
1124
+ // compute differences
1125
+ const __m512 d0 = _mm512_sub_ps(m0, v0);
1126
+ const __m512 d1 = _mm512_sub_ps(m1, v1);
1127
+ const __m512 d2 = _mm512_sub_ps(m2, v2);
1128
+ const __m512 d3 = _mm512_sub_ps(m3, v3);
1129
+ const __m512 d4 = _mm512_sub_ps(m4, v4);
1130
+ const __m512 d5 = _mm512_sub_ps(m5, v5);
1131
+ const __m512 d6 = _mm512_sub_ps(m6, v6);
1132
+ const __m512 d7 = _mm512_sub_ps(m7, v7);
1133
+
1134
+ // compute squares of differences
1135
+ __m512 distances = _mm512_mul_ps(d0, d0);
1136
+ distances = _mm512_fmadd_ps(d1, d1, distances);
1137
+ distances = _mm512_fmadd_ps(d2, d2, distances);
1138
+ distances = _mm512_fmadd_ps(d3, d3, distances);
1139
+ distances = _mm512_fmadd_ps(d4, d4, distances);
1140
+ distances = _mm512_fmadd_ps(d5, d5, distances);
1141
+ distances = _mm512_fmadd_ps(d6, d6, distances);
1142
+ distances = _mm512_fmadd_ps(d7, d7, distances);
1143
+
1144
+ // store
1145
+ _mm512_storeu_ps(dis + i, distances);
1146
+
1147
+ y += 128; // 16 floats * 8 rows
1148
+ }
1149
+ }
1150
+
1151
+ if (i < ny) {
1152
+ // process leftovers
1153
+ __m256 x0 = _mm256_loadu_ps(x);
1154
+
1155
+ for (; i < ny; i++) {
1156
+ __m256 accu = ElementOpL2::op(x0, _mm256_loadu_ps(y));
1157
+ y += 8;
1158
+ dis[i] = horizontal_sum(accu);
1159
+ }
1160
+ }
1161
+ }
1162
+
1163
+ #elif defined(__AVX2__)
714
1164
 
715
1165
  template <>
716
1166
  void fvec_op_ny_D8<ElementOpIP>(
@@ -955,7 +1405,83 @@ void fvec_inner_products_ny(
955
1405
  #undef DISPATCH
956
1406
  }
957
1407
 
958
- #ifdef __AVX2__
1408
+ #if defined(__AVX512F__)
1409
+
1410
+ template <size_t DIM>
1411
+ void fvec_L2sqr_ny_y_transposed_D(
1412
+ float* distances,
1413
+ const float* x,
1414
+ const float* y,
1415
+ const float* y_sqlen,
1416
+ const size_t d_offset,
1417
+ size_t ny) {
1418
+ // current index being processed
1419
+ size_t i = 0;
1420
+
1421
+ // squared length of x
1422
+ float x_sqlen = 0;
1423
+ for (size_t j = 0; j < DIM; j++) {
1424
+ x_sqlen += x[j] * x[j];
1425
+ }
1426
+
1427
+ // process 16 vectors per loop
1428
+ const size_t ny16 = ny / 16;
1429
+
1430
+ if (ny16 > 0) {
1431
+ // m[i] = (2 * x[i], ... 2 * x[i])
1432
+ __m512 m[DIM];
1433
+ for (size_t j = 0; j < DIM; j++) {
1434
+ m[j] = _mm512_set1_ps(x[j]);
1435
+ m[j] = _mm512_add_ps(m[j], m[j]); // m[j] = 2 * x[j]
1436
+ }
1437
+
1438
+ __m512 x_sqlen_ymm = _mm512_set1_ps(x_sqlen);
1439
+
1440
+ for (; i < ny16 * 16; i += 16) {
1441
+ // Load 16 consecutive y components for each of the DIM dimensions
1442
+ __m512 v[DIM];
1443
+ for (size_t j = 0; j < DIM; j++) {
1444
+ v[j] = _mm512_loadu_ps(y + j * d_offset);
1445
+ }
1446
+
1447
+ // Compute x^2 - (2 * x, y)
1448
+ __m512 dp = _mm512_fnmadd_ps(m[0], v[0], x_sqlen_ymm);
1449
+ for (size_t j = 1; j < DIM; j++) {
1450
+ dp = _mm512_fnmadd_ps(m[j], v[j], dp);
1451
+ }
1452
+
1453
+ // Compute y^2 - (2 * x, y) + x^2
1454
+ __m512 distances_v = _mm512_add_ps(_mm512_loadu_ps(y_sqlen), dp);
1455
+
1456
+ _mm512_storeu_ps(distances + i, distances_v);
1457
+
1458
+ // Scroll y and y_sqlen forward
1459
+ y += 16;
1460
+ y_sqlen += 16;
1461
+ }
1462
+ }
1463
+
1464
+ if (i < ny) {
1465
+ // Process leftovers
1466
+ for (; i < ny; i++) {
1467
+ float dp = 0;
1468
+ for (size_t j = 0; j < DIM; j++) {
1469
+ dp += x[j] * y[j * d_offset];
1470
+ }
1471
+
1472
+ // Compute the full squared distance y^2 - 2 * (x, y) + x^2, the same
1473
+ // quantity the 16-wide loop stores.
1474
+ const float distance = y_sqlen[0] - 2 * dp + x_sqlen;
1475
+ distances[i] = distance;
1476
+
1477
+ y += 1;
1478
+ y_sqlen += 1;
1479
+ }
1480
+ }
1481
+ }
1482
+
1483
+ #elif defined(__AVX2__)
1484
+
959
1485
  template <size_t DIM>
960
1486
  void fvec_L2sqr_ny_y_transposed_D(
961
1487
  float* distances,
@@ -1031,6 +1557,7 @@ void fvec_L2sqr_ny_y_transposed_D(
1031
1557
  }
1032
1558
  }
1033
1559
  }
1560
+
1034
1561
  #endif
1035
1562
 
1036
1563
  void fvec_L2sqr_ny_transposed(
@@ -1065,7 +1592,316 @@ void fvec_L2sqr_ny_transposed(
1065
1592
  #endif
1066
1593
  }
1067
1594
 
1068
- #ifdef __AVX2__
1595
+ #if defined(__AVX512F__)
1596
+
1597
+ size_t fvec_L2sqr_ny_nearest_D2(
1598
+ float* distances_tmp_buffer,
1599
+ const float* x,
1600
+ const float* y,
1601
+ size_t ny) {
1602
+ // this implementation does not use distances_tmp_buffer.
1603
+
1604
+ size_t i = 0;
1605
+ float current_min_distance = HUGE_VALF;
1606
+ size_t current_min_index = 0;
1607
+
1608
+ const size_t ny16 = ny / 16;
1609
+ if (ny16 > 0) {
1610
+ _mm_prefetch((const char*)y, _MM_HINT_T0);
1611
+ _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
1612
+
1613
+ __m512 min_distances = _mm512_set1_ps(HUGE_VALF);
1614
+ __m512i min_indices = _mm512_set1_epi32(0);
1615
+
1616
+ __m512i current_indices = _mm512_setr_epi32(
1617
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1618
+ const __m512i indices_increment = _mm512_set1_epi32(16);
1619
+
1620
+ const __m512 m0 = _mm512_set1_ps(x[0]);
1621
+ const __m512 m1 = _mm512_set1_ps(x[1]);
1622
+
1623
+ for (; i < ny16 * 16; i += 16) {
1624
+ _mm_prefetch((const char*)(y + 64), _MM_HINT_T0);
1625
+
1626
+ __m512 v0;
1627
+ __m512 v1;
1628
+
1629
+ transpose_16x2(
1630
+ _mm512_loadu_ps(y + 0 * 16),
1631
+ _mm512_loadu_ps(y + 1 * 16),
1632
+ v0,
1633
+ v1);
1634
+
1635
+ const __m512 d0 = _mm512_sub_ps(m0, v0);
1636
+ const __m512 d1 = _mm512_sub_ps(m1, v1);
1637
+
1638
+ __m512 distances = _mm512_mul_ps(d0, d0);
1639
+ distances = _mm512_fmadd_ps(d1, d1, distances);
1640
+
1641
+ __mmask16 comparison =
1642
+ _mm512_cmp_ps_mask(distances, min_distances, _CMP_LT_OS);
1643
+
1644
+ min_distances = _mm512_min_ps(distances, min_distances);
1645
+ min_indices = _mm512_mask_blend_epi32(
1646
+ comparison, min_indices, current_indices);
1647
+
1648
+ current_indices =
1649
+ _mm512_add_epi32(current_indices, indices_increment);
1650
+
1651
+ y += 32;
1652
+ }
1653
+
1654
+ alignas(64) float min_distances_scalar[16];
1655
+ alignas(64) uint32_t min_indices_scalar[16];
1656
+ _mm512_store_ps(min_distances_scalar, min_distances);
1657
+ _mm512_store_epi32(min_indices_scalar, min_indices);
1658
+
1659
+ for (size_t j = 0; j < 16; j++) {
1660
+ if (current_min_distance > min_distances_scalar[j]) {
1661
+ current_min_distance = min_distances_scalar[j];
1662
+ current_min_index = min_indices_scalar[j];
1663
+ }
1664
+ }
1665
+ }
1666
+
1667
+ if (i < ny) {
1668
+ float x0 = x[0];
1669
+ float x1 = x[1];
1670
+
1671
+ for (; i < ny; i++) {
1672
+ float sub0 = x0 - y[0];
1673
+ float sub1 = x1 - y[1];
1674
+ float distance = sub0 * sub0 + sub1 * sub1;
1675
+
1676
+ y += 2;
1677
+
1678
+ if (current_min_distance > distance) {
1679
+ current_min_distance = distance;
1680
+ current_min_index = i;
1681
+ }
1682
+ }
1683
+ }
1684
+
1685
+ return current_min_index;
1686
+ }
1687
+
1688
+ size_t fvec_L2sqr_ny_nearest_D4(
1689
+ float* distances_tmp_buffer,
1690
+ const float* x,
1691
+ const float* y,
1692
+ size_t ny) {
1693
+ // this implementation does not use distances_tmp_buffer.
1694
+
1695
+ size_t i = 0;
1696
+ float current_min_distance = HUGE_VALF;
1697
+ size_t current_min_index = 0;
1698
+
1699
+ const size_t ny16 = ny / 16;
1700
+
1701
+ if (ny16 > 0) {
1702
+ __m512 min_distances = _mm512_set1_ps(HUGE_VALF);
1703
+ __m512i min_indices = _mm512_set1_epi32(0);
1704
+
1705
+ __m512i current_indices = _mm512_setr_epi32(
1706
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1707
+ const __m512i indices_increment = _mm512_set1_epi32(16);
1708
+
1709
+ const __m512 m0 = _mm512_set1_ps(x[0]);
1710
+ const __m512 m1 = _mm512_set1_ps(x[1]);
1711
+ const __m512 m2 = _mm512_set1_ps(x[2]);
1712
+ const __m512 m3 = _mm512_set1_ps(x[3]);
1713
+
1714
+ for (; i < ny16 * 16; i += 16) {
1715
+ __m512 v0;
1716
+ __m512 v1;
1717
+ __m512 v2;
1718
+ __m512 v3;
1719
+
1720
+ transpose_16x4(
1721
+ _mm512_loadu_ps(y + 0 * 16),
1722
+ _mm512_loadu_ps(y + 1 * 16),
1723
+ _mm512_loadu_ps(y + 2 * 16),
1724
+ _mm512_loadu_ps(y + 3 * 16),
1725
+ v0,
1726
+ v1,
1727
+ v2,
1728
+ v3);
1729
+
1730
+ const __m512 d0 = _mm512_sub_ps(m0, v0);
1731
+ const __m512 d1 = _mm512_sub_ps(m1, v1);
1732
+ const __m512 d2 = _mm512_sub_ps(m2, v2);
1733
+ const __m512 d3 = _mm512_sub_ps(m3, v3);
1734
+
1735
+ __m512 distances = _mm512_mul_ps(d0, d0);
1736
+ distances = _mm512_fmadd_ps(d1, d1, distances);
1737
+ distances = _mm512_fmadd_ps(d2, d2, distances);
1738
+ distances = _mm512_fmadd_ps(d3, d3, distances);
1739
+
1740
+ __mmask16 comparison =
1741
+ _mm512_cmp_ps_mask(distances, min_distances, _CMP_LT_OS);
1742
+
1743
+ min_distances = _mm512_min_ps(distances, min_distances);
1744
+ min_indices = _mm512_mask_blend_epi32(
1745
+ comparison, min_indices, current_indices);
1746
+
1747
+ current_indices =
1748
+ _mm512_add_epi32(current_indices, indices_increment);
1749
+
1750
+ y += 64;
1751
+ }
1752
+
1753
+ alignas(64) float min_distances_scalar[16];
1754
+ alignas(64) uint32_t min_indices_scalar[16];
1755
+ _mm512_store_ps(min_distances_scalar, min_distances);
1756
+ _mm512_store_epi32(min_indices_scalar, min_indices);
1757
+
1758
+ for (size_t j = 0; j < 16; j++) {
1759
+ if (current_min_distance > min_distances_scalar[j]) {
1760
+ current_min_distance = min_distances_scalar[j];
1761
+ current_min_index = min_indices_scalar[j];
1762
+ }
1763
+ }
1764
+ }
1765
+
1766
+ if (i < ny) {
1767
+ __m128 x0 = _mm_loadu_ps(x);
1768
+
1769
+ for (; i < ny; i++) {
1770
+ __m128 accu = ElementOpL2::op(x0, _mm_loadu_ps(y));
1771
+ y += 4;
1772
+ const float distance = horizontal_sum(accu);
1773
+
1774
+ if (current_min_distance > distance) {
1775
+ current_min_distance = distance;
1776
+ current_min_index = i;
1777
+ }
1778
+ }
1779
+ }
1780
+
1781
+ return current_min_index;
1782
+ }
1783
+
1784
+ size_t fvec_L2sqr_ny_nearest_D8(
1785
+ float* distances_tmp_buffer,
1786
+ const float* x,
1787
+ const float* y,
1788
+ size_t ny) {
1789
+ // this implementation does not use distances_tmp_buffer.
1790
+
1791
+ size_t i = 0;
1792
+ float current_min_distance = HUGE_VALF;
1793
+ size_t current_min_index = 0;
1794
+
1795
+ const size_t ny16 = ny / 16;
1796
+ if (ny16 > 0) {
1797
+ __m512 min_distances = _mm512_set1_ps(HUGE_VALF);
1798
+ __m512i min_indices = _mm512_set1_epi32(0);
1799
+
1800
+ __m512i current_indices = _mm512_setr_epi32(
1801
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1802
+ const __m512i indices_increment = _mm512_set1_epi32(16);
1803
+
1804
+ const __m512 m0 = _mm512_set1_ps(x[0]);
1805
+ const __m512 m1 = _mm512_set1_ps(x[1]);
1806
+ const __m512 m2 = _mm512_set1_ps(x[2]);
1807
+ const __m512 m3 = _mm512_set1_ps(x[3]);
1808
+
1809
+ const __m512 m4 = _mm512_set1_ps(x[4]);
1810
+ const __m512 m5 = _mm512_set1_ps(x[5]);
1811
+ const __m512 m6 = _mm512_set1_ps(x[6]);
1812
+ const __m512 m7 = _mm512_set1_ps(x[7]);
1813
+
1814
+ for (; i < ny16 * 16; i += 16) {
1815
+ __m512 v0;
1816
+ __m512 v1;
1817
+ __m512 v2;
1818
+ __m512 v3;
1819
+ __m512 v4;
1820
+ __m512 v5;
1821
+ __m512 v6;
1822
+ __m512 v7;
1823
+
1824
+ transpose_16x8(
1825
+ _mm512_loadu_ps(y + 0 * 16),
1826
+ _mm512_loadu_ps(y + 1 * 16),
1827
+ _mm512_loadu_ps(y + 2 * 16),
1828
+ _mm512_loadu_ps(y + 3 * 16),
1829
+ _mm512_loadu_ps(y + 4 * 16),
1830
+ _mm512_loadu_ps(y + 5 * 16),
1831
+ _mm512_loadu_ps(y + 6 * 16),
1832
+ _mm512_loadu_ps(y + 7 * 16),
1833
+ v0,
1834
+ v1,
1835
+ v2,
1836
+ v3,
1837
+ v4,
1838
+ v5,
1839
+ v6,
1840
+ v7);
1841
+
1842
+ const __m512 d0 = _mm512_sub_ps(m0, v0);
1843
+ const __m512 d1 = _mm512_sub_ps(m1, v1);
1844
+ const __m512 d2 = _mm512_sub_ps(m2, v2);
1845
+ const __m512 d3 = _mm512_sub_ps(m3, v3);
1846
+ const __m512 d4 = _mm512_sub_ps(m4, v4);
1847
+ const __m512 d5 = _mm512_sub_ps(m5, v5);
1848
+ const __m512 d6 = _mm512_sub_ps(m6, v6);
1849
+ const __m512 d7 = _mm512_sub_ps(m7, v7);
1850
+
1851
+ __m512 distances = _mm512_mul_ps(d0, d0);
1852
+ distances = _mm512_fmadd_ps(d1, d1, distances);
1853
+ distances = _mm512_fmadd_ps(d2, d2, distances);
1854
+ distances = _mm512_fmadd_ps(d3, d3, distances);
1855
+ distances = _mm512_fmadd_ps(d4, d4, distances);
1856
+ distances = _mm512_fmadd_ps(d5, d5, distances);
1857
+ distances = _mm512_fmadd_ps(d6, d6, distances);
1858
+ distances = _mm512_fmadd_ps(d7, d7, distances);
1859
+
1860
+ __mmask16 comparison =
1861
+ _mm512_cmp_ps_mask(distances, min_distances, _CMP_LT_OS);
1862
+
1863
+ min_distances = _mm512_min_ps(distances, min_distances);
1864
+ min_indices = _mm512_mask_blend_epi32(
1865
+ comparison, min_indices, current_indices);
1866
+
1867
+ current_indices =
1868
+ _mm512_add_epi32(current_indices, indices_increment);
1869
+
1870
+ y += 128;
1871
+ }
1872
+
1873
+ alignas(64) float min_distances_scalar[16];
1874
+ alignas(64) uint32_t min_indices_scalar[16];
1875
+ _mm512_store_ps(min_distances_scalar, min_distances);
1876
+ _mm512_store_epi32(min_indices_scalar, min_indices);
1877
+
1878
+ for (size_t j = 0; j < 16; j++) {
1879
+ if (current_min_distance > min_distances_scalar[j]) {
1880
+ current_min_distance = min_distances_scalar[j];
1881
+ current_min_index = min_indices_scalar[j];
1882
+ }
1883
+ }
1884
+ }
1885
+
1886
+ if (i < ny) {
1887
+ __m256 x0 = _mm256_loadu_ps(x);
1888
+
1889
+ for (; i < ny; i++) {
1890
+ __m256 accu = ElementOpL2::op(x0, _mm256_loadu_ps(y));
1891
+ y += 8;
1892
+ const float distance = horizontal_sum(accu);
1893
+
1894
+ if (current_min_distance > distance) {
1895
+ current_min_distance = distance;
1896
+ current_min_index = i;
1897
+ }
1898
+ }
1899
+ }
1900
+
1901
+ return current_min_index;
1902
+ }
1903
+
1904
+ #elif defined(__AVX2__)
1069
1905
 
1070
1906
  size_t fvec_L2sqr_ny_nearest_D2(
1071
1907
  float* distances_tmp_buffer,
@@ -1476,7 +2312,123 @@ size_t fvec_L2sqr_ny_nearest(
1476
2312
  #undef DISPATCH
1477
2313
  }
1478
2314
 
1479
- #ifdef __AVX2__
2315
+ #if defined(__AVX512F__)
2316
+ // AVX-512 variant: scans ny candidate vectors stored component-major in y and returns the index i minimizing y_sqlen[i] - 2 * <x, y_i>.
2317
+ template <size_t DIM>
2318
+ size_t fvec_L2sqr_ny_nearest_y_transposed_D(
2319
+ float* distances_tmp_buffer, // not read or written by this variant
2320
+ const float* x, // query; x[0] .. x[DIM - 1] are read
2321
+ const float* y, // candidates; component j of candidate i is read from y[j * d_offset + i]
2322
+ const float* y_sqlen, // one value per candidate; presumably the squared norm of y_i - confirm with the caller
2323
+ const size_t d_offset, // distance, in floats, between component j and component j + 1 of the same candidate
2324
+ size_t ny) { // number of candidates to scan; 0 returns index 0
2325
+ // This implementation does not use distances_tmp_buffer; the parameter is only kept in the signature.
2326
+
2327
+ // Current index being processed
2328
+ size_t i = 0;
2329
+
2330
+ // Min distance and the index of the closest vector so far
2331
+ float current_min_distance = HUGE_VALF;
2332
+ size_t current_min_index = 0;
2333
+
2334
+ // Process 16 vectors per loop
2335
+ const size_t ny16 = ny / 16;
2336
+
2337
+ if (ny16 > 0) {
2338
+ // Track min distance and the closest vector independently
2339
+ // for each of 16 AVX-512 components.
2340
+ __m512 min_distances = _mm512_set1_ps(HUGE_VALF);
2341
+ __m512i min_indices = _mm512_set1_epi32(0); // candidate indices are tracked in 32-bit lanes
2342
+
2343
+ __m512i current_indices = _mm512_setr_epi32(
2344
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2345
+ const __m512i indices_increment = _mm512_set1_epi32(16);
2346
+
2347
+ // m[j] = 2 * x[j], broadcast to all 16 lanes
2348
+ __m512 m[DIM];
2349
+ for (size_t j = 0; j < DIM; j++) {
2350
+ m[j] = _mm512_set1_ps(x[j]);
2351
+ m[j] = _mm512_add_ps(m[j], m[j]);
2352
+ }
2353
+
2354
+ for (; i < ny16 * 16; i += 16) {
2355
+ // Compute dot products of 2 * x with 16 consecutive candidates
2356
+ const __m512 v0 = _mm512_loadu_ps(y + 0 * d_offset);
2357
+ __m512 dp = _mm512_mul_ps(m[0], v0);
2358
+ for (size_t j = 1; j < DIM; j++) {
2359
+ const __m512 vj = _mm512_loadu_ps(y + j * d_offset);
2360
+ dp = _mm512_fmadd_ps(m[j], vj, dp);
2361
+ }
2362
+
2363
+ // Compute y^2 - (2 * x, y), which is sufficient for looking for the
2364
+ // lowest distance.
2365
+ // x^2 is the constant that can be avoided.
2366
+ const __m512 distances =
2367
+ _mm512_sub_ps(_mm512_loadu_ps(y_sqlen), dp);
2368
+
2369
+ // Compare the new distances to the min distances
2370
+ __mmask16 comparison =
2371
+ _mm512_cmp_ps_mask(min_distances, distances, _CMP_LT_OS); // bit set where the stored minimum is strictly smaller
2372
+
2373
+ // Update min distances and indices with closest vectors if needed
2374
+ min_distances =
2375
+ _mm512_mask_blend_ps(comparison, distances, min_distances); // bit set: keep stored value; bit clear: take the new one (so an exact tie takes the new one)
2376
+ min_indices = _mm512_castps_si512(_mm512_mask_blend_ps(
2377
+ comparison,
2378
+ _mm512_castsi512_ps(current_indices),
2379
+ _mm512_castsi512_ps(min_indices))); // same selection applied to the indices; the casts only reinterpret bits
2380
+
2381
+ // Update current indices values. Basically, +16 to each of the 16
2382
+ // AVX-512 components.
2383
+ current_indices =
2384
+ _mm512_add_epi32(current_indices, indices_increment);
2385
+
2386
+ // Scroll y and y_sqlen forward.
2387
+ y += 16;
2388
+ y_sqlen += 16;
2389
+ }
2390
+
2391
+ // Dump values and find the minimum distance / minimum index
2392
+ float min_distances_scalar[16];
2393
+ uint32_t min_indices_scalar[16];
2394
+ _mm512_storeu_ps(min_distances_scalar, min_distances);
2395
+ _mm512_storeu_si512((__m512i*)(min_indices_scalar), min_indices);
2396
+
2397
+ for (size_t j = 0; j < 16; j++) {
2398
+ if (current_min_distance > min_distances_scalar[j]) { // strict: on equal values the lower-numbered lane wins
2399
+ current_min_distance = min_distances_scalar[j];
2400
+ current_min_index = min_indices_scalar[j];
2401
+ }
2402
+ }
2403
+ }
2404
+
2405
+ if (i < ny) {
2406
+ // Process leftovers (fewer than 16 candidates remain), one at a time
2407
+ for (; i < ny; i++) {
2408
+ float dp = 0;
2409
+ for (size_t j = 0; j < DIM; j++) {
2410
+ dp += x[j] * y[j * d_offset];
2411
+ }
2412
+
2413
+ // Compute y^2 - 2 * (x, y), which is sufficient for looking for the
2414
+ // lowest distance.
2415
+ const float distance = y_sqlen[0] - 2 * dp;
2416
+
2417
+ if (current_min_distance > distance) { // strict: an equal distance does not replace the current best
2418
+ current_min_distance = distance;
2419
+ current_min_index = i;
2420
+ }
2421
+
2422
+ y += 1;
2423
+ y_sqlen += 1;
2424
+ }
2425
+ }
2426
+
2427
+ return current_min_index;
2428
+ }
2429
+
2430
+ #elif defined(__AVX2__)
2431
+
1480
2432
  template <size_t DIM>
1481
2433
  size_t fvec_L2sqr_ny_nearest_y_transposed_D(
1482
2434
  float* distances_tmp_buffer,
@@ -1592,6 +2544,7 @@ size_t fvec_L2sqr_ny_nearest_y_transposed_D(
1592
2544
 
1593
2545
  return current_min_index;
1594
2546
  }
2547
+
1595
2548
  #endif
1596
2549
 
1597
2550
  size_t fvec_L2sqr_ny_nearest_y_transposed(
@@ -1858,7 +2811,39 @@ void fvec_inner_products_ny(
1858
2811
  c[i] = a[i] + bf * b[i];
1859
2812
  }
1860
2813
 
1861
- #ifdef __AVX2__
2814
+ #if defined(__AVX512F__)
2815
+ // AVX-512 kernel: writes c[i] = a[i] + bf * b[i] for i in [0, n), 16 floats per step plus one masked step for the remainder.
2816
+ static inline void fvec_madd_avx512(
2817
+ const size_t n, // number of floats to process
2818
+ const float* __restrict a, // addend input; unaligned loads are used
2819
+ const float bf, // scalar multiplier applied to b
2820
+ const float* __restrict b, // multiplied input; unaligned loads are used
2821
+ float* __restrict c) { // output; unaligned stores are used
2822
+ const size_t n16 = n / 16; // number of full 16-float steps
2823
+ const size_t n_for_masking = n % 16; // floats left over for the masked tail (0..15)
2824
+
2825
+ const __m512 bfmm = _mm512_set1_ps(bf); // bf broadcast to all 16 lanes
2826
+
2827
+ size_t idx = 0;
2828
+ for (idx = 0; idx < n16 * 16; idx += 16) {
2829
+ const __m512 ax = _mm512_loadu_ps(a + idx);
2830
+ const __m512 bx = _mm512_loadu_ps(b + idx);
2831
+ const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax); // bf * b + a, fused
2832
+ _mm512_storeu_ps(c + idx, abmul);
2833
+ }
2834
+
2835
+ if (n_for_masking > 0) {
2836
+ const __mmask16 mask = (1 << n_for_masking) - 1; // low n_for_masking bits set; the shift count is 1..15 here
2837
+
2838
+ const __m512 ax = _mm512_maskz_loadu_ps(mask, a + idx); // lanes outside the mask are zeroed rather than loaded
2839
+ const __m512 bx = _mm512_maskz_loadu_ps(mask, b + idx);
2840
+ const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax);
2841
+ _mm512_mask_storeu_ps(c + idx, mask, abmul); // only the masked lanes of c are written
2842
+ }
2843
+ }
2844
+
2845
+ #elif defined(__AVX2__)
2846
+
1862
2847
  static inline void fvec_madd_avx2(
1863
2848
  const size_t n,
1864
2849
  const float* __restrict a,
@@ -1911,6 +2896,7 @@ static inline void fvec_madd_avx2(
1911
2896
  _mm256_maskstore_ps(c + idx, mask, abmul);
1912
2897
  }
1913
2898
  }
2899
+
1914
2900
  #endif
1915
2901
 
1916
2902
  #ifdef __SSE3__
@@ -1936,7 +2922,9 @@ static inline void fvec_madd_avx2(
1936
2922
  }
1937
2923
 
1938
2924
  void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c) {
1939
- #ifdef __AVX2__
2925
+ #ifdef __AVX512F__
2926
+ fvec_madd_avx512(n, a, bf, b, c);
2927
+ #elif __AVX2__
1940
2928
  fvec_madd_avx2(n, a, bf, b, c);
1941
2929
  #else
1942
2930
  if ((n & 3) == 0 && ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)