faiss 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE.txt +1 -1
  4. data/ext/faiss/ext.cpp +1 -1
  5. data/ext/faiss/extconf.rb +5 -6
  6. data/ext/faiss/index_binary.cpp +76 -17
  7. data/ext/faiss/{index.cpp → index_rb.cpp} +108 -35
  8. data/ext/faiss/kmeans.cpp +12 -9
  9. data/ext/faiss/numo.hpp +11 -9
  10. data/ext/faiss/pca_matrix.cpp +10 -8
  11. data/ext/faiss/product_quantizer.cpp +14 -12
  12. data/ext/faiss/{utils.cpp → utils_rb.cpp} +10 -3
  13. data/ext/faiss/{utils.h → utils_rb.h} +6 -0
  14. data/lib/faiss/version.rb +1 -1
  15. data/lib/faiss.rb +1 -1
  16. data/vendor/faiss/faiss/AutoTune.cpp +130 -11
  17. data/vendor/faiss/faiss/AutoTune.h +14 -1
  18. data/vendor/faiss/faiss/Clustering.cpp +59 -10
  19. data/vendor/faiss/faiss/Clustering.h +12 -0
  20. data/vendor/faiss/faiss/IVFlib.cpp +31 -28
  21. data/vendor/faiss/faiss/Index.cpp +20 -8
  22. data/vendor/faiss/faiss/Index.h +25 -3
  23. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +19 -24
  24. data/vendor/faiss/faiss/IndexBinary.cpp +1 -0
  25. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +9 -4
  26. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +45 -11
  27. data/vendor/faiss/faiss/IndexFastScan.cpp +35 -22
  28. data/vendor/faiss/faiss/IndexFastScan.h +10 -1
  29. data/vendor/faiss/faiss/IndexFlat.cpp +193 -136
  30. data/vendor/faiss/faiss/IndexFlat.h +16 -1
  31. data/vendor/faiss/faiss/IndexFlatCodes.cpp +46 -22
  32. data/vendor/faiss/faiss/IndexFlatCodes.h +7 -1
  33. data/vendor/faiss/faiss/IndexHNSW.cpp +24 -50
  34. data/vendor/faiss/faiss/IndexHNSW.h +14 -12
  35. data/vendor/faiss/faiss/IndexIDMap.cpp +1 -1
  36. data/vendor/faiss/faiss/IndexIVF.cpp +76 -49
  37. data/vendor/faiss/faiss/IndexIVF.h +14 -4
  38. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +11 -8
  39. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +2 -2
  40. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +25 -14
  41. data/vendor/faiss/faiss/IndexIVFFastScan.h +26 -22
  42. data/vendor/faiss/faiss/IndexIVFFlat.cpp +10 -61
  43. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +39 -111
  44. data/vendor/faiss/faiss/IndexIVFPQ.cpp +89 -147
  45. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +37 -5
  46. data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -1
  47. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +42 -30
  48. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -2
  49. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +246 -97
  50. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +32 -29
  51. data/vendor/faiss/faiss/IndexLSH.cpp +8 -6
  52. data/vendor/faiss/faiss/IndexLattice.cpp +29 -24
  53. data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -0
  54. data/vendor/faiss/faiss/IndexNSG.cpp +2 -1
  55. data/vendor/faiss/faiss/IndexNSG.h +0 -2
  56. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +1 -1
  57. data/vendor/faiss/faiss/IndexPQ.cpp +19 -10
  58. data/vendor/faiss/faiss/IndexRaBitQ.cpp +26 -13
  59. data/vendor/faiss/faiss/IndexRaBitQ.h +2 -2
  60. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +132 -78
  61. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +14 -12
  62. data/vendor/faiss/faiss/IndexRefine.cpp +0 -30
  63. data/vendor/faiss/faiss/IndexShards.cpp +3 -4
  64. data/vendor/faiss/faiss/MetricType.h +16 -0
  65. data/vendor/faiss/faiss/VectorTransform.cpp +120 -0
  66. data/vendor/faiss/faiss/VectorTransform.h +23 -0
  67. data/vendor/faiss/faiss/clone_index.cpp +7 -4
  68. data/vendor/faiss/faiss/{cppcontrib/factory_tools.cpp → factory_tools.cpp} +1 -1
  69. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +1 -1
  70. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +37 -11
  71. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -28
  72. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +367 -0
  73. data/vendor/faiss/faiss/impl/ClusteringInitialization.h +107 -0
  74. data/vendor/faiss/faiss/impl/CodePacker.cpp +4 -0
  75. data/vendor/faiss/faiss/impl/CodePacker.h +11 -3
  76. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +83 -0
  77. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.h +47 -0
  78. data/vendor/faiss/faiss/impl/FaissAssert.h +60 -2
  79. data/vendor/faiss/faiss/impl/HNSW.cpp +25 -34
  80. data/vendor/faiss/faiss/impl/HNSW.h +8 -6
  81. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +34 -27
  82. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -1
  83. data/vendor/faiss/faiss/impl/NSG.cpp +6 -5
  84. data/vendor/faiss/faiss/impl/NSG.h +17 -7
  85. data/vendor/faiss/faiss/impl/Panorama.cpp +53 -46
  86. data/vendor/faiss/faiss/impl/Panorama.h +22 -6
  87. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +16 -5
  88. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +70 -58
  89. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +92 -0
  90. data/vendor/faiss/faiss/impl/RaBitQUtils.h +93 -31
  91. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +12 -28
  92. data/vendor/faiss/faiss/impl/RaBitQuantizer.h +3 -10
  93. data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.cpp +15 -41
  94. data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.h +0 -4
  95. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +14 -9
  96. data/vendor/faiss/faiss/impl/ResultHandler.h +131 -50
  97. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +67 -2358
  98. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -2
  99. data/vendor/faiss/faiss/impl/VisitedTable.cpp +42 -0
  100. data/vendor/faiss/faiss/impl/VisitedTable.h +69 -0
  101. data/vendor/faiss/faiss/impl/expanded_scanners.h +158 -0
  102. data/vendor/faiss/faiss/impl/index_read.cpp +829 -471
  103. data/vendor/faiss/faiss/impl/index_read_utils.h +0 -1
  104. data/vendor/faiss/faiss/impl/index_write.cpp +17 -8
  105. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +47 -20
  106. data/vendor/faiss/faiss/impl/mapped_io.cpp +9 -2
  107. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +7 -2
  108. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +11 -3
  109. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +19 -13
  110. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +29 -21
  111. data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx2.h → pq_code_distance/pq_code_distance-avx2.cpp} +42 -215
  112. data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx512.h → pq_code_distance/pq_code_distance-avx512.cpp} +68 -107
  113. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +141 -0
  114. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +23 -0
  115. data/vendor/faiss/faiss/impl/{code_distance/code_distance-sve.h → pq_code_distance/pq_code_distance-sve.cpp} +57 -144
  116. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +9 -6
  117. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +121 -0
  118. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +136 -0
  119. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +280 -0
  120. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +164 -0
  121. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +94 -0
  122. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +455 -0
  123. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +430 -0
  124. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +329 -0
  125. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +467 -0
  126. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +203 -0
  127. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +42 -0
  128. data/vendor/faiss/faiss/impl/simd_dispatch.h +139 -0
  129. data/vendor/faiss/faiss/impl/simd_result_handlers.h +18 -18
  130. data/vendor/faiss/faiss/index_factory.cpp +35 -16
  131. data/vendor/faiss/faiss/index_io.h +29 -3
  132. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +7 -4
  133. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +1 -1
  134. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +9 -19
  135. data/vendor/faiss/faiss/svs/IndexSVSFlat.h +2 -0
  136. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +2 -1
  137. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +9 -1
  138. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +9 -0
  139. data/vendor/faiss/faiss/utils/Heap.cpp +46 -0
  140. data/vendor/faiss/faiss/utils/Heap.h +21 -0
  141. data/vendor/faiss/faiss/utils/NeuralNet.cpp +10 -7
  142. data/vendor/faiss/faiss/utils/distances.cpp +141 -23
  143. data/vendor/faiss/faiss/utils/distances.h +98 -0
  144. data/vendor/faiss/faiss/utils/distances_dispatch.h +170 -0
  145. data/vendor/faiss/faiss/utils/distances_simd.cpp +74 -3511
  146. data/vendor/faiss/faiss/utils/extra_distances-inl.h +164 -157
  147. data/vendor/faiss/faiss/utils/extra_distances.cpp +52 -95
  148. data/vendor/faiss/faiss/utils/extra_distances.h +47 -1
  149. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -1
  150. data/vendor/faiss/faiss/utils/partitioning.cpp +1 -1
  151. data/vendor/faiss/faiss/utils/pq_code_distance.h +251 -0
  152. data/vendor/faiss/faiss/utils/rabitq_simd.h +260 -0
  153. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +150 -0
  154. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +568 -0
  155. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +153 -0
  156. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +1185 -0
  157. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +1092 -0
  158. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +391 -0
  159. data/vendor/faiss/faiss/utils/simd_levels.cpp +322 -0
  160. data/vendor/faiss/faiss/utils/simd_levels.h +91 -0
  161. data/vendor/faiss/faiss/utils/simdlib_avx2.h +12 -1
  162. data/vendor/faiss/faiss/utils/simdlib_avx512.h +69 -0
  163. data/vendor/faiss/faiss/utils/simdlib_neon.h +6 -0
  164. data/vendor/faiss/faiss/utils/sorting.cpp +4 -4
  165. data/vendor/faiss/faiss/utils/utils.cpp +16 -9
  166. metadata +47 -18
  167. data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +0 -81
  168. data/vendor/faiss/faiss/impl/code_distance/code_distance.h +0 -186
  169. /data/vendor/faiss/faiss/{cppcontrib/factory_tools.h → factory_tools.h} +0 -0
@@ -0,0 +1,568 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <arm_sve.h>
9
+
10
+ #include <faiss/utils/distances.h>
11
+
12
+ #define AUTOVEC_LEVEL SIMDLevel::ARM_SVE
13
+ #include <faiss/utils/simd_impl/distances_autovec-inl.h>
14
+
15
+ namespace faiss {
16
+
17
+ template <>
18
+ void fvec_madd<SIMDLevel::ARM_SVE>(
19
+ const size_t n,
20
+ const float* __restrict a,
21
+ const float bf,
22
+ const float* __restrict b,
23
+ float* __restrict c) {
24
+ const size_t lanes = static_cast<size_t>(svcntw());
25
+ const size_t lanes2 = lanes * 2;
26
+ const size_t lanes3 = lanes * 3;
27
+ const size_t lanes4 = lanes * 4;
28
+ size_t i = 0;
29
+ for (; i + lanes4 < n; i += lanes4) {
30
+ const auto mask = svptrue_b32();
31
+ const auto ai0 = svld1_f32(mask, a + i);
32
+ const auto ai1 = svld1_f32(mask, a + i + lanes);
33
+ const auto ai2 = svld1_f32(mask, a + i + lanes2);
34
+ const auto ai3 = svld1_f32(mask, a + i + lanes3);
35
+ const auto bi0 = svld1_f32(mask, b + i);
36
+ const auto bi1 = svld1_f32(mask, b + i + lanes);
37
+ const auto bi2 = svld1_f32(mask, b + i + lanes2);
38
+ const auto bi3 = svld1_f32(mask, b + i + lanes3);
39
+ const auto ci0 = svmla_n_f32_x(mask, ai0, bi0, bf);
40
+ const auto ci1 = svmla_n_f32_x(mask, ai1, bi1, bf);
41
+ const auto ci2 = svmla_n_f32_x(mask, ai2, bi2, bf);
42
+ const auto ci3 = svmla_n_f32_x(mask, ai3, bi3, bf);
43
+ svst1_f32(mask, c + i, ci0);
44
+ svst1_f32(mask, c + i + lanes, ci1);
45
+ svst1_f32(mask, c + i + lanes2, ci2);
46
+ svst1_f32(mask, c + i + lanes3, ci3);
47
+ }
48
+ const auto mask0 = svwhilelt_b32_u64(i, n);
49
+ const auto mask1 = svwhilelt_b32_u64(i + lanes, n);
50
+ const auto mask2 = svwhilelt_b32_u64(i + lanes2, n);
51
+ const auto mask3 = svwhilelt_b32_u64(i + lanes3, n);
52
+ const auto ai0 = svld1_f32(mask0, a + i);
53
+ const auto ai1 = svld1_f32(mask1, a + i + lanes);
54
+ const auto ai2 = svld1_f32(mask2, a + i + lanes2);
55
+ const auto ai3 = svld1_f32(mask3, a + i + lanes3);
56
+ const auto bi0 = svld1_f32(mask0, b + i);
57
+ const auto bi1 = svld1_f32(mask1, b + i + lanes);
58
+ const auto bi2 = svld1_f32(mask2, b + i + lanes2);
59
+ const auto bi3 = svld1_f32(mask3, b + i + lanes3);
60
+ const auto ci0 = svmla_n_f32_x(mask0, ai0, bi0, bf);
61
+ const auto ci1 = svmla_n_f32_x(mask1, ai1, bi1, bf);
62
+ const auto ci2 = svmla_n_f32_x(mask2, ai2, bi2, bf);
63
+ const auto ci3 = svmla_n_f32_x(mask3, ai3, bi3, bf);
64
+ svst1_f32(mask0, c + i, ci0);
65
+ svst1_f32(mask1, c + i + lanes, ci1);
66
+ svst1_f32(mask2, c + i + lanes2, ci2);
67
+ svst1_f32(mask3, c + i + lanes3, ci3);
68
+ }
69
+
70
+ template <>
71
+ int fvec_madd_and_argmin<SIMDLevel::ARM_SVE>(
72
+ size_t n,
73
+ const float* a,
74
+ float bf,
75
+ const float* b,
76
+ float* c) {
77
+ float vmin = 1e20;
78
+ int imin = -1;
79
+
80
+ for (size_t i = 0; i < n; i++) {
81
+ c[i] = a[i] + bf * b[i];
82
+ if (c[i] < vmin) {
83
+ vmin = c[i];
84
+ imin = i;
85
+ }
86
+ }
87
+ return imin;
88
+ }
89
+
90
+ struct ElementOpIP {
91
+ static svfloat32_t op(svbool_t pg, svfloat32_t x, svfloat32_t y) {
92
+ return svmul_f32_x(pg, x, y);
93
+ }
94
+ static svfloat32_t merge(
95
+ svbool_t pg,
96
+ svfloat32_t z,
97
+ svfloat32_t x,
98
+ svfloat32_t y) {
99
+ return svmla_f32_x(pg, z, x, y);
100
+ }
101
+ };
102
+
103
+ template <typename ElementOp>
104
+ void fvec_op_ny_sve_d1(float* dis, const float* x, const float* y, size_t ny) {
105
+ const size_t lanes = svcntw();
106
+ const size_t lanes2 = lanes * 2;
107
+ const size_t lanes3 = lanes * 3;
108
+ const size_t lanes4 = lanes * 4;
109
+ const svbool_t pg = svptrue_b32();
110
+ const svfloat32_t x0 = svdup_n_f32(x[0]);
111
+ size_t i = 0;
112
+ for (; i + lanes4 < ny; i += lanes4) {
113
+ svfloat32_t y0 = svld1_f32(pg, y);
114
+ svfloat32_t y1 = svld1_f32(pg, y + lanes);
115
+ svfloat32_t y2 = svld1_f32(pg, y + lanes2);
116
+ svfloat32_t y3 = svld1_f32(pg, y + lanes3);
117
+ y0 = ElementOp::op(pg, x0, y0);
118
+ y1 = ElementOp::op(pg, x0, y1);
119
+ y2 = ElementOp::op(pg, x0, y2);
120
+ y3 = ElementOp::op(pg, x0, y3);
121
+ svst1_f32(pg, dis, y0);
122
+ svst1_f32(pg, dis + lanes, y1);
123
+ svst1_f32(pg, dis + lanes2, y2);
124
+ svst1_f32(pg, dis + lanes3, y3);
125
+ y += lanes4;
126
+ dis += lanes4;
127
+ }
128
+ const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
129
+ const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
130
+ const svbool_t pg2 = svwhilelt_b32_u64(i + lanes2, ny);
131
+ const svbool_t pg3 = svwhilelt_b32_u64(i + lanes3, ny);
132
+ svfloat32_t y0 = svld1_f32(pg0, y);
133
+ svfloat32_t y1 = svld1_f32(pg1, y + lanes);
134
+ svfloat32_t y2 = svld1_f32(pg2, y + lanes2);
135
+ svfloat32_t y3 = svld1_f32(pg3, y + lanes3);
136
+ y0 = ElementOp::op(pg0, x0, y0);
137
+ y1 = ElementOp::op(pg1, x0, y1);
138
+ y2 = ElementOp::op(pg2, x0, y2);
139
+ y3 = ElementOp::op(pg3, x0, y3);
140
+ svst1_f32(pg0, dis, y0);
141
+ svst1_f32(pg1, dis + lanes, y1);
142
+ svst1_f32(pg2, dis + lanes2, y2);
143
+ svst1_f32(pg3, dis + lanes3, y3);
144
+ }
145
+
146
+ template <typename ElementOp>
147
+ void fvec_op_ny_sve_d2(float* dis, const float* x, const float* y, size_t ny) {
148
+ const size_t lanes = svcntw();
149
+ const size_t lanes2 = lanes * 2;
150
+ const size_t lanes4 = lanes * 4;
151
+ const svbool_t pg = svptrue_b32();
152
+ const svfloat32_t x0 = svdup_n_f32(x[0]);
153
+ const svfloat32_t x1 = svdup_n_f32(x[1]);
154
+ size_t i = 0;
155
+ for (; i + lanes2 < ny; i += lanes2) {
156
+ const svfloat32x2_t y0 = svld2_f32(pg, y);
157
+ const svfloat32x2_t y1 = svld2_f32(pg, y + lanes2);
158
+ svfloat32_t y00 = svget2_f32(y0, 0);
159
+ const svfloat32_t y01 = svget2_f32(y0, 1);
160
+ svfloat32_t y10 = svget2_f32(y1, 0);
161
+ const svfloat32_t y11 = svget2_f32(y1, 1);
162
+ y00 = ElementOp::op(pg, x0, y00);
163
+ y10 = ElementOp::op(pg, x0, y10);
164
+ y00 = ElementOp::merge(pg, y00, x1, y01);
165
+ y10 = ElementOp::merge(pg, y10, x1, y11);
166
+ svst1_f32(pg, dis, y00);
167
+ svst1_f32(pg, dis + lanes, y10);
168
+ y += lanes4;
169
+ dis += lanes2;
170
+ }
171
+ const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
172
+ const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
173
+ const svfloat32x2_t y0 = svld2_f32(pg0, y);
174
+ const svfloat32x2_t y1 = svld2_f32(pg1, y + lanes2);
175
+ svfloat32_t y00 = svget2_f32(y0, 0);
176
+ const svfloat32_t y01 = svget2_f32(y0, 1);
177
+ svfloat32_t y10 = svget2_f32(y1, 0);
178
+ const svfloat32_t y11 = svget2_f32(y1, 1);
179
+ y00 = ElementOp::op(pg0, x0, y00);
180
+ y10 = ElementOp::op(pg1, x0, y10);
181
+ y00 = ElementOp::merge(pg0, y00, x1, y01);
182
+ y10 = ElementOp::merge(pg1, y10, x1, y11);
183
+ svst1_f32(pg0, dis, y00);
184
+ svst1_f32(pg1, dis + lanes, y10);
185
+ }
186
+
187
+ template <typename ElementOp>
188
+ void fvec_op_ny_sve_d4(float* dis, const float* x, const float* y, size_t ny) {
189
+ const size_t lanes = svcntw();
190
+ const size_t lanes4 = lanes * 4;
191
+ const svbool_t pg = svptrue_b32();
192
+ const svfloat32_t x0 = svdup_n_f32(x[0]);
193
+ const svfloat32_t x1 = svdup_n_f32(x[1]);
194
+ const svfloat32_t x2 = svdup_n_f32(x[2]);
195
+ const svfloat32_t x3 = svdup_n_f32(x[3]);
196
+ size_t i = 0;
197
+ for (; i + lanes < ny; i += lanes) {
198
+ const svfloat32x4_t y0 = svld4_f32(pg, y);
199
+ svfloat32_t y00 = svget4_f32(y0, 0);
200
+ const svfloat32_t y01 = svget4_f32(y0, 1);
201
+ svfloat32_t y02 = svget4_f32(y0, 2);
202
+ const svfloat32_t y03 = svget4_f32(y0, 3);
203
+ y00 = ElementOp::op(pg, x0, y00);
204
+ y02 = ElementOp::op(pg, x2, y02);
205
+ y00 = ElementOp::merge(pg, y00, x1, y01);
206
+ y02 = ElementOp::merge(pg, y02, x3, y03);
207
+ y00 = svadd_f32_x(pg, y00, y02);
208
+ svst1_f32(pg, dis, y00);
209
+ y += lanes4;
210
+ dis += lanes;
211
+ }
212
+ const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
213
+ const svfloat32x4_t y0 = svld4_f32(pg0, y);
214
+ svfloat32_t y00 = svget4_f32(y0, 0);
215
+ const svfloat32_t y01 = svget4_f32(y0, 1);
216
+ svfloat32_t y02 = svget4_f32(y0, 2);
217
+ const svfloat32_t y03 = svget4_f32(y0, 3);
218
+ y00 = ElementOp::op(pg0, x0, y00);
219
+ y02 = ElementOp::op(pg0, x2, y02);
220
+ y00 = ElementOp::merge(pg0, y00, x1, y01);
221
+ y02 = ElementOp::merge(pg0, y02, x3, y03);
222
+ y00 = svadd_f32_x(pg0, y00, y02);
223
+ svst1_f32(pg0, dis, y00);
224
+ }
225
+
226
+ template <typename ElementOp>
227
+ void fvec_op_ny_sve_d8(float* dis, const float* x, const float* y, size_t ny) {
228
+ const size_t lanes = svcntw();
229
+ const size_t lanes4 = lanes * 4;
230
+ const size_t lanes8 = lanes * 8;
231
+ const svbool_t pg = svptrue_b32();
232
+ const svfloat32_t x0 = svdup_n_f32(x[0]);
233
+ const svfloat32_t x1 = svdup_n_f32(x[1]);
234
+ const svfloat32_t x2 = svdup_n_f32(x[2]);
235
+ const svfloat32_t x3 = svdup_n_f32(x[3]);
236
+ const svfloat32_t x4 = svdup_n_f32(x[4]);
237
+ const svfloat32_t x5 = svdup_n_f32(x[5]);
238
+ const svfloat32_t x6 = svdup_n_f32(x[6]);
239
+ const svfloat32_t x7 = svdup_n_f32(x[7]);
240
+ size_t i = 0;
241
+ for (; i + lanes < ny; i += lanes) {
242
+ const svfloat32x4_t ya = svld4_f32(pg, y);
243
+ const svfloat32x4_t yb = svld4_f32(pg, y + lanes4);
244
+ const svfloat32_t ya0 = svget4_f32(ya, 0);
245
+ const svfloat32_t ya1 = svget4_f32(ya, 1);
246
+ const svfloat32_t ya2 = svget4_f32(ya, 2);
247
+ const svfloat32_t ya3 = svget4_f32(ya, 3);
248
+ const svfloat32_t yb0 = svget4_f32(yb, 0);
249
+ const svfloat32_t yb1 = svget4_f32(yb, 1);
250
+ const svfloat32_t yb2 = svget4_f32(yb, 2);
251
+ const svfloat32_t yb3 = svget4_f32(yb, 3);
252
+ svfloat32_t y0 = svuzp1(ya0, yb0);
253
+ const svfloat32_t y1 = svuzp1(ya1, yb1);
254
+ svfloat32_t y2 = svuzp1(ya2, yb2);
255
+ const svfloat32_t y3 = svuzp1(ya3, yb3);
256
+ svfloat32_t y4 = svuzp2(ya0, yb0);
257
+ const svfloat32_t y5 = svuzp2(ya1, yb1);
258
+ svfloat32_t y6 = svuzp2(ya2, yb2);
259
+ const svfloat32_t y7 = svuzp2(ya3, yb3);
260
+ y0 = ElementOp::op(pg, x0, y0);
261
+ y2 = ElementOp::op(pg, x2, y2);
262
+ y4 = ElementOp::op(pg, x4, y4);
263
+ y6 = ElementOp::op(pg, x6, y6);
264
+ y0 = ElementOp::merge(pg, y0, x1, y1);
265
+ y2 = ElementOp::merge(pg, y2, x3, y3);
266
+ y4 = ElementOp::merge(pg, y4, x5, y5);
267
+ y6 = ElementOp::merge(pg, y6, x7, y7);
268
+ y0 = svadd_f32_x(pg, y0, y2);
269
+ y4 = svadd_f32_x(pg, y4, y6);
270
+ y0 = svadd_f32_x(pg, y0, y4);
271
+ svst1_f32(pg, dis, y0);
272
+ y += lanes8;
273
+ dis += lanes;
274
+ }
275
+ const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
276
+ const svbool_t pga = svwhilelt_b32_u64(i * 2, ny * 2);
277
+ const svbool_t pgb = svwhilelt_b32_u64(i * 2 + lanes, ny * 2);
278
+ const svfloat32x4_t ya = svld4_f32(pga, y);
279
+ const svfloat32x4_t yb = svld4_f32(pgb, y + lanes4);
280
+ const svfloat32_t ya0 = svget4_f32(ya, 0);
281
+ const svfloat32_t ya1 = svget4_f32(ya, 1);
282
+ const svfloat32_t ya2 = svget4_f32(ya, 2);
283
+ const svfloat32_t ya3 = svget4_f32(ya, 3);
284
+ const svfloat32_t yb0 = svget4_f32(yb, 0);
285
+ const svfloat32_t yb1 = svget4_f32(yb, 1);
286
+ const svfloat32_t yb2 = svget4_f32(yb, 2);
287
+ const svfloat32_t yb3 = svget4_f32(yb, 3);
288
+ svfloat32_t y0 = svuzp1(ya0, yb0);
289
+ const svfloat32_t y1 = svuzp1(ya1, yb1);
290
+ svfloat32_t y2 = svuzp1(ya2, yb2);
291
+ const svfloat32_t y3 = svuzp1(ya3, yb3);
292
+ svfloat32_t y4 = svuzp2(ya0, yb0);
293
+ const svfloat32_t y5 = svuzp2(ya1, yb1);
294
+ svfloat32_t y6 = svuzp2(ya2, yb2);
295
+ const svfloat32_t y7 = svuzp2(ya3, yb3);
296
+ y0 = ElementOp::op(pg0, x0, y0);
297
+ y2 = ElementOp::op(pg0, x2, y2);
298
+ y4 = ElementOp::op(pg0, x4, y4);
299
+ y6 = ElementOp::op(pg0, x6, y6);
300
+ y0 = ElementOp::merge(pg0, y0, x1, y1);
301
+ y2 = ElementOp::merge(pg0, y2, x3, y3);
302
+ y4 = ElementOp::merge(pg0, y4, x5, y5);
303
+ y6 = ElementOp::merge(pg0, y6, x7, y7);
304
+ y0 = svadd_f32_x(pg0, y0, y2);
305
+ y4 = svadd_f32_x(pg0, y4, y6);
306
+ y0 = svadd_f32_x(pg0, y0, y4);
307
+ svst1_f32(pg0, dis, y0);
308
+ y += lanes8;
309
+ dis += lanes;
310
+ }
311
+
312
+ template <typename ElementOp>
313
+ void fvec_op_ny_sve_lanes1(
314
+ float* dis,
315
+ const float* x,
316
+ const float* y,
317
+ size_t ny) {
318
+ const size_t lanes = svcntw();
319
+ const size_t lanes2 = lanes * 2;
320
+ const size_t lanes3 = lanes * 3;
321
+ const size_t lanes4 = lanes * 4;
322
+ const svbool_t pg = svptrue_b32();
323
+ const svfloat32_t x0 = svld1_f32(pg, x);
324
+ size_t i = 0;
325
+ for (; i + 3 < ny; i += 4) {
326
+ svfloat32_t y0 = svld1_f32(pg, y);
327
+ svfloat32_t y1 = svld1_f32(pg, y + lanes);
328
+ svfloat32_t y2 = svld1_f32(pg, y + lanes2);
329
+ svfloat32_t y3 = svld1_f32(pg, y + lanes3);
330
+ y += lanes4;
331
+ y0 = ElementOp::op(pg, x0, y0);
332
+ y1 = ElementOp::op(pg, x0, y1);
333
+ y2 = ElementOp::op(pg, x0, y2);
334
+ y3 = ElementOp::op(pg, x0, y3);
335
+ dis[i] = svaddv_f32(pg, y0);
336
+ dis[i + 1] = svaddv_f32(pg, y1);
337
+ dis[i + 2] = svaddv_f32(pg, y2);
338
+ dis[i + 3] = svaddv_f32(pg, y3);
339
+ }
340
+ for (; i < ny; ++i) {
341
+ svfloat32_t y0 = svld1_f32(pg, y);
342
+ y += lanes;
343
+ y0 = ElementOp::op(pg, x0, y0);
344
+ dis[i] = svaddv_f32(pg, y0);
345
+ }
346
+ }
347
+
348
+ template <typename ElementOp>
349
+ void fvec_op_ny_sve_lanes2(
350
+ float* dis,
351
+ const float* x,
352
+ const float* y,
353
+ size_t ny) {
354
+ const size_t lanes = svcntw();
355
+ const size_t lanes2 = lanes * 2;
356
+ const size_t lanes3 = lanes * 3;
357
+ const size_t lanes4 = lanes * 4;
358
+ const svbool_t pg = svptrue_b32();
359
+ const svfloat32_t x0 = svld1_f32(pg, x);
360
+ const svfloat32_t x1 = svld1_f32(pg, x + lanes);
361
+ size_t i = 0;
362
+ for (; i + 1 < ny; i += 2) {
363
+ svfloat32_t y00 = svld1_f32(pg, y);
364
+ const svfloat32_t y01 = svld1_f32(pg, y + lanes);
365
+ svfloat32_t y10 = svld1_f32(pg, y + lanes2);
366
+ const svfloat32_t y11 = svld1_f32(pg, y + lanes3);
367
+ y += lanes4;
368
+ y00 = ElementOp::op(pg, x0, y00);
369
+ y10 = ElementOp::op(pg, x0, y10);
370
+ y00 = ElementOp::merge(pg, y00, x1, y01);
371
+ y10 = ElementOp::merge(pg, y10, x1, y11);
372
+ dis[i] = svaddv_f32(pg, y00);
373
+ dis[i + 1] = svaddv_f32(pg, y10);
374
+ }
375
+ if (i < ny) {
376
+ svfloat32_t y0 = svld1_f32(pg, y);
377
+ const svfloat32_t y1 = svld1_f32(pg, y + lanes);
378
+ y0 = ElementOp::op(pg, x0, y0);
379
+ y0 = ElementOp::merge(pg, y0, x1, y1);
380
+ dis[i] = svaddv_f32(pg, y0);
381
+ }
382
+ }
383
+
384
+ template <typename ElementOp>
385
+ void fvec_op_ny_sve_lanes3(
386
+ float* dis,
387
+ const float* x,
388
+ const float* y,
389
+ size_t ny) {
390
+ const size_t lanes = svcntw();
391
+ const size_t lanes2 = lanes * 2;
392
+ const size_t lanes3 = lanes * 3;
393
+ const svbool_t pg = svptrue_b32();
394
+ const svfloat32_t x0 = svld1_f32(pg, x);
395
+ const svfloat32_t x1 = svld1_f32(pg, x + lanes);
396
+ const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
397
+ for (size_t i = 0; i < ny; ++i) {
398
+ svfloat32_t y0 = svld1_f32(pg, y);
399
+ const svfloat32_t y1 = svld1_f32(pg, y + lanes);
400
+ svfloat32_t y2 = svld1_f32(pg, y + lanes2);
401
+ y += lanes3;
402
+ y0 = ElementOp::op(pg, x0, y0);
403
+ y0 = ElementOp::merge(pg, y0, x1, y1);
404
+ y0 = ElementOp::merge(pg, y0, x2, y2);
405
+ dis[i] = svaddv_f32(pg, y0);
406
+ }
407
+ }
408
+
409
+ template <typename ElementOp>
410
+ void fvec_op_ny_sve_lanes4(
411
+ float* dis,
412
+ const float* x,
413
+ const float* y,
414
+ size_t ny) {
415
+ const size_t lanes = svcntw();
416
+ const size_t lanes2 = lanes * 2;
417
+ const size_t lanes3 = lanes * 3;
418
+ const size_t lanes4 = lanes * 4;
419
+ const svbool_t pg = svptrue_b32();
420
+ const svfloat32_t x0 = svld1_f32(pg, x);
421
+ const svfloat32_t x1 = svld1_f32(pg, x + lanes);
422
+ const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
423
+ const svfloat32_t x3 = svld1_f32(pg, x + lanes3);
424
+ for (size_t i = 0; i < ny; ++i) {
425
+ svfloat32_t y0 = svld1_f32(pg, y);
426
+ const svfloat32_t y1 = svld1_f32(pg, y + lanes);
427
+ svfloat32_t y2 = svld1_f32(pg, y + lanes2);
428
+ const svfloat32_t y3 = svld1_f32(pg, y + lanes3);
429
+ y += lanes4;
430
+ y0 = ElementOp::op(pg, x0, y0);
431
+ y2 = ElementOp::op(pg, x2, y2);
432
+ y0 = ElementOp::merge(pg, y0, x1, y1);
433
+ y2 = ElementOp::merge(pg, y2, x3, y3);
434
+ y0 = svadd_f32_x(pg, y0, y2);
435
+ dis[i] = svaddv_f32(pg, y0);
436
+ }
437
+ }
438
+
439
+ template <>
440
+ void fvec_inner_products_ny<SIMDLevel::ARM_SVE>(
441
+ float* dis,
442
+ const float* x,
443
+ const float* y,
444
+ size_t d,
445
+ size_t ny) {
446
+ const size_t lanes = svcntw();
447
+ switch (d) {
448
+ case 1:
449
+ fvec_op_ny_sve_d1<ElementOpIP>(dis, x, y, ny);
450
+ break;
451
+ case 2:
452
+ fvec_op_ny_sve_d2<ElementOpIP>(dis, x, y, ny);
453
+ break;
454
+ case 4:
455
+ fvec_op_ny_sve_d4<ElementOpIP>(dis, x, y, ny);
456
+ break;
457
+ case 8:
458
+ fvec_op_ny_sve_d8<ElementOpIP>(dis, x, y, ny);
459
+ break;
460
+ default:
461
+ if (d == lanes)
462
+ fvec_op_ny_sve_lanes1<ElementOpIP>(dis, x, y, ny);
463
+ else if (d == lanes * 2)
464
+ fvec_op_ny_sve_lanes2<ElementOpIP>(dis, x, y, ny);
465
+ else if (d == lanes * 3)
466
+ fvec_op_ny_sve_lanes3<ElementOpIP>(dis, x, y, ny);
467
+ else if (d == lanes * 4)
468
+ fvec_op_ny_sve_lanes4<ElementOpIP>(dis, x, y, ny);
469
+ else {
470
+ // Fallback: use autovectorized inner product
471
+ for (size_t i = 0; i < ny; i++) {
472
+ dis[i] = fvec_inner_product<SIMDLevel::ARM_SVE>(x, y, d);
473
+ y += d;
474
+ }
475
+ }
476
+ break;
477
+ }
478
+ }
479
+
480
+ template <>
481
+ void fvec_L2sqr_ny<SIMDLevel::ARM_SVE>(
482
+ float* dis,
483
+ const float* x,
484
+ const float* y,
485
+ size_t d,
486
+ size_t ny) {
487
+ // Use autovectorized L2sqr in a loop
488
+ for (size_t i = 0; i < ny; i++) {
489
+ dis[i] = fvec_L2sqr<SIMDLevel::ARM_SVE>(x, y, d);
490
+ y += d;
491
+ }
492
+ }
493
+
494
+ template <>
495
+ size_t fvec_L2sqr_ny_nearest<SIMDLevel::ARM_SVE>(
496
+ float* distances_tmp_buffer,
497
+ const float* x,
498
+ const float* y,
499
+ size_t d,
500
+ size_t ny) {
501
+ fvec_L2sqr_ny<SIMDLevel::ARM_SVE>(distances_tmp_buffer, x, y, d, ny);
502
+
503
+ size_t nearest_idx = 0;
504
+ float min_dis = HUGE_VALF;
505
+
506
+ for (size_t i = 0; i < ny; i++) {
507
+ if (distances_tmp_buffer[i] < min_dis) {
508
+ min_dis = distances_tmp_buffer[i];
509
+ nearest_idx = i;
510
+ }
511
+ }
512
+
513
+ return nearest_idx;
514
+ }
515
+
516
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
517
+ template <>
518
+ void fvec_L2sqr_ny_transposed<SIMDLevel::ARM_SVE>(
519
+ float* dis,
520
+ const float* x,
521
+ const float* y,
522
+ const float* y_sqlen,
523
+ size_t d,
524
+ size_t d_offset,
525
+ size_t ny) {
526
+ float x_sqlen = 0;
527
+ FAISS_PRAGMA_IMPRECISE_LOOP
528
+ for (size_t j = 0; j < d; j++) {
529
+ x_sqlen += x[j] * x[j];
530
+ }
531
+
532
+ for (size_t i = 0; i < ny; i++) {
533
+ float dp = 0;
534
+ FAISS_PRAGMA_IMPRECISE_LOOP
535
+ for (size_t j = 0; j < d; j++) {
536
+ dp += x[j] * y[i + j * d_offset];
537
+ }
538
+ dis[i] = x_sqlen + y_sqlen[i] - 2 * dp;
539
+ }
540
+ }
541
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
542
+
543
+ template <>
544
+ size_t fvec_L2sqr_ny_nearest_y_transposed<SIMDLevel::ARM_SVE>(
545
+ float* distances_tmp_buffer,
546
+ const float* x,
547
+ const float* y,
548
+ const float* y_sqlen,
549
+ size_t d,
550
+ size_t d_offset,
551
+ size_t ny) {
552
+ fvec_L2sqr_ny_transposed<SIMDLevel::ARM_SVE>(
553
+ distances_tmp_buffer, x, y, y_sqlen, d, d_offset, ny);
554
+
555
+ size_t nearest_idx = 0;
556
+ float min_dis = HUGE_VALF;
557
+
558
+ for (size_t i = 0; i < ny; i++) {
559
+ if (distances_tmp_buffer[i] < min_dis) {
560
+ min_dis = distances_tmp_buffer[i];
561
+ nearest_idx = i;
562
+ }
563
+ }
564
+
565
+ return nearest_idx;
566
+ }
567
+
568
+ } // namespace faiss
@@ -0,0 +1,153 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
+ #include <faiss/impl/platform_macros.h>
11
+ #include <faiss/utils/distances.h>
12
+
13
+ namespace faiss {
14
+
15
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
16
+ template <>
17
+ float fvec_norm_L2sqr<AUTOVEC_LEVEL>(const float* x, size_t d) {
18
+ // the double in the _ref is suspected to be a typo. Some of the manual
19
+ // implementations this replaces used float.
20
+ float res = 0;
21
+ FAISS_PRAGMA_IMPRECISE_LOOP
22
+ for (size_t i = 0; i != d; ++i) {
23
+ res += x[i] * x[i];
24
+ }
25
+
26
+ return res;
27
+ }
28
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
29
+
30
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
31
+ template <>
32
+ float fvec_L2sqr<AUTOVEC_LEVEL>(const float* x, const float* y, size_t d) {
33
+ size_t i;
34
+ float res = 0;
35
+ FAISS_PRAGMA_IMPRECISE_LOOP
36
+ for (i = 0; i < d; i++) {
37
+ const float tmp = x[i] - y[i];
38
+ res += tmp * tmp;
39
+ }
40
+ return res;
41
+ }
42
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
43
+
44
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
45
+ template <>
46
+ float fvec_inner_product<AUTOVEC_LEVEL>(
47
+ const float* x,
48
+ const float* y,
49
+ size_t d) {
50
+ float res = 0.F;
51
+ FAISS_PRAGMA_IMPRECISE_LOOP
52
+ for (size_t i = 0; i != d; ++i) {
53
+ res += x[i] * y[i];
54
+ }
55
+ return res;
56
+ }
57
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
58
+
59
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
60
+ template <>
61
+ float fvec_L1<AUTOVEC_LEVEL>(const float* x, const float* y, size_t d) {
62
+ size_t i;
63
+ float res = 0;
64
+ FAISS_PRAGMA_IMPRECISE_LOOP
65
+ for (i = 0; i < d; i++) {
66
+ const float tmp = x[i] - y[i];
67
+ res += fabs(tmp);
68
+ }
69
+ return res;
70
+ }
71
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
72
+
73
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
74
+ template <>
75
+ float fvec_Linf<AUTOVEC_LEVEL>(const float* x, const float* y, size_t d) {
76
+ float res = 0;
77
+ FAISS_PRAGMA_IMPRECISE_LOOP
78
+ for (size_t i = 0; i < d; i++) {
79
+ res = fmax(res, fabs(x[i] - y[i]));
80
+ }
81
+ return res;
82
+ }
83
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
84
+
85
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
86
+ template <>
87
+ void fvec_inner_product_batch_4<AUTOVEC_LEVEL>(
88
+ const float* x,
89
+ const float* y0,
90
+ const float* y1,
91
+ const float* y2,
92
+ const float* y3,
93
+ const size_t d,
94
+ float& dis0,
95
+ float& dis1,
96
+ float& dis2,
97
+ float& dis3) {
98
+ float d0 = 0;
99
+ float d1 = 0;
100
+ float d2 = 0;
101
+ float d3 = 0;
102
+ FAISS_PRAGMA_IMPRECISE_LOOP
103
+ for (size_t i = 0; i < d; ++i) {
104
+ d0 += x[i] * y0[i];
105
+ d1 += x[i] * y1[i];
106
+ d2 += x[i] * y2[i];
107
+ d3 += x[i] * y3[i];
108
+ }
109
+
110
+ dis0 = d0;
111
+ dis1 = d1;
112
+ dis2 = d2;
113
+ dis3 = d3;
114
+ }
115
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
116
+
117
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
118
+ template <>
119
+ void fvec_L2sqr_batch_4<AUTOVEC_LEVEL>(
120
+ const float* x,
121
+ const float* y0,
122
+ const float* y1,
123
+ const float* y2,
124
+ const float* y3,
125
+ const size_t d,
126
+ float& dis0,
127
+ float& dis1,
128
+ float& dis2,
129
+ float& dis3) {
130
+ float d0 = 0;
131
+ float d1 = 0;
132
+ float d2 = 0;
133
+ float d3 = 0;
134
+ FAISS_PRAGMA_IMPRECISE_LOOP
135
+ for (size_t i = 0; i < d; ++i) {
136
+ const float q0 = x[i] - y0[i];
137
+ const float q1 = x[i] - y1[i];
138
+ const float q2 = x[i] - y2[i];
139
+ const float q3 = x[i] - y3[i];
140
+ d0 += q0 * q0;
141
+ d1 += q1 * q1;
142
+ d2 += q2 * q2;
143
+ d3 += q3 * q3;
144
+ }
145
+
146
+ dis0 = d0;
147
+ dis1 = d1;
148
+ dis2 = d2;
149
+ dis3 = d3;
150
+ }
151
+ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
152
+
153
+ } // namespace faiss