faiss 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +1 -1
- data/ext/faiss/ext.cpp +1 -1
- data/ext/faiss/extconf.rb +5 -6
- data/ext/faiss/index_binary.cpp +76 -17
- data/ext/faiss/{index.cpp → index_rb.cpp} +108 -35
- data/ext/faiss/kmeans.cpp +12 -9
- data/ext/faiss/numo.hpp +11 -9
- data/ext/faiss/pca_matrix.cpp +10 -8
- data/ext/faiss/product_quantizer.cpp +14 -12
- data/ext/faiss/{utils.cpp → utils_rb.cpp} +10 -3
- data/ext/faiss/{utils.h → utils_rb.h} +6 -0
- data/lib/faiss/version.rb +1 -1
- data/lib/faiss.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +130 -11
- data/vendor/faiss/faiss/AutoTune.h +14 -1
- data/vendor/faiss/faiss/Clustering.cpp +59 -10
- data/vendor/faiss/faiss/Clustering.h +12 -0
- data/vendor/faiss/faiss/IVFlib.cpp +31 -28
- data/vendor/faiss/faiss/Index.cpp +20 -8
- data/vendor/faiss/faiss/Index.h +25 -3
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +19 -24
- data/vendor/faiss/faiss/IndexBinary.cpp +1 -0
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +9 -4
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +45 -11
- data/vendor/faiss/faiss/IndexFastScan.cpp +35 -22
- data/vendor/faiss/faiss/IndexFastScan.h +10 -1
- data/vendor/faiss/faiss/IndexFlat.cpp +193 -136
- data/vendor/faiss/faiss/IndexFlat.h +16 -1
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +46 -22
- data/vendor/faiss/faiss/IndexFlatCodes.h +7 -1
- data/vendor/faiss/faiss/IndexHNSW.cpp +24 -50
- data/vendor/faiss/faiss/IndexHNSW.h +14 -12
- data/vendor/faiss/faiss/IndexIDMap.cpp +1 -1
- data/vendor/faiss/faiss/IndexIVF.cpp +76 -49
- data/vendor/faiss/faiss/IndexIVF.h +14 -4
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +11 -8
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +2 -2
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +25 -14
- data/vendor/faiss/faiss/IndexIVFFastScan.h +26 -22
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +10 -61
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +39 -111
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +89 -147
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +37 -5
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +42 -30
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -2
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +246 -97
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +32 -29
- data/vendor/faiss/faiss/IndexLSH.cpp +8 -6
- data/vendor/faiss/faiss/IndexLattice.cpp +29 -24
- data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +2 -1
- data/vendor/faiss/faiss/IndexNSG.h +0 -2
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +1 -1
- data/vendor/faiss/faiss/IndexPQ.cpp +19 -10
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +26 -13
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -2
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +132 -78
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +14 -12
- data/vendor/faiss/faiss/IndexRefine.cpp +0 -30
- data/vendor/faiss/faiss/IndexShards.cpp +3 -4
- data/vendor/faiss/faiss/MetricType.h +16 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +120 -0
- data/vendor/faiss/faiss/VectorTransform.h +23 -0
- data/vendor/faiss/faiss/clone_index.cpp +7 -4
- data/vendor/faiss/faiss/{cppcontrib/factory_tools.cpp → factory_tools.cpp} +1 -1
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +1 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +37 -11
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -28
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +367 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.h +107 -0
- data/vendor/faiss/faiss/impl/CodePacker.cpp +4 -0
- data/vendor/faiss/faiss/impl/CodePacker.h +11 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +83 -0
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.h +47 -0
- data/vendor/faiss/faiss/impl/FaissAssert.h +60 -2
- data/vendor/faiss/faiss/impl/HNSW.cpp +25 -34
- data/vendor/faiss/faiss/impl/HNSW.h +8 -6
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +34 -27
- data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -1
- data/vendor/faiss/faiss/impl/NSG.cpp +6 -5
- data/vendor/faiss/faiss/impl/NSG.h +17 -7
- data/vendor/faiss/faiss/impl/Panorama.cpp +53 -46
- data/vendor/faiss/faiss/impl/Panorama.h +22 -6
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +16 -5
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +70 -58
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +92 -0
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +93 -31
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +12 -28
- data/vendor/faiss/faiss/impl/RaBitQuantizer.h +3 -10
- data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.cpp +15 -41
- data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.h +0 -4
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +14 -9
- data/vendor/faiss/faiss/impl/ResultHandler.h +131 -50
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +67 -2358
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -2
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +42 -0
- data/vendor/faiss/faiss/impl/VisitedTable.h +69 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +158 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +829 -471
- data/vendor/faiss/faiss/impl/index_read_utils.h +0 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +17 -8
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +47 -20
- data/vendor/faiss/faiss/impl/mapped_io.cpp +9 -2
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +7 -2
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +11 -3
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +19 -13
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +29 -21
- data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx2.h → pq_code_distance/pq_code_distance-avx2.cpp} +42 -215
- data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx512.h → pq_code_distance/pq_code_distance-avx512.cpp} +68 -107
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +141 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +23 -0
- data/vendor/faiss/faiss/impl/{code_distance/code_distance-sve.h → pq_code_distance/pq_code_distance-sve.cpp} +57 -144
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +9 -6
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +121 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +136 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +280 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +164 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +94 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +455 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +430 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +329 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +467 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +203 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +42 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +139 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +18 -18
- data/vendor/faiss/faiss/index_factory.cpp +35 -16
- data/vendor/faiss/faiss/index_io.h +29 -3
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +7 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +9 -19
- data/vendor/faiss/faiss/svs/IndexSVSFlat.h +2 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +2 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +9 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +9 -0
- data/vendor/faiss/faiss/utils/Heap.cpp +46 -0
- data/vendor/faiss/faiss/utils/Heap.h +21 -0
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +10 -7
- data/vendor/faiss/faiss/utils/distances.cpp +141 -23
- data/vendor/faiss/faiss/utils/distances.h +98 -0
- data/vendor/faiss/faiss/utils/distances_dispatch.h +170 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +74 -3511
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +164 -157
- data/vendor/faiss/faiss/utils/extra_distances.cpp +52 -95
- data/vendor/faiss/faiss/utils/extra_distances.h +47 -1
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -1
- data/vendor/faiss/faiss/utils/partitioning.cpp +1 -1
- data/vendor/faiss/faiss/utils/pq_code_distance.h +251 -0
- data/vendor/faiss/faiss/utils/rabitq_simd.h +260 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +150 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +568 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +153 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +1185 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +1092 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +391 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +322 -0
- data/vendor/faiss/faiss/utils/simd_levels.h +91 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +12 -1
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +69 -0
- data/vendor/faiss/faiss/utils/simdlib_neon.h +6 -0
- data/vendor/faiss/faiss/utils/sorting.cpp +4 -4
- data/vendor/faiss/faiss/utils/utils.cpp +16 -9
- metadata +47 -18
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +0 -81
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +0 -186
- /data/vendor/faiss/faiss/{cppcontrib/factory_tools.h → factory_tools.h} +0 -0
|
@@ -0,0 +1,568 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#include <arm_sve.h>
|
|
9
|
+
|
|
10
|
+
#include <faiss/utils/distances.h>
|
|
11
|
+
|
|
12
|
+
#define AUTOVEC_LEVEL SIMDLevel::ARM_SVE
|
|
13
|
+
#include <faiss/utils/simd_impl/distances_autovec-inl.h>
|
|
14
|
+
|
|
15
|
+
namespace faiss {
|
|
16
|
+
|
|
17
|
+
template <>
|
|
18
|
+
void fvec_madd<SIMDLevel::ARM_SVE>(
|
|
19
|
+
const size_t n,
|
|
20
|
+
const float* __restrict a,
|
|
21
|
+
const float bf,
|
|
22
|
+
const float* __restrict b,
|
|
23
|
+
float* __restrict c) {
|
|
24
|
+
const size_t lanes = static_cast<size_t>(svcntw());
|
|
25
|
+
const size_t lanes2 = lanes * 2;
|
|
26
|
+
const size_t lanes3 = lanes * 3;
|
|
27
|
+
const size_t lanes4 = lanes * 4;
|
|
28
|
+
size_t i = 0;
|
|
29
|
+
for (; i + lanes4 < n; i += lanes4) {
|
|
30
|
+
const auto mask = svptrue_b32();
|
|
31
|
+
const auto ai0 = svld1_f32(mask, a + i);
|
|
32
|
+
const auto ai1 = svld1_f32(mask, a + i + lanes);
|
|
33
|
+
const auto ai2 = svld1_f32(mask, a + i + lanes2);
|
|
34
|
+
const auto ai3 = svld1_f32(mask, a + i + lanes3);
|
|
35
|
+
const auto bi0 = svld1_f32(mask, b + i);
|
|
36
|
+
const auto bi1 = svld1_f32(mask, b + i + lanes);
|
|
37
|
+
const auto bi2 = svld1_f32(mask, b + i + lanes2);
|
|
38
|
+
const auto bi3 = svld1_f32(mask, b + i + lanes3);
|
|
39
|
+
const auto ci0 = svmla_n_f32_x(mask, ai0, bi0, bf);
|
|
40
|
+
const auto ci1 = svmla_n_f32_x(mask, ai1, bi1, bf);
|
|
41
|
+
const auto ci2 = svmla_n_f32_x(mask, ai2, bi2, bf);
|
|
42
|
+
const auto ci3 = svmla_n_f32_x(mask, ai3, bi3, bf);
|
|
43
|
+
svst1_f32(mask, c + i, ci0);
|
|
44
|
+
svst1_f32(mask, c + i + lanes, ci1);
|
|
45
|
+
svst1_f32(mask, c + i + lanes2, ci2);
|
|
46
|
+
svst1_f32(mask, c + i + lanes3, ci3);
|
|
47
|
+
}
|
|
48
|
+
const auto mask0 = svwhilelt_b32_u64(i, n);
|
|
49
|
+
const auto mask1 = svwhilelt_b32_u64(i + lanes, n);
|
|
50
|
+
const auto mask2 = svwhilelt_b32_u64(i + lanes2, n);
|
|
51
|
+
const auto mask3 = svwhilelt_b32_u64(i + lanes3, n);
|
|
52
|
+
const auto ai0 = svld1_f32(mask0, a + i);
|
|
53
|
+
const auto ai1 = svld1_f32(mask1, a + i + lanes);
|
|
54
|
+
const auto ai2 = svld1_f32(mask2, a + i + lanes2);
|
|
55
|
+
const auto ai3 = svld1_f32(mask3, a + i + lanes3);
|
|
56
|
+
const auto bi0 = svld1_f32(mask0, b + i);
|
|
57
|
+
const auto bi1 = svld1_f32(mask1, b + i + lanes);
|
|
58
|
+
const auto bi2 = svld1_f32(mask2, b + i + lanes2);
|
|
59
|
+
const auto bi3 = svld1_f32(mask3, b + i + lanes3);
|
|
60
|
+
const auto ci0 = svmla_n_f32_x(mask0, ai0, bi0, bf);
|
|
61
|
+
const auto ci1 = svmla_n_f32_x(mask1, ai1, bi1, bf);
|
|
62
|
+
const auto ci2 = svmla_n_f32_x(mask2, ai2, bi2, bf);
|
|
63
|
+
const auto ci3 = svmla_n_f32_x(mask3, ai3, bi3, bf);
|
|
64
|
+
svst1_f32(mask0, c + i, ci0);
|
|
65
|
+
svst1_f32(mask1, c + i + lanes, ci1);
|
|
66
|
+
svst1_f32(mask2, c + i + lanes2, ci2);
|
|
67
|
+
svst1_f32(mask3, c + i + lanes3, ci3);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
template <>
|
|
71
|
+
int fvec_madd_and_argmin<SIMDLevel::ARM_SVE>(
|
|
72
|
+
size_t n,
|
|
73
|
+
const float* a,
|
|
74
|
+
float bf,
|
|
75
|
+
const float* b,
|
|
76
|
+
float* c) {
|
|
77
|
+
float vmin = 1e20;
|
|
78
|
+
int imin = -1;
|
|
79
|
+
|
|
80
|
+
for (size_t i = 0; i < n; i++) {
|
|
81
|
+
c[i] = a[i] + bf * b[i];
|
|
82
|
+
if (c[i] < vmin) {
|
|
83
|
+
vmin = c[i];
|
|
84
|
+
imin = i;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
return imin;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
struct ElementOpIP {
|
|
91
|
+
static svfloat32_t op(svbool_t pg, svfloat32_t x, svfloat32_t y) {
|
|
92
|
+
return svmul_f32_x(pg, x, y);
|
|
93
|
+
}
|
|
94
|
+
static svfloat32_t merge(
|
|
95
|
+
svbool_t pg,
|
|
96
|
+
svfloat32_t z,
|
|
97
|
+
svfloat32_t x,
|
|
98
|
+
svfloat32_t y) {
|
|
99
|
+
return svmla_f32_x(pg, z, x, y);
|
|
100
|
+
}
|
|
101
|
+
};
|
|
102
|
+
|
|
103
|
+
template <typename ElementOp>
|
|
104
|
+
void fvec_op_ny_sve_d1(float* dis, const float* x, const float* y, size_t ny) {
|
|
105
|
+
const size_t lanes = svcntw();
|
|
106
|
+
const size_t lanes2 = lanes * 2;
|
|
107
|
+
const size_t lanes3 = lanes * 3;
|
|
108
|
+
const size_t lanes4 = lanes * 4;
|
|
109
|
+
const svbool_t pg = svptrue_b32();
|
|
110
|
+
const svfloat32_t x0 = svdup_n_f32(x[0]);
|
|
111
|
+
size_t i = 0;
|
|
112
|
+
for (; i + lanes4 < ny; i += lanes4) {
|
|
113
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
|
114
|
+
svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
|
115
|
+
svfloat32_t y2 = svld1_f32(pg, y + lanes2);
|
|
116
|
+
svfloat32_t y3 = svld1_f32(pg, y + lanes3);
|
|
117
|
+
y0 = ElementOp::op(pg, x0, y0);
|
|
118
|
+
y1 = ElementOp::op(pg, x0, y1);
|
|
119
|
+
y2 = ElementOp::op(pg, x0, y2);
|
|
120
|
+
y3 = ElementOp::op(pg, x0, y3);
|
|
121
|
+
svst1_f32(pg, dis, y0);
|
|
122
|
+
svst1_f32(pg, dis + lanes, y1);
|
|
123
|
+
svst1_f32(pg, dis + lanes2, y2);
|
|
124
|
+
svst1_f32(pg, dis + lanes3, y3);
|
|
125
|
+
y += lanes4;
|
|
126
|
+
dis += lanes4;
|
|
127
|
+
}
|
|
128
|
+
const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
|
|
129
|
+
const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
|
|
130
|
+
const svbool_t pg2 = svwhilelt_b32_u64(i + lanes2, ny);
|
|
131
|
+
const svbool_t pg3 = svwhilelt_b32_u64(i + lanes3, ny);
|
|
132
|
+
svfloat32_t y0 = svld1_f32(pg0, y);
|
|
133
|
+
svfloat32_t y1 = svld1_f32(pg1, y + lanes);
|
|
134
|
+
svfloat32_t y2 = svld1_f32(pg2, y + lanes2);
|
|
135
|
+
svfloat32_t y3 = svld1_f32(pg3, y + lanes3);
|
|
136
|
+
y0 = ElementOp::op(pg0, x0, y0);
|
|
137
|
+
y1 = ElementOp::op(pg1, x0, y1);
|
|
138
|
+
y2 = ElementOp::op(pg2, x0, y2);
|
|
139
|
+
y3 = ElementOp::op(pg3, x0, y3);
|
|
140
|
+
svst1_f32(pg0, dis, y0);
|
|
141
|
+
svst1_f32(pg1, dis + lanes, y1);
|
|
142
|
+
svst1_f32(pg2, dis + lanes2, y2);
|
|
143
|
+
svst1_f32(pg3, dis + lanes3, y3);
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
template <typename ElementOp>
|
|
147
|
+
void fvec_op_ny_sve_d2(float* dis, const float* x, const float* y, size_t ny) {
|
|
148
|
+
const size_t lanes = svcntw();
|
|
149
|
+
const size_t lanes2 = lanes * 2;
|
|
150
|
+
const size_t lanes4 = lanes * 4;
|
|
151
|
+
const svbool_t pg = svptrue_b32();
|
|
152
|
+
const svfloat32_t x0 = svdup_n_f32(x[0]);
|
|
153
|
+
const svfloat32_t x1 = svdup_n_f32(x[1]);
|
|
154
|
+
size_t i = 0;
|
|
155
|
+
for (; i + lanes2 < ny; i += lanes2) {
|
|
156
|
+
const svfloat32x2_t y0 = svld2_f32(pg, y);
|
|
157
|
+
const svfloat32x2_t y1 = svld2_f32(pg, y + lanes2);
|
|
158
|
+
svfloat32_t y00 = svget2_f32(y0, 0);
|
|
159
|
+
const svfloat32_t y01 = svget2_f32(y0, 1);
|
|
160
|
+
svfloat32_t y10 = svget2_f32(y1, 0);
|
|
161
|
+
const svfloat32_t y11 = svget2_f32(y1, 1);
|
|
162
|
+
y00 = ElementOp::op(pg, x0, y00);
|
|
163
|
+
y10 = ElementOp::op(pg, x0, y10);
|
|
164
|
+
y00 = ElementOp::merge(pg, y00, x1, y01);
|
|
165
|
+
y10 = ElementOp::merge(pg, y10, x1, y11);
|
|
166
|
+
svst1_f32(pg, dis, y00);
|
|
167
|
+
svst1_f32(pg, dis + lanes, y10);
|
|
168
|
+
y += lanes4;
|
|
169
|
+
dis += lanes2;
|
|
170
|
+
}
|
|
171
|
+
const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
|
|
172
|
+
const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
|
|
173
|
+
const svfloat32x2_t y0 = svld2_f32(pg0, y);
|
|
174
|
+
const svfloat32x2_t y1 = svld2_f32(pg1, y + lanes2);
|
|
175
|
+
svfloat32_t y00 = svget2_f32(y0, 0);
|
|
176
|
+
const svfloat32_t y01 = svget2_f32(y0, 1);
|
|
177
|
+
svfloat32_t y10 = svget2_f32(y1, 0);
|
|
178
|
+
const svfloat32_t y11 = svget2_f32(y1, 1);
|
|
179
|
+
y00 = ElementOp::op(pg0, x0, y00);
|
|
180
|
+
y10 = ElementOp::op(pg1, x0, y10);
|
|
181
|
+
y00 = ElementOp::merge(pg0, y00, x1, y01);
|
|
182
|
+
y10 = ElementOp::merge(pg1, y10, x1, y11);
|
|
183
|
+
svst1_f32(pg0, dis, y00);
|
|
184
|
+
svst1_f32(pg1, dis + lanes, y10);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
template <typename ElementOp>
|
|
188
|
+
void fvec_op_ny_sve_d4(float* dis, const float* x, const float* y, size_t ny) {
|
|
189
|
+
const size_t lanes = svcntw();
|
|
190
|
+
const size_t lanes4 = lanes * 4;
|
|
191
|
+
const svbool_t pg = svptrue_b32();
|
|
192
|
+
const svfloat32_t x0 = svdup_n_f32(x[0]);
|
|
193
|
+
const svfloat32_t x1 = svdup_n_f32(x[1]);
|
|
194
|
+
const svfloat32_t x2 = svdup_n_f32(x[2]);
|
|
195
|
+
const svfloat32_t x3 = svdup_n_f32(x[3]);
|
|
196
|
+
size_t i = 0;
|
|
197
|
+
for (; i + lanes < ny; i += lanes) {
|
|
198
|
+
const svfloat32x4_t y0 = svld4_f32(pg, y);
|
|
199
|
+
svfloat32_t y00 = svget4_f32(y0, 0);
|
|
200
|
+
const svfloat32_t y01 = svget4_f32(y0, 1);
|
|
201
|
+
svfloat32_t y02 = svget4_f32(y0, 2);
|
|
202
|
+
const svfloat32_t y03 = svget4_f32(y0, 3);
|
|
203
|
+
y00 = ElementOp::op(pg, x0, y00);
|
|
204
|
+
y02 = ElementOp::op(pg, x2, y02);
|
|
205
|
+
y00 = ElementOp::merge(pg, y00, x1, y01);
|
|
206
|
+
y02 = ElementOp::merge(pg, y02, x3, y03);
|
|
207
|
+
y00 = svadd_f32_x(pg, y00, y02);
|
|
208
|
+
svst1_f32(pg, dis, y00);
|
|
209
|
+
y += lanes4;
|
|
210
|
+
dis += lanes;
|
|
211
|
+
}
|
|
212
|
+
const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
|
|
213
|
+
const svfloat32x4_t y0 = svld4_f32(pg0, y);
|
|
214
|
+
svfloat32_t y00 = svget4_f32(y0, 0);
|
|
215
|
+
const svfloat32_t y01 = svget4_f32(y0, 1);
|
|
216
|
+
svfloat32_t y02 = svget4_f32(y0, 2);
|
|
217
|
+
const svfloat32_t y03 = svget4_f32(y0, 3);
|
|
218
|
+
y00 = ElementOp::op(pg0, x0, y00);
|
|
219
|
+
y02 = ElementOp::op(pg0, x2, y02);
|
|
220
|
+
y00 = ElementOp::merge(pg0, y00, x1, y01);
|
|
221
|
+
y02 = ElementOp::merge(pg0, y02, x3, y03);
|
|
222
|
+
y00 = svadd_f32_x(pg0, y00, y02);
|
|
223
|
+
svst1_f32(pg0, dis, y00);
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
template <typename ElementOp>
|
|
227
|
+
void fvec_op_ny_sve_d8(float* dis, const float* x, const float* y, size_t ny) {
|
|
228
|
+
const size_t lanes = svcntw();
|
|
229
|
+
const size_t lanes4 = lanes * 4;
|
|
230
|
+
const size_t lanes8 = lanes * 8;
|
|
231
|
+
const svbool_t pg = svptrue_b32();
|
|
232
|
+
const svfloat32_t x0 = svdup_n_f32(x[0]);
|
|
233
|
+
const svfloat32_t x1 = svdup_n_f32(x[1]);
|
|
234
|
+
const svfloat32_t x2 = svdup_n_f32(x[2]);
|
|
235
|
+
const svfloat32_t x3 = svdup_n_f32(x[3]);
|
|
236
|
+
const svfloat32_t x4 = svdup_n_f32(x[4]);
|
|
237
|
+
const svfloat32_t x5 = svdup_n_f32(x[5]);
|
|
238
|
+
const svfloat32_t x6 = svdup_n_f32(x[6]);
|
|
239
|
+
const svfloat32_t x7 = svdup_n_f32(x[7]);
|
|
240
|
+
size_t i = 0;
|
|
241
|
+
for (; i + lanes < ny; i += lanes) {
|
|
242
|
+
const svfloat32x4_t ya = svld4_f32(pg, y);
|
|
243
|
+
const svfloat32x4_t yb = svld4_f32(pg, y + lanes4);
|
|
244
|
+
const svfloat32_t ya0 = svget4_f32(ya, 0);
|
|
245
|
+
const svfloat32_t ya1 = svget4_f32(ya, 1);
|
|
246
|
+
const svfloat32_t ya2 = svget4_f32(ya, 2);
|
|
247
|
+
const svfloat32_t ya3 = svget4_f32(ya, 3);
|
|
248
|
+
const svfloat32_t yb0 = svget4_f32(yb, 0);
|
|
249
|
+
const svfloat32_t yb1 = svget4_f32(yb, 1);
|
|
250
|
+
const svfloat32_t yb2 = svget4_f32(yb, 2);
|
|
251
|
+
const svfloat32_t yb3 = svget4_f32(yb, 3);
|
|
252
|
+
svfloat32_t y0 = svuzp1(ya0, yb0);
|
|
253
|
+
const svfloat32_t y1 = svuzp1(ya1, yb1);
|
|
254
|
+
svfloat32_t y2 = svuzp1(ya2, yb2);
|
|
255
|
+
const svfloat32_t y3 = svuzp1(ya3, yb3);
|
|
256
|
+
svfloat32_t y4 = svuzp2(ya0, yb0);
|
|
257
|
+
const svfloat32_t y5 = svuzp2(ya1, yb1);
|
|
258
|
+
svfloat32_t y6 = svuzp2(ya2, yb2);
|
|
259
|
+
const svfloat32_t y7 = svuzp2(ya3, yb3);
|
|
260
|
+
y0 = ElementOp::op(pg, x0, y0);
|
|
261
|
+
y2 = ElementOp::op(pg, x2, y2);
|
|
262
|
+
y4 = ElementOp::op(pg, x4, y4);
|
|
263
|
+
y6 = ElementOp::op(pg, x6, y6);
|
|
264
|
+
y0 = ElementOp::merge(pg, y0, x1, y1);
|
|
265
|
+
y2 = ElementOp::merge(pg, y2, x3, y3);
|
|
266
|
+
y4 = ElementOp::merge(pg, y4, x5, y5);
|
|
267
|
+
y6 = ElementOp::merge(pg, y6, x7, y7);
|
|
268
|
+
y0 = svadd_f32_x(pg, y0, y2);
|
|
269
|
+
y4 = svadd_f32_x(pg, y4, y6);
|
|
270
|
+
y0 = svadd_f32_x(pg, y0, y4);
|
|
271
|
+
svst1_f32(pg, dis, y0);
|
|
272
|
+
y += lanes8;
|
|
273
|
+
dis += lanes;
|
|
274
|
+
}
|
|
275
|
+
const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
|
|
276
|
+
const svbool_t pga = svwhilelt_b32_u64(i * 2, ny * 2);
|
|
277
|
+
const svbool_t pgb = svwhilelt_b32_u64(i * 2 + lanes, ny * 2);
|
|
278
|
+
const svfloat32x4_t ya = svld4_f32(pga, y);
|
|
279
|
+
const svfloat32x4_t yb = svld4_f32(pgb, y + lanes4);
|
|
280
|
+
const svfloat32_t ya0 = svget4_f32(ya, 0);
|
|
281
|
+
const svfloat32_t ya1 = svget4_f32(ya, 1);
|
|
282
|
+
const svfloat32_t ya2 = svget4_f32(ya, 2);
|
|
283
|
+
const svfloat32_t ya3 = svget4_f32(ya, 3);
|
|
284
|
+
const svfloat32_t yb0 = svget4_f32(yb, 0);
|
|
285
|
+
const svfloat32_t yb1 = svget4_f32(yb, 1);
|
|
286
|
+
const svfloat32_t yb2 = svget4_f32(yb, 2);
|
|
287
|
+
const svfloat32_t yb3 = svget4_f32(yb, 3);
|
|
288
|
+
svfloat32_t y0 = svuzp1(ya0, yb0);
|
|
289
|
+
const svfloat32_t y1 = svuzp1(ya1, yb1);
|
|
290
|
+
svfloat32_t y2 = svuzp1(ya2, yb2);
|
|
291
|
+
const svfloat32_t y3 = svuzp1(ya3, yb3);
|
|
292
|
+
svfloat32_t y4 = svuzp2(ya0, yb0);
|
|
293
|
+
const svfloat32_t y5 = svuzp2(ya1, yb1);
|
|
294
|
+
svfloat32_t y6 = svuzp2(ya2, yb2);
|
|
295
|
+
const svfloat32_t y7 = svuzp2(ya3, yb3);
|
|
296
|
+
y0 = ElementOp::op(pg0, x0, y0);
|
|
297
|
+
y2 = ElementOp::op(pg0, x2, y2);
|
|
298
|
+
y4 = ElementOp::op(pg0, x4, y4);
|
|
299
|
+
y6 = ElementOp::op(pg0, x6, y6);
|
|
300
|
+
y0 = ElementOp::merge(pg0, y0, x1, y1);
|
|
301
|
+
y2 = ElementOp::merge(pg0, y2, x3, y3);
|
|
302
|
+
y4 = ElementOp::merge(pg0, y4, x5, y5);
|
|
303
|
+
y6 = ElementOp::merge(pg0, y6, x7, y7);
|
|
304
|
+
y0 = svadd_f32_x(pg0, y0, y2);
|
|
305
|
+
y4 = svadd_f32_x(pg0, y4, y6);
|
|
306
|
+
y0 = svadd_f32_x(pg0, y0, y4);
|
|
307
|
+
svst1_f32(pg0, dis, y0);
|
|
308
|
+
y += lanes8;
|
|
309
|
+
dis += lanes;
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
template <typename ElementOp>
|
|
313
|
+
void fvec_op_ny_sve_lanes1(
|
|
314
|
+
float* dis,
|
|
315
|
+
const float* x,
|
|
316
|
+
const float* y,
|
|
317
|
+
size_t ny) {
|
|
318
|
+
const size_t lanes = svcntw();
|
|
319
|
+
const size_t lanes2 = lanes * 2;
|
|
320
|
+
const size_t lanes3 = lanes * 3;
|
|
321
|
+
const size_t lanes4 = lanes * 4;
|
|
322
|
+
const svbool_t pg = svptrue_b32();
|
|
323
|
+
const svfloat32_t x0 = svld1_f32(pg, x);
|
|
324
|
+
size_t i = 0;
|
|
325
|
+
for (; i + 3 < ny; i += 4) {
|
|
326
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
|
327
|
+
svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
|
328
|
+
svfloat32_t y2 = svld1_f32(pg, y + lanes2);
|
|
329
|
+
svfloat32_t y3 = svld1_f32(pg, y + lanes3);
|
|
330
|
+
y += lanes4;
|
|
331
|
+
y0 = ElementOp::op(pg, x0, y0);
|
|
332
|
+
y1 = ElementOp::op(pg, x0, y1);
|
|
333
|
+
y2 = ElementOp::op(pg, x0, y2);
|
|
334
|
+
y3 = ElementOp::op(pg, x0, y3);
|
|
335
|
+
dis[i] = svaddv_f32(pg, y0);
|
|
336
|
+
dis[i + 1] = svaddv_f32(pg, y1);
|
|
337
|
+
dis[i + 2] = svaddv_f32(pg, y2);
|
|
338
|
+
dis[i + 3] = svaddv_f32(pg, y3);
|
|
339
|
+
}
|
|
340
|
+
for (; i < ny; ++i) {
|
|
341
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
|
342
|
+
y += lanes;
|
|
343
|
+
y0 = ElementOp::op(pg, x0, y0);
|
|
344
|
+
dis[i] = svaddv_f32(pg, y0);
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
template <typename ElementOp>
|
|
349
|
+
void fvec_op_ny_sve_lanes2(
|
|
350
|
+
float* dis,
|
|
351
|
+
const float* x,
|
|
352
|
+
const float* y,
|
|
353
|
+
size_t ny) {
|
|
354
|
+
const size_t lanes = svcntw();
|
|
355
|
+
const size_t lanes2 = lanes * 2;
|
|
356
|
+
const size_t lanes3 = lanes * 3;
|
|
357
|
+
const size_t lanes4 = lanes * 4;
|
|
358
|
+
const svbool_t pg = svptrue_b32();
|
|
359
|
+
const svfloat32_t x0 = svld1_f32(pg, x);
|
|
360
|
+
const svfloat32_t x1 = svld1_f32(pg, x + lanes);
|
|
361
|
+
size_t i = 0;
|
|
362
|
+
for (; i + 1 < ny; i += 2) {
|
|
363
|
+
svfloat32_t y00 = svld1_f32(pg, y);
|
|
364
|
+
const svfloat32_t y01 = svld1_f32(pg, y + lanes);
|
|
365
|
+
svfloat32_t y10 = svld1_f32(pg, y + lanes2);
|
|
366
|
+
const svfloat32_t y11 = svld1_f32(pg, y + lanes3);
|
|
367
|
+
y += lanes4;
|
|
368
|
+
y00 = ElementOp::op(pg, x0, y00);
|
|
369
|
+
y10 = ElementOp::op(pg, x0, y10);
|
|
370
|
+
y00 = ElementOp::merge(pg, y00, x1, y01);
|
|
371
|
+
y10 = ElementOp::merge(pg, y10, x1, y11);
|
|
372
|
+
dis[i] = svaddv_f32(pg, y00);
|
|
373
|
+
dis[i + 1] = svaddv_f32(pg, y10);
|
|
374
|
+
}
|
|
375
|
+
if (i < ny) {
|
|
376
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
|
377
|
+
const svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
|
378
|
+
y0 = ElementOp::op(pg, x0, y0);
|
|
379
|
+
y0 = ElementOp::merge(pg, y0, x1, y1);
|
|
380
|
+
dis[i] = svaddv_f32(pg, y0);
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
template <typename ElementOp>
|
|
385
|
+
void fvec_op_ny_sve_lanes3(
|
|
386
|
+
float* dis,
|
|
387
|
+
const float* x,
|
|
388
|
+
const float* y,
|
|
389
|
+
size_t ny) {
|
|
390
|
+
const size_t lanes = svcntw();
|
|
391
|
+
const size_t lanes2 = lanes * 2;
|
|
392
|
+
const size_t lanes3 = lanes * 3;
|
|
393
|
+
const svbool_t pg = svptrue_b32();
|
|
394
|
+
const svfloat32_t x0 = svld1_f32(pg, x);
|
|
395
|
+
const svfloat32_t x1 = svld1_f32(pg, x + lanes);
|
|
396
|
+
const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
|
|
397
|
+
for (size_t i = 0; i < ny; ++i) {
|
|
398
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
|
399
|
+
const svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
|
400
|
+
svfloat32_t y2 = svld1_f32(pg, y + lanes2);
|
|
401
|
+
y += lanes3;
|
|
402
|
+
y0 = ElementOp::op(pg, x0, y0);
|
|
403
|
+
y0 = ElementOp::merge(pg, y0, x1, y1);
|
|
404
|
+
y0 = ElementOp::merge(pg, y0, x2, y2);
|
|
405
|
+
dis[i] = svaddv_f32(pg, y0);
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
template <typename ElementOp>
|
|
410
|
+
void fvec_op_ny_sve_lanes4(
|
|
411
|
+
float* dis,
|
|
412
|
+
const float* x,
|
|
413
|
+
const float* y,
|
|
414
|
+
size_t ny) {
|
|
415
|
+
const size_t lanes = svcntw();
|
|
416
|
+
const size_t lanes2 = lanes * 2;
|
|
417
|
+
const size_t lanes3 = lanes * 3;
|
|
418
|
+
const size_t lanes4 = lanes * 4;
|
|
419
|
+
const svbool_t pg = svptrue_b32();
|
|
420
|
+
const svfloat32_t x0 = svld1_f32(pg, x);
|
|
421
|
+
const svfloat32_t x1 = svld1_f32(pg, x + lanes);
|
|
422
|
+
const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
|
|
423
|
+
const svfloat32_t x3 = svld1_f32(pg, x + lanes3);
|
|
424
|
+
for (size_t i = 0; i < ny; ++i) {
|
|
425
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
|
426
|
+
const svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
|
427
|
+
svfloat32_t y2 = svld1_f32(pg, y + lanes2);
|
|
428
|
+
const svfloat32_t y3 = svld1_f32(pg, y + lanes3);
|
|
429
|
+
y += lanes4;
|
|
430
|
+
y0 = ElementOp::op(pg, x0, y0);
|
|
431
|
+
y2 = ElementOp::op(pg, x2, y2);
|
|
432
|
+
y0 = ElementOp::merge(pg, y0, x1, y1);
|
|
433
|
+
y2 = ElementOp::merge(pg, y2, x3, y3);
|
|
434
|
+
y0 = svadd_f32_x(pg, y0, y2);
|
|
435
|
+
dis[i] = svaddv_f32(pg, y0);
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
template <>
|
|
440
|
+
void fvec_inner_products_ny<SIMDLevel::ARM_SVE>(
|
|
441
|
+
float* dis,
|
|
442
|
+
const float* x,
|
|
443
|
+
const float* y,
|
|
444
|
+
size_t d,
|
|
445
|
+
size_t ny) {
|
|
446
|
+
const size_t lanes = svcntw();
|
|
447
|
+
switch (d) {
|
|
448
|
+
case 1:
|
|
449
|
+
fvec_op_ny_sve_d1<ElementOpIP>(dis, x, y, ny);
|
|
450
|
+
break;
|
|
451
|
+
case 2:
|
|
452
|
+
fvec_op_ny_sve_d2<ElementOpIP>(dis, x, y, ny);
|
|
453
|
+
break;
|
|
454
|
+
case 4:
|
|
455
|
+
fvec_op_ny_sve_d4<ElementOpIP>(dis, x, y, ny);
|
|
456
|
+
break;
|
|
457
|
+
case 8:
|
|
458
|
+
fvec_op_ny_sve_d8<ElementOpIP>(dis, x, y, ny);
|
|
459
|
+
break;
|
|
460
|
+
default:
|
|
461
|
+
if (d == lanes)
|
|
462
|
+
fvec_op_ny_sve_lanes1<ElementOpIP>(dis, x, y, ny);
|
|
463
|
+
else if (d == lanes * 2)
|
|
464
|
+
fvec_op_ny_sve_lanes2<ElementOpIP>(dis, x, y, ny);
|
|
465
|
+
else if (d == lanes * 3)
|
|
466
|
+
fvec_op_ny_sve_lanes3<ElementOpIP>(dis, x, y, ny);
|
|
467
|
+
else if (d == lanes * 4)
|
|
468
|
+
fvec_op_ny_sve_lanes4<ElementOpIP>(dis, x, y, ny);
|
|
469
|
+
else {
|
|
470
|
+
// Fallback: use autovectorized inner product
|
|
471
|
+
for (size_t i = 0; i < ny; i++) {
|
|
472
|
+
dis[i] = fvec_inner_product<SIMDLevel::ARM_SVE>(x, y, d);
|
|
473
|
+
y += d;
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
break;
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
template <>
|
|
481
|
+
void fvec_L2sqr_ny<SIMDLevel::ARM_SVE>(
|
|
482
|
+
float* dis,
|
|
483
|
+
const float* x,
|
|
484
|
+
const float* y,
|
|
485
|
+
size_t d,
|
|
486
|
+
size_t ny) {
|
|
487
|
+
// Use autovectorized L2sqr in a loop
|
|
488
|
+
for (size_t i = 0; i < ny; i++) {
|
|
489
|
+
dis[i] = fvec_L2sqr<SIMDLevel::ARM_SVE>(x, y, d);
|
|
490
|
+
y += d;
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
template <>
|
|
495
|
+
size_t fvec_L2sqr_ny_nearest<SIMDLevel::ARM_SVE>(
|
|
496
|
+
float* distances_tmp_buffer,
|
|
497
|
+
const float* x,
|
|
498
|
+
const float* y,
|
|
499
|
+
size_t d,
|
|
500
|
+
size_t ny) {
|
|
501
|
+
fvec_L2sqr_ny<SIMDLevel::ARM_SVE>(distances_tmp_buffer, x, y, d, ny);
|
|
502
|
+
|
|
503
|
+
size_t nearest_idx = 0;
|
|
504
|
+
float min_dis = HUGE_VALF;
|
|
505
|
+
|
|
506
|
+
for (size_t i = 0; i < ny; i++) {
|
|
507
|
+
if (distances_tmp_buffer[i] < min_dis) {
|
|
508
|
+
min_dis = distances_tmp_buffer[i];
|
|
509
|
+
nearest_idx = i;
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
return nearest_idx;
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
|
|
517
|
+
template <>
|
|
518
|
+
void fvec_L2sqr_ny_transposed<SIMDLevel::ARM_SVE>(
|
|
519
|
+
float* dis,
|
|
520
|
+
const float* x,
|
|
521
|
+
const float* y,
|
|
522
|
+
const float* y_sqlen,
|
|
523
|
+
size_t d,
|
|
524
|
+
size_t d_offset,
|
|
525
|
+
size_t ny) {
|
|
526
|
+
float x_sqlen = 0;
|
|
527
|
+
FAISS_PRAGMA_IMPRECISE_LOOP
|
|
528
|
+
for (size_t j = 0; j < d; j++) {
|
|
529
|
+
x_sqlen += x[j] * x[j];
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
for (size_t i = 0; i < ny; i++) {
|
|
533
|
+
float dp = 0;
|
|
534
|
+
FAISS_PRAGMA_IMPRECISE_LOOP
|
|
535
|
+
for (size_t j = 0; j < d; j++) {
|
|
536
|
+
dp += x[j] * y[i + j * d_offset];
|
|
537
|
+
}
|
|
538
|
+
dis[i] = x_sqlen + y_sqlen[i] - 2 * dp;
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
|
|
542
|
+
|
|
543
|
+
template <>
|
|
544
|
+
size_t fvec_L2sqr_ny_nearest_y_transposed<SIMDLevel::ARM_SVE>(
|
|
545
|
+
float* distances_tmp_buffer,
|
|
546
|
+
const float* x,
|
|
547
|
+
const float* y,
|
|
548
|
+
const float* y_sqlen,
|
|
549
|
+
size_t d,
|
|
550
|
+
size_t d_offset,
|
|
551
|
+
size_t ny) {
|
|
552
|
+
fvec_L2sqr_ny_transposed<SIMDLevel::ARM_SVE>(
|
|
553
|
+
distances_tmp_buffer, x, y, y_sqlen, d, d_offset, ny);
|
|
554
|
+
|
|
555
|
+
size_t nearest_idx = 0;
|
|
556
|
+
float min_dis = HUGE_VALF;
|
|
557
|
+
|
|
558
|
+
for (size_t i = 0; i < ny; i++) {
|
|
559
|
+
if (distances_tmp_buffer[i] < min_dis) {
|
|
560
|
+
min_dis = distances_tmp_buffer[i];
|
|
561
|
+
nearest_idx = i;
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
return nearest_idx;
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
} // namespace faiss
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
#include <faiss/impl/platform_macros.h>
|
|
11
|
+
#include <faiss/utils/distances.h>
|
|
12
|
+
|
|
13
|
+
namespace faiss {
|
|
14
|
+
|
|
15
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
|
|
16
|
+
template <>
|
|
17
|
+
float fvec_norm_L2sqr<AUTOVEC_LEVEL>(const float* x, size_t d) {
|
|
18
|
+
// the double in the _ref is suspected to be a typo. Some of the manual
|
|
19
|
+
// implementations this replaces used float.
|
|
20
|
+
float res = 0;
|
|
21
|
+
FAISS_PRAGMA_IMPRECISE_LOOP
|
|
22
|
+
for (size_t i = 0; i != d; ++i) {
|
|
23
|
+
res += x[i] * x[i];
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
return res;
|
|
27
|
+
}
|
|
28
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
|
|
29
|
+
|
|
30
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
|
|
31
|
+
template <>
|
|
32
|
+
float fvec_L2sqr<AUTOVEC_LEVEL>(const float* x, const float* y, size_t d) {
|
|
33
|
+
size_t i;
|
|
34
|
+
float res = 0;
|
|
35
|
+
FAISS_PRAGMA_IMPRECISE_LOOP
|
|
36
|
+
for (i = 0; i < d; i++) {
|
|
37
|
+
const float tmp = x[i] - y[i];
|
|
38
|
+
res += tmp * tmp;
|
|
39
|
+
}
|
|
40
|
+
return res;
|
|
41
|
+
}
|
|
42
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
|
|
43
|
+
|
|
44
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
|
|
45
|
+
template <>
|
|
46
|
+
float fvec_inner_product<AUTOVEC_LEVEL>(
|
|
47
|
+
const float* x,
|
|
48
|
+
const float* y,
|
|
49
|
+
size_t d) {
|
|
50
|
+
float res = 0.F;
|
|
51
|
+
FAISS_PRAGMA_IMPRECISE_LOOP
|
|
52
|
+
for (size_t i = 0; i != d; ++i) {
|
|
53
|
+
res += x[i] * y[i];
|
|
54
|
+
}
|
|
55
|
+
return res;
|
|
56
|
+
}
|
|
57
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
|
|
58
|
+
|
|
59
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
|
|
60
|
+
template <>
|
|
61
|
+
float fvec_L1<AUTOVEC_LEVEL>(const float* x, const float* y, size_t d) {
|
|
62
|
+
size_t i;
|
|
63
|
+
float res = 0;
|
|
64
|
+
FAISS_PRAGMA_IMPRECISE_LOOP
|
|
65
|
+
for (i = 0; i < d; i++) {
|
|
66
|
+
const float tmp = x[i] - y[i];
|
|
67
|
+
res += fabs(tmp);
|
|
68
|
+
}
|
|
69
|
+
return res;
|
|
70
|
+
}
|
|
71
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
|
|
72
|
+
|
|
73
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
|
|
74
|
+
template <>
|
|
75
|
+
float fvec_Linf<AUTOVEC_LEVEL>(const float* x, const float* y, size_t d) {
|
|
76
|
+
float res = 0;
|
|
77
|
+
FAISS_PRAGMA_IMPRECISE_LOOP
|
|
78
|
+
for (size_t i = 0; i < d; i++) {
|
|
79
|
+
res = fmax(res, fabs(x[i] - y[i]));
|
|
80
|
+
}
|
|
81
|
+
return res;
|
|
82
|
+
}
|
|
83
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
|
|
84
|
+
|
|
85
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
|
|
86
|
+
template <>
|
|
87
|
+
void fvec_inner_product_batch_4<AUTOVEC_LEVEL>(
|
|
88
|
+
const float* x,
|
|
89
|
+
const float* y0,
|
|
90
|
+
const float* y1,
|
|
91
|
+
const float* y2,
|
|
92
|
+
const float* y3,
|
|
93
|
+
const size_t d,
|
|
94
|
+
float& dis0,
|
|
95
|
+
float& dis1,
|
|
96
|
+
float& dis2,
|
|
97
|
+
float& dis3) {
|
|
98
|
+
float d0 = 0;
|
|
99
|
+
float d1 = 0;
|
|
100
|
+
float d2 = 0;
|
|
101
|
+
float d3 = 0;
|
|
102
|
+
FAISS_PRAGMA_IMPRECISE_LOOP
|
|
103
|
+
for (size_t i = 0; i < d; ++i) {
|
|
104
|
+
d0 += x[i] * y0[i];
|
|
105
|
+
d1 += x[i] * y1[i];
|
|
106
|
+
d2 += x[i] * y2[i];
|
|
107
|
+
d3 += x[i] * y3[i];
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
dis0 = d0;
|
|
111
|
+
dis1 = d1;
|
|
112
|
+
dis2 = d2;
|
|
113
|
+
dis3 = d3;
|
|
114
|
+
}
|
|
115
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
|
|
116
|
+
|
|
117
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
|
|
118
|
+
template <>
|
|
119
|
+
void fvec_L2sqr_batch_4<AUTOVEC_LEVEL>(
|
|
120
|
+
const float* x,
|
|
121
|
+
const float* y0,
|
|
122
|
+
const float* y1,
|
|
123
|
+
const float* y2,
|
|
124
|
+
const float* y3,
|
|
125
|
+
const size_t d,
|
|
126
|
+
float& dis0,
|
|
127
|
+
float& dis1,
|
|
128
|
+
float& dis2,
|
|
129
|
+
float& dis3) {
|
|
130
|
+
float d0 = 0;
|
|
131
|
+
float d1 = 0;
|
|
132
|
+
float d2 = 0;
|
|
133
|
+
float d3 = 0;
|
|
134
|
+
FAISS_PRAGMA_IMPRECISE_LOOP
|
|
135
|
+
for (size_t i = 0; i < d; ++i) {
|
|
136
|
+
const float q0 = x[i] - y0[i];
|
|
137
|
+
const float q1 = x[i] - y1[i];
|
|
138
|
+
const float q2 = x[i] - y2[i];
|
|
139
|
+
const float q3 = x[i] - y3[i];
|
|
140
|
+
d0 += q0 * q0;
|
|
141
|
+
d1 += q1 * q1;
|
|
142
|
+
d2 += q2 * q2;
|
|
143
|
+
d3 += q3 * q3;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
dis0 = d0;
|
|
147
|
+
dis1 = d1;
|
|
148
|
+
dis2 = d2;
|
|
149
|
+
dis3 = d3;
|
|
150
|
+
}
|
|
151
|
+
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
|
|
152
|
+
|
|
153
|
+
} // namespace faiss
|