faiss 0.2.7 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +9 -2
- data/ext/faiss/index.cpp +1 -1
- data/ext/faiss/index_binary.cpp +2 -2
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/lib/faiss/version.rb +1 -1
- data/lib/faiss.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +7 -7
- data/vendor/faiss/faiss/AutoTune.h +0 -1
- data/vendor/faiss/faiss/Clustering.cpp +4 -18
- data/vendor/faiss/faiss/Clustering.h +31 -21
- data/vendor/faiss/faiss/IVFlib.cpp +22 -11
- data/vendor/faiss/faiss/Index.cpp +1 -1
- data/vendor/faiss/faiss/Index.h +20 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
- data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
- data/vendor/faiss/faiss/IndexBinary.h +8 -19
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +106 -187
- data/vendor/faiss/faiss/IndexFastScan.cpp +90 -159
- data/vendor/faiss/faiss/IndexFastScan.h +9 -8
- data/vendor/faiss/faiss/IndexFlat.cpp +195 -3
- data/vendor/faiss/faiss/IndexFlat.h +20 -1
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +3 -1
- data/vendor/faiss/faiss/IndexHNSW.cpp +112 -316
- data/vendor/faiss/faiss/IndexHNSW.h +12 -48
- data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
- data/vendor/faiss/faiss/IndexIDMap.h +24 -2
- data/vendor/faiss/faiss/IndexIVF.cpp +159 -53
- data/vendor/faiss/faiss/IndexIVF.h +37 -5
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +18 -26
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +433 -405
- data/vendor/faiss/faiss/IndexIVFFastScan.h +56 -26
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
- data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +78 -122
- data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +18 -50
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
- data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
- data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -4
- data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
- data/vendor/faiss/faiss/IndexNSG.h +10 -10
- data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
- data/vendor/faiss/faiss/IndexPQ.h +1 -4
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
- data/vendor/faiss/faiss/IndexRefine.cpp +49 -19
- data/vendor/faiss/faiss/IndexRefine.h +7 -0
- data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +22 -16
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
- data/vendor/faiss/faiss/IndexShards.cpp +21 -29
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
- data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
- data/vendor/faiss/faiss/MatrixStats.h +21 -9
- data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
- data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
- data/vendor/faiss/faiss/VectorTransform.h +7 -7
- data/vendor/faiss/faiss/clone_index.cpp +15 -10
- data/vendor/faiss/faiss/clone_index.h +3 -0
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +87 -4
- data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +7 -0
- data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
- data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +8 -9
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +18 -3
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
- data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +117 -17
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +1 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +267 -40
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
- data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +1 -2
- data/vendor/faiss/faiss/impl/DistanceComputer.h +24 -1
- data/vendor/faiss/faiss/impl/FaissException.h +13 -34
- data/vendor/faiss/faiss/impl/HNSW.cpp +321 -70
- data/vendor/faiss/faiss/impl/HNSW.h +9 -8
- data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/impl/NNDescent.cpp +29 -19
- data/vendor/faiss/faiss/impl/NSG.h +1 -1
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +24 -22
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
- data/vendor/faiss/faiss/impl/ResultHandler.h +232 -176
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +444 -104
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -8
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +280 -42
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
- data/vendor/faiss/faiss/impl/index_read.cpp +45 -19
- data/vendor/faiss/faiss/impl/index_write.cpp +60 -41
- data/vendor/faiss/faiss/impl/io.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
- data/vendor/faiss/faiss/impl/platform_macros.h +18 -1
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +3 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +40 -49
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +374 -202
- data/vendor/faiss/faiss/index_factory.cpp +10 -7
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +27 -9
- data/vendor/faiss/faiss/invlists/InvertedLists.h +12 -3
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
- data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
- data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
- data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
- data/vendor/faiss/faiss/utils/distances.cpp +128 -74
- data/vendor/faiss/faiss/utils/distances.h +81 -4
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
- data/vendor/faiss/faiss/utils/distances_simd.cpp +428 -70
- data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
- data/vendor/faiss/faiss/utils/fp16.h +2 -0
- data/vendor/faiss/faiss/utils/hamming.cpp +162 -110
- data/vendor/faiss/faiss/utils/hamming.h +58 -0
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +15 -87
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +57 -0
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
- data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
- data/vendor/faiss/faiss/utils/prefetch.h +77 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
- data/vendor/faiss/faiss/utils/simdlib_neon.h +72 -77
- data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
- data/vendor/faiss/faiss/utils/sorting.h +27 -0
- data/vendor/faiss/faiss/utils/utils.cpp +112 -6
- data/vendor/faiss/faiss/utils/utils.h +57 -20
- metadata +11 -4
@@ -65,14 +65,6 @@ struct ScalarQuantizer : Quantizer {
|
|
65
65
|
|
66
66
|
void train(size_t n, const float* x) override;
|
67
67
|
|
68
|
-
/// Used by an IVF index to train based on the residuals
|
69
|
-
void train_residual(
|
70
|
-
size_t n,
|
71
|
-
const float* x,
|
72
|
-
Index* quantizer,
|
73
|
-
bool by_residual,
|
74
|
-
bool verbose);
|
75
|
-
|
76
68
|
/** Encode a set of vectors
|
77
69
|
*
|
78
70
|
* @param x vectors to encode, size n * d
|
@@ -13,25 +13,218 @@
|
|
13
13
|
|
14
14
|
#include <type_traits>
|
15
15
|
|
16
|
+
#include <faiss/impl/ProductQuantizer.h>
|
16
17
|
#include <faiss/impl/code_distance/code_distance-generic.h>
|
17
18
|
|
18
19
|
namespace {
|
19
20
|
|
21
|
+
inline float horizontal_sum(const __m128 v) {
|
22
|
+
const __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 3, 2));
|
23
|
+
const __m128 v1 = _mm_add_ps(v, v0);
|
24
|
+
__m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1));
|
25
|
+
const __m128 v3 = _mm_add_ps(v1, v2);
|
26
|
+
return _mm_cvtss_f32(v3);
|
27
|
+
}
|
28
|
+
|
20
29
|
// Computes a horizontal sum over an __m256 register
|
21
|
-
inline float horizontal_sum(const __m256
|
22
|
-
const
|
23
|
-
|
30
|
+
inline float horizontal_sum(const __m256 v) {
|
31
|
+
const __m128 v0 =
|
32
|
+
_mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
|
33
|
+
return horizontal_sum(v0);
|
34
|
+
}
|
35
|
+
|
36
|
+
// processes a single code for M=4, ksub=256, nbits=8
|
37
|
+
float inline distance_single_code_avx2_pqdecoder8_m4(
|
38
|
+
// precomputed distances, layout (4, 256)
|
39
|
+
const float* sim_table,
|
40
|
+
const uint8_t* code) {
|
41
|
+
float result = 0;
|
42
|
+
|
43
|
+
const float* tab = sim_table;
|
44
|
+
constexpr size_t ksub = 1 << 8;
|
45
|
+
|
46
|
+
const __m128i vksub = _mm_set1_epi32(ksub);
|
47
|
+
__m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3);
|
48
|
+
offsets_0 = _mm_mullo_epi32(offsets_0, vksub);
|
49
|
+
|
50
|
+
// accumulators of partial sums
|
51
|
+
__m128 partialSum;
|
52
|
+
|
53
|
+
// load 4 uint8 values
|
54
|
+
const __m128i mm1 = _mm_cvtsi32_si128(*((const int32_t*)code));
|
55
|
+
{
|
56
|
+
// convert uint8 values (low part of __m128i) to int32
|
57
|
+
// values
|
58
|
+
const __m128i idx1 = _mm_cvtepu8_epi32(mm1);
|
24
59
|
|
25
|
-
|
26
|
-
|
27
|
-
const __m128 h3 = _mm256_castps256_ps128(h1);
|
60
|
+
// add offsets
|
61
|
+
const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0);
|
28
62
|
|
29
|
-
|
30
|
-
|
63
|
+
// gather 4 values, similar to 4 operations of tab[idx]
|
64
|
+
__m128 collected =
|
65
|
+
_mm_i32gather_ps(tab, indices_to_read_from, sizeof(float));
|
31
66
|
|
32
|
-
|
33
|
-
|
34
|
-
|
67
|
+
// collect partial sums
|
68
|
+
partialSum = collected;
|
69
|
+
}
|
70
|
+
|
71
|
+
// horizontal sum for partialSum
|
72
|
+
result = horizontal_sum(partialSum);
|
73
|
+
return result;
|
74
|
+
}
|
75
|
+
|
76
|
+
// processes a single code for M=8, ksub=256, nbits=8
|
77
|
+
float inline distance_single_code_avx2_pqdecoder8_m8(
|
78
|
+
// precomputed distances, layout (8, 256)
|
79
|
+
const float* sim_table,
|
80
|
+
const uint8_t* code) {
|
81
|
+
float result = 0;
|
82
|
+
|
83
|
+
const float* tab = sim_table;
|
84
|
+
constexpr size_t ksub = 1 << 8;
|
85
|
+
|
86
|
+
const __m256i vksub = _mm256_set1_epi32(ksub);
|
87
|
+
__m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
|
88
|
+
offsets_0 = _mm256_mullo_epi32(offsets_0, vksub);
|
89
|
+
|
90
|
+
// accumulators of partial sums
|
91
|
+
__m256 partialSum;
|
92
|
+
|
93
|
+
// load 8 uint8 values
|
94
|
+
const __m128i mm1 = _mm_loadu_si64((const __m128i_u*)code);
|
95
|
+
{
|
96
|
+
// convert uint8 values (low part of __m128i) to int32
|
97
|
+
// values
|
98
|
+
const __m256i idx1 = _mm256_cvtepu8_epi32(mm1);
|
99
|
+
|
100
|
+
// add offsets
|
101
|
+
const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0);
|
102
|
+
|
103
|
+
// gather 8 values, similar to 8 operations of tab[idx]
|
104
|
+
__m256 collected =
|
105
|
+
_mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float));
|
106
|
+
|
107
|
+
// collect partial sums
|
108
|
+
partialSum = collected;
|
109
|
+
}
|
110
|
+
|
111
|
+
// horizontal sum for partialSum
|
112
|
+
result = horizontal_sum(partialSum);
|
113
|
+
return result;
|
114
|
+
}
|
115
|
+
|
116
|
+
// processes four codes for M=4, ksub=256, nbits=8
|
117
|
+
inline void distance_four_codes_avx2_pqdecoder8_m4(
|
118
|
+
// precomputed distances, layout (4, 256)
|
119
|
+
const float* sim_table,
|
120
|
+
// codes
|
121
|
+
const uint8_t* __restrict code0,
|
122
|
+
const uint8_t* __restrict code1,
|
123
|
+
const uint8_t* __restrict code2,
|
124
|
+
const uint8_t* __restrict code3,
|
125
|
+
// computed distances
|
126
|
+
float& result0,
|
127
|
+
float& result1,
|
128
|
+
float& result2,
|
129
|
+
float& result3) {
|
130
|
+
constexpr intptr_t N = 4;
|
131
|
+
|
132
|
+
const float* tab = sim_table;
|
133
|
+
constexpr size_t ksub = 1 << 8;
|
134
|
+
|
135
|
+
// process 4 values
|
136
|
+
const __m128i vksub = _mm_set1_epi32(ksub);
|
137
|
+
__m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3);
|
138
|
+
offsets_0 = _mm_mullo_epi32(offsets_0, vksub);
|
139
|
+
|
140
|
+
// accumulators of partial sums
|
141
|
+
__m128 partialSums[N];
|
142
|
+
|
143
|
+
// load 4 uint8 values
|
144
|
+
__m128i mm1[N];
|
145
|
+
mm1[0] = _mm_cvtsi32_si128(*((const int32_t*)code0));
|
146
|
+
mm1[1] = _mm_cvtsi32_si128(*((const int32_t*)code1));
|
147
|
+
mm1[2] = _mm_cvtsi32_si128(*((const int32_t*)code2));
|
148
|
+
mm1[3] = _mm_cvtsi32_si128(*((const int32_t*)code3));
|
149
|
+
|
150
|
+
for (intptr_t j = 0; j < N; j++) {
|
151
|
+
// convert uint8 values (low part of __m128i) to int32
|
152
|
+
// values
|
153
|
+
const __m128i idx1 = _mm_cvtepu8_epi32(mm1[j]);
|
154
|
+
|
155
|
+
// add offsets
|
156
|
+
const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0);
|
157
|
+
|
158
|
+
// gather 4 values, similar to 4 operations of tab[idx]
|
159
|
+
__m128 collected =
|
160
|
+
_mm_i32gather_ps(tab, indices_to_read_from, sizeof(float));
|
161
|
+
|
162
|
+
// collect partial sums
|
163
|
+
partialSums[j] = collected;
|
164
|
+
}
|
165
|
+
|
166
|
+
// horizontal sum for partialSum
|
167
|
+
result0 = horizontal_sum(partialSums[0]);
|
168
|
+
result1 = horizontal_sum(partialSums[1]);
|
169
|
+
result2 = horizontal_sum(partialSums[2]);
|
170
|
+
result3 = horizontal_sum(partialSums[3]);
|
171
|
+
}
|
172
|
+
|
173
|
+
// processes four codes for M=8, ksub=256, nbits=8
|
174
|
+
inline void distance_four_codes_avx2_pqdecoder8_m8(
|
175
|
+
// precomputed distances, layout (8, 256)
|
176
|
+
const float* sim_table,
|
177
|
+
// codes
|
178
|
+
const uint8_t* __restrict code0,
|
179
|
+
const uint8_t* __restrict code1,
|
180
|
+
const uint8_t* __restrict code2,
|
181
|
+
const uint8_t* __restrict code3,
|
182
|
+
// computed distances
|
183
|
+
float& result0,
|
184
|
+
float& result1,
|
185
|
+
float& result2,
|
186
|
+
float& result3) {
|
187
|
+
constexpr intptr_t N = 4;
|
188
|
+
|
189
|
+
const float* tab = sim_table;
|
190
|
+
constexpr size_t ksub = 1 << 8;
|
191
|
+
|
192
|
+
// process 8 values
|
193
|
+
const __m256i vksub = _mm256_set1_epi32(ksub);
|
194
|
+
__m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
|
195
|
+
offsets_0 = _mm256_mullo_epi32(offsets_0, vksub);
|
196
|
+
|
197
|
+
// accumulators of partial sums
|
198
|
+
__m256 partialSums[N];
|
199
|
+
|
200
|
+
// load 8 uint8 values
|
201
|
+
__m128i mm1[N];
|
202
|
+
mm1[0] = _mm_loadu_si64((const __m128i_u*)code0);
|
203
|
+
mm1[1] = _mm_loadu_si64((const __m128i_u*)code1);
|
204
|
+
mm1[2] = _mm_loadu_si64((const __m128i_u*)code2);
|
205
|
+
mm1[3] = _mm_loadu_si64((const __m128i_u*)code3);
|
206
|
+
|
207
|
+
for (intptr_t j = 0; j < N; j++) {
|
208
|
+
// convert uint8 values (low part of __m128i) to int32
|
209
|
+
// values
|
210
|
+
const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]);
|
211
|
+
|
212
|
+
// add offsets
|
213
|
+
const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0);
|
214
|
+
|
215
|
+
// gather 8 values, similar to 8 operations of tab[idx]
|
216
|
+
__m256 collected =
|
217
|
+
_mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float));
|
218
|
+
|
219
|
+
// collect partial sums
|
220
|
+
partialSums[j] = collected;
|
221
|
+
}
|
222
|
+
|
223
|
+
// horizontal sum for partialSum
|
224
|
+
result0 = horizontal_sum(partialSums[0]);
|
225
|
+
result1 = horizontal_sum(partialSums[1]);
|
226
|
+
result2 = horizontal_sum(partialSums[2]);
|
227
|
+
result3 = horizontal_sum(partialSums[3]);
|
35
228
|
}
|
36
229
|
|
37
230
|
} // namespace
|
@@ -41,36 +234,48 @@ namespace faiss {
|
|
41
234
|
template <typename PQDecoderT>
|
42
235
|
typename std::enable_if<!std::is_same<PQDecoderT, PQDecoder8>::value, float>::
|
43
236
|
type inline distance_single_code_avx2(
|
44
|
-
//
|
45
|
-
const
|
237
|
+
// number of subquantizers
|
238
|
+
const size_t M,
|
239
|
+
// number of bits per quantization index
|
240
|
+
const size_t nbits,
|
46
241
|
// precomputed distances, layout (M, ksub)
|
47
242
|
const float* sim_table,
|
48
243
|
const uint8_t* code) {
|
49
244
|
// default implementation
|
50
|
-
return distance_single_code_generic<PQDecoderT>(
|
245
|
+
return distance_single_code_generic<PQDecoderT>(M, nbits, sim_table, code);
|
51
246
|
}
|
52
247
|
|
53
248
|
template <typename PQDecoderT>
|
54
249
|
typename std::enable_if<std::is_same<PQDecoderT, PQDecoder8>::value, float>::
|
55
250
|
type inline distance_single_code_avx2(
|
56
|
-
//
|
57
|
-
const
|
251
|
+
// number of subquantizers
|
252
|
+
const size_t M,
|
253
|
+
// number of bits per quantization index
|
254
|
+
const size_t nbits,
|
58
255
|
// precomputed distances, layout (M, ksub)
|
59
256
|
const float* sim_table,
|
60
257
|
const uint8_t* code) {
|
258
|
+
if (M == 4) {
|
259
|
+
return distance_single_code_avx2_pqdecoder8_m4(sim_table, code);
|
260
|
+
}
|
261
|
+
if (M == 8) {
|
262
|
+
return distance_single_code_avx2_pqdecoder8_m8(sim_table, code);
|
263
|
+
}
|
264
|
+
|
61
265
|
float result = 0;
|
266
|
+
constexpr size_t ksub = 1 << 8;
|
62
267
|
|
63
268
|
size_t m = 0;
|
64
|
-
const size_t pqM16 =
|
269
|
+
const size_t pqM16 = M / 16;
|
65
270
|
|
66
271
|
const float* tab = sim_table;
|
67
272
|
|
68
273
|
if (pqM16 > 0) {
|
69
274
|
// process 16 values per loop
|
70
275
|
|
71
|
-
const __m256i
|
276
|
+
const __m256i vksub = _mm256_set1_epi32(ksub);
|
72
277
|
__m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
|
73
|
-
offsets_0 = _mm256_mullo_epi32(offsets_0,
|
278
|
+
offsets_0 = _mm256_mullo_epi32(offsets_0, vksub);
|
74
279
|
|
75
280
|
// accumulators of partial sums
|
76
281
|
__m256 partialSum = _mm256_setzero_ps();
|
@@ -91,7 +296,7 @@ typename std::enable_if<std::is_same<PQDecoderT, PQDecoder8>::value, float>::
|
|
91
296
|
// gather 8 values, similar to 8 operations of tab[idx]
|
92
297
|
__m256 collected = _mm256_i32gather_ps(
|
93
298
|
tab, indices_to_read_from, sizeof(float));
|
94
|
-
tab +=
|
299
|
+
tab += ksub * 8;
|
95
300
|
|
96
301
|
// collect partial sums
|
97
302
|
partialSum = _mm256_add_ps(partialSum, collected);
|
@@ -111,7 +316,7 @@ typename std::enable_if<std::is_same<PQDecoderT, PQDecoder8>::value, float>::
|
|
111
316
|
// gather 8 values, similar to 8 operations of tab[idx]
|
112
317
|
__m256 collected = _mm256_i32gather_ps(
|
113
318
|
tab, indices_to_read_from, sizeof(float));
|
114
|
-
tab +=
|
319
|
+
tab += ksub * 8;
|
115
320
|
|
116
321
|
// collect partial sums
|
117
322
|
partialSum = _mm256_add_ps(partialSum, collected);
|
@@ -123,13 +328,13 @@ typename std::enable_if<std::is_same<PQDecoderT, PQDecoder8>::value, float>::
|
|
123
328
|
}
|
124
329
|
|
125
330
|
//
|
126
|
-
if (m <
|
331
|
+
if (m < M) {
|
127
332
|
// process leftovers
|
128
|
-
PQDecoder8 decoder(code + m,
|
333
|
+
PQDecoder8 decoder(code + m, nbits);
|
129
334
|
|
130
|
-
for (; m <
|
335
|
+
for (; m < M; m++) {
|
131
336
|
result += tab[decoder.decode()];
|
132
|
-
tab +=
|
337
|
+
tab += ksub;
|
133
338
|
}
|
134
339
|
}
|
135
340
|
|
@@ -140,8 +345,10 @@ template <typename PQDecoderT>
|
|
140
345
|
typename std::enable_if<!std::is_same<PQDecoderT, PQDecoder8>::value, void>::
|
141
346
|
type
|
142
347
|
distance_four_codes_avx2(
|
143
|
-
//
|
144
|
-
const
|
348
|
+
// number of subquantizers
|
349
|
+
const size_t M,
|
350
|
+
// number of bits per quantization index
|
351
|
+
const size_t nbits,
|
145
352
|
// precomputed distances, layout (M, ksub)
|
146
353
|
const float* sim_table,
|
147
354
|
// codes
|
@@ -155,7 +362,8 @@ typename std::enable_if<!std::is_same<PQDecoderT, PQDecoder8>::value, void>::
|
|
155
362
|
float& result2,
|
156
363
|
float& result3) {
|
157
364
|
distance_four_codes_generic<PQDecoderT>(
|
158
|
-
|
365
|
+
M,
|
366
|
+
nbits,
|
159
367
|
sim_table,
|
160
368
|
code0,
|
161
369
|
code1,
|
@@ -171,8 +379,10 @@ typename std::enable_if<!std::is_same<PQDecoderT, PQDecoder8>::value, void>::
|
|
171
379
|
template <typename PQDecoderT>
|
172
380
|
typename std::enable_if<std::is_same<PQDecoderT, PQDecoder8>::value, void>::type
|
173
381
|
distance_four_codes_avx2(
|
174
|
-
//
|
175
|
-
const
|
382
|
+
// number of subquantizers
|
383
|
+
const size_t M,
|
384
|
+
// number of bits per quantization index
|
385
|
+
const size_t nbits,
|
176
386
|
// precomputed distances, layout (M, ksub)
|
177
387
|
const float* sim_table,
|
178
388
|
// codes
|
@@ -185,13 +395,41 @@ distance_four_codes_avx2(
|
|
185
395
|
float& result1,
|
186
396
|
float& result2,
|
187
397
|
float& result3) {
|
398
|
+
if (M == 4) {
|
399
|
+
distance_four_codes_avx2_pqdecoder8_m4(
|
400
|
+
sim_table,
|
401
|
+
code0,
|
402
|
+
code1,
|
403
|
+
code2,
|
404
|
+
code3,
|
405
|
+
result0,
|
406
|
+
result1,
|
407
|
+
result2,
|
408
|
+
result3);
|
409
|
+
return;
|
410
|
+
}
|
411
|
+
if (M == 8) {
|
412
|
+
distance_four_codes_avx2_pqdecoder8_m8(
|
413
|
+
sim_table,
|
414
|
+
code0,
|
415
|
+
code1,
|
416
|
+
code2,
|
417
|
+
code3,
|
418
|
+
result0,
|
419
|
+
result1,
|
420
|
+
result2,
|
421
|
+
result3);
|
422
|
+
return;
|
423
|
+
}
|
424
|
+
|
188
425
|
result0 = 0;
|
189
426
|
result1 = 0;
|
190
427
|
result2 = 0;
|
191
428
|
result3 = 0;
|
429
|
+
constexpr size_t ksub = 1 << 8;
|
192
430
|
|
193
431
|
size_t m = 0;
|
194
|
-
const size_t pqM16 =
|
432
|
+
const size_t pqM16 = M / 16;
|
195
433
|
|
196
434
|
constexpr intptr_t N = 4;
|
197
435
|
|
@@ -199,9 +437,9 @@ distance_four_codes_avx2(
|
|
199
437
|
|
200
438
|
if (pqM16 > 0) {
|
201
439
|
// process 16 values per loop
|
202
|
-
const __m256i
|
440
|
+
const __m256i vksub = _mm256_set1_epi32(ksub);
|
203
441
|
__m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
|
204
|
-
offsets_0 = _mm256_mullo_epi32(offsets_0,
|
442
|
+
offsets_0 = _mm256_mullo_epi32(offsets_0, vksub);
|
205
443
|
|
206
444
|
// accumulators of partial sums
|
207
445
|
__m256 partialSums[N];
|
@@ -235,7 +473,7 @@ distance_four_codes_avx2(
|
|
235
473
|
// collect partial sums
|
236
474
|
partialSums[j] = _mm256_add_ps(partialSums[j], collected);
|
237
475
|
}
|
238
|
-
tab +=
|
476
|
+
tab += ksub * 8;
|
239
477
|
|
240
478
|
// process next 8 codes
|
241
479
|
for (intptr_t j = 0; j < N; j++) {
|
@@ -259,7 +497,7 @@ distance_four_codes_avx2(
|
|
259
497
|
partialSums[j] = _mm256_add_ps(partialSums[j], collected);
|
260
498
|
}
|
261
499
|
|
262
|
-
tab +=
|
500
|
+
tab += ksub * 8;
|
263
501
|
}
|
264
502
|
|
265
503
|
// horizontal sum for partialSum
|
@@ -270,18 +508,18 @@ distance_four_codes_avx2(
|
|
270
508
|
}
|
271
509
|
|
272
510
|
//
|
273
|
-
if (m <
|
511
|
+
if (m < M) {
|
274
512
|
// process leftovers
|
275
|
-
PQDecoder8 decoder0(code0 + m,
|
276
|
-
PQDecoder8 decoder1(code1 + m,
|
277
|
-
PQDecoder8 decoder2(code2 + m,
|
278
|
-
PQDecoder8 decoder3(code3 + m,
|
279
|
-
for (; m <
|
513
|
+
PQDecoder8 decoder0(code0 + m, nbits);
|
514
|
+
PQDecoder8 decoder1(code1 + m, nbits);
|
515
|
+
PQDecoder8 decoder2(code2 + m, nbits);
|
516
|
+
PQDecoder8 decoder3(code3 + m, nbits);
|
517
|
+
for (; m < M; m++) {
|
280
518
|
result0 += tab[decoder0.decode()];
|
281
519
|
result1 += tab[decoder1.decode()];
|
282
520
|
result2 += tab[decoder2.decode()];
|
283
521
|
result3 += tab[decoder3.decode()];
|
284
|
-
tab +=
|
522
|
+
tab += ksub;
|
285
523
|
}
|
286
524
|
}
|
287
525
|
}
|
@@ -7,27 +7,31 @@
|
|
7
7
|
|
8
8
|
#pragma once
|
9
9
|
|
10
|
-
#include <
|
10
|
+
#include <cstddef>
|
11
|
+
#include <cstdint>
|
11
12
|
|
12
13
|
namespace faiss {
|
13
14
|
|
14
15
|
/// Returns the distance to a single code.
|
15
16
|
template <typename PQDecoderT>
|
16
17
|
inline float distance_single_code_generic(
|
17
|
-
//
|
18
|
-
const
|
18
|
+
// number of subquantizers
|
19
|
+
const size_t M,
|
20
|
+
// number of bits per quantization index
|
21
|
+
const size_t nbits,
|
19
22
|
// precomputed distances, layout (M, ksub)
|
20
23
|
const float* sim_table,
|
21
24
|
// the code
|
22
25
|
const uint8_t* code) {
|
23
|
-
PQDecoderT decoder(code,
|
26
|
+
PQDecoderT decoder(code, nbits);
|
27
|
+
const size_t ksub = 1 << nbits;
|
24
28
|
|
25
29
|
const float* tab = sim_table;
|
26
30
|
float result = 0;
|
27
31
|
|
28
|
-
for (size_t m = 0; m <
|
32
|
+
for (size_t m = 0; m < M; m++) {
|
29
33
|
result += tab[decoder.decode()];
|
30
|
-
tab +=
|
34
|
+
tab += ksub;
|
31
35
|
}
|
32
36
|
|
33
37
|
return result;
|
@@ -37,8 +41,10 @@ inline float distance_single_code_generic(
|
|
37
41
|
/// General-purpose version.
|
38
42
|
template <typename PQDecoderT>
|
39
43
|
inline void distance_four_codes_generic(
|
40
|
-
//
|
41
|
-
const
|
44
|
+
// number of subquantizers
|
45
|
+
const size_t M,
|
46
|
+
// number of bits per quantization index
|
47
|
+
const size_t nbits,
|
42
48
|
// precomputed distances, layout (M, ksub)
|
43
49
|
const float* sim_table,
|
44
50
|
// codes
|
@@ -51,10 +57,11 @@ inline void distance_four_codes_generic(
|
|
51
57
|
float& result1,
|
52
58
|
float& result2,
|
53
59
|
float& result3) {
|
54
|
-
PQDecoderT decoder0(code0,
|
55
|
-
PQDecoderT decoder1(code1,
|
56
|
-
PQDecoderT decoder2(code2,
|
57
|
-
PQDecoderT decoder3(code3,
|
60
|
+
PQDecoderT decoder0(code0, nbits);
|
61
|
+
PQDecoderT decoder1(code1, nbits);
|
62
|
+
PQDecoderT decoder2(code2, nbits);
|
63
|
+
PQDecoderT decoder3(code3, nbits);
|
64
|
+
const size_t ksub = 1 << nbits;
|
58
65
|
|
59
66
|
const float* tab = sim_table;
|
60
67
|
result0 = 0;
|
@@ -62,12 +69,12 @@ inline void distance_four_codes_generic(
|
|
62
69
|
result2 = 0;
|
63
70
|
result3 = 0;
|
64
71
|
|
65
|
-
for (size_t m = 0; m <
|
72
|
+
for (size_t m = 0; m < M; m++) {
|
66
73
|
result0 += tab[decoder0.decode()];
|
67
74
|
result1 += tab[decoder1.decode()];
|
68
75
|
result2 += tab[decoder2.decode()];
|
69
76
|
result3 += tab[decoder3.decode()];
|
70
|
-
tab +=
|
77
|
+
tab += ksub;
|
71
78
|
}
|
72
79
|
}
|
73
80
|
|
@@ -32,19 +32,23 @@ namespace faiss {
|
|
32
32
|
|
33
33
|
template <typename PQDecoderT>
|
34
34
|
inline float distance_single_code(
|
35
|
-
//
|
36
|
-
const
|
35
|
+
// number of subquantizers
|
36
|
+
const size_t M,
|
37
|
+
// number of bits per quantization index
|
38
|
+
const size_t nbits,
|
37
39
|
// precomputed distances, layout (M, ksub)
|
38
40
|
const float* sim_table,
|
39
41
|
// the code
|
40
42
|
const uint8_t* code) {
|
41
|
-
return distance_single_code_avx2<PQDecoderT>(
|
43
|
+
return distance_single_code_avx2<PQDecoderT>(M, nbits, sim_table, code);
|
42
44
|
}
|
43
45
|
|
44
46
|
template <typename PQDecoderT>
|
45
47
|
inline void distance_four_codes(
|
46
|
-
//
|
47
|
-
const
|
48
|
+
// number of subquantizers
|
49
|
+
const size_t M,
|
50
|
+
// number of bits per quantization index
|
51
|
+
const size_t nbits,
|
48
52
|
// precomputed distances, layout (M, ksub)
|
49
53
|
const float* sim_table,
|
50
54
|
// codes
|
@@ -58,7 +62,8 @@ inline void distance_four_codes(
|
|
58
62
|
float& result2,
|
59
63
|
float& result3) {
|
60
64
|
distance_four_codes_avx2<PQDecoderT>(
|
61
|
-
|
65
|
+
M,
|
66
|
+
nbits,
|
62
67
|
sim_table,
|
63
68
|
code0,
|
64
69
|
code1,
|
@@ -80,19 +85,23 @@ namespace faiss {
|
|
80
85
|
|
81
86
|
template <typename PQDecoderT>
|
82
87
|
inline float distance_single_code(
|
83
|
-
//
|
84
|
-
const
|
88
|
+
// number of subquantizers
|
89
|
+
const size_t M,
|
90
|
+
// number of bits per quantization index
|
91
|
+
const size_t nbits,
|
85
92
|
// precomputed distances, layout (M, ksub)
|
86
93
|
const float* sim_table,
|
87
94
|
// the code
|
88
95
|
const uint8_t* code) {
|
89
|
-
return distance_single_code_generic<PQDecoderT>(
|
96
|
+
return distance_single_code_generic<PQDecoderT>(M, nbits, sim_table, code);
|
90
97
|
}
|
91
98
|
|
92
99
|
template <typename PQDecoderT>
|
93
100
|
inline void distance_four_codes(
|
94
|
-
//
|
95
|
-
const
|
101
|
+
// number of subquantizers
|
102
|
+
const size_t M,
|
103
|
+
// number of bits per quantization index
|
104
|
+
const size_t nbits,
|
96
105
|
// precomputed distances, layout (M, ksub)
|
97
106
|
const float* sim_table,
|
98
107
|
// codes
|
@@ -106,7 +115,8 @@ inline void distance_four_codes(
|
|
106
115
|
float& result2,
|
107
116
|
float& result3) {
|
108
117
|
distance_four_codes_generic<PQDecoderT>(
|
109
|
-
|
118
|
+
M,
|
119
|
+
nbits,
|
110
120
|
sim_table,
|
111
121
|
code0,
|
112
122
|
code1,
|