faiss 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/Index.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
- data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
- data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
- data/vendor/faiss/faiss/factory_tools.cpp +4 -0
- data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
- data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
- data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
- data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
- data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
- data/vendor/faiss/faiss/impl/HNSW.h +51 -13
- data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
- data/vendor/faiss/faiss/impl/Panorama.h +11 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
- data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
- data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
- data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
- data/vendor/faiss/faiss/impl/io_macros.h +25 -0
- data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
- data/vendor/faiss/faiss/index_factory.cpp +5 -1
- data/vendor/faiss/faiss/index_io.h +16 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
- data/vendor/faiss/faiss/utils/bf16.h +34 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
- data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
- metadata +12 -2
|
@@ -122,6 +122,49 @@ inline float32x4x2_t elementaryBlock8x1bAccum(
|
|
|
122
122
|
return {result0, result1};
|
|
123
123
|
}
|
|
124
124
|
|
|
125
|
+
// Gathers 8 float values from four independent 2-float sources.
// Each of fine0..fine3 is read with vld1_f32, i.e. two consecutive floats
// per pointer; nothing is written through the pointers.
|
|
126
|
+
// Returns {
|
|
127
|
+
// val[0] = {fine0[0], fine0[1], fine1[0], fine1[1]};
|
|
128
|
+
// val[1] = {fine2[0], fine2[1], fine3[0], fine3[1]};
|
|
129
|
+
// }
|
|
130
|
+
inline float32x4x2_t elementaryBlock2x4b(
|
|
131
|
+
const float* const __restrict fine0,
|
|
132
|
+
const float* const __restrict fine1,
|
|
133
|
+
const float* const __restrict fine2,
|
|
134
|
+
const float* const __restrict fine3) {
|
|
135
|
+
// Load one 2-float half-vector per source pointer.
const auto fine0Value = vld1_f32(fine0);
|
|
136
|
+
const auto fine1Value = vld1_f32(fine1);
|
|
137
|
+
const auto fine2Value = vld1_f32(fine2);
|
|
138
|
+
const auto fine3Value = vld1_f32(fine3);
|
|
139
|
+
|
|
140
|
+
// Pair the halves: (fine0, fine1) -> val[0], (fine2, fine3) -> val[1].
const auto result0 = vcombine_f32(fine0Value, fine1Value);
|
|
141
|
+
const auto result1 = vcombine_f32(fine2Value, fine3Value);
|
|
142
|
+
|
|
143
|
+
return {result0, result1};
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
// Weighted accumulation of 8 float values gathered from four 2-float
// sources (same gather as elementaryBlock2x4b). existingValue is taken by
// value; the updated accumulators are returned, not written in place.
|
|
147
|
+
// Returns {
|
|
148
|
+
// val[0] = existingValue.val[0] + weight * {fine0[0..1], fine1[0..1]};
|
|
149
|
+
// val[1] = existingValue.val[1] + weight * {fine2[0..1], fine3[0..1]};
|
|
150
|
+
// }
|
|
151
|
+
inline float32x4x2_t elementaryBlock2x4bAccum(
|
|
152
|
+
const float* const __restrict fine0,
|
|
153
|
+
const float* const __restrict fine1,
|
|
154
|
+
const float* const __restrict fine2,
|
|
155
|
+
const float* const __restrict fine3,
|
|
156
|
+
const float weight,
|
|
157
|
+
const float32x4x2_t existingValue) {
|
|
158
|
+
const auto fineValue = elementaryBlock2x4b(fine0, fine1, fine2, fine3);
|
|
159
|
+
|
|
160
|
+
// Broadcast the scalar weight to all four lanes for the fused
// multiply-add below.
const auto weightNeon = vdupq_n_f32(weight);
|
|
161
|
+
const auto result0 =
|
|
162
|
+
vfmaq_f32(existingValue.val[0], weightNeon, fineValue.val[0]);
|
|
163
|
+
const auto result1 =
|
|
164
|
+
vfmaq_f32(existingValue.val[1], weightNeon, fineValue.val[1]);
|
|
165
|
+
return {result0, result1};
|
|
166
|
+
}
|
|
167
|
+
|
|
125
168
|
// The following code uses template-based for-loop unrolling,
|
|
126
169
|
// because the compiler does not do that on its own as needed.
|
|
127
170
|
// The idea is the following:
|
|
@@ -141,16 +184,660 @@ inline float32x4x2_t elementaryBlock8x1bAccum(
|
|
|
141
184
|
// Initiate the loop:
|
|
142
185
|
// Foo<0, MAX>::bar();
|
|
143
186
|
|
|
144
|
-
template <
|
|
145
|
-
intptr_t DIM,
|
|
146
|
-
intptr_t FINE_SIZE,
|
|
147
|
-
intptr_t FINE_BITS,
|
|
148
|
-
intptr_t CPOS,
|
|
149
|
-
bool
|
|
150
|
-
bool
|
|
151
|
-
bool
|
|
152
|
-
bool
|
|
153
|
-
|
|
187
|
+
// Primary template of the unrolled PQ decoder. It is only declared here;
// the behavior lives in partial specializations selected by the bool flags.
// The flags default to values computed from FINE_SIZE, CPOS and DIM, so
// callers name only the first four parameters, e.g.
//   IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>.
// Flag meaning, as given by the default expressions:
//   FINE_SIZE_EQ_2 / FINE_SIZE_EQ_4: FINE_SIZE is exactly 2 / 4.
//   QPOS_LEFT_GE_8 / QPOS_LEFT_GE_4: FINE_SIZE - CPOS % FINE_SIZE is at
//     least 8 / 4.
//   DIM_EQ_CPOS: CPOS equals DIM (presumably the end of the unrolled
//     sequence; the matching specialization is not visible here).
template <
|
|
188
|
+
intptr_t DIM,
|
|
189
|
+
intptr_t FINE_SIZE,
|
|
190
|
+
intptr_t FINE_BITS,
|
|
191
|
+
intptr_t CPOS,
|
|
192
|
+
bool FINE_SIZE_EQ_2 = FINE_SIZE == 2,
|
|
193
|
+
bool FINE_SIZE_EQ_4 = FINE_SIZE == 4,
|
|
194
|
+
bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8),
|
|
195
|
+
bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4),
|
|
196
|
+
bool DIM_EQ_CPOS = DIM == CPOS>
|
|
197
|
+
struct IndexPQDecoderImpl;
|
|
198
|
+
|
|
199
|
+
template <
|
|
200
|
+
intptr_t DIM,
|
|
201
|
+
intptr_t CPOS,
|
|
202
|
+
intptr_t FINE_BITS,
|
|
203
|
+
bool QPOS_LEFT_GE_8,
|
|
204
|
+
bool QPOS_LEFT_GE_4>
|
|
205
|
+
struct IndexPQDecoderImpl<
|
|
206
|
+
DIM,
|
|
207
|
+
2,
|
|
208
|
+
FINE_BITS,
|
|
209
|
+
CPOS,
|
|
210
|
+
true,
|
|
211
|
+
false,
|
|
212
|
+
QPOS_LEFT_GE_8,
|
|
213
|
+
QPOS_LEFT_GE_4,
|
|
214
|
+
false> {
|
|
215
|
+
static constexpr intptr_t FINE_SIZE = 2;
|
|
216
|
+
|
|
217
|
+
static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
|
|
218
|
+
static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
|
|
219
|
+
|
|
220
|
+
static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
|
|
221
|
+
|
|
222
|
+
static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
|
|
223
|
+
|
|
224
|
+
// Decodes one code into 8 output floats.
//   pqFineCentroids0: table the 2-float centroids are gathered from.
//   code0: encoded input; four fine codes are read from it through
//     detail::UintReader at indices fineCentroidIdx + 0 .. + 3.
//   outputStore: destination; elements [CPOS, CPOS + 8) are overwritten.
// After the stores, control passes to the CPOS + 8 instantiation.
|
|
225
|
+
static void store(
|
|
226
|
+
const float* const __restrict pqFineCentroids0,
|
|
227
|
+
const uint8_t* const __restrict code0,
|
|
228
|
+
float* const __restrict outputStore) {
|
|
229
|
+
// fine quantizer
|
|
230
|
+
const uint8_t* const __restrict fine0 = code0;
|
|
231
|
+
|
|
232
|
+
// Each fine centroid holds FINE_SIZE (2) floats,
|
|
233
|
+
// so four of them are gathered to fill 8 floats per step.
|
|
234
|
+
|
|
235
|
+
const intptr_t fineCode0a = detail::
|
|
236
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
237
|
+
fine0);
|
|
238
|
+
const intptr_t fineCode0b = detail::
|
|
239
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
240
|
+
fine0);
|
|
241
|
+
const intptr_t fineCode0c = detail::
|
|
242
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
243
|
+
fine0);
|
|
244
|
+
const intptr_t fineCode0d = detail::
|
|
245
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
246
|
+
fine0);
|
|
247
|
+
|
|
248
|
+
// Each source address is
// table + ((index * FINE_TABLE_BYTES + code) * FINE_SIZE) + fineCentroidOffset,
// where FINE_TABLE_BYTES (1 << FINE_BITS) acts as the entry count per index.
const auto storeValue = elementaryBlock2x4b(
|
|
249
|
+
pqFineCentroids0 +
|
|
250
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
251
|
+
fineCode0a) *
|
|
252
|
+
FINE_SIZE +
|
|
253
|
+
fineCentroidOffset,
|
|
254
|
+
pqFineCentroids0 +
|
|
255
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
256
|
+
fineCode0b) *
|
|
257
|
+
FINE_SIZE +
|
|
258
|
+
fineCentroidOffset,
|
|
259
|
+
pqFineCentroids0 +
|
|
260
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
261
|
+
fineCode0c) *
|
|
262
|
+
FINE_SIZE +
|
|
263
|
+
fineCentroidOffset,
|
|
264
|
+
pqFineCentroids0 +
|
|
265
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
266
|
+
fineCode0d) *
|
|
267
|
+
FINE_SIZE +
|
|
268
|
+
fineCentroidOffset);
|
|
269
|
+
|
|
270
|
+
vst1q_f32(outputStore + CPOS, storeValue.val[0]);
|
|
271
|
+
vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]);
|
|
272
|
+
|
|
273
|
+
// Continue with the next 8 output floats (template instantiation for CPOS + 8).
|
|
274
|
+
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
|
|
275
|
+
pqFineCentroids0, code0, outputStore);
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
// Accumulates one weighted code into 8 output floats:
//   outputAccum[CPOS, CPOS + 8) += weight0 * decoded values.
//   pqFineCentroids0: table the 2-float centroids are gathered from.
//   code0: encoded input; four fine codes are read from it through
//     detail::UintReader at indices fineCentroidIdx + 0 .. + 3.
// After the stores, control passes to the CPOS + 8 instantiation.
|
|
279
|
+
static void accum(
|
|
280
|
+
const float* const __restrict pqFineCentroids0,
|
|
281
|
+
const uint8_t* const __restrict code0,
|
|
282
|
+
const float weight0,
|
|
283
|
+
float* const __restrict outputAccum) {
|
|
284
|
+
// fine quantizer
|
|
285
|
+
const uint8_t* const __restrict fine0 = code0;
|
|
286
|
+
|
|
287
|
+
// Each fine centroid holds FINE_SIZE (2) floats,
|
|
288
|
+
// so four of them are gathered to fill 8 floats per step.
|
|
289
|
+
|
|
290
|
+
const intptr_t fineCode0a = detail::
|
|
291
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
292
|
+
fine0);
|
|
293
|
+
const intptr_t fineCode0b = detail::
|
|
294
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
295
|
+
fine0);
|
|
296
|
+
const intptr_t fineCode0c = detail::
|
|
297
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
298
|
+
fine0);
|
|
299
|
+
const intptr_t fineCode0d = detail::
|
|
300
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
301
|
+
fine0);
|
|
302
|
+
|
|
303
|
+
// Current accumulator contents for the 8 floats handled in this step.
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
304
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
305
|
+
|
|
306
|
+
// Each source address is
// table + ((index * FINE_TABLE_BYTES + code) * FINE_SIZE) + fineCentroidOffset.
auto existingValue = elementaryBlock2x4bAccum(
|
|
307
|
+
pqFineCentroids0 +
|
|
308
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
309
|
+
fineCode0a) *
|
|
310
|
+
FINE_SIZE +
|
|
311
|
+
fineCentroidOffset,
|
|
312
|
+
pqFineCentroids0 +
|
|
313
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
314
|
+
fineCode0b) *
|
|
315
|
+
FINE_SIZE +
|
|
316
|
+
fineCentroidOffset,
|
|
317
|
+
pqFineCentroids0 +
|
|
318
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
319
|
+
fineCode0c) *
|
|
320
|
+
FINE_SIZE +
|
|
321
|
+
fineCentroidOffset,
|
|
322
|
+
pqFineCentroids0 +
|
|
323
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
324
|
+
fineCode0d) *
|
|
325
|
+
FINE_SIZE +
|
|
326
|
+
fineCentroidOffset,
|
|
327
|
+
weight0,
|
|
328
|
+
{existingValue0, existingValue1});
|
|
329
|
+
|
|
330
|
+
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
331
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
332
|
+
|
|
333
|
+
// Continue with the next 8 output floats (template instantiation for CPOS + 8).
|
|
334
|
+
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
|
335
|
+
pqFineCentroids0, code0, weight0, outputAccum);
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
// Accumulates two weighted codes into the same 8 output floats.
|
|
339
|
+
// Each code uses its own fine pq centroids table
// (pqFineCentroids0 for code0, pqFineCentroids1 for code1).
// outputAccum[CPOS, CPOS + 8) is loaded once, updated with both
// contributions in order (code0, then code1), and stored once.
|
|
340
|
+
static void accum(
|
|
341
|
+
const float* const __restrict pqFineCentroids0,
|
|
342
|
+
const uint8_t* const __restrict code0,
|
|
343
|
+
const float weight0,
|
|
344
|
+
const float* const __restrict pqFineCentroids1,
|
|
345
|
+
const uint8_t* const __restrict code1,
|
|
346
|
+
const float weight1,
|
|
347
|
+
float* const __restrict outputAccum) {
|
|
348
|
+
// fine quantizer
|
|
349
|
+
const uint8_t* const __restrict fine0 = code0;
|
|
350
|
+
const uint8_t* const __restrict fine1 = code1;
|
|
351
|
+
|
|
352
|
+
// Each fine centroid holds FINE_SIZE (2) floats,
|
|
353
|
+
// so four of them are gathered per code to fill 8 floats per step.
|
|
354
|
+
|
|
355
|
+
const intptr_t fineCode0a = detail::
|
|
356
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
357
|
+
fine0);
|
|
358
|
+
const intptr_t fineCode0b = detail::
|
|
359
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
360
|
+
fine0);
|
|
361
|
+
const intptr_t fineCode0c = detail::
|
|
362
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
363
|
+
fine0);
|
|
364
|
+
const intptr_t fineCode0d = detail::
|
|
365
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
366
|
+
fine0);
|
|
367
|
+
const intptr_t fineCode1a = detail::
|
|
368
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
369
|
+
fine1);
|
|
370
|
+
const intptr_t fineCode1b = detail::
|
|
371
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
372
|
+
fine1);
|
|
373
|
+
const intptr_t fineCode1c = detail::
|
|
374
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
375
|
+
fine1);
|
|
376
|
+
const intptr_t fineCode1d = detail::
|
|
377
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
378
|
+
fine1);
|
|
379
|
+
|
|
380
|
+
// Current accumulator contents for the 8 floats handled in this step.
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
381
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
382
|
+
|
|
383
|
+
// Contribution of code0, gathered from pqFineCentroids0.
auto existingValue = elementaryBlock2x4bAccum(
|
|
384
|
+
pqFineCentroids0 +
|
|
385
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
386
|
+
fineCode0a) *
|
|
387
|
+
FINE_SIZE +
|
|
388
|
+
fineCentroidOffset,
|
|
389
|
+
pqFineCentroids0 +
|
|
390
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
391
|
+
fineCode0b) *
|
|
392
|
+
FINE_SIZE +
|
|
393
|
+
fineCentroidOffset,
|
|
394
|
+
pqFineCentroids0 +
|
|
395
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
396
|
+
fineCode0c) *
|
|
397
|
+
FINE_SIZE +
|
|
398
|
+
fineCentroidOffset,
|
|
399
|
+
pqFineCentroids0 +
|
|
400
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
401
|
+
fineCode0d) *
|
|
402
|
+
FINE_SIZE +
|
|
403
|
+
fineCentroidOffset,
|
|
404
|
+
weight0,
|
|
405
|
+
{existingValue0, existingValue1});
|
|
406
|
+
|
|
407
|
+
// Contribution of code1, gathered from pqFineCentroids1.
existingValue = elementaryBlock2x4bAccum(
|
|
408
|
+
pqFineCentroids1 +
|
|
409
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
410
|
+
fineCode1a) *
|
|
411
|
+
FINE_SIZE +
|
|
412
|
+
fineCentroidOffset,
|
|
413
|
+
pqFineCentroids1 +
|
|
414
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
415
|
+
fineCode1b) *
|
|
416
|
+
FINE_SIZE +
|
|
417
|
+
fineCentroidOffset,
|
|
418
|
+
pqFineCentroids1 +
|
|
419
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
420
|
+
fineCode1c) *
|
|
421
|
+
FINE_SIZE +
|
|
422
|
+
fineCentroidOffset,
|
|
423
|
+
pqFineCentroids1 +
|
|
424
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
425
|
+
fineCode1d) *
|
|
426
|
+
FINE_SIZE +
|
|
427
|
+
fineCentroidOffset,
|
|
428
|
+
weight1,
|
|
429
|
+
existingValue);
|
|
430
|
+
|
|
431
|
+
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
432
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
433
|
+
|
|
434
|
+
// Continue with the next 8 output floats (template instantiation for CPOS + 8).
|
|
435
|
+
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
|
436
|
+
pqFineCentroids0,
|
|
437
|
+
code0,
|
|
438
|
+
weight0,
|
|
439
|
+
pqFineCentroids1,
|
|
440
|
+
code1,
|
|
441
|
+
weight1,
|
|
442
|
+
outputAccum);
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// Accumulates two weighted codes into the same 8 output floats.
|
|
446
|
+
// Fine pq centroids table is shared among codes: both code0 and code1
// are looked up in pqFineCentroids.
// outputAccum[CPOS, CPOS + 8) is loaded once, updated with both
// contributions in order (code0, then code1), and stored once.
|
|
447
|
+
static void accum(
|
|
448
|
+
const float* const __restrict pqFineCentroids,
|
|
449
|
+
const uint8_t* const __restrict code0,
|
|
450
|
+
const float weight0,
|
|
451
|
+
const uint8_t* const __restrict code1,
|
|
452
|
+
const float weight1,
|
|
453
|
+
float* const __restrict outputAccum) {
|
|
454
|
+
// fine quantizer
|
|
455
|
+
const uint8_t* const __restrict fine0 = code0;
|
|
456
|
+
const uint8_t* const __restrict fine1 = code1;
|
|
457
|
+
|
|
458
|
+
// Each fine centroid holds FINE_SIZE (2) floats,
|
|
459
|
+
// so four of them are gathered per code to fill 8 floats per step.
|
|
460
|
+
|
|
461
|
+
const intptr_t fineCode0a = detail::
|
|
462
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
463
|
+
fine0);
|
|
464
|
+
const intptr_t fineCode0b = detail::
|
|
465
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
466
|
+
fine0);
|
|
467
|
+
const intptr_t fineCode0c = detail::
|
|
468
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
469
|
+
fine0);
|
|
470
|
+
const intptr_t fineCode0d = detail::
|
|
471
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
472
|
+
fine0);
|
|
473
|
+
const intptr_t fineCode1a = detail::
|
|
474
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
475
|
+
fine1);
|
|
476
|
+
const intptr_t fineCode1b = detail::
|
|
477
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
478
|
+
fine1);
|
|
479
|
+
const intptr_t fineCode1c = detail::
|
|
480
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
481
|
+
fine1);
|
|
482
|
+
const intptr_t fineCode1d = detail::
|
|
483
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
484
|
+
fine1);
|
|
485
|
+
|
|
486
|
+
// Current accumulator contents for the 8 floats handled in this step.
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
487
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
488
|
+
|
|
489
|
+
// Contribution of code0.
auto existingValue = elementaryBlock2x4bAccum(
|
|
490
|
+
pqFineCentroids +
|
|
491
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
492
|
+
fineCode0a) *
|
|
493
|
+
FINE_SIZE +
|
|
494
|
+
fineCentroidOffset,
|
|
495
|
+
pqFineCentroids +
|
|
496
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
497
|
+
fineCode0b) *
|
|
498
|
+
FINE_SIZE +
|
|
499
|
+
fineCentroidOffset,
|
|
500
|
+
pqFineCentroids +
|
|
501
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
502
|
+
fineCode0c) *
|
|
503
|
+
FINE_SIZE +
|
|
504
|
+
fineCentroidOffset,
|
|
505
|
+
pqFineCentroids +
|
|
506
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
507
|
+
fineCode0d) *
|
|
508
|
+
FINE_SIZE +
|
|
509
|
+
fineCentroidOffset,
|
|
510
|
+
weight0,
|
|
511
|
+
{existingValue0, existingValue1});
|
|
512
|
+
|
|
513
|
+
// Contribution of code1, looked up in the same table.
existingValue = elementaryBlock2x4bAccum(
|
|
514
|
+
pqFineCentroids +
|
|
515
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
516
|
+
fineCode1a) *
|
|
517
|
+
FINE_SIZE +
|
|
518
|
+
fineCentroidOffset,
|
|
519
|
+
pqFineCentroids +
|
|
520
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
521
|
+
fineCode1b) *
|
|
522
|
+
FINE_SIZE +
|
|
523
|
+
fineCentroidOffset,
|
|
524
|
+
pqFineCentroids +
|
|
525
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
526
|
+
fineCode1c) *
|
|
527
|
+
FINE_SIZE +
|
|
528
|
+
fineCentroidOffset,
|
|
529
|
+
pqFineCentroids +
|
|
530
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
531
|
+
fineCode1d) *
|
|
532
|
+
FINE_SIZE +
|
|
533
|
+
fineCentroidOffset,
|
|
534
|
+
weight1,
|
|
535
|
+
existingValue);
|
|
536
|
+
|
|
537
|
+
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
538
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
539
|
+
|
|
540
|
+
// Continue with the next 8 output floats (template instantiation for CPOS + 8).
|
|
541
|
+
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
|
542
|
+
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
// Accumulates three weighted codes into the same 8 output floats.
|
|
546
|
+
// Each code uses its own fine pq centroids table
// (pqFineCentroids0/1/2 for code0/1/2 respectively).
// outputAccum[CPOS, CPOS + 8) is loaded once, updated with the three
// contributions in order (code0, code1, code2), and stored once.
|
|
547
|
+
static void accum(
|
|
548
|
+
const float* const __restrict pqFineCentroids0,
|
|
549
|
+
const uint8_t* const __restrict code0,
|
|
550
|
+
const float weight0,
|
|
551
|
+
const float* const __restrict pqFineCentroids1,
|
|
552
|
+
const uint8_t* const __restrict code1,
|
|
553
|
+
const float weight1,
|
|
554
|
+
const float* const __restrict pqFineCentroids2,
|
|
555
|
+
const uint8_t* const __restrict code2,
|
|
556
|
+
const float weight2,
|
|
557
|
+
float* const __restrict outputAccum) {
|
|
558
|
+
// fine quantizer
|
|
559
|
+
const uint8_t* const __restrict fine0 = code0;
|
|
560
|
+
const uint8_t* const __restrict fine1 = code1;
|
|
561
|
+
const uint8_t* const __restrict fine2 = code2;
|
|
562
|
+
|
|
563
|
+
// Each fine centroid holds FINE_SIZE (2) floats,
|
|
564
|
+
// so four of them are gathered per code to fill 8 floats per step.
|
|
565
|
+
|
|
566
|
+
const intptr_t fineCode0a = detail::
|
|
567
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
568
|
+
fine0);
|
|
569
|
+
const intptr_t fineCode0b = detail::
|
|
570
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
571
|
+
fine0);
|
|
572
|
+
const intptr_t fineCode0c = detail::
|
|
573
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
574
|
+
fine0);
|
|
575
|
+
const intptr_t fineCode0d = detail::
|
|
576
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
577
|
+
fine0);
|
|
578
|
+
const intptr_t fineCode1a = detail::
|
|
579
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
580
|
+
fine1);
|
|
581
|
+
const intptr_t fineCode1b = detail::
|
|
582
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
583
|
+
fine1);
|
|
584
|
+
const intptr_t fineCode1c = detail::
|
|
585
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
586
|
+
fine1);
|
|
587
|
+
const intptr_t fineCode1d = detail::
|
|
588
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
589
|
+
fine1);
|
|
590
|
+
const intptr_t fineCode2a = detail::
|
|
591
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
592
|
+
fine2);
|
|
593
|
+
const intptr_t fineCode2b = detail::
|
|
594
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
595
|
+
fine2);
|
|
596
|
+
const intptr_t fineCode2c = detail::
|
|
597
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
598
|
+
fine2);
|
|
599
|
+
const intptr_t fineCode2d = detail::
|
|
600
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
601
|
+
fine2);
|
|
602
|
+
|
|
603
|
+
// Current accumulator contents for the 8 floats handled in this step.
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
604
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
605
|
+
|
|
606
|
+
// Contribution of code0, gathered from pqFineCentroids0.
auto existingValue = elementaryBlock2x4bAccum(
|
|
607
|
+
pqFineCentroids0 +
|
|
608
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
609
|
+
fineCode0a) *
|
|
610
|
+
FINE_SIZE +
|
|
611
|
+
fineCentroidOffset,
|
|
612
|
+
pqFineCentroids0 +
|
|
613
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
614
|
+
fineCode0b) *
|
|
615
|
+
FINE_SIZE +
|
|
616
|
+
fineCentroidOffset,
|
|
617
|
+
pqFineCentroids0 +
|
|
618
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
619
|
+
fineCode0c) *
|
|
620
|
+
FINE_SIZE +
|
|
621
|
+
fineCentroidOffset,
|
|
622
|
+
pqFineCentroids0 +
|
|
623
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
624
|
+
fineCode0d) *
|
|
625
|
+
FINE_SIZE +
|
|
626
|
+
fineCentroidOffset,
|
|
627
|
+
weight0,
|
|
628
|
+
{existingValue0, existingValue1});
|
|
629
|
+
|
|
630
|
+
// Contribution of code1, gathered from pqFineCentroids1.
existingValue = elementaryBlock2x4bAccum(
|
|
631
|
+
pqFineCentroids1 +
|
|
632
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
633
|
+
fineCode1a) *
|
|
634
|
+
FINE_SIZE +
|
|
635
|
+
fineCentroidOffset,
|
|
636
|
+
pqFineCentroids1 +
|
|
637
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
638
|
+
fineCode1b) *
|
|
639
|
+
FINE_SIZE +
|
|
640
|
+
fineCentroidOffset,
|
|
641
|
+
pqFineCentroids1 +
|
|
642
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
643
|
+
fineCode1c) *
|
|
644
|
+
FINE_SIZE +
|
|
645
|
+
fineCentroidOffset,
|
|
646
|
+
pqFineCentroids1 +
|
|
647
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
648
|
+
fineCode1d) *
|
|
649
|
+
FINE_SIZE +
|
|
650
|
+
fineCentroidOffset,
|
|
651
|
+
weight1,
|
|
652
|
+
existingValue);
|
|
653
|
+
|
|
654
|
+
// Contribution of code2, gathered from pqFineCentroids2.
existingValue = elementaryBlock2x4bAccum(
|
|
655
|
+
pqFineCentroids2 +
|
|
656
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
657
|
+
fineCode2a) *
|
|
658
|
+
FINE_SIZE +
|
|
659
|
+
fineCentroidOffset,
|
|
660
|
+
pqFineCentroids2 +
|
|
661
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
662
|
+
fineCode2b) *
|
|
663
|
+
FINE_SIZE +
|
|
664
|
+
fineCentroidOffset,
|
|
665
|
+
pqFineCentroids2 +
|
|
666
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
667
|
+
fineCode2c) *
|
|
668
|
+
FINE_SIZE +
|
|
669
|
+
fineCentroidOffset,
|
|
670
|
+
pqFineCentroids2 +
|
|
671
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
672
|
+
fineCode2d) *
|
|
673
|
+
FINE_SIZE +
|
|
674
|
+
fineCentroidOffset,
|
|
675
|
+
weight2,
|
|
676
|
+
existingValue);
|
|
677
|
+
|
|
678
|
+
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
679
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
680
|
+
|
|
681
|
+
// Continue with the next 8 output floats (template instantiation for CPOS + 8).
|
|
682
|
+
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
|
683
|
+
pqFineCentroids0,
|
|
684
|
+
code0,
|
|
685
|
+
weight0,
|
|
686
|
+
pqFineCentroids1,
|
|
687
|
+
code1,
|
|
688
|
+
weight1,
|
|
689
|
+
pqFineCentroids2,
|
|
690
|
+
code2,
|
|
691
|
+
weight2,
|
|
692
|
+
outputAccum);
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
// Accumulates three weighted codes into the same 8 output floats.
|
|
696
|
+
// Fine pq centroids table is shared among codes: code0, code1 and code2
// are all looked up in pqFineCentroids.
// outputAccum[CPOS, CPOS + 8) is loaded once, updated with the three
// contributions in order (code0, code1, code2), and stored once.
|
|
697
|
+
static void accum(
|
|
698
|
+
const float* const __restrict pqFineCentroids,
|
|
699
|
+
const uint8_t* const __restrict code0,
|
|
700
|
+
const float weight0,
|
|
701
|
+
const uint8_t* const __restrict code1,
|
|
702
|
+
const float weight1,
|
|
703
|
+
const uint8_t* const __restrict code2,
|
|
704
|
+
const float weight2,
|
|
705
|
+
float* const __restrict outputAccum) {
|
|
706
|
+
// fine quantizer
|
|
707
|
+
const uint8_t* const __restrict fine0 = code0;
|
|
708
|
+
const uint8_t* const __restrict fine1 = code1;
|
|
709
|
+
const uint8_t* const __restrict fine2 = code2;
|
|
710
|
+
|
|
711
|
+
// Each fine centroid holds FINE_SIZE (2) floats,
|
|
712
|
+
// so four of them are gathered per code to fill 8 floats per step.
|
|
713
|
+
|
|
714
|
+
const intptr_t fineCode0a = detail::
|
|
715
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
716
|
+
fine0);
|
|
717
|
+
const intptr_t fineCode0b = detail::
|
|
718
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
719
|
+
fine0);
|
|
720
|
+
const intptr_t fineCode0c = detail::
|
|
721
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
722
|
+
fine0);
|
|
723
|
+
const intptr_t fineCode0d = detail::
|
|
724
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
725
|
+
fine0);
|
|
726
|
+
const intptr_t fineCode1a = detail::
|
|
727
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
728
|
+
fine1);
|
|
729
|
+
const intptr_t fineCode1b = detail::
|
|
730
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
731
|
+
fine1);
|
|
732
|
+
const intptr_t fineCode1c = detail::
|
|
733
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
734
|
+
fine1);
|
|
735
|
+
const intptr_t fineCode1d = detail::
|
|
736
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
737
|
+
fine1);
|
|
738
|
+
const intptr_t fineCode2a = detail::
|
|
739
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
740
|
+
fine2);
|
|
741
|
+
const intptr_t fineCode2b = detail::
|
|
742
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
743
|
+
fine2);
|
|
744
|
+
const intptr_t fineCode2c = detail::
|
|
745
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
746
|
+
fine2);
|
|
747
|
+
const intptr_t fineCode2d = detail::
|
|
748
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
749
|
+
fine2);
|
|
750
|
+
|
|
751
|
+
// Current accumulator contents for the 8 floats handled in this step.
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
752
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
753
|
+
|
|
754
|
+
// Contribution of code0.
auto existingValue = elementaryBlock2x4bAccum(
|
|
755
|
+
pqFineCentroids +
|
|
756
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
757
|
+
fineCode0a) *
|
|
758
|
+
FINE_SIZE +
|
|
759
|
+
fineCentroidOffset,
|
|
760
|
+
pqFineCentroids +
|
|
761
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
762
|
+
fineCode0b) *
|
|
763
|
+
FINE_SIZE +
|
|
764
|
+
fineCentroidOffset,
|
|
765
|
+
pqFineCentroids +
|
|
766
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
767
|
+
fineCode0c) *
|
|
768
|
+
FINE_SIZE +
|
|
769
|
+
fineCentroidOffset,
|
|
770
|
+
pqFineCentroids +
|
|
771
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
772
|
+
fineCode0d) *
|
|
773
|
+
FINE_SIZE +
|
|
774
|
+
fineCentroidOffset,
|
|
775
|
+
weight0,
|
|
776
|
+
{existingValue0, existingValue1});
|
|
777
|
+
|
|
778
|
+
// Contribution of code1, looked up in the same table.
existingValue = elementaryBlock2x4bAccum(
|
|
779
|
+
pqFineCentroids +
|
|
780
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
781
|
+
fineCode1a) *
|
|
782
|
+
FINE_SIZE +
|
|
783
|
+
fineCentroidOffset,
|
|
784
|
+
pqFineCentroids +
|
|
785
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
786
|
+
fineCode1b) *
|
|
787
|
+
FINE_SIZE +
|
|
788
|
+
fineCentroidOffset,
|
|
789
|
+
pqFineCentroids +
|
|
790
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
791
|
+
fineCode1c) *
|
|
792
|
+
FINE_SIZE +
|
|
793
|
+
fineCentroidOffset,
|
|
794
|
+
pqFineCentroids +
|
|
795
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
796
|
+
fineCode1d) *
|
|
797
|
+
FINE_SIZE +
|
|
798
|
+
fineCentroidOffset,
|
|
799
|
+
weight1,
|
|
800
|
+
existingValue);
|
|
801
|
+
|
|
802
|
+
// Contribution of code2, looked up in the same table.
existingValue = elementaryBlock2x4bAccum(
|
|
803
|
+
pqFineCentroids +
|
|
804
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
805
|
+
fineCode2a) *
|
|
806
|
+
FINE_SIZE +
|
|
807
|
+
fineCentroidOffset,
|
|
808
|
+
pqFineCentroids +
|
|
809
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
810
|
+
fineCode2b) *
|
|
811
|
+
FINE_SIZE +
|
|
812
|
+
fineCentroidOffset,
|
|
813
|
+
pqFineCentroids +
|
|
814
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
815
|
+
fineCode2c) *
|
|
816
|
+
FINE_SIZE +
|
|
817
|
+
fineCentroidOffset,
|
|
818
|
+
pqFineCentroids +
|
|
819
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
820
|
+
fineCode2d) *
|
|
821
|
+
FINE_SIZE +
|
|
822
|
+
fineCentroidOffset,
|
|
823
|
+
weight2,
|
|
824
|
+
existingValue);
|
|
825
|
+
|
|
826
|
+
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
827
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
828
|
+
|
|
829
|
+
// Continue with the next 8 output floats (template instantiation for CPOS + 8).
|
|
830
|
+
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
|
831
|
+
pqFineCentroids,
|
|
832
|
+
code0,
|
|
833
|
+
weight0,
|
|
834
|
+
code1,
|
|
835
|
+
weight1,
|
|
836
|
+
code2,
|
|
837
|
+
weight2,
|
|
838
|
+
outputAccum);
|
|
839
|
+
}
|
|
840
|
+
};
|
|
154
841
|
|
|
155
842
|
template <
|
|
156
843
|
intptr_t DIM,
|
|
@@ -163,6 +850,7 @@ struct IndexPQDecoderImpl<
|
|
|
163
850
|
4,
|
|
164
851
|
FINE_BITS,
|
|
165
852
|
CPOS,
|
|
853
|
+
false,
|
|
166
854
|
true,
|
|
167
855
|
QPOS_LEFT_GE_8,
|
|
168
856
|
QPOS_LEFT_GE_4,
|
|
@@ -609,6 +1297,7 @@ struct IndexPQDecoderImpl<
|
|
|
609
1297
|
FINE_BITS,
|
|
610
1298
|
CPOS,
|
|
611
1299
|
false,
|
|
1300
|
+
false,
|
|
612
1301
|
true,
|
|
613
1302
|
true,
|
|
614
1303
|
false> {
|
|
@@ -938,6 +1627,7 @@ struct IndexPQDecoderImpl<
|
|
|
938
1627
|
CPOS,
|
|
939
1628
|
false,
|
|
940
1629
|
false,
|
|
1630
|
+
false,
|
|
941
1631
|
true,
|
|
942
1632
|
false> {
|
|
943
1633
|
static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
|
|
@@ -1252,6 +1942,7 @@ template <
|
|
|
1252
1942
|
intptr_t DIM,
|
|
1253
1943
|
intptr_t FINE_SIZE,
|
|
1254
1944
|
intptr_t FINE_BITS,
|
|
1945
|
+
bool FINE_SIZE_EQ_2,
|
|
1255
1946
|
bool FINE_SIZE_EQ_4,
|
|
1256
1947
|
bool QPOS_LEFT_GE_8,
|
|
1257
1948
|
bool QPOS_LEFT_GE_4>
|
|
@@ -1260,6 +1951,7 @@ struct IndexPQDecoderImpl<
|
|
|
1260
1951
|
FINE_SIZE,
|
|
1261
1952
|
FINE_BITS,
|
|
1262
1953
|
DIM,
|
|
1954
|
+
FINE_SIZE_EQ_2,
|
|
1263
1955
|
FINE_SIZE_EQ_4,
|
|
1264
1956
|
QPOS_LEFT_GE_8,
|
|
1265
1957
|
QPOS_LEFT_GE_4,
|