faiss 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/Index.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
- data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
- data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
- data/vendor/faiss/faiss/factory_tools.cpp +4 -0
- data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
- data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
- data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
- data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
- data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
- data/vendor/faiss/faiss/impl/HNSW.h +51 -13
- data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
- data/vendor/faiss/faiss/impl/Panorama.h +11 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
- data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
- data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
- data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
- data/vendor/faiss/faiss/impl/io_macros.h +25 -0
- data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
- data/vendor/faiss/faiss/index_factory.cpp +5 -1
- data/vendor/faiss/faiss/index_io.h +16 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
- data/vendor/faiss/faiss/utils/bf16.h +34 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
- data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
- metadata +12 -2
|
@@ -142,6 +142,57 @@ inline float32x4x2_t elementaryBlock8x1bAccum(
|
|
|
142
142
|
return {result0, result1};
|
|
143
143
|
}
|
|
144
144
|
|
|
145
|
+
// Decodes 8 output floats from one coarse slice and four 2-float fine slices.
// Lane layout of the result:
//   val[0] = {coarse[0] + fine0[0], coarse[1] + fine0[1],
//             coarse[2] + fine1[0], coarse[3] + fine1[1]}
//   val[1] = {coarse[4] + fine2[0], coarse[5] + fine2[1],
//             coarse[6] + fine3[0], coarse[7] + fine3[1]}
// Reads coarse[0..7] and the first two floats behind each fine pointer.
inline float32x4x2_t elementaryBlock2x4b(
        const float* const __restrict coarse,
        const float* const __restrict fine0,
        const float* const __restrict fine1,
        const float* const __restrict fine2,
        const float* const __restrict fine3) {
    // Pack the four 2-lane fine loads into two 4-lane vectors:
    // (fine0 | fine1) and (fine2 | fine3).
    const float32x4_t fineLow =
            vcombine_f32(vld1_f32(fine0), vld1_f32(fine1));
    const float32x4_t fineHigh =
            vcombine_f32(vld1_f32(fine2), vld1_f32(fine3));

    // Add the matching halves of the 8-float coarse slice.
    float32x4x2_t decoded;
    decoded.val[0] = vaddq_f32(fineLow, vld1q_f32(coarse));
    decoded.val[1] = vaddq_f32(fineHigh, vld1q_f32(coarse + 4));
    return decoded;
}
|
|
170
|
+
|
|
171
|
+
// Weighted accumulation of 8 decoded floats onto an existing pair of vectors.
// With d = elementaryBlock2x4b(coarse, fine0, fine1, fine2, fine3):
//   val[0] = existingValue.val[0] + weight * d.val[0]
//   val[1] = existingValue.val[1] + weight * d.val[1]
// where
//   d.val[0] = {coarse[0..1] + fine0[0..1], coarse[2..3] + fine1[0..1]}
//   d.val[1] = {coarse[4..5] + fine2[0..1], coarse[6..7] + fine3[0..1]}
inline float32x4x2_t elementaryBlock2x4bAccum(
        const float* const __restrict coarse,
        const float* const __restrict fine0,
        const float* const __restrict fine1,
        const float* const __restrict fine2,
        const float* const __restrict fine3,
        const float weight,
        const float32x4x2_t existingValue) {
    // Decode first, then fold into the accumulator with a fused
    // multiply-add per 4-lane half.
    const float32x4x2_t decoded =
            elementaryBlock2x4b(coarse, fine0, fine1, fine2, fine3);

    // Broadcast the scalar weight to all four lanes.
    const float32x4_t weightLanes = vdupq_n_f32(weight);

    float32x4x2_t accumulated;
    accumulated.val[0] =
            vfmaq_f32(existingValue.val[0], weightLanes, decoded.val[0]);
    accumulated.val[1] =
            vfmaq_f32(existingValue.val[1], weightLanes, decoded.val[1]);
    return accumulated;
}
|
|
195
|
+
|
|
145
196
|
// The following code uses template-based for-loop unrolling,
|
|
146
197
|
// because the compiler does not do that on its own as needed.
|
|
147
198
|
// The idea is the following:
|
|
@@ -161,18 +212,852 @@ inline float32x4x2_t elementaryBlock8x1bAccum(
|
|
|
161
212
|
// Initiate the loop:
|
|
162
213
|
// Foo<0, MAX>::bar();
|
|
163
214
|
|
|
164
|
-
template <
|
|
165
|
-
intptr_t DIM,
|
|
166
|
-
intptr_t COARSE_SIZE,
|
|
167
|
-
intptr_t FINE_SIZE,
|
|
168
|
-
intptr_t COARSE_BITS,
|
|
169
|
-
intptr_t FINE_BITS,
|
|
170
|
-
intptr_t CPOS,
|
|
171
|
-
bool
|
|
172
|
-
bool
|
|
173
|
-
bool
|
|
174
|
-
bool
|
|
175
|
-
|
|
215
|
+
// Primary template of the compile-time unrolled 2-level decoder; it is only
// declared here, the behavior lives in partial specializations.
//   DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS - numeric parameters
//     forwarded unchanged through the recursive instantiations.
//   CPOS - current position; the FINE_SIZE == 2 specialization that follows
//     this declaration recurses with CPOS + 8.
// The five trailing bool parameters are defaulted from the numeric ones, so
// callers pass only the first six arguments and the matching partial
// specialization is selected from the computed flags:
//   FINE_SIZE_EQ_2 / FINE_SIZE_EQ_4 - FINE_SIZE is exactly 2 / 4.
//   QPOS_LEFT_GE_8 / QPOS_LEFT_GE_4 - FINE_SIZE - CPOS % FINE_SIZE is at
//     least 8 / 4 (presumably the floats left in the current fine
//     sub-vector - confirm against the other specializations).
//   DIM_EQ_CPOS - CPOS has reached DIM (presumably the recursion terminator;
//     the terminating specialization is not visible in this excerpt).
template <
|
|
216
|
+
intptr_t DIM,
|
|
217
|
+
intptr_t COARSE_SIZE,
|
|
218
|
+
intptr_t FINE_SIZE,
|
|
219
|
+
intptr_t COARSE_BITS,
|
|
220
|
+
intptr_t FINE_BITS,
|
|
221
|
+
intptr_t CPOS,
|
|
222
|
+
bool FINE_SIZE_EQ_2 = FINE_SIZE == 2,
|
|
223
|
+
bool FINE_SIZE_EQ_4 = FINE_SIZE == 4,
|
|
224
|
+
bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8),
|
|
225
|
+
bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4),
|
|
226
|
+
bool DIM_EQ_CPOS = DIM == CPOS>
|
|
227
|
+
struct Index2LevelDecoderImpl;
|
|
228
|
+
|
|
229
|
+
template <
|
|
230
|
+
intptr_t DIM,
|
|
231
|
+
intptr_t COARSE_SIZE,
|
|
232
|
+
intptr_t COARSE_BITS,
|
|
233
|
+
intptr_t FINE_BITS,
|
|
234
|
+
intptr_t CPOS,
|
|
235
|
+
bool QPOS_LEFT_GE_8,
|
|
236
|
+
bool QPOS_LEFT_GE_4>
|
|
237
|
+
struct Index2LevelDecoderImpl<
|
|
238
|
+
DIM,
|
|
239
|
+
COARSE_SIZE,
|
|
240
|
+
2,
|
|
241
|
+
COARSE_BITS,
|
|
242
|
+
FINE_BITS,
|
|
243
|
+
CPOS,
|
|
244
|
+
true,
|
|
245
|
+
false,
|
|
246
|
+
QPOS_LEFT_GE_8,
|
|
247
|
+
QPOS_LEFT_GE_4,
|
|
248
|
+
false> {
|
|
249
|
+
static constexpr intptr_t FINE_SIZE = 2;
|
|
250
|
+
|
|
251
|
+
static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
|
|
252
|
+
static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
|
|
253
|
+
static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
|
|
254
|
+
static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
|
|
255
|
+
|
|
256
|
+
static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
|
|
257
|
+
|
|
258
|
+
// coarse quantizer storage
|
|
259
|
+
static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
|
|
260
|
+
|
|
261
|
+
// coarse quantizer bytes start from 0
|
|
262
|
+
// fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
|
|
263
|
+
static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
|
|
264
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
|
|
265
|
+
N_COARSE_ELEMENTS * COARSE_BITS;
|
|
266
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
|
|
267
|
+
(N_COARSE_ELEMENTS_BITS + 7) / 8;
|
|
268
|
+
|
|
269
|
+
static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
|
|
270
|
+
|
|
271
|
+
// process 1 sample
|
|
272
|
+
static void store(
|
|
273
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
274
|
+
const float* const __restrict pqFineCentroids0,
|
|
275
|
+
const uint8_t* const __restrict code0,
|
|
276
|
+
float* const __restrict outputStore) {
|
|
277
|
+
// coarse quantizer
|
|
278
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
279
|
+
|
|
280
|
+
// fine quantizer
|
|
281
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
282
|
+
|
|
283
|
+
// process chunks, 2 float
|
|
284
|
+
// but 8 floats per loop
|
|
285
|
+
|
|
286
|
+
const intptr_t coarseCode0 = detail::
|
|
287
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
288
|
+
get(coarse0);
|
|
289
|
+
const intptr_t fineCode0a = detail::
|
|
290
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
291
|
+
fine0);
|
|
292
|
+
const intptr_t fineCode0b = detail::
|
|
293
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
294
|
+
fine0);
|
|
295
|
+
const intptr_t fineCode0c = detail::
|
|
296
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
297
|
+
fine0);
|
|
298
|
+
const intptr_t fineCode0d = detail::
|
|
299
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
300
|
+
fine0);
|
|
301
|
+
|
|
302
|
+
const auto storeValue = elementaryBlock2x4b(
|
|
303
|
+
pqCoarseCentroids0 +
|
|
304
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
|
305
|
+
COARSE_SIZE +
|
|
306
|
+
coarseCentroidOffset,
|
|
307
|
+
pqFineCentroids0 +
|
|
308
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
309
|
+
fineCode0a) *
|
|
310
|
+
FINE_SIZE +
|
|
311
|
+
fineCentroidOffset,
|
|
312
|
+
pqFineCentroids0 +
|
|
313
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
314
|
+
fineCode0b) *
|
|
315
|
+
FINE_SIZE +
|
|
316
|
+
fineCentroidOffset,
|
|
317
|
+
pqFineCentroids0 +
|
|
318
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
319
|
+
fineCode0c) *
|
|
320
|
+
FINE_SIZE +
|
|
321
|
+
fineCentroidOffset,
|
|
322
|
+
pqFineCentroids0 +
|
|
323
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
324
|
+
fineCode0d) *
|
|
325
|
+
FINE_SIZE +
|
|
326
|
+
fineCentroidOffset);
|
|
327
|
+
|
|
328
|
+
vst1q_f32(outputStore + CPOS, storeValue.val[0]);
|
|
329
|
+
vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]);
|
|
330
|
+
|
|
331
|
+
// next
|
|
332
|
+
Index2LevelDecoderImpl<
|
|
333
|
+
DIM,
|
|
334
|
+
COARSE_SIZE,
|
|
335
|
+
FINE_SIZE,
|
|
336
|
+
COARSE_BITS,
|
|
337
|
+
FINE_BITS,
|
|
338
|
+
CPOS + 8>::
|
|
339
|
+
store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore);
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
// process 1 sample
|
|
343
|
+
static void accum(
|
|
344
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
345
|
+
const float* const __restrict pqFineCentroids0,
|
|
346
|
+
const uint8_t* const __restrict code0,
|
|
347
|
+
const float weight0,
|
|
348
|
+
float* const __restrict outputAccum) {
|
|
349
|
+
// coarse quantizer
|
|
350
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
351
|
+
|
|
352
|
+
// fine quantizer
|
|
353
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
354
|
+
|
|
355
|
+
// process chunks, 2 float
|
|
356
|
+
// but 8 floats per loop
|
|
357
|
+
|
|
358
|
+
const intptr_t coarseCode0 = detail::
|
|
359
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
360
|
+
get(coarse0);
|
|
361
|
+
const intptr_t fineCode0a = detail::
|
|
362
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
363
|
+
fine0);
|
|
364
|
+
const intptr_t fineCode0b = detail::
|
|
365
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
366
|
+
fine0);
|
|
367
|
+
const intptr_t fineCode0c = detail::
|
|
368
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
369
|
+
fine0);
|
|
370
|
+
const intptr_t fineCode0d = detail::
|
|
371
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
372
|
+
fine0);
|
|
373
|
+
|
|
374
|
+
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
375
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
376
|
+
|
|
377
|
+
auto existingValue = elementaryBlock2x4bAccum(
|
|
378
|
+
pqCoarseCentroids0 +
|
|
379
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
|
380
|
+
COARSE_SIZE +
|
|
381
|
+
coarseCentroidOffset,
|
|
382
|
+
pqFineCentroids0 +
|
|
383
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
384
|
+
fineCode0a) *
|
|
385
|
+
FINE_SIZE +
|
|
386
|
+
fineCentroidOffset,
|
|
387
|
+
pqFineCentroids0 +
|
|
388
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
389
|
+
fineCode0b) *
|
|
390
|
+
FINE_SIZE +
|
|
391
|
+
fineCentroidOffset,
|
|
392
|
+
pqFineCentroids0 +
|
|
393
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
394
|
+
fineCode0c) *
|
|
395
|
+
FINE_SIZE +
|
|
396
|
+
fineCentroidOffset,
|
|
397
|
+
pqFineCentroids0 +
|
|
398
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
399
|
+
fineCode0d) *
|
|
400
|
+
FINE_SIZE +
|
|
401
|
+
fineCentroidOffset,
|
|
402
|
+
weight0,
|
|
403
|
+
{existingValue0, existingValue1});
|
|
404
|
+
|
|
405
|
+
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
406
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
407
|
+
|
|
408
|
+
// next
|
|
409
|
+
Index2LevelDecoderImpl<
|
|
410
|
+
DIM,
|
|
411
|
+
COARSE_SIZE,
|
|
412
|
+
FINE_SIZE,
|
|
413
|
+
COARSE_BITS,
|
|
414
|
+
FINE_BITS,
|
|
415
|
+
CPOS + 8>::
|
|
416
|
+
accum(pqCoarseCentroids0,
|
|
417
|
+
pqFineCentroids0,
|
|
418
|
+
code0,
|
|
419
|
+
weight0,
|
|
420
|
+
outputAccum);
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
// Process 2 samples.
|
|
424
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
425
|
+
// table.
|
|
426
|
+
static void accum(
|
|
427
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
428
|
+
const float* const __restrict pqFineCentroids0,
|
|
429
|
+
const uint8_t* const __restrict code0,
|
|
430
|
+
const float weight0,
|
|
431
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
432
|
+
const float* const __restrict pqFineCentroids1,
|
|
433
|
+
const uint8_t* const __restrict code1,
|
|
434
|
+
const float weight1,
|
|
435
|
+
float* const __restrict outputAccum) {
|
|
436
|
+
// coarse quantizer
|
|
437
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
438
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
439
|
+
|
|
440
|
+
// fine quantizer
|
|
441
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
442
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
443
|
+
|
|
444
|
+
// process chunks, 2 float
|
|
445
|
+
// but 8 floats per loop
|
|
446
|
+
|
|
447
|
+
const intptr_t coarseCode0 = detail::
|
|
448
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
449
|
+
get(coarse0);
|
|
450
|
+
const intptr_t fineCode0a = detail::
|
|
451
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
452
|
+
fine0);
|
|
453
|
+
const intptr_t fineCode0b = detail::
|
|
454
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
455
|
+
fine0);
|
|
456
|
+
const intptr_t fineCode0c = detail::
|
|
457
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
458
|
+
fine0);
|
|
459
|
+
const intptr_t fineCode0d = detail::
|
|
460
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
461
|
+
fine0);
|
|
462
|
+
const intptr_t coarseCode1 = detail::
|
|
463
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
464
|
+
get(coarse1);
|
|
465
|
+
const intptr_t fineCode1a = detail::
|
|
466
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
467
|
+
fine1);
|
|
468
|
+
const intptr_t fineCode1b = detail::
|
|
469
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
470
|
+
fine1);
|
|
471
|
+
const intptr_t fineCode1c = detail::
|
|
472
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
473
|
+
fine1);
|
|
474
|
+
const intptr_t fineCode1d = detail::
|
|
475
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
476
|
+
fine1);
|
|
477
|
+
|
|
478
|
+
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
479
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
480
|
+
|
|
481
|
+
auto existingValue = elementaryBlock2x4bAccum(
|
|
482
|
+
pqCoarseCentroids0 +
|
|
483
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
|
484
|
+
COARSE_SIZE +
|
|
485
|
+
coarseCentroidOffset,
|
|
486
|
+
pqFineCentroids0 +
|
|
487
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
488
|
+
fineCode0a) *
|
|
489
|
+
FINE_SIZE +
|
|
490
|
+
fineCentroidOffset,
|
|
491
|
+
pqFineCentroids0 +
|
|
492
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
493
|
+
fineCode0b) *
|
|
494
|
+
FINE_SIZE +
|
|
495
|
+
fineCentroidOffset,
|
|
496
|
+
pqFineCentroids0 +
|
|
497
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
498
|
+
fineCode0c) *
|
|
499
|
+
FINE_SIZE +
|
|
500
|
+
fineCentroidOffset,
|
|
501
|
+
pqFineCentroids0 +
|
|
502
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
503
|
+
fineCode0d) *
|
|
504
|
+
FINE_SIZE +
|
|
505
|
+
fineCentroidOffset,
|
|
506
|
+
weight0,
|
|
507
|
+
{existingValue0, existingValue1});
|
|
508
|
+
|
|
509
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
510
|
+
pqCoarseCentroids1 +
|
|
511
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
|
512
|
+
COARSE_SIZE +
|
|
513
|
+
coarseCentroidOffset,
|
|
514
|
+
pqFineCentroids1 +
|
|
515
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
516
|
+
fineCode1a) *
|
|
517
|
+
FINE_SIZE +
|
|
518
|
+
fineCentroidOffset,
|
|
519
|
+
pqFineCentroids1 +
|
|
520
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
521
|
+
fineCode1b) *
|
|
522
|
+
FINE_SIZE +
|
|
523
|
+
fineCentroidOffset,
|
|
524
|
+
pqFineCentroids1 +
|
|
525
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
526
|
+
fineCode1c) *
|
|
527
|
+
FINE_SIZE +
|
|
528
|
+
fineCentroidOffset,
|
|
529
|
+
pqFineCentroids1 +
|
|
530
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
531
|
+
fineCode1d) *
|
|
532
|
+
FINE_SIZE +
|
|
533
|
+
fineCentroidOffset,
|
|
534
|
+
weight1,
|
|
535
|
+
existingValue);
|
|
536
|
+
|
|
537
|
+
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
538
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
539
|
+
|
|
540
|
+
// next
|
|
541
|
+
Index2LevelDecoderImpl<
|
|
542
|
+
DIM,
|
|
543
|
+
COARSE_SIZE,
|
|
544
|
+
FINE_SIZE,
|
|
545
|
+
COARSE_BITS,
|
|
546
|
+
FINE_BITS,
|
|
547
|
+
CPOS + 8>::
|
|
548
|
+
accum(pqCoarseCentroids0,
|
|
549
|
+
pqFineCentroids0,
|
|
550
|
+
code0,
|
|
551
|
+
weight0,
|
|
552
|
+
pqCoarseCentroids1,
|
|
553
|
+
pqFineCentroids1,
|
|
554
|
+
code1,
|
|
555
|
+
weight1,
|
|
556
|
+
outputAccum);
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
// Process 2 samples.
|
|
560
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
561
|
+
// codes.
|
|
562
|
+
static void accum(
|
|
563
|
+
const float* const __restrict pqCoarseCentroids,
|
|
564
|
+
const float* const __restrict pqFineCentroids,
|
|
565
|
+
const uint8_t* const __restrict code0,
|
|
566
|
+
const float weight0,
|
|
567
|
+
const uint8_t* const __restrict code1,
|
|
568
|
+
const float weight1,
|
|
569
|
+
float* const __restrict outputAccum) {
|
|
570
|
+
// coarse quantizer
|
|
571
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
572
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
573
|
+
|
|
574
|
+
// fine quantizer
|
|
575
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
576
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
577
|
+
|
|
578
|
+
// process chunks, 2 float
|
|
579
|
+
// but 8 floats per loop
|
|
580
|
+
|
|
581
|
+
const intptr_t coarseCode0 = detail::
|
|
582
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
583
|
+
get(coarse0);
|
|
584
|
+
const intptr_t fineCode0a = detail::
|
|
585
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
586
|
+
fine0);
|
|
587
|
+
const intptr_t fineCode0b = detail::
|
|
588
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
589
|
+
fine0);
|
|
590
|
+
const intptr_t fineCode0c = detail::
|
|
591
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
592
|
+
fine0);
|
|
593
|
+
const intptr_t fineCode0d = detail::
|
|
594
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
595
|
+
fine0);
|
|
596
|
+
const intptr_t coarseCode1 = detail::
|
|
597
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
598
|
+
get(coarse1);
|
|
599
|
+
const intptr_t fineCode1a = detail::
|
|
600
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
601
|
+
fine1);
|
|
602
|
+
const intptr_t fineCode1b = detail::
|
|
603
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
604
|
+
fine1);
|
|
605
|
+
const intptr_t fineCode1c = detail::
|
|
606
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
607
|
+
fine1);
|
|
608
|
+
const intptr_t fineCode1d = detail::
|
|
609
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
610
|
+
fine1);
|
|
611
|
+
|
|
612
|
+
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
613
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
614
|
+
|
|
615
|
+
auto existingValue = elementaryBlock2x4bAccum(
|
|
616
|
+
pqCoarseCentroids +
|
|
617
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
|
618
|
+
COARSE_SIZE +
|
|
619
|
+
coarseCentroidOffset,
|
|
620
|
+
pqFineCentroids +
|
|
621
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
622
|
+
fineCode0a) *
|
|
623
|
+
FINE_SIZE +
|
|
624
|
+
fineCentroidOffset,
|
|
625
|
+
pqFineCentroids +
|
|
626
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
627
|
+
fineCode0b) *
|
|
628
|
+
FINE_SIZE +
|
|
629
|
+
fineCentroidOffset,
|
|
630
|
+
pqFineCentroids +
|
|
631
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
632
|
+
fineCode0c) *
|
|
633
|
+
FINE_SIZE +
|
|
634
|
+
fineCentroidOffset,
|
|
635
|
+
pqFineCentroids +
|
|
636
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
637
|
+
fineCode0d) *
|
|
638
|
+
FINE_SIZE +
|
|
639
|
+
fineCentroidOffset,
|
|
640
|
+
weight0,
|
|
641
|
+
{existingValue0, existingValue1});
|
|
642
|
+
|
|
643
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
644
|
+
pqCoarseCentroids +
|
|
645
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
|
646
|
+
COARSE_SIZE +
|
|
647
|
+
coarseCentroidOffset,
|
|
648
|
+
pqFineCentroids +
|
|
649
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
650
|
+
fineCode1a) *
|
|
651
|
+
FINE_SIZE +
|
|
652
|
+
fineCentroidOffset,
|
|
653
|
+
pqFineCentroids +
|
|
654
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
655
|
+
fineCode1b) *
|
|
656
|
+
FINE_SIZE +
|
|
657
|
+
fineCentroidOffset,
|
|
658
|
+
pqFineCentroids +
|
|
659
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
660
|
+
fineCode1c) *
|
|
661
|
+
FINE_SIZE +
|
|
662
|
+
fineCentroidOffset,
|
|
663
|
+
pqFineCentroids +
|
|
664
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
665
|
+
fineCode1d) *
|
|
666
|
+
FINE_SIZE +
|
|
667
|
+
fineCentroidOffset,
|
|
668
|
+
weight1,
|
|
669
|
+
existingValue);
|
|
670
|
+
|
|
671
|
+
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
672
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
673
|
+
|
|
674
|
+
// next
|
|
675
|
+
Index2LevelDecoderImpl<
|
|
676
|
+
DIM,
|
|
677
|
+
COARSE_SIZE,
|
|
678
|
+
FINE_SIZE,
|
|
679
|
+
COARSE_BITS,
|
|
680
|
+
FINE_BITS,
|
|
681
|
+
CPOS + 8>::
|
|
682
|
+
accum(pqCoarseCentroids,
|
|
683
|
+
pqFineCentroids,
|
|
684
|
+
code0,
|
|
685
|
+
weight0,
|
|
686
|
+
code1,
|
|
687
|
+
weight1,
|
|
688
|
+
outputAccum);
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
// Process 3 samples.
|
|
692
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
693
|
+
// table.
|
|
694
|
+
static void accum(
|
|
695
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
696
|
+
const float* const __restrict pqFineCentroids0,
|
|
697
|
+
const uint8_t* const __restrict code0,
|
|
698
|
+
const float weight0,
|
|
699
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
700
|
+
const float* const __restrict pqFineCentroids1,
|
|
701
|
+
const uint8_t* const __restrict code1,
|
|
702
|
+
const float weight1,
|
|
703
|
+
const float* const __restrict pqCoarseCentroids2,
|
|
704
|
+
const float* const __restrict pqFineCentroids2,
|
|
705
|
+
const uint8_t* const __restrict code2,
|
|
706
|
+
const float weight2,
|
|
707
|
+
float* const __restrict outputAccum) {
|
|
708
|
+
// coarse quantizer
|
|
709
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
710
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
711
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
712
|
+
|
|
713
|
+
// fine quantizer
|
|
714
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
715
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
716
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
717
|
+
|
|
718
|
+
// process chunks, 2 float
|
|
719
|
+
// but 8 floats per loop
|
|
720
|
+
|
|
721
|
+
const intptr_t coarseCode0 = detail::
|
|
722
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
723
|
+
get(coarse0);
|
|
724
|
+
const intptr_t fineCode0a = detail::
|
|
725
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
726
|
+
fine0);
|
|
727
|
+
const intptr_t fineCode0b = detail::
|
|
728
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
729
|
+
fine0);
|
|
730
|
+
const intptr_t fineCode0c = detail::
|
|
731
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
732
|
+
fine0);
|
|
733
|
+
const intptr_t fineCode0d = detail::
|
|
734
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
735
|
+
fine0);
|
|
736
|
+
const intptr_t coarseCode1 = detail::
|
|
737
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
738
|
+
get(coarse1);
|
|
739
|
+
const intptr_t fineCode1a = detail::
|
|
740
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
741
|
+
fine1);
|
|
742
|
+
const intptr_t fineCode1b = detail::
|
|
743
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
744
|
+
fine1);
|
|
745
|
+
const intptr_t fineCode1c = detail::
|
|
746
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
747
|
+
fine1);
|
|
748
|
+
const intptr_t fineCode1d = detail::
|
|
749
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
750
|
+
fine1);
|
|
751
|
+
const intptr_t coarseCode2 = detail::
|
|
752
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
753
|
+
get(coarse2);
|
|
754
|
+
const intptr_t fineCode2a = detail::
|
|
755
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
756
|
+
fine2);
|
|
757
|
+
const intptr_t fineCode2b = detail::
|
|
758
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
759
|
+
fine2);
|
|
760
|
+
const intptr_t fineCode2c = detail::
|
|
761
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
762
|
+
fine2);
|
|
763
|
+
const intptr_t fineCode2d = detail::
|
|
764
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
765
|
+
fine2);
|
|
766
|
+
|
|
767
|
+
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
768
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
769
|
+
|
|
770
|
+
auto existingValue = elementaryBlock2x4bAccum(
|
|
771
|
+
pqCoarseCentroids0 +
|
|
772
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
|
773
|
+
COARSE_SIZE +
|
|
774
|
+
coarseCentroidOffset,
|
|
775
|
+
pqFineCentroids0 +
|
|
776
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
777
|
+
fineCode0a) *
|
|
778
|
+
FINE_SIZE +
|
|
779
|
+
fineCentroidOffset,
|
|
780
|
+
pqFineCentroids0 +
|
|
781
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
782
|
+
fineCode0b) *
|
|
783
|
+
FINE_SIZE +
|
|
784
|
+
fineCentroidOffset,
|
|
785
|
+
pqFineCentroids0 +
|
|
786
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
787
|
+
fineCode0c) *
|
|
788
|
+
FINE_SIZE +
|
|
789
|
+
fineCentroidOffset,
|
|
790
|
+
pqFineCentroids0 +
|
|
791
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
792
|
+
fineCode0d) *
|
|
793
|
+
FINE_SIZE +
|
|
794
|
+
fineCentroidOffset,
|
|
795
|
+
weight0,
|
|
796
|
+
{existingValue0, existingValue1});
|
|
797
|
+
|
|
798
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
799
|
+
pqCoarseCentroids1 +
|
|
800
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
|
801
|
+
COARSE_SIZE +
|
|
802
|
+
coarseCentroidOffset,
|
|
803
|
+
pqFineCentroids1 +
|
|
804
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
805
|
+
fineCode1a) *
|
|
806
|
+
FINE_SIZE +
|
|
807
|
+
fineCentroidOffset,
|
|
808
|
+
pqFineCentroids1 +
|
|
809
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
810
|
+
fineCode1b) *
|
|
811
|
+
FINE_SIZE +
|
|
812
|
+
fineCentroidOffset,
|
|
813
|
+
pqFineCentroids1 +
|
|
814
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
815
|
+
fineCode1c) *
|
|
816
|
+
FINE_SIZE +
|
|
817
|
+
fineCentroidOffset,
|
|
818
|
+
pqFineCentroids1 +
|
|
819
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
820
|
+
fineCode1d) *
|
|
821
|
+
FINE_SIZE +
|
|
822
|
+
fineCentroidOffset,
|
|
823
|
+
weight1,
|
|
824
|
+
existingValue);
|
|
825
|
+
|
|
826
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
827
|
+
pqCoarseCentroids2 +
|
|
828
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
|
|
829
|
+
COARSE_SIZE +
|
|
830
|
+
coarseCentroidOffset,
|
|
831
|
+
pqFineCentroids2 +
|
|
832
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
833
|
+
fineCode2a) *
|
|
834
|
+
FINE_SIZE +
|
|
835
|
+
fineCentroidOffset,
|
|
836
|
+
pqFineCentroids2 +
|
|
837
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
838
|
+
fineCode2b) *
|
|
839
|
+
FINE_SIZE +
|
|
840
|
+
fineCentroidOffset,
|
|
841
|
+
pqFineCentroids2 +
|
|
842
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
843
|
+
fineCode2c) *
|
|
844
|
+
FINE_SIZE +
|
|
845
|
+
fineCentroidOffset,
|
|
846
|
+
pqFineCentroids2 +
|
|
847
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
848
|
+
fineCode2d) *
|
|
849
|
+
FINE_SIZE +
|
|
850
|
+
fineCentroidOffset,
|
|
851
|
+
weight2,
|
|
852
|
+
existingValue);
|
|
853
|
+
|
|
854
|
+
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
855
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
856
|
+
|
|
857
|
+
// next
|
|
858
|
+
Index2LevelDecoderImpl<
|
|
859
|
+
DIM,
|
|
860
|
+
COARSE_SIZE,
|
|
861
|
+
FINE_SIZE,
|
|
862
|
+
COARSE_BITS,
|
|
863
|
+
FINE_BITS,
|
|
864
|
+
CPOS + 8>::
|
|
865
|
+
accum(pqCoarseCentroids0,
|
|
866
|
+
pqFineCentroids0,
|
|
867
|
+
code0,
|
|
868
|
+
weight0,
|
|
869
|
+
pqCoarseCentroids1,
|
|
870
|
+
pqFineCentroids1,
|
|
871
|
+
code1,
|
|
872
|
+
weight1,
|
|
873
|
+
pqCoarseCentroids2,
|
|
874
|
+
pqFineCentroids2,
|
|
875
|
+
code2,
|
|
876
|
+
weight2,
|
|
877
|
+
outputAccum);
|
|
878
|
+
}
|
|
879
|
+
|
|
880
|
+
// Process 3 samples.
|
|
881
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
882
|
+
// codes.
|
|
883
|
+
// Folds three encoded samples into the 8 floats at outputAccum[CPOS .. CPOS + 7]
// and then calls the CPOS + 8 specialization with the same arguments.
//
// pqCoarseCentroids / pqFineCentroids: centroid tables used for all three codes.
// code0..code2: encoded samples; each buffer starts with the coarse codes and
//   the fine codes begin N_COARSE_ELEMENTS_BYTES later.
// weight0..weight2: per-sample factor forwarded to elementaryBlock2x4bAccum
//   (defined outside this block; presumably scales that sample's contribution).
// outputAccum: buffer that is loaded, updated and stored back in place.
static void accum(
|
|
884
|
+
const float* const __restrict pqCoarseCentroids,
|
|
885
|
+
const float* const __restrict pqFineCentroids,
|
|
886
|
+
const uint8_t* const __restrict code0,
|
|
887
|
+
const float weight0,
|
|
888
|
+
const uint8_t* const __restrict code1,
|
|
889
|
+
const float weight1,
|
|
890
|
+
const uint8_t* const __restrict code2,
|
|
891
|
+
const float weight2,
|
|
892
|
+
float* const __restrict outputAccum) {
|
|
893
|
+
// coarse quantizer: the coarse codes sit at the start of each code buffer
|
|
894
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
895
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
896
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
897
|
+
|
|
898
|
+
// fine quantizer: the fine codes start N_COARSE_ELEMENTS_BYTES into each buffer
|
|
899
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
900
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
901
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
902
|
+
|
|
903
|
+
// process chunks of 2 floats,
|
|
904
|
+
// but 8 floats per step: one coarse code and four fine codes per sample
|
|
905
|
+
|
|
906
|
+
// sample 0: read the coarse code and the four consecutive fine codes
const intptr_t coarseCode0 = detail::
|
|
907
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
908
|
+
get(coarse0);
|
|
909
|
+
const intptr_t fineCode0a = detail::
|
|
910
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
911
|
+
fine0);
|
|
912
|
+
const intptr_t fineCode0b = detail::
|
|
913
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
914
|
+
fine0);
|
|
915
|
+
const intptr_t fineCode0c = detail::
|
|
916
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
917
|
+
fine0);
|
|
918
|
+
const intptr_t fineCode0d = detail::
|
|
919
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
920
|
+
fine0);
|
|
921
|
+
// sample 1: same reads against coarse1 / fine1
const intptr_t coarseCode1 = detail::
|
|
922
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
923
|
+
get(coarse1);
|
|
924
|
+
const intptr_t fineCode1a = detail::
|
|
925
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
926
|
+
fine1);
|
|
927
|
+
const intptr_t fineCode1b = detail::
|
|
928
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
929
|
+
fine1);
|
|
930
|
+
const intptr_t fineCode1c = detail::
|
|
931
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
932
|
+
fine1);
|
|
933
|
+
const intptr_t fineCode1d = detail::
|
|
934
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
935
|
+
fine1);
|
|
936
|
+
// sample 2: same reads against coarse2 / fine2
const intptr_t coarseCode2 = detail::
|
|
937
|
+
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
|
938
|
+
get(coarse2);
|
|
939
|
+
const intptr_t fineCode2a = detail::
|
|
940
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
|
941
|
+
fine2);
|
|
942
|
+
const intptr_t fineCode2b = detail::
|
|
943
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
|
944
|
+
fine2);
|
|
945
|
+
const intptr_t fineCode2c = detail::
|
|
946
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(
|
|
947
|
+
fine2);
|
|
948
|
+
const intptr_t fineCode2d = detail::
|
|
949
|
+
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(
|
|
950
|
+
fine2);
|
|
951
|
+
|
|
952
|
+
// load the 8 floats currently stored at outputAccum[CPOS .. CPOS + 7]
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
|
953
|
+
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
|
954
|
+
|
|
955
|
+
// sample 0: pass its coarse-centroid pointer, four fine-centroid pointers,
// weight0 and the loaded values to elementaryBlock2x4bAccum
auto existingValue = elementaryBlock2x4bAccum(
|
|
956
|
+
pqCoarseCentroids +
|
|
957
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
|
958
|
+
COARSE_SIZE +
|
|
959
|
+
coarseCentroidOffset,
|
|
960
|
+
pqFineCentroids +
|
|
961
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
962
|
+
fineCode0a) *
|
|
963
|
+
FINE_SIZE +
|
|
964
|
+
fineCentroidOffset,
|
|
965
|
+
pqFineCentroids +
|
|
966
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
967
|
+
fineCode0b) *
|
|
968
|
+
FINE_SIZE +
|
|
969
|
+
fineCentroidOffset,
|
|
970
|
+
pqFineCentroids +
|
|
971
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
972
|
+
fineCode0c) *
|
|
973
|
+
FINE_SIZE +
|
|
974
|
+
fineCentroidOffset,
|
|
975
|
+
pqFineCentroids +
|
|
976
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
977
|
+
fineCode0d) *
|
|
978
|
+
FINE_SIZE +
|
|
979
|
+
fineCentroidOffset,
|
|
980
|
+
weight0,
|
|
981
|
+
{existingValue0, existingValue1});
|
|
982
|
+
|
|
983
|
+
// sample 1: continue from the accumulator returned for sample 0
existingValue = elementaryBlock2x4bAccum(
|
|
984
|
+
pqCoarseCentroids +
|
|
985
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
|
986
|
+
COARSE_SIZE +
|
|
987
|
+
coarseCentroidOffset,
|
|
988
|
+
pqFineCentroids +
|
|
989
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
990
|
+
fineCode1a) *
|
|
991
|
+
FINE_SIZE +
|
|
992
|
+
fineCentroidOffset,
|
|
993
|
+
pqFineCentroids +
|
|
994
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
995
|
+
fineCode1b) *
|
|
996
|
+
FINE_SIZE +
|
|
997
|
+
fineCentroidOffset,
|
|
998
|
+
pqFineCentroids +
|
|
999
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
1000
|
+
fineCode1c) *
|
|
1001
|
+
FINE_SIZE +
|
|
1002
|
+
fineCentroidOffset,
|
|
1003
|
+
pqFineCentroids +
|
|
1004
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
1005
|
+
fineCode1d) *
|
|
1006
|
+
FINE_SIZE +
|
|
1007
|
+
fineCentroidOffset,
|
|
1008
|
+
weight1,
|
|
1009
|
+
existingValue);
|
|
1010
|
+
|
|
1011
|
+
// sample 2: continue from the accumulator returned for sample 1
existingValue = elementaryBlock2x4bAccum(
|
|
1012
|
+
pqCoarseCentroids +
|
|
1013
|
+
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
|
|
1014
|
+
COARSE_SIZE +
|
|
1015
|
+
coarseCentroidOffset,
|
|
1016
|
+
pqFineCentroids +
|
|
1017
|
+
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
|
1018
|
+
fineCode2a) *
|
|
1019
|
+
FINE_SIZE +
|
|
1020
|
+
fineCentroidOffset,
|
|
1021
|
+
pqFineCentroids +
|
|
1022
|
+
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
|
1023
|
+
fineCode2b) *
|
|
1024
|
+
FINE_SIZE +
|
|
1025
|
+
fineCentroidOffset,
|
|
1026
|
+
pqFineCentroids +
|
|
1027
|
+
((fineCentroidIdx + 2) * FINE_TABLE_BYTES +
|
|
1028
|
+
fineCode2c) *
|
|
1029
|
+
FINE_SIZE +
|
|
1030
|
+
fineCentroidOffset,
|
|
1031
|
+
pqFineCentroids +
|
|
1032
|
+
((fineCentroidIdx + 3) * FINE_TABLE_BYTES +
|
|
1033
|
+
fineCode2d) *
|
|
1034
|
+
FINE_SIZE +
|
|
1035
|
+
fineCentroidOffset,
|
|
1036
|
+
weight2,
|
|
1037
|
+
existingValue);
|
|
1038
|
+
|
|
1039
|
+
// store the updated 8 floats back to the location they were loaded from
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
|
1040
|
+
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
|
1041
|
+
|
|
1042
|
+
// next: continue with the CPOS + 8 specialization, arguments unchanged
|
|
1043
|
+
Index2LevelDecoderImpl<
|
|
1044
|
+
DIM,
|
|
1045
|
+
COARSE_SIZE,
|
|
1046
|
+
FINE_SIZE,
|
|
1047
|
+
COARSE_BITS,
|
|
1048
|
+
FINE_BITS,
|
|
1049
|
+
CPOS + 8>::
|
|
1050
|
+
accum(pqCoarseCentroids,
|
|
1051
|
+
pqFineCentroids,
|
|
1052
|
+
code0,
|
|
1053
|
+
weight0,
|
|
1054
|
+
code1,
|
|
1055
|
+
weight1,
|
|
1056
|
+
code2,
|
|
1057
|
+
weight2,
|
|
1058
|
+
outputAccum);
|
|
1059
|
+
}
|
|
1060
|
+
};
|
|
176
1061
|
|
|
177
1062
|
template <
|
|
178
1063
|
intptr_t DIM,
|
|
@@ -189,6 +1074,7 @@ struct Index2LevelDecoderImpl<
|
|
|
189
1074
|
COARSE_BITS,
|
|
190
1075
|
FINE_BITS,
|
|
191
1076
|
CPOS,
|
|
1077
|
+
false,
|
|
192
1078
|
true,
|
|
193
1079
|
QPOS_LEFT_GE_8,
|
|
194
1080
|
QPOS_LEFT_GE_4,
|
|
@@ -829,6 +1715,7 @@ struct Index2LevelDecoderImpl<
|
|
|
829
1715
|
FINE_BITS,
|
|
830
1716
|
CPOS,
|
|
831
1717
|
false,
|
|
1718
|
+
false,
|
|
832
1719
|
true,
|
|
833
1720
|
true,
|
|
834
1721
|
false> {
|
|
@@ -1353,6 +2240,7 @@ struct Index2LevelDecoderImpl<
|
|
|
1353
2240
|
CPOS,
|
|
1354
2241
|
false,
|
|
1355
2242
|
false,
|
|
2243
|
+
false,
|
|
1356
2244
|
true,
|
|
1357
2245
|
false> {
|
|
1358
2246
|
static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
|
|
@@ -1856,6 +2744,7 @@ template <
|
|
|
1856
2744
|
intptr_t FINE_SIZE,
|
|
1857
2745
|
intptr_t COARSE_BITS,
|
|
1858
2746
|
intptr_t FINE_BITS,
|
|
2747
|
+
bool FINE_SIZE_EQ_2,
|
|
1859
2748
|
bool FINE_SIZE_EQ_4,
|
|
1860
2749
|
bool QPOS_LEFT_GE_8,
|
|
1861
2750
|
bool QPOS_LEFT_GE_4>
|
|
@@ -1866,6 +2755,7 @@ struct Index2LevelDecoderImpl<
|
|
|
1866
2755
|
COARSE_BITS,
|
|
1867
2756
|
FINE_BITS,
|
|
1868
2757
|
DIM,
|
|
2758
|
+
FINE_SIZE_EQ_2,
|
|
1869
2759
|
FINE_SIZE_EQ_4,
|
|
1870
2760
|
QPOS_LEFT_GE_8,
|
|
1871
2761
|
QPOS_LEFT_GE_4,
|