faiss 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/Index.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
- data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
- data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
- data/vendor/faiss/faiss/factory_tools.cpp +4 -0
- data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
- data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
- data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
- data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
- data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
- data/vendor/faiss/faiss/impl/HNSW.h +51 -13
- data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
- data/vendor/faiss/faiss/impl/Panorama.h +11 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
- data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
- data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
- data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
- data/vendor/faiss/faiss/impl/io_macros.h +25 -0
- data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
- data/vendor/faiss/faiss/index_factory.cpp +5 -1
- data/vendor/faiss/faiss/index_io.h +16 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
- data/vendor/faiss/faiss/utils/bf16.h +34 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
- data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
- metadata +12 -2
|
@@ -214,6 +214,12 @@ struct QuantizerTemplate<
|
|
|
214
214
|
return simd16float32(_mm512_fmadd_ps(
|
|
215
215
|
xi, _mm512_set1_ps(this->vdiff), _mm512_set1_ps(this->vmin)));
|
|
216
216
|
}
|
|
217
|
+
|
|
218
|
+
/// Raw codec decode without denormalization.
/// Returns the value of Codec::decode_16_components(code, i) unchanged,
/// i.e. without the vdiff / vmin fused multiply-add that the preceding
/// reconstruct function applies to the same codec output.
|
|
219
|
+
FAISS_ALWAYS_INLINE simd16float32
|
|
220
|
+
decode_16_raw(const uint8_t* code, int i) const {
|
|
221
|
+
return Codec::decode_16_components(code, i);
|
|
222
|
+
}
|
|
217
223
|
};
|
|
218
224
|
|
|
219
225
|
template <class Codec>
|
|
@@ -247,32 +253,78 @@ struct QuantizerTemplate<
|
|
|
247
253
|
* TurboQuant MSE quantizer
|
|
248
254
|
**********************************************************/
|
|
249
255
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
256
|
+
// 1-bit MSE AVX512: 16 comparisons → 2 bytes via mask compare.
|
|
257
|
+
template <>
|
|
258
|
+
struct QuantizerTurboQuantMSE<1, SIMDLevel::AVX512>
|
|
259
|
+
: QuantizerTurboQuantMSE<1, SIMDLevel::NONE> {
|
|
260
|
+
using Base = QuantizerTurboQuantMSE<1, SIMDLevel::NONE>;
|
|
261
|
+
|
|
262
|
+
QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained)
|
|
263
|
+
: Base(d, trained) {
|
|
264
|
+
assert(d % 16 == 0);
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
FAISS_ALWAYS_INLINE simd16float32
|
|
268
|
+
reconstruct_16_components(const uint8_t* code, int i) const {
|
|
269
|
+
return simd16float32(_mm512_i32gather_ps(
|
|
270
|
+
unpack_16x1bit_to_u32(code, i),
|
|
271
|
+
this->centroids,
|
|
272
|
+
sizeof(float)));
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
void encode_vector(const float* x, uint8_t* code) const final {
|
|
276
|
+
__m512 boundary = _mm512_set1_ps(this->boundaries[0]);
|
|
277
|
+
for (size_t i = 0; i < this->d; i += 16) {
|
|
278
|
+
__m512 vals = _mm512_loadu_ps(x + i);
|
|
279
|
+
__mmask16 mask = _mm512_cmp_ps_mask(vals, boundary, _CMP_GT_OQ);
|
|
280
|
+
uint16_t bits = _cvtmask16_u32(mask);
|
|
281
|
+
memcpy(code + i / 8, &bits, sizeof(uint16_t));
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
void decode_vector(const uint8_t* code, float* x) const final {
|
|
286
|
+
for (size_t i = 0; i < this->d; i += 16) {
|
|
287
|
+
simd16float32 xi =
|
|
288
|
+
reconstruct_16_components(code, static_cast<int>(i));
|
|
289
|
+
_mm512_storeu_ps(x + i, xi.f);
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
};
|
|
275
293
|
|
|
294
|
+
// 2-4 bit MSE AVX512: decode via gather, encode stays scalar.
|
|
295
|
+
#define DEFINE_TQMSE_AVX512_MULTIBIT(NBITS, UNPACK_EXPR) \
|
|
296
|
+
template <> \
|
|
297
|
+
struct QuantizerTurboQuantMSE<NBITS, SIMDLevel::AVX512> \
|
|
298
|
+
: QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE> { \
|
|
299
|
+
using Base = QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE>; \
|
|
300
|
+
\
|
|
301
|
+
QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained) \
|
|
302
|
+
: Base(d, trained) { \
|
|
303
|
+
assert(d % 16 == 0); \
|
|
304
|
+
} \
|
|
305
|
+
\
|
|
306
|
+
FAISS_ALWAYS_INLINE simd16float32 \
|
|
307
|
+
reconstruct_16_components(const uint8_t* code, int i) const { \
|
|
308
|
+
return simd16float32(_mm512_i32gather_ps( \
|
|
309
|
+
(UNPACK_EXPR), this->centroids, sizeof(float))); \
|
|
310
|
+
} \
|
|
311
|
+
\
|
|
312
|
+
void decode_vector(const uint8_t* code, float* x) const final { \
|
|
313
|
+
for (size_t i = 0; i < this->d; i += 16) { \
|
|
314
|
+
simd16float32 xi = \
|
|
315
|
+
reconstruct_16_components(code, static_cast<int>(i)); \
|
|
316
|
+
_mm512_storeu_ps(x + i, xi.f); \
|
|
317
|
+
} \
|
|
318
|
+
} \
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
DEFINE_TQMSE_AVX512_MULTIBIT(2, unpack_16x2bit_to_u32(code, i));
|
|
322
|
+
DEFINE_TQMSE_AVX512_MULTIBIT(3, unpack_16x3bit_to_u32(code, i));
|
|
323
|
+
DEFINE_TQMSE_AVX512_MULTIBIT(4, unpack_16x4bit_to_u32(code, i));
|
|
324
|
+
|
|
325
|
+
#undef DEFINE_TQMSE_AVX512_MULTIBIT
|
|
326
|
+
|
|
327
|
+
// 8-bit MSE AVX512
|
|
276
328
|
template <>
|
|
277
329
|
struct QuantizerTurboQuantMSE<8, SIMDLevel::AVX512>
|
|
278
330
|
: QuantizerTurboQuantMSE<8, SIMDLevel::NONE> {
|
|
@@ -291,6 +343,14 @@ struct QuantizerTurboQuantMSE<8, SIMDLevel::AVX512>
|
|
|
291
343
|
return simd16float32(
|
|
292
344
|
_mm512_i32gather_ps(indices, this->centroids, sizeof(float)));
|
|
293
345
|
}
|
|
346
|
+
|
|
347
|
+
void decode_vector(const uint8_t* code, float* x) const final {
|
|
348
|
+
for (size_t i = 0; i < this->d; i += 16) {
|
|
349
|
+
simd16float32 xi =
|
|
350
|
+
reconstruct_16_components(code, static_cast<int>(i));
|
|
351
|
+
_mm512_storeu_ps(x + i, xi.f);
|
|
352
|
+
}
|
|
353
|
+
}
|
|
294
354
|
};
|
|
295
355
|
|
|
296
356
|
/**********************************************************
|
|
@@ -411,6 +471,22 @@ struct SimilarityL2<SIMDLevel::AVX512> {
|
|
|
411
471
|
/// Collapse the accu16 accumulator into a single float via horizontal_add.
FAISS_ALWAYS_INLINE float result_16() {
|
|
412
472
|
return horizontal_add(accu16);
|
|
413
473
|
}
|
|
474
|
+
|
|
475
|
+
/// Prepare an L2 query for comparison against raw (non-denormalized) codes.
///
/// A decoded component is vmin + vdiff * raw, so
///   (x - (vmin + vdiff * raw))^2 == vdiff^2 * ((x - vmin) / vdiff - raw)^2.
/// The query is therefore mapped to (x - vmin) / vdiff once, and the caller
/// rescales the accumulated squared distance as bias + scale_factor * sum.
///
/// @param x            query vector, d floats
/// @param q_adj        output buffer, d floats, receives the adjusted query
/// @param d            number of components
/// @param vmin         offset applied when denormalizing a code
/// @param vdiff        scale applied when denormalizing a code
/// @param scale_factor output: multiplier for the raw accumulated distance
/// @param bias         output: additive term for the final distance
static void adjust_query_for_raw_decode(
        const float* x,
        float* q_adj,
        size_t d,
        float vmin,
        float vdiff,
        float& scale_factor,
        float& bias) {
    if (vdiff == 0) {
        // Degenerate range: every code denormalizes to vmin, so the distance
        // does not depend on the code at all. Report it through bias;
        // leaving bias at 0 would make every distance come out as 0.
        float dist_to_vmin = 0;
        for (size_t i = 0; i < d; i++) {
            const float delta = x[i] - vmin;
            q_adj[i] = 0;
            dist_to_vmin += delta * delta;
        }
        scale_factor = 0;
        bias = dist_to_vmin;
        return;
    }

    const float inv_vdiff = 1.0f / vdiff;
    for (size_t i = 0; i < d; i++) {
        q_adj[i] = (x[i] - vmin) * inv_vdiff;
    }
    scale_factor = vdiff * vdiff;
    bias = 0;
}
|
|
414
490
|
};
|
|
415
491
|
|
|
416
492
|
template <>
|
|
@@ -445,6 +521,23 @@ struct SimilarityIP<SIMDLevel::AVX512> {
|
|
|
445
521
|
/// Collapse the accu16 accumulator into a single float via horizontal_add.
FAISS_ALWAYS_INLINE float result_16() {
|
|
446
522
|
return horizontal_add(accu16);
|
|
447
523
|
}
|
|
524
|
+
|
|
525
|
+
/// Prepare an inner-product query for comparison against raw codes.
///
/// A decoded component is vmin + vdiff * raw, so
///   sum(x * (vmin + vdiff * raw)) == vmin * sum(x) + vdiff * sum(x * raw).
/// The query itself is copied unchanged into q_adj; the affine correction is
/// returned through scale_factor (vdiff) and bias (vmin * sum of x).
///
/// @param x            query vector, d floats
/// @param q_adj        output buffer, d floats, receives a copy of x
/// @param d            number of components
/// @param vmin         offset applied when denormalizing a code
/// @param vdiff        scale applied when denormalizing a code
/// @param scale_factor output: set to vdiff
/// @param bias         output: set to vmin * sum(x)
static void adjust_query_for_raw_decode(
        const float* x,
        float* q_adj,
        size_t d,
        float vmin,
        float vdiff,
        float& scale_factor,
        float& bias) {
    float query_total = 0;
    for (size_t k = 0; k != d; ++k) {
        const float component = x[k];
        q_adj[k] = component;
        query_total += component;
    }
    scale_factor = vdiff;
    bias = vmin * query_total;
}
|
|
448
541
|
};
|
|
449
542
|
|
|
450
543
|
/**********************************************************
|
|
@@ -458,8 +551,23 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX512>
|
|
|
458
551
|
|
|
459
552
|
Quantizer quant;
|
|
460
553
|
|
|
554
|
+
// Pre-adjusted query buffer for uniform quantizers
|
|
555
|
+
std::vector<float> q_adj;
|
|
556
|
+
float scale_factor = 0;
|
|
557
|
+
float bias = 0;
|
|
558
|
+
|
|
559
|
+
/// Compile-time predicate: true when a const Quantizer can be called as
/// q.decode_16_raw(const uint8_t*, int). The constructor, set_query and
/// query_to_code branch on it with `if constexpr`.
static constexpr bool has_decode_raw() {
|
|
560
|
+
return requires(const Quantizer& q, const uint8_t* c, int i) {
|
|
561
|
+
{ q.decode_16_raw(c, i) };
|
|
562
|
+
};
|
|
563
|
+
}
|
|
564
|
+
|
|
461
565
|
/// Construct the quantizer from (d, trained). When the quantizer provides
/// decode_16_raw, q_adj is sized to d floats up front.
DCTemplate(size_t d, const std::vector<float>& trained)
|
|
462
|
-
: quant(d, trained) {
|
|
566
|
+
: quant(d, trained) {
|
|
567
|
+
if constexpr (has_decode_raw()) {
|
|
568
|
+
q_adj.resize(d);
|
|
569
|
+
}
|
|
570
|
+
}
|
|
463
571
|
|
|
464
572
|
float compute_distance(const float* x, const uint8_t* code) const {
|
|
465
573
|
Similarity sim(x);
|
|
@@ -485,6 +593,26 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX512>
|
|
|
485
593
|
|
|
486
594
|
void set_query(const float* x) final {
|
|
487
595
|
q = x;
|
|
596
|
+
if constexpr (has_decode_raw()) {
|
|
597
|
+
Sim::adjust_query_for_raw_decode(
|
|
598
|
+
x,
|
|
599
|
+
q_adj.data(),
|
|
600
|
+
quant.d,
|
|
601
|
+
quant.vmin,
|
|
602
|
+
quant.vdiff,
|
|
603
|
+
scale_factor,
|
|
604
|
+
bias);
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
float query_to_code_predecoded(const uint8_t* code) const {
|
|
609
|
+
Similarity sim(q_adj.data());
|
|
610
|
+
sim.begin_16();
|
|
611
|
+
for (size_t i = 0; i < quant.d; i += 16) {
|
|
612
|
+
simd16float32 xi = quant.decode_16_raw(code, i);
|
|
613
|
+
sim.add_16_components(xi);
|
|
614
|
+
}
|
|
615
|
+
return bias + scale_factor * sim.result_16();
|
|
488
616
|
}
|
|
489
617
|
|
|
490
618
|
float symmetric_dis(idx_t i, idx_t j) override {
|
|
@@ -493,7 +621,11 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX512>
|
|
|
493
621
|
}
|
|
494
622
|
|
|
495
623
|
float query_to_code(const uint8_t* code) const final {
|
|
496
|
-
|
|
624
|
+
if constexpr (has_decode_raw()) {
|
|
625
|
+
return query_to_code_predecoded(code);
|
|
626
|
+
} else {
|
|
627
|
+
return compute_distance(q, code);
|
|
628
|
+
}
|
|
497
629
|
}
|
|
498
630
|
|
|
499
631
|
void query_to_codes_batch_4(
|
|
@@ -588,10 +720,50 @@ struct DistanceComputerByte<Similarity, SIMDLevel::AVX512>
|
|
|
588
720
|
}
|
|
589
721
|
};
|
|
590
722
|
|
|
723
|
+
/**********************************************************
|
|
724
|
+
* TurboQuant masked_sum AVX512 specialization
|
|
725
|
+
**********************************************************/
|
|
726
|
+
|
|
727
|
+
template <SIMDLevel SL0>
|
|
728
|
+
float turboq_masked_sum(const float* arr, const uint8_t* bits, size_t d);
|
|
729
|
+
|
|
730
|
+
template <>
|
|
731
|
+
float turboq_masked_sum<SIMDLevel::AVX512>(
|
|
732
|
+
const float* arr,
|
|
733
|
+
const uint8_t* bits,
|
|
734
|
+
size_t d) {
|
|
735
|
+
__m512 acc = _mm512_setzero_ps();
|
|
736
|
+
size_t i = 0;
|
|
737
|
+
size_t full_16 = (d / 16) * 16;
|
|
738
|
+
for (; i < full_16; i += 16) {
|
|
739
|
+
uint16_t mask16;
|
|
740
|
+
memcpy(&mask16, bits + i / 8, sizeof(mask16));
|
|
741
|
+
__mmask16 k = _cvtu32_mask16(mask16);
|
|
742
|
+
__m512 vals = _mm512_loadu_ps(arr + i);
|
|
743
|
+
acc = _mm512_mask_add_ps(acc, k, acc, vals);
|
|
744
|
+
}
|
|
745
|
+
float result = _mm512_reduce_add_ps(acc);
|
|
746
|
+
if (i < d) {
|
|
747
|
+
size_t remaining = d - i;
|
|
748
|
+
__mmask16 tail_mask = _cvtu32_mask16((1u << remaining) - 1);
|
|
749
|
+
__m512 tail_vals = _mm512_maskz_loadu_ps(tail_mask, arr + i);
|
|
750
|
+
uint16_t bits_tail = 0;
|
|
751
|
+
size_t bytes_remaining = (remaining + 7) / 8;
|
|
752
|
+
memcpy(&bits_tail, bits + i / 8, bytes_remaining);
|
|
753
|
+
__mmask16 bits_k = _cvtu32_mask16(bits_tail);
|
|
754
|
+
__mmask16 combined = _kand_mask16(tail_mask, bits_k);
|
|
755
|
+
__m512 masked_tail = _mm512_maskz_mov_ps(combined, tail_vals);
|
|
756
|
+
result += _mm512_reduce_add_ps(masked_tail);
|
|
757
|
+
}
|
|
758
|
+
return result;
|
|
759
|
+
}
|
|
760
|
+
|
|
591
761
|
} // namespace scalar_quantizer
|
|
592
762
|
} // namespace faiss
|
|
593
763
|
|
|
764
|
+
#ifndef SQ_AVX512_SKIP_DISPATCH
|
|
594
765
|
#define THE_LEVEL_TO_DISPATCH SIMDLevel::AVX512
|
|
595
766
|
#include <faiss/impl/scalar_quantizer/sq-dispatch.h>
|
|
767
|
+
#endif
|
|
596
768
|
|
|
597
769
|
#endif // COMPILE_SIMD_AVX512
|