faiss 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/Index.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
- data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
- data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
- data/vendor/faiss/faiss/factory_tools.cpp +4 -0
- data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
- data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
- data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
- data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
- data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
- data/vendor/faiss/faiss/impl/HNSW.h +51 -13
- data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
- data/vendor/faiss/faiss/impl/Panorama.h +11 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
- data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
- data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
- data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
- data/vendor/faiss/faiss/impl/io_macros.h +25 -0
- data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
- data/vendor/faiss/faiss/index_factory.cpp +5 -1
- data/vendor/faiss/faiss/index_io.h +16 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
- data/vendor/faiss/faiss/utils/bf16.h +34 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
- data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
- metadata +12 -2
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* @file rabitq_avx512_spr.cpp
|
|
10
|
+
*
|
|
11
|
+
* RaBitQ SIMD kernels specialized for SIMDLevel::AVX512_SPR.
|
|
12
|
+
*
|
|
13
|
+
* Sapphire Rapids (SPR) and later Intel microarchitectures expose
|
|
14
|
+
* AVX-512 VPOPCNTDQ (vpopcntq), which performs a per-lane 64-bit
|
|
15
|
+
* popcount in a single instruction. This is used here to replace the
|
|
16
|
+
* multi-step shuffle/pshufb-based popcount used by the generic AVX-512
|
|
17
|
+
* specialization in rabitq_avx512.cpp. The popcount-heavy kernels
|
|
18
|
+
* (bitwise_and_dot_product, bitwise_xor_dot_product, popcount) become
|
|
19
|
+
* substantially shorter and faster on SPR+ as a result.
|
|
20
|
+
*
|
|
21
|
+
* Build / dispatch behavior:
|
|
22
|
+
* - faiss_avx512 (AVX-512 only, no SPR features): NOT compiled.
|
|
23
|
+
* The existing AVX512 specialization in rabitq_avx512.cpp is used.
|
|
24
|
+
* - faiss_avx512_spr (statically built for SPR+): compiled. The
|
|
25
|
+
* SINGLE_SIMD_LEVEL is AVX512_SPR, so this specialization is
|
|
26
|
+
* selected by static dispatch.
|
|
27
|
+
* - faiss with FAISS_OPT_LEVEL=dd (dynamic dispatch): compiled with
|
|
28
|
+
* -mavx512vpopcntdq as a per-file flag. Selected at runtime when
|
|
29
|
+
* SIMDConfig::level == SIMDLevel::AVX512_SPR.
|
|
30
|
+
*
|
|
31
|
+
* The floating-point multi-bit inner-product kernel does not benefit
|
|
32
|
+
* from VPOPCNTDQ, so this TU forwards compute_inner_product<SPR> to
|
|
33
|
+
* the AVX512 implementation to avoid duplicating that code path.
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
#ifdef COMPILE_SIMD_AVX512_SPR
|
|
37
|
+
|
|
38
|
+
#include <faiss/utils/popcount.h>
|
|
39
|
+
#include <faiss/utils/rabitq_simd.h>
|
|
40
|
+
#include <immintrin.h>
|
|
41
|
+
#include <cstdint>
|
|
42
|
+
|
|
43
|
+
#if defined(_MSC_VER)
|
|
44
|
+
#include <intrin.h>
|
|
45
|
+
#endif
|
|
46
|
+
|
|
47
|
+
namespace faiss::rabitq {
|
|
48
|
+
|
|
49
|
+
// Forward declarations for the AVX512 specializations defined in
|
|
50
|
+
// rabitq_avx512.cpp. They live in the same TU group on SPR builds, so
|
|
51
|
+
// we can reuse them as a tail handler / fallback. Declaring rather
|
|
52
|
+
// than redefining avoids ODR risk and keeps a single source of truth
|
|
53
|
+
// for the floating-point kernel.
|
|
54
|
+
template <>
|
|
55
|
+
uint64_t bitwise_and_dot_product<SIMDLevel::AVX512>(
|
|
56
|
+
const uint8_t* query,
|
|
57
|
+
const uint8_t* data,
|
|
58
|
+
size_t size,
|
|
59
|
+
size_t qb);
|
|
60
|
+
template <>
|
|
61
|
+
uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512>(
|
|
62
|
+
const uint8_t* query,
|
|
63
|
+
const uint8_t* data,
|
|
64
|
+
size_t size,
|
|
65
|
+
size_t qb);
|
|
66
|
+
template <>
|
|
67
|
+
uint64_t popcount<SIMDLevel::AVX512>(const uint8_t* data, size_t size);
|
|
68
|
+
|
|
69
|
+
namespace {

/**
 * Per-lane 64-bit population count over a 512-bit register
 * (8 x uint64 lanes), via the VPOPCNTDQ intrinsic.
 */
inline __m512i popcount_512_vpopcntdq(__m512i v) {
    const __m512i lane_counts = _mm512_popcnt_epi64(v);
    return lane_counts;
}

/**
 * Per-lane 64-bit population count over a 256-bit register
 * (4 x uint64 lanes). Uses the 256-bit form of the popcount intrinsic.
 */
inline __m256i popcount_256_vpopcntdq(__m256i v) {
    const __m256i lane_counts = _mm256_popcnt_epi64(v);
    return lane_counts;
}

/**
 * Per-lane 64-bit population count over a 128-bit register
 * (2 x uint64 lanes). Uses the 128-bit form of the popcount intrinsic.
 */
inline __m128i popcount_128_vpopcntdq(__m128i v) {
    const __m128i lane_counts = _mm_popcnt_epi64(v);
    return lane_counts;
}

/**
 * Horizontal sum of the four uint64 lanes of a 256-bit register.
 * The addition wraps modulo 2^64.
 */
inline uint64_t reduce_add_256(__m256i v) {
    // Add the upper 128-bit half onto the lower half, then fold the
    // remaining two lanes together and read lane 0.
    const __m128i lower = _mm256_castsi256_si128(v);
    const __m128i upper = _mm256_extracti128_si256(v, 1);
    const __m128i pair = _mm_add_epi64(lower, upper);
    const __m128i folded = _mm_add_epi64(pair, _mm_unpackhi_epi64(pair, pair));
    return static_cast<uint64_t>(_mm_cvtsi128_si64(folded));
}

/**
 * Horizontal sum of the two uint64 lanes of a 128-bit register.
 * The addition wraps modulo 2^64.
 */
inline uint64_t reduce_add_128(__m128i v) {
    // Broadcast the high lane into lane 0 position and add it to the input;
    // lane 0 of the result then holds lane0 + lane1.
    const __m128i high_lane = _mm_unpackhi_epi64(v, v);
    const __m128i folded = _mm_add_epi64(v, high_lane);
    return static_cast<uint64_t>(_mm_cvtsi128_si64(folded));
}

} // namespace
|
|
102
|
+
|
|
103
|
+
template <>
|
|
104
|
+
uint64_t bitwise_and_dot_product<SIMDLevel::AVX512_SPR>(
|
|
105
|
+
const uint8_t* query,
|
|
106
|
+
const uint8_t* data,
|
|
107
|
+
size_t size,
|
|
108
|
+
size_t qb) {
|
|
109
|
+
uint64_t sum = 0;
|
|
110
|
+
size_t offset = 0;
|
|
111
|
+
|
|
112
|
+
// 512-bit main loop: vpopcntq replaces the shuffle-based popcount,
|
|
113
|
+
// halving the instruction count per iteration relative to AVX512.
|
|
114
|
+
if (size_t step = 512 / 8; offset + step <= size) {
|
|
115
|
+
__m512i sum_512 = _mm512_setzero_si512();
|
|
116
|
+
for (; offset + step <= size; offset += step) {
|
|
117
|
+
__m512i v_x = _mm512_loadu_si512(
|
|
118
|
+
reinterpret_cast<const __m512i*>(data + offset));
|
|
119
|
+
for (size_t j = 0; j < qb; j++) {
|
|
120
|
+
__m512i v_q = _mm512_loadu_si512(
|
|
121
|
+
reinterpret_cast<const __m512i*>(
|
|
122
|
+
query + j * size + offset));
|
|
123
|
+
__m512i v_and = _mm512_and_si512(v_q, v_x);
|
|
124
|
+
__m512i v_popcnt = popcount_512_vpopcntdq(v_and);
|
|
125
|
+
__m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
|
|
126
|
+
sum_512 = _mm512_add_epi64(sum_512, v_shifted);
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
sum += _mm512_reduce_add_epi64(sum_512);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// 256-bit tail.
|
|
133
|
+
if (size_t step = 256 / 8; offset + step <= size) {
|
|
134
|
+
__m256i sum_256 = _mm256_setzero_si256();
|
|
135
|
+
for (; offset + step <= size; offset += step) {
|
|
136
|
+
__m256i v_x = _mm256_loadu_si256(
|
|
137
|
+
reinterpret_cast<const __m256i*>(data + offset));
|
|
138
|
+
for (size_t j = 0; j < qb; j++) {
|
|
139
|
+
__m256i v_q = _mm256_loadu_si256(
|
|
140
|
+
reinterpret_cast<const __m256i*>(
|
|
141
|
+
query + j * size + offset));
|
|
142
|
+
__m256i v_and = _mm256_and_si256(v_q, v_x);
|
|
143
|
+
__m256i v_popcnt = popcount_256_vpopcntdq(v_and);
|
|
144
|
+
__m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
|
|
145
|
+
sum_256 = _mm256_add_epi64(sum_256, v_shifted);
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
sum += reduce_add_256(sum_256);
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
// 128-bit tail.
|
|
152
|
+
__m128i sum_128 = _mm_setzero_si128();
|
|
153
|
+
for (size_t step = 128 / 8; offset + step <= size; offset += step) {
|
|
154
|
+
__m128i v_x = _mm_loadu_si128(
|
|
155
|
+
reinterpret_cast<const __m128i*>(data + offset));
|
|
156
|
+
for (size_t j = 0; j < qb; j++) {
|
|
157
|
+
__m128i v_q = _mm_loadu_si128(
|
|
158
|
+
reinterpret_cast<const __m128i*>(
|
|
159
|
+
query + j * size + offset));
|
|
160
|
+
__m128i v_and = _mm_and_si128(v_q, v_x);
|
|
161
|
+
__m128i v_popcnt = popcount_128_vpopcntdq(v_and);
|
|
162
|
+
__m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
|
|
163
|
+
sum_128 = _mm_add_epi64(sum_128, v_shifted);
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
sum += reduce_add_128(sum_128);
|
|
167
|
+
|
|
168
|
+
// 64-bit scalar tail.
|
|
169
|
+
for (size_t step = 64 / 8; offset + step <= size; offset += step) {
|
|
170
|
+
const auto yv = *reinterpret_cast<const uint64_t*>(data + offset);
|
|
171
|
+
for (size_t j = 0; j < qb; j++) {
|
|
172
|
+
const auto qv = *reinterpret_cast<const uint64_t*>(
|
|
173
|
+
query + j * size + offset);
|
|
174
|
+
sum += static_cast<uint64_t>(popcount64(qv & yv)) << j;
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
// Byte tail.
|
|
178
|
+
for (; offset < size; ++offset) {
|
|
179
|
+
const auto yv = *(data + offset);
|
|
180
|
+
for (size_t j = 0; j < qb; j++) {
|
|
181
|
+
const auto qv = *(query + j * size + offset);
|
|
182
|
+
sum += static_cast<uint64_t>(popcount32(qv & yv)) << j;
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
return sum;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
template <>
|
|
189
|
+
uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512_SPR>(
|
|
190
|
+
const uint8_t* query,
|
|
191
|
+
const uint8_t* data,
|
|
192
|
+
size_t size,
|
|
193
|
+
size_t qb) {
|
|
194
|
+
uint64_t sum = 0;
|
|
195
|
+
size_t offset = 0;
|
|
196
|
+
|
|
197
|
+
if (size_t step = 512 / 8; offset + step <= size) {
|
|
198
|
+
__m512i sum_512 = _mm512_setzero_si512();
|
|
199
|
+
for (; offset + step <= size; offset += step) {
|
|
200
|
+
__m512i v_x = _mm512_loadu_si512(
|
|
201
|
+
reinterpret_cast<const __m512i*>(data + offset));
|
|
202
|
+
for (size_t j = 0; j < qb; j++) {
|
|
203
|
+
__m512i v_q = _mm512_loadu_si512(
|
|
204
|
+
reinterpret_cast<const __m512i*>(
|
|
205
|
+
query + j * size + offset));
|
|
206
|
+
__m512i v_xor = _mm512_xor_si512(v_q, v_x);
|
|
207
|
+
__m512i v_popcnt = popcount_512_vpopcntdq(v_xor);
|
|
208
|
+
__m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
|
|
209
|
+
sum_512 = _mm512_add_epi64(sum_512, v_shifted);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
sum += _mm512_reduce_add_epi64(sum_512);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
if (size_t step = 256 / 8; offset + step <= size) {
|
|
216
|
+
__m256i sum_256 = _mm256_setzero_si256();
|
|
217
|
+
for (; offset + step <= size; offset += step) {
|
|
218
|
+
__m256i v_x = _mm256_loadu_si256(
|
|
219
|
+
reinterpret_cast<const __m256i*>(data + offset));
|
|
220
|
+
for (size_t j = 0; j < qb; j++) {
|
|
221
|
+
__m256i v_q = _mm256_loadu_si256(
|
|
222
|
+
reinterpret_cast<const __m256i*>(
|
|
223
|
+
query + j * size + offset));
|
|
224
|
+
__m256i v_xor = _mm256_xor_si256(v_q, v_x);
|
|
225
|
+
__m256i v_popcnt = popcount_256_vpopcntdq(v_xor);
|
|
226
|
+
__m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
|
|
227
|
+
sum_256 = _mm256_add_epi64(sum_256, v_shifted);
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
sum += reduce_add_256(sum_256);
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
__m128i sum_128 = _mm_setzero_si128();
|
|
234
|
+
for (size_t step = 128 / 8; offset + step <= size; offset += step) {
|
|
235
|
+
__m128i v_x = _mm_loadu_si128(
|
|
236
|
+
reinterpret_cast<const __m128i*>(data + offset));
|
|
237
|
+
for (size_t j = 0; j < qb; j++) {
|
|
238
|
+
__m128i v_q = _mm_loadu_si128(
|
|
239
|
+
reinterpret_cast<const __m128i*>(
|
|
240
|
+
query + j * size + offset));
|
|
241
|
+
__m128i v_xor = _mm_xor_si128(v_q, v_x);
|
|
242
|
+
__m128i v_popcnt = popcount_128_vpopcntdq(v_xor);
|
|
243
|
+
__m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
|
|
244
|
+
sum_128 = _mm_add_epi64(sum_128, v_shifted);
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
sum += reduce_add_128(sum_128);
|
|
248
|
+
|
|
249
|
+
for (size_t step = 64 / 8; offset + step <= size; offset += step) {
|
|
250
|
+
const auto yv = *reinterpret_cast<const uint64_t*>(data + offset);
|
|
251
|
+
for (size_t j = 0; j < qb; j++) {
|
|
252
|
+
const auto qv = *reinterpret_cast<const uint64_t*>(
|
|
253
|
+
query + j * size + offset);
|
|
254
|
+
sum += static_cast<uint64_t>(popcount64(qv ^ yv)) << j;
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
for (; offset < size; ++offset) {
|
|
258
|
+
const auto yv = *(data + offset);
|
|
259
|
+
for (size_t j = 0; j < qb; j++) {
|
|
260
|
+
const auto qv = *(query + j * size + offset);
|
|
261
|
+
sum += static_cast<uint64_t>(popcount32(qv ^ yv)) << j;
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
return sum;
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
template <>
|
|
268
|
+
uint64_t popcount<SIMDLevel::AVX512_SPR>(const uint8_t* data, size_t size) {
|
|
269
|
+
uint64_t sum = 0;
|
|
270
|
+
size_t offset = 0;
|
|
271
|
+
|
|
272
|
+
if (offset + 512 / 8 <= size) {
|
|
273
|
+
__m512i sum_512 = _mm512_setzero_si512();
|
|
274
|
+
for (size_t end; (end = offset + 512 / 8) <= size; offset = end) {
|
|
275
|
+
__m512i v_x = _mm512_loadu_si512(
|
|
276
|
+
reinterpret_cast<const __m512i*>(data + offset));
|
|
277
|
+
__m512i v_popcnt = popcount_512_vpopcntdq(v_x);
|
|
278
|
+
sum_512 = _mm512_add_epi64(sum_512, v_popcnt);
|
|
279
|
+
}
|
|
280
|
+
sum += _mm512_reduce_add_epi64(sum_512);
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
if (offset + 256 / 8 <= size) {
|
|
284
|
+
__m256i sum_256 = _mm256_setzero_si256();
|
|
285
|
+
for (size_t end; (end = offset + 256 / 8) <= size; offset = end) {
|
|
286
|
+
__m256i v_x = _mm256_loadu_si256(
|
|
287
|
+
reinterpret_cast<const __m256i*>(data + offset));
|
|
288
|
+
__m256i v_popcnt = popcount_256_vpopcntdq(v_x);
|
|
289
|
+
sum_256 = _mm256_add_epi64(sum_256, v_popcnt);
|
|
290
|
+
}
|
|
291
|
+
sum += reduce_add_256(sum_256);
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
__m128i sum_128 = _mm_setzero_si128();
|
|
295
|
+
for (size_t step = 128 / 8; offset + step <= size; offset += step) {
|
|
296
|
+
__m128i v_x = _mm_loadu_si128(
|
|
297
|
+
reinterpret_cast<const __m128i*>(data + offset));
|
|
298
|
+
sum_128 = _mm_add_epi64(sum_128, popcount_128_vpopcntdq(v_x));
|
|
299
|
+
}
|
|
300
|
+
sum += reduce_add_128(sum_128);
|
|
301
|
+
|
|
302
|
+
for (size_t step = 64 / 8; offset + step <= size; offset += step) {
|
|
303
|
+
const auto yv = *reinterpret_cast<const uint64_t*>(data + offset);
|
|
304
|
+
sum += popcount64(yv);
|
|
305
|
+
}
|
|
306
|
+
for (; offset < size; ++offset) {
|
|
307
|
+
const auto yv = *(data + offset);
|
|
308
|
+
sum += popcount32(yv);
|
|
309
|
+
}
|
|
310
|
+
return sum;
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
} // namespace faiss::rabitq
|
|
314
|
+
|
|
315
|
+
namespace faiss::rabitq::multibit {
|
|
316
|
+
|
|
317
|
+
// Forward-declare the AVX512 floating-point inner-product kernel.
|
|
318
|
+
// VPOPCNTDQ does not help this kernel (it operates on FP32), so we
|
|
319
|
+
// reuse the AVX512 implementation rather than duplicate it.
|
|
320
|
+
template <>
|
|
321
|
+
float compute_inner_product<SIMDLevel::AVX512>(
|
|
322
|
+
const uint8_t* __restrict sign_bits,
|
|
323
|
+
const uint8_t* __restrict ex_code,
|
|
324
|
+
const float* __restrict rotated_q,
|
|
325
|
+
size_t d,
|
|
326
|
+
size_t ex_bits,
|
|
327
|
+
float cb);
|
|
328
|
+
|
|
329
|
+
template <>
|
|
330
|
+
float compute_inner_product<SIMDLevel::AVX512_SPR>(
|
|
331
|
+
const uint8_t* __restrict sign_bits,
|
|
332
|
+
const uint8_t* __restrict ex_code,
|
|
333
|
+
const float* __restrict rotated_q,
|
|
334
|
+
size_t d,
|
|
335
|
+
size_t ex_bits,
|
|
336
|
+
float cb) {
|
|
337
|
+
return compute_inner_product<SIMDLevel::AVX512>(
|
|
338
|
+
sign_bits, ex_code, rotated_q, d, ex_bits, cb);
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
} // namespace faiss::rabitq::multibit
|
|
342
|
+
|
|
343
|
+
#endif // COMPILE_SIMD_AVX512_SPR
|
|
@@ -129,6 +129,9 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
|
|
|
129
129
|
asm volatile("cpuid"
|
|
130
130
|
: "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
|
|
131
131
|
: "a"(eax), "c"(ecx));
|
|
132
|
+
// Save EDX before xgetbv clobbers it — needed for
|
|
133
|
+
// AVX512_FP16 check (bit 23) in the SPR detection below.
|
|
134
|
+
unsigned int cpuid7_edx = edx;
|
|
132
135
|
|
|
133
136
|
unsigned int xcr0;
|
|
134
137
|
asm volatile("xgetbv" : "=a"(xcr0), "=d"(edx) : "c"(0));
|
|
@@ -155,8 +158,15 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
|
|
|
155
158
|
(1 << static_cast<int>(SIMDLevel::AVX512));
|
|
156
159
|
|
|
157
160
|
#if defined(COMPILE_SIMD_AVX512_SPR)
|
|
158
|
-
// Check for Sapphire Rapids features
|
|
161
|
+
// Check for Sapphire Rapids features.
|
|
162
|
+
// The SPR code path is compiled with -mavx512fp16, so we
|
|
163
|
+
// must verify both AVX512_BF16 and AVX512_FP16 before
|
|
164
|
+
// dispatching to it. AMD Zen 4 (bergamo) has BF16 but
|
|
165
|
+
// not FP16 — using SPR code there causes SIGILL.
|
|
159
166
|
// CPUID EAX=7, ECX=1: EAX bit 5 = AVX512_BF16
|
|
167
|
+
// CPUID EAX=7, ECX=0: EDX bit 23 = AVX512_FP16
|
|
168
|
+
// (Linux: X86_FEATURE_AVX512_FP16 = 18*32+23)
|
|
169
|
+
bool has_avx512_fp16 = (cpuid7_edx & (1 << 23)) != 0;
|
|
160
170
|
unsigned int eax1, ebx1, ecx1, edx1;
|
|
161
171
|
eax1 = 7;
|
|
162
172
|
ecx1 = 1;
|
|
@@ -164,7 +174,7 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
|
|
|
164
174
|
: "=a"(eax1), "=b"(ebx1), "=c"(ecx1), "=d"(edx1)
|
|
165
175
|
: "a"(eax1), "c"(ecx1));
|
|
166
176
|
bool has_avx512_bf16 = (eax1 & (1 << 5)) != 0;
|
|
167
|
-
if (has_avx512_bf16) {
|
|
177
|
+
if (has_avx512_bf16 && has_avx512_fp16) {
|
|
168
178
|
detected_level = SIMDLevel::AVX512_SPR;
|
|
169
179
|
supported_simd_levels |=
|
|
170
180
|
(1 << static_cast<int>(SIMDLevel::AVX512_SPR));
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: faiss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.1
|
|
4
|
+
version: 0.6.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
@@ -233,12 +233,16 @@ files:
|
|
|
233
233
|
- vendor/faiss/faiss/gpu/utils/Timer.h
|
|
234
234
|
- vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h
|
|
235
235
|
- vendor/faiss/faiss/gpu_metal/MetalCloner.h
|
|
236
|
+
- vendor/faiss/faiss/gpu_metal/MetalDistance.h
|
|
236
237
|
- vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h
|
|
237
238
|
- vendor/faiss/faiss/gpu_metal/MetalIndex.h
|
|
238
239
|
- vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h
|
|
240
|
+
- vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h
|
|
239
241
|
- vendor/faiss/faiss/gpu_metal/MetalKernels.h
|
|
242
|
+
- vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h
|
|
240
243
|
- vendor/faiss/faiss/gpu_metal/MetalResources.h
|
|
241
244
|
- vendor/faiss/faiss/gpu_metal/StandardMetalResources.h
|
|
245
|
+
- vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h
|
|
242
246
|
- vendor/faiss/faiss/impl/AdSampling.cpp
|
|
243
247
|
- vendor/faiss/faiss/impl/AdSampling.h
|
|
244
248
|
- vendor/faiss/faiss/impl/AdditiveQuantizer.cpp
|
|
@@ -365,6 +369,7 @@ files:
|
|
|
365
369
|
- vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h
|
|
366
370
|
- vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h
|
|
367
371
|
- vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp
|
|
372
|
+
- vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h
|
|
368
373
|
- vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp
|
|
369
374
|
- vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp
|
|
370
375
|
- vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h
|
|
@@ -374,6 +379,8 @@ files:
|
|
|
374
379
|
- vendor/faiss/faiss/impl/scalar_quantizer/scanners.h
|
|
375
380
|
- vendor/faiss/faiss/impl/scalar_quantizer/similarities.h
|
|
376
381
|
- vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp
|
|
382
|
+
- vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h
|
|
383
|
+
- vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp
|
|
377
384
|
- vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp
|
|
378
385
|
- vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h
|
|
379
386
|
- vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp
|
|
@@ -453,8 +460,10 @@ files:
|
|
|
453
460
|
- vendor/faiss/faiss/utils/hamming_distance/common.h
|
|
454
461
|
- vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp
|
|
455
462
|
- vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp
|
|
463
|
+
- vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp
|
|
456
464
|
- vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h
|
|
457
465
|
- vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h
|
|
466
|
+
- vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h
|
|
458
467
|
- vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h
|
|
459
468
|
- vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h
|
|
460
469
|
- vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h
|
|
@@ -489,6 +498,7 @@ files:
|
|
|
489
498
|
- vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h
|
|
490
499
|
- vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp
|
|
491
500
|
- vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp
|
|
501
|
+
- vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp
|
|
492
502
|
- vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp
|
|
493
503
|
- vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp
|
|
494
504
|
- vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h
|
|
@@ -521,7 +531,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
521
531
|
- !ruby/object:Gem::Version
|
|
522
532
|
version: '0'
|
|
523
533
|
requirements: []
|
|
524
|
-
rubygems_version: 4.0.
|
|
534
|
+
rubygems_version: 4.0.14
|
|
525
535
|
specification_version: 4
|
|
526
536
|
summary: Efficient similarity search and clustering for Ruby
|
|
527
537
|
test_files: []
|