faiss 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/Index.h +1 -1
  5. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
  6. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
  7. data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
  8. data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
  9. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
  10. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
  11. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
  12. data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
  13. data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
  14. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
  15. data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
  16. data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
  17. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
  18. data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
  19. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
  20. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
  21. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
  22. data/vendor/faiss/faiss/factory_tools.cpp +4 -0
  23. data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
  24. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
  25. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
  26. data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
  27. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
  28. data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
  29. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
  30. data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
  31. data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
  32. data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
  33. data/vendor/faiss/faiss/impl/HNSW.h +51 -13
  34. data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
  35. data/vendor/faiss/faiss/impl/Panorama.h +11 -0
  36. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
  37. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
  38. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
  39. data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
  40. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
  41. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
  42. data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
  43. data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
  44. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
  45. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
  46. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
  47. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
  48. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
  49. data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
  50. data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
  51. data/vendor/faiss/faiss/impl/io_macros.h +25 -0
  52. data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
  53. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
  54. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
  55. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
  56. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
  57. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
  58. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
  59. data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
  60. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
  61. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
  62. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
  63. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
  64. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
  65. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
  66. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
  67. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
  68. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
  69. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
  70. data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
  71. data/vendor/faiss/faiss/index_factory.cpp +5 -1
  72. data/vendor/faiss/faiss/index_io.h +16 -0
  73. data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
  74. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
  75. data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
  76. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
  77. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
  78. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
  79. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
  80. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
  81. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
  82. data/vendor/faiss/faiss/utils/bf16.h +34 -0
  83. data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
  84. data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
  85. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
  86. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
  87. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
  88. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
  89. data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
  90. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
  91. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
  92. data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
  93. metadata +12 -2
@@ -0,0 +1,343 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ /**
9
+ * @file rabitq_avx512_spr.cpp
10
+ *
11
+ * RaBitQ SIMD kernels specialized for SIMDLevel::AVX512_SPR.
12
+ *
13
+ * Sapphire Rapids (SPR) and later Intel microarchitectures expose
14
+ * AVX-512 VPOPCNTDQ (vpopcntq), which performs a per-lane 64-bit
15
+ * popcount in a single instruction. This is used here to replace the
16
+ * multi-step shuffle/pshufb-based popcount used by the generic AVX-512
17
+ * specialization in rabitq_avx512.cpp. The popcount-heavy kernels
18
+ * (bitwise_and_dot_product, bitwise_xor_dot_product, popcount) become
19
+ * substantially shorter and faster on SPR+ as a result.
20
+ *
21
+ * Build / dispatch behavior:
22
+ * - faiss_avx512 (AVX-512 only, no SPR features): NOT compiled.
23
+ * The existing AVX512 specialization in rabitq_avx512.cpp is used.
24
+ * - faiss_avx512_spr (statically built for SPR+): compiled. The
25
+ * SINGLE_SIMD_LEVEL is AVX512_SPR, so this specialization is
26
+ * selected by static dispatch.
27
+ * - faiss with FAISS_OPT_LEVEL=dd (dynamic dispatch): compiled with
28
+ * -mavx512vpopcntdq as a per-file flag. Selected at runtime when
29
+ * SIMDConfig::level == SIMDLevel::AVX512_SPR.
30
+ *
31
+ * The floating-point multi-bit inner-product kernel does not benefit
32
+ * from VPOPCNTDQ, so this TU forwards compute_inner_product<SIMDLevel::AVX512_SPR>
33
+ * to compute_inner_product<SIMDLevel::AVX512> to avoid duplicating that code path.
34
+ */
35
+
36
+ #ifdef COMPILE_SIMD_AVX512_SPR
37
+
38
+ #include <faiss/utils/popcount.h>
39
+ #include <faiss/utils/rabitq_simd.h>
40
+ #include <immintrin.h>
41
+ #include <cstdint>
42
+
43
+ #if defined(_MSC_VER)
44
+ #include <intrin.h>
45
+ #endif
46
+
47
+ namespace faiss::rabitq {
48
+
49
+ // Forward declarations for the AVX512 specializations defined in
50
+ // rabitq_avx512.cpp. They live in the same TU group on SPR builds, so
51
+ // they are available as a tail handler / fallback. Declaring rather
52
+ // than redefining avoids ODR risk and keeps a single source of truth.
53
+ // NOTE(review): the AVX512_SPR kernels in this file handle their own tails and never call these three declarations - confirm they are still needed.
54
+ template <> // explicit-specialization declaration only; no body is provided in this file
55
+ uint64_t bitwise_and_dot_product<SIMDLevel::AVX512>(
56
+ const uint8_t* query,
57
+ const uint8_t* data,
58
+ size_t size,
59
+ size_t qb);
60
+ template <> // explicit-specialization declaration only; no body is provided in this file
61
+ uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512>(
62
+ const uint8_t* query,
63
+ const uint8_t* data,
64
+ size_t size,
65
+ size_t qb);
66
+ template <> // explicit-specialization declaration only; no body is provided in this file
67
+ uint64_t popcount<SIMDLevel::AVX512>(const uint8_t* data, size_t size);
68
+
69
+ namespace { // file-local helpers: thin wrappers over the popcount intrinsics plus two horizontal-sum routines
70
+
71
+ // 512-bit popcount using AVX-512 VPOPCNTDQ (vpopcntq).
72
+ // Single-instruction per-lane popcount on 8x uint64 lanes.
73
+ inline __m512i popcount_512_vpopcntdq(__m512i v) {
74
+ return _mm512_popcnt_epi64(v); // each 64-bit lane of the result holds the bit count of the matching lane of v
75
+ }
76
+
77
+ // 256-bit popcount using AVX-512VL VPOPCNTDQ.
78
+ // AVX512VL is part of the SPR feature set, so vpopcntq is available
79
+ // on 256-bit registers via _mm256_popcnt_epi64.
80
+ inline __m256i popcount_256_vpopcntdq(__m256i v) {
81
+ return _mm256_popcnt_epi64(v); // same per-lane count, on 4x uint64 lanes
82
+ }
83
+
84
+ // 128-bit popcount using AVX-512VL VPOPCNTDQ (per-lane count on 2x uint64 lanes).
85
+ inline __m128i popcount_128_vpopcntdq(__m128i v) {
86
+ return _mm_popcnt_epi64(v);
87
+ }
88
+
89
+ inline uint64_t reduce_add_256(__m256i v) { // horizontal sum: adds the four 64-bit lanes of v into one scalar
90
+ alignas(32) uint64_t lanes[4]; // 32-byte aligned because the store below is the aligned variant
91
+ _mm256_store_si256(reinterpret_cast<__m256i*>(lanes), v); // spill the vector to the stack buffer
92
+ return lanes[0] + lanes[1] + lanes[2] + lanes[3];
93
+ }
94
+
95
+ inline uint64_t reduce_add_128(__m128i v) { // horizontal sum: adds the two 64-bit lanes of v into one scalar
96
+ alignas(16) uint64_t lanes[2]; // 16-byte aligned because the store below is the aligned variant
97
+ _mm_store_si128(reinterpret_cast<__m128i*>(lanes), v); // spill the vector to the stack buffer
98
+ return lanes[0] + lanes[1];
99
+ }
100
+
101
+ } // namespace
102
+
103
+ template <> // AVX512_SPR kernel: returns the sum over j in [0, qb) of popcount(query block j AND data) << j
104
+ uint64_t bitwise_and_dot_product<SIMDLevel::AVX512_SPR>(
105
+ const uint8_t* query, // qb consecutive blocks of `size` bytes each; block j is read at query + j * size
106
+ const uint8_t* data, // `size` bytes, ANDed against every query block
107
+ size_t size, // byte length of data and of each query block
108
+ size_t qb) { // number of query blocks; block j contributes with weight 2^j (left shift by j)
109
+ uint64_t sum = 0; // scalar total; every stage below folds its partial result in here
110
+ size_t offset = 0; // bytes consumed so far; shared by all stages so each one resumes where the previous stopped
111
+
112
+ // 512-bit main loop: vpopcntq replaces the shuffle-based popcount,
113
+ // halving the instruction count per iteration relative to AVX512.
114
+ if (size_t step = 512 / 8; offset + step <= size) { // skip accumulator setup and reduction when fewer than 64 bytes exist
115
+ __m512i sum_512 = _mm512_setzero_si512(); // eight 64-bit partial sums
116
+ for (; offset + step <= size; offset += step) {
117
+ __m512i v_x = _mm512_loadu_si512( // unaligned load; the data chunk is loaded once and reused for all qb blocks
118
+ reinterpret_cast<const __m512i*>(data + offset));
119
+ for (size_t j = 0; j < qb; j++) {
120
+ __m512i v_q = _mm512_loadu_si512(
121
+ reinterpret_cast<const __m512i*>(
122
+ query + j * size + offset)); // same byte offset inside query block j
123
+ __m512i v_and = _mm512_and_si512(v_q, v_x);
124
+ __m512i v_popcnt = popcount_512_vpopcntdq(v_and);
125
+ __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j); // weight block j by 2^j
126
+ sum_512 = _mm512_add_epi64(sum_512, v_shifted);
127
+ }
128
+ }
129
+ sum += _mm512_reduce_add_epi64(sum_512); // collapse the eight lanes into the scalar total
130
+ }
131
+
132
+ // 256-bit tail (fewer than 64 bytes remain here, so the loop body runs at most once).
133
+ if (size_t step = 256 / 8; offset + step <= size) {
134
+ __m256i sum_256 = _mm256_setzero_si256();
135
+ for (; offset + step <= size; offset += step) {
136
+ __m256i v_x = _mm256_loadu_si256(
137
+ reinterpret_cast<const __m256i*>(data + offset));
138
+ for (size_t j = 0; j < qb; j++) {
139
+ __m256i v_q = _mm256_loadu_si256(
140
+ reinterpret_cast<const __m256i*>(
141
+ query + j * size + offset));
142
+ __m256i v_and = _mm256_and_si256(v_q, v_x);
143
+ __m256i v_popcnt = popcount_256_vpopcntdq(v_and);
144
+ __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
145
+ sum_256 = _mm256_add_epi64(sum_256, v_shifted);
146
+ }
147
+ }
148
+ sum += reduce_add_256(sum_256);
149
+ }
150
+
151
+ // 128-bit tail (fewer than 32 bytes remain here, so the loop body runs at most once).
152
+ __m128i sum_128 = _mm_setzero_si128(); // set up unconditionally; contributes 0 when the loop does not run
153
+ for (size_t step = 128 / 8; offset + step <= size; offset += step) {
154
+ __m128i v_x = _mm_loadu_si128(
155
+ reinterpret_cast<const __m128i*>(data + offset));
156
+ for (size_t j = 0; j < qb; j++) {
157
+ __m128i v_q = _mm_loadu_si128(
158
+ reinterpret_cast<const __m128i*>(
159
+ query + j * size + offset));
160
+ __m128i v_and = _mm_and_si128(v_q, v_x);
161
+ __m128i v_popcnt = popcount_128_vpopcntdq(v_and);
162
+ __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
163
+ sum_128 = _mm_add_epi64(sum_128, v_shifted);
164
+ }
165
+ }
166
+ sum += reduce_add_128(sum_128);
167
+
168
+ // 64-bit scalar tail (fewer than 16 bytes remain here, so the loop body runs at most once).
169
+ for (size_t step = 64 / 8; offset + step <= size; offset += step) {
170
+ const auto yv = *reinterpret_cast<const uint64_t*>(data + offset); // NOTE(review): reads a uint64_t through a byte pointer with no visible alignment guarantee - confirm this is acceptable or switch to memcpy
171
+ for (size_t j = 0; j < qb; j++) {
172
+ const auto qv = *reinterpret_cast<const uint64_t*>(
173
+ query + j * size + offset);
174
+ sum += static_cast<uint64_t>(popcount64(qv & yv)) << j; // widen before shifting so the weighting is done in 64 bits
175
+ }
176
+ }
177
+ // Byte tail (at most 7 bytes remain here).
178
+ for (; offset < size; ++offset) {
179
+ const auto yv = *(data + offset);
180
+ for (size_t j = 0; j < qb; j++) {
181
+ const auto qv = *(query + j * size + offset);
182
+ sum += static_cast<uint64_t>(popcount32(qv & yv)) << j; // qv & yv is a single byte value after integer promotion
183
+ }
184
+ }
185
+ return sum;
186
+ }
187
+
188
+ template <> // AVX512_SPR kernel: returns the sum over j in [0, qb) of popcount(query block j XOR data) << j
189
+ uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512_SPR>(
190
+ const uint8_t* query, // qb consecutive blocks of `size` bytes each; block j is read at query + j * size
191
+ const uint8_t* data, // `size` bytes, XORed against every query block
192
+ size_t size, // byte length of data and of each query block
193
+ size_t qb) { // number of query blocks; block j contributes with weight 2^j (left shift by j)
194
+ uint64_t sum = 0; // scalar total; every stage below folds its partial result in here
195
+ size_t offset = 0; // bytes consumed so far; shared by all stages so each one resumes where the previous stopped
196
+
197
+ if (size_t step = 512 / 8; offset + step <= size) { // 512-bit main loop: 64 bytes per iteration
198
+ __m512i sum_512 = _mm512_setzero_si512(); // eight 64-bit partial sums
199
+ for (; offset + step <= size; offset += step) {
200
+ __m512i v_x = _mm512_loadu_si512( // unaligned load; the data chunk is loaded once and reused for all qb blocks
201
+ reinterpret_cast<const __m512i*>(data + offset));
202
+ for (size_t j = 0; j < qb; j++) {
203
+ __m512i v_q = _mm512_loadu_si512(
204
+ reinterpret_cast<const __m512i*>(
205
+ query + j * size + offset)); // same byte offset inside query block j
206
+ __m512i v_xor = _mm512_xor_si512(v_q, v_x);
207
+ __m512i v_popcnt = popcount_512_vpopcntdq(v_xor);
208
+ __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j); // weight block j by 2^j
209
+ sum_512 = _mm512_add_epi64(sum_512, v_shifted);
210
+ }
211
+ }
212
+ sum += _mm512_reduce_add_epi64(sum_512); // collapse the eight lanes into the scalar total
213
+ }
214
+
215
+ if (size_t step = 256 / 8; offset + step <= size) { // 256-bit tail: fewer than 64 bytes remain, so the loop body runs at most once
216
+ __m256i sum_256 = _mm256_setzero_si256();
217
+ for (; offset + step <= size; offset += step) {
218
+ __m256i v_x = _mm256_loadu_si256(
219
+ reinterpret_cast<const __m256i*>(data + offset));
220
+ for (size_t j = 0; j < qb; j++) {
221
+ __m256i v_q = _mm256_loadu_si256(
222
+ reinterpret_cast<const __m256i*>(
223
+ query + j * size + offset));
224
+ __m256i v_xor = _mm256_xor_si256(v_q, v_x);
225
+ __m256i v_popcnt = popcount_256_vpopcntdq(v_xor);
226
+ __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
227
+ sum_256 = _mm256_add_epi64(sum_256, v_shifted);
228
+ }
229
+ }
230
+ sum += reduce_add_256(sum_256);
231
+ }
232
+
233
+ __m128i sum_128 = _mm_setzero_si128(); // 128-bit tail accumulator; set up unconditionally, contributes 0 when the loop does not run
234
+ for (size_t step = 128 / 8; offset + step <= size; offset += step) { // fewer than 32 bytes remain, so the body runs at most once
235
+ __m128i v_x = _mm_loadu_si128(
236
+ reinterpret_cast<const __m128i*>(data + offset));
237
+ for (size_t j = 0; j < qb; j++) {
238
+ __m128i v_q = _mm_loadu_si128(
239
+ reinterpret_cast<const __m128i*>(
240
+ query + j * size + offset));
241
+ __m128i v_xor = _mm_xor_si128(v_q, v_x);
242
+ __m128i v_popcnt = popcount_128_vpopcntdq(v_xor);
243
+ __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
244
+ sum_128 = _mm_add_epi64(sum_128, v_shifted);
245
+ }
246
+ }
247
+ sum += reduce_add_128(sum_128);
248
+
249
+ for (size_t step = 64 / 8; offset + step <= size; offset += step) { // 64-bit scalar tail: fewer than 16 bytes remain, so the body runs at most once
250
+ const auto yv = *reinterpret_cast<const uint64_t*>(data + offset); // NOTE(review): reads a uint64_t through a byte pointer with no visible alignment guarantee - confirm this is acceptable or switch to memcpy
251
+ for (size_t j = 0; j < qb; j++) {
252
+ const auto qv = *reinterpret_cast<const uint64_t*>(
253
+ query + j * size + offset);
254
+ sum += static_cast<uint64_t>(popcount64(qv ^ yv)) << j; // widen before shifting so the weighting is done in 64 bits
255
+ }
256
+ }
257
+ for (; offset < size; ++offset) { // byte tail: at most 7 bytes remain here
258
+ const auto yv = *(data + offset);
259
+ for (size_t j = 0; j < qb; j++) {
260
+ const auto qv = *(query + j * size + offset);
261
+ sum += static_cast<uint64_t>(popcount32(qv ^ yv)) << j; // qv ^ yv is a single byte value after integer promotion
262
+ }
263
+ }
264
+ return sum;
265
+ }
266
+
267
+ template <> // AVX512_SPR kernel: returns the number of set bits in the `size` bytes starting at `data`
268
+ uint64_t popcount<SIMDLevel::AVX512_SPR>(const uint8_t* data, size_t size) {
269
+ uint64_t sum = 0; // scalar total; every stage below folds its partial result in here
270
+ size_t offset = 0; // bytes consumed so far; shared by all stages so each one resumes where the previous stopped
271
+
272
+ if (offset + 512 / 8 <= size) { // 512-bit main loop: skipped entirely when fewer than 64 bytes exist
273
+ __m512i sum_512 = _mm512_setzero_si512(); // eight 64-bit partial sums
274
+ for (size_t end; (end = offset + 512 / 8) <= size; offset = end) { // end is the exclusive end of the current 64-byte chunk
275
+ __m512i v_x = _mm512_loadu_si512( // unaligned load
276
+ reinterpret_cast<const __m512i*>(data + offset));
277
+ __m512i v_popcnt = popcount_512_vpopcntdq(v_x);
278
+ sum_512 = _mm512_add_epi64(sum_512, v_popcnt);
279
+ }
280
+ sum += _mm512_reduce_add_epi64(sum_512); // collapse the eight lanes into the scalar total
281
+ }
282
+
283
+ if (offset + 256 / 8 <= size) { // 256-bit tail: fewer than 64 bytes remain, so the loop body runs at most once
284
+ __m256i sum_256 = _mm256_setzero_si256();
285
+ for (size_t end; (end = offset + 256 / 8) <= size; offset = end) { // end is the exclusive end of the current 32-byte chunk
286
+ __m256i v_x = _mm256_loadu_si256(
287
+ reinterpret_cast<const __m256i*>(data + offset));
288
+ __m256i v_popcnt = popcount_256_vpopcntdq(v_x);
289
+ sum_256 = _mm256_add_epi64(sum_256, v_popcnt);
290
+ }
291
+ sum += reduce_add_256(sum_256);
292
+ }
293
+
294
+ __m128i sum_128 = _mm_setzero_si128(); // 128-bit tail accumulator; set up unconditionally, contributes 0 when the loop does not run
295
+ for (size_t step = 128 / 8; offset + step <= size; offset += step) { // fewer than 32 bytes remain, so the body runs at most once
296
+ __m128i v_x = _mm_loadu_si128(
297
+ reinterpret_cast<const __m128i*>(data + offset));
298
+ sum_128 = _mm_add_epi64(sum_128, popcount_128_vpopcntdq(v_x));
299
+ }
300
+ sum += reduce_add_128(sum_128);
301
+
302
+ for (size_t step = 64 / 8; offset + step <= size; offset += step) { // 64-bit scalar tail: fewer than 16 bytes remain, so the body runs at most once
303
+ const auto yv = *reinterpret_cast<const uint64_t*>(data + offset); // NOTE(review): reads a uint64_t through a byte pointer with no visible alignment guarantee - confirm this is acceptable or switch to memcpy
304
+ sum += popcount64(yv);
305
+ }
306
+ for (; offset < size; ++offset) { // byte tail: at most 7 bytes remain here
307
+ const auto yv = *(data + offset);
308
+ sum += popcount32(yv); // yv is a single byte, so at most 8 bits are counted
309
+ }
310
+ return sum;
311
+ }
312
+
313
+ } // namespace faiss::rabitq
314
+
315
+ namespace faiss::rabitq::multibit {
316
+
317
+ // Forward-declare the AVX512 floating-point inner-product kernel.
318
+ // VPOPCNTDQ does not help this kernel (it operates on FP32), so we
319
+ // reuse the AVX512 implementation rather than duplicate it.
320
+ template <> // explicit-specialization declaration only; the file header says the definition lives in rabitq_avx512.cpp
321
+ float compute_inner_product<SIMDLevel::AVX512>(
322
+ const uint8_t* __restrict sign_bits,
323
+ const uint8_t* __restrict ex_code,
324
+ const float* __restrict rotated_q,
325
+ size_t d,
326
+ size_t ex_bits,
327
+ float cb);
328
+
329
+ template <> // AVX512_SPR specialization: a pure forwarder, it adds no SPR-specific computation of its own
330
+ float compute_inner_product<SIMDLevel::AVX512_SPR>(
331
+ const uint8_t* __restrict sign_bits, // passed through untouched; meaning is defined by the AVX512 kernel (not visible here)
332
+ const uint8_t* __restrict ex_code, // passed through untouched
333
+ const float* __restrict rotated_q, // passed through untouched
334
+ size_t d, // passed through untouched
335
+ size_t ex_bits, // passed through untouched
336
+ float cb) { // passed through untouched
337
+ return compute_inner_product<SIMDLevel::AVX512>( // same arguments, same order; the result is returned unchanged
338
+ sign_bits, ex_code, rotated_q, d, ex_bits, cb);
339
+ }
340
+
341
+ } // namespace faiss::rabitq::multibit
342
+
343
+ #endif // COMPILE_SIMD_AVX512_SPR
@@ -129,6 +129,9 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
129
129
  asm volatile("cpuid"
130
130
  : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
131
131
  : "a"(eax), "c"(ecx));
132
+ // Save EDX before xgetbv clobbers it — needed for
133
+ // AVX512_FP16 check (bit 23) in the SPR detection below.
134
+ unsigned int cpuid7_edx = edx;
132
135
 
133
136
  unsigned int xcr0;
134
137
  asm volatile("xgetbv" : "=a"(xcr0), "=d"(edx) : "c"(0));
@@ -155,8 +158,15 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
155
158
  (1 << static_cast<int>(SIMDLevel::AVX512));
156
159
 
157
160
  #if defined(COMPILE_SIMD_AVX512_SPR)
158
- // Check for Sapphire Rapids features (AVX512_BF16)
161
+ // Check for Sapphire Rapids features.
162
+ // The SPR code path is compiled with -mavx512fp16, so we
163
+ // must verify both AVX512_BF16 and AVX512_FP16 before
164
+ // dispatching to it. AMD Zen 4 (bergamo) has BF16 but
165
+ // not FP16 — using SPR code there causes SIGILL.
159
166
  // CPUID EAX=7, ECX=1: EAX bit 5 = AVX512_BF16
167
+ // CPUID EAX=7, ECX=0: EDX bit 23 = AVX512_FP16
168
+ // (Linux: X86_FEATURE_AVX512_FP16 = 18*32+23)
169
+ bool has_avx512_fp16 = (cpuid7_edx & (1 << 23)) != 0;
160
170
  unsigned int eax1, ebx1, ecx1, edx1;
161
171
  eax1 = 7;
162
172
  ecx1 = 1;
@@ -164,7 +174,7 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
164
174
  : "=a"(eax1), "=b"(ebx1), "=c"(ecx1), "=d"(edx1)
165
175
  : "a"(eax1), "c"(ecx1));
166
176
  bool has_avx512_bf16 = (eax1 & (1 << 5)) != 0;
167
- if (has_avx512_bf16) {
177
+ if (has_avx512_bf16 && has_avx512_fp16) {
168
178
  detected_level = SIMDLevel::AVX512_SPR;
169
179
  supported_simd_levels |=
170
180
  (1 << static_cast<int>(SIMDLevel::AVX512_SPR));
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: faiss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
@@ -233,12 +233,16 @@ files:
233
233
  - vendor/faiss/faiss/gpu/utils/Timer.h
234
234
  - vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h
235
235
  - vendor/faiss/faiss/gpu_metal/MetalCloner.h
236
+ - vendor/faiss/faiss/gpu_metal/MetalDistance.h
236
237
  - vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h
237
238
  - vendor/faiss/faiss/gpu_metal/MetalIndex.h
238
239
  - vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h
240
+ - vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h
239
241
  - vendor/faiss/faiss/gpu_metal/MetalKernels.h
242
+ - vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h
240
243
  - vendor/faiss/faiss/gpu_metal/MetalResources.h
241
244
  - vendor/faiss/faiss/gpu_metal/StandardMetalResources.h
245
+ - vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h
242
246
  - vendor/faiss/faiss/impl/AdSampling.cpp
243
247
  - vendor/faiss/faiss/impl/AdSampling.h
244
248
  - vendor/faiss/faiss/impl/AdditiveQuantizer.cpp
@@ -365,6 +369,7 @@ files:
365
369
  - vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h
366
370
  - vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h
367
371
  - vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp
372
+ - vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h
368
373
  - vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp
369
374
  - vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp
370
375
  - vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h
@@ -374,6 +379,8 @@ files:
374
379
  - vendor/faiss/faiss/impl/scalar_quantizer/scanners.h
375
380
  - vendor/faiss/faiss/impl/scalar_quantizer/similarities.h
376
381
  - vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp
382
+ - vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h
383
+ - vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp
377
384
  - vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp
378
385
  - vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h
379
386
  - vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp
@@ -453,8 +460,10 @@ files:
453
460
  - vendor/faiss/faiss/utils/hamming_distance/common.h
454
461
  - vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp
455
462
  - vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp
463
+ - vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp
456
464
  - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h
457
465
  - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h
466
+ - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h
458
467
  - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h
459
468
  - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h
460
469
  - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h
@@ -489,6 +498,7 @@ files:
489
498
  - vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h
490
499
  - vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp
491
500
  - vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp
501
+ - vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp
492
502
  - vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp
493
503
  - vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp
494
504
  - vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h
@@ -521,7 +531,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
521
531
  - !ruby/object:Gem::Version
522
532
  version: '0'
523
533
  requirements: []
524
- rubygems_version: 4.0.10
534
+ rubygems_version: 4.0.14
525
535
  specification_version: 4
526
536
  summary: Efficient similarity search and clustering for Ruby
527
537
  test_files: []