faiss 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/AutoTune.cpp +39 -29
  5. data/vendor/faiss/faiss/Clustering.cpp +4 -2
  6. data/vendor/faiss/faiss/IVFlib.cpp +14 -7
  7. data/vendor/faiss/faiss/Index.h +72 -3
  8. data/vendor/faiss/faiss/Index2Layer.cpp +2 -4
  9. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +0 -1
  10. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +1 -0
  11. data/vendor/faiss/faiss/IndexBinary.h +46 -3
  12. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +118 -4
  13. data/vendor/faiss/faiss/IndexBinaryHNSW.h +41 -0
  14. data/vendor/faiss/faiss/IndexBinaryHash.cpp +0 -1
  15. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +18 -7
  16. data/vendor/faiss/faiss/IndexBinaryIVF.h +5 -1
  17. data/vendor/faiss/faiss/IndexFlat.cpp +6 -4
  18. data/vendor/faiss/faiss/IndexHNSW.cpp +65 -24
  19. data/vendor/faiss/faiss/IndexHNSW.h +10 -1
  20. data/vendor/faiss/faiss/IndexIDMap.cpp +96 -18
  21. data/vendor/faiss/faiss/IndexIDMap.h +20 -0
  22. data/vendor/faiss/faiss/IndexIVF.cpp +28 -10
  23. data/vendor/faiss/faiss/IndexIVF.h +16 -1
  24. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -16
  25. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +18 -6
  26. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +33 -21
  27. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +16 -6
  28. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +24 -15
  29. data/vendor/faiss/faiss/IndexIVFFastScan.h +4 -2
  30. data/vendor/faiss/faiss/IndexIVFFlat.cpp +59 -43
  31. data/vendor/faiss/faiss/IndexIVFFlat.h +10 -2
  32. data/vendor/faiss/faiss/IndexIVFPQ.cpp +16 -3
  33. data/vendor/faiss/faiss/IndexIVFPQ.h +8 -1
  34. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +14 -6
  35. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +2 -1
  36. data/vendor/faiss/faiss/IndexIVFPQR.cpp +14 -4
  37. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  38. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +28 -3
  39. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +8 -1
  40. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +9 -2
  41. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  42. data/vendor/faiss/faiss/IndexLattice.cpp +8 -4
  43. data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -7
  44. data/vendor/faiss/faiss/IndexNSG.cpp +3 -3
  45. data/vendor/faiss/faiss/IndexPQ.cpp +0 -1
  46. data/vendor/faiss/faiss/IndexPQ.h +1 -0
  47. data/vendor/faiss/faiss/IndexPQFastScan.cpp +0 -2
  48. data/vendor/faiss/faiss/IndexPreTransform.cpp +4 -2
  49. data/vendor/faiss/faiss/IndexRefine.cpp +11 -6
  50. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +16 -4
  51. data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -3
  52. data/vendor/faiss/faiss/IndexShards.cpp +7 -6
  53. data/vendor/faiss/faiss/MatrixStats.cpp +16 -8
  54. data/vendor/faiss/faiss/MetaIndexes.cpp +12 -6
  55. data/vendor/faiss/faiss/MetricType.h +5 -3
  56. data/vendor/faiss/faiss/clone_index.cpp +2 -4
  57. data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +6 -0
  58. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +9 -4
  59. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +32 -10
  60. data/vendor/faiss/faiss/gpu/GpuIndex.h +88 -0
  61. data/vendor/faiss/faiss/gpu/GpuIndexBinaryCagra.h +125 -0
  62. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +39 -4
  63. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +3 -3
  64. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -1
  65. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +3 -2
  66. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +41 -0
  67. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +6 -3
  68. data/vendor/faiss/faiss/impl/HNSW.cpp +34 -19
  69. data/vendor/faiss/faiss/impl/IDSelector.cpp +2 -1
  70. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +2 -3
  71. data/vendor/faiss/faiss/impl/NNDescent.cpp +17 -9
  72. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +42 -21
  73. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +6 -24
  74. data/vendor/faiss/faiss/impl/ResultHandler.h +56 -47
  75. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +28 -15
  76. data/vendor/faiss/faiss/impl/index_read.cpp +36 -11
  77. data/vendor/faiss/faiss/impl/index_write.cpp +19 -6
  78. data/vendor/faiss/faiss/impl/io.cpp +9 -5
  79. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +18 -11
  80. data/vendor/faiss/faiss/impl/mapped_io.cpp +4 -7
  81. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +0 -1
  82. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +0 -1
  83. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +6 -6
  84. data/vendor/faiss/faiss/impl/zerocopy_io.cpp +1 -1
  85. data/vendor/faiss/faiss/impl/zerocopy_io.h +2 -2
  86. data/vendor/faiss/faiss/index_factory.cpp +49 -33
  87. data/vendor/faiss/faiss/index_factory.h +8 -2
  88. data/vendor/faiss/faiss/index_io.h +0 -3
  89. data/vendor/faiss/faiss/invlists/DirectMap.cpp +2 -1
  90. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +12 -6
  91. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +8 -4
  92. data/vendor/faiss/faiss/utils/Heap.cpp +15 -8
  93. data/vendor/faiss/faiss/utils/Heap.h +23 -12
  94. data/vendor/faiss/faiss/utils/distances.cpp +42 -21
  95. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
  96. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +1 -1
  97. data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -3
  98. data/vendor/faiss/faiss/utils/extra_distances-inl.h +27 -4
  99. data/vendor/faiss/faiss/utils/extra_distances.cpp +8 -4
  100. data/vendor/faiss/faiss/utils/hamming.cpp +20 -10
  101. data/vendor/faiss/faiss/utils/partitioning.cpp +8 -4
  102. data/vendor/faiss/faiss/utils/quantize_lut.cpp +17 -9
  103. data/vendor/faiss/faiss/utils/rabitq_simd.h +539 -0
  104. data/vendor/faiss/faiss/utils/random.cpp +14 -7
  105. data/vendor/faiss/faiss/utils/utils.cpp +0 -3
  106. metadata +5 -2
data/vendor/faiss/faiss/utils/rabitq_simd.h ADDED
@@ -0,0 +1,539 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+// Only include x86 SIMD intrinsics on x86/x86_64 architectures
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+        defined(_M_IX86)
+#include <immintrin.h>
+#endif
+
+namespace faiss {
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+        defined(_M_IX86)
+/**
+ * Returns the lookup table for AVX512 popcount operations.
+ * This table is used for lookup-based popcount implementation.
+ *
+ * @return Lookup table as __m512i register
+ */
+inline __m512i get_lookup_512() {
+    return _mm512_set_epi8(
+            /* f */ 4,
+            /* e */ 3,
+            /* d */ 3,
+            /* c */ 2,
+            /* b */ 3,
+            /* a */ 2,
+            /* 9 */ 2,
+            /* 8 */ 1,
+            /* 7 */ 3,
+            /* 6 */ 2,
+            /* 5 */ 2,
+            /* 4 */ 1,
+            /* 3 */ 2,
+            /* 2 */ 1,
+            /* 1 */ 1,
+            /* 0 */ 0,
+            /* f */ 4,
+            /* e */ 3,
+            /* d */ 3,
+            /* c */ 2,
+            /* b */ 3,
+            /* a */ 2,
+            /* 9 */ 2,
+            /* 8 */ 1,
+            /* 7 */ 3,
+            /* 6 */ 2,
+            /* 5 */ 2,
+            /* 4 */ 1,
+            /* 3 */ 2,
+            /* 2 */ 1,
+            /* 1 */ 1,
+            /* 0 */ 0,
+            /* f */ 4,
+            /* e */ 3,
+            /* d */ 3,
+            /* c */ 2,
+            /* b */ 3,
+            /* a */ 2,
+            /* 9 */ 2,
+            /* 8 */ 1,
+            /* 7 */ 3,
+            /* 6 */ 2,
+            /* 5 */ 2,
+            /* 4 */ 1,
+            /* 3 */ 2,
+            /* 2 */ 1,
+            /* 1 */ 1,
+            /* 0 */ 0,
+            /* f */ 4,
+            /* e */ 3,
+            /* d */ 3,
+            /* c */ 2,
+            /* b */ 3,
+            /* a */ 2,
+            /* 9 */ 2,
+            /* 8 */ 1,
+            /* 7 */ 3,
+            /* 6 */ 2,
+            /* 5 */ 2,
+            /* 4 */ 1,
+            /* 3 */ 2,
+            /* 2 */ 1,
+            /* 1 */ 1,
+            /* 0 */ 0);
+}
+
+/**
+ * Returns the lookup table for AVX2 popcount operations.
+ * This table is used for lookup-based popcount implementation.
+ *
+ * @return Lookup table as __m256i register
+ */
+inline __m256i get_lookup_256() {
+    return _mm256_setr_epi8(
+            /* 0 */ 0,
+            /* 1 */ 1,
+            /* 2 */ 1,
+            /* 3 */ 2,
+            /* 4 */ 1,
+            /* 5 */ 2,
+            /* 6 */ 2,
+            /* 7 */ 3,
+            /* 8 */ 1,
+            /* 9 */ 2,
+            /* a */ 2,
+            /* b */ 3,
+            /* c */ 2,
+            /* d */ 3,
+            /* e */ 3,
+            /* f */ 4,
+            /* 0 */ 0,
+            /* 1 */ 1,
+            /* 2 */ 1,
+            /* 3 */ 2,
+            /* 4 */ 1,
+            /* 5 */ 2,
+            /* 6 */ 2,
+            /* 7 */ 3,
+            /* 8 */ 1,
+            /* 9 */ 2,
+            /* a */ 2,
+            /* b */ 3,
+            /* c */ 2,
+            /* d */ 3,
+            /* e */ 3,
+            /* f */ 4);
+}
+
+/**
+ * Performs lookup-based popcount on AVX512 registers.
+ *
+ * @param v_and Input vector to count bits in
+ * @return Vector with popcount results
+ */
+inline __m512i popcount_lookup_avx512(__m512i v_and) {
+    const __m512i lookup = get_lookup_512();
+    const __m512i low_mask = _mm512_set1_epi8(0x0f);
+
+    const __m512i lo = _mm512_and_si512(v_and, low_mask);
+    const __m512i hi = _mm512_and_si512(_mm512_srli_epi16(v_and, 4), low_mask);
+    const __m512i popcnt1 = _mm512_shuffle_epi8(lookup, lo);
+    const __m512i popcnt2 = _mm512_shuffle_epi8(lookup, hi);
+    return _mm512_add_epi8(popcnt1, popcnt2);
+}
+
+/**
+ * Performs lookup-based popcount on AVX2 registers.
+ *
+ * @param v_and Input vector to count bits in
+ * @return Vector with popcount results
+ */
+inline __m256i popcount_lookup_avx2(__m256i v_and) {
+    const __m256i lookup = get_lookup_256();
+    const __m256i low_mask = _mm256_set1_epi8(0x0f);
+
+    const __m256i lo = _mm256_and_si256(v_and, low_mask);
+    const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v_and, 4), low_mask);
+    const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo);
+    const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi);
+    return _mm256_add_epi8(popcnt1, popcnt2);
+}
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VPOPCNTDQ__)
+
+/**
+ * AVX512-optimized version of dot product computation between query and binary
+ * data. Requires AVX512F and AVX512VPOPCNTDQ instruction sets.
+ *
+ * @param query Pointer to rearranged rotated query data
+ * @param binary_data Pointer to binary data
+ * @param d Dimension
+ * @param qb Number of quantization bits
+ * @return Dot product result as float
+ */
+inline float rabitq_dp_popcnt_avx512(
+        const uint8_t* query,
+        const uint8_t* binary_data,
+        size_t d,
+        size_t qb) {
+    __m512i sum_512 = _mm512_setzero_si512();
+
+    const size_t di_8b = (d + 7) / 8;
+
+    const size_t d_512 = (d / 512) * 512;
+    const size_t d_256 = (d / 256) * 256;
+    const size_t d_128 = (d / 128) * 128;
+
+    for (size_t i = 0; i < d_512; i += 512) {
+        __m512i v_x = _mm512_loadu_si512((const __m512i*)(binary_data + i / 8));
+        for (size_t j = 0; j < qb; j++) {
+            __m512i v_q = _mm512_loadu_si512(
+                    (const __m512i*)(query + j * di_8b + i / 8));
+            __m512i v_and = _mm512_and_si512(v_q, v_x);
+            __m512i v_popcnt = _mm512_popcnt_epi32(v_and);
+            sum_512 = _mm512_add_epi32(sum_512, _mm512_slli_epi32(v_popcnt, j));
+        }
+    }
+
+    __m256i sum_256 = _mm256_add_epi32(
+            _mm512_extracti32x8_epi32(sum_512, 0),
+            _mm512_extracti32x8_epi32(sum_512, 1));
+
+    if (d_256 != d_512) {
+        __m256i v_x =
+                _mm256_loadu_si256((const __m256i*)(binary_data + d_512 / 8));
+        for (size_t j = 0; j < qb; j++) {
+            __m256i v_q = _mm256_loadu_si256(
+                    (const __m256i*)(query + j * di_8b + d_512 / 8));
+            __m256i v_and = _mm256_and_si256(v_q, v_x);
+            __m256i v_popcnt = _mm256_popcnt_epi32(v_and);
+            sum_256 = _mm256_add_epi32(sum_256, _mm256_slli_epi32(v_popcnt, j));
+        }
+    }
+
+    __m128i sum_128 = _mm_add_epi32(
+            _mm256_extracti32x4_epi32(sum_256, 0),
+            _mm256_extracti32x4_epi32(sum_256, 1));
+
+    if (d_128 != d_256) {
+        __m128i v_x =
+                _mm_loadu_si128((const __m128i*)(binary_data + d_256 / 8));
+        for (size_t j = 0; j < qb; j++) {
+            __m128i v_q = _mm_loadu_si128(
+                    (const __m128i*)(query + j * di_8b + d_256 / 8));
+            __m128i v_and = _mm_and_si128(v_q, v_x);
+            __m128i v_popcnt = _mm_popcnt_epi32(v_and);
+            sum_128 = _mm_add_epi32(sum_128, _mm_slli_epi32(v_popcnt, j));
+        }
+    }
+
+    if (d != d_128) {
+        const size_t leftovers = d - d_128;
+        const __mmask16 mask = (1 << ((leftovers + 7) / 8)) - 1;
+
+        __m128i v_x = _mm_maskz_loadu_epi8(
+                mask, (const __m128i*)(binary_data + d_128 / 8));
+        for (size_t j = 0; j < qb; j++) {
+            __m128i v_q = _mm_maskz_loadu_epi8(
+                    mask, (const __m128i*)(query + j * di_8b + d_128 / 8));
+            __m128i v_and = _mm_and_si128(v_q, v_x);
+            __m128i v_popcnt = _mm_popcnt_epi32(v_and);
+            sum_128 = _mm_add_epi32(sum_128, _mm_slli_epi32(v_popcnt, j));
+        }
+    }
+
+    int sum_64le = 0;
+    sum_64le += _mm_extract_epi32(sum_128, 0);
+    sum_64le += _mm_extract_epi32(sum_128, 1);
+    sum_64le += _mm_extract_epi32(sum_128, 2);
+    sum_64le += _mm_extract_epi32(sum_128, 3);
+
+    return static_cast<float>(sum_64le);
+}
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512VPOPCNTDQ__)
+/**
+ * AVX512-optimized version of dot product computation between query and binary
+ * data. Uses AVX512F instructions but does not require AVX512VPOPCNTDQ.
+ *
+ * @param query Pointer to rearranged rotated query data
+ * @param binary_data Pointer to binary data
+ * @param d Dimension
+ * @param qb Number of quantization bits
+ * @return Dot product result as float
+ */
+inline float rabitq_dp_popcnt_avx512_fallback(
+        const uint8_t* query,
+        const uint8_t* binary_data,
+        size_t d,
+        size_t qb) {
+    const size_t di_8b = (d + 7) / 8;
+    const size_t d_512 = (d / 512) * 512;
+    const size_t d_256 = (d / 256) * 256;
+    const size_t d_128 = (d / 128) * 128;
+
+    // Use the lookup-based popcount helper function
+
+    __m512i sum_512 = _mm512_setzero_si512();
+
+    // Process 512 bits (64 bytes) at a time using lookup-based popcount
+    for (size_t i = 0; i < d_512; i += 512) {
+        __m512i v_x = _mm512_loadu_si512((const __m512i*)(binary_data + i / 8));
+        for (size_t j = 0; j < qb; j++) {
+            __m512i v_q = _mm512_loadu_si512(
+                    (const __m512i*)(query + j * di_8b + i / 8));
+            __m512i v_and = _mm512_and_si512(v_q, v_x);
+
+            // Use the popcount_lookup_avx512 helper function
+            __m512i v_popcnt = popcount_lookup_avx512(v_and);
+
+            // Sum bytes to 32-bit integers
+            __m512i v_sad = _mm512_sad_epu8(v_popcnt, _mm512_setzero_si512());
+
+            // Shift by j and add to sum
+            __m512i v_shifted = _mm512_slli_epi64(v_sad, j);
+            sum_512 = _mm512_add_epi64(sum_512, v_shifted);
+        }
+    }
+
+    // Handle 256-bit section if needed
+    __m256i sum_256 = _mm256_setzero_si256();
+    if (d_256 != d_512) {
+        __m256i v_x =
+                _mm256_loadu_si256((const __m256i*)(binary_data + d_512 / 8));
+        for (size_t j = 0; j < qb; j++) {
+            __m256i v_q = _mm256_loadu_si256(
+                    (const __m256i*)(query + j * di_8b + d_512 / 8));
+            __m256i v_and = _mm256_and_si256(v_q, v_x);
+
+            // Use the popcount_lookup_avx2 helper function
+            __m256i v_popcnt = popcount_lookup_avx2(v_and);
+
+            // Sum bytes to 64-bit integers
+            __m256i v_sad = _mm256_sad_epu8(v_popcnt, _mm256_setzero_si256());
+
+            // Shift by j and add to sum
+            __m256i v_shifted = _mm256_slli_epi64(v_sad, j);
+            sum_256 = _mm256_add_epi64(sum_256, v_shifted);
+        }
+    }
+
+    // Handle 128-bit section and leftovers
+    __m128i sum_128 = _mm_setzero_si128();
+    if (d_128 != d_256) {
+        __m128i v_x =
+                _mm_loadu_si128((const __m128i*)(binary_data + d_256 / 8));
+        for (size_t j = 0; j < qb; j++) {
+            __m128i v_q = _mm_loadu_si128(
+                    (const __m128i*)(query + j * di_8b + d_256 / 8));
+            __m128i v_and = _mm_and_si128(v_q, v_x);
+
+            // Scalar popcount for each 64-bit lane
+            uint64_t lane0 = _mm_extract_epi64(v_and, 0);
+            uint64_t lane1 = _mm_extract_epi64(v_and, 1);
+            uint64_t pop0 = __builtin_popcountll(lane0) << j;
+            uint64_t pop1 = __builtin_popcountll(lane1) << j;
+            sum_128 = _mm_add_epi64(sum_128, _mm_set_epi64x(pop1, pop0));
+        }
+    }
+
+    // Handle remaining bytes (less than 16)
+    uint64_t sum_leftover = 0;
+    size_t d_leftover = d - d_128;
+    if (d_leftover > 0) {
+        for (size_t j = 0; j < qb; j++) {
+            for (size_t k = 0; k < (d_leftover + 7) / 8; ++k) {
+                uint8_t qv = query[j * di_8b + d_128 / 8 + k];
+                uint8_t yv = binary_data[d_128 / 8 + k];
+                sum_leftover += (__builtin_popcount(qv & yv) << j);
+            }
+        }
+    }
+
+    // Horizontal sum of all lanes
+    uint64_t sum = 0;
+
+    // Sum from 512-bit registers
+    alignas(64) uint64_t lanes512[8];
+    _mm512_store_si512((__m512i*)lanes512, sum_512);
+    for (int i = 0; i < 8; ++i) {
+        sum += lanes512[i];
+    }
+
+    // Sum from 256-bit registers
+    alignas(32) uint64_t lanes256[4];
+    _mm256_store_si256((__m256i*)lanes256, sum_256);
+    for (int i = 0; i < 4; ++i) {
+        sum += lanes256[i];
+    }
+
+    // Sum from 128-bit registers
+    alignas(16) uint64_t lanes128[2];
+    _mm_store_si128((__m128i*)lanes128, sum_128);
+    sum += lanes128[0] + lanes128[1];
+
+    // Add leftovers
+    sum += sum_leftover;
+
+    return static_cast<float>(sum);
+}
+#endif
+
+#ifdef __AVX2__
+
+/**
+ * AVX2-optimized version of dot product computation between query and binary
+ * data.
+ *
+ * @param query Pointer to rearranged rotated query data
+ * @param binary_data Pointer to binary data
+ * @param d Dimension
+ * @param qb Number of quantization bits
+ * @return Dot product result as float
+ */
+
+inline float rabitq_dp_popcnt_avx2(
+        const uint8_t* query,
+        const uint8_t* binary_data,
+        size_t d,
+        size_t qb) {
+    const size_t di_8b = (d + 7) / 8;
+    const size_t d_256 = (d / 256) * 256;
+    const size_t d_128 = (d / 128) * 128;
+
+    // Use the lookup-based popcount helper function
+
+    __m256i sum_256 = _mm256_setzero_si256();
+
+    // Process 256 bits (32 bytes) at a time using lookup-based popcount
+    for (size_t i = 0; i < d_256; i += 256) {
+        __m256i v_x = _mm256_loadu_si256((const __m256i*)(binary_data + i / 8));
+        for (size_t j = 0; j < qb; j++) {
+            __m256i v_q = _mm256_loadu_si256(
+                    (const __m256i*)(query + j * di_8b + i / 8));
+            __m256i v_and = _mm256_and_si256(v_q, v_x);
+
+            // Use the popcount_lookup_avx2 helper function
+            __m256i v_popcnt = popcount_lookup_avx2(v_and);
+
+            // Convert byte counts to 64-bit lanes and shift by j
+            __m256i v_sad = _mm256_sad_epu8(v_popcnt, _mm256_setzero_si256());
+            __m256i v_shifted = _mm256_slli_epi64(v_sad, static_cast<int>(j));
+            sum_256 = _mm256_add_epi64(sum_256, v_shifted);
+        }
+    }
+
+    // Handle leftovers with 128-bit SIMD
+    __m128i sum_128 = _mm_setzero_si128();
+    if (d_128 != d_256) {
+        __m128i v_x =
+                _mm_loadu_si128((const __m128i*)(binary_data + d_256 / 8));
+        for (size_t j = 0; j < qb; j++) {
+            __m128i v_q = _mm_loadu_si128(
+                    (const __m128i*)(query + j * di_8b + d_256 / 8));
+            __m128i v_and = _mm_and_si128(v_q, v_x);
+            // Scalar popcount for each 64-bit lane
+            uint64_t lane0 = _mm_extract_epi64(v_and, 0);
+            uint64_t lane1 = _mm_extract_epi64(v_and, 1);
+            uint64_t pop0 = __builtin_popcountll(lane0) << j;
+            uint64_t pop1 = __builtin_popcountll(lane1) << j;
+            sum_128 = _mm_add_epi64(sum_128, _mm_set_epi64x(pop1, pop0));
+        }
+    }
+
+    // Handle remaining bytes (less than 16)
+    uint64_t sum_leftover = 0;
+    size_t d_leftover = d - d_128;
+    if (d_leftover > 0) {
+        for (size_t j = 0; j < qb; j++) {
+            for (size_t k = 0; k < (d_leftover + 7) / 8; ++k) {
+                uint8_t qv = query[j * di_8b + d_128 / 8 + k];
+                uint8_t yv = binary_data[d_128 / 8 + k];
+                sum_leftover += (__builtin_popcount(qv & yv) << j);
+            }
+        }
+    }
+
+    // Horizontal sum of all lanes
+    uint64_t sum = 0;
+    // sum_256: 4 lanes of 64 bits
+    alignas(32) uint64_t lanes[4];
+    _mm256_store_si256((__m256i*)lanes, sum_256);
+    for (int i = 0; i < 4; ++i) {
+        sum += lanes[i];
+    }
+    // sum_128: 2 lanes of 64 bits
+    alignas(16) uint64_t lanes128[2];
+    _mm_store_si128((__m128i*)lanes128, sum_128);
+    sum += lanes128[0] + lanes128[1];
+    // leftovers
+    sum += sum_leftover;
+
+    return static_cast<float>(sum);
+}
+#endif
+
+/**
+ * Compute dot product between query and binary data using popcount operations.
+ *
+ * @param query Pointer to rearranged rotated query data
+ * @param binary_data Pointer to binary data
+ * @param d Dimension
+ * @param qb Number of quantization bits
+ * @return Dot product result as float
+ */
+inline float rabitq_dp_popcnt(
+        const uint8_t* query,
+        const uint8_t* binary_data,
+        size_t d,
+        size_t qb) {
+#if defined(__AVX512F__) && defined(__AVX512VPOPCNTDQ__)
+    return rabitq_dp_popcnt_avx512(query, binary_data, d, qb);
+#elif defined(__AVX512F__)
+    return rabitq_dp_popcnt_avx512_fallback(query, binary_data, d, qb);
+#elif defined(__AVX2__)
+    return rabitq_dp_popcnt_avx2(query, binary_data, d, qb);
+#else
+    const size_t di_8b = (d + 7) / 8;
+    const size_t di_64b = (di_8b / 8) * 8;
+
+    uint64_t dot_qo = 0;
+    for (size_t j = 0; j < qb; j++) {
+        const uint8_t* query_j = query + j * di_8b;
+
+        // process 64-bit popcounts
+        uint64_t count_dot = 0;
+        for (size_t i = 0; i < di_64b; i += 8) {
+            const auto qv = *(const uint64_t*)(query_j + i);
+            const auto yv = *(const uint64_t*)(binary_data + i);
+            count_dot += __builtin_popcountll(qv & yv);
+        }
+
+        // process leftovers
+        for (size_t i = di_64b; i < di_8b; i++) {
+            const auto qv = *(query_j + i);
+            const auto yv = *(binary_data + i);
+            count_dot += __builtin_popcount(qv & yv);
+        }
+
+        dot_qo += (count_dot << j);
+    }
+
+    return static_cast<float>(dot_qo);
+#endif
+}
+
+} // namespace faiss
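
A note for readers of the new rabitq_simd.h above: all four code paths compute the same quantity. The quantized query is stored as qb bit-planes of ceil(d/8) bytes each (the "rearranged rotated query data" of the doc comments), and the dot product against a d-bit binary vector is the popcount of each plane ANDed with the data, weighted by 2^j — exactly what the portable #else branch of rabitq_dp_popcnt does 64 bits at a time. A minimal scalar sketch of that contract (illustrative only, not part of the diff; the name rabitq_dp_reference is hypothetical):

    #include <bit>     // std::popcount (C++20)
    #include <cstddef>
    #include <cstdint>

    // Reference semantics: dot = sum over bit-planes j of
    // 2^j * popcount(query_plane_j AND binary_data).
    inline float rabitq_dp_reference(
            const uint8_t* query,       // qb planes, ceil(d/8) bytes per plane
            const uint8_t* binary_data, // d bits packed into ceil(d/8) bytes
            size_t d,
            size_t qb) {
        const size_t plane_bytes = (d + 7) / 8;
        uint64_t dot = 0;
        for (size_t j = 0; j < qb; j++) {
            uint64_t plane_dot = 0;
            for (size_t i = 0; i < plane_bytes; i++) {
                // bits set in both the query plane and the data byte
                plane_dot += std::popcount(static_cast<uint8_t>(
                        query[j * plane_bytes + i] & binary_data[i]));
            }
            dot += plane_dot << j; // plane j carries weight 2^j
        }
        return static_cast<float>(dot);
    }

The SIMD variants differ only in how the popcounts are obtained: a native vector popcount where AVX512VPOPCNTDQ is available, otherwise the nibble-lookup trick — split each byte into two 4-bit halves, look each half up in get_lookup_512/get_lookup_256, and add the two counts — on plain AVX512F or AVX2.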
data/vendor/faiss/faiss/utils/random.cpp CHANGED
@@ -106,8 +106,9 @@ void float_rand(float* x, size_t n, int64_t seed) {
         const size_t istart = j * n / nblock;
         const size_t iend = (j + 1) * n / nblock;
 
-        for (size_t i = istart; i < iend; i++)
+        for (size_t i = istart; i < iend; i++) {
             x[i] = rng.rand_float();
+        }
     }
 }
 
@@ -137,8 +138,9 @@ void float_randn(float* x, size_t n, int64_t seed) {
                     s = a * a + b * b;
                 } while (s >= 1.0);
                 x[i] = a * sqrt(-2.0 * log(s) / s);
-            } else
+            } else {
                 x[i] = b * sqrt(-2.0 * log(s) / s);
+            }
             state = 1 - state;
         }
     }
@@ -158,8 +160,9 @@ void int64_rand(int64_t* x, size_t n, int64_t seed) {
 
         const size_t istart = j * n / nblock;
         const size_t iend = (j + 1) * n / nblock;
-        for (size_t i = istart; i < iend; i++)
+        for (size_t i = istart; i < iend; i++) {
             x[i] = rng.rand_int64();
+        }
     }
 }
 
@@ -176,14 +179,16 @@ void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed) {
 
         const size_t istart = j * n / nblock;
         const size_t iend = (j + 1) * n / nblock;
-        for (size_t i = istart; i < iend; i++)
+        for (size_t i = istart; i < iend; i++) {
             x[i] = rng.rand_int64() % max;
+        }
     }
 }
 
 void rand_perm(int* perm, size_t n, int64_t seed) {
-    for (size_t i = 0; i < n; i++)
+    for (size_t i = 0; i < n; i++) {
         perm[i] = i;
+    }
 
     RandomGenerator rng(seed);
 
@@ -194,8 +199,9 @@ void rand_perm(int* perm, size_t n, int64_t seed) {
 }
 
 void rand_perm_splitmix64(int* perm, size_t n, int64_t seed) {
-    for (size_t i = 0; i < n; i++)
+    for (size_t i = 0; i < n; i++) {
         perm[i] = i;
+    }
 
     SplitMix64RandomGenerator rng(seed);
 
@@ -220,8 +226,9 @@ void byte_rand(uint8_t* x, size_t n, int64_t seed) {
         const size_t iend = (j + 1) * n / nblock;
 
         size_t i;
-        for (i = istart; i < iend; i++)
+        for (i = istart; i < iend; i++) {
             x[i] = rng.rand_int64();
+        }
     }
 }
 
data/vendor/faiss/faiss/utils/utils.cpp CHANGED
@@ -15,8 +15,6 @@
 #include <cstdio>
 #include <cstring>
 
-#include <sys/types.h>
-
 #ifdef _MSC_VER
 #define NOMINMAX
 #include <windows.h>
@@ -35,7 +33,6 @@
 
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
-#include <faiss/impl/platform_macros.h>
 #include <faiss/utils/random.h>
 
 #ifndef FINTEGER
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: faiss
 version: !ruby/object:Gem::Version
-  version: 0.4.1
+  version: 0.4.2
 platform: ruby
 authors:
 - Andrew Kane
@@ -178,6 +178,7 @@ files:
 - vendor/faiss/faiss/gpu/GpuFaissAssert.h
 - vendor/faiss/faiss/gpu/GpuIcmEncoder.h
 - vendor/faiss/faiss/gpu/GpuIndex.h
+- vendor/faiss/faiss/gpu/GpuIndexBinaryCagra.h
 - vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h
 - vendor/faiss/faiss/gpu/GpuIndexCagra.h
 - vendor/faiss/faiss/gpu/GpuIndexFlat.h
@@ -212,6 +213,7 @@ files:
 - vendor/faiss/faiss/gpu/test/TestUtils.cpp
 - vendor/faiss/faiss/gpu/test/TestUtils.h
 - vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp
+- vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h
 - vendor/faiss/faiss/gpu/utils/CuvsUtils.h
 - vendor/faiss/faiss/gpu/utils/DeviceUtils.h
 - vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp
@@ -344,6 +346,7 @@ files:
 - vendor/faiss/faiss/utils/prefetch.h
 - vendor/faiss/faiss/utils/quantize_lut.cpp
 - vendor/faiss/faiss/utils/quantize_lut.h
+- vendor/faiss/faiss/utils/rabitq_simd.h
 - vendor/faiss/faiss/utils/random.cpp
 - vendor/faiss/faiss/utils/random.h
 - vendor/faiss/faiss/utils/simdlib.h
@@ -376,7 +379,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.6.7
+rubygems_version: 3.6.9
 specification_version: 4
 summary: Efficient similarity search and clustering for Ruby
 test_files: []