faiss 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/Index.h +1 -1
  5. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
  6. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
  7. data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
  8. data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
  9. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
  10. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
  11. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
  12. data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
  13. data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
  14. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
  15. data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
  16. data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
  17. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
  18. data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
  19. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
  20. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
  21. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
  22. data/vendor/faiss/faiss/factory_tools.cpp +4 -0
  23. data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
  24. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
  25. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
  26. data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
  27. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
  28. data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
  29. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
  30. data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
  31. data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
  32. data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
  33. data/vendor/faiss/faiss/impl/HNSW.h +51 -13
  34. data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
  35. data/vendor/faiss/faiss/impl/Panorama.h +11 -0
  36. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
  37. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
  38. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
  39. data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
  40. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
  41. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
  42. data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
  43. data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
  44. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
  45. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
  46. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
  47. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
  48. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
  49. data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
  50. data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
  51. data/vendor/faiss/faiss/impl/io_macros.h +25 -0
  52. data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
  53. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
  54. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
  55. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
  56. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
  57. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
  58. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
  59. data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
  60. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
  61. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
  62. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
  63. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
  64. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
  65. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
  66. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
  67. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
  68. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
  69. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
  70. data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
  71. data/vendor/faiss/faiss/index_factory.cpp +5 -1
  72. data/vendor/faiss/faiss/index_io.h +16 -0
  73. data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
  74. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
  75. data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
  76. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
  77. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
  78. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
  79. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
  80. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
  81. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
  82. data/vendor/faiss/faiss/utils/bf16.h +34 -0
  83. data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
  84. data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
  85. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
  86. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
  87. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
  88. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
  89. data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
  90. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
  91. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
  92. data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
  93. metadata +12 -2
@@ -35,7 +35,8 @@ struct IndexSVSVamanaLeanVec : IndexSVSVamana {
35
35
  size_t degree,
36
36
  MetricType metric = METRIC_L2,
37
37
  size_t leanvec_dims = 0,
38
- SVSStorageKind storage = SVSStorageKind::SVS_LeanVec4x4);
38
+ SVSStorageKind storage = SVSStorageKind::SVS_LeanVec4x4,
39
+ bool is_static = false);
39
40
 
40
41
  ~IndexSVSVamanaLeanVec() override;
41
42
 
@@ -66,7 +67,7 @@ struct IndexSVSVamanaLeanVec : IndexSVSVamana {
66
67
  svs_runtime::LeanVecTrainingData* training_data{nullptr};
67
68
 
68
69
  protected:
69
- void create_impl() override;
70
+ void create_impl(idx_t n, const float* x) override;
70
71
  };
71
72
 
72
73
  } // namespace faiss
@@ -7,8 +7,13 @@
7
7
 
8
8
  #pragma once
9
9
 
10
+ #include <cstddef>
10
11
  #include <cstdint>
11
12
 
13
+ #if defined(__AVX512F__) || defined(__AVX512BF16__)
14
+ #include <immintrin.h>
15
+ #endif
16
+
12
17
  namespace faiss {
13
18
 
14
19
  namespace {
@@ -33,4 +38,33 @@ inline float decode_bf16(const uint16_t v) {
33
38
  return fp.as_f32;
34
39
  }
35
40
 
41
+ inline void encode_bf16_simd(const float* src, uint16_t* dst, size_t n) {
42
+ size_t i = 0;
43
+ #ifdef __AVX512BF16__
44
+ for (; i + 16 <= n; i += 16) {
45
+ __m512 v = _mm512_loadu_ps(src + i);
46
+ __m256bh encoded = _mm512_cvtneps_pbh(v);
47
+ _mm256_storeu_epi16(dst + i, (__m256i)encoded);
48
+ }
49
+ #endif
50
+ for (; i < n; i++) {
51
+ dst[i] = encode_bf16(src[i]);
52
+ }
53
+ }
54
+
55
+ inline void decode_bf16_simd(const uint16_t* src, float* dst, size_t n) {
56
+ size_t i = 0;
57
+ #if defined(__AVX512F__)
58
+ for (; i + 16 <= n; i += 16) {
59
+ __m256i v = _mm256_loadu_si256((const __m256i*)(src + i));
60
+ __m512i w = _mm512_cvtepu16_epi32(v);
61
+ w = _mm512_slli_epi32(w, 16);
62
+ _mm512_storeu_ps(dst + i, _mm512_castsi512_ps(w));
63
+ }
64
+ #endif
65
+ for (; i < n; i++) {
66
+ dst[i] = decode_bf16(src[i]);
67
+ }
68
+ }
69
+
36
70
  } // namespace faiss
@@ -9,7 +9,6 @@
9
9
 
10
10
  #include <faiss/utils/distances.h>
11
11
 
12
- #include <algorithm>
13
12
  #include <cmath>
14
13
  #include <cstdio>
15
14
  #include <cstring>
@@ -146,7 +146,7 @@ void hammings(
146
146
  size_t nb,
147
147
  size_t ncodes,
148
148
  hamdis_t* __restrict dis) {
149
- with_simd_level([&]<SIMDLevel SL>() {
149
+ with_simd_level_a0_spr([&]<SIMDLevel SL>() {
150
150
  hammings_fixSL<SL>(a, b, na, nb, ncodes, dis);
151
151
  });
152
152
  }
@@ -170,7 +170,7 @@ void hammings_knn_hc(
170
170
  int order,
171
171
  ApproxTopK_mode_t approx_topk_mode,
172
172
  const faiss::IDSelector* sel) {
173
- with_simd_level([&]<SIMDLevel SL>() {
173
+ with_simd_level_a0_spr([&]<SIMDLevel SL>() {
174
174
  hammings_knn_hc_fixSL<SL>(
175
175
  ha, a, b, nb, ncodes, order, approx_topk_mode, sel);
176
176
  });
@@ -186,7 +186,7 @@ void hammings_knn_mc(
186
186
  int32_t* __restrict distances,
187
187
  int64_t* __restrict labels,
188
188
  const faiss::IDSelector* sel) {
189
- with_simd_level([&]<SIMDLevel SL>() {
189
+ with_simd_level_a0_spr([&]<SIMDLevel SL>() {
190
190
  hammings_knn_mc_fixSL<SL>(
191
191
  a, b, na, nb, k, ncodes, distances, labels, sel);
192
192
  });
@@ -201,7 +201,7 @@ void hamming_range_search(
201
201
  size_t code_size,
202
202
  RangeSearchResult* result,
203
203
  const faiss::IDSelector* sel) {
204
- with_simd_level([&]<SIMDLevel SL>() {
204
+ with_simd_level_a0_spr([&]<SIMDLevel SL>() {
205
205
  hamming_range_search_fixSL<SL>(
206
206
  a, b, na, nb, radius, code_size, result, sel);
207
207
  });
@@ -215,7 +215,7 @@ void hamming_count_thres(
215
215
  hamdis_t ht,
216
216
  size_t ncodes,
217
217
  size_t* nptr) {
218
- with_simd_level([&]<SIMDLevel SL>() {
218
+ with_simd_level_a0_spr([&]<SIMDLevel SL>() {
219
219
  hamming_count_thres_fixSL<SL>(bs1, bs2, n1, n2, ht, ncodes, nptr);
220
220
  });
221
221
  }
@@ -226,7 +226,7 @@ void crosshamming_count_thres(
226
226
  hamdis_t ht,
227
227
  size_t ncodes,
228
228
  size_t* nptr) {
229
- with_simd_level([&]<SIMDLevel SL>() {
229
+ with_simd_level_a0_spr([&]<SIMDLevel SL>() {
230
230
  crosshamming_count_thres_fixSL<SL>(dbs, n, ht, ncodes, nptr);
231
231
  });
232
232
  }
@@ -240,7 +240,7 @@ size_t match_hamming_thres(
240
240
  size_t ncodes,
241
241
  int64_t* idx,
242
242
  hamdis_t* dis) {
243
- return with_simd_level([&]<SIMDLevel SL>() -> size_t {
243
+ return with_simd_level_a0_spr([&]<SIMDLevel SL>() -> size_t {
244
244
  return match_hamming_thres_fixSL<SL>(
245
245
  bs1, bs2, n1, n2, ht, ncodes, idx, dis);
246
246
  });
@@ -253,7 +253,7 @@ void generalized_hammings_knn_hc(
253
253
  size_t nb,
254
254
  size_t code_size,
255
255
  int ordered) {
256
- with_simd_level([&]<SIMDLevel SL>() {
256
+ with_simd_level_a0_spr([&]<SIMDLevel SL>() {
257
257
  generalized_hammings_knn_hc_fixSL<SL>(ha, a, b, nb, code_size, ordered);
258
258
  });
259
259
  }
@@ -8,8 +8,9 @@
8
8
  #ifdef COMPILE_SIMD_AVX2
9
9
 
10
10
  #define THE_SIMD_LEVEL SIMDLevel::AVX2
11
- // NOLINTNEXTLINE(facebook-hte-InlineHeader)
11
+ // NOLINTBEGIN(facebook-hte-InlineHeader,facebook-unused-include-check)
12
12
  #include <faiss/utils/hamming_distance/hamming_computer-avx2.h>
13
13
  #include <faiss/utils/hamming_distance/hamming_impl.h>
14
+ // NOLINTEND(facebook-hte-InlineHeader,facebook-unused-include-check)
14
15
 
15
16
  #endif // COMPILE_SIMD_AVX2
@@ -0,0 +1,15 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #ifdef COMPILE_SIMD_AVX512_SPR
9
+
10
+ #define THE_SIMD_LEVEL SIMDLevel::AVX512_SPR
11
+ // NOLINTNEXTLINE(facebook-hte-InlineHeader)
12
+ #include <faiss/utils/hamming_distance/hamming_computer-avx512_spr.h>
13
+ #include <faiss/utils/hamming_distance/hamming_impl.h>
14
+
15
+ #endif // COMPILE_SIMD_AVX512_SPR
@@ -10,10 +10,10 @@
10
10
 
11
11
  // AVX512 HammingComputer and GenHammingComputer specializations.
12
12
  // Types without custom AVX512 code inherit from the NONE specializations
13
- // in hamming_computer-generic.h. Custom specializations for
14
- // HammingComputer64 and HammingComputerDefault use _mm512_popcnt_epi64
15
- // when __AVX512VPOPCNTDQ__ is available. GenHammingComputer classes
16
- // leverage SSE/AVX2 intrinsics.
13
+ // in hamming_computer-generic.h. HammingComputer64 and
14
+ // HammingComputerDefault use scalar popcount here; the VPOPCNTDQ fast
15
+ // path lives in hamming_computer-avx512_spr.h (AVX512_SPR level).
16
+ // GenHammingComputer classes leverage SSE/AVX2 intrinsics.
17
17
 
18
18
  #include <cassert>
19
19
  #include <cstdint>
@@ -74,18 +74,10 @@ struct HammingComputer64_tpl<SIMDLevel::AVX512> {
74
74
 
75
75
  inline int hamming(const uint8_t* b8) const {
76
76
  const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
77
- #ifdef __AVX512VPOPCNTDQ__
78
- __m512i vxor =
79
- _mm512_xor_si512(_mm512_loadu_si512(a), _mm512_loadu_si512(b));
80
- __m512i vpcnt = _mm512_popcnt_epi64(vxor);
81
- // reduce performs better than adding the lower and higher parts
82
- return _mm512_reduce_add_epi32(vpcnt);
83
- #else
84
77
  return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
85
78
  popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) +
86
79
  popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) +
87
80
  popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7);
88
- #endif
89
81
  }
90
82
 
91
83
  inline static constexpr int get_code_size() {
@@ -112,27 +104,11 @@ struct HammingComputerDefault_tpl<SIMDLevel::AVX512> {
112
104
  }
113
105
 
114
106
  int hamming(const uint8_t* b8) const {
115
- int accu = 0;
116
-
117
107
  const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
118
108
  const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
119
109
 
120
- int i = 0;
121
- #ifdef __AVX512VPOPCNTDQ__
122
- int quotient64 = quotient8 / 8;
123
- for (; i < quotient64; ++i) {
124
- __m512i vxor = _mm512_xor_si512(
125
- _mm512_loadu_si512(&a64[i * 8]),
126
- _mm512_loadu_si512(&b64[i * 8]));
127
- __m512i vpcnt = _mm512_popcnt_epi64(vxor);
128
- // reduce performs better than adding the lower and higher parts
129
- accu += _mm512_reduce_add_epi32(vpcnt);
130
- }
131
- i *= 8;
132
- #endif
133
- accu += hamming_popcount_tail(
134
- a64, b64, i, quotient8, a8, b8, remainder8);
135
- return accu;
110
+ return hamming_popcount_tail(
111
+ a64, b64, 0, quotient8, a8, b8, remainder8);
136
112
  }
137
113
 
138
114
  inline int get_code_size() const {
@@ -0,0 +1,171 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #ifndef HAMMING_COMPUTER_AVX512_SPR_H
9
+ #define HAMMING_COMPUTER_AVX512_SPR_H
10
+
11
+ // AVX512_SPR HammingComputer specializations using VPOPCNTDQ.
12
+ // On Sapphire Rapids+, _mm512_popcnt_epi64 (and _mm256_popcnt_epi64 with VL)
13
+ // are unconditionally available. This gives a faster path than the scalar
14
+ // popcount fallback used in the base AVX512 specializations when compiled
15
+ // without -mavx512vpopcntdq.
16
+
17
+ #include <cassert>
18
+ #include <cstdint>
19
+
20
+ #include <faiss/impl/platform_macros.h>
21
+ #include <faiss/utils/hamming_distance/hamming_computer-avx512.h>
22
+
23
+ #include <immintrin.h>
24
+
25
+ namespace faiss {
26
+
27
+ /***************************************************************************
28
+ * AVX512_SPR inheriting specializations for types without custom SPR code.
29
+ ***************************************************************************/
30
+
31
+ #define FAISS_INHERIT_HAMMING_SPR(Class) \
32
+ template <> \
33
+ struct Class## \
34
+ _tpl<SIMDLevel::AVX512_SPR> : Class##_tpl<SIMDLevel::AVX512> { \
35
+ using Class##_tpl<SIMDLevel::AVX512>::Class##_tpl; \
36
+ }
37
+
38
+ FAISS_INHERIT_HAMMING_SPR(HammingComputer16);
39
+ FAISS_INHERIT_HAMMING_SPR(HammingComputer20);
40
+ FAISS_INHERIT_HAMMING_SPR(GenHammingComputer8);
41
+ FAISS_INHERIT_HAMMING_SPR(GenHammingComputer16);
42
+ FAISS_INHERIT_HAMMING_SPR(GenHammingComputer32);
43
+ FAISS_INHERIT_HAMMING_SPR(GenHammingComputerM8);
44
+
45
+ #undef FAISS_INHERIT_HAMMING_SPR
46
+
47
+ /***************************************************************************
48
+ * Custom AVX512_SPR specializations using VPOPCNTDQ.
49
+ ***************************************************************************/
50
+
51
+ template <>
52
+ struct HammingComputer32_tpl<SIMDLevel::AVX512_SPR> {
53
+ const uint8_t* a8;
54
+
55
+ HammingComputer32_tpl() {}
56
+
57
+ HammingComputer32_tpl(const uint8_t* a8_in, int code_size) {
58
+ set(a8_in, code_size);
59
+ }
60
+
61
+ void set(const uint8_t* a8_in, FAISS_MAYBE_UNUSED int code_size) {
62
+ assert(code_size == 32);
63
+ a8 = a8_in;
64
+ }
65
+
66
+ inline int hamming(const uint8_t* b8) const {
67
+ __m256i va = _mm256_loadu_si256((const __m256i*)a8);
68
+ __m256i vb = _mm256_loadu_si256((const __m256i*)b8);
69
+ __m256i vxor = _mm256_xor_si256(va, vb);
70
+ __m256i vpcnt = _mm256_popcnt_epi64(vxor);
71
+ __m128i lo = _mm256_castsi256_si128(vpcnt);
72
+ __m128i hi = _mm256_extracti128_si256(vpcnt, 1);
73
+ __m128i sum = _mm_add_epi64(lo, hi);
74
+ return static_cast<int>(
75
+ _mm_extract_epi64(sum, 0) + _mm_extract_epi64(sum, 1));
76
+ }
77
+
78
+ inline static constexpr int get_code_size() {
79
+ return 32;
80
+ }
81
+ };
82
+
83
+ template <>
84
+ struct HammingComputer64_tpl<SIMDLevel::AVX512_SPR> {
85
+ const uint8_t* a8;
86
+
87
+ HammingComputer64_tpl() {}
88
+
89
+ HammingComputer64_tpl(const uint8_t* a8_in, int code_size) {
90
+ set(a8_in, code_size);
91
+ }
92
+
93
+ void set(const uint8_t* a8_in, FAISS_MAYBE_UNUSED int code_size) {
94
+ assert(code_size == 64);
95
+ a8 = a8_in;
96
+ }
97
+
98
+ inline int hamming(const uint8_t* b8) const {
99
+ __m512i vxor = _mm512_xor_si512(
100
+ _mm512_loadu_si512(a8), _mm512_loadu_si512(b8));
101
+ __m512i vpcnt = _mm512_popcnt_epi64(vxor);
102
+ return _mm512_reduce_add_epi32(vpcnt);
103
+ }
104
+
105
+ inline static constexpr int get_code_size() {
106
+ return 64;
107
+ }
108
+ };
109
+
110
+ template <>
111
+ struct HammingComputerDefault_tpl<SIMDLevel::AVX512_SPR> {
112
+ const uint8_t* a8;
113
+ int quotient8;
114
+ int remainder8;
115
+
116
+ HammingComputerDefault_tpl() {}
117
+
118
+ HammingComputerDefault_tpl(const uint8_t* a8_in, int code_size) {
119
+ set(a8_in, code_size);
120
+ }
121
+
122
+ void set(const uint8_t* a8_2, int code_size) {
123
+ this->a8 = a8_2;
124
+ quotient8 = code_size / 8;
125
+ remainder8 = code_size % 8;
126
+ }
127
+
128
+ int hamming(const uint8_t* b8) const {
129
+ int accu = 0;
130
+
131
+ const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
132
+ const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
133
+
134
+ int i = 0;
135
+ int quotient64 = quotient8 / 8;
136
+ for (; i < quotient64; ++i) {
137
+ __m512i vxor = _mm512_xor_si512(
138
+ _mm512_loadu_si512(&a64[i * 8]),
139
+ _mm512_loadu_si512(&b64[i * 8]));
140
+ __m512i vpcnt = _mm512_popcnt_epi64(vxor);
141
+ accu += _mm512_reduce_add_epi32(vpcnt);
142
+ }
143
+ i *= 8;
144
+
145
+ // Handle 4-word (256-bit) remainder with VPOPCNTDQ VL
146
+ if (i + 4 <= quotient8) {
147
+ __m256i vxor = _mm256_xor_si256(
148
+ _mm256_loadu_si256((const __m256i*)&a64[i]),
149
+ _mm256_loadu_si256((const __m256i*)&b64[i]));
150
+ __m256i vpcnt = _mm256_popcnt_epi64(vxor);
151
+ __m128i lo = _mm256_castsi256_si128(vpcnt);
152
+ __m128i hi = _mm256_extracti128_si256(vpcnt, 1);
153
+ __m128i sum = _mm_add_epi64(lo, hi);
154
+ accu += static_cast<int>(
155
+ _mm_extract_epi64(sum, 0) + _mm_extract_epi64(sum, 1));
156
+ i += 4;
157
+ }
158
+
159
+ accu += hamming_popcount_tail(
160
+ a64, b64, i, quotient8, a8, b8, remainder8);
161
+ return accu;
162
+ }
163
+
164
+ inline int get_code_size() const {
165
+ return quotient8 * 8 + remainder8;
166
+ }
167
+ };
168
+
169
+ } // namespace faiss
170
+
171
+ #endif
@@ -18,8 +18,6 @@
18
18
  #include <faiss/utils/AlignedTable.h>
19
19
  #include <faiss/utils/ordered_key_value.h>
20
20
 
21
- #include <faiss/impl/platform_macros.h>
22
-
23
21
  namespace faiss {
24
22
 
25
23
  /******************************************************************
@@ -592,39 +592,12 @@ simd16uint16 accu8to16(simd32uint8 a8) {
592
592
  return hadd(a8_0, a8_1);
593
593
  }
594
594
 
595
- static const simd32uint8 shifts = simd32uint8::create<
596
- 1,
597
- 16,
598
- 0,
599
- 0,
600
- 4,
601
- 64,
602
- 0,
603
- 0,
604
- 0,
605
- 0,
606
- 1,
607
- 16,
608
- 0,
609
- 0,
610
- 4,
611
- 64,
612
- 1,
613
- 16,
614
- 0,
615
- 0,
616
- 4,
617
- 64,
618
- 0,
619
- 0,
620
- 0,
621
- 0,
622
- 1,
623
- 16,
624
- 0,
625
- 0,
626
- 4,
627
- 64>();
595
+ // Lookup table held as a plain byte array in .rodata. Storing it as a
596
+ // `simd32uint8` global would emit an AVX2 initializer into `.init_array` that
597
+ // runs at dlopen, before runtime SIMD dispatch, and SIGILLs on non-AVX2 CPUs.
598
+ alignas(32) static const uint8_t shifts[32] = {
599
+ 1, 16, 0, 0, 4, 64, 0, 0, 0, 0, 1, 16, 0, 0, 4, 64,
600
+ 1, 16, 0, 0, 4, 64, 0, 0, 0, 0, 1, 16, 0, 0, 4, 64};
628
601
 
629
602
  // 2-bit accumulator: we can add only up to 3 elements
630
603
  // on output we return 2*4-bit results
@@ -644,7 +617,8 @@ void compute_accu2(
644
617
  v = pp(v);
645
618
  // 0x800 -> force second half of table
646
619
  simd16uint16 idx = v | (v << 8) | simd16uint16(0x800);
647
- a2 += simd16uint16(shifts.lookup_2_lanes(simd32uint8(idx)));
620
+ a2 += simd16uint16(
621
+ simd32uint8(shifts).lookup_2_lanes(simd32uint8(idx)));
648
622
  }
649
623
  a4lo += a2 & mask2;
650
624
  a4hi += (a2 >> 2) & mask2;
@@ -694,39 +668,11 @@ simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) {
694
668
  * 16 bins
695
669
  ************************************************************/
696
670
 
697
- static const simd32uint8 shifts2 = simd32uint8::create<
698
- 1,
699
- 2,
700
- 4,
701
- 8,
702
- 16,
703
- 32,
704
- 64,
705
- 128,
706
- 1,
707
- 2,
708
- 4,
709
- 8,
710
- 16,
711
- 32,
712
- 64,
713
- 128,
714
- 1,
715
- 2,
716
- 4,
717
- 8,
718
- 16,
719
- 32,
720
- 64,
721
- 128,
722
- 1,
723
- 2,
724
- 4,
725
- 8,
726
- 16,
727
- 32,
728
- 64,
729
- 128>();
671
+ // See the note on `shifts` above: kept as a .rodata byte array so its
672
+ // initializer does not emit AVX2 into `.init_array`.
673
+ alignas(32) static const uint8_t shifts2[32] = {
674
+ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
675
+ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
730
676
 
731
677
  simd32uint8 shiftr_16(simd32uint8 x, int n) {
732
678
  return simd32uint8(simd16uint16(x) >> n);
@@ -754,7 +700,7 @@ void compute_accu2_16(
754
700
  v = pp(v);
755
701
 
756
702
  simd16uint16 idx = v | (v << 8);
757
- simd32uint8 a1 = shifts2.lookup_2_lanes(simd32uint8(idx));
703
+ simd32uint8 a1 = simd32uint8(shifts2).lookup_2_lanes(simd32uint8(idx));
758
704
  // contains 0s for out-of-bounds elements
759
705
 
760
706
  simd16uint16 lt8 = (v >> 3) == simd16uint16(0);