faiss 0.6.1 → 0.6.2
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/Index.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
- data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
- data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
- data/vendor/faiss/faiss/factory_tools.cpp +4 -0
- data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
- data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
- data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
- data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
- data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
- data/vendor/faiss/faiss/impl/HNSW.h +51 -13
- data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
- data/vendor/faiss/faiss/impl/Panorama.h +11 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
- data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
- data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
- data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
- data/vendor/faiss/faiss/impl/io_macros.h +25 -0
- data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
- data/vendor/faiss/faiss/index_factory.cpp +5 -1
- data/vendor/faiss/faiss/index_io.h +16 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
- data/vendor/faiss/faiss/utils/bf16.h +34 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
- data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
- metadata +12 -2
|
@@ -35,7 +35,8 @@ struct IndexSVSVamanaLeanVec : IndexSVSVamana {
|
|
|
35
35
|
size_t degree,
|
|
36
36
|
MetricType metric = METRIC_L2,
|
|
37
37
|
size_t leanvec_dims = 0,
|
|
38
|
-
SVSStorageKind storage = SVSStorageKind::SVS_LeanVec4x4
|
|
38
|
+
SVSStorageKind storage = SVSStorageKind::SVS_LeanVec4x4,
|
|
39
|
+
bool is_static = false);
|
|
39
40
|
|
|
40
41
|
~IndexSVSVamanaLeanVec() override;
|
|
41
42
|
|
|
@@ -66,7 +67,7 @@ struct IndexSVSVamanaLeanVec : IndexSVSVamana {
|
|
|
66
67
|
svs_runtime::LeanVecTrainingData* training_data{nullptr};
|
|
67
68
|
|
|
68
69
|
protected:
|
|
69
|
-
void create_impl() override;
|
|
70
|
+
void create_impl(idx_t n, const float* x) override;
|
|
70
71
|
};
|
|
71
72
|
|
|
72
73
|
} // namespace faiss
|
|
@@ -7,8 +7,13 @@
|
|
|
7
7
|
|
|
8
8
|
#pragma once
|
|
9
9
|
|
|
10
|
+
#include <cstddef>
|
|
10
11
|
#include <cstdint>
|
|
11
12
|
|
|
13
|
+
#if defined(__AVX512F__) || defined(__AVX512BF16__)
|
|
14
|
+
#include <immintrin.h>
|
|
15
|
+
#endif
|
|
16
|
+
|
|
12
17
|
namespace faiss {
|
|
13
18
|
|
|
14
19
|
namespace {
|
|
@@ -33,4 +38,33 @@ inline float decode_bf16(const uint16_t v) {
|
|
|
33
38
|
return fp.as_f32;
|
|
34
39
|
}
|
|
35
40
|
|
|
41
|
+
inline void encode_bf16_simd(const float* src, uint16_t* dst, size_t n) {
|
|
42
|
+
size_t i = 0;
|
|
43
|
+
#ifdef __AVX512BF16__
|
|
44
|
+
for (; i + 16 <= n; i += 16) {
|
|
45
|
+
__m512 v = _mm512_loadu_ps(src + i);
|
|
46
|
+
__m256bh encoded = _mm512_cvtneps_pbh(v);
|
|
47
|
+
_mm256_storeu_epi16(dst + i, (__m256i)encoded);
|
|
48
|
+
}
|
|
49
|
+
#endif
|
|
50
|
+
for (; i < n; i++) {
|
|
51
|
+
dst[i] = encode_bf16(src[i]);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
inline void decode_bf16_simd(const uint16_t* src, float* dst, size_t n) {
|
|
56
|
+
size_t i = 0;
|
|
57
|
+
#if defined(__AVX512F__)
|
|
58
|
+
for (; i + 16 <= n; i += 16) {
|
|
59
|
+
__m256i v = _mm256_loadu_si256((const __m256i*)(src + i));
|
|
60
|
+
__m512i w = _mm512_cvtepu16_epi32(v);
|
|
61
|
+
w = _mm512_slli_epi32(w, 16);
|
|
62
|
+
_mm512_storeu_ps(dst + i, _mm512_castsi512_ps(w));
|
|
63
|
+
}
|
|
64
|
+
#endif
|
|
65
|
+
for (; i < n; i++) {
|
|
66
|
+
dst[i] = decode_bf16(src[i]);
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
|
|
36
70
|
} // namespace faiss
|
|
@@ -146,7 +146,7 @@ void hammings(
|
|
|
146
146
|
size_t nb,
|
|
147
147
|
size_t ncodes,
|
|
148
148
|
hamdis_t* __restrict dis) {
|
|
149
|
-
|
|
149
|
+
with_simd_level_a0_spr([&]<SIMDLevel SL>() {
|
|
150
150
|
hammings_fixSL<SL>(a, b, na, nb, ncodes, dis);
|
|
151
151
|
});
|
|
152
152
|
}
|
|
@@ -170,7 +170,7 @@ void hammings_knn_hc(
|
|
|
170
170
|
int order,
|
|
171
171
|
ApproxTopK_mode_t approx_topk_mode,
|
|
172
172
|
const faiss::IDSelector* sel) {
|
|
173
|
-
|
|
173
|
+
with_simd_level_a0_spr([&]<SIMDLevel SL>() {
|
|
174
174
|
hammings_knn_hc_fixSL<SL>(
|
|
175
175
|
ha, a, b, nb, ncodes, order, approx_topk_mode, sel);
|
|
176
176
|
});
|
|
@@ -186,7 +186,7 @@ void hammings_knn_mc(
|
|
|
186
186
|
int32_t* __restrict distances,
|
|
187
187
|
int64_t* __restrict labels,
|
|
188
188
|
const faiss::IDSelector* sel) {
|
|
189
|
-
|
|
189
|
+
with_simd_level_a0_spr([&]<SIMDLevel SL>() {
|
|
190
190
|
hammings_knn_mc_fixSL<SL>(
|
|
191
191
|
a, b, na, nb, k, ncodes, distances, labels, sel);
|
|
192
192
|
});
|
|
@@ -201,7 +201,7 @@ void hamming_range_search(
|
|
|
201
201
|
size_t code_size,
|
|
202
202
|
RangeSearchResult* result,
|
|
203
203
|
const faiss::IDSelector* sel) {
|
|
204
|
-
|
|
204
|
+
with_simd_level_a0_spr([&]<SIMDLevel SL>() {
|
|
205
205
|
hamming_range_search_fixSL<SL>(
|
|
206
206
|
a, b, na, nb, radius, code_size, result, sel);
|
|
207
207
|
});
|
|
@@ -215,7 +215,7 @@ void hamming_count_thres(
|
|
|
215
215
|
hamdis_t ht,
|
|
216
216
|
size_t ncodes,
|
|
217
217
|
size_t* nptr) {
|
|
218
|
-
|
|
218
|
+
with_simd_level_a0_spr([&]<SIMDLevel SL>() {
|
|
219
219
|
hamming_count_thres_fixSL<SL>(bs1, bs2, n1, n2, ht, ncodes, nptr);
|
|
220
220
|
});
|
|
221
221
|
}
|
|
@@ -226,7 +226,7 @@ void crosshamming_count_thres(
|
|
|
226
226
|
hamdis_t ht,
|
|
227
227
|
size_t ncodes,
|
|
228
228
|
size_t* nptr) {
|
|
229
|
-
|
|
229
|
+
with_simd_level_a0_spr([&]<SIMDLevel SL>() {
|
|
230
230
|
crosshamming_count_thres_fixSL<SL>(dbs, n, ht, ncodes, nptr);
|
|
231
231
|
});
|
|
232
232
|
}
|
|
@@ -240,7 +240,7 @@ size_t match_hamming_thres(
|
|
|
240
240
|
size_t ncodes,
|
|
241
241
|
int64_t* idx,
|
|
242
242
|
hamdis_t* dis) {
|
|
243
|
-
return
|
|
243
|
+
return with_simd_level_a0_spr([&]<SIMDLevel SL>() -> size_t {
|
|
244
244
|
return match_hamming_thres_fixSL<SL>(
|
|
245
245
|
bs1, bs2, n1, n2, ht, ncodes, idx, dis);
|
|
246
246
|
});
|
|
@@ -253,7 +253,7 @@ void generalized_hammings_knn_hc(
|
|
|
253
253
|
size_t nb,
|
|
254
254
|
size_t code_size,
|
|
255
255
|
int ordered) {
|
|
256
|
-
|
|
256
|
+
with_simd_level_a0_spr([&]<SIMDLevel SL>() {
|
|
257
257
|
generalized_hammings_knn_hc_fixSL<SL>(ha, a, b, nb, code_size, ordered);
|
|
258
258
|
});
|
|
259
259
|
}
|
|
@@ -8,8 +8,9 @@
|
|
|
8
8
|
#ifdef COMPILE_SIMD_AVX2
|
|
9
9
|
|
|
10
10
|
#define THE_SIMD_LEVEL SIMDLevel::AVX2
|
|
11
|
-
//
|
|
11
|
+
// NOLINTBEGIN(facebook-hte-InlineHeader,facebook-unused-include-check)
|
|
12
12
|
#include <faiss/utils/hamming_distance/hamming_computer-avx2.h>
|
|
13
13
|
#include <faiss/utils/hamming_distance/hamming_impl.h>
|
|
14
|
+
// NOLINTEND(facebook-hte-InlineHeader,facebook-unused-include-check)
|
|
14
15
|
|
|
15
16
|
#endif // COMPILE_SIMD_AVX2
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#ifdef COMPILE_SIMD_AVX512_SPR
|
|
9
|
+
|
|
10
|
+
#define THE_SIMD_LEVEL SIMDLevel::AVX512_SPR
|
|
11
|
+
// NOLINTNEXTLINE(facebook-hte-InlineHeader)
|
|
12
|
+
#include <faiss/utils/hamming_distance/hamming_computer-avx512_spr.h>
|
|
13
|
+
#include <faiss/utils/hamming_distance/hamming_impl.h>
|
|
14
|
+
|
|
15
|
+
#endif // COMPILE_SIMD_AVX512_SPR
|
|
@@ -10,10 +10,10 @@
|
|
|
10
10
|
|
|
11
11
|
// AVX512 HammingComputer and GenHammingComputer specializations.
|
|
12
12
|
// Types without custom AVX512 code inherit from the NONE specializations
|
|
13
|
-
// in hamming_computer-generic.h.
|
|
14
|
-
//
|
|
15
|
-
//
|
|
16
|
-
// leverage SSE/AVX2 intrinsics.
|
|
13
|
+
// in hamming_computer-generic.h. HammingComputer64 and
|
|
14
|
+
// HammingComputerDefault use scalar popcount here; the VPOPCNTDQ fast
|
|
15
|
+
// path lives in hamming_computer-avx512_spr.h (AVX512_SPR level).
|
|
16
|
+
// GenHammingComputer classes leverage SSE/AVX2 intrinsics.
|
|
17
17
|
|
|
18
18
|
#include <cassert>
|
|
19
19
|
#include <cstdint>
|
|
@@ -74,18 +74,10 @@ struct HammingComputer64_tpl<SIMDLevel::AVX512> {
|
|
|
74
74
|
|
|
75
75
|
inline int hamming(const uint8_t* b8) const {
|
|
76
76
|
const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
|
|
77
|
-
#ifdef __AVX512VPOPCNTDQ__
|
|
78
|
-
__m512i vxor =
|
|
79
|
-
_mm512_xor_si512(_mm512_loadu_si512(a), _mm512_loadu_si512(b));
|
|
80
|
-
__m512i vpcnt = _mm512_popcnt_epi64(vxor);
|
|
81
|
-
// reduce performs better than adding the lower and higher parts
|
|
82
|
-
return _mm512_reduce_add_epi32(vpcnt);
|
|
83
|
-
#else
|
|
84
77
|
return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
|
|
85
78
|
popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) +
|
|
86
79
|
popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) +
|
|
87
80
|
popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7);
|
|
88
|
-
#endif
|
|
89
81
|
}
|
|
90
82
|
|
|
91
83
|
inline static constexpr int get_code_size() {
|
|
@@ -112,27 +104,11 @@ struct HammingComputerDefault_tpl<SIMDLevel::AVX512> {
|
|
|
112
104
|
}
|
|
113
105
|
|
|
114
106
|
int hamming(const uint8_t* b8) const {
|
|
115
|
-
int accu = 0;
|
|
116
|
-
|
|
117
107
|
const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
|
|
118
108
|
const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
|
|
119
109
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
int quotient64 = quotient8 / 8;
|
|
123
|
-
for (; i < quotient64; ++i) {
|
|
124
|
-
__m512i vxor = _mm512_xor_si512(
|
|
125
|
-
_mm512_loadu_si512(&a64[i * 8]),
|
|
126
|
-
_mm512_loadu_si512(&b64[i * 8]));
|
|
127
|
-
__m512i vpcnt = _mm512_popcnt_epi64(vxor);
|
|
128
|
-
// reduce performs better than adding the lower and higher parts
|
|
129
|
-
accu += _mm512_reduce_add_epi32(vpcnt);
|
|
130
|
-
}
|
|
131
|
-
i *= 8;
|
|
132
|
-
#endif
|
|
133
|
-
accu += hamming_popcount_tail(
|
|
134
|
-
a64, b64, i, quotient8, a8, b8, remainder8);
|
|
135
|
-
return accu;
|
|
110
|
+
return hamming_popcount_tail(
|
|
111
|
+
a64, b64, 0, quotient8, a8, b8, remainder8);
|
|
136
112
|
}
|
|
137
113
|
|
|
138
114
|
inline int get_code_size() const {
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#ifndef HAMMING_COMPUTER_AVX512_SPR_H
|
|
9
|
+
#define HAMMING_COMPUTER_AVX512_SPR_H
|
|
10
|
+
|
|
11
|
+
// AVX512_SPR HammingComputer specializations using VPOPCNTDQ.
|
|
12
|
+
// On Sapphire Rapids+, _mm512_popcnt_epi64 (and _mm256_popcnt_epi64 with VL)
|
|
13
|
+
// are unconditionally available. This gives a faster path than the scalar
|
|
14
|
+
// popcount fallback used in the base AVX512 specializations when compiled
|
|
15
|
+
// without -mavx512vpopcntdq.
|
|
16
|
+
|
|
17
|
+
#include <cassert>
|
|
18
|
+
#include <cstdint>
|
|
19
|
+
|
|
20
|
+
#include <faiss/impl/platform_macros.h>
|
|
21
|
+
#include <faiss/utils/hamming_distance/hamming_computer-avx512.h>
|
|
22
|
+
|
|
23
|
+
#include <immintrin.h>
|
|
24
|
+
|
|
25
|
+
namespace faiss {
|
|
26
|
+
|
|
27
|
+
/***************************************************************************
|
|
28
|
+
* AVX512_SPR inheriting specializations for types without custom SPR code.
|
|
29
|
+
***************************************************************************/
|
|
30
|
+
|
|
31
|
+
#define FAISS_INHERIT_HAMMING_SPR(Class) \
|
|
32
|
+
template <> \
|
|
33
|
+
struct Class## \
|
|
34
|
+
_tpl<SIMDLevel::AVX512_SPR> : Class##_tpl<SIMDLevel::AVX512> { \
|
|
35
|
+
using Class##_tpl<SIMDLevel::AVX512>::Class##_tpl; \
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
FAISS_INHERIT_HAMMING_SPR(HammingComputer16);
|
|
39
|
+
FAISS_INHERIT_HAMMING_SPR(HammingComputer20);
|
|
40
|
+
FAISS_INHERIT_HAMMING_SPR(GenHammingComputer8);
|
|
41
|
+
FAISS_INHERIT_HAMMING_SPR(GenHammingComputer16);
|
|
42
|
+
FAISS_INHERIT_HAMMING_SPR(GenHammingComputer32);
|
|
43
|
+
FAISS_INHERIT_HAMMING_SPR(GenHammingComputerM8);
|
|
44
|
+
|
|
45
|
+
#undef FAISS_INHERIT_HAMMING_SPR
|
|
46
|
+
|
|
47
|
+
/***************************************************************************
|
|
48
|
+
* Custom AVX512_SPR specializations using VPOPCNTDQ.
|
|
49
|
+
***************************************************************************/
|
|
50
|
+
|
|
51
|
+
template <>
|
|
52
|
+
struct HammingComputer32_tpl<SIMDLevel::AVX512_SPR> {
|
|
53
|
+
const uint8_t* a8;
|
|
54
|
+
|
|
55
|
+
HammingComputer32_tpl() {}
|
|
56
|
+
|
|
57
|
+
HammingComputer32_tpl(const uint8_t* a8_in, int code_size) {
|
|
58
|
+
set(a8_in, code_size);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
void set(const uint8_t* a8_in, FAISS_MAYBE_UNUSED int code_size) {
|
|
62
|
+
assert(code_size == 32);
|
|
63
|
+
a8 = a8_in;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
inline int hamming(const uint8_t* b8) const {
|
|
67
|
+
__m256i va = _mm256_loadu_si256((const __m256i*)a8);
|
|
68
|
+
__m256i vb = _mm256_loadu_si256((const __m256i*)b8);
|
|
69
|
+
__m256i vxor = _mm256_xor_si256(va, vb);
|
|
70
|
+
__m256i vpcnt = _mm256_popcnt_epi64(vxor);
|
|
71
|
+
__m128i lo = _mm256_castsi256_si128(vpcnt);
|
|
72
|
+
__m128i hi = _mm256_extracti128_si256(vpcnt, 1);
|
|
73
|
+
__m128i sum = _mm_add_epi64(lo, hi);
|
|
74
|
+
return static_cast<int>(
|
|
75
|
+
_mm_extract_epi64(sum, 0) + _mm_extract_epi64(sum, 1));
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
inline static constexpr int get_code_size() {
|
|
79
|
+
return 32;
|
|
80
|
+
}
|
|
81
|
+
};
|
|
82
|
+
|
|
83
|
+
template <>
|
|
84
|
+
struct HammingComputer64_tpl<SIMDLevel::AVX512_SPR> {
|
|
85
|
+
const uint8_t* a8;
|
|
86
|
+
|
|
87
|
+
HammingComputer64_tpl() {}
|
|
88
|
+
|
|
89
|
+
HammingComputer64_tpl(const uint8_t* a8_in, int code_size) {
|
|
90
|
+
set(a8_in, code_size);
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
void set(const uint8_t* a8_in, FAISS_MAYBE_UNUSED int code_size) {
|
|
94
|
+
assert(code_size == 64);
|
|
95
|
+
a8 = a8_in;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
inline int hamming(const uint8_t* b8) const {
|
|
99
|
+
__m512i vxor = _mm512_xor_si512(
|
|
100
|
+
_mm512_loadu_si512(a8), _mm512_loadu_si512(b8));
|
|
101
|
+
__m512i vpcnt = _mm512_popcnt_epi64(vxor);
|
|
102
|
+
return _mm512_reduce_add_epi32(vpcnt);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
inline static constexpr int get_code_size() {
|
|
106
|
+
return 64;
|
|
107
|
+
}
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
template <>
|
|
111
|
+
struct HammingComputerDefault_tpl<SIMDLevel::AVX512_SPR> {
|
|
112
|
+
const uint8_t* a8;
|
|
113
|
+
int quotient8;
|
|
114
|
+
int remainder8;
|
|
115
|
+
|
|
116
|
+
HammingComputerDefault_tpl() {}
|
|
117
|
+
|
|
118
|
+
HammingComputerDefault_tpl(const uint8_t* a8_in, int code_size) {
|
|
119
|
+
set(a8_in, code_size);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
void set(const uint8_t* a8_2, int code_size) {
|
|
123
|
+
this->a8 = a8_2;
|
|
124
|
+
quotient8 = code_size / 8;
|
|
125
|
+
remainder8 = code_size % 8;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
int hamming(const uint8_t* b8) const {
|
|
129
|
+
int accu = 0;
|
|
130
|
+
|
|
131
|
+
const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
|
|
132
|
+
const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
|
|
133
|
+
|
|
134
|
+
int i = 0;
|
|
135
|
+
int quotient64 = quotient8 / 8;
|
|
136
|
+
for (; i < quotient64; ++i) {
|
|
137
|
+
__m512i vxor = _mm512_xor_si512(
|
|
138
|
+
_mm512_loadu_si512(&a64[i * 8]),
|
|
139
|
+
_mm512_loadu_si512(&b64[i * 8]));
|
|
140
|
+
__m512i vpcnt = _mm512_popcnt_epi64(vxor);
|
|
141
|
+
accu += _mm512_reduce_add_epi32(vpcnt);
|
|
142
|
+
}
|
|
143
|
+
i *= 8;
|
|
144
|
+
|
|
145
|
+
// Handle 4-word (256-bit) remainder with VPOPCNTDQ VL
|
|
146
|
+
if (i + 4 <= quotient8) {
|
|
147
|
+
__m256i vxor = _mm256_xor_si256(
|
|
148
|
+
_mm256_loadu_si256((const __m256i*)&a64[i]),
|
|
149
|
+
_mm256_loadu_si256((const __m256i*)&b64[i]));
|
|
150
|
+
__m256i vpcnt = _mm256_popcnt_epi64(vxor);
|
|
151
|
+
__m128i lo = _mm256_castsi256_si128(vpcnt);
|
|
152
|
+
__m128i hi = _mm256_extracti128_si256(vpcnt, 1);
|
|
153
|
+
__m128i sum = _mm_add_epi64(lo, hi);
|
|
154
|
+
accu += static_cast<int>(
|
|
155
|
+
_mm_extract_epi64(sum, 0) + _mm_extract_epi64(sum, 1));
|
|
156
|
+
i += 4;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
accu += hamming_popcount_tail(
|
|
160
|
+
a64, b64, i, quotient8, a8, b8, remainder8);
|
|
161
|
+
return accu;
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
inline int get_code_size() const {
|
|
165
|
+
return quotient8 * 8 + remainder8;
|
|
166
|
+
}
|
|
167
|
+
};
|
|
168
|
+
|
|
169
|
+
} // namespace faiss
|
|
170
|
+
|
|
171
|
+
#endif
|
|
@@ -592,39 +592,12 @@ simd16uint16 accu8to16(simd32uint8 a8) {
|
|
|
592
592
|
return hadd(a8_0, a8_1);
|
|
593
593
|
}
|
|
594
594
|
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
0,
|
|
600
|
-
4,
|
|
601
|
-
64,
|
|
602
|
-
0,
|
|
603
|
-
0,
|
|
604
|
-
0,
|
|
605
|
-
0,
|
|
606
|
-
1,
|
|
607
|
-
16,
|
|
608
|
-
0,
|
|
609
|
-
0,
|
|
610
|
-
4,
|
|
611
|
-
64,
|
|
612
|
-
1,
|
|
613
|
-
16,
|
|
614
|
-
0,
|
|
615
|
-
0,
|
|
616
|
-
4,
|
|
617
|
-
64,
|
|
618
|
-
0,
|
|
619
|
-
0,
|
|
620
|
-
0,
|
|
621
|
-
0,
|
|
622
|
-
1,
|
|
623
|
-
16,
|
|
624
|
-
0,
|
|
625
|
-
0,
|
|
626
|
-
4,
|
|
627
|
-
64>();
|
|
595
|
+
// Lookup table held as a plain byte array in .rodata. Storing it as a
|
|
596
|
+
// `simd32uint8` global would emit an AVX2 initializer into `.init_array` that
|
|
597
|
+
// runs at dlopen, before runtime SIMD dispatch, and SIGILLs on non-AVX2 CPUs
|
|
598
|
+
alignas(32) static const uint8_t shifts[32] = {
|
|
599
|
+
1, 16, 0, 0, 4, 64, 0, 0, 0, 0, 1, 16, 0, 0, 4, 64,
|
|
600
|
+
1, 16, 0, 0, 4, 64, 0, 0, 0, 0, 1, 16, 0, 0, 4, 64};
|
|
628
601
|
|
|
629
602
|
// 2-bit accumulator: we can add only up to 3 elements
|
|
630
603
|
// on output we return 2*4-bit results
|
|
@@ -644,7 +617,8 @@ void compute_accu2(
|
|
|
644
617
|
v = pp(v);
|
|
645
618
|
// 0x800 -> force second half of table
|
|
646
619
|
simd16uint16 idx = v | (v << 8) | simd16uint16(0x800);
|
|
647
|
-
a2 += simd16uint16(
|
|
620
|
+
a2 += simd16uint16(
|
|
621
|
+
simd32uint8(shifts).lookup_2_lanes(simd32uint8(idx)));
|
|
648
622
|
}
|
|
649
623
|
a4lo += a2 & mask2;
|
|
650
624
|
a4hi += (a2 >> 2) & mask2;
|
|
@@ -694,39 +668,11 @@ simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) {
|
|
|
694
668
|
* 16 bins
|
|
695
669
|
************************************************************/
|
|
696
670
|
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
4,
|
|
701
|
-
8,
|
|
702
|
-
16,
|
|
703
|
-
32,
|
|
704
|
-
64,
|
|
705
|
-
128,
|
|
706
|
-
1,
|
|
707
|
-
2,
|
|
708
|
-
4,
|
|
709
|
-
8,
|
|
710
|
-
16,
|
|
711
|
-
32,
|
|
712
|
-
64,
|
|
713
|
-
128,
|
|
714
|
-
1,
|
|
715
|
-
2,
|
|
716
|
-
4,
|
|
717
|
-
8,
|
|
718
|
-
16,
|
|
719
|
-
32,
|
|
720
|
-
64,
|
|
721
|
-
128,
|
|
722
|
-
1,
|
|
723
|
-
2,
|
|
724
|
-
4,
|
|
725
|
-
8,
|
|
726
|
-
16,
|
|
727
|
-
32,
|
|
728
|
-
64,
|
|
729
|
-
128>();
|
|
671
|
+
// See the note on `shifts` above: kept as a .rodata byte array so its
|
|
672
|
+
// initializer does not emit AVX2 into `.init_array`
|
|
673
|
+
alignas(32) static const uint8_t shifts2[32] = {
|
|
674
|
+
1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
|
|
675
|
+
1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
|
|
730
676
|
|
|
731
677
|
simd32uint8 shiftr_16(simd32uint8 x, int n) {
|
|
732
678
|
return simd32uint8(simd16uint16(x) >> n);
|
|
@@ -754,7 +700,7 @@ void compute_accu2_16(
|
|
|
754
700
|
v = pp(v);
|
|
755
701
|
|
|
756
702
|
simd16uint16 idx = v | (v << 8);
|
|
757
|
-
simd32uint8 a1 = shifts2.lookup_2_lanes(simd32uint8(idx));
|
|
703
|
+
simd32uint8 a1 = simd32uint8(shifts2).lookup_2_lanes(simd32uint8(idx));
|
|
758
704
|
// contains 0s for out-of-bounds elements
|
|
759
705
|
|
|
760
706
|
simd16uint16 lt8 = (v >> 3) == simd16uint16(0);
|