faiss 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/LICENSE.txt +1 -1
- data/ext/faiss/extconf.rb +1 -1
- data/ext/faiss/index.cpp +13 -0
- data/lib/faiss/version.rb +1 -1
- data/lib/faiss.rb +2 -2
- data/vendor/faiss/faiss/AutoTune.cpp +15 -4
- data/vendor/faiss/faiss/AutoTune.h +0 -1
- data/vendor/faiss/faiss/Clustering.cpp +1 -5
- data/vendor/faiss/faiss/Clustering.h +0 -2
- data/vendor/faiss/faiss/IVFlib.h +0 -2
- data/vendor/faiss/faiss/Index.h +1 -2
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +17 -3
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +10 -1
- data/vendor/faiss/faiss/IndexBinary.h +0 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +2 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -0
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +1 -3
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +273 -48
- data/vendor/faiss/faiss/IndexBinaryIVF.h +18 -11
- data/vendor/faiss/faiss/IndexFastScan.cpp +13 -10
- data/vendor/faiss/faiss/IndexFastScan.h +5 -1
- data/vendor/faiss/faiss/IndexFlat.cpp +16 -3
- data/vendor/faiss/faiss/IndexFlat.h +1 -1
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +5 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +7 -2
- data/vendor/faiss/faiss/IndexHNSW.cpp +3 -6
- data/vendor/faiss/faiss/IndexHNSW.h +0 -1
- data/vendor/faiss/faiss/IndexIDMap.cpp +4 -4
- data/vendor/faiss/faiss/IndexIDMap.h +0 -2
- data/vendor/faiss/faiss/IndexIVF.cpp +155 -129
- data/vendor/faiss/faiss/IndexIVF.h +121 -61
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +12 -11
- data/vendor/faiss/faiss/IndexIVFFastScan.h +6 -1
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +221 -165
- data/vendor/faiss/faiss/IndexIVFPQ.h +1 -0
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +6 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +0 -2
- data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -2
- data/vendor/faiss/faiss/IndexNNDescent.h +0 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +7 -9
- data/vendor/faiss/faiss/IndexRefine.cpp +1 -1
- data/vendor/faiss/faiss/IndexReplicas.cpp +3 -4
- data/vendor/faiss/faiss/IndexReplicas.h +0 -1
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +8 -1
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +7 -0
- data/vendor/faiss/faiss/IndexShards.cpp +26 -109
- data/vendor/faiss/faiss/IndexShards.h +2 -3
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +246 -0
- data/vendor/faiss/faiss/IndexShardsIVF.h +42 -0
- data/vendor/faiss/faiss/MetaIndexes.cpp +86 -0
- data/vendor/faiss/faiss/MetaIndexes.h +29 -0
- data/vendor/faiss/faiss/MetricType.h +14 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +8 -10
- data/vendor/faiss/faiss/VectorTransform.h +1 -3
- data/vendor/faiss/faiss/clone_index.cpp +232 -18
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +25 -3
- data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +7 -0
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +78 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +20 -6
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +7 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +21 -7
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +7 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +7 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +10 -3
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +7 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +11 -3
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +25 -2
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +76 -29
- data/vendor/faiss/faiss/gpu/GpuCloner.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +14 -13
- data/vendor/faiss/faiss/gpu/GpuDistance.h +18 -6
- data/vendor/faiss/faiss/gpu/GpuIndex.h +23 -21
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +10 -10
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -12
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +29 -50
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +3 -3
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +8 -8
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +4 -4
- data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +2 -5
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +9 -7
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +4 -4
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +2 -2
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +1 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +55 -6
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +20 -6
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +95 -25
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +67 -16
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +4 -4
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +7 -7
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +4 -4
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +0 -7
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +9 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +2 -7
- data/vendor/faiss/faiss/impl/CodePacker.cpp +67 -0
- data/vendor/faiss/faiss/impl/CodePacker.h +71 -0
- data/vendor/faiss/faiss/impl/DistanceComputer.h +0 -2
- data/vendor/faiss/faiss/impl/HNSW.cpp +3 -7
- data/vendor/faiss/faiss/impl/HNSW.h +6 -9
- data/vendor/faiss/faiss/impl/IDSelector.cpp +1 -1
- data/vendor/faiss/faiss/impl/IDSelector.h +39 -1
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +62 -51
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +11 -12
- data/vendor/faiss/faiss/impl/NNDescent.cpp +3 -9
- data/vendor/faiss/faiss/impl/NNDescent.h +10 -10
- data/vendor/faiss/faiss/impl/NSG.cpp +1 -6
- data/vendor/faiss/faiss/impl/NSG.h +4 -7
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +1 -15
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +11 -10
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +0 -7
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -12
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -4
- data/vendor/faiss/faiss/impl/Quantizer.h +6 -3
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +796 -174
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +16 -8
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +3 -5
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +4 -4
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +3 -3
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +4 -4
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +291 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +74 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +123 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +102 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +13 -10
- data/vendor/faiss/faiss/impl/index_write.cpp +3 -4
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +0 -1
- data/vendor/faiss/faiss/impl/kmeans1d.h +3 -3
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
- data/vendor/faiss/faiss/impl/platform_macros.h +61 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +48 -4
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +18 -4
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +2 -2
- data/vendor/faiss/faiss/index_factory.cpp +8 -10
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +29 -12
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +8 -2
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
- data/vendor/faiss/faiss/invlists/DirectMap.h +2 -4
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +118 -18
- data/vendor/faiss/faiss/invlists/InvertedLists.h +44 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
- data/vendor/faiss/faiss/python/python_callbacks.h +1 -1
- data/vendor/faiss/faiss/utils/AlignedTable.h +3 -1
- data/vendor/faiss/faiss/utils/Heap.cpp +139 -3
- data/vendor/faiss/faiss/utils/Heap.h +35 -1
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +84 -0
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +196 -0
- data/vendor/faiss/faiss/utils/approx_topk/generic.h +138 -0
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +34 -0
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +367 -0
- data/vendor/faiss/faiss/utils/distances.cpp +61 -7
- data/vendor/faiss/faiss/utils/distances.h +11 -0
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +346 -0
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +36 -0
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +42 -0
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +40 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +352 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +32 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +515 -327
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +17 -1
- data/vendor/faiss/faiss/utils/extra_distances.cpp +37 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +2 -1
- data/vendor/faiss/faiss/utils/fp16-fp16c.h +7 -0
- data/vendor/faiss/faiss/utils/fp16-inl.h +7 -0
- data/vendor/faiss/faiss/utils/fp16.h +7 -0
- data/vendor/faiss/faiss/utils/hamming-inl.h +0 -456
- data/vendor/faiss/faiss/utils/hamming.cpp +104 -120
- data/vendor/faiss/faiss/utils/hamming.h +21 -10
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +535 -0
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +48 -0
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +519 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +26 -0
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +614 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +21 -25
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +344 -3
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +390 -0
- data/vendor/faiss/faiss/utils/simdlib_neon.h +655 -130
- data/vendor/faiss/faiss/utils/sorting.cpp +692 -0
- data/vendor/faiss/faiss/utils/sorting.h +71 -0
- data/vendor/faiss/faiss/utils/transpose/transpose-avx2-inl.h +165 -0
- data/vendor/faiss/faiss/utils/utils.cpp +4 -176
- data/vendor/faiss/faiss/utils/utils.h +2 -9
- metadata +30 -4
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +0 -26
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
#include <algorithm>
|
|
11
|
+
#include <limits>
|
|
12
|
+
#include <utility>
|
|
13
|
+
|
|
14
|
+
#include <faiss/utils/Heap.h>
|
|
15
|
+
#include <faiss/utils/simdlib.h>
|
|
16
|
+
|
|
17
|
+
namespace faiss {
|
|
18
|
+
|
|
19
|
+
// HeapWithBucketsForHamming32 uses simd8uint32 under the hood.
|
|
20
|
+
|
|
21
|
+
template <typename C, uint32_t NBUCKETS, uint32_t N, typename HammingComputerT>
|
|
22
|
+
struct HeapWithBucketsForHamming32 {
|
|
23
|
+
// this case was not implemented yet.
|
|
24
|
+
};
|
|
25
|
+
|
|
26
|
+
template <uint32_t NBUCKETS, uint32_t N, typename HammingComputerT>
|
|
27
|
+
struct HeapWithBucketsForHamming32<
|
|
28
|
+
CMax<int, int64_t>,
|
|
29
|
+
NBUCKETS,
|
|
30
|
+
N,
|
|
31
|
+
HammingComputerT> {
|
|
32
|
+
static constexpr uint32_t NBUCKETS_8 = NBUCKETS / 8;
|
|
33
|
+
static_assert(
|
|
34
|
+
(NBUCKETS) > 0 && ((NBUCKETS % 8) == 0),
|
|
35
|
+
"Number of buckets needs to be 8, 16, 24, ...");
|
|
36
|
+
|
|
37
|
+
static void addn(
|
|
38
|
+
// number of elements
|
|
39
|
+
const uint32_t n,
|
|
40
|
+
// Hamming computer
|
|
41
|
+
const HammingComputerT& hc,
|
|
42
|
+
// n elements that can be used with hc
|
|
43
|
+
const uint8_t* const __restrict binaryVectors,
|
|
44
|
+
// number of best elements to keep
|
|
45
|
+
const uint32_t k,
|
|
46
|
+
// output distances
|
|
47
|
+
int* const __restrict bh_val,
|
|
48
|
+
// output indices, each being within [0, n) range
|
|
49
|
+
int64_t* const __restrict bh_ids) {
|
|
50
|
+
// forward a call to bs_addn with 1 beam
|
|
51
|
+
bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
static void bs_addn(
|
|
55
|
+
// beam_size parameter of Beam Search algorithm
|
|
56
|
+
const uint32_t beam_size,
|
|
57
|
+
// number of elements per beam
|
|
58
|
+
const uint32_t n_per_beam,
|
|
59
|
+
// Hamming computer
|
|
60
|
+
const HammingComputerT& hc,
|
|
61
|
+
// n elements that can be used against hc
|
|
62
|
+
const uint8_t* const __restrict binary_vectors,
|
|
63
|
+
// number of best elements to keep
|
|
64
|
+
const uint32_t k,
|
|
65
|
+
// output distances
|
|
66
|
+
int* const __restrict bh_val,
|
|
67
|
+
// output indices, each being within [0, n_per_beam * beam_size)
|
|
68
|
+
// range
|
|
69
|
+
int64_t* const __restrict bh_ids) {
|
|
70
|
+
//
|
|
71
|
+
using C = CMax<int, int64_t>;
|
|
72
|
+
|
|
73
|
+
// Hamming code size
|
|
74
|
+
const size_t code_size = hc.get_code_size();
|
|
75
|
+
|
|
76
|
+
// main loop
|
|
77
|
+
for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) {
|
|
78
|
+
simd8uint32 min_distances_i[NBUCKETS_8][N];
|
|
79
|
+
simd8uint32 min_indices_i[NBUCKETS_8][N];
|
|
80
|
+
|
|
81
|
+
for (uint32_t j = 0; j < NBUCKETS_8; j++) {
|
|
82
|
+
for (uint32_t p = 0; p < N; p++) {
|
|
83
|
+
min_distances_i[j][p] =
|
|
84
|
+
simd8uint32(std::numeric_limits<int32_t>::max());
|
|
85
|
+
min_indices_i[j][p] = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7);
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
simd8uint32 current_indices(0, 1, 2, 3, 4, 5, 6, 7);
|
|
90
|
+
const simd8uint32 indices_delta(NBUCKETS);
|
|
91
|
+
|
|
92
|
+
const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS;
|
|
93
|
+
|
|
94
|
+
// put the data into buckets
|
|
95
|
+
for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) {
|
|
96
|
+
for (uint32_t j = 0; j < NBUCKETS_8; j++) {
|
|
97
|
+
uint32_t hamming_distances[8];
|
|
98
|
+
for (size_t j8 = 0; j8 < 8; j8++) {
|
|
99
|
+
hamming_distances[j8] = hc.hamming(
|
|
100
|
+
binary_vectors +
|
|
101
|
+
(j8 + j * 8 + ip + n_per_beam * beam_index) *
|
|
102
|
+
code_size);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
// loop. Compiler should get rid of unneeded ops
|
|
106
|
+
simd8uint32 distance_candidate;
|
|
107
|
+
distance_candidate.loadu(hamming_distances);
|
|
108
|
+
simd8uint32 indices_candidate = current_indices;
|
|
109
|
+
|
|
110
|
+
for (uint32_t p = 0; p < N; p++) {
|
|
111
|
+
simd8uint32 min_distances_new;
|
|
112
|
+
simd8uint32 min_indices_new;
|
|
113
|
+
simd8uint32 max_distances_new;
|
|
114
|
+
simd8uint32 max_indices_new;
|
|
115
|
+
|
|
116
|
+
faiss::cmplt_min_max_fast(
|
|
117
|
+
distance_candidate,
|
|
118
|
+
indices_candidate,
|
|
119
|
+
min_distances_i[j][p],
|
|
120
|
+
min_indices_i[j][p],
|
|
121
|
+
min_distances_new,
|
|
122
|
+
min_indices_new,
|
|
123
|
+
max_distances_new,
|
|
124
|
+
max_indices_new);
|
|
125
|
+
|
|
126
|
+
distance_candidate = max_distances_new;
|
|
127
|
+
indices_candidate = max_indices_new;
|
|
128
|
+
|
|
129
|
+
min_distances_i[j][p] = min_distances_new;
|
|
130
|
+
min_indices_i[j][p] = min_indices_new;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
current_indices += indices_delta;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// fix the indices
|
|
138
|
+
for (uint32_t j = 0; j < NBUCKETS_8; j++) {
|
|
139
|
+
const simd8uint32 offset(n_per_beam * beam_index + j * 8);
|
|
140
|
+
for (uint32_t p = 0; p < N; p++) {
|
|
141
|
+
min_indices_i[j][p] += offset;
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
// merge every bucket into the regular heap
|
|
146
|
+
for (uint32_t p = 0; p < N; p++) {
|
|
147
|
+
for (uint32_t j = 0; j < NBUCKETS_8; j++) {
|
|
148
|
+
uint32_t min_indices_scalar[8];
|
|
149
|
+
uint32_t min_distances_scalar[8];
|
|
150
|
+
|
|
151
|
+
min_indices_i[j][p].storeu(min_indices_scalar);
|
|
152
|
+
min_distances_i[j][p].storeu(min_distances_scalar);
|
|
153
|
+
|
|
154
|
+
// this exact way is needed to maintain the order as if the
|
|
155
|
+
// input elements were pushed to the heap sequentially
|
|
156
|
+
for (size_t j8 = 0; j8 < 8; j8++) {
|
|
157
|
+
const auto value = min_distances_scalar[j8];
|
|
158
|
+
const auto index = min_indices_scalar[j8];
|
|
159
|
+
|
|
160
|
+
if (C::cmp2(bh_val[0], value, bh_ids[0], index)) {
|
|
161
|
+
heap_replace_top<C>(
|
|
162
|
+
k, bh_val, bh_ids, value, index);
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// process leftovers
|
|
169
|
+
for (uint32_t ip = nb; ip < n_per_beam; ip++) {
|
|
170
|
+
const auto index = ip + n_per_beam * beam_index;
|
|
171
|
+
const auto value =
|
|
172
|
+
hc.hamming(binary_vectors + (index)*code_size);
|
|
173
|
+
|
|
174
|
+
if (C::cmp(bh_val[0], value)) {
|
|
175
|
+
heap_replace_top<C>(k, bh_val, bh_ids, value, index);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
};
|
|
181
|
+
|
|
182
|
+
// HeapWithBucketsForHamming16 uses simd16uint16 under the hood.
|
|
183
|
+
// Less registers needed in total, so higher values of NBUCKETS/N can be used,
|
|
184
|
+
// but somewhat slower.
|
|
185
|
+
// No more than 32K elements currently, but it can be reorganized a bit
|
|
186
|
+
// to be limited to 32K elements per beam.
|
|
187
|
+
|
|
188
|
+
template <typename C, uint32_t NBUCKETS, uint32_t N, typename HammingComputerT>
|
|
189
|
+
struct HeapWithBucketsForHamming16 {
|
|
190
|
+
// this case was not implemented yet.
|
|
191
|
+
};
|
|
192
|
+
|
|
193
|
+
template <uint32_t NBUCKETS, uint32_t N, typename HammingComputerT>
|
|
194
|
+
struct HeapWithBucketsForHamming16<
|
|
195
|
+
CMax<int, int64_t>,
|
|
196
|
+
NBUCKETS,
|
|
197
|
+
N,
|
|
198
|
+
HammingComputerT> {
|
|
199
|
+
static constexpr uint32_t NBUCKETS_16 = NBUCKETS / 16;
|
|
200
|
+
static_assert(
|
|
201
|
+
(NBUCKETS) > 0 && ((NBUCKETS % 16) == 0),
|
|
202
|
+
"Number of buckets needs to be 16, 32, 48...");
|
|
203
|
+
|
|
204
|
+
static void addn(
|
|
205
|
+
// number of elements
|
|
206
|
+
const uint32_t n,
|
|
207
|
+
// Hamming computer
|
|
208
|
+
const HammingComputerT& hc,
|
|
209
|
+
// n elements that can be used with hc
|
|
210
|
+
const uint8_t* const __restrict binaryVectors,
|
|
211
|
+
// number of best elements to keep
|
|
212
|
+
const uint32_t k,
|
|
213
|
+
// output distances
|
|
214
|
+
int* const __restrict bh_val,
|
|
215
|
+
// output indices, each being within [0, n) range
|
|
216
|
+
int64_t* const __restrict bh_ids) {
|
|
217
|
+
// forward a call to bs_addn with 1 beam
|
|
218
|
+
bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids);
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
static void bs_addn(
|
|
222
|
+
// beam_size parameter of Beam Search algorithm
|
|
223
|
+
const uint32_t beam_size,
|
|
224
|
+
// number of elements per beam
|
|
225
|
+
const uint32_t n_per_beam,
|
|
226
|
+
// Hamming computer
|
|
227
|
+
const HammingComputerT& hc,
|
|
228
|
+
// n elements that can be used against hc
|
|
229
|
+
const uint8_t* const __restrict binary_vectors,
|
|
230
|
+
// number of best elements to keep
|
|
231
|
+
const uint32_t k,
|
|
232
|
+
// output distances
|
|
233
|
+
int* const __restrict bh_val,
|
|
234
|
+
// output indices, each being within [0, n_per_beam * beam_size)
|
|
235
|
+
// range
|
|
236
|
+
int64_t* const __restrict bh_ids) {
|
|
237
|
+
//
|
|
238
|
+
using C = CMax<int, int64_t>;
|
|
239
|
+
|
|
240
|
+
// Hamming code size
|
|
241
|
+
const size_t code_size = hc.get_code_size();
|
|
242
|
+
|
|
243
|
+
// main loop
|
|
244
|
+
for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) {
|
|
245
|
+
simd16uint16 min_distances_i[NBUCKETS_16][N];
|
|
246
|
+
simd16uint16 min_indices_i[NBUCKETS_16][N];
|
|
247
|
+
|
|
248
|
+
for (uint32_t j = 0; j < NBUCKETS_16; j++) {
|
|
249
|
+
for (uint32_t p = 0; p < N; p++) {
|
|
250
|
+
min_distances_i[j][p] =
|
|
251
|
+
simd16uint16(std::numeric_limits<int16_t>::max());
|
|
252
|
+
min_indices_i[j][p] = simd16uint16(
|
|
253
|
+
0,
|
|
254
|
+
1,
|
|
255
|
+
2,
|
|
256
|
+
3,
|
|
257
|
+
4,
|
|
258
|
+
5,
|
|
259
|
+
6,
|
|
260
|
+
7,
|
|
261
|
+
8,
|
|
262
|
+
9,
|
|
263
|
+
10,
|
|
264
|
+
11,
|
|
265
|
+
12,
|
|
266
|
+
13,
|
|
267
|
+
14,
|
|
268
|
+
15);
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
simd16uint16 current_indices(
|
|
273
|
+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
274
|
+
const simd16uint16 indices_delta((uint16_t)NBUCKETS);
|
|
275
|
+
|
|
276
|
+
const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS;
|
|
277
|
+
|
|
278
|
+
// put the data into buckets
|
|
279
|
+
for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) {
|
|
280
|
+
for (uint32_t j = 0; j < NBUCKETS_16; j++) {
|
|
281
|
+
uint16_t hamming_distances[16];
|
|
282
|
+
for (size_t j16 = 0; j16 < 16; j16++) {
|
|
283
|
+
hamming_distances[j16] = hc.hamming(
|
|
284
|
+
binary_vectors +
|
|
285
|
+
(j16 + j * 16 + ip + n_per_beam * beam_index) *
|
|
286
|
+
code_size);
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
// loop. Compiler should get rid of unneeded ops
|
|
290
|
+
simd16uint16 distance_candidate;
|
|
291
|
+
distance_candidate.loadu(hamming_distances);
|
|
292
|
+
simd16uint16 indices_candidate = current_indices;
|
|
293
|
+
|
|
294
|
+
for (uint32_t p = 0; p < N; p++) {
|
|
295
|
+
simd16uint16 min_distances_new;
|
|
296
|
+
simd16uint16 min_indices_new;
|
|
297
|
+
simd16uint16 max_distances_new;
|
|
298
|
+
simd16uint16 max_indices_new;
|
|
299
|
+
|
|
300
|
+
faiss::cmplt_min_max_fast(
|
|
301
|
+
distance_candidate,
|
|
302
|
+
indices_candidate,
|
|
303
|
+
min_distances_i[j][p],
|
|
304
|
+
min_indices_i[j][p],
|
|
305
|
+
min_distances_new,
|
|
306
|
+
min_indices_new,
|
|
307
|
+
max_distances_new,
|
|
308
|
+
max_indices_new);
|
|
309
|
+
|
|
310
|
+
distance_candidate = max_distances_new;
|
|
311
|
+
indices_candidate = max_indices_new;
|
|
312
|
+
|
|
313
|
+
min_distances_i[j][p] = min_distances_new;
|
|
314
|
+
min_indices_i[j][p] = min_indices_new;
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
current_indices += indices_delta;
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
// fix the indices
|
|
322
|
+
for (uint32_t j = 0; j < NBUCKETS_16; j++) {
|
|
323
|
+
const simd16uint16 offset(
|
|
324
|
+
(uint16_t)(n_per_beam * beam_index + j * 16));
|
|
325
|
+
for (uint32_t p = 0; p < N; p++) {
|
|
326
|
+
min_indices_i[j][p] += offset;
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// merge every bucket into the regular heap
|
|
331
|
+
for (uint32_t p = 0; p < N; p++) {
|
|
332
|
+
for (uint32_t j = 0; j < NBUCKETS_16; j++) {
|
|
333
|
+
uint16_t min_indices_scalar[16];
|
|
334
|
+
uint16_t min_distances_scalar[16];
|
|
335
|
+
|
|
336
|
+
min_indices_i[j][p].storeu(min_indices_scalar);
|
|
337
|
+
min_distances_i[j][p].storeu(min_distances_scalar);
|
|
338
|
+
|
|
339
|
+
// this exact way is needed to maintain the order as if the
|
|
340
|
+
// input elements were pushed to the heap sequentially
|
|
341
|
+
for (size_t j16 = 0; j16 < 16; j16++) {
|
|
342
|
+
const auto value = min_distances_scalar[j16];
|
|
343
|
+
const auto index = min_indices_scalar[j16];
|
|
344
|
+
|
|
345
|
+
if (C::cmp2(bh_val[0], value, bh_ids[0], index)) {
|
|
346
|
+
heap_replace_top<C>(
|
|
347
|
+
k, bh_val, bh_ids, value, index);
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
// process leftovers
|
|
354
|
+
for (uint32_t ip = nb; ip < n_per_beam; ip++) {
|
|
355
|
+
const auto index = ip + n_per_beam * beam_index;
|
|
356
|
+
const auto value =
|
|
357
|
+
hc.hamming(binary_vectors + (index)*code_size);
|
|
358
|
+
|
|
359
|
+
if (C::cmp(bh_val[0], value)) {
|
|
360
|
+
heap_replace_top<C>(k, bh_val, bh_ids, value, index);
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
};
|
|
366
|
+
|
|
367
|
+
} // namespace faiss
|
|
@@ -26,6 +26,8 @@
|
|
|
26
26
|
#include <faiss/impl/IDSelector.h>
|
|
27
27
|
#include <faiss/impl/ResultHandler.h>
|
|
28
28
|
|
|
29
|
+
#include <faiss/utils/distances_fused/distances_fused.h>
|
|
30
|
+
|
|
29
31
|
#ifndef FINTEGER
|
|
30
32
|
#define FINTEGER long
|
|
31
33
|
#endif
|
|
@@ -229,7 +231,7 @@ void exhaustive_inner_product_blas(
|
|
|
229
231
|
// distance correction is an operator that can be applied to transform
|
|
230
232
|
// the distances
|
|
231
233
|
template <class ResultHandler>
|
|
232
|
-
void
|
|
234
|
+
void exhaustive_L2sqr_blas_default_impl(
|
|
233
235
|
const float* x,
|
|
234
236
|
const float* y,
|
|
235
237
|
size_t d,
|
|
@@ -311,10 +313,20 @@ void exhaustive_L2sqr_blas(
|
|
|
311
313
|
}
|
|
312
314
|
}
|
|
313
315
|
|
|
316
|
+
// Generic entry point of the BLAS-based exhaustive squared-L2 search.
// Every result handler type without a dedicated specialization ends up here
// and is served by exhaustive_L2sqr_blas_default_impl().
//
//   x, y    - input vector sets (nx and ny vectors of dimension d)
//   res     - result handler that receives the computed distances
//   y_norms - optional norms for y; may be nullptr
template <class ResultHandler>
void exhaustive_L2sqr_blas(
        const float* x,
        const float* y,
        size_t d,
        size_t nx,
        size_t ny,
        ResultHandler& res,
        const float* y_norms = nullptr) {
    // Forward y_norms too: it used to be silently dropped here, so norms
    // supplied by the caller never reached the implementation, unlike in the
    // SingleBestResultHandler specialization, which does pass them on.
    exhaustive_L2sqr_blas_default_impl(x, y, d, nx, ny, res, y_norms);
}
|
|
327
|
+
|
|
314
328
|
#ifdef __AVX2__
|
|
315
|
-
|
|
316
|
-
template <>
|
|
317
|
-
void exhaustive_L2sqr_blas<SingleBestResultHandler<CMax<float, int64_t>>>(
|
|
329
|
+
void exhaustive_L2sqr_blas_cmax_avx2(
|
|
318
330
|
const float* x,
|
|
319
331
|
const float* y,
|
|
320
332
|
size_t d,
|
|
@@ -513,11 +525,53 @@ void exhaustive_L2sqr_blas<SingleBestResultHandler<CMax<float, int64_t>>>(
|
|
|
513
525
|
res.add_result(i, current_min_distance, current_min_index);
|
|
514
526
|
}
|
|
515
527
|
}
|
|
528
|
+
// Does nothing for SingleBestResultHandler, but
|
|
529
|
+
// keeping the call for the consistency.
|
|
530
|
+
res.end_multiple();
|
|
516
531
|
InterruptCallback::check();
|
|
517
532
|
}
|
|
518
533
|
}
|
|
519
534
|
#endif
|
|
520
535
|
|
|
536
|
+
// an override if only a single closest point is needed
|
|
537
|
+
template <>
|
|
538
|
+
void exhaustive_L2sqr_blas<SingleBestResultHandler<CMax<float, int64_t>>>(
|
|
539
|
+
const float* x,
|
|
540
|
+
const float* y,
|
|
541
|
+
size_t d,
|
|
542
|
+
size_t nx,
|
|
543
|
+
size_t ny,
|
|
544
|
+
SingleBestResultHandler<CMax<float, int64_t>>& res,
|
|
545
|
+
const float* y_norms) {
|
|
546
|
+
#if defined(__AVX2__)
|
|
547
|
+
// use a faster fused kernel if available
|
|
548
|
+
if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) {
|
|
549
|
+
// the kernel is available and it is complete, we're done.
|
|
550
|
+
return;
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
// run the specialized AVX2 implementation
|
|
554
|
+
exhaustive_L2sqr_blas_cmax_avx2(x, y, d, nx, ny, res, y_norms);
|
|
555
|
+
|
|
556
|
+
#elif defined(__aarch64__)
|
|
557
|
+
// use a faster fused kernel if available
|
|
558
|
+
if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) {
|
|
559
|
+
// the kernel is available and it is complete, we're done.
|
|
560
|
+
return;
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
// run the default implementation
|
|
564
|
+
exhaustive_L2sqr_blas_default_impl<
|
|
565
|
+
SingleBestResultHandler<CMax<float, int64_t>>>(
|
|
566
|
+
x, y, d, nx, ny, res, y_norms);
|
|
567
|
+
#else
|
|
568
|
+
// run the default implementation
|
|
569
|
+
exhaustive_L2sqr_blas_default_impl<
|
|
570
|
+
SingleBestResultHandler<CMax<float, int64_t>>>(
|
|
571
|
+
x, y, d, nx, ny, res, y_norms);
|
|
572
|
+
#endif
|
|
573
|
+
}
|
|
574
|
+
|
|
521
575
|
template <class ResultHandler>
|
|
522
576
|
void knn_L2sqr_select(
|
|
523
577
|
const float* x,
|
|
@@ -770,7 +824,7 @@ void pairwise_indexed_L2sqr(
|
|
|
770
824
|
const float* y,
|
|
771
825
|
const int64_t* iy,
|
|
772
826
|
float* dis) {
|
|
773
|
-
#pragma omp parallel for
|
|
827
|
+
#pragma omp parallel for if (n > 1)
|
|
774
828
|
for (int64_t j = 0; j < n; j++) {
|
|
775
829
|
if (ix[j] >= 0 && iy[j] >= 0) {
|
|
776
830
|
dis[j] = fvec_L2sqr(x + d * ix[j], y + d * iy[j], d);
|
|
@@ -786,7 +840,7 @@ void pairwise_indexed_inner_product(
|
|
|
786
840
|
const float* y,
|
|
787
841
|
const int64_t* iy,
|
|
788
842
|
float* dis) {
|
|
789
|
-
#pragma omp parallel for
|
|
843
|
+
#pragma omp parallel for if (n > 1)
|
|
790
844
|
for (int64_t j = 0; j < n; j++) {
|
|
791
845
|
if (ix[j] >= 0 && iy[j] >= 0) {
|
|
792
846
|
dis[j] = fvec_inner_product(x + d * ix[j], y + d * iy[j], d);
|
|
@@ -887,7 +941,7 @@ void pairwise_L2sqr(
|
|
|
887
941
|
// store in beginning of distance matrix to avoid malloc
|
|
888
942
|
float* b_norms = dis;
|
|
889
943
|
|
|
890
|
-
#pragma omp parallel for
|
|
944
|
+
#pragma omp parallel for if (nb > 1)
|
|
891
945
|
for (int64_t i = 0; i < nb; i++)
|
|
892
946
|
b_norms[i] = fvec_norm_L2sqr(xb + i * ldb, d);
|
|
893
947
|
|
|
@@ -73,6 +73,17 @@ void fvec_L2sqr_ny(
|
|
|
73
73
|
size_t d,
|
|
74
74
|
size_t ny);
|
|
75
75
|
|
|
76
|
+
/* compute ny squared L2 distances between x and a set of transposed contiguous
|
|
77
|
+
y vectors. The squared lengths of the y vectors should be provided as well */
|
|
78
|
+
void fvec_L2sqr_ny_transposed(
|
|
79
|
+
float* dis,
|
|
80
|
+
const float* x,
|
|
81
|
+
const float* y,
|
|
82
|
+
const float* y_sqlen,
|
|
83
|
+
size_t d,
|
|
84
|
+
size_t d_offset,
|
|
85
|
+
size_t ny);
|
|
86
|
+
|
|
76
87
|
/* compute ny square L2 distance between x and a set of contiguous y vectors
|
|
77
88
|
and return the index of the nearest vector.
|
|
78
89
|
return 0 if ny == 0. */
|