faiss 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +23 -21
- data/ext/faiss/extconf.rb +11 -0
- data/ext/faiss/index.cpp +4 -4
- data/ext/faiss/index_binary.cpp +6 -6
- data/ext/faiss/product_quantizer.cpp +4 -4
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +13 -0
- data/vendor/faiss/faiss/IVFlib.cpp +101 -2
- data/vendor/faiss/faiss/IVFlib.h +26 -2
- data/vendor/faiss/faiss/Index.cpp +36 -3
- data/vendor/faiss/faiss/Index.h +43 -6
- data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
- data/vendor/faiss/faiss/Index2Layer.h +6 -1
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
- data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
- data/vendor/faiss/faiss/IndexBinary.h +18 -3
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
- data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
- data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
- data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
- data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
- data/vendor/faiss/faiss/IndexFastScan.h +145 -0
- data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
- data/vendor/faiss/faiss/IndexFlat.h +7 -4
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
- data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
- data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
- data/vendor/faiss/faiss/IndexHNSW.h +4 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
- data/vendor/faiss/faiss/IndexIDMap.h +107 -0
- data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
- data/vendor/faiss/faiss/IndexIVF.h +35 -16
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
- data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
- data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
- data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
- data/vendor/faiss/faiss/IndexLSH.h +2 -1
- data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
- data/vendor/faiss/faiss/IndexLattice.h +3 -1
- data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
- data/vendor/faiss/faiss/IndexNSG.h +25 -1
- data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
- data/vendor/faiss/faiss/IndexPQ.h +19 -5
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
- data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
- data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
- data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
- data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
- data/vendor/faiss/faiss/IndexRefine.h +4 -2
- data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
- data/vendor/faiss/faiss/IndexReplicas.h +2 -1
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
- data/vendor/faiss/faiss/IndexShards.cpp +4 -1
- data/vendor/faiss/faiss/IndexShards.h +2 -1
- data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
- data/vendor/faiss/faiss/MetaIndexes.h +3 -81
- data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
- data/vendor/faiss/faiss/VectorTransform.h +22 -4
- data/vendor/faiss/faiss/clone_index.cpp +23 -1
- data/vendor/faiss/faiss/clone_index.h +3 -0
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
- data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
- data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
- data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
- data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
- data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
- data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
- data/vendor/faiss/faiss/impl/HNSW.h +19 -16
- data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
- data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
- data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
- data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
- data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
- data/vendor/faiss/faiss/index_factory.cpp +196 -7
- data/vendor/faiss/faiss/index_io.h +5 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
- data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
- data/vendor/faiss/faiss/utils/Heap.h +31 -15
- data/vendor/faiss/faiss/utils/distances.cpp +380 -56
- data/vendor/faiss/faiss/utils/distances.h +113 -15
- data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
- data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
- data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
- data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
- data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
- data/vendor/faiss/faiss/utils/fp16.h +11 -0
- data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
- data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
- data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
- data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
- data/vendor/faiss/faiss/utils/random.cpp +53 -0
- data/vendor/faiss/faiss/utils/random.h +5 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
- data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
- metadata +37 -3
|
@@ -0,0 +1,2058 @@
|
|
|
1
|
+
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
|
|
2
|
+
#ifndef LEVEL2_AVX2_INL_H
|
|
3
|
+
#define LEVEL2_AVX2_INL_H
|
|
4
|
+
|
|
5
|
+
#include <immintrin.h>
|
|
6
|
+
|
|
7
|
+
#include <cstddef>
|
|
8
|
+
#include <cstdint>
|
|
9
|
+
|
|
10
|
+
#include <faiss/cppcontrib/detail/UintReader.h>
|
|
11
|
+
|
|
12
|
+
namespace faiss {
|
|
13
|
+
namespace cppcontrib {
|
|
14
|
+
|
|
15
|
+
////////////////////////////////////////////////////////////////////////////////////
|
|
16
|
+
/// Index2LevelDecoder
|
|
17
|
+
////////////////////////////////////////////////////////////////////////////////////
|
|
18
|
+
|
|
19
|
+
namespace {
|
|
20
|
+
|
|
21
|
+
// Processes 8 float values.
|
|
22
|
+
// Returns {
|
|
23
|
+
// [0..1] = *coarse[0..1] + *fine0[0..1];
|
|
24
|
+
// [2..3] = *coarse[2..3] + *fine1[0..1];
|
|
25
|
+
// [4..5] = *coarse[4..5] + *fine2[0..1];
|
|
26
|
+
// [6..7] = *coarse[6..7] + *fine3[0..1];
|
|
27
|
+
// }
|
|
28
|
+
inline __m256 elementaryBlock2x4b(
|
|
29
|
+
const float* const __restrict coarse,
|
|
30
|
+
const float* const __restrict fine0,
|
|
31
|
+
const float* const __restrict fine1,
|
|
32
|
+
const float* const __restrict fine2,
|
|
33
|
+
const float* const __restrict fine3) {
|
|
34
|
+
// load fine
|
|
35
|
+
const __m256 fineValue = _mm256_castpd_ps(_mm256_setr_pd(
|
|
36
|
+
*reinterpret_cast<const double*>(fine0),
|
|
37
|
+
*reinterpret_cast<const double*>(fine1),
|
|
38
|
+
*reinterpret_cast<const double*>(fine2),
|
|
39
|
+
*reinterpret_cast<const double*>(fine3)));
|
|
40
|
+
// load coarse
|
|
41
|
+
const __m256 coarseValue = _mm256_loadu_ps(coarse);
|
|
42
|
+
|
|
43
|
+
// add coarse and fine
|
|
44
|
+
return _mm256_add_ps(fineValue, coarseValue);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Processes 8 float values.
|
|
48
|
+
// Returns {
|
|
49
|
+
// [0..1] = existingValue[0..1] + weight * (*coarse[0..1] + *fine0[0..1]);
|
|
50
|
+
// [2..3] = existingValue[0..1] + weight * (*coarse[2..3] + *fine1[0..1]);
|
|
51
|
+
// [4..5] = existingValue[0..1] + weight * (*coarse[4..5] + *fine2[0..1]);
|
|
52
|
+
// [6..7] = existingValue[0..1] + weight * (*coarse[6..7] + *fine3[0..1]);
|
|
53
|
+
// }
|
|
54
|
+
inline __m256 elementaryBlock2x4bAccum(
|
|
55
|
+
const float* const __restrict coarse,
|
|
56
|
+
const float* const __restrict fine0,
|
|
57
|
+
const float* const __restrict fine1,
|
|
58
|
+
const float* const __restrict fine2,
|
|
59
|
+
const float* const __restrict fine3,
|
|
60
|
+
const float weight,
|
|
61
|
+
const __m256 existingValue) {
|
|
62
|
+
// add coarse and fine
|
|
63
|
+
const __m256 combinedValue =
|
|
64
|
+
elementaryBlock2x4b(coarse, fine0, fine1, fine2, fine3);
|
|
65
|
+
|
|
66
|
+
// this operation is expected to be optimized by a compiler
|
|
67
|
+
const __m256 weightAvx2 = _mm256_set1_ps(weight);
|
|
68
|
+
// do fma
|
|
69
|
+
return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// Processes 4 float values.
// Both pointers are read with unaligned 128-bit loads.
// Returns {
//   [0..3] = coarse[0..3] + fine[0..3];
// }
inline __m128 elementaryBlock4x1b(
        const float* const __restrict coarse,
        const float* const __restrict fine) {
    // fine + coarse, lane by lane
    return _mm_add_ps(_mm_loadu_ps(fine), _mm_loadu_ps(coarse));
}
|
|
87
|
+
|
|
88
|
+
// Processes 4 float values.
|
|
89
|
+
// Returns {
|
|
90
|
+
// [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine[0..3]);
|
|
91
|
+
// }
|
|
92
|
+
inline __m128 elementaryBlock4x1bAccum(
|
|
93
|
+
const float* const __restrict coarse,
|
|
94
|
+
const float* const __restrict fine,
|
|
95
|
+
const float weight,
|
|
96
|
+
const __m128 existingValue) {
|
|
97
|
+
// add coarse and fine
|
|
98
|
+
const __m128 combinedValue = elementaryBlock4x1b(coarse, fine);
|
|
99
|
+
|
|
100
|
+
// this operation is expected to be optimized by a compiler
|
|
101
|
+
const __m128 weightAvx = _mm_set1_ps(weight);
|
|
102
|
+
// do fma
|
|
103
|
+
return _mm_fmadd_ps(combinedValue, weightAvx, existingValue);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// Processes 8 float values.
|
|
107
|
+
// Returns {
|
|
108
|
+
// [0..3] = *coarse[0..3] + *fine0[0..3];
|
|
109
|
+
// [4..7] = *coarse[4..7] + *fine1[0..3];
|
|
110
|
+
// }
|
|
111
|
+
inline __m256 elementaryBlock4x2b(
|
|
112
|
+
const float* const __restrict coarse,
|
|
113
|
+
const float* const __restrict fine0,
|
|
114
|
+
const float* const __restrict fine1) {
|
|
115
|
+
// load fine
|
|
116
|
+
const __m128 fineValue0 = _mm_loadu_ps(fine0);
|
|
117
|
+
const __m128 fineValue1 = _mm_loadu_ps(fine1);
|
|
118
|
+
// load coarse
|
|
119
|
+
const __m256 coarseValue = _mm256_loadu_ps(coarse);
|
|
120
|
+
|
|
121
|
+
// combine two 4b into a single 8b
|
|
122
|
+
const __m256 combinedFineValue = _mm256_set_m128(fineValue1, fineValue0);
|
|
123
|
+
// add coarse and fine
|
|
124
|
+
return _mm256_add_ps(combinedFineValue, coarseValue);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
// Processes 8 float values.
|
|
128
|
+
// Returns {
|
|
129
|
+
// [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine0[0..3]);
|
|
130
|
+
// [4..7] = existingValue[4..7] + weight * (*coarse[4..7] + *fine1[0..3]);
|
|
131
|
+
// }
|
|
132
|
+
inline __m256 elementaryBlock4x2bAccum(
|
|
133
|
+
const float* const __restrict coarse,
|
|
134
|
+
const float* const __restrict fine0,
|
|
135
|
+
const float* const __restrict fine1,
|
|
136
|
+
const float weight,
|
|
137
|
+
const __m256 existingValue) {
|
|
138
|
+
// add coarse and fine
|
|
139
|
+
const __m256 combinedValue = elementaryBlock4x2b(coarse, fine0, fine1);
|
|
140
|
+
|
|
141
|
+
// this operation is expected to be optimized by a compiler
|
|
142
|
+
const __m256 weightAvx2 = _mm256_set1_ps(weight);
|
|
143
|
+
// do fma
|
|
144
|
+
return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue);
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
// Processes 8 float values.
|
|
148
|
+
// Returns {
|
|
149
|
+
// [0..7] = *coarse[0..7] + *fine[0..7];
|
|
150
|
+
// }
|
|
151
|
+
inline __m256 elementaryBlock8x1b(
|
|
152
|
+
const float* const __restrict coarse,
|
|
153
|
+
const float* const __restrict fine) {
|
|
154
|
+
// load fine
|
|
155
|
+
const __m256 fineValue = _mm256_loadu_ps(fine);
|
|
156
|
+
// load coarse
|
|
157
|
+
const __m256 coarseValue = _mm256_loadu_ps(coarse);
|
|
158
|
+
|
|
159
|
+
// add coarse and fine
|
|
160
|
+
return _mm256_add_ps(fineValue, coarseValue);
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// Processes 8 float values.
|
|
164
|
+
// Returns {
|
|
165
|
+
// [0..7] = existingValue[0..7] + weight * (*coarse[0..7] + *fine[0..7]);
|
|
166
|
+
// }
|
|
167
|
+
inline __m256 elementaryBlock8x1bAccum(
|
|
168
|
+
const float* const __restrict coarse,
|
|
169
|
+
const float* const __restrict fine,
|
|
170
|
+
const float weight,
|
|
171
|
+
const __m256 existingValue) {
|
|
172
|
+
// add coarse and fine
|
|
173
|
+
const __m256 combinedValue = elementaryBlock8x1b(coarse, fine);
|
|
174
|
+
|
|
175
|
+
// this operation is expected to be optimized by a compiler
|
|
176
|
+
const __m256 weightAvx2 = _mm256_set1_ps(weight);
|
|
177
|
+
// do fma
|
|
178
|
+
return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue);
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
// The following code uses template-based for-loop unrolling,
|
|
182
|
+
// because the compiler does not do that on its own as needed.
|
|
183
|
+
// The idea is the following:
|
|
184
|
+
// template<int I, int MAX>
|
|
185
|
+
// struct Foo {
|
|
186
|
+
// static void bar() {
|
|
187
|
+
// doSomething(I);
|
|
188
|
+
// Foo<I + 1, MAX>::bar();
|
|
189
|
+
// }
|
|
190
|
+
// };
|
|
191
|
+
//
|
|
192
|
+
// template<int MAX>
|
|
193
|
+
// struct Foo<MAX, MAX> {
|
|
194
|
+
// static void bar() {}
|
|
195
|
+
// };
|
|
196
|
+
//
|
|
197
|
+
// Initiate the loop:
|
|
198
|
+
// Foo<0, MAX>::bar();
|
|
199
|
+
|
|
200
|
+
// Primary template of the unrolled decoder step. It is only declared here;
// partial specializations supply the implementation and are selected via the
// trailing bool parameters, which are derived from the leading parameters by
// their default arguments.
template <
        // compared against CPOS below (DIM_EQ_CPOS);
        // presumably the dimensionality of a decoded vector - TODO confirm
        intptr_t DIM,
        // presumably the number of floats per coarse sub-vector - TODO confirm
        intptr_t COARSE_SIZE,
        // sub-vector length of the fine quantizer; drives the
        // FINE_SIZE_EQ_* and QPOS_LEFT_GE_* selectors below
        intptr_t FINE_SIZE,
        // presumably the bit width of one coarse index - TODO confirm
        intptr_t COARSE_BITS,
        // presumably the bit width of one fine index - TODO confirm
        intptr_t FINE_BITS,
        // position handled by this instantiation
        intptr_t CPOS,
        // true when FINE_SIZE is exactly 2
        bool FINE_SIZE_EQ_2 = FINE_SIZE == 2,
        // true when FINE_SIZE is exactly 4
        bool FINE_SIZE_EQ_4 = FINE_SIZE == 4,
        // true when at least 8 positions remain in the fine sub-vector
        // that contains CPOS
        bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8),
        // true when at least 4 positions remain in the fine sub-vector
        // that contains CPOS
        bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4),
        // true once CPOS has reached DIM;
        // presumably selects the terminating specialization - TODO confirm
        bool DIM_EQ_CPOS = DIM == CPOS>
struct Index2LevelDecoderImpl;
|
|
213
|
+
|
|
214
|
+
template <
|
|
215
|
+
intptr_t DIM,
|
|
216
|
+
intptr_t COARSE_SIZE,
|
|
217
|
+
intptr_t COARSE_BITS,
|
|
218
|
+
intptr_t FINE_BITS,
|
|
219
|
+
intptr_t CPOS,
|
|
220
|
+
bool QPOS_LEFT_GE_8,
|
|
221
|
+
bool QPOS_LEFT_GE_4>
|
|
222
|
+
struct Index2LevelDecoderImpl<
|
|
223
|
+
DIM,
|
|
224
|
+
COARSE_SIZE,
|
|
225
|
+
2,
|
|
226
|
+
COARSE_BITS,
|
|
227
|
+
FINE_BITS,
|
|
228
|
+
CPOS,
|
|
229
|
+
true,
|
|
230
|
+
false,
|
|
231
|
+
QPOS_LEFT_GE_8,
|
|
232
|
+
QPOS_LEFT_GE_4,
|
|
233
|
+
false> {
|
|
234
|
+
static constexpr intptr_t FINE_SIZE = 2;
|
|
235
|
+
|
|
236
|
+
static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
|
|
237
|
+
static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
|
|
238
|
+
static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
|
|
239
|
+
static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
|
|
240
|
+
|
|
241
|
+
static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
|
|
242
|
+
|
|
243
|
+
// number of entries in each coarse centroid table (an entry count, despite the _BYTES suffix)
|
|
244
|
+
static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
|
|
245
|
+
|
|
246
|
+
// coarse quantizer bytes start from 0
|
|
247
|
+
// fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
|
|
248
|
+
static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
|
|
249
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
|
|
250
|
+
N_COARSE_ELEMENTS * COARSE_BITS;
|
|
251
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
|
|
252
|
+
(N_COARSE_ELEMENTS_BITS + 7) / 8;
|
|
253
|
+
|
|
254
|
+
static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
|
|
255
|
+
|
|
256
|
+
// process 1 sample
|
|
257
|
+
static void store(
|
|
258
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
259
|
+
const float* const __restrict pqFineCentroids0,
|
|
260
|
+
const uint8_t* const __restrict code0,
|
|
261
|
+
float* const __restrict outputStore) {
|
|
262
|
+
// coarse quantizer
|
|
263
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
264
|
+
|
|
265
|
+
// fine quantizer
|
|
266
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
267
|
+
|
|
268
|
+
// clang-format off
|
|
269
|
+
|
|
270
|
+
// process chunks, 2 float
|
|
271
|
+
// but 8 floats per loop
|
|
272
|
+
|
|
273
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
274
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
275
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
276
|
+
const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
|
|
277
|
+
const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
|
|
278
|
+
|
|
279
|
+
const __m256 storeValue = elementaryBlock2x4b(
|
|
280
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
281
|
+
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
282
|
+
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
283
|
+
pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
|
|
284
|
+
pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset);
|
|
285
|
+
|
|
286
|
+
_mm256_storeu_ps(outputStore + CPOS, storeValue);
|
|
287
|
+
|
|
288
|
+
// next
|
|
289
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::store(
|
|
290
|
+
pqCoarseCentroids0, pqFineCentroids0, code0,
|
|
291
|
+
outputStore);
|
|
292
|
+
|
|
293
|
+
// clang-format on
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
// process 1 sample
|
|
297
|
+
static void accum(
|
|
298
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
299
|
+
const float* const __restrict pqFineCentroids0,
|
|
300
|
+
const uint8_t* const __restrict code0,
|
|
301
|
+
const float weight0,
|
|
302
|
+
float* const __restrict outputAccum) {
|
|
303
|
+
// coarse quantizer
|
|
304
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
305
|
+
|
|
306
|
+
// fine quantizer
|
|
307
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
308
|
+
|
|
309
|
+
// clang-format off
|
|
310
|
+
|
|
311
|
+
// process chunks, 2 float
|
|
312
|
+
// but 8 floats per loop
|
|
313
|
+
|
|
314
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
315
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
316
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
317
|
+
const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
|
|
318
|
+
const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
|
|
319
|
+
|
|
320
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
321
|
+
|
|
322
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
323
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
324
|
+
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
325
|
+
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
326
|
+
pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
|
|
327
|
+
pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, weight0,
|
|
328
|
+
existingValue);
|
|
329
|
+
|
|
330
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
331
|
+
|
|
332
|
+
// next
|
|
333
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
334
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
335
|
+
outputAccum);
|
|
336
|
+
|
|
337
|
+
// clang-format on
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Process 2 samples.
|
|
341
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
342
|
+
// table.
|
|
343
|
+
static void accum(
|
|
344
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
345
|
+
const float* const __restrict pqFineCentroids0,
|
|
346
|
+
const uint8_t* const __restrict code0,
|
|
347
|
+
const float weight0,
|
|
348
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
349
|
+
const float* const __restrict pqFineCentroids1,
|
|
350
|
+
const uint8_t* const __restrict code1,
|
|
351
|
+
const float weight1,
|
|
352
|
+
float* const __restrict outputAccum) {
|
|
353
|
+
// coarse quantizer
|
|
354
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
355
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
356
|
+
|
|
357
|
+
// fine quantizer
|
|
358
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
359
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
360
|
+
|
|
361
|
+
// clang-format off
|
|
362
|
+
|
|
363
|
+
// process chunks, 4 float
|
|
364
|
+
// but 8 floats per loop
|
|
365
|
+
|
|
366
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
367
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
368
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
369
|
+
const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
|
|
370
|
+
const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
|
|
371
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
372
|
+
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
|
373
|
+
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
|
374
|
+
const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
|
|
375
|
+
const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
|
|
376
|
+
|
|
377
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
378
|
+
|
|
379
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
380
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
381
|
+
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
382
|
+
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
383
|
+
pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
|
|
384
|
+
pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
|
|
385
|
+
weight0,
|
|
386
|
+
existingValue);
|
|
387
|
+
|
|
388
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
389
|
+
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
390
|
+
pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
|
391
|
+
pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
|
392
|
+
pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
|
|
393
|
+
pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
|
|
394
|
+
weight1,
|
|
395
|
+
existingValue);
|
|
396
|
+
|
|
397
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
398
|
+
|
|
399
|
+
// next
|
|
400
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
401
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
402
|
+
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
|
403
|
+
outputAccum);
|
|
404
|
+
|
|
405
|
+
// clang-format on
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
// Process 2 samples.
|
|
409
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
410
|
+
// codes.
|
|
411
|
+
static void accum(
|
|
412
|
+
const float* const __restrict pqCoarseCentroids,
|
|
413
|
+
const float* const __restrict pqFineCentroids,
|
|
414
|
+
const uint8_t* const __restrict code0,
|
|
415
|
+
const float weight0,
|
|
416
|
+
const uint8_t* const __restrict code1,
|
|
417
|
+
const float weight1,
|
|
418
|
+
float* const __restrict outputAccum) {
|
|
419
|
+
// coarse quantizer
|
|
420
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
421
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
422
|
+
|
|
423
|
+
// fine quantizer
|
|
424
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
425
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
426
|
+
|
|
427
|
+
// clang-format off
|
|
428
|
+
|
|
429
|
+
// process chunks, 2 float
|
|
430
|
+
// but 8 floats per loop
|
|
431
|
+
|
|
432
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
433
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
434
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
435
|
+
const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
|
|
436
|
+
const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
|
|
437
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
438
|
+
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
|
439
|
+
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
|
440
|
+
const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
|
|
441
|
+
const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
|
|
442
|
+
|
|
443
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
444
|
+
|
|
445
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
446
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
447
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
448
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
449
|
+
pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
|
|
450
|
+
pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
|
|
451
|
+
weight0,
|
|
452
|
+
existingValue);
|
|
453
|
+
|
|
454
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
455
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
456
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
|
457
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
|
458
|
+
pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
|
|
459
|
+
pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
|
|
460
|
+
weight1,
|
|
461
|
+
existingValue);
|
|
462
|
+
|
|
463
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
464
|
+
|
|
465
|
+
// next
|
|
466
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
467
|
+
pqCoarseCentroids, pqFineCentroids,
|
|
468
|
+
code0, weight0,
|
|
469
|
+
code1, weight1,
|
|
470
|
+
outputAccum);
|
|
471
|
+
|
|
472
|
+
// clang-format on
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
// Process 3 samples.
|
|
476
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
477
|
+
// table.
|
|
478
|
+
static void accum(
|
|
479
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
480
|
+
const float* const __restrict pqFineCentroids0,
|
|
481
|
+
const uint8_t* const __restrict code0,
|
|
482
|
+
const float weight0,
|
|
483
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
484
|
+
const float* const __restrict pqFineCentroids1,
|
|
485
|
+
const uint8_t* const __restrict code1,
|
|
486
|
+
const float weight1,
|
|
487
|
+
const float* const __restrict pqCoarseCentroids2,
|
|
488
|
+
const float* const __restrict pqFineCentroids2,
|
|
489
|
+
const uint8_t* const __restrict code2,
|
|
490
|
+
const float weight2,
|
|
491
|
+
float* const __restrict outputAccum) {
|
|
492
|
+
// coarse quantizer
|
|
493
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
494
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
495
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
496
|
+
|
|
497
|
+
// fine quantizer
|
|
498
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
499
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
500
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
501
|
+
|
|
502
|
+
// clang-format off
|
|
503
|
+
|
|
504
|
+
// process chunks, 2 float
|
|
505
|
+
// but 8 floats per loop
|
|
506
|
+
|
|
507
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
508
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
509
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
510
|
+
const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
|
|
511
|
+
const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
|
|
512
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
513
|
+
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
|
514
|
+
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
|
515
|
+
const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
|
|
516
|
+
const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
|
|
517
|
+
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
|
518
|
+
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
|
|
519
|
+
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
|
|
520
|
+
const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
|
|
521
|
+
const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
|
|
522
|
+
|
|
523
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
524
|
+
|
|
525
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
526
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
527
|
+
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
528
|
+
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
529
|
+
pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
|
|
530
|
+
pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
|
|
531
|
+
weight0,
|
|
532
|
+
existingValue);
|
|
533
|
+
|
|
534
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
535
|
+
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
536
|
+
pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
|
537
|
+
pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
|
538
|
+
pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
|
|
539
|
+
pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
|
|
540
|
+
weight1,
|
|
541
|
+
existingValue);
|
|
542
|
+
|
|
543
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
544
|
+
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
|
545
|
+
pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
|
|
546
|
+
pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
|
|
547
|
+
pqFineCentroids2 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
|
|
548
|
+
pqFineCentroids2 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
|
|
549
|
+
weight2,
|
|
550
|
+
existingValue);
|
|
551
|
+
|
|
552
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
553
|
+
|
|
554
|
+
// next
|
|
555
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
556
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
557
|
+
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
|
558
|
+
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
|
|
559
|
+
outputAccum);
|
|
560
|
+
|
|
561
|
+
// clang-format on
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
// Process 3 samples.
|
|
565
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
566
|
+
// codes.
|
|
567
|
+
static void accum(
|
|
568
|
+
const float* const __restrict pqCoarseCentroids,
|
|
569
|
+
const float* const __restrict pqFineCentroids,
|
|
570
|
+
const uint8_t* const __restrict code0,
|
|
571
|
+
const float weight0,
|
|
572
|
+
const uint8_t* const __restrict code1,
|
|
573
|
+
const float weight1,
|
|
574
|
+
const uint8_t* const __restrict code2,
|
|
575
|
+
const float weight2,
|
|
576
|
+
float* const __restrict outputAccum) {
|
|
577
|
+
// coarse quantizer
|
|
578
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
579
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
580
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
581
|
+
|
|
582
|
+
// fine quantizer
|
|
583
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
584
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
585
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
586
|
+
|
|
587
|
+
// clang-format off
|
|
588
|
+
|
|
589
|
+
// process chunks, 4 float
|
|
590
|
+
// but 8 floats per loop
|
|
591
|
+
|
|
592
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
593
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
594
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
595
|
+
const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
|
|
596
|
+
const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
|
|
597
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
598
|
+
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
|
599
|
+
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
|
600
|
+
const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
|
|
601
|
+
const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
|
|
602
|
+
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
|
603
|
+
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
|
|
604
|
+
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
|
|
605
|
+
const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
|
|
606
|
+
const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
|
|
607
|
+
|
|
608
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
609
|
+
|
|
610
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
611
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
612
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
613
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
614
|
+
pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
|
|
615
|
+
pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
|
|
616
|
+
weight0,
|
|
617
|
+
existingValue);
|
|
618
|
+
|
|
619
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
620
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
621
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
|
622
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
|
623
|
+
pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
|
|
624
|
+
pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
|
|
625
|
+
weight1,
|
|
626
|
+
existingValue);
|
|
627
|
+
|
|
628
|
+
existingValue = elementaryBlock2x4bAccum(
|
|
629
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
|
630
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
|
|
631
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
|
|
632
|
+
pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
|
|
633
|
+
pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
|
|
634
|
+
weight2,
|
|
635
|
+
existingValue);
|
|
636
|
+
|
|
637
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
638
|
+
|
|
639
|
+
// next
|
|
640
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
641
|
+
pqCoarseCentroids, pqFineCentroids,
|
|
642
|
+
code0, weight0,
|
|
643
|
+
code1, weight1,
|
|
644
|
+
code2, weight2,
|
|
645
|
+
outputAccum);
|
|
646
|
+
|
|
647
|
+
// clang-format on
|
|
648
|
+
}
|
|
649
|
+
};
|
|
650
|
+
|
|
651
|
+
template <
|
|
652
|
+
intptr_t DIM,
|
|
653
|
+
intptr_t COARSE_SIZE,
|
|
654
|
+
intptr_t COARSE_BITS,
|
|
655
|
+
intptr_t FINE_BITS,
|
|
656
|
+
intptr_t CPOS,
|
|
657
|
+
bool QPOS_LEFT_GE_8,
|
|
658
|
+
bool QPOS_LEFT_GE_4>
|
|
659
|
+
struct Index2LevelDecoderImpl<
|
|
660
|
+
DIM,
|
|
661
|
+
COARSE_SIZE,
|
|
662
|
+
4,
|
|
663
|
+
COARSE_BITS,
|
|
664
|
+
FINE_BITS,
|
|
665
|
+
CPOS,
|
|
666
|
+
false,
|
|
667
|
+
true,
|
|
668
|
+
QPOS_LEFT_GE_8,
|
|
669
|
+
QPOS_LEFT_GE_4,
|
|
670
|
+
false> {
|
|
671
|
+
static constexpr intptr_t FINE_SIZE = 4;
|
|
672
|
+
|
|
673
|
+
static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
|
|
674
|
+
static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
|
|
675
|
+
static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
|
|
676
|
+
static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
|
|
677
|
+
|
|
678
|
+
static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
|
|
679
|
+
|
|
680
|
+
// coarse quantizer storage
|
|
681
|
+
static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
|
|
682
|
+
|
|
683
|
+
// coarse quantizer bytes start from 0
|
|
684
|
+
// fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
|
|
685
|
+
static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
|
|
686
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
|
|
687
|
+
N_COARSE_ELEMENTS * COARSE_BITS;
|
|
688
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
|
|
689
|
+
(N_COARSE_ELEMENTS_BITS + 7) / 8;
|
|
690
|
+
|
|
691
|
+
static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
|
|
692
|
+
|
|
693
|
+
// process 1 sample
|
|
694
|
+
static void store(
|
|
695
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
696
|
+
const float* const __restrict pqFineCentroids0,
|
|
697
|
+
const uint8_t* const __restrict code0,
|
|
698
|
+
float* const __restrict outputStore) {
|
|
699
|
+
// coarse quantizer
|
|
700
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
701
|
+
|
|
702
|
+
// fine quantizer
|
|
703
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
704
|
+
|
|
705
|
+
// clang-format off
|
|
706
|
+
|
|
707
|
+
// process chunks, 4 float
|
|
708
|
+
// but 8 floats per loop
|
|
709
|
+
|
|
710
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
711
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
712
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
713
|
+
|
|
714
|
+
const __m256 storeValue = elementaryBlock4x2b(
|
|
715
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
716
|
+
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
717
|
+
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset);
|
|
718
|
+
|
|
719
|
+
_mm256_storeu_ps(outputStore + CPOS, storeValue);
|
|
720
|
+
|
|
721
|
+
// next
|
|
722
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::store(
|
|
723
|
+
pqCoarseCentroids0, pqFineCentroids0, code0,
|
|
724
|
+
outputStore);
|
|
725
|
+
|
|
726
|
+
// clang-format on
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
// process 1 sample
|
|
730
|
+
static void accum(
|
|
731
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
732
|
+
const float* const __restrict pqFineCentroids0,
|
|
733
|
+
const uint8_t* const __restrict code0,
|
|
734
|
+
const float weight0,
|
|
735
|
+
float* const __restrict outputAccum) {
|
|
736
|
+
// coarse quantizer
|
|
737
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
738
|
+
|
|
739
|
+
// fine quantizer
|
|
740
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
741
|
+
|
|
742
|
+
// clang-format off
|
|
743
|
+
|
|
744
|
+
// process chunks, 4 float
|
|
745
|
+
// but 8 floats per loop
|
|
746
|
+
|
|
747
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
748
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
749
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
750
|
+
|
|
751
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
752
|
+
|
|
753
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
754
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
755
|
+
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
756
|
+
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
757
|
+
weight0,
|
|
758
|
+
existingValue);
|
|
759
|
+
|
|
760
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
761
|
+
|
|
762
|
+
// next
|
|
763
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
764
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
765
|
+
outputAccum);
|
|
766
|
+
|
|
767
|
+
// clang-format on
|
|
768
|
+
}
|
|
769
|
+
|
|
770
|
+
// Process 2 samples.
|
|
771
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
772
|
+
// table.
|
|
773
|
+
static void accum(
|
|
774
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
775
|
+
const float* const __restrict pqFineCentroids0,
|
|
776
|
+
const uint8_t* const __restrict code0,
|
|
777
|
+
const float weight0,
|
|
778
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
779
|
+
const float* const __restrict pqFineCentroids1,
|
|
780
|
+
const uint8_t* const __restrict code1,
|
|
781
|
+
const float weight1,
|
|
782
|
+
float* const __restrict outputAccum) {
|
|
783
|
+
// coarse quantizer
|
|
784
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
785
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
786
|
+
|
|
787
|
+
// fine quantizer
|
|
788
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
789
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
790
|
+
|
|
791
|
+
// clang-format off
|
|
792
|
+
|
|
793
|
+
// process chunks, 4 float
|
|
794
|
+
// but 8 floats per loop
|
|
795
|
+
|
|
796
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
797
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
798
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
799
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
800
|
+
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
|
801
|
+
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
|
802
|
+
|
|
803
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
804
|
+
|
|
805
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
806
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
807
|
+
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
808
|
+
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
809
|
+
weight0,
|
|
810
|
+
existingValue);
|
|
811
|
+
|
|
812
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
813
|
+
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
814
|
+
pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
|
815
|
+
pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
|
816
|
+
weight1,
|
|
817
|
+
existingValue);
|
|
818
|
+
|
|
819
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
820
|
+
|
|
821
|
+
// next
|
|
822
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
823
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
824
|
+
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
|
825
|
+
outputAccum);
|
|
826
|
+
|
|
827
|
+
// clang-format on
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
// Process 2 samples.
|
|
831
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
832
|
+
// codes.
|
|
833
|
+
static void accum(
|
|
834
|
+
const float* const __restrict pqCoarseCentroids,
|
|
835
|
+
const float* const __restrict pqFineCentroids,
|
|
836
|
+
const uint8_t* const __restrict code0,
|
|
837
|
+
const float weight0,
|
|
838
|
+
const uint8_t* const __restrict code1,
|
|
839
|
+
const float weight1,
|
|
840
|
+
float* const __restrict outputAccum) {
|
|
841
|
+
// coarse quantizer
|
|
842
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
843
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
844
|
+
|
|
845
|
+
// fine quantizer
|
|
846
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
847
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
848
|
+
|
|
849
|
+
// clang-format off
|
|
850
|
+
|
|
851
|
+
// process chunks, 4 float
|
|
852
|
+
// but 8 floats per loop
|
|
853
|
+
|
|
854
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
855
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
856
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
857
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
858
|
+
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
|
859
|
+
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
|
860
|
+
|
|
861
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
862
|
+
|
|
863
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
864
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
865
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
866
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
867
|
+
weight0,
|
|
868
|
+
existingValue);
|
|
869
|
+
|
|
870
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
871
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
872
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
|
873
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
|
874
|
+
weight1,
|
|
875
|
+
existingValue);
|
|
876
|
+
|
|
877
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
878
|
+
|
|
879
|
+
// next
|
|
880
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
881
|
+
pqCoarseCentroids, pqFineCentroids,
|
|
882
|
+
code0, weight0,
|
|
883
|
+
code1, weight1,
|
|
884
|
+
outputAccum);
|
|
885
|
+
|
|
886
|
+
// clang-format on
|
|
887
|
+
}
|
|
888
|
+
|
|
889
|
+
// Process 3 samples.
|
|
890
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
891
|
+
// table.
|
|
892
|
+
static void accum(
|
|
893
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
894
|
+
const float* const __restrict pqFineCentroids0,
|
|
895
|
+
const uint8_t* const __restrict code0,
|
|
896
|
+
const float weight0,
|
|
897
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
898
|
+
const float* const __restrict pqFineCentroids1,
|
|
899
|
+
const uint8_t* const __restrict code1,
|
|
900
|
+
const float weight1,
|
|
901
|
+
const float* const __restrict pqCoarseCentroids2,
|
|
902
|
+
const float* const __restrict pqFineCentroids2,
|
|
903
|
+
const uint8_t* const __restrict code2,
|
|
904
|
+
const float weight2,
|
|
905
|
+
float* const __restrict outputAccum) {
|
|
906
|
+
// coarse quantizer
|
|
907
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
908
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
909
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
910
|
+
|
|
911
|
+
// fine quantizer
|
|
912
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
913
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
914
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
915
|
+
|
|
916
|
+
// clang-format off
|
|
917
|
+
|
|
918
|
+
// process chunks, 4 float
|
|
919
|
+
// but 8 floats per loop
|
|
920
|
+
|
|
921
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
922
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
923
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
924
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
925
|
+
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
|
926
|
+
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
|
927
|
+
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
|
928
|
+
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
|
|
929
|
+
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
|
|
930
|
+
|
|
931
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
932
|
+
|
|
933
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
934
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
935
|
+
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
936
|
+
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
937
|
+
weight0,
|
|
938
|
+
existingValue);
|
|
939
|
+
|
|
940
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
941
|
+
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
942
|
+
pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
|
943
|
+
pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
|
944
|
+
weight1,
|
|
945
|
+
existingValue);
|
|
946
|
+
|
|
947
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
948
|
+
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
|
949
|
+
pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
|
|
950
|
+
pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
|
|
951
|
+
weight2,
|
|
952
|
+
existingValue);
|
|
953
|
+
|
|
954
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
955
|
+
|
|
956
|
+
// next
|
|
957
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
958
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
959
|
+
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
|
960
|
+
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
|
|
961
|
+
outputAccum);
|
|
962
|
+
|
|
963
|
+
// clang-format on
|
|
964
|
+
}
|
|
965
|
+
|
|
966
|
+
// Process 3 samples.
|
|
967
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
968
|
+
// codes.
|
|
969
|
+
static void accum(
|
|
970
|
+
const float* const __restrict pqCoarseCentroids,
|
|
971
|
+
const float* const __restrict pqFineCentroids,
|
|
972
|
+
const uint8_t* const __restrict code0,
|
|
973
|
+
const float weight0,
|
|
974
|
+
const uint8_t* const __restrict code1,
|
|
975
|
+
const float weight1,
|
|
976
|
+
const uint8_t* const __restrict code2,
|
|
977
|
+
const float weight2,
|
|
978
|
+
float* const __restrict outputAccum) {
|
|
979
|
+
// coarse quantizer
|
|
980
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
981
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
982
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
983
|
+
|
|
984
|
+
// fine quantizer
|
|
985
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
986
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
987
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
988
|
+
|
|
989
|
+
// clang-format off
|
|
990
|
+
|
|
991
|
+
// process chunks, 4 float
|
|
992
|
+
// but 8 floats per loop
|
|
993
|
+
|
|
994
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
995
|
+
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
|
996
|
+
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
|
997
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
998
|
+
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
|
999
|
+
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
|
1000
|
+
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
|
1001
|
+
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
|
|
1002
|
+
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
|
|
1003
|
+
|
|
1004
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
1005
|
+
|
|
1006
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
1007
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1008
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
|
1009
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
|
1010
|
+
weight0,
|
|
1011
|
+
existingValue);
|
|
1012
|
+
|
|
1013
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
1014
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
1015
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
|
1016
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
|
1017
|
+
weight1,
|
|
1018
|
+
existingValue);
|
|
1019
|
+
|
|
1020
|
+
existingValue = elementaryBlock4x2bAccum(
|
|
1021
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
|
1022
|
+
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
|
|
1023
|
+
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
|
|
1024
|
+
weight2,
|
|
1025
|
+
existingValue);
|
|
1026
|
+
|
|
1027
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1028
|
+
|
|
1029
|
+
// next
|
|
1030
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
1031
|
+
pqCoarseCentroids, pqFineCentroids,
|
|
1032
|
+
code0, weight0,
|
|
1033
|
+
code1, weight1,
|
|
1034
|
+
code2, weight2,
|
|
1035
|
+
outputAccum);
|
|
1036
|
+
|
|
1037
|
+
// clang-format on
|
|
1038
|
+
}
|
|
1039
|
+
};
|
|
1040
|
+
|
|
1041
|
+
template <
|
|
1042
|
+
intptr_t DIM,
|
|
1043
|
+
intptr_t COARSE_SIZE,
|
|
1044
|
+
intptr_t FINE_SIZE,
|
|
1045
|
+
intptr_t COARSE_BITS,
|
|
1046
|
+
intptr_t FINE_BITS,
|
|
1047
|
+
intptr_t CPOS>
|
|
1048
|
+
struct Index2LevelDecoderImpl<
|
|
1049
|
+
DIM,
|
|
1050
|
+
COARSE_SIZE,
|
|
1051
|
+
FINE_SIZE,
|
|
1052
|
+
COARSE_BITS,
|
|
1053
|
+
FINE_BITS,
|
|
1054
|
+
CPOS,
|
|
1055
|
+
false,
|
|
1056
|
+
false,
|
|
1057
|
+
true,
|
|
1058
|
+
true,
|
|
1059
|
+
false> {
|
|
1060
|
+
static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
|
|
1061
|
+
static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
|
|
1062
|
+
static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
|
|
1063
|
+
static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
|
|
1064
|
+
|
|
1065
|
+
static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
|
|
1066
|
+
|
|
1067
|
+
// coarse quantizer storage
|
|
1068
|
+
static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
|
|
1069
|
+
|
|
1070
|
+
// coarse quantizer bytes start from 0
|
|
1071
|
+
// fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
|
|
1072
|
+
static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
|
|
1073
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
|
|
1074
|
+
N_COARSE_ELEMENTS * COARSE_BITS;
|
|
1075
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
|
|
1076
|
+
(N_COARSE_ELEMENTS_BITS + 7) / 8;
|
|
1077
|
+
|
|
1078
|
+
static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
|
|
1079
|
+
|
|
1080
|
+
// process 1 sample
|
|
1081
|
+
static void store(
|
|
1082
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1083
|
+
const float* const __restrict pqFineCentroids0,
|
|
1084
|
+
const uint8_t* const __restrict code0,
|
|
1085
|
+
float* const __restrict outputStore) {
|
|
1086
|
+
// coarse quantizer
|
|
1087
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1088
|
+
|
|
1089
|
+
// fine quantizer
|
|
1090
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1091
|
+
|
|
1092
|
+
// clang-format off
|
|
1093
|
+
|
|
1094
|
+
// process chunks, 8 float
|
|
1095
|
+
|
|
1096
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1097
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1098
|
+
|
|
1099
|
+
const __m256 storeValue = elementaryBlock8x1b(
|
|
1100
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1101
|
+
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
|
|
1102
|
+
|
|
1103
|
+
_mm256_storeu_ps(outputStore + CPOS, storeValue);
|
|
1104
|
+
|
|
1105
|
+
// next
|
|
1106
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::store(
|
|
1107
|
+
pqCoarseCentroids0, pqFineCentroids0, code0,
|
|
1108
|
+
outputStore);
|
|
1109
|
+
|
|
1110
|
+
// clang-format on
|
|
1111
|
+
}
|
|
1112
|
+
|
|
1113
|
+
// process 1 sample
|
|
1114
|
+
static void accum(
|
|
1115
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1116
|
+
const float* const __restrict pqFineCentroids0,
|
|
1117
|
+
const uint8_t* const __restrict code0,
|
|
1118
|
+
const float weight0,
|
|
1119
|
+
float* const __restrict outputAccum) {
|
|
1120
|
+
// coarse quantizer
|
|
1121
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1122
|
+
|
|
1123
|
+
// fine quantizer
|
|
1124
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1125
|
+
|
|
1126
|
+
// clang-format off
|
|
1127
|
+
|
|
1128
|
+
// process chunks, 8 float
|
|
1129
|
+
|
|
1130
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1131
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1132
|
+
|
|
1133
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
1134
|
+
|
|
1135
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1136
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1137
|
+
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1138
|
+
weight0,
|
|
1139
|
+
existingValue);
|
|
1140
|
+
|
|
1141
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1142
|
+
|
|
1143
|
+
// next
|
|
1144
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
1145
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
1146
|
+
outputAccum);
|
|
1147
|
+
|
|
1148
|
+
// clang-format on
|
|
1149
|
+
}
|
|
1150
|
+
|
|
1151
|
+
// Process 2 samples.
|
|
1152
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
1153
|
+
// table.
|
|
1154
|
+
static void accum(
|
|
1155
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1156
|
+
const float* const __restrict pqFineCentroids0,
|
|
1157
|
+
const uint8_t* const __restrict code0,
|
|
1158
|
+
const float weight0,
|
|
1159
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
1160
|
+
const float* const __restrict pqFineCentroids1,
|
|
1161
|
+
const uint8_t* const __restrict code1,
|
|
1162
|
+
const float weight1,
|
|
1163
|
+
float* const __restrict outputAccum) {
|
|
1164
|
+
// coarse quantizer
|
|
1165
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1166
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
1167
|
+
|
|
1168
|
+
// fine quantizer
|
|
1169
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1170
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
1171
|
+
|
|
1172
|
+
// clang-format off
|
|
1173
|
+
|
|
1174
|
+
// process chunks, 8 float
|
|
1175
|
+
|
|
1176
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1177
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1178
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
1179
|
+
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
|
1180
|
+
|
|
1181
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
1182
|
+
|
|
1183
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1184
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1185
|
+
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1186
|
+
weight0,
|
|
1187
|
+
existingValue);
|
|
1188
|
+
|
|
1189
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1190
|
+
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
1191
|
+
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
|
1192
|
+
weight1,
|
|
1193
|
+
existingValue);
|
|
1194
|
+
|
|
1195
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1196
|
+
|
|
1197
|
+
// next
|
|
1198
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
1199
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
1200
|
+
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
|
1201
|
+
outputAccum);
|
|
1202
|
+
|
|
1203
|
+
// clang-format on
|
|
1204
|
+
}
|
|
1205
|
+
|
|
1206
|
+
// Process 2 samples.
|
|
1207
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
1208
|
+
// codes.
|
|
1209
|
+
static void accum(
|
|
1210
|
+
const float* const __restrict pqCoarseCentroids,
|
|
1211
|
+
const float* const __restrict pqFineCentroids,
|
|
1212
|
+
const uint8_t* const __restrict code0,
|
|
1213
|
+
const float weight0,
|
|
1214
|
+
const uint8_t* const __restrict code1,
|
|
1215
|
+
const float weight1,
|
|
1216
|
+
float* const __restrict outputAccum) {
|
|
1217
|
+
// coarse quantizer
|
|
1218
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1219
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
1220
|
+
|
|
1221
|
+
// fine quantizer
|
|
1222
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1223
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
1224
|
+
|
|
1225
|
+
// clang-format off
|
|
1226
|
+
|
|
1227
|
+
// process chunks, 8 float
|
|
1228
|
+
|
|
1229
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1230
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1231
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
1232
|
+
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
|
1233
|
+
|
|
1234
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
1235
|
+
|
|
1236
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1237
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1238
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1239
|
+
weight0,
|
|
1240
|
+
existingValue);
|
|
1241
|
+
|
|
1242
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1243
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
1244
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
|
1245
|
+
weight1,
|
|
1246
|
+
existingValue);
|
|
1247
|
+
|
|
1248
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1249
|
+
|
|
1250
|
+
// next
|
|
1251
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
1252
|
+
pqCoarseCentroids, pqFineCentroids,
|
|
1253
|
+
code0, weight0,
|
|
1254
|
+
code1, weight1,
|
|
1255
|
+
outputAccum);
|
|
1256
|
+
|
|
1257
|
+
// clang-format on
|
|
1258
|
+
}
|
|
1259
|
+
|
|
1260
|
+
// Process 3 samples.
|
|
1261
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
1262
|
+
// table.
|
|
1263
|
+
static void accum(
|
|
1264
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1265
|
+
const float* const __restrict pqFineCentroids0,
|
|
1266
|
+
const uint8_t* const __restrict code0,
|
|
1267
|
+
const float weight0,
|
|
1268
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
1269
|
+
const float* const __restrict pqFineCentroids1,
|
|
1270
|
+
const uint8_t* const __restrict code1,
|
|
1271
|
+
const float weight1,
|
|
1272
|
+
const float* const __restrict pqCoarseCentroids2,
|
|
1273
|
+
const float* const __restrict pqFineCentroids2,
|
|
1274
|
+
const uint8_t* const __restrict code2,
|
|
1275
|
+
const float weight2,
|
|
1276
|
+
float* const __restrict outputAccum) {
|
|
1277
|
+
// coarse quantizer
|
|
1278
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1279
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
1280
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
1281
|
+
|
|
1282
|
+
// fine quantizer
|
|
1283
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1284
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
1285
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
1286
|
+
|
|
1287
|
+
// clang-format off
|
|
1288
|
+
|
|
1289
|
+
// process chunks, 8 float
|
|
1290
|
+
|
|
1291
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1292
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1293
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
1294
|
+
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
|
1295
|
+
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
|
1296
|
+
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
|
1297
|
+
|
|
1298
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
1299
|
+
|
|
1300
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1301
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1302
|
+
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1303
|
+
weight0,
|
|
1304
|
+
existingValue);
|
|
1305
|
+
|
|
1306
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1307
|
+
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
1308
|
+
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
|
1309
|
+
weight1,
|
|
1310
|
+
existingValue);
|
|
1311
|
+
|
|
1312
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1313
|
+
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
|
1314
|
+
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
|
1315
|
+
weight2,
|
|
1316
|
+
existingValue);
|
|
1317
|
+
|
|
1318
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1319
|
+
|
|
1320
|
+
// next
|
|
1321
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
1322
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
1323
|
+
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
|
1324
|
+
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
|
|
1325
|
+
outputAccum);
|
|
1326
|
+
|
|
1327
|
+
// clang-format on
|
|
1328
|
+
}
|
|
1329
|
+
|
|
1330
|
+
// Process 3 samples.
|
|
1331
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
1332
|
+
// codes.
|
|
1333
|
+
static void accum(
|
|
1334
|
+
const float* const __restrict pqCoarseCentroids,
|
|
1335
|
+
const float* const __restrict pqFineCentroids,
|
|
1336
|
+
const uint8_t* const __restrict code0,
|
|
1337
|
+
const float weight0,
|
|
1338
|
+
const uint8_t* const __restrict code1,
|
|
1339
|
+
const float weight1,
|
|
1340
|
+
const uint8_t* const __restrict code2,
|
|
1341
|
+
const float weight2,
|
|
1342
|
+
float* const __restrict outputAccum) {
|
|
1343
|
+
// coarse quantizer
|
|
1344
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1345
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
1346
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
1347
|
+
|
|
1348
|
+
// fine quantizer
|
|
1349
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1350
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
1351
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
1352
|
+
|
|
1353
|
+
// clang-format off
|
|
1354
|
+
|
|
1355
|
+
// process chunks, 8 float
|
|
1356
|
+
|
|
1357
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1358
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1359
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
1360
|
+
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
|
1361
|
+
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
|
1362
|
+
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
|
1363
|
+
|
|
1364
|
+
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
|
1365
|
+
|
|
1366
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1367
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1368
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1369
|
+
weight0,
|
|
1370
|
+
existingValue);
|
|
1371
|
+
|
|
1372
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1373
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
1374
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
|
1375
|
+
weight1,
|
|
1376
|
+
existingValue);
|
|
1377
|
+
|
|
1378
|
+
existingValue = elementaryBlock8x1bAccum(
|
|
1379
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
|
1380
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
|
1381
|
+
weight2,
|
|
1382
|
+
existingValue);
|
|
1383
|
+
|
|
1384
|
+
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1385
|
+
|
|
1386
|
+
// next
|
|
1387
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
|
1388
|
+
pqCoarseCentroids, pqFineCentroids,
|
|
1389
|
+
code0, weight0,
|
|
1390
|
+
code1, weight1,
|
|
1391
|
+
code2, weight2,
|
|
1392
|
+
outputAccum);
|
|
1393
|
+
|
|
1394
|
+
// clang-format on
|
|
1395
|
+
}
|
|
1396
|
+
};
|
|
1397
|
+
|
|
1398
|
+
template <
|
|
1399
|
+
intptr_t DIM,
|
|
1400
|
+
intptr_t COARSE_SIZE,
|
|
1401
|
+
intptr_t FINE_SIZE,
|
|
1402
|
+
intptr_t COARSE_BITS,
|
|
1403
|
+
intptr_t FINE_BITS,
|
|
1404
|
+
intptr_t CPOS>
|
|
1405
|
+
struct Index2LevelDecoderImpl<
|
|
1406
|
+
DIM,
|
|
1407
|
+
COARSE_SIZE,
|
|
1408
|
+
FINE_SIZE,
|
|
1409
|
+
COARSE_BITS,
|
|
1410
|
+
FINE_BITS,
|
|
1411
|
+
CPOS,
|
|
1412
|
+
false,
|
|
1413
|
+
false,
|
|
1414
|
+
false,
|
|
1415
|
+
true,
|
|
1416
|
+
false> {
|
|
1417
|
+
static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
|
|
1418
|
+
static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
|
|
1419
|
+
static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
|
|
1420
|
+
static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
|
|
1421
|
+
|
|
1422
|
+
static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
|
|
1423
|
+
|
|
1424
|
+
// coarse quantizer storage
|
|
1425
|
+
static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
|
|
1426
|
+
|
|
1427
|
+
// coarse quantizer bytes start from 0
|
|
1428
|
+
// fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
|
|
1429
|
+
static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
|
|
1430
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
|
|
1431
|
+
N_COARSE_ELEMENTS * COARSE_BITS;
|
|
1432
|
+
static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
|
|
1433
|
+
(N_COARSE_ELEMENTS_BITS + 7) / 8;
|
|
1434
|
+
|
|
1435
|
+
static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
|
|
1436
|
+
|
|
1437
|
+
// process 1 sample
|
|
1438
|
+
static void store(
|
|
1439
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1440
|
+
const float* const __restrict pqFineCentroids0,
|
|
1441
|
+
const uint8_t* const __restrict code0,
|
|
1442
|
+
float* const __restrict outputStore) {
|
|
1443
|
+
// coarse quantizer
|
|
1444
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1445
|
+
|
|
1446
|
+
// fine quantizer
|
|
1447
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1448
|
+
|
|
1449
|
+
// clang-format off
|
|
1450
|
+
|
|
1451
|
+
// process chunks, 4 float
|
|
1452
|
+
|
|
1453
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1454
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1455
|
+
|
|
1456
|
+
const __m128 storeValue = elementaryBlock4x1b(
|
|
1457
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1458
|
+
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
|
|
1459
|
+
|
|
1460
|
+
_mm_storeu_ps(outputStore + CPOS, storeValue);
|
|
1461
|
+
|
|
1462
|
+
// next
|
|
1463
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::store(
|
|
1464
|
+
pqCoarseCentroids0, pqFineCentroids0, code0,
|
|
1465
|
+
outputStore);
|
|
1466
|
+
|
|
1467
|
+
// clang-format on
|
|
1468
|
+
}
|
|
1469
|
+
|
|
1470
|
+
// process 1 sample
|
|
1471
|
+
static void accum(
|
|
1472
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1473
|
+
const float* const __restrict pqFineCentroids0,
|
|
1474
|
+
const uint8_t* const __restrict code0,
|
|
1475
|
+
const float weight0,
|
|
1476
|
+
float* const __restrict outputAccum) {
|
|
1477
|
+
// coarse quantizer
|
|
1478
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1479
|
+
|
|
1480
|
+
// fine quantizer
|
|
1481
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1482
|
+
|
|
1483
|
+
// clang-format off
|
|
1484
|
+
|
|
1485
|
+
// process chunks, 4 float
|
|
1486
|
+
|
|
1487
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1488
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS,fineCentroidIdx>::get(fine0);
|
|
1489
|
+
|
|
1490
|
+
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
|
1491
|
+
|
|
1492
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1493
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1494
|
+
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1495
|
+
weight0,
|
|
1496
|
+
existingValue);
|
|
1497
|
+
|
|
1498
|
+
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1499
|
+
|
|
1500
|
+
// next
|
|
1501
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
|
|
1502
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
1503
|
+
outputAccum);
|
|
1504
|
+
|
|
1505
|
+
// clang-format on
|
|
1506
|
+
}
|
|
1507
|
+
|
|
1508
|
+
// Process 2 samples.
|
|
1509
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
1510
|
+
// table.
|
|
1511
|
+
static void accum(
|
|
1512
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1513
|
+
const float* const __restrict pqFineCentroids0,
|
|
1514
|
+
const uint8_t* const __restrict code0,
|
|
1515
|
+
const float weight0,
|
|
1516
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
1517
|
+
const float* const __restrict pqFineCentroids1,
|
|
1518
|
+
const uint8_t* const __restrict code1,
|
|
1519
|
+
const float weight1,
|
|
1520
|
+
float* const __restrict outputAccum) {
|
|
1521
|
+
// coarse quantizer
|
|
1522
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1523
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
1524
|
+
|
|
1525
|
+
// fine quantizer
|
|
1526
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1527
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
1528
|
+
|
|
1529
|
+
// clang-format off
|
|
1530
|
+
|
|
1531
|
+
// process chunks, 4 float
|
|
1532
|
+
|
|
1533
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1534
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1535
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
1536
|
+
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
|
1537
|
+
|
|
1538
|
+
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
|
1539
|
+
|
|
1540
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1541
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1542
|
+
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1543
|
+
weight0,
|
|
1544
|
+
existingValue);
|
|
1545
|
+
|
|
1546
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1547
|
+
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
1548
|
+
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
|
1549
|
+
weight1,
|
|
1550
|
+
existingValue);
|
|
1551
|
+
|
|
1552
|
+
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1553
|
+
|
|
1554
|
+
// next
|
|
1555
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
|
|
1556
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
1557
|
+
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
|
1558
|
+
outputAccum);
|
|
1559
|
+
|
|
1560
|
+
// clang-format on
|
|
1561
|
+
}
|
|
1562
|
+
|
|
1563
|
+
// Process 2 samples.
|
|
1564
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
1565
|
+
// codes.
|
|
1566
|
+
static void accum(
|
|
1567
|
+
const float* const __restrict pqCoarseCentroids,
|
|
1568
|
+
const float* const __restrict pqFineCentroids,
|
|
1569
|
+
const uint8_t* const __restrict code0,
|
|
1570
|
+
const float weight0,
|
|
1571
|
+
const uint8_t* const __restrict code1,
|
|
1572
|
+
const float weight1,
|
|
1573
|
+
float* const __restrict outputAccum) {
|
|
1574
|
+
// coarse quantizer
|
|
1575
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1576
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
1577
|
+
|
|
1578
|
+
// fine quantizer
|
|
1579
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1580
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
1581
|
+
|
|
1582
|
+
// clang-format off
|
|
1583
|
+
|
|
1584
|
+
// process chunks, 4 float
|
|
1585
|
+
|
|
1586
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1587
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1588
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
1589
|
+
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
|
1590
|
+
|
|
1591
|
+
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
|
1592
|
+
|
|
1593
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1594
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1595
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1596
|
+
weight0,
|
|
1597
|
+
existingValue);
|
|
1598
|
+
|
|
1599
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1600
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
1601
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
|
1602
|
+
weight1,
|
|
1603
|
+
existingValue);
|
|
1604
|
+
|
|
1605
|
+
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1606
|
+
|
|
1607
|
+
// next
|
|
1608
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
|
|
1609
|
+
pqCoarseCentroids, pqFineCentroids,
|
|
1610
|
+
code0, weight0,
|
|
1611
|
+
code1, weight1,
|
|
1612
|
+
outputAccum);
|
|
1613
|
+
|
|
1614
|
+
// clang-format on
|
|
1615
|
+
}
|
|
1616
|
+
|
|
1617
|
+
// Process 3 samples.
|
|
1618
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
1619
|
+
// table.
|
|
1620
|
+
static void accum(
|
|
1621
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1622
|
+
const float* const __restrict pqFineCentroids0,
|
|
1623
|
+
const uint8_t* const __restrict code0,
|
|
1624
|
+
const float weight0,
|
|
1625
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
1626
|
+
const float* const __restrict pqFineCentroids1,
|
|
1627
|
+
const uint8_t* const __restrict code1,
|
|
1628
|
+
const float weight1,
|
|
1629
|
+
const float* const __restrict pqCoarseCentroids2,
|
|
1630
|
+
const float* const __restrict pqFineCentroids2,
|
|
1631
|
+
const uint8_t* const __restrict code2,
|
|
1632
|
+
const float weight2,
|
|
1633
|
+
float* const __restrict outputAccum) {
|
|
1634
|
+
// coarse quantizer
|
|
1635
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1636
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
1637
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
1638
|
+
|
|
1639
|
+
// fine quantizer
|
|
1640
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1641
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
1642
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
1643
|
+
|
|
1644
|
+
// clang-format off
|
|
1645
|
+
|
|
1646
|
+
// process chunks, 4 float
|
|
1647
|
+
|
|
1648
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1649
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1650
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
1651
|
+
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
|
1652
|
+
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
|
1653
|
+
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
|
1654
|
+
|
|
1655
|
+
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
|
1656
|
+
|
|
1657
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1658
|
+
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1659
|
+
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1660
|
+
weight0,
|
|
1661
|
+
existingValue);
|
|
1662
|
+
|
|
1663
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1664
|
+
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
1665
|
+
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
|
1666
|
+
weight1,
|
|
1667
|
+
existingValue);
|
|
1668
|
+
|
|
1669
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1670
|
+
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
|
1671
|
+
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
|
1672
|
+
weight2,
|
|
1673
|
+
existingValue);
|
|
1674
|
+
|
|
1675
|
+
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1676
|
+
|
|
1677
|
+
// next
|
|
1678
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
|
|
1679
|
+
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
|
1680
|
+
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
|
1681
|
+
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
|
|
1682
|
+
outputAccum);
|
|
1683
|
+
|
|
1684
|
+
// clang-format on
|
|
1685
|
+
}
|
|
1686
|
+
|
|
1687
|
+
// Process 3 samples.
|
|
1688
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
1689
|
+
// codes.
|
|
1690
|
+
static void accum(
|
|
1691
|
+
const float* const __restrict pqCoarseCentroids,
|
|
1692
|
+
const float* const __restrict pqFineCentroids,
|
|
1693
|
+
const uint8_t* const __restrict code0,
|
|
1694
|
+
const float weight0,
|
|
1695
|
+
const uint8_t* const __restrict code1,
|
|
1696
|
+
const float weight1,
|
|
1697
|
+
const uint8_t* const __restrict code2,
|
|
1698
|
+
const float weight2,
|
|
1699
|
+
float* const __restrict outputAccum) {
|
|
1700
|
+
// coarse quantizer
|
|
1701
|
+
const uint8_t* const __restrict coarse0 = code0;
|
|
1702
|
+
const uint8_t* const __restrict coarse1 = code1;
|
|
1703
|
+
const uint8_t* const __restrict coarse2 = code2;
|
|
1704
|
+
|
|
1705
|
+
// fine quantizer
|
|
1706
|
+
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
|
1707
|
+
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
|
1708
|
+
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
|
1709
|
+
|
|
1710
|
+
// clang-format off
|
|
1711
|
+
|
|
1712
|
+
// process chunks, 4 float
|
|
1713
|
+
|
|
1714
|
+
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
|
1715
|
+
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
|
1716
|
+
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
|
1717
|
+
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
|
1718
|
+
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
|
1719
|
+
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
|
1720
|
+
|
|
1721
|
+
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
|
1722
|
+
|
|
1723
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1724
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
|
1725
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
|
1726
|
+
weight0,
|
|
1727
|
+
existingValue);
|
|
1728
|
+
|
|
1729
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1730
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
|
1731
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
|
1732
|
+
weight1,
|
|
1733
|
+
existingValue);
|
|
1734
|
+
|
|
1735
|
+
existingValue = elementaryBlock4x1bAccum(
|
|
1736
|
+
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
|
1737
|
+
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
|
1738
|
+
weight2,
|
|
1739
|
+
existingValue);
|
|
1740
|
+
|
|
1741
|
+
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
|
1742
|
+
|
|
1743
|
+
// next
|
|
1744
|
+
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
|
|
1745
|
+
pqCoarseCentroids, pqFineCentroids,
|
|
1746
|
+
code0, weight0,
|
|
1747
|
+
code1, weight1,
|
|
1748
|
+
code2, weight2,
|
|
1749
|
+
outputAccum);
|
|
1750
|
+
|
|
1751
|
+
// clang-format on
|
|
1752
|
+
}
|
|
1753
|
+
};
|
|
1754
|
+
|
|
1755
|
+
// This partial specialization is expected to do nothing.
|
|
1756
|
+
template <
|
|
1757
|
+
intptr_t DIM,
|
|
1758
|
+
intptr_t COARSE_SIZE,
|
|
1759
|
+
intptr_t FINE_SIZE,
|
|
1760
|
+
intptr_t COARSE_BITS,
|
|
1761
|
+
intptr_t FINE_BITS,
|
|
1762
|
+
bool FINE_SIZE_EQ_2,
|
|
1763
|
+
bool FINE_SIZE_EQ_4,
|
|
1764
|
+
bool QPOS_LEFT_GE_8,
|
|
1765
|
+
bool QPOS_LEFT_GE_4>
|
|
1766
|
+
struct Index2LevelDecoderImpl<
|
|
1767
|
+
DIM,
|
|
1768
|
+
COARSE_SIZE,
|
|
1769
|
+
FINE_SIZE,
|
|
1770
|
+
COARSE_BITS,
|
|
1771
|
+
FINE_BITS,
|
|
1772
|
+
DIM,
|
|
1773
|
+
FINE_SIZE_EQ_2,
|
|
1774
|
+
FINE_SIZE_EQ_4,
|
|
1775
|
+
QPOS_LEFT_GE_8,
|
|
1776
|
+
QPOS_LEFT_GE_4,
|
|
1777
|
+
true> {
|
|
1778
|
+
// clang-format off
|
|
1779
|
+
|
|
1780
|
+
// process 1 sample
|
|
1781
|
+
static void store(
|
|
1782
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1783
|
+
const float* const __restrict pqFineCentroids0,
|
|
1784
|
+
const uint8_t* const __restrict code0,
|
|
1785
|
+
float* const __restrict outputStore) {}
|
|
1786
|
+
|
|
1787
|
+
// process 1 sample
|
|
1788
|
+
static void accum(
|
|
1789
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1790
|
+
const float* const __restrict pqFineCentroids0,
|
|
1791
|
+
const uint8_t* const __restrict code0,
|
|
1792
|
+
const float weight0,
|
|
1793
|
+
float* const __restrict outputAccum) {}
|
|
1794
|
+
|
|
1795
|
+
// Process 2 samples.
|
|
1796
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids table.
|
|
1797
|
+
static void accum(
|
|
1798
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1799
|
+
const float* const __restrict pqFineCentroids0,
|
|
1800
|
+
const uint8_t* const __restrict code0,
|
|
1801
|
+
const float weight0,
|
|
1802
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
1803
|
+
const float* const __restrict pqFineCentroids1,
|
|
1804
|
+
const uint8_t* const __restrict code1,
|
|
1805
|
+
const float weight1,
|
|
1806
|
+
float* const __restrict outputAccum) {}
|
|
1807
|
+
|
|
1808
|
+
// Process 2 samples.
|
|
1809
|
+
// Coarse pq centroids table and fine pq centroids table are shared among codes.
|
|
1810
|
+
static void accum(
|
|
1811
|
+
const float* const __restrict pqCoarseCentroids,
|
|
1812
|
+
const float* const __restrict pqFineCentroids,
|
|
1813
|
+
const uint8_t* const __restrict code0,
|
|
1814
|
+
const float weight0,
|
|
1815
|
+
const uint8_t* const __restrict code1,
|
|
1816
|
+
const float weight1,
|
|
1817
|
+
float* const __restrict outputAccum) {}
|
|
1818
|
+
|
|
1819
|
+
// Process 3 samples.
|
|
1820
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids table.
|
|
1821
|
+
static void accum(
|
|
1822
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1823
|
+
const float* const __restrict pqFineCentroids0,
|
|
1824
|
+
const uint8_t* const __restrict code0,
|
|
1825
|
+
const float weight0,
|
|
1826
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
1827
|
+
const float* const __restrict pqFineCentroids1,
|
|
1828
|
+
const uint8_t* const __restrict code1,
|
|
1829
|
+
const float weight1,
|
|
1830
|
+
const float* const __restrict pqCoarseCentroids2,
|
|
1831
|
+
const float* const __restrict pqFineCentroids2,
|
|
1832
|
+
const uint8_t* const __restrict code2,
|
|
1833
|
+
const float weight2,
|
|
1834
|
+
float* const __restrict outputAccum) {}
|
|
1835
|
+
|
|
1836
|
+
// Process 3 samples.
|
|
1837
|
+
// Coarse pq centroids table and fine pq centroids table are shared among codes.
|
|
1838
|
+
static void accum(
|
|
1839
|
+
const float* const __restrict pqCoarseCentroids,
|
|
1840
|
+
const float* const __restrict pqFineCentroids,
|
|
1841
|
+
const uint8_t* const __restrict code0,
|
|
1842
|
+
const float weight0,
|
|
1843
|
+
const uint8_t* const __restrict code1,
|
|
1844
|
+
const float weight1,
|
|
1845
|
+
const uint8_t* const __restrict code2,
|
|
1846
|
+
const float weight2,
|
|
1847
|
+
float* const __restrict outputAccum) {}
|
|
1848
|
+
|
|
1849
|
+
// clang-format on
|
|
1850
|
+
};
|
|
1851
|
+
} // namespace
|
|
1852
|
+
|
|
1853
|
+
// Suitable for IVF256,PQ[1]x8
|
|
1854
|
+
// Suitable for Residual[1]x8,PQ[2]x8
|
|
1855
|
+
// Suitable for IVF[9-16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
|
|
1856
|
+
// Suitable for Residual[1]x[9-16 bit],PQ[2]x[3] (such as Residual2x9,PQ8)
|
|
1857
|
+
template <
|
|
1858
|
+
intptr_t DIM,
|
|
1859
|
+
intptr_t COARSE_SIZE,
|
|
1860
|
+
intptr_t FINE_SIZE,
|
|
1861
|
+
intptr_t COARSE_BITS = 8,
|
|
1862
|
+
intptr_t FINE_BITS = 8>
|
|
1863
|
+
struct Index2LevelDecoder {
|
|
1864
|
+
static_assert(
|
|
1865
|
+
COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 16,
|
|
1866
|
+
"Only 8, 10 or 16 bits are currently supported for COARSE_BITS");
|
|
1867
|
+
static_assert(
|
|
1868
|
+
FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16,
|
|
1869
|
+
"Only 8, 10 or 16 bits are currently supported for FINE_BITS");
|
|
1870
|
+
|
|
1871
|
+
static constexpr intptr_t dim = DIM;
|
|
1872
|
+
static constexpr intptr_t coarseSize = COARSE_SIZE;
|
|
1873
|
+
static constexpr intptr_t fineSize = FINE_SIZE;
|
|
1874
|
+
static constexpr intptr_t coarseBits = COARSE_BITS;
|
|
1875
|
+
static constexpr intptr_t fineBits = FINE_BITS;
|
|
1876
|
+
|
|
1877
|
+
// Process 1 sample.
|
|
1878
|
+
static void store(
|
|
1879
|
+
const float* const __restrict pqCoarseCentroids,
|
|
1880
|
+
const float* const __restrict pqFineCentroids,
|
|
1881
|
+
const uint8_t* const __restrict code,
|
|
1882
|
+
float* const __restrict outputStore) {
|
|
1883
|
+
Index2LevelDecoderImpl<
|
|
1884
|
+
DIM,
|
|
1885
|
+
COARSE_SIZE,
|
|
1886
|
+
FINE_SIZE,
|
|
1887
|
+
COARSE_BITS,
|
|
1888
|
+
FINE_BITS,
|
|
1889
|
+
0>::
|
|
1890
|
+
store(pqCoarseCentroids, pqFineCentroids, code, outputStore);
|
|
1891
|
+
}
|
|
1892
|
+
|
|
1893
|
+
// Process 1 sample.
|
|
1894
|
+
// Performs outputAccum += weight * decoded(code)
|
|
1895
|
+
static void accum(
|
|
1896
|
+
const float* const __restrict pqCoarseCentroids,
|
|
1897
|
+
const float* const __restrict pqFineCentroids,
|
|
1898
|
+
const uint8_t* const __restrict code,
|
|
1899
|
+
const float weight,
|
|
1900
|
+
float* const __restrict outputAccum) {
|
|
1901
|
+
Index2LevelDecoderImpl<
|
|
1902
|
+
DIM,
|
|
1903
|
+
COARSE_SIZE,
|
|
1904
|
+
FINE_SIZE,
|
|
1905
|
+
COARSE_BITS,
|
|
1906
|
+
FINE_BITS,
|
|
1907
|
+
0>::
|
|
1908
|
+
accum(pqCoarseCentroids,
|
|
1909
|
+
pqFineCentroids,
|
|
1910
|
+
code,
|
|
1911
|
+
weight,
|
|
1912
|
+
outputAccum);
|
|
1913
|
+
}
|
|
1914
|
+
|
|
1915
|
+
// Process 2 samples.
|
|
1916
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
1917
|
+
// table.
|
|
1918
|
+
//
|
|
1919
|
+
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
|
1920
|
+
// decoded(code1).
|
|
1921
|
+
static void accum(
|
|
1922
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1923
|
+
const float* const __restrict pqFineCentroids0,
|
|
1924
|
+
const uint8_t* const __restrict code0,
|
|
1925
|
+
const float weight0,
|
|
1926
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
1927
|
+
const float* const __restrict pqFineCentroids1,
|
|
1928
|
+
const uint8_t* const __restrict code1,
|
|
1929
|
+
const float weight1,
|
|
1930
|
+
float* const __restrict outputAccum) {
|
|
1931
|
+
Index2LevelDecoderImpl<
|
|
1932
|
+
DIM,
|
|
1933
|
+
COARSE_SIZE,
|
|
1934
|
+
FINE_SIZE,
|
|
1935
|
+
COARSE_BITS,
|
|
1936
|
+
FINE_BITS,
|
|
1937
|
+
0>::
|
|
1938
|
+
accum(pqCoarseCentroids0,
|
|
1939
|
+
pqFineCentroids0,
|
|
1940
|
+
code0,
|
|
1941
|
+
weight0,
|
|
1942
|
+
pqCoarseCentroids1,
|
|
1943
|
+
pqFineCentroids1,
|
|
1944
|
+
code1,
|
|
1945
|
+
weight1,
|
|
1946
|
+
outputAccum);
|
|
1947
|
+
}
|
|
1948
|
+
|
|
1949
|
+
// Process 2 samples.
|
|
1950
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
1951
|
+
// codes.
|
|
1952
|
+
//
|
|
1953
|
+
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
|
1954
|
+
// decoded(code1)
|
|
1955
|
+
static void accum(
|
|
1956
|
+
const float* const __restrict pqCoarseCentroids,
|
|
1957
|
+
const float* const __restrict pqFineCentroids,
|
|
1958
|
+
const uint8_t* const __restrict code0,
|
|
1959
|
+
const float weight0,
|
|
1960
|
+
const uint8_t* const __restrict code1,
|
|
1961
|
+
const float weight1,
|
|
1962
|
+
float* const __restrict outputAccum) {
|
|
1963
|
+
Index2LevelDecoderImpl<
|
|
1964
|
+
DIM,
|
|
1965
|
+
COARSE_SIZE,
|
|
1966
|
+
FINE_SIZE,
|
|
1967
|
+
COARSE_BITS,
|
|
1968
|
+
FINE_BITS,
|
|
1969
|
+
0>::
|
|
1970
|
+
accum(pqCoarseCentroids,
|
|
1971
|
+
pqFineCentroids,
|
|
1972
|
+
code0,
|
|
1973
|
+
weight0,
|
|
1974
|
+
code1,
|
|
1975
|
+
weight1,
|
|
1976
|
+
outputAccum);
|
|
1977
|
+
}
|
|
1978
|
+
|
|
1979
|
+
// Process 3 samples.
|
|
1980
|
+
// Each code uses its own coarse pq centroids table and fine pq centroids
|
|
1981
|
+
// table.
|
|
1982
|
+
//
|
|
1983
|
+
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
|
1984
|
+
// decoded(code1) + weight2 * decoded(code2)
|
|
1985
|
+
static void accum(
|
|
1986
|
+
const float* const __restrict pqCoarseCentroids0,
|
|
1987
|
+
const float* const __restrict pqFineCentroids0,
|
|
1988
|
+
const uint8_t* const __restrict code0,
|
|
1989
|
+
const float weight0,
|
|
1990
|
+
const float* const __restrict pqCoarseCentroids1,
|
|
1991
|
+
const float* const __restrict pqFineCentroids1,
|
|
1992
|
+
const uint8_t* const __restrict code1,
|
|
1993
|
+
const float weight1,
|
|
1994
|
+
const float* const __restrict pqCoarseCentroids2,
|
|
1995
|
+
const float* const __restrict pqFineCentroids2,
|
|
1996
|
+
const uint8_t* const __restrict code2,
|
|
1997
|
+
const float weight2,
|
|
1998
|
+
float* const __restrict outputAccum) {
|
|
1999
|
+
Index2LevelDecoderImpl<
|
|
2000
|
+
DIM,
|
|
2001
|
+
COARSE_SIZE,
|
|
2002
|
+
FINE_SIZE,
|
|
2003
|
+
COARSE_BITS,
|
|
2004
|
+
FINE_BITS,
|
|
2005
|
+
0>::
|
|
2006
|
+
accum(pqCoarseCentroids0,
|
|
2007
|
+
pqFineCentroids0,
|
|
2008
|
+
code0,
|
|
2009
|
+
weight0,
|
|
2010
|
+
pqCoarseCentroids1,
|
|
2011
|
+
pqFineCentroids1,
|
|
2012
|
+
code1,
|
|
2013
|
+
weight1,
|
|
2014
|
+
pqCoarseCentroids2,
|
|
2015
|
+
pqFineCentroids2,
|
|
2016
|
+
code2,
|
|
2017
|
+
weight2,
|
|
2018
|
+
outputAccum);
|
|
2019
|
+
}
|
|
2020
|
+
|
|
2021
|
+
// Process 3 samples.
|
|
2022
|
+
// Coarse pq centroids table and fine pq centroids table are shared among
|
|
2023
|
+
// codes.
|
|
2024
|
+
//
|
|
2025
|
+
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
|
2026
|
+
// decoded(code1) + weight2 * decoded(code2)
|
|
2027
|
+
static void accum(
|
|
2028
|
+
const float* const __restrict pqCoarseCentroids,
|
|
2029
|
+
const float* const __restrict pqFineCentroids,
|
|
2030
|
+
const uint8_t* const __restrict code0,
|
|
2031
|
+
const float weight0,
|
|
2032
|
+
const uint8_t* const __restrict code1,
|
|
2033
|
+
const float weight1,
|
|
2034
|
+
const uint8_t* const __restrict code2,
|
|
2035
|
+
const float weight2,
|
|
2036
|
+
float* const __restrict outputAccum) {
|
|
2037
|
+
Index2LevelDecoderImpl<
|
|
2038
|
+
DIM,
|
|
2039
|
+
COARSE_SIZE,
|
|
2040
|
+
FINE_SIZE,
|
|
2041
|
+
COARSE_BITS,
|
|
2042
|
+
FINE_BITS,
|
|
2043
|
+
0>::
|
|
2044
|
+
accum(pqCoarseCentroids,
|
|
2045
|
+
pqFineCentroids,
|
|
2046
|
+
code0,
|
|
2047
|
+
weight0,
|
|
2048
|
+
code1,
|
|
2049
|
+
weight1,
|
|
2050
|
+
code2,
|
|
2051
|
+
weight2,
|
|
2052
|
+
outputAccum);
|
|
2053
|
+
}
|
|
2054
|
+
};
|
|
2055
|
+
|
|
2056
|
+
} // namespace cppcontrib
|
|
2057
|
+
} // namespace faiss
|
|
2058
|
+
#endif // LEVEL2_AVX2_INL_H
|