faiss 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +23 -21
  4. data/ext/faiss/extconf.rb +11 -0
  5. data/ext/faiss/index.cpp +4 -4
  6. data/ext/faiss/index_binary.cpp +6 -6
  7. data/ext/faiss/product_quantizer.cpp +4 -4
  8. data/lib/faiss/version.rb +1 -1
  9. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  10. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  11. data/vendor/faiss/faiss/IVFlib.h +26 -2
  12. data/vendor/faiss/faiss/Index.cpp +36 -3
  13. data/vendor/faiss/faiss/Index.h +43 -6
  14. data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
  15. data/vendor/faiss/faiss/Index2Layer.h +6 -1
  16. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
  17. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  20. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  22. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  23. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  24. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  31. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  32. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  33. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  34. data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
  35. data/vendor/faiss/faiss/IndexFlat.h +7 -4
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  39. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  40. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  41. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  42. data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
  43. data/vendor/faiss/faiss/IndexIVF.h +35 -16
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
  45. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  48. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  49. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  50. data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
  51. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
  53. data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  55. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  56. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  57. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  58. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
  59. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  60. data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
  61. data/vendor/faiss/faiss/IndexLSH.h +2 -1
  62. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  63. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  66. data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
  67. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  68. data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
  69. data/vendor/faiss/faiss/IndexPQ.h +19 -5
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  73. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  74. data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
  75. data/vendor/faiss/faiss/IndexRefine.h +4 -2
  76. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  77. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  78. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  79. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  80. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
  81. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
  82. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  83. data/vendor/faiss/faiss/IndexShards.h +2 -1
  84. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  85. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  86. data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
  87. data/vendor/faiss/faiss/VectorTransform.h +22 -4
  88. data/vendor/faiss/faiss/clone_index.cpp +23 -1
  89. data/vendor/faiss/faiss/clone_index.h +3 -0
  90. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  91. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  92. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  93. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  94. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  95. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  101. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  102. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
  103. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  104. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  105. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  106. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  107. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  108. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  109. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  110. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  111. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  113. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  118. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  119. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  124. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  125. data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
  126. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  127. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  128. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
  131. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
  134. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  138. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  144. data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
  145. data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
  146. data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
  147. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  148. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  149. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  150. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  151. data/vendor/faiss/faiss/index_factory.cpp +196 -7
  152. data/vendor/faiss/faiss/index_io.h +5 -0
  153. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  154. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  155. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  156. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  157. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  158. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  159. data/vendor/faiss/faiss/utils/distances.cpp +380 -56
  160. data/vendor/faiss/faiss/utils/distances.h +113 -15
  161. data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
  162. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  163. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  164. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  165. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  166. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  167. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  168. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  169. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  170. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  171. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  172. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  173. data/vendor/faiss/faiss/utils/random.h +5 -0
  174. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  175. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  176. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  177. metadata +37 -3
@@ -0,0 +1,1618 @@
1
+ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2
+
3
+ #ifndef PQ_AVX2_INL_H
4
+ #define PQ_AVX2_INL_H
5
+
6
+ #include <immintrin.h>
7
+
8
+ #include <cstddef>
9
+ #include <cstdint>
10
+
11
+ #include <faiss/cppcontrib/detail/UintReader.h>
12
+
13
+ namespace faiss {
14
+ namespace cppcontrib {
15
+
16
+ ////////////////////////////////////////////////////////////////////////////////////
17
+ /// IndexPQDecoder
18
+ ////////////////////////////////////////////////////////////////////////////////////
19
+
20
+ namespace {
21
+
22
+ // Although the following functions are somewhat redundant, I'd like to keep the
23
+ // overall basic blocks similar to the ones from Index2LevelDecoder.
24
+ // A compiler will optimize away the redundant code.
25
+
26
+ // Processes 8 float values: gathers four 2-float pairs into one __m256.
27
+ // Returns {
28
+ // [0..1] = *fine0[0..1];
29
+ // [2..3] = *fine1[0..1];
30
+ // [4..5] = *fine2[0..1];
31
+ // [6..7] = *fine3[0..1];
32
+ // }
33
+ inline __m256 elementaryBlock2x4b(
34
+ const float* const __restrict fine0,
35
+ const float* const __restrict fine1,
36
+ const float* const __restrict fine2,
37
+ const float* const __restrict fine3) {
38
+ // load fine: each pointer is read as one 64-bit value, i.e. two adjacent floats. NOTE(review): this reads float storage through a const double* - confirm the type punning is acceptable for the supported compilers.
39
+ const __m256 fineValue = _mm256_castpd_ps(_mm256_setr_pd(
40
+ *reinterpret_cast<const double*>(fine0),
41
+ *reinterpret_cast<const double*>(fine1),
42
+ *reinterpret_cast<const double*>(fine2),
43
+ *reinterpret_cast<const double*>(fine3)));
44
+
45
+ // nothing is added in this function (it only takes fine pointers); return the gathered fine values as-is
46
+ return fineValue;
47
+ }
48
+
49
+ // Processes 8 float values: per-lane existingValue + weight * (gathered fine pairs).
50
+ // Returns {
51
+ // [0..1] = existingValue[0..1] + weight * (*fine0[0..1]);
52
+ // [2..3] = existingValue[2..3] + weight * (*fine1[0..1]);
53
+ // [4..5] = existingValue[4..5] + weight * (*fine2[0..1]);
54
+ // [6..7] = existingValue[6..7] + weight * (*fine3[0..1]);
55
+ // }
56
+ inline __m256 elementaryBlock2x4bAccum(
57
+ const float* const __restrict fine0,
58
+ const float* const __restrict fine1,
59
+ const float* const __restrict fine2,
60
+ const float* const __restrict fine3,
61
+ const float weight,
62
+ const __m256 existingValue) {
63
+ // load fine: gather the four 2-float pairs into a single 8-lane vector
64
+ const __m256 fineValue = elementaryBlock2x4b(fine0, fine1, fine2, fine3);
65
+
66
+ // broadcast the scalar weight to all 8 lanes; this operation is expected to be optimized by a compiler
67
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
68
+ // do fma: fineValue * weightAvx2 + existingValue, lane by lane
69
+ return _mm256_fmadd_ps(fineValue, weightAvx2, existingValue);
70
+ }
71
+
72
+ // Processes 4 float values: loads 4 consecutive floats starting at fine.
73
+ // Returns {
74
+ // [0..3] = *fine[0..3];
75
+ // }
76
+ inline __m128 elementaryBlock4x1b(const float* const __restrict fine) {
77
+ // load fine (unaligned load, so fine needs no particular alignment)
78
+ const __m128 fineValue = _mm_loadu_ps(fine);
79
+ return fineValue;
80
+ }
81
+
82
+ // Processes 4 float values: per-lane existingValue + weight * fine.
83
+ // Returns {
84
+ // [0..3] = existingValue[0..3] + weight * (*fine[0..3]);
85
+ // }
86
+ inline __m128 elementaryBlock4x1bAccum(
87
+ const float* const __restrict fine,
88
+ const float weight,
89
+ const __m128 existingValue) {
90
+ const __m128 fineValue = elementaryBlock4x1b(fine);
91
+
92
+ // broadcast the scalar weight to all 4 lanes; this operation is expected to be optimized by a compiler
93
+ const __m128 weightAvx = _mm_set1_ps(weight);
94
+ // do fma: fineValue * weightAvx + existingValue, lane by lane
95
+ return _mm_fmadd_ps(fineValue, weightAvx, existingValue);
96
+ }
97
+
98
+ // Processes 8 float values: two 4-float loads combined into one __m256.
99
+ // Returns {
100
+ // [0..3] = *fine0[0..3];
101
+ // [4..7] = *fine1[0..3];
102
+ // }
103
+ inline __m256 elementaryBlock4x2b(
104
+ const float* const __restrict fine0,
105
+ const float* const __restrict fine1) {
106
+ // load fine: two unaligned 4-float loads
107
+ const __m128 fineValue0 = _mm_loadu_ps(fine0);
108
+ const __m128 fineValue1 = _mm_loadu_ps(fine1);
109
+
110
+ // combine two 4-float halves into a single 8-float vector: fineValue0 goes to the low lanes [0..3], fineValue1 to the high lanes [4..7]
111
+ const __m256 combinedFineValue = _mm256_set_m128(fineValue1, fineValue0);
112
+ return combinedFineValue;
113
+ }
114
+
115
+ // Processes 8 float values: per-lane existingValue + weight * (fine0 | fine1).
116
+ // Returns {
117
+ // [0..3] = existingValue[0..3] + weight * (*fine0[0..3]);
118
+ // [4..7] = existingValue[4..7] + weight * (*fine1[0..3]);
119
+ // }
120
+ inline __m256 elementaryBlock4x2bAccum(
121
+ const float* const __restrict fine0,
122
+ const float* const __restrict fine1,
123
+ const float weight,
124
+ const __m256 existingValue) {
125
+ const __m256 fineValue = elementaryBlock4x2b(fine0, fine1);
126
+
127
+ // broadcast the scalar weight to all 8 lanes; this operation is expected to be optimized by a compiler
128
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
129
+ // do fma: fineValue * weightAvx2 + existingValue, lane by lane
130
+ return _mm256_fmadd_ps(fineValue, weightAvx2, existingValue);
131
+ }
132
+
133
+ // Processes 8 float values: loads 8 consecutive floats starting at fine.
134
+ // Returns {
135
+ // [0..7] = *fine[0..7];
136
+ // }
137
+ inline __m256 elementaryBlock8x1b(const float* const __restrict fine) {
138
+ // load fine (unaligned load, so fine needs no particular alignment)
139
+ const __m256 fineValue = _mm256_loadu_ps(fine);
140
+ return fineValue;
141
+ }
142
+
143
+ // Processes 8 float values: per-lane existingValue + weight * fine.
144
+ // Returns {
145
+ // [0..7] = existingValue[0..7] + weight * (*fine[0..7]);
146
+ // }
147
+ inline __m256 elementaryBlock8x1bAccum(
148
+ const float* const __restrict fine,
149
+ const float weight,
150
+ const __m256 existingValue) {
151
+ const __m256 fineValue = elementaryBlock8x1b(fine);
152
+
153
+ // broadcast the scalar weight to all 8 lanes; this operation is expected to be optimized by a compiler
154
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
155
+ // do fma: fineValue * weightAvx2 + existingValue, lane by lane
156
+ return _mm256_fmadd_ps(fineValue, weightAvx2, existingValue);
157
+ }
158
+
159
+ // The following code uses template-based for-loop unrolling,
160
+ // because the compiler does not do that on its own as needed.
161
+ // The idea is the following:
162
+ // template<int I, int MAX>
163
+ // struct Foo {
164
+ // static void bar() {
165
+ // doSomething(I);
166
+ // Foo<I + 1, MAX>::bar();
167
+ // }
168
+ // };
169
+ //
170
+ // template<int MAX>
171
+ // struct Foo<MAX, MAX> {
172
+ // static void bar() {}
173
+ // };
174
+ //
175
+ // Initiate the loop:
176
+ // Foo<0, MAX>::bar();
177
+
178
+ template <
179
+ intptr_t DIM, // compared against CPOS by DIM_EQ_CPOS below; presumably the total number of floats to decode - TODO confirm
180
+ intptr_t FINE_SIZE, // presumably the number of floats per fine centroid - TODO confirm
181
+ intptr_t FINE_BITS, // presumably the number of bits per fine code - TODO confirm
182
+ intptr_t CPOS, // current position; incremented at each unrolling step
183
+ bool FINE_SIZE_EQ_2 = FINE_SIZE == 2, // derived flag, available for partial specialization
184
+ bool FINE_SIZE_EQ_4 = FINE_SIZE == 4, // derived flag, available for partial specialization
185
+ bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8), // true when FINE_SIZE minus (CPOS % FINE_SIZE) is at least 8
186
+ bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4), // true when FINE_SIZE minus (CPOS % FINE_SIZE) is at least 4
187
+ bool DIM_EQ_CPOS = DIM == CPOS> // true once CPOS reaches DIM; presumably selects the terminating specialization - TODO confirm
188
+ struct IndexPQDecoderImpl; // primary template is declared only; behavior comes from partial specializations
189
+
190
+ template <
191
+ intptr_t DIM,
192
+ intptr_t FINE_BITS,
193
+ intptr_t CPOS,
194
+ bool QPOS_LEFT_GE_8,
195
+ bool QPOS_LEFT_GE_4>
196
+ struct IndexPQDecoderImpl<
197
+ DIM,
198
+ 2,
199
+ FINE_BITS,
200
+ CPOS,
201
+ true,
202
+ false,
203
+ QPOS_LEFT_GE_8,
204
+ QPOS_LEFT_GE_4,
205
+ false> {
206
+ static constexpr intptr_t FINE_SIZE = 2;
207
+
208
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
209
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
210
+
211
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
212
+
213
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
214
+
215
+ // process 1 sample
216
+ static void store(
217
+ const float* const __restrict pqFineCentroids0,
218
+ const uint8_t* const __restrict code0,
219
+ float* const __restrict outputStore) {
220
+ // fine quantizer
221
+ const uint8_t* const __restrict fine0 = code0;
222
+
223
+ // clang-format off
224
+
225
+ // process chunks, 4 float
226
+ // but 8 floats per loop
227
+
228
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
229
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
230
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
231
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
232
+
233
+ const __m256 storeValue = elementaryBlock2x4b(
234
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
235
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
236
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
237
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset);
238
+
239
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
240
+
241
+ // next
242
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
243
+ pqFineCentroids0, code0, outputStore);
244
+
245
+ // clang-format on
246
+ }
247
+
248
+ // process 1 sample
249
+ static void accum(
250
+ const float* const __restrict pqFineCentroids0,
251
+ const uint8_t* const __restrict code0,
252
+ const float weight0,
253
+ float* const __restrict outputAccum) {
254
+ // fine quantizer
255
+ const uint8_t* const __restrict fine0 = code0;
256
+
257
+ // clang-format off
258
+
259
+ // process chunks, 4 float
260
+ // but 8 floats per loop
261
+
262
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
263
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
264
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
265
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
266
+
267
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
268
+
269
+ existingValue = elementaryBlock2x4bAccum(
270
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
271
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
272
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
273
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
274
+ weight0,
275
+ existingValue);
276
+
277
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
278
+
279
+ // next
280
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
281
+ pqFineCentroids0, code0, weight0, outputAccum);
282
+
283
+ // clang-format on
284
+ }
285
+
286
+ // Process 2 samples.
287
+ // Each code uses its own fine pq centroids table.
288
+ static void accum(
289
+ const float* const __restrict pqFineCentroids0,
290
+ const uint8_t* const __restrict code0,
291
+ const float weight0,
292
+ const float* const __restrict pqFineCentroids1,
293
+ const uint8_t* const __restrict code1,
294
+ const float weight1,
295
+ float* const __restrict outputAccum) {
296
+ // fine quantizer
297
+ const uint8_t* const __restrict fine0 = code0;
298
+ const uint8_t* const __restrict fine1 = code1;
299
+
300
+ // clang-format off
301
+
302
+ // process chunks, 4 float
303
+ // but 8 floats per loop
304
+
305
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
306
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
307
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
308
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
309
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
310
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
311
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
312
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
313
+
314
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
315
+
316
+ existingValue = elementaryBlock2x4bAccum(
317
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
318
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
319
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
320
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
321
+ weight0,
322
+ existingValue);
323
+
324
+ existingValue = elementaryBlock2x4bAccum(
325
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
326
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
327
+ pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
328
+ pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
329
+ weight1,
330
+ existingValue);
331
+
332
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
333
+
334
+ // next
335
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
336
+ pqFineCentroids0, code0, weight0,
337
+ pqFineCentroids1, code1, weight1,
338
+ outputAccum);
339
+
340
+ // clang-format on
341
+ }
342
+
343
+ // Process 2 samples.
344
+ // Fine pq centroids table is shared among codes.
345
+ static void accum(
346
+ const float* const __restrict pqFineCentroids,
347
+ const uint8_t* const __restrict code0,
348
+ const float weight0,
349
+ const uint8_t* const __restrict code1,
350
+ const float weight1,
351
+ float* const __restrict outputAccum) {
352
+ // fine quantizer
353
+ const uint8_t* const __restrict fine0 = code0;
354
+ const uint8_t* const __restrict fine1 = code1;
355
+
356
+ // clang-format off
357
+
358
+ // process chunks, 4 float
359
+ // but 8 floats per loop
360
+
361
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
362
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
363
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
364
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
365
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
366
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
367
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
368
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
369
+
370
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
371
+
372
+ existingValue = elementaryBlock2x4bAccum(
373
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
374
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
375
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
376
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
377
+ weight0,
378
+ existingValue);
379
+
380
+ existingValue = elementaryBlock2x4bAccum(
381
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
382
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
383
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
384
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
385
+ weight1,
386
+ existingValue);
387
+
388
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
389
+
390
+ // next
391
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
392
+ pqFineCentroids,
393
+ code0, weight0,
394
+ code1, weight1,
395
+ outputAccum);
396
+
397
+ // clang-format on
398
+ }
399
+
400
+ // Process 3 samples.
401
+ // Each code uses its own fine pq centroids table.
402
+ static void accum(
403
+ const float* const __restrict pqFineCentroids0,
404
+ const uint8_t* const __restrict code0,
405
+ const float weight0,
406
+ const float* const __restrict pqFineCentroids1,
407
+ const uint8_t* const __restrict code1,
408
+ const float weight1,
409
+ const float* const __restrict pqFineCentroids2,
410
+ const uint8_t* const __restrict code2,
411
+ const float weight2,
412
+ float* const __restrict outputAccum) {
413
+ // fine quantizer
414
+ const uint8_t* const __restrict fine0 = code0;
415
+ const uint8_t* const __restrict fine1 = code1;
416
+ const uint8_t* const __restrict fine2 = code2;
417
+
418
+ // clang-format off
419
+
420
+ // process chunks, 4 float
421
+ // but 8 floats per loop
422
+
423
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
424
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
425
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
426
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
427
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
428
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
429
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
430
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
431
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
432
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
433
+ const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
434
+ const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
435
+
436
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
437
+
438
+ existingValue = elementaryBlock2x4bAccum(
439
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
440
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
441
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
442
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
443
+ weight0,
444
+ existingValue);
445
+
446
+ existingValue = elementaryBlock2x4bAccum(
447
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
448
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
449
+ pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
450
+ pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
451
+ weight1,
452
+ existingValue);
453
+
454
+ existingValue = elementaryBlock2x4bAccum(
455
+ pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
456
+ pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
457
+ pqFineCentroids2 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
458
+ pqFineCentroids2 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
459
+ weight2,
460
+ existingValue);
461
+
462
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
463
+
464
+ // next
465
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
466
+ pqFineCentroids0, code0, weight0,
467
+ pqFineCentroids1, code1, weight1,
468
+ pqFineCentroids2, code2, weight2,
469
+ outputAccum);
470
+
471
+ // clang-format on
472
+ }
473
+
474
+ // Process 3 samples.
475
+ // Fine pq centroids table is shared among codes.
476
+ static void accum(
477
+ const float* const __restrict pqFineCentroids,
478
+ const uint8_t* const __restrict code0,
479
+ const float weight0,
480
+ const uint8_t* const __restrict code1,
481
+ const float weight1,
482
+ const uint8_t* const __restrict code2,
483
+ const float weight2,
484
+ float* const __restrict outputAccum) {
485
+ // fine quantizer
486
+ const uint8_t* const __restrict fine0 = code0;
487
+ const uint8_t* const __restrict fine1 = code1;
488
+ const uint8_t* const __restrict fine2 = code2;
489
+
490
+ // clang-format off
491
+
492
+ // process chunks, 4 float
493
+ // but 8 floats per loop
494
+
495
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
496
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
497
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
498
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
499
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
500
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
501
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
502
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
503
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
504
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
505
+ const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
506
+ const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
507
+
508
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
509
+
510
+ existingValue = elementaryBlock2x4bAccum(
511
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
512
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
513
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
514
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
515
+ weight0,
516
+ existingValue);
517
+
518
+ existingValue = elementaryBlock2x4bAccum(
519
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
520
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
521
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
522
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
523
+ weight1,
524
+ existingValue);
525
+
526
+ existingValue = elementaryBlock2x4bAccum(
527
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
528
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
529
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
530
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
531
+ weight2,
532
+ existingValue);
533
+
534
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
535
+
536
+ // next
537
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
538
+ pqFineCentroids,
539
+ code0, weight0,
540
+ code1, weight1,
541
+ code2, weight2,
542
+ outputAccum);
543
+
544
+ // clang-format on
545
+ }
546
+ };
547
+
548
+ template <
549
+ intptr_t DIM,
550
+ intptr_t FINE_BITS,
551
+ intptr_t CPOS,
552
+ bool QPOS_LEFT_GE_8,
553
+ bool QPOS_LEFT_GE_4>
554
+ struct IndexPQDecoderImpl<
555
+ DIM,
556
+ 4,
557
+ FINE_BITS,
558
+ CPOS,
559
+ false,
560
+ true,
561
+ QPOS_LEFT_GE_8,
562
+ QPOS_LEFT_GE_4,
563
+ false> {
564
+ static constexpr intptr_t FINE_SIZE = 4;
565
+
566
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
567
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
568
+
569
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
570
+
571
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
572
+
573
+ // process 1 sample
574
+ static void store(
575
+ const float* const __restrict pqFineCentroids0,
576
+ const uint8_t* const __restrict code0,
577
+ float* const __restrict outputStore) {
578
+ // fine quantizer
579
+ const uint8_t* const __restrict fine0 = code0;
580
+
581
+ // clang-format off
582
+
583
+ // process chunks, 4 float
584
+ // but 8 floats per loop
585
+
586
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
587
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
588
+
589
+ const __m256 storeValue = elementaryBlock4x2b(
590
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
591
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset);
592
+
593
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
594
+
595
+ // next
596
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
597
+ pqFineCentroids0, code0, outputStore);
598
+
599
+ // clang-format on
600
+ }
601
+
602
+ // process 1 sample
603
+ static void accum(
604
+ const float* const __restrict pqFineCentroids0,
605
+ const uint8_t* const __restrict code0,
606
+ const float weight0,
607
+ float* const __restrict outputAccum) {
608
+ // fine quantizer
609
+ const uint8_t* const __restrict fine0 = code0;
610
+
611
+ // clang-format off
612
+
613
+ // process chunks, 4 float
614
+ // but 8 floats per loop
615
+
616
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
617
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
618
+
619
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
620
+
621
+ existingValue = elementaryBlock4x2bAccum(
622
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
623
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
624
+ weight0,
625
+ existingValue);
626
+
627
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
628
+
629
+ // next
630
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
631
+ pqFineCentroids0, code0, weight0, outputAccum);
632
+
633
+ // clang-format on
634
+ }
635
+
636
+ // Process 2 samples.
637
+ // Each code uses its own fine pq centroids table.
638
+ static void accum(
639
+ const float* const __restrict pqFineCentroids0,
640
+ const uint8_t* const __restrict code0,
641
+ const float weight0,
642
+ const float* const __restrict pqFineCentroids1,
643
+ const uint8_t* const __restrict code1,
644
+ const float weight1,
645
+ float* const __restrict outputAccum) {
646
+ // fine quantizer
647
+ const uint8_t* const __restrict fine0 = code0;
648
+ const uint8_t* const __restrict fine1 = code1;
649
+
650
+ // clang-format off
651
+
652
+ // process chunks, 4 float
653
+ // but 8 floats per loop
654
+
655
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
656
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
657
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
658
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
659
+
660
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
661
+
662
+ existingValue = elementaryBlock4x2bAccum(
663
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
664
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
665
+ weight0,
666
+ existingValue);
667
+
668
+ existingValue = elementaryBlock4x2bAccum(
669
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
670
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
671
+ weight1,
672
+ existingValue);
673
+
674
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
675
+
676
+ // next
677
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
678
+ pqFineCentroids0, code0, weight0,
679
+ pqFineCentroids1, code1, weight1,
680
+ outputAccum);
681
+
682
+ // clang-format on
683
+ }
684
+
685
+ // Process 2 samples.
686
+ // Fine pq centroids table is shared among codes.
687
+ static void accum(
688
+ const float* const __restrict pqFineCentroids,
689
+ const uint8_t* const __restrict code0,
690
+ const float weight0,
691
+ const uint8_t* const __restrict code1,
692
+ const float weight1,
693
+ float* const __restrict outputAccum) {
694
+ // fine quantizer
695
+ const uint8_t* const __restrict fine0 = code0;
696
+ const uint8_t* const __restrict fine1 = code1;
697
+
698
+ // clang-format off
699
+
700
+ // process chunks, 4 float
701
+ // but 8 floats per loop
702
+
703
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
704
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
705
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
706
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
707
+
708
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
709
+
710
+ existingValue = elementaryBlock4x2bAccum(
711
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
712
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
713
+ weight0,
714
+ existingValue);
715
+
716
+ existingValue = elementaryBlock4x2bAccum(
717
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
718
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
719
+ weight1,
720
+ existingValue);
721
+
722
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
723
+
724
+ // next
725
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
726
+ pqFineCentroids,
727
+ code0, weight0,
728
+ code1, weight1,
729
+ outputAccum);
730
+
731
+ // clang-format on
732
+ }
733
+
734
+ // Process 3 samples.
735
+ // Each code uses its own fine pq centroids table.
736
+ static void accum(
737
+ const float* const __restrict pqFineCentroids0,
738
+ const uint8_t* const __restrict code0,
739
+ const float weight0,
740
+ const float* const __restrict pqFineCentroids1,
741
+ const uint8_t* const __restrict code1,
742
+ const float weight1,
743
+ const float* const __restrict pqFineCentroids2,
744
+ const uint8_t* const __restrict code2,
745
+ const float weight2,
746
+ float* const __restrict outputAccum) {
747
+ // fine quantizer
748
+ const uint8_t* const __restrict fine0 = code0;
749
+ const uint8_t* const __restrict fine1 = code1;
750
+ const uint8_t* const __restrict fine2 = code2;
751
+
752
+ // clang-format off
753
+
754
+ // process chunks, 4 float
755
+ // but 8 floats per loop
756
+
757
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
758
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
759
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
760
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
761
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
762
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
763
+
764
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
765
+
766
+ existingValue = elementaryBlock4x2bAccum(
767
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
768
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
769
+ weight0,
770
+ existingValue);
771
+
772
+ existingValue = elementaryBlock4x2bAccum(
773
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
774
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
775
+ weight1,
776
+ existingValue);
777
+
778
+ existingValue = elementaryBlock4x2bAccum(
779
+ pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
780
+ pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
781
+ weight2,
782
+ existingValue);
783
+
784
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
785
+
786
+ // next
787
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
788
+ pqFineCentroids0, code0, weight0,
789
+ pqFineCentroids1, code1, weight1,
790
+ pqFineCentroids2, code2, weight2,
791
+ outputAccum);
792
+
793
+ // clang-format on
794
+ }
795
+
796
+ // Process 3 samples.
797
+ // Fine pq centroids table is shared among codes.
798
+ static void accum(
799
+ const float* const __restrict pqFineCentroids,
800
+ const uint8_t* const __restrict code0,
801
+ const float weight0,
802
+ const uint8_t* const __restrict code1,
803
+ const float weight1,
804
+ const uint8_t* const __restrict code2,
805
+ const float weight2,
806
+ float* const __restrict outputAccum) {
807
+ // fine quantizer
808
+ const uint8_t* const __restrict fine0 = code0;
809
+ const uint8_t* const __restrict fine1 = code1;
810
+ const uint8_t* const __restrict fine2 = code2;
811
+
812
+ // clang-format off
813
+
814
+ // process chunks, 4 float
815
+ // but 8 floats per loop
816
+
817
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
818
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
819
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
820
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
821
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
822
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
823
+
824
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
825
+
826
+ existingValue = elementaryBlock4x2bAccum(
827
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
828
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
829
+ weight0,
830
+ existingValue);
831
+
832
+ existingValue = elementaryBlock4x2bAccum(
833
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
834
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
835
+ weight1,
836
+ existingValue);
837
+
838
+ existingValue = elementaryBlock4x2bAccum(
839
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
840
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
841
+ weight2,
842
+ existingValue);
843
+
844
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
845
+
846
+ // next
847
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
848
+ pqFineCentroids,
849
+ code0, weight0,
850
+ code1, weight1,
851
+ code2, weight2,
852
+ outputAccum);
853
+
854
+ // clang-format on
855
+ }
856
+ };
857
+
858
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
859
+ struct IndexPQDecoderImpl<
860
+ DIM,
861
+ FINE_SIZE,
862
+ FINE_BITS,
863
+ CPOS,
864
+ false,
865
+ false,
866
+ true,
867
+ true,
868
+ false> {
869
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
870
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
871
+
872
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
873
+
874
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
875
+
876
+ // process 1 sample
877
+ static void store(
878
+ const float* const __restrict pqFineCentroids0,
879
+ const uint8_t* const __restrict code0,
880
+ float* const __restrict outputStore) {
881
+ // fine quantizer
882
+ const uint8_t* const __restrict fine0 = code0;
883
+
884
+ // clang-format off
885
+
886
+ // process chunks, 8 float
887
+
888
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
889
+
890
+ const __m256 storeValue = elementaryBlock8x1b(
891
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
892
+
893
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
894
+
895
+ // next
896
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
897
+ pqFineCentroids0, code0, outputStore);
898
+
899
+ // clang-format on
900
+ }
901
+
902
+ // process 1 sample
903
+ static void accum(
904
+ const float* const __restrict pqFineCentroids0,
905
+ const uint8_t* const __restrict code0,
906
+ const float weight0,
907
+ float* const __restrict outputAccum) {
908
+ // fine quantizer
909
+ const uint8_t* const __restrict fine0 = code0;
910
+
911
+ // clang-format off
912
+
913
+ // process chunks, 8 float
914
+
915
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
916
+
917
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
918
+
919
+ existingValue = elementaryBlock8x1bAccum(
920
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
921
+ weight0,
922
+ existingValue);
923
+
924
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
925
+
926
+ // next
927
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
928
+ pqFineCentroids0, code0, weight0, outputAccum);
929
+
930
+ // clang-format on
931
+ }
932
+
933
+ // Process 2 samples.
934
+ // Each code uses its own fine pq centroids table.
935
+ static void accum(
936
+ const float* const __restrict pqFineCentroids0,
937
+ const uint8_t* const __restrict code0,
938
+ const float weight0,
939
+ const float* const __restrict pqFineCentroids1,
940
+ const uint8_t* const __restrict code1,
941
+ const float weight1,
942
+ float* const __restrict outputAccum) {
943
+ // fine quantizer
944
+ const uint8_t* const __restrict fine0 = code0;
945
+ const uint8_t* const __restrict fine1 = code1;
946
+
947
+ // clang-format off
948
+
949
+ // process chunks, 8 float
950
+
951
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
952
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
953
+
954
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
955
+
956
+ existingValue = elementaryBlock8x1bAccum(
957
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
958
+ weight0,
959
+ existingValue);
960
+
961
+ existingValue = elementaryBlock8x1bAccum(
962
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
963
+ weight1,
964
+ existingValue);
965
+
966
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
967
+
968
+ // next
969
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
970
+ pqFineCentroids0, code0, weight0,
971
+ pqFineCentroids1, code1, weight1,
972
+ outputAccum);
973
+
974
+ // clang-format on
975
+ }
976
+
977
+ // Process 2 samples.
978
+ // Fine pq centroids table is shared among codes.
979
+ static void accum(
980
+ const float* const __restrict pqFineCentroids,
981
+ const uint8_t* const __restrict code0,
982
+ const float weight0,
983
+ const uint8_t* const __restrict code1,
984
+ const float weight1,
985
+ float* const __restrict outputAccum) {
986
+ // fine quantizer
987
+ const uint8_t* const __restrict fine0 = code0;
988
+ const uint8_t* const __restrict fine1 = code1;
989
+
990
+ // clang-format off
991
+
992
+ // process chunks, 8 float
993
+
994
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
995
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
996
+
997
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
998
+
999
+ existingValue = elementaryBlock8x1bAccum(
1000
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1001
+ weight0,
1002
+ existingValue);
1003
+
1004
+ existingValue = elementaryBlock8x1bAccum(
1005
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1006
+ weight1,
1007
+ existingValue);
1008
+
1009
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1010
+
1011
+ // next
1012
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
1013
+ pqFineCentroids,
1014
+ code0, weight0,
1015
+ code1, weight1,
1016
+ outputAccum);
1017
+
1018
+ // clang-format on
1019
+ }
1020
+
1021
+ // Process 3 samples.
1022
+ // Each code uses its own fine pq centroids table.
1023
+ static void accum(
1024
+ const float* const __restrict pqFineCentroids0,
1025
+ const uint8_t* const __restrict code0,
1026
+ const float weight0,
1027
+ const float* const __restrict pqFineCentroids1,
1028
+ const uint8_t* const __restrict code1,
1029
+ const float weight1,
1030
+ const float* const __restrict pqFineCentroids2,
1031
+ const uint8_t* const __restrict code2,
1032
+ const float weight2,
1033
+ float* const __restrict outputAccum) {
1034
+ // fine quantizer
1035
+ const uint8_t* const __restrict fine0 = code0;
1036
+ const uint8_t* const __restrict fine1 = code1;
1037
+ const uint8_t* const __restrict fine2 = code2;
1038
+
1039
+ // clang-format off
1040
+
1041
+ // process chunks, 8 float
1042
+
1043
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1044
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1045
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1046
+
1047
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1048
+
1049
+ existingValue = elementaryBlock8x1bAccum(
1050
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1051
+ weight0,
1052
+ existingValue);
1053
+
1054
+ existingValue = elementaryBlock8x1bAccum(
1055
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1056
+ weight1,
1057
+ existingValue);
1058
+
1059
+ existingValue = elementaryBlock8x1bAccum(
1060
+ pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1061
+ weight2,
1062
+ existingValue);
1063
+
1064
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1065
+
1066
+ // next
1067
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
1068
+ pqFineCentroids0, code0, weight0,
1069
+ pqFineCentroids1, code1, weight1,
1070
+ pqFineCentroids2, code2, weight2,
1071
+ outputAccum);
1072
+
1073
+ // clang-format on
1074
+ }
1075
+
1076
+ // Process 3 samples.
1077
+ // Fine pq centroids table is shared among codes.
1078
+ static void accum(
1079
+ const float* const __restrict pqFineCentroids,
1080
+ const uint8_t* const __restrict code0,
1081
+ const float weight0,
1082
+ const uint8_t* const __restrict code1,
1083
+ const float weight1,
1084
+ const uint8_t* const __restrict code2,
1085
+ const float weight2,
1086
+ float* const __restrict outputAccum) {
1087
+ // fine quantizer
1088
+ const uint8_t* const __restrict fine0 = code0;
1089
+ const uint8_t* const __restrict fine1 = code1;
1090
+ const uint8_t* const __restrict fine2 = code2;
1091
+
1092
+ // clang-format off
1093
+
1094
+ // process chunks, 8 float
1095
+
1096
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1097
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1098
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1099
+
1100
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1101
+
1102
+ existingValue = elementaryBlock8x1bAccum(
1103
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1104
+ weight0,
1105
+ existingValue);
1106
+
1107
+ existingValue = elementaryBlock8x1bAccum(
1108
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1109
+ weight1,
1110
+ existingValue);
1111
+
1112
+ existingValue = elementaryBlock8x1bAccum(
1113
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1114
+ weight2,
1115
+ existingValue);
1116
+
1117
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1118
+
1119
+ // next
1120
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
1121
+ pqFineCentroids,
1122
+ code0, weight0,
1123
+ code1, weight1,
1124
+ code2, weight2,
1125
+ outputAccum);
1126
+
1127
+ // clang-format on
1128
+ }
1129
+ };
1130
+
1131
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
1132
+ struct IndexPQDecoderImpl<
1133
+ DIM,
1134
+ FINE_SIZE,
1135
+ FINE_BITS,
1136
+ CPOS,
1137
+ false,
1138
+ false,
1139
+ false,
1140
+ true,
1141
+ false> {
1142
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
1143
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
1144
+
1145
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
1146
+
1147
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
1148
+
1149
+ // process 1 sample
1150
+ static void store(
1151
+ const float* const __restrict pqFineCentroids0,
1152
+ const uint8_t* const __restrict code0,
1153
+ float* const __restrict outputStore) {
1154
+ // fine quantizer
1155
+ const uint8_t* const __restrict fine0 = code0;
1156
+
1157
+ // clang-format off
1158
+
1159
+ // process chunks, 4 float
1160
+
1161
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1162
+
1163
+ const __m128 storeValue = elementaryBlock4x1b(
1164
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
1165
+
1166
+ _mm_storeu_ps(outputStore + CPOS, storeValue);
1167
+
1168
+ // next
1169
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::store(
1170
+ pqFineCentroids0, code0, outputStore);
1171
+
1172
+ // clang-format on
1173
+ }
1174
+
1175
+ // process 1 sample
1176
+ static void accum(
1177
+ const float* const __restrict pqFineCentroids0,
1178
+ const uint8_t* const __restrict code0,
1179
+ const float weight0,
1180
+ float* const __restrict outputAccum) {
1181
+ // fine quantizer
1182
+ const uint8_t* const __restrict fine0 = code0;
1183
+
1184
+ // clang-format off
1185
+
1186
+ // process chunks, 4 float
1187
+
1188
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1189
+
1190
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1191
+
1192
+ existingValue = elementaryBlock4x1bAccum(
1193
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1194
+ weight0,
1195
+ existingValue);
1196
+
1197
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1198
+
1199
+ // next
1200
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1201
+ pqFineCentroids0, code0, weight0, outputAccum);
1202
+
1203
+ // clang-format on
1204
+ }
1205
+
1206
+ // Process 2 samples.
1207
+ // Each code uses its own fine pq centroids table.
1208
+ static void accum(
1209
+ const float* const __restrict pqFineCentroids0,
1210
+ const uint8_t* const __restrict code0,
1211
+ const float weight0,
1212
+ const float* const __restrict pqFineCentroids1,
1213
+ const uint8_t* const __restrict code1,
1214
+ const float weight1,
1215
+ float* const __restrict outputAccum) {
1216
+ // fine quantizer
1217
+ const uint8_t* const __restrict fine0 = code0;
1218
+ const uint8_t* const __restrict fine1 = code1;
1219
+
1220
+ // clang-format off
1221
+
1222
+ // process chunks, 4 float
1223
+
1224
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1225
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1226
+
1227
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1228
+
1229
+ existingValue = elementaryBlock4x1bAccum(
1230
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1231
+ weight0,
1232
+ existingValue);
1233
+
1234
+ existingValue = elementaryBlock4x1bAccum(
1235
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1236
+ weight1,
1237
+ existingValue);
1238
+
1239
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1240
+
1241
+ // next
1242
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1243
+ pqFineCentroids0, code0, weight0,
1244
+ pqFineCentroids1, code1, weight1,
1245
+ outputAccum);
1246
+
1247
+ // clang-format on
1248
+ }
1249
+
1250
+ // Process 2 samples.
1251
+ // Fine pq centroids table is shared among codes.
1252
+ static void accum(
1253
+ const float* const __restrict pqFineCentroids,
1254
+ const uint8_t* const __restrict code0,
1255
+ const float weight0,
1256
+ const uint8_t* const __restrict code1,
1257
+ const float weight1,
1258
+ float* const __restrict outputAccum) {
1259
+ // fine quantizer
1260
+ const uint8_t* const __restrict fine0 = code0;
1261
+ const uint8_t* const __restrict fine1 = code1;
1262
+
1263
+ // clang-format off
1264
+
1265
+ // process chunks, 4 float
1266
+
1267
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1268
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1269
+
1270
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1271
+
1272
+ existingValue = elementaryBlock4x1bAccum(
1273
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1274
+ weight0,
1275
+ existingValue);
1276
+
1277
+ existingValue = elementaryBlock4x1bAccum(
1278
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1279
+ weight1,
1280
+ existingValue);
1281
+
1282
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1283
+
1284
+ // next
1285
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1286
+ pqFineCentroids,
1287
+ code0, weight0,
1288
+ code1, weight1,
1289
+ outputAccum);
1290
+
1291
+ // clang-format on
1292
+ }
1293
+
1294
+ // Process 3 samples. Updates the 4 floats at outputAccum[CPOS .. CPOS + 3], then forwards the same arguments to the CPOS + 4 instantiation.
1295
+ // Each code uses its own fine pq centroids table (pqFineCentroids0 / 1 / 2 pair with code0 / 1 / 2 and weight0 / 1 / 2).
1296
+ static void accum(
1297
+ const float* const __restrict pqFineCentroids0,
1298
+ const uint8_t* const __restrict code0,
1299
+ const float weight0,
1300
+ const float* const __restrict pqFineCentroids1,
1301
+ const uint8_t* const __restrict code1,
1302
+ const float weight1,
1303
+ const float* const __restrict pqFineCentroids2,
1304
+ const uint8_t* const __restrict code2,
1305
+ const float weight2,
1306
+ float* const __restrict outputAccum) {
1307
+ // fine quantizer: fineN is simply codeN, i.e. the fine codes are read from the start of each code buffer
1308
+ const uint8_t* const __restrict fine0 = code0;
1309
+ const uint8_t* const __restrict fine1 = code1;
1310
+ const uint8_t* const __restrict fine2 = code2;
1311
+
1312
+ // clang-format off
1313
+
1314
+ // process chunks, 4 float: this instantiation covers output positions CPOS .. CPOS + 3
1315
+ // read one fine code per sample through detail::UintReader (fineCentroidIdx is a constant defined outside this view)
1316
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1317
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1318
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1319
+ // load the 4 floats currently stored at outputAccum + CPOS (unaligned load)
1320
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1321
+ // fold in the samples in the order 0, 1, 2. NOTE(review): elementaryBlock4x1bAccum is defined outside this view; presumably it returns existingValue + weight * (4 floats at the given pointer) - confirm
1322
+ existingValue = elementaryBlock4x1bAccum(
1323
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1324
+ weight0,
1325
+ existingValue);
1326
+
1327
+ existingValue = elementaryBlock4x1bAccum(
1328
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1329
+ weight1,
1330
+ existingValue);
1331
+
1332
+ existingValue = elementaryBlock4x1bAccum(
1333
+ pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1334
+ weight2,
1335
+ existingValue);
1336
+ // write the updated 4 floats back to the same position
1337
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1338
+
1339
+ // next: hand over to the instantiation for CPOS + 4; the specialization whose position argument equals DIM has empty bodies
1340
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1341
+ pqFineCentroids0, code0, weight0,
1342
+ pqFineCentroids1, code1, weight1,
1343
+ pqFineCentroids2, code2, weight2,
1344
+ outputAccum);
1345
+
1346
+ // clang-format on
1347
+ }
1348
+
1349
+ // Process 3 samples. Updates the 4 floats at outputAccum[CPOS .. CPOS + 3], then forwards the same arguments to the CPOS + 4 instantiation.
1350
+ // Fine pq centroids table is shared among codes: all three lookups index into the single pqFineCentroids pointer.
1351
+ static void accum(
1352
+ const float* const __restrict pqFineCentroids,
1353
+ const uint8_t* const __restrict code0,
1354
+ const float weight0,
1355
+ const uint8_t* const __restrict code1,
1356
+ const float weight1,
1357
+ const uint8_t* const __restrict code2,
1358
+ const float weight2,
1359
+ float* const __restrict outputAccum) {
1360
+ // fine quantizer: fineN is simply codeN, i.e. the fine codes are read from the start of each code buffer
1361
+ const uint8_t* const __restrict fine0 = code0;
1362
+ const uint8_t* const __restrict fine1 = code1;
1363
+ const uint8_t* const __restrict fine2 = code2;
1364
+
1365
+ // clang-format off
1366
+
1367
+ // process chunks, 4 float: this instantiation covers output positions CPOS .. CPOS + 3
1368
+ // read one fine code per sample through detail::UintReader (fineCentroidIdx is a constant defined outside this view)
1369
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1370
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1371
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1372
+ // load the 4 floats currently stored at outputAccum + CPOS (unaligned load)
1373
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1374
+ // fold in the samples in the order 0, 1, 2. NOTE(review): elementaryBlock4x1bAccum is defined outside this view; presumably it returns existingValue + weight * (4 floats at the given pointer) - confirm
1375
+ existingValue = elementaryBlock4x1bAccum(
1376
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1377
+ weight0,
1378
+ existingValue);
1379
+
1380
+ existingValue = elementaryBlock4x1bAccum(
1381
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1382
+ weight1,
1383
+ existingValue);
1384
+
1385
+ existingValue = elementaryBlock4x1bAccum(
1386
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1387
+ weight2,
1388
+ existingValue);
1389
+ // write the updated 4 floats back to the same position
1390
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1391
+
1392
+ // next: hand over to the instantiation for CPOS + 4; the specialization whose position argument equals DIM has empty bodies
1393
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1394
+ pqFineCentroids, code0, weight0,
1395
+ code1, weight1,
1396
+ code2, weight2,
1397
+ outputAccum);
1398
+
1399
+ // clang-format on
1400
+ }
1401
+ };
1402
+
1403
+ // This partial specialization is expected to do nothing. It matches when the 4th template argument equals DIM and the last flag is true; every method body is empty.
1404
+ template <
1405
+ intptr_t DIM,
1406
+ intptr_t FINE_SIZE,
1407
+ intptr_t FINE_BITS,
1408
+ bool FINE_SIZE_EQ_2,
1409
+ bool FINE_SIZE_EQ_4,
1410
+ bool QPOS_LEFT_GE_8,
1411
+ bool QPOS_LEFT_GE_4>
1412
+ struct IndexPQDecoderImpl<
1413
+ DIM,
1414
+ FINE_SIZE,
1415
+ FINE_BITS,
1416
+ DIM,
1417
+ FINE_SIZE_EQ_2,
1418
+ FINE_SIZE_EQ_4,
1419
+ QPOS_LEFT_GE_8,
1420
+ QPOS_LEFT_GE_4,
1421
+ true> {
1422
+ // clang-format off
1423
+ // NOTE(review): the non-empty specializations call <..., CPOS + 4>::store/accum, so this presumably ends that chain once the position reaches DIM; the primary template is outside this view - confirm
1424
+ // process 1 sample (store variant): intentionally empty
1425
+ static void store(
1426
+ const float* const __restrict pqFineCentroids0,
1427
+ const uint8_t* const __restrict code0,
1428
+ float* const __restrict outputStore) {}
1429
+
1430
+ // process 1 sample (accum variant): intentionally empty
1431
+ static void accum(
1432
+ const float* const __restrict pqFineCentroids0,
1433
+ const uint8_t* const __restrict code0,
1434
+ const float weight0,
1435
+ float* const __restrict outputAccum) {}
1436
+
1437
+ // Process 2 samples. Intentionally empty.
1438
+ // Each code uses its own fine pq centroids table.
1439
+ static void accum(
1440
+ const float* const __restrict pqFineCentroids0,
1441
+ const uint8_t* const __restrict code0,
1442
+ const float weight0,
1443
+ const float* const __restrict pqFineCentroids1,
1444
+ const uint8_t* const __restrict code1,
1445
+ const float weight1,
1446
+ float* const __restrict outputAccum) {}
1447
+
1448
+ // Process 2 samples. Intentionally empty.
1449
+ // Fine pq centroids table is shared among codes.
1450
+ static void accum(
1451
+ const float* const __restrict pqFineCentroids,
1452
+ const uint8_t* const __restrict code0,
1453
+ const float weight0,
1454
+ const uint8_t* const __restrict code1,
1455
+ const float weight1,
1456
+ float* const __restrict outputAccum) {}
1457
+
1458
+ // Process 3 samples. Intentionally empty.
1459
+ // Each code uses its own fine pq centroids table.
1460
+ static void accum(
1461
+ const float* const __restrict pqFineCentroids0,
1462
+ const uint8_t* const __restrict code0,
1463
+ const float weight0,
1464
+ const float* const __restrict pqFineCentroids1,
1465
+ const uint8_t* const __restrict code1,
1466
+ const float weight1,
1467
+ const float* const __restrict pqFineCentroids2,
1468
+ const uint8_t* const __restrict code2,
1469
+ const float weight2,
1470
+ float* const __restrict outputAccum) {}
1471
+
1472
+ // Process 3 samples. Intentionally empty.
1473
+ // Fine pq centroids table is shared among codes.
1474
+ static void accum(
1475
+ const float* const __restrict pqFineCentroids,
1476
+ const uint8_t* const __restrict code0,
1477
+ const float weight0,
1478
+ const uint8_t* const __restrict code1,
1479
+ const float weight1,
1480
+ const uint8_t* const __restrict code2,
1481
+ const float weight2,
1482
+ float* const __restrict outputAccum) {}
1483
+
1484
+ // clang-format on
1485
+ };
1486
+
1487
+ } // namespace
1488
+
1489
+ // Suitable for PQ[1]x8
1490
+ // Suitable for PQ[1]x10
1491
+ // Suitable for PQ[1]x16 (the static_assert below restricts FINE_BITS to 8, 10 or 16; FINE_BITS defaults to 8)
1492
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS = 8>
1493
+ struct IndexPQDecoder {
1494
+ static_assert(
1495
+ FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16,
1496
+ "Only 8, 10 or 16 bits are currently supported for FINE_BITS");
1497
+ // Template arguments re-exported as compile-time constants.
1498
+ static constexpr intptr_t dim = DIM;
1499
+ static constexpr intptr_t fineSize = FINE_SIZE;
1500
+ static constexpr intptr_t fineBits = FINE_BITS;
1501
+ // Every method below only forwards its arguments, unchanged and in order, to IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>, i.e. the instantiation whose 4th (position) argument is 0.
1502
+ // Process 1 sample. NOTE(review): presumably writes decoded(code) to outputStore without accumulating; the non-empty Impl::store body is outside this view - confirm
1503
+ static void store(
1504
+ const float* const __restrict pqFineCentroids,
1505
+ const uint8_t* const __restrict code,
1506
+ float* const __restrict outputStore) {
1507
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::store(
1508
+ pqFineCentroids, code, outputStore);
1509
+ }
1510
+
1511
+ // Process 1 sample.
1512
+ // Performs outputAccum += weight * decoded(code)
1513
+ static void accum(
1514
+ const float* const __restrict pqFineCentroids,
1515
+ const uint8_t* const __restrict code,
1516
+ const float weight,
1517
+ float* const __restrict outputAccum) {
1518
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1519
+ pqFineCentroids, code, weight, outputAccum);
1520
+ }
1521
+
1522
+ // Process 2 samples.
1523
+ // Each code uses its own fine pq centroids table (pqFineCentroids0 for code0, pqFineCentroids1 for code1).
1524
+ //
1525
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1526
+ // decoded(code1)
1527
+ static void accum(
1528
+ const float* const __restrict pqFineCentroids0,
1529
+ const uint8_t* const __restrict code0,
1530
+ const float weight0,
1531
+ const float* const __restrict pqFineCentroids1,
1532
+ const uint8_t* const __restrict code1,
1533
+ const float weight1,
1534
+ float* const __restrict outputAccum) {
1535
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1536
+ pqFineCentroids0,
1537
+ code0,
1538
+ weight0,
1539
+ pqFineCentroids1,
1540
+ code1,
1541
+ weight1,
1542
+ outputAccum);
1543
+ }
1544
+
1545
+ // Process 2 samples.
1546
+ // Fine pq centroids table is shared among codes.
1547
+ //
1548
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1549
+ // decoded(code1)
1550
+ static void accum(
1551
+ const float* const __restrict pqFineCentroids,
1552
+ const uint8_t* const __restrict code0,
1553
+ const float weight0,
1554
+ const uint8_t* const __restrict code1,
1555
+ const float weight1,
1556
+ float* const __restrict outputAccum) {
1557
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1558
+ pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
1559
+ }
1560
+
1561
+ // Process 3 samples.
1562
+ // Each code uses its own fine pq centroids table (pqFineCentroidsN pairs with codeN and weightN).
1563
+ //
1564
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1565
+ // decoded(code1) + weight2 * decoded(code2)
1566
+ static void accum(
1567
+ const float* const __restrict pqFineCentroids0,
1568
+ const uint8_t* const __restrict code0,
1569
+ const float weight0,
1570
+ const float* const __restrict pqFineCentroids1,
1571
+ const uint8_t* const __restrict code1,
1572
+ const float weight1,
1573
+ const float* const __restrict pqFineCentroids2,
1574
+ const uint8_t* const __restrict code2,
1575
+ const float weight2,
1576
+ float* const __restrict outputAccum) {
1577
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1578
+ pqFineCentroids0,
1579
+ code0,
1580
+ weight0,
1581
+ pqFineCentroids1,
1582
+ code1,
1583
+ weight1,
1584
+ pqFineCentroids2,
1585
+ code2,
1586
+ weight2,
1587
+ outputAccum);
1588
+ }
1589
+
1590
+ // Process 3 samples.
1591
+ // Fine pq centroids table is shared among codes.
1592
+ //
1593
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1594
+ // decoded(code1) + weight2 * decoded(code2)
1595
+ static void accum(
1596
+ const float* const __restrict pqFineCentroids,
1597
+ const uint8_t* const __restrict code0,
1598
+ const float weight0,
1599
+ const uint8_t* const __restrict code1,
1600
+ const float weight1,
1601
+ const uint8_t* const __restrict code2,
1602
+ const float weight2,
1603
+ float* const __restrict outputAccum) {
1604
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1605
+ pqFineCentroids,
1606
+ code0,
1607
+ weight0,
1608
+ code1,
1609
+ weight1,
1610
+ code2,
1611
+ weight2,
1612
+ outputAccum);
1613
+ }
1614
+ };
1615
+
1616
+ } // namespace cppcontrib
1617
+ } // namespace faiss
1618
+ #endif // PQ_AVX2_INL_H