faiss 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +23 -21
  4. data/ext/faiss/extconf.rb +11 -0
  5. data/ext/faiss/index.cpp +4 -4
  6. data/ext/faiss/index_binary.cpp +6 -6
  7. data/ext/faiss/product_quantizer.cpp +4 -4
  8. data/lib/faiss/version.rb +1 -1
  9. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  10. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  11. data/vendor/faiss/faiss/IVFlib.h +26 -2
  12. data/vendor/faiss/faiss/Index.cpp +36 -3
  13. data/vendor/faiss/faiss/Index.h +43 -6
  14. data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
  15. data/vendor/faiss/faiss/Index2Layer.h +6 -1
  16. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
  17. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  20. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  22. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  23. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  24. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  31. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  32. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  33. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  34. data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
  35. data/vendor/faiss/faiss/IndexFlat.h +7 -4
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  39. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  40. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  41. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  42. data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
  43. data/vendor/faiss/faiss/IndexIVF.h +35 -16
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
  45. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  48. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  49. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  50. data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
  51. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
  53. data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  55. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  56. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  57. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  58. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
  59. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  60. data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
  61. data/vendor/faiss/faiss/IndexLSH.h +2 -1
  62. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  63. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  66. data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
  67. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  68. data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
  69. data/vendor/faiss/faiss/IndexPQ.h +19 -5
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  73. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  74. data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
  75. data/vendor/faiss/faiss/IndexRefine.h +4 -2
  76. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  77. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  78. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  79. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  80. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
  81. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
  82. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  83. data/vendor/faiss/faiss/IndexShards.h +2 -1
  84. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  85. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  86. data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
  87. data/vendor/faiss/faiss/VectorTransform.h +22 -4
  88. data/vendor/faiss/faiss/clone_index.cpp +23 -1
  89. data/vendor/faiss/faiss/clone_index.h +3 -0
  90. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  91. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  92. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  93. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  94. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  95. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  101. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  102. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
  103. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  104. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  105. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  106. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  107. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  108. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  109. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  110. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  111. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  113. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  118. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  119. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  124. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  125. data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
  126. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  127. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  128. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
  131. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
  134. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  138. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  144. data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
  145. data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
  146. data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
  147. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  148. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  149. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  150. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  151. data/vendor/faiss/faiss/index_factory.cpp +196 -7
  152. data/vendor/faiss/faiss/index_io.h +5 -0
  153. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  154. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  155. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  156. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  157. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  158. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  159. data/vendor/faiss/faiss/utils/distances.cpp +380 -56
  160. data/vendor/faiss/faiss/utils/distances.h +113 -15
  161. data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
  162. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  163. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  164. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  165. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  166. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  167. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  168. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  169. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  170. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  171. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  172. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  173. data/vendor/faiss/faiss/utils/random.h +5 -0
  174. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  175. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  176. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  177. metadata +37 -3
@@ -0,0 +1,1618 @@
1
+ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2
+
3
+ #ifndef PQ_AVX2_INL_H
4
+ #define PQ_AVX2_INL_H
5
+
6
+ #include <immintrin.h>
7
+
8
+ #include <cstddef>
9
+ #include <cstdint>
10
+
11
+ #include <faiss/cppcontrib/detail/UintReader.h>
12
+
13
+ namespace faiss {
14
+ namespace cppcontrib {
15
+
16
+ ////////////////////////////////////////////////////////////////////////////////////
17
+ /// IndexPQDecoder
18
+ ////////////////////////////////////////////////////////////////////////////////////
19
+
20
+ namespace {
21
+
22
+ // Although the following functions are somewhat redundant, I'd like to keep the
23
+ // overall basic blocks similar to ones from Index2LevelDecoder.
24
+ // A compiler will optimize away the redundant code.
25
+
26
+ // Processes 8 float values.
27
+ // Returns {
28
+ // [0..1] = *fine0[0..1];
29
+ // [2..3] = *fine1[0..1];
30
+ // [4..5] = *fine2[0..1];
31
+ // [6..7] = *fine3[0..1];
32
+ // }
33
+ inline __m256 elementaryBlock2x4b(
34
+ const float* const __restrict fine0,
35
+ const float* const __restrict fine1,
36
+ const float* const __restrict fine2,
37
+ const float* const __restrict fine3) {
38
+ // load fine
39
+ const __m256 fineValue = _mm256_castpd_ps(_mm256_setr_pd(
40
+ *reinterpret_cast<const double*>(fine0),
41
+ *reinterpret_cast<const double*>(fine1),
42
+ *reinterpret_cast<const double*>(fine2),
43
+ *reinterpret_cast<const double*>(fine3)));
44
+
45
+ // add coarse and fine
46
+ return fineValue;
47
+ }
48
+
49
+ // Processes 8 float values.
50
+ // Returns {
51
+ // [0..1] = existingValue[0..1] + weight * (*fine0[0..1]);
52
+ // [2..3] = existingValue[0..1] + weight * (*fine1[0..1]);
53
+ // [4..5] = existingValue[0..1] + weight * (*fine2[0..1]);
54
+ // [6..7] = existingValue[0..1] + weight * (*fine3[0..1]);
55
+ // }
56
+ inline __m256 elementaryBlock2x4bAccum(
57
+ const float* const __restrict fine0,
58
+ const float* const __restrict fine1,
59
+ const float* const __restrict fine2,
60
+ const float* const __restrict fine3,
61
+ const float weight,
62
+ const __m256 existingValue) {
63
+ // add coarse and fine
64
+ const __m256 fineValue = elementaryBlock2x4b(fine0, fine1, fine2, fine3);
65
+
66
+ // this operation is expected to be optimized by a compiler
67
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
68
+ // do fma
69
+ return _mm256_fmadd_ps(fineValue, weightAvx2, existingValue);
70
+ }
71
+
72
// Processes 4 float values.
// Returns {
//   [0..3] = *fine[0..3];
// }
inline __m128 elementaryBlock4x1b(const float* const __restrict fine) {
    // a single unaligned 128-bit load covers all four values
    return _mm_loadu_ps(fine);
}
81
+
82
+ // Processes 4 float values.
83
+ // Returns {
84
+ // [0..3] = existingValue[0..3] + weight * (*fine[0..3]);
85
+ // }
86
+ inline __m128 elementaryBlock4x1bAccum(
87
+ const float* const __restrict fine,
88
+ const float weight,
89
+ const __m128 existingValue) {
90
+ const __m128 fineValue = elementaryBlock4x1b(fine);
91
+
92
+ // this operation is expected to be optimized by a compiler
93
+ const __m128 weightAvx = _mm_set1_ps(weight);
94
+ // do fma
95
+ return _mm_fmadd_ps(fineValue, weightAvx, existingValue);
96
+ }
97
+
98
+ // Processes 8 float values.
99
+ // Returns {
100
+ // [0..3] = *fine0[0..3];
101
+ // [4..7] = *fine1[0..3];
102
+ // }
103
+ inline __m256 elementaryBlock4x2b(
104
+ const float* const __restrict fine0,
105
+ const float* const __restrict fine1) {
106
+ // load fine
107
+ const __m128 fineValue0 = _mm_loadu_ps(fine0);
108
+ const __m128 fineValue1 = _mm_loadu_ps(fine1);
109
+
110
+ // combine two 4b into a single 8b
111
+ const __m256 combinedFineValue = _mm256_set_m128(fineValue1, fineValue0);
112
+ return combinedFineValue;
113
+ }
114
+
115
+ // Processes 8 float values.
116
+ // Returns {
117
+ // [0..3] = existingValue[0..3] + weight * (*fine0[0..3]);
118
+ // [4..7] = existingValue[4..7] + weight * (*fine1[0..3]);
119
+ // }
120
+ inline __m256 elementaryBlock4x2bAccum(
121
+ const float* const __restrict fine0,
122
+ const float* const __restrict fine1,
123
+ const float weight,
124
+ const __m256 existingValue) {
125
+ const __m256 fineValue = elementaryBlock4x2b(fine0, fine1);
126
+
127
+ // this operation is expected to be optimized by a compiler
128
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
129
+ // do fma
130
+ return _mm256_fmadd_ps(fineValue, weightAvx2, existingValue);
131
+ }
132
+
133
+ // Processes 8 float values.
134
+ // Returns {
135
+ // [0..7] = *fine[0..7];
136
+ // }
137
+ inline __m256 elementaryBlock8x1b(const float* const __restrict fine) {
138
+ // load fine
139
+ const __m256 fineValue = _mm256_loadu_ps(fine);
140
+ return fineValue;
141
+ }
142
+
143
+ // Processes 8 float values.
144
+ // Returns {
145
+ // [0..7] = existingValue[0..7] + weight * (*fine[0..7]);
146
+ // }
147
+ inline __m256 elementaryBlock8x1bAccum(
148
+ const float* const __restrict fine,
149
+ const float weight,
150
+ const __m256 existingValue) {
151
+ const __m256 fineValue = elementaryBlock8x1b(fine);
152
+
153
+ // this operation is expected to be optimized by a compiler
154
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
155
+ // do fma
156
+ return _mm256_fmadd_ps(fineValue, weightAvx2, existingValue);
157
+ }
158
+
159
+ // The following code uses template-based for-loop unrolling,
160
+ // because the compiler does not do that on its own as needed.
161
+ // The idea is the following:
162
+ // template<int I, int MAX>
163
+ // struct Foo {
164
+ // static void bar() {
165
+ // doSomething(I);
166
+ // Foo<I + 1, MAX>::bar();
167
+ // }
168
+ // };
169
+ //
170
+ // template<int MAX>
171
+ // struct Foo<MAX, MAX> {
172
+ // static void bar() {}
173
+ // };
174
+ //
175
+ // Initiate the loop:
176
+ // Foo<0, MAX>::bar();
177
+
178
+ template <
179
+ intptr_t DIM,
180
+ intptr_t FINE_SIZE,
181
+ intptr_t FINE_BITS,
182
+ intptr_t CPOS,
183
+ bool FINE_SIZE_EQ_2 = FINE_SIZE == 2,
184
+ bool FINE_SIZE_EQ_4 = FINE_SIZE == 4,
185
+ bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8),
186
+ bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4),
187
+ bool DIM_EQ_CPOS = DIM == CPOS>
188
+ struct IndexPQDecoderImpl;
189
+
190
+ template <
191
+ intptr_t DIM,
192
+ intptr_t FINE_BITS,
193
+ intptr_t CPOS,
194
+ bool QPOS_LEFT_GE_8,
195
+ bool QPOS_LEFT_GE_4>
196
+ struct IndexPQDecoderImpl<
197
+ DIM,
198
+ 2,
199
+ FINE_BITS,
200
+ CPOS,
201
+ true,
202
+ false,
203
+ QPOS_LEFT_GE_8,
204
+ QPOS_LEFT_GE_4,
205
+ false> {
206
+ static constexpr intptr_t FINE_SIZE = 2;
207
+
208
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
209
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
210
+
211
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
212
+
213
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
214
+
215
+ // process 1 sample
216
+ static void store(
217
+ const float* const __restrict pqFineCentroids0,
218
+ const uint8_t* const __restrict code0,
219
+ float* const __restrict outputStore) {
220
+ // fine quantizer
221
+ const uint8_t* const __restrict fine0 = code0;
222
+
223
+ // clang-format off
224
+
225
+ // process chunks, 4 float
226
+ // but 8 floats per loop
227
+
228
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
229
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
230
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
231
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
232
+
233
+ const __m256 storeValue = elementaryBlock2x4b(
234
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
235
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
236
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
237
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset);
238
+
239
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
240
+
241
+ // next
242
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
243
+ pqFineCentroids0, code0, outputStore);
244
+
245
+ // clang-format on
246
+ }
247
+
248
+ // process 1 sample
249
+ static void accum(
250
+ const float* const __restrict pqFineCentroids0,
251
+ const uint8_t* const __restrict code0,
252
+ const float weight0,
253
+ float* const __restrict outputAccum) {
254
+ // fine quantizer
255
+ const uint8_t* const __restrict fine0 = code0;
256
+
257
+ // clang-format off
258
+
259
+ // process chunks, 4 float
260
+ // but 8 floats per loop
261
+
262
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
263
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
264
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
265
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
266
+
267
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
268
+
269
+ existingValue = elementaryBlock2x4bAccum(
270
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
271
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
272
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
273
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
274
+ weight0,
275
+ existingValue);
276
+
277
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
278
+
279
+ // next
280
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
281
+ pqFineCentroids0, code0, weight0, outputAccum);
282
+
283
+ // clang-format on
284
+ }
285
+
286
+ // Process 2 samples.
287
+ // Each code uses its own fine pq centroids table.
288
+ static void accum(
289
+ const float* const __restrict pqFineCentroids0,
290
+ const uint8_t* const __restrict code0,
291
+ const float weight0,
292
+ const float* const __restrict pqFineCentroids1,
293
+ const uint8_t* const __restrict code1,
294
+ const float weight1,
295
+ float* const __restrict outputAccum) {
296
+ // fine quantizer
297
+ const uint8_t* const __restrict fine0 = code0;
298
+ const uint8_t* const __restrict fine1 = code1;
299
+
300
+ // clang-format off
301
+
302
+ // process chunks, 4 float
303
+ // but 8 floats per loop
304
+
305
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
306
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
307
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
308
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
309
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
310
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
311
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
312
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
313
+
314
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
315
+
316
+ existingValue = elementaryBlock2x4bAccum(
317
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
318
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
319
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
320
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
321
+ weight0,
322
+ existingValue);
323
+
324
+ existingValue = elementaryBlock2x4bAccum(
325
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
326
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
327
+ pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
328
+ pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
329
+ weight1,
330
+ existingValue);
331
+
332
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
333
+
334
+ // next
335
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
336
+ pqFineCentroids0, code0, weight0,
337
+ pqFineCentroids1, code1, weight1,
338
+ outputAccum);
339
+
340
+ // clang-format on
341
+ }
342
+
343
+ // Process 2 samples.
344
+ // Fine pq centroids table is shared among codes.
345
+ static void accum(
346
+ const float* const __restrict pqFineCentroids,
347
+ const uint8_t* const __restrict code0,
348
+ const float weight0,
349
+ const uint8_t* const __restrict code1,
350
+ const float weight1,
351
+ float* const __restrict outputAccum) {
352
+ // fine quantizer
353
+ const uint8_t* const __restrict fine0 = code0;
354
+ const uint8_t* const __restrict fine1 = code1;
355
+
356
+ // clang-format off
357
+
358
+ // process chunks, 4 float
359
+ // but 8 floats per loop
360
+
361
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
362
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
363
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
364
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
365
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
366
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
367
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
368
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
369
+
370
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
371
+
372
+ existingValue = elementaryBlock2x4bAccum(
373
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
374
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
375
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
376
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
377
+ weight0,
378
+ existingValue);
379
+
380
+ existingValue = elementaryBlock2x4bAccum(
381
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
382
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
383
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
384
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
385
+ weight1,
386
+ existingValue);
387
+
388
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
389
+
390
+ // next
391
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
392
+ pqFineCentroids,
393
+ code0, weight0,
394
+ code1, weight1,
395
+ outputAccum);
396
+
397
+ // clang-format on
398
+ }
399
+
400
+ // Process 3 samples.
401
+ // Each code uses its own fine pq centroids table.
402
+ static void accum(
403
+ const float* const __restrict pqFineCentroids0,
404
+ const uint8_t* const __restrict code0,
405
+ const float weight0,
406
+ const float* const __restrict pqFineCentroids1,
407
+ const uint8_t* const __restrict code1,
408
+ const float weight1,
409
+ const float* const __restrict pqFineCentroids2,
410
+ const uint8_t* const __restrict code2,
411
+ const float weight2,
412
+ float* const __restrict outputAccum) {
413
+ // fine quantizer
414
+ const uint8_t* const __restrict fine0 = code0;
415
+ const uint8_t* const __restrict fine1 = code1;
416
+ const uint8_t* const __restrict fine2 = code2;
417
+
418
+ // clang-format off
419
+
420
+ // process chunks, 4 float
421
+ // but 8 floats per loop
422
+
423
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
424
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
425
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
426
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
427
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
428
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
429
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
430
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
431
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
432
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
433
+ const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
434
+ const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
435
+
436
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
437
+
438
+ existingValue = elementaryBlock2x4bAccum(
439
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
440
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
441
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
442
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
443
+ weight0,
444
+ existingValue);
445
+
446
+ existingValue = elementaryBlock2x4bAccum(
447
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
448
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
449
+ pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
450
+ pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
451
+ weight1,
452
+ existingValue);
453
+
454
+ existingValue = elementaryBlock2x4bAccum(
455
+ pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
456
+ pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
457
+ pqFineCentroids2 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
458
+ pqFineCentroids2 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
459
+ weight2,
460
+ existingValue);
461
+
462
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
463
+
464
+ // next
465
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
466
+ pqFineCentroids0, code0, weight0,
467
+ pqFineCentroids1, code1, weight1,
468
+ pqFineCentroids2, code2, weight2,
469
+ outputAccum);
470
+
471
+ // clang-format on
472
+ }
473
+
474
+ // Process 3 samples: three (code, weight) pairs are applied in turn to the 8 floats at outputAccum + CPOS.
475
+ // Fine pq centroids table is shared among codes (all three index pqFineCentroids).
476
+ static void accum(
477
+ const float* const __restrict pqFineCentroids,
478
+ const uint8_t* const __restrict code0,
479
+ const float weight0,
480
+ const uint8_t* const __restrict code1,
481
+ const float weight1,
482
+ const uint8_t* const __restrict code2,
483
+ const float weight2,
484
+ float* const __restrict outputAccum) {
485
+ // fine quantizer: each code pointer is used directly as the fine-code stream
486
+ const uint8_t* const __restrict fine0 = code0;
487
+ const uint8_t* const __restrict fine1 = code1;
488
+ const uint8_t* const __restrict fine2 = code2;
489
+
490
+ // clang-format off
491
+
492
+ // process chunks: four sub-quantizer codes (fineCentroidIdx + 0..3) are read per sample,
493
+ // and 8 floats are updated per loop
494
+
495
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
496
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
497
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
498
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
499
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
500
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
501
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
502
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
503
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
504
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
505
+ const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
506
+ const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
507
+
508
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
509
+ // row address = table + (sub-quantizer * FINE_TABLE_BYTES + code) * FINE_SIZE, plus the offset inside the row
510
+ existingValue = elementaryBlock2x4bAccum(
511
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
512
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
513
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
514
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
515
+ weight0,
516
+ existingValue);
517
+
518
+ existingValue = elementaryBlock2x4bAccum(
519
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
520
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
521
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
522
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
523
+ weight1,
524
+ existingValue);
525
+
526
+ existingValue = elementaryBlock2x4bAccum(
527
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
528
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
529
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
530
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
531
+ weight2,
532
+ existingValue);
533
+
534
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
535
+
536
+ // next: continue with the instantiation for CPOS + 8 (the flag arguments are left to their defaults)
537
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
538
+ pqFineCentroids,
539
+ code0, weight0,
540
+ code1, weight1,
541
+ code2, weight2,
542
+ outputAccum);
543
+
544
+ // clang-format on
545
+ }
546
+ };
547
+ // Specialization for FINE_SIZE == 4: every store()/accum() step reads two sub-quantizer codes per sample, handles the 8 floats at position CPOS and then continues at CPOS + 8.
548
+ template <
549
+ intptr_t DIM,
550
+ intptr_t FINE_BITS,
551
+ intptr_t CPOS,
552
+ bool QPOS_LEFT_GE_8,
553
+ bool QPOS_LEFT_GE_4>
554
+ struct IndexPQDecoderImpl<
555
+ DIM,
556
+ 4, // FINE_SIZE is fixed by this specialization
557
+ FINE_BITS,
558
+ CPOS,
559
+ false,
560
+ true,
561
+ QPOS_LEFT_GE_8,
562
+ QPOS_LEFT_GE_4,
563
+ false> {
564
+ static constexpr intptr_t FINE_SIZE = 4; // mirrors the literal 4 in the argument list above
565
+
566
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; // index of the sub-quantizer whose sub-vector contains position CPOS
567
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; // offset of CPOS inside that sub-vector
568
+
569
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; // floats remaining in the current sub-vector, counted from CPOS
570
+
571
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); // rows per sub-quantizer table (2^FINE_BITS); used as an entry count in the float-pointer arithmetic below
572
+
573
+ // process 1 sample: writes the 8 floats returned by elementaryBlock4x2b to outputStore + CPOS
574
+ static void store(
575
+ const float* const __restrict pqFineCentroids0,
576
+ const uint8_t* const __restrict code0,
577
+ float* const __restrict outputStore) {
578
+ // fine quantizer: code0 is used directly as the fine-code stream
579
+ const uint8_t* const __restrict fine0 = code0;
580
+
581
+ // clang-format off
582
+
583
+ // process chunks, 4 float per sub-quantizer code
584
+ // but 8 floats per loop, hence two codes (fineCentroidIdx + 0 and + 1) are read
585
+
586
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
587
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
588
+ // row address = table + (sub-quantizer * FINE_TABLE_BYTES + code) * FINE_SIZE, plus the offset inside the row
589
+ const __m256 storeValue = elementaryBlock4x2b(
590
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
591
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset);
592
+
593
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
594
+
595
+ // next: continue with the instantiation for CPOS + 8 (the flag arguments are left to their defaults)
596
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
597
+ pqFineCentroids0, code0, outputStore);
598
+
599
+ // clang-format on
600
+ }
601
+
602
+ // process 1 sample: runs the 8 floats at outputAccum + CPOS through elementaryBlock4x2bAccum with weight0 and writes them back
603
+ static void accum(
604
+ const float* const __restrict pqFineCentroids0,
605
+ const uint8_t* const __restrict code0,
606
+ const float weight0,
607
+ float* const __restrict outputAccum) {
608
+ // fine quantizer: code0 is used directly as the fine-code stream
609
+ const uint8_t* const __restrict fine0 = code0;
610
+
611
+ // clang-format off
612
+
613
+ // process chunks, 4 float per sub-quantizer code
614
+ // but 8 floats per loop, hence two codes (fineCentroidIdx + 0 and + 1) are read
615
+
616
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
617
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
618
+
619
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
620
+ // NOTE(review): elementaryBlock4x2bAccum is defined outside this chunk; presumably it adds the weighted centroid values to existingValue - confirm
621
+ existingValue = elementaryBlock4x2bAccum(
622
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
623
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
624
+ weight0,
625
+ existingValue);
626
+
627
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
628
+
629
+ // next: continue with the instantiation for CPOS + 8 (the flag arguments are left to their defaults)
630
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
631
+ pqFineCentroids0, code0, weight0, outputAccum);
632
+
633
+ // clang-format on
634
+ }
635
+
636
+ // Process 2 samples: both (code, weight) pairs are applied in turn to the same 8 floats at outputAccum + CPOS.
637
+ // Each code uses its own fine pq centroids table (pqFineCentroids0 for code0, pqFineCentroids1 for code1).
638
+ static void accum(
639
+ const float* const __restrict pqFineCentroids0,
640
+ const uint8_t* const __restrict code0,
641
+ const float weight0,
642
+ const float* const __restrict pqFineCentroids1,
643
+ const uint8_t* const __restrict code1,
644
+ const float weight1,
645
+ float* const __restrict outputAccum) {
646
+ // fine quantizer: the code pointers are used directly as the fine-code streams
647
+ const uint8_t* const __restrict fine0 = code0;
648
+ const uint8_t* const __restrict fine1 = code1;
649
+
650
+ // clang-format off
651
+
652
+ // process chunks, 4 float per sub-quantizer code
653
+ // but 8 floats per loop, hence two codes (fineCentroidIdx + 0 and + 1) are read per sample
654
+
655
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
656
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
657
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
658
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
659
+
660
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
661
+
662
+ existingValue = elementaryBlock4x2bAccum(
663
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
664
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
665
+ weight0,
666
+ existingValue);
667
+
668
+ existingValue = elementaryBlock4x2bAccum(
669
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
670
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
671
+ weight1,
672
+ existingValue);
673
+
674
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
675
+
676
+ // next: continue with the instantiation for CPOS + 8, forwarding all arguments unchanged
677
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
678
+ pqFineCentroids0, code0, weight0,
679
+ pqFineCentroids1, code1, weight1,
680
+ outputAccum);
681
+
682
+ // clang-format on
683
+ }
684
+
685
+ // Process 2 samples: both (code, weight) pairs are applied in turn to the same 8 floats at outputAccum + CPOS.
686
+ // Fine pq centroids table is shared among codes (both index pqFineCentroids).
687
+ static void accum(
688
+ const float* const __restrict pqFineCentroids,
689
+ const uint8_t* const __restrict code0,
690
+ const float weight0,
691
+ const uint8_t* const __restrict code1,
692
+ const float weight1,
693
+ float* const __restrict outputAccum) {
694
+ // fine quantizer: the code pointers are used directly as the fine-code streams
695
+ const uint8_t* const __restrict fine0 = code0;
696
+ const uint8_t* const __restrict fine1 = code1;
697
+
698
+ // clang-format off
699
+
700
+ // process chunks, 4 float per sub-quantizer code
701
+ // but 8 floats per loop, hence two codes (fineCentroidIdx + 0 and + 1) are read per sample
702
+
703
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
704
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
705
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
706
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
707
+
708
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
709
+
710
+ existingValue = elementaryBlock4x2bAccum(
711
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
712
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
713
+ weight0,
714
+ existingValue);
715
+
716
+ existingValue = elementaryBlock4x2bAccum(
717
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
718
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
719
+ weight1,
720
+ existingValue);
721
+
722
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
723
+
724
+ // next: continue with the instantiation for CPOS + 8, forwarding all arguments unchanged
725
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
726
+ pqFineCentroids,
727
+ code0, weight0,
728
+ code1, weight1,
729
+ outputAccum);
730
+
731
+ // clang-format on
732
+ }
733
+
734
+ // Process 3 samples: three (code, weight) pairs are applied in turn to the same 8 floats at outputAccum + CPOS.
735
+ // Each code uses its own fine pq centroids table (pqFineCentroids0/1/2 for code0/1/2).
736
+ static void accum(
737
+ const float* const __restrict pqFineCentroids0,
738
+ const uint8_t* const __restrict code0,
739
+ const float weight0,
740
+ const float* const __restrict pqFineCentroids1,
741
+ const uint8_t* const __restrict code1,
742
+ const float weight1,
743
+ const float* const __restrict pqFineCentroids2,
744
+ const uint8_t* const __restrict code2,
745
+ const float weight2,
746
+ float* const __restrict outputAccum) {
747
+ // fine quantizer: the code pointers are used directly as the fine-code streams
748
+ const uint8_t* const __restrict fine0 = code0;
749
+ const uint8_t* const __restrict fine1 = code1;
750
+ const uint8_t* const __restrict fine2 = code2;
751
+
752
+ // clang-format off
753
+
754
+ // process chunks, 4 float per sub-quantizer code
755
+ // but 8 floats per loop, hence two codes (fineCentroidIdx + 0 and + 1) are read per sample
756
+
757
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
758
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
759
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
760
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
761
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
762
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
763
+
764
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
765
+
766
+ existingValue = elementaryBlock4x2bAccum(
767
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
768
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
769
+ weight0,
770
+ existingValue);
771
+
772
+ existingValue = elementaryBlock4x2bAccum(
773
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
774
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
775
+ weight1,
776
+ existingValue);
777
+
778
+ existingValue = elementaryBlock4x2bAccum(
779
+ pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
780
+ pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
781
+ weight2,
782
+ existingValue);
783
+
784
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
785
+
786
+ // next: continue with the instantiation for CPOS + 8, forwarding all arguments unchanged
787
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
788
+ pqFineCentroids0, code0, weight0,
789
+ pqFineCentroids1, code1, weight1,
790
+ pqFineCentroids2, code2, weight2,
791
+ outputAccum);
792
+
793
+ // clang-format on
794
+ }
795
+
796
+ // Process 3 samples: three (code, weight) pairs are applied in turn to the same 8 floats at outputAccum + CPOS.
797
+ // Fine pq centroids table is shared among codes (all three index pqFineCentroids).
798
+ static void accum(
799
+ const float* const __restrict pqFineCentroids,
800
+ const uint8_t* const __restrict code0,
801
+ const float weight0,
802
+ const uint8_t* const __restrict code1,
803
+ const float weight1,
804
+ const uint8_t* const __restrict code2,
805
+ const float weight2,
806
+ float* const __restrict outputAccum) {
807
+ // fine quantizer: the code pointers are used directly as the fine-code streams
808
+ const uint8_t* const __restrict fine0 = code0;
809
+ const uint8_t* const __restrict fine1 = code1;
810
+ const uint8_t* const __restrict fine2 = code2;
811
+
812
+ // clang-format off
813
+
814
+ // process chunks, 4 float per sub-quantizer code
815
+ // but 8 floats per loop, hence two codes (fineCentroidIdx + 0 and + 1) are read per sample
816
+
817
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
818
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
819
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
820
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
821
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
822
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
823
+
824
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
825
+
826
+ existingValue = elementaryBlock4x2bAccum(
827
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
828
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
829
+ weight0,
830
+ existingValue);
831
+
832
+ existingValue = elementaryBlock4x2bAccum(
833
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
834
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
835
+ weight1,
836
+ existingValue);
837
+
838
+ existingValue = elementaryBlock4x2bAccum(
839
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
840
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
841
+ weight2,
842
+ existingValue);
843
+
844
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
845
+
846
+ // next: continue with the instantiation for CPOS + 8, forwarding all arguments unchanged
847
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
848
+ pqFineCentroids,
849
+ code0, weight0,
850
+ code1, weight1,
851
+ code2, weight2,
852
+ outputAccum);
853
+
854
+ // clang-format on
855
+ }
856
+ };
857
+ // Specialization for the flag pattern (false, false, true, true, false): every store()/accum() step reads one sub-quantizer code per sample, handles the 8 floats at position CPOS and then continues at CPOS + 8.
858
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
859
+ struct IndexPQDecoderImpl<
860
+ DIM,
861
+ FINE_SIZE,
862
+ FINE_BITS,
863
+ CPOS,
864
+ false,
865
+ false,
866
+ true, // slot named QPOS_LEFT_GE_8 in the FINE_SIZE == 4 specialization of this template
867
+ true, // slot named QPOS_LEFT_GE_4 in the FINE_SIZE == 4 specialization of this template
868
+ false> {
869
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; // index of the sub-quantizer whose sub-vector contains position CPOS
870
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; // offset of CPOS inside that sub-vector
871
+
872
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; // floats remaining in the current sub-vector, counted from CPOS
873
+
874
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); // rows per sub-quantizer table (2^FINE_BITS); used as an entry count in the float-pointer arithmetic below
875
+
876
+ // process 1 sample: writes the 8 floats returned by elementaryBlock8x1b to outputStore + CPOS
877
+ static void store(
878
+ const float* const __restrict pqFineCentroids0,
879
+ const uint8_t* const __restrict code0,
880
+ float* const __restrict outputStore) {
881
+ // fine quantizer: code0 is used directly as the fine-code stream
882
+ const uint8_t* const __restrict fine0 = code0;
883
+
884
+ // clang-format off
885
+
886
+ // process chunks, 8 float, all addressed through a single sub-quantizer code
887
+
888
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
889
+ // row address = table + (sub-quantizer * FINE_TABLE_BYTES + code) * FINE_SIZE, plus the offset inside the row
890
+ const __m256 storeValue = elementaryBlock8x1b(
891
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
892
+
893
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
894
+
895
+ // next: continue with the instantiation for CPOS + 8 (the flag arguments are left to their defaults)
896
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
897
+ pqFineCentroids0, code0, outputStore);
898
+
899
+ // clang-format on
900
+ }
901
+
902
+ // process 1 sample: runs the 8 floats at outputAccum + CPOS through elementaryBlock8x1bAccum with weight0 and writes them back
903
+ static void accum(
904
+ const float* const __restrict pqFineCentroids0,
905
+ const uint8_t* const __restrict code0,
906
+ const float weight0,
907
+ float* const __restrict outputAccum) {
908
+ // fine quantizer: code0 is used directly as the fine-code stream
909
+ const uint8_t* const __restrict fine0 = code0;
910
+
911
+ // clang-format off
912
+
913
+ // process chunks, 8 float, all addressed through a single sub-quantizer code
914
+
915
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
916
+
917
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
918
+ // NOTE(review): elementaryBlock8x1bAccum is defined outside this chunk; presumably it adds the weighted centroid values to existingValue - confirm
919
+ existingValue = elementaryBlock8x1bAccum(
920
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
921
+ weight0,
922
+ existingValue);
923
+
924
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
925
+
926
+ // next: continue with the instantiation for CPOS + 8 (the flag arguments are left to their defaults)
927
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
928
+ pqFineCentroids0, code0, weight0, outputAccum);
929
+
930
+ // clang-format on
931
+ }
932
+
933
+ // Process 2 samples: both (code, weight) pairs are applied in turn to the same 8 floats at outputAccum + CPOS.
934
+ // Each code uses its own fine pq centroids table (pqFineCentroids0 for code0, pqFineCentroids1 for code1).
935
+ static void accum(
936
+ const float* const __restrict pqFineCentroids0,
937
+ const uint8_t* const __restrict code0,
938
+ const float weight0,
939
+ const float* const __restrict pqFineCentroids1,
940
+ const uint8_t* const __restrict code1,
941
+ const float weight1,
942
+ float* const __restrict outputAccum) {
943
+ // fine quantizer: the code pointers are used directly as the fine-code streams
944
+ const uint8_t* const __restrict fine0 = code0;
945
+ const uint8_t* const __restrict fine1 = code1;
946
+
947
+ // clang-format off
948
+
949
+ // process chunks, 8 float, addressed through one sub-quantizer code per sample
950
+
951
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
952
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
953
+
954
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
955
+
956
+ existingValue = elementaryBlock8x1bAccum(
957
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
958
+ weight0,
959
+ existingValue);
960
+
961
+ existingValue = elementaryBlock8x1bAccum(
962
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
963
+ weight1,
964
+ existingValue);
965
+
966
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
967
+
968
+ // next: continue with the instantiation for CPOS + 8, forwarding all arguments unchanged
969
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
970
+ pqFineCentroids0, code0, weight0,
971
+ pqFineCentroids1, code1, weight1,
972
+ outputAccum);
973
+
974
+ // clang-format on
975
+ }
976
+
977
+ // Process 2 samples: both (code, weight) pairs are applied in turn to the same 8 floats at outputAccum + CPOS.
978
+ // Fine pq centroids table is shared among codes (both index pqFineCentroids).
979
+ static void accum(
980
+ const float* const __restrict pqFineCentroids,
981
+ const uint8_t* const __restrict code0,
982
+ const float weight0,
983
+ const uint8_t* const __restrict code1,
984
+ const float weight1,
985
+ float* const __restrict outputAccum) {
986
+ // fine quantizer: the code pointers are used directly as the fine-code streams
987
+ const uint8_t* const __restrict fine0 = code0;
988
+ const uint8_t* const __restrict fine1 = code1;
989
+
990
+ // clang-format off
991
+
992
+ // process chunks, 8 float, addressed through one sub-quantizer code per sample
993
+
994
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
995
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
996
+
997
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
998
+
999
+ existingValue = elementaryBlock8x1bAccum(
1000
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1001
+ weight0,
1002
+ existingValue);
1003
+
1004
+ existingValue = elementaryBlock8x1bAccum(
1005
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1006
+ weight1,
1007
+ existingValue);
1008
+
1009
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1010
+
1011
+ // next: continue with the instantiation for CPOS + 8, forwarding all arguments unchanged
1012
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
1013
+ pqFineCentroids,
1014
+ code0, weight0,
1015
+ code1, weight1,
1016
+ outputAccum);
1017
+
1018
+ // clang-format on
1019
+ }
1020
+
1021
+ // Process 3 samples: three (code, weight) pairs are applied in turn to the same 8 floats at outputAccum + CPOS.
1022
+ // Each code uses its own fine pq centroids table (pqFineCentroids0/1/2 for code0/1/2).
1023
+ static void accum(
1024
+ const float* const __restrict pqFineCentroids0,
1025
+ const uint8_t* const __restrict code0,
1026
+ const float weight0,
1027
+ const float* const __restrict pqFineCentroids1,
1028
+ const uint8_t* const __restrict code1,
1029
+ const float weight1,
1030
+ const float* const __restrict pqFineCentroids2,
1031
+ const uint8_t* const __restrict code2,
1032
+ const float weight2,
1033
+ float* const __restrict outputAccum) {
1034
+ // fine quantizer: the code pointers are used directly as the fine-code streams
1035
+ const uint8_t* const __restrict fine0 = code0;
1036
+ const uint8_t* const __restrict fine1 = code1;
1037
+ const uint8_t* const __restrict fine2 = code2;
1038
+
1039
+ // clang-format off
1040
+
1041
+ // process chunks, 8 float, addressed through one sub-quantizer code per sample
1042
+
1043
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1044
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1045
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1046
+
1047
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1048
+
1049
+ existingValue = elementaryBlock8x1bAccum(
1050
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1051
+ weight0,
1052
+ existingValue);
1053
+
1054
+ existingValue = elementaryBlock8x1bAccum(
1055
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1056
+ weight1,
1057
+ existingValue);
1058
+
1059
+ existingValue = elementaryBlock8x1bAccum(
1060
+ pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1061
+ weight2,
1062
+ existingValue);
1063
+
1064
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1065
+
1066
+ // next: continue with the instantiation for CPOS + 8, forwarding all arguments unchanged
1067
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
1068
+ pqFineCentroids0, code0, weight0,
1069
+ pqFineCentroids1, code1, weight1,
1070
+ pqFineCentroids2, code2, weight2,
1071
+ outputAccum);
1072
+
1073
+ // clang-format on
1074
+ }
1075
+
1076
+ // Process 3 samples: three (code, weight) pairs are applied in turn to the same 8 floats at outputAccum + CPOS.
1077
+ // Fine pq centroids table is shared among codes (all three index pqFineCentroids).
1078
+ static void accum(
1079
+ const float* const __restrict pqFineCentroids,
1080
+ const uint8_t* const __restrict code0,
1081
+ const float weight0,
1082
+ const uint8_t* const __restrict code1,
1083
+ const float weight1,
1084
+ const uint8_t* const __restrict code2,
1085
+ const float weight2,
1086
+ float* const __restrict outputAccum) {
1087
+ // fine quantizer: the code pointers are used directly as the fine-code streams
1088
+ const uint8_t* const __restrict fine0 = code0;
1089
+ const uint8_t* const __restrict fine1 = code1;
1090
+ const uint8_t* const __restrict fine2 = code2;
1091
+
1092
+ // clang-format off
1093
+
1094
+ // process chunks, 8 float, addressed through one sub-quantizer code per sample
1095
+
1096
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1097
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1098
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1099
+
1100
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1101
+
1102
+ existingValue = elementaryBlock8x1bAccum(
1103
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1104
+ weight0,
1105
+ existingValue);
1106
+
1107
+ existingValue = elementaryBlock8x1bAccum(
1108
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1109
+ weight1,
1110
+ existingValue);
1111
+
1112
+ existingValue = elementaryBlock8x1bAccum(
1113
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1114
+ weight2,
1115
+ existingValue);
1116
+
1117
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1118
+
1119
+ // next: continue with the instantiation for CPOS + 8, forwarding all arguments unchanged
1120
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
1121
+ pqFineCentroids,
1122
+ code0, weight0,
1123
+ code1, weight1,
1124
+ code2, weight2,
1125
+ outputAccum);
1126
+
1127
+ // clang-format on
1128
+ }
1129
+ };
1130
+
1131
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
1132
+ struct IndexPQDecoderImpl<
1133
+ DIM,
1134
+ FINE_SIZE,
1135
+ FINE_BITS,
1136
+ CPOS,
1137
+ false,
1138
+ false,
1139
+ false,
1140
+ true,
1141
+ false> {
1142
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
1143
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
1144
+
1145
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
1146
+
1147
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
1148
+
1149
+ // process 1 sample: writes the 4 floats returned by elementaryBlock4x1b to outputStore + CPOS
1150
+ static void store(
1151
+ const float* const __restrict pqFineCentroids0,
1152
+ const uint8_t* const __restrict code0,
1153
+ float* const __restrict outputStore) {
1154
+ // fine quantizer: code0 is used directly as the fine-code stream
1155
+ const uint8_t* const __restrict fine0 = code0;
1156
+
1157
+ // clang-format off
1158
+
1159
+ // process chunks, 4 float, addressed through a single sub-quantizer code (128-bit store)
1160
+
1161
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1162
+ // row address = table + (sub-quantizer * FINE_TABLE_BYTES + code) * FINE_SIZE, plus the offset inside the row
1163
+ const __m128 storeValue = elementaryBlock4x1b(
1164
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
1165
+
1166
+ _mm_storeu_ps(outputStore + CPOS, storeValue);
1167
+
1168
+ // next: continue with the instantiation for CPOS + 4 (the flag arguments are left to their defaults)
1169
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::store(
1170
+ pqFineCentroids0, code0, outputStore);
1171
+
1172
+ // clang-format on
1173
+ }
1174
+
1175
+ // process 1 sample: runs the 4 floats at outputAccum + CPOS through elementaryBlock4x1bAccum with weight0 and writes them back
1176
+ static void accum(
1177
+ const float* const __restrict pqFineCentroids0,
1178
+ const uint8_t* const __restrict code0,
1179
+ const float weight0,
1180
+ float* const __restrict outputAccum) {
1181
+ // fine quantizer: code0 is used directly as the fine-code stream
1182
+ const uint8_t* const __restrict fine0 = code0;
1183
+
1184
+ // clang-format off
1185
+
1186
+ // process chunks, 4 float, addressed through a single sub-quantizer code (128-bit load/store)
1187
+
1188
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1189
+
1190
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1191
+ // NOTE(review): elementaryBlock4x1bAccum is defined outside this chunk; presumably it adds the weighted centroid values to existingValue - confirm
1192
+ existingValue = elementaryBlock4x1bAccum(
1193
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1194
+ weight0,
1195
+ existingValue);
1196
+
1197
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1198
+
1199
+ // next: continue with the instantiation for CPOS + 4 (the flag arguments are left to their defaults)
1200
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1201
+ pqFineCentroids0, code0, weight0, outputAccum);
1202
+
1203
+ // clang-format on
1204
+ }
1205
+
1206
+ // Process 2 samples: both (code, weight) pairs are applied in turn to the same 4 floats at outputAccum + CPOS.
1207
+ // Each code uses its own fine pq centroids table (pqFineCentroids0 for code0, pqFineCentroids1 for code1).
1208
+ static void accum(
1209
+ const float* const __restrict pqFineCentroids0,
1210
+ const uint8_t* const __restrict code0,
1211
+ const float weight0,
1212
+ const float* const __restrict pqFineCentroids1,
1213
+ const uint8_t* const __restrict code1,
1214
+ const float weight1,
1215
+ float* const __restrict outputAccum) {
1216
+ // fine quantizer: the code pointers are used directly as the fine-code streams
1217
+ const uint8_t* const __restrict fine0 = code0;
1218
+ const uint8_t* const __restrict fine1 = code1;
1219
+
1220
+ // clang-format off
1221
+
1222
+ // process chunks, 4 float, addressed through one sub-quantizer code per sample (128-bit load/store)
1223
+
1224
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1225
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1226
+
1227
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1228
+ // row address = table + (sub-quantizer * FINE_TABLE_BYTES + code) * FINE_SIZE, plus the offset inside the row
1229
+ existingValue = elementaryBlock4x1bAccum(
1230
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1231
+ weight0,
1232
+ existingValue);
1233
+
1234
+ existingValue = elementaryBlock4x1bAccum(
1235
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1236
+ weight1,
1237
+ existingValue);
1238
+
1239
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1240
+
1241
+ // next: continue with the instantiation for CPOS + 4, forwarding all arguments unchanged
1242
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1243
+ pqFineCentroids0, code0, weight0,
1244
+ pqFineCentroids1, code1, weight1,
1245
+ outputAccum);
1246
+
1247
+ // clang-format on
1248
+ }
1249
+
1250
+ // Process 2 samples: both (code, weight) pairs are applied in turn to the same 4 floats at outputAccum + CPOS.
1251
+ // Fine pq centroids table is shared among codes (both index pqFineCentroids).
1252
+ static void accum(
1253
+ const float* const __restrict pqFineCentroids,
1254
+ const uint8_t* const __restrict code0,
1255
+ const float weight0,
1256
+ const uint8_t* const __restrict code1,
1257
+ const float weight1,
1258
+ float* const __restrict outputAccum) {
1259
+ // fine quantizer: the code pointers are used directly as the fine-code streams
1260
+ const uint8_t* const __restrict fine0 = code0;
1261
+ const uint8_t* const __restrict fine1 = code1;
1262
+
1263
+ // clang-format off
1264
+
1265
+ // process chunks, 4 float, addressed through one sub-quantizer code per sample (128-bit load/store)
1266
+
1267
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1268
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1269
+
1270
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1271
+ // row address = table + (sub-quantizer * FINE_TABLE_BYTES + code) * FINE_SIZE, plus the offset inside the row
1272
+ existingValue = elementaryBlock4x1bAccum(
1273
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1274
+ weight0,
1275
+ existingValue);
1276
+
1277
+ existingValue = elementaryBlock4x1bAccum(
1278
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1279
+ weight1,
1280
+ existingValue);
1281
+
1282
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1283
+
1284
+ // next: continue with the instantiation for CPOS + 4, forwarding all arguments unchanged
1285
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1286
+ pqFineCentroids,
1287
+ code0, weight0,
1288
+ code1, weight1,
1289
+ outputAccum);
1290
+
1291
+ // clang-format on
1292
+ }
1293
+
1294
+ // Process 3 samples: three (code, weight) pairs are applied in turn to the same 4 floats at outputAccum + CPOS.
1295
+ // Each code uses its own fine pq centroids table (pqFineCentroids0/1/2 for code0/1/2).
1296
+ static void accum(
1297
+ const float* const __restrict pqFineCentroids0,
1298
+ const uint8_t* const __restrict code0,
1299
+ const float weight0,
1300
+ const float* const __restrict pqFineCentroids1,
1301
+ const uint8_t* const __restrict code1,
1302
+ const float weight1,
1303
+ const float* const __restrict pqFineCentroids2,
1304
+ const uint8_t* const __restrict code2,
1305
+ const float weight2,
1306
+ float* const __restrict outputAccum) {
1307
+ // fine quantizer: the code pointers are used directly as the fine-code streams
1308
+ const uint8_t* const __restrict fine0 = code0;
1309
+ const uint8_t* const __restrict fine1 = code1;
1310
+ const uint8_t* const __restrict fine2 = code2;
1311
+
1312
+ // clang-format off
1313
+
1314
+ // process chunks, 4 float, addressed through one sub-quantizer code per sample (128-bit load/store)
1315
+
1316
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1317
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1318
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1319
+
1320
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1321
+ // row address = table + (sub-quantizer * FINE_TABLE_BYTES + code) * FINE_SIZE, plus the offset inside the row
1322
+ existingValue = elementaryBlock4x1bAccum(
1323
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1324
+ weight0,
1325
+ existingValue);
1326
+
1327
+ existingValue = elementaryBlock4x1bAccum(
1328
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1329
+ weight1,
1330
+ existingValue);
1331
+
1332
+ existingValue = elementaryBlock4x1bAccum(
1333
+ pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1334
+ weight2,
1335
+ existingValue);
1336
+
1337
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1338
+
1339
+ // next: continue with the instantiation for CPOS + 4, forwarding all arguments unchanged
1340
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1341
+ pqFineCentroids0, code0, weight0,
1342
+ pqFineCentroids1, code1, weight1,
1343
+ pqFineCentroids2, code2, weight2,
1344
+ outputAccum);
1345
+
1346
+ // clang-format on
1347
+ }
1348
+
1349
+ // Process 3 samples.
1350
+ // Fine pq centroids table is shared among codes.
1351
+ static void accum(
1352
+ const float* const __restrict pqFineCentroids,
1353
+ const uint8_t* const __restrict code0,
1354
+ const float weight0,
1355
+ const uint8_t* const __restrict code1,
1356
+ const float weight1,
1357
+ const uint8_t* const __restrict code2,
1358
+ const float weight2,
1359
+ float* const __restrict outputAccum) {
1360
+ // fine quantizer
1361
+ const uint8_t* const __restrict fine0 = code0;
1362
+ const uint8_t* const __restrict fine1 = code1;
1363
+ const uint8_t* const __restrict fine2 = code2;
1364
+
1365
+ // clang-format off
1366
+
1367
+ // process chunks, 4 float
1368
+
1369
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1370
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1371
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1372
+
1373
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1374
+
1375
+ existingValue = elementaryBlock4x1bAccum(
1376
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1377
+ weight0,
1378
+ existingValue);
1379
+
1380
+ existingValue = elementaryBlock4x1bAccum(
1381
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1382
+ weight1,
1383
+ existingValue);
1384
+
1385
+ existingValue = elementaryBlock4x1bAccum(
1386
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1387
+ weight2,
1388
+ existingValue);
1389
+
1390
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1391
+
1392
+ // next
1393
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1394
+ pqFineCentroids, code0, weight0,
1395
+ code1, weight1,
1396
+ code2, weight2,
1397
+ outputAccum);
1398
+
1399
+ // clang-format on
1400
+ }
1401
+ };
1402
+
1403
+ // This partial specialization is expected to do nothing.
1404
+ template <
1405
+ intptr_t DIM,
1406
+ intptr_t FINE_SIZE,
1407
+ intptr_t FINE_BITS,
1408
+ bool FINE_SIZE_EQ_2,
1409
+ bool FINE_SIZE_EQ_4,
1410
+ bool QPOS_LEFT_GE_8,
1411
+ bool QPOS_LEFT_GE_4>
1412
+ struct IndexPQDecoderImpl<
1413
+ DIM,
1414
+ FINE_SIZE,
1415
+ FINE_BITS,
1416
+ DIM,
1417
+ FINE_SIZE_EQ_2,
1418
+ FINE_SIZE_EQ_4,
1419
+ QPOS_LEFT_GE_8,
1420
+ QPOS_LEFT_GE_4,
1421
+ true> {
1422
+ // clang-format off
1423
+
1424
+ // process 1 sample
1425
+ static void store(
1426
+ const float* const __restrict pqFineCentroids0,
1427
+ const uint8_t* const __restrict code0,
1428
+ float* const __restrict outputStore) {}
1429
+
1430
+ // process 1 sample
1431
+ static void accum(
1432
+ const float* const __restrict pqFineCentroids0,
1433
+ const uint8_t* const __restrict code0,
1434
+ const float weight0,
1435
+ float* const __restrict outputAccum) {}
1436
+
1437
+ // Process 2 samples.
1438
+ // Each code uses its own fine pq centroids table.
1439
+ static void accum(
1440
+ const float* const __restrict pqFineCentroids0,
1441
+ const uint8_t* const __restrict code0,
1442
+ const float weight0,
1443
+ const float* const __restrict pqFineCentroids1,
1444
+ const uint8_t* const __restrict code1,
1445
+ const float weight1,
1446
+ float* const __restrict outputAccum) {}
1447
+
1448
+ // Process 2 samples.
1449
+ // Fine pq centroids table is shared among codes.
1450
+ static void accum(
1451
+ const float* const __restrict pqFineCentroids,
1452
+ const uint8_t* const __restrict code0,
1453
+ const float weight0,
1454
+ const uint8_t* const __restrict code1,
1455
+ const float weight1,
1456
+ float* const __restrict outputAccum) {}
1457
+
1458
+ // Process 3 samples.
1459
+ // Each code uses its own fine pq centroids table.
1460
+ static void accum(
1461
+ const float* const __restrict pqFineCentroids0,
1462
+ const uint8_t* const __restrict code0,
1463
+ const float weight0,
1464
+ const float* const __restrict pqFineCentroids1,
1465
+ const uint8_t* const __restrict code1,
1466
+ const float weight1,
1467
+ const float* const __restrict pqFineCentroids2,
1468
+ const uint8_t* const __restrict code2,
1469
+ const float weight2,
1470
+ float* const __restrict outputAccum) {}
1471
+
1472
+ // Process 3 samples.
1473
+ // Fine pq centroids table is shared among codes.
1474
+ static void accum(
1475
+ const float* const __restrict pqFineCentroids,
1476
+ const uint8_t* const __restrict code0,
1477
+ const float weight0,
1478
+ const uint8_t* const __restrict code1,
1479
+ const float weight1,
1480
+ const uint8_t* const __restrict code2,
1481
+ const float weight2,
1482
+ float* const __restrict outputAccum) {}
1483
+
1484
+ // clang-format on
1485
+ };
1486
+
1487
+ } // namespace
1488
+
1489
+ // Suitable for PQ[1]x8
1490
+ // Suitable for PQ[1]x10
1491
+ // Suitable for PQ[1]x16
1492
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS = 8>
1493
+ struct IndexPQDecoder {
1494
+ static_assert(
1495
+ FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16,
1496
+ "Only 8, 10 or 16 bits are currently supported for FINE_BITS");
1497
+
1498
+ static constexpr intptr_t dim = DIM;
1499
+ static constexpr intptr_t fineSize = FINE_SIZE;
1500
+ static constexpr intptr_t fineBits = FINE_BITS;
1501
+
1502
+ // Process 1 sample.
1503
+ static void store(
1504
+ const float* const __restrict pqFineCentroids,
1505
+ const uint8_t* const __restrict code,
1506
+ float* const __restrict outputStore) {
1507
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::store(
1508
+ pqFineCentroids, code, outputStore);
1509
+ }
1510
+
1511
+ // Process 1 sample.
1512
+ // Performs outputAccum += weight * decoded(code)
1513
+ static void accum(
1514
+ const float* const __restrict pqFineCentroids,
1515
+ const uint8_t* const __restrict code,
1516
+ const float weight,
1517
+ float* const __restrict outputAccum) {
1518
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1519
+ pqFineCentroids, code, weight, outputAccum);
1520
+ }
1521
+
1522
+ // Process 2 samples.
1523
+ // Each code uses its own fine pq centroids table.
1524
+ //
1525
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1526
+ // decoded(code1)
1527
+ static void accum(
1528
+ const float* const __restrict pqFineCentroids0,
1529
+ const uint8_t* const __restrict code0,
1530
+ const float weight0,
1531
+ const float* const __restrict pqFineCentroids1,
1532
+ const uint8_t* const __restrict code1,
1533
+ const float weight1,
1534
+ float* const __restrict outputAccum) {
1535
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1536
+ pqFineCentroids0,
1537
+ code0,
1538
+ weight0,
1539
+ pqFineCentroids1,
1540
+ code1,
1541
+ weight1,
1542
+ outputAccum);
1543
+ }
1544
+
1545
+ // Process 2 samples.
1546
+ // Fine pq centroids table is shared among codes.
1547
+ //
1548
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1549
+ // decoded(code1)
1550
+ static void accum(
1551
+ const float* const __restrict pqFineCentroids,
1552
+ const uint8_t* const __restrict code0,
1553
+ const float weight0,
1554
+ const uint8_t* const __restrict code1,
1555
+ const float weight1,
1556
+ float* const __restrict outputAccum) {
1557
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1558
+ pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
1559
+ }
1560
+
1561
+ // Process 3 samples.
1562
+ // Each code uses its own fine pq centroids table.
1563
+ //
1564
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1565
+ // decoded(code1) + weight2 * decoded(code2)
1566
+ static void accum(
1567
+ const float* const __restrict pqFineCentroids0,
1568
+ const uint8_t* const __restrict code0,
1569
+ const float weight0,
1570
+ const float* const __restrict pqFineCentroids1,
1571
+ const uint8_t* const __restrict code1,
1572
+ const float weight1,
1573
+ const float* const __restrict pqFineCentroids2,
1574
+ const uint8_t* const __restrict code2,
1575
+ const float weight2,
1576
+ float* const __restrict outputAccum) {
1577
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1578
+ pqFineCentroids0,
1579
+ code0,
1580
+ weight0,
1581
+ pqFineCentroids1,
1582
+ code1,
1583
+ weight1,
1584
+ pqFineCentroids2,
1585
+ code2,
1586
+ weight2,
1587
+ outputAccum);
1588
+ }
1589
+
1590
+ // Process 3 samples.
1591
+ // Fine pq centroids table is shared among codes.
1592
+ //
1593
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1594
+ // decoded(code1) + weight2 * decoded(code2)
1595
+ static void accum(
1596
+ const float* const __restrict pqFineCentroids,
1597
+ const uint8_t* const __restrict code0,
1598
+ const float weight0,
1599
+ const uint8_t* const __restrict code1,
1600
+ const float weight1,
1601
+ const uint8_t* const __restrict code2,
1602
+ const float weight2,
1603
+ float* const __restrict outputAccum) {
1604
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1605
+ pqFineCentroids,
1606
+ code0,
1607
+ weight0,
1608
+ code1,
1609
+ weight1,
1610
+ code2,
1611
+ weight2,
1612
+ outputAccum);
1613
+ }
1614
+ };
1615
+
1616
+ } // namespace cppcontrib
1617
+ } // namespace faiss
1618
+ #endif // PQ_AVX2_INL_H