faiss 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +23 -21
  4. data/ext/faiss/extconf.rb +11 -0
  5. data/ext/faiss/index.cpp +4 -4
  6. data/ext/faiss/index_binary.cpp +6 -6
  7. data/ext/faiss/product_quantizer.cpp +4 -4
  8. data/lib/faiss/version.rb +1 -1
  9. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  10. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  11. data/vendor/faiss/faiss/IVFlib.h +26 -2
  12. data/vendor/faiss/faiss/Index.cpp +36 -3
  13. data/vendor/faiss/faiss/Index.h +43 -6
  14. data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
  15. data/vendor/faiss/faiss/Index2Layer.h +6 -1
  16. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
  17. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  20. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  22. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  23. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  24. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  31. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  32. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  33. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  34. data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
  35. data/vendor/faiss/faiss/IndexFlat.h +7 -4
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  39. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  40. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  41. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  42. data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
  43. data/vendor/faiss/faiss/IndexIVF.h +35 -16
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
  45. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  48. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  49. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  50. data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
  51. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
  53. data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  55. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  56. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  57. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  58. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
  59. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  60. data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
  61. data/vendor/faiss/faiss/IndexLSH.h +2 -1
  62. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  63. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  66. data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
  67. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  68. data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
  69. data/vendor/faiss/faiss/IndexPQ.h +19 -5
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  73. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  74. data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
  75. data/vendor/faiss/faiss/IndexRefine.h +4 -2
  76. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  77. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  78. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  79. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  80. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
  81. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
  82. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  83. data/vendor/faiss/faiss/IndexShards.h +2 -1
  84. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  85. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  86. data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
  87. data/vendor/faiss/faiss/VectorTransform.h +22 -4
  88. data/vendor/faiss/faiss/clone_index.cpp +23 -1
  89. data/vendor/faiss/faiss/clone_index.h +3 -0
  90. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  91. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  92. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  93. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  94. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  95. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  101. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  102. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
  103. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  104. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  105. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  106. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  107. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  108. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  109. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  110. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  111. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  113. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  118. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  119. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  124. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  125. data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
  126. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  127. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  128. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
  131. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
  134. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  138. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  144. data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
  145. data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
  146. data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
  147. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  148. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  149. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  150. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  151. data/vendor/faiss/faiss/index_factory.cpp +196 -7
  152. data/vendor/faiss/faiss/index_io.h +5 -0
  153. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  154. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  155. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  156. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  157. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  158. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  159. data/vendor/faiss/faiss/utils/distances.cpp +380 -56
  160. data/vendor/faiss/faiss/utils/distances.h +113 -15
  161. data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
  162. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  163. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  164. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  165. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  166. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  167. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  168. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  169. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  170. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  171. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  172. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  173. data/vendor/faiss/faiss/utils/random.h +5 -0
  174. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  175. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  176. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  177. metadata +37 -3
@@ -0,0 +1,2058 @@
1
+ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2
+ #ifndef LEVEL2_AVX2_INL_H
3
+ #define LEVEL2_AVX2_INL_H
4
+
5
+ #include <immintrin.h>
6
+
7
+ #include <cstddef>
8
+ #include <cstdint>
9
+
10
+ #include <faiss/cppcontrib/detail/UintReader.h>
11
+
12
+ namespace faiss {
13
+ namespace cppcontrib {
14
+
15
+ ////////////////////////////////////////////////////////////////////////////////////
16
+ /// Index2LevelDecoder
17
+ ////////////////////////////////////////////////////////////////////////////////////
18
+
19
+ namespace {
20
+
21
// Processes 8 float values: adds four 2-float fine chunks to one 8-float
// coarse chunk.
// Returns {
//   [0..1] = coarse[0..1] + fine0[0..1];
//   [2..3] = coarse[2..3] + fine1[0..1];
//   [4..5] = coarse[4..5] + fine2[0..1];
//   [6..7] = coarse[6..7] + fine3[0..1];
// }
// Reads 8 floats from `coarse` and 2 floats from each of `fine0`..`fine3`.
// All loads are unaligned-safe.
inline __m256 elementaryBlock2x4b(
        const float* const __restrict coarse,
        const float* const __restrict fine0,
        const float* const __restrict fine1,
        const float* const __restrict fine2,
        const float* const __restrict fine3) {
    // Load fine: each chunk is fetched with a 64-bit SSE half-register load
    // (_mm_loadl_pi / _mm_loadh_pi). The float data is never dereferenced
    // through a `const double*`, which would violate strict aliasing and
    // assume 8-byte alignment for data that is only float-aligned.
    const __m128 zero = _mm_setzero_ps();

    // lanes [0..1] <- fine0, lanes [2..3] <- fine1
    const __m128 fineLow = _mm_loadh_pi(
            _mm_loadl_pi(zero, reinterpret_cast<const __m64*>(fine0)),
            reinterpret_cast<const __m64*>(fine1));
    // lanes [4..5] <- fine2, lanes [6..7] <- fine3
    const __m128 fineHigh = _mm_loadh_pi(
            _mm_loadl_pi(zero, reinterpret_cast<const __m64*>(fine2)),
            reinterpret_cast<const __m64*>(fine3));

    // combine two 4-float halves into a single 8-float register
    const __m256 fineValue = _mm256_set_m128(fineHigh, fineLow);

    // load coarse
    const __m256 coarseValue = _mm256_loadu_ps(coarse);

    // add coarse and fine
    return _mm256_add_ps(fineValue, coarseValue);
}
46
+
47
+ // Processes 8 float values.
48
+ // Returns {
49
+ // [0..1] = existingValue[0..1] + weight * (*coarse[0..1] + *fine0[0..1]);
50
+ // [2..3] = existingValue[0..1] + weight * (*coarse[2..3] + *fine1[0..1]);
51
+ // [4..5] = existingValue[0..1] + weight * (*coarse[4..5] + *fine2[0..1]);
52
+ // [6..7] = existingValue[0..1] + weight * (*coarse[6..7] + *fine3[0..1]);
53
+ // }
54
+ inline __m256 elementaryBlock2x4bAccum(
55
+ const float* const __restrict coarse,
56
+ const float* const __restrict fine0,
57
+ const float* const __restrict fine1,
58
+ const float* const __restrict fine2,
59
+ const float* const __restrict fine3,
60
+ const float weight,
61
+ const __m256 existingValue) {
62
+ // add coarse and fine
63
+ const __m256 combinedValue =
64
+ elementaryBlock2x4b(coarse, fine0, fine1, fine2, fine3);
65
+
66
+ // this operation is expected to be optimized by a compiler
67
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
68
+ // do fma
69
+ return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue);
70
+ }
71
+
72
// Processes 4 float values: adds one 4-float fine chunk to one 4-float
// coarse chunk.
// Returns {
//   [0..3] = coarse[0..3] + fine[0..3];
// }
inline __m128 elementaryBlock4x1b(
        const float* const __restrict coarse,
        const float* const __restrict fine) {
    // both operands are fetched with unaligned 128-bit loads and summed
    return _mm_add_ps(_mm_loadu_ps(fine), _mm_loadu_ps(coarse));
}
87
+
88
+ // Processes 4 float values.
89
+ // Returns {
90
+ // [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine[0..3]);
91
+ // }
92
+ inline __m128 elementaryBlock4x1bAccum(
93
+ const float* const __restrict coarse,
94
+ const float* const __restrict fine,
95
+ const float weight,
96
+ const __m128 existingValue) {
97
+ // add coarse and fine
98
+ const __m128 combinedValue = elementaryBlock4x1b(coarse, fine);
99
+
100
+ // this operation is expected to be optimized by a compiler
101
+ const __m128 weightAvx = _mm_set1_ps(weight);
102
+ // do fma
103
+ return _mm_fmadd_ps(combinedValue, weightAvx, existingValue);
104
+ }
105
+
106
// Processes 8 float values: adds two 4-float fine chunks to one 8-float
// coarse chunk.
// Returns {
//   [0..3] = coarse[0..3] + fine0[0..3];
//   [4..7] = coarse[4..7] + fine1[0..3];
// }
inline __m256 elementaryBlock4x2b(
        const float* const __restrict coarse,
        const float* const __restrict fine0,
        const float* const __restrict fine1) {
    // fine0 fills the low 128-bit half, fine1 the high 128-bit half
    const __m256 fineBoth =
            _mm256_set_m128(_mm_loadu_ps(fine1), _mm_loadu_ps(fine0));

    // add the unaligned-loaded coarse chunk
    return _mm256_add_ps(fineBoth, _mm256_loadu_ps(coarse));
}
126
+
127
+ // Processes 8 float values.
128
+ // Returns {
129
+ // [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine0[0..3]);
130
+ // [4..7] = existingValue[4..7] + weight * (*coarse[4..7] + *fine1[0..3]);
131
+ // }
132
+ inline __m256 elementaryBlock4x2bAccum(
133
+ const float* const __restrict coarse,
134
+ const float* const __restrict fine0,
135
+ const float* const __restrict fine1,
136
+ const float weight,
137
+ const __m256 existingValue) {
138
+ // add coarse and fine
139
+ const __m256 combinedValue = elementaryBlock4x2b(coarse, fine0, fine1);
140
+
141
+ // this operation is expected to be optimized by a compiler
142
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
143
+ // do fma
144
+ return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue);
145
+ }
146
+
147
// Processes 8 float values: adds one 8-float fine chunk to one 8-float
// coarse chunk.
// Returns {
//   [0..7] = coarse[0..7] + fine[0..7];
// }
inline __m256 elementaryBlock8x1b(
        const float* const __restrict coarse,
        const float* const __restrict fine) {
    // both operands are fetched with unaligned 256-bit loads and summed
    return _mm256_add_ps(_mm256_loadu_ps(fine), _mm256_loadu_ps(coarse));
}
162
+
163
+ // Processes 8 float values.
164
+ // Returns {
165
+ // [0..7] = existingValue[0..7] + weight * (*coarse[0..7] + *fine[0..7]);
166
+ // }
167
+ inline __m256 elementaryBlock8x1bAccum(
168
+ const float* const __restrict coarse,
169
+ const float* const __restrict fine,
170
+ const float weight,
171
+ const __m256 existingValue) {
172
+ // add coarse and fine
173
+ const __m256 combinedValue = elementaryBlock8x1b(coarse, fine);
174
+
175
+ // this operation is expected to be optimized by a compiler
176
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
177
+ // do fma
178
+ return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue);
179
+ }
180
+
181
+ // The following code uses template-based for-loop unrolling,
182
+ // because the compiler does not do that on its own as needed.
183
+ // The idea is the following:
184
+ // template<int I, int MAX>
185
+ // struct Foo {
186
+ // static void bar() {
187
+ // doSomething(I);
188
+ // Foo<I + 1, MAX>::bar();
189
+ // }
190
+ // };
191
+ //
192
+ // template<int MAX>
193
+ // struct Foo<MAX, MAX> {
194
+ // static void bar() {}
195
+ // };
196
+ //
197
+ // Initiate the loop:
198
+ // Foo<0, MAX>::bar();
199
+
200
+ template <
201
+ intptr_t DIM, // compared against CPOS below; presumably the vector dimensionality - TODO confirm
202
+ intptr_t COARSE_SIZE, // floats per coarse sub-vector (CPOS / COARSE_SIZE selects the coarse centroid)
203
+ intptr_t FINE_SIZE, // floats per fine sub-vector (CPOS / FINE_SIZE selects the fine centroid)
204
+ intptr_t COARSE_BITS, // bits per coarse code; specializations size the coarse table as 1 << COARSE_BITS
205
+ intptr_t FINE_BITS, // bits per fine code; specializations size the fine table as 1 << FINE_BITS
206
+ intptr_t CPOS, // index of the output float currently being processed
207
+ bool FINE_SIZE_EQ_2 = FINE_SIZE == 2, // derived flag used to pick a partial specialization
208
+ bool FINE_SIZE_EQ_4 = FINE_SIZE == 4, // derived flag used to pick a partial specialization
209
+ bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8), // at least 8 floats left in the current fine sub-vector
210
+ bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4), // at least 4 floats left in the current fine sub-vector
211
+ bool DIM_EQ_CPOS = DIM == CPOS> // true once CPOS reaches DIM; presumably selects the terminating specialization - TODO confirm
212
+ struct Index2LevelDecoderImpl; // primary template is only declared here; partial specializations supply the code
213
+
214
+ template <
215
+ intptr_t DIM,
216
+ intptr_t COARSE_SIZE,
217
+ intptr_t COARSE_BITS,
218
+ intptr_t FINE_BITS,
219
+ intptr_t CPOS,
220
+ bool QPOS_LEFT_GE_8,
221
+ bool QPOS_LEFT_GE_4>
222
+ struct Index2LevelDecoderImpl<
223
+ DIM,
224
+ COARSE_SIZE,
225
+ 2,
226
+ COARSE_BITS,
227
+ FINE_BITS,
228
+ CPOS,
229
+ true,
230
+ false,
231
+ QPOS_LEFT_GE_8,
232
+ QPOS_LEFT_GE_4,
233
+ false> {
234
+ static constexpr intptr_t FINE_SIZE = 2;
235
+
236
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
237
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
238
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
239
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
240
+
241
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
242
+
243
+ // coarse quantizer storage
244
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
245
+
246
+ // coarse quantizer bytes start from 0
247
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
248
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
249
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
250
+ N_COARSE_ELEMENTS * COARSE_BITS;
251
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
252
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
253
+
254
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
255
+
256
+ // process 1 sample
257
+ static void store(
258
+ const float* const __restrict pqCoarseCentroids0,
259
+ const float* const __restrict pqFineCentroids0,
260
+ const uint8_t* const __restrict code0,
261
+ float* const __restrict outputStore) {
262
+ // coarse quantizer
263
+ const uint8_t* const __restrict coarse0 = code0;
264
+
265
+ // fine quantizer
266
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
267
+
268
+ // clang-format off
269
+
270
+ // process chunks, 2 float
271
+ // but 8 floats per loop
272
+
273
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
274
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
275
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
276
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
277
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
278
+
279
+ const __m256 storeValue = elementaryBlock2x4b(
280
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
281
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
282
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
283
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
284
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset);
285
+
286
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
287
+
288
+ // next
289
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::store(
290
+ pqCoarseCentroids0, pqFineCentroids0, code0,
291
+ outputStore);
292
+
293
+ // clang-format on
294
+ }
295
+
296
+ // process 1 sample
297
+ static void accum(
298
+ const float* const __restrict pqCoarseCentroids0,
299
+ const float* const __restrict pqFineCentroids0,
300
+ const uint8_t* const __restrict code0,
301
+ const float weight0,
302
+ float* const __restrict outputAccum) {
303
+ // coarse quantizer
304
+ const uint8_t* const __restrict coarse0 = code0;
305
+
306
+ // fine quantizer
307
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
308
+
309
+ // clang-format off
310
+
311
+ // process chunks, 2 float
312
+ // but 8 floats per loop
313
+
314
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
315
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
316
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
317
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
318
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
319
+
320
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
321
+
322
+ existingValue = elementaryBlock2x4bAccum(
323
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
324
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
325
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
326
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
327
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, weight0,
328
+ existingValue);
329
+
330
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
331
+
332
+ // next
333
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
334
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
335
+ outputAccum);
336
+
337
+ // clang-format on
338
+ }
339
+
340
+ // Process 2 samples.
341
+ // Each code uses its own coarse pq centroids table and fine pq centroids
342
+ // table.
343
+ static void accum(
344
+ const float* const __restrict pqCoarseCentroids0,
345
+ const float* const __restrict pqFineCentroids0,
346
+ const uint8_t* const __restrict code0,
347
+ const float weight0,
348
+ const float* const __restrict pqCoarseCentroids1,
349
+ const float* const __restrict pqFineCentroids1,
350
+ const uint8_t* const __restrict code1,
351
+ const float weight1,
352
+ float* const __restrict outputAccum) {
353
+ // coarse quantizer
354
+ const uint8_t* const __restrict coarse0 = code0;
355
+ const uint8_t* const __restrict coarse1 = code1;
356
+
357
+ // fine quantizer
358
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
359
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
360
+
361
+ // clang-format off
362
+
363
+ // process chunks, 4 float
364
+ // but 8 floats per loop
365
+
366
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
367
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
368
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
369
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
370
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
371
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
372
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
373
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
374
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
375
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
376
+
377
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
378
+
379
+ existingValue = elementaryBlock2x4bAccum(
380
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
381
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
382
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
383
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
384
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
385
+ weight0,
386
+ existingValue);
387
+
388
+ existingValue = elementaryBlock2x4bAccum(
389
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
390
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
391
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
392
+ pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
393
+ pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
394
+ weight1,
395
+ existingValue);
396
+
397
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
398
+
399
+ // next
400
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
401
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
402
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
403
+ outputAccum);
404
+
405
+ // clang-format on
406
+ }
407
+
408
+ // Process 2 samples.
409
+ // Coarse pq centroids table and fine pq centroids table are shared among
410
+ // codes.
411
+ static void accum(
412
+ const float* const __restrict pqCoarseCentroids,
413
+ const float* const __restrict pqFineCentroids,
414
+ const uint8_t* const __restrict code0,
415
+ const float weight0,
416
+ const uint8_t* const __restrict code1,
417
+ const float weight1,
418
+ float* const __restrict outputAccum) {
419
+ // coarse quantizer
420
+ const uint8_t* const __restrict coarse0 = code0;
421
+ const uint8_t* const __restrict coarse1 = code1;
422
+
423
+ // fine quantizer
424
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
425
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
426
+
427
+ // clang-format off
428
+
429
+ // process chunks, 4 float
430
+ // but 8 floats per loop
431
+
432
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
433
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
434
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
435
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
436
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
437
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
438
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
439
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
440
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
441
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
442
+
443
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
444
+
445
+ existingValue = elementaryBlock2x4bAccum(
446
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
447
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
448
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
449
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
450
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
451
+ weight0,
452
+ existingValue);
453
+
454
+ existingValue = elementaryBlock2x4bAccum(
455
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
456
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
457
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
458
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
459
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
460
+ weight1,
461
+ existingValue);
462
+
463
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
464
+
465
+ // next
466
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
467
+ pqCoarseCentroids, pqFineCentroids,
468
+ code0, weight0,
469
+ code1, weight1,
470
+ outputAccum);
471
+
472
+ // clang-format on
473
+ }
474
+
475
+ // Process 3 samples: updates the 8 floats at outputAccum + CPOS using code0..code2 with weight0..weight2.
475
+ // Each code uses its own coarse pq centroids table and fine pq centroids
476
+ // table.
477
+ static void accum(
478
+ const float* const __restrict pqCoarseCentroids0,
479
+ const float* const __restrict pqFineCentroids0,
480
+ const uint8_t* const __restrict code0,
481
+ const float weight0,
482
+ const float* const __restrict pqCoarseCentroids1,
483
+ const float* const __restrict pqFineCentroids1,
484
+ const uint8_t* const __restrict code1,
485
+ const float weight1,
486
+ const float* const __restrict pqCoarseCentroids2,
487
+ const float* const __restrict pqFineCentroids2,
488
+ const uint8_t* const __restrict code2,
489
+ const float weight2,
490
+ float* const __restrict outputAccum) {
491
+ // coarse quantizer codes are read from the start of each code buffer
492
+ const uint8_t* const __restrict coarse0 = code0;
493
+ const uint8_t* const __restrict coarse1 = code1;
494
+ const uint8_t* const __restrict coarse2 = code2;
495
+
496
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into each code buffer
497
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
498
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
499
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
500
+
501
+ // clang-format off
502
+
503
+ // process chunks, 2 float
504
+ // but 8 floats per loop, hence 4 consecutive fine codes (a..d) per sample
505
+
506
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
507
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
508
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
509
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
510
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
511
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
512
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
513
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
514
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
515
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
516
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
517
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
518
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
519
+ const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
520
+ const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
521
+
522
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
523
+
524
+ existingValue = elementaryBlock2x4bAccum(
525
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
526
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
527
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
528
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
529
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
530
+ weight0,
531
+ existingValue);
532
+
533
+ existingValue = elementaryBlock2x4bAccum(
534
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
535
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
536
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
537
+ pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
538
+ pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
539
+ weight1,
540
+ existingValue);
541
+
542
+ existingValue = elementaryBlock2x4bAccum(
543
+ pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
544
+ pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
545
+ pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
546
+ pqFineCentroids2 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
547
+ pqFineCentroids2 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
548
+ weight2,
549
+ existingValue);
550
+
551
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
552
+
553
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
554
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
555
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
556
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
557
+ pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
558
+ outputAccum);
559
+
560
+ // clang-format on
561
+ }
563
+
564
+ // Process 3 samples: updates the 8 floats at outputAccum + CPOS using code0..code2 with weight0..weight2.
564
+ // Coarse pq centroids table and fine pq centroids table are shared among
565
+ // codes.
566
+ static void accum(
567
+ const float* const __restrict pqCoarseCentroids,
568
+ const float* const __restrict pqFineCentroids,
569
+ const uint8_t* const __restrict code0,
570
+ const float weight0,
571
+ const uint8_t* const __restrict code1,
572
+ const float weight1,
573
+ const uint8_t* const __restrict code2,
574
+ const float weight2,
575
+ float* const __restrict outputAccum) {
576
+ // coarse quantizer codes are read from the start of each code buffer
577
+ const uint8_t* const __restrict coarse0 = code0;
578
+ const uint8_t* const __restrict coarse1 = code1;
579
+ const uint8_t* const __restrict coarse2 = code2;
580
+
581
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into each code buffer
582
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
583
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
584
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
585
+
586
+ // clang-format off
587
+
588
+ // process chunks, 2 float
589
+ // but 8 floats per loop, hence 4 consecutive fine codes (a..d) per sample
590
+
591
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
592
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
593
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
594
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
595
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
596
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
597
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
598
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
599
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
600
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
601
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
602
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
603
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
604
+ const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
605
+ const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
606
+
607
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
608
+
609
+ existingValue = elementaryBlock2x4bAccum(
610
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
611
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
612
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
613
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
614
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
615
+ weight0,
616
+ existingValue);
617
+
618
+ existingValue = elementaryBlock2x4bAccum(
619
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
620
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
621
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
622
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
623
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
624
+ weight1,
625
+ existingValue);
626
+
627
+ existingValue = elementaryBlock2x4bAccum(
628
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
629
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
630
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
631
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
632
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
633
+ weight2,
634
+ existingValue);
635
+
636
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
637
+
638
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
639
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
640
+ pqCoarseCentroids, pqFineCentroids,
641
+ code0, weight0,
642
+ code1, weight1,
643
+ code2, weight2,
644
+ outputAccum);
645
+
646
+ // clang-format on
647
+ }
649
+ };
650
+
651
+ template <
651
+ intptr_t DIM,
652
+ intptr_t COARSE_SIZE,
653
+ intptr_t COARSE_BITS,
654
+ intptr_t FINE_BITS,
655
+ intptr_t CPOS,
656
+ bool QPOS_LEFT_GE_8,
657
+ bool QPOS_LEFT_GE_4>
658
+ struct Index2LevelDecoderImpl<
659
+ DIM,
660
+ COARSE_SIZE,
661
+ 4,
662
+ COARSE_BITS,
663
+ FINE_BITS,
664
+ CPOS,
665
+ false,
666
+ true,
667
+ QPOS_LEFT_GE_8,
668
+ QPOS_LEFT_GE_4,
669
+ false> {
670
+ static constexpr intptr_t FINE_SIZE = 4;
671
+
672
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
673
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
674
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
675
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
676
+
677
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
678
+
679
+ // coarse quantizer storage: entries per coarse table (used below as an entry count, not a byte count)
680
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
681
+
682
+ // coarse quantizer bytes start from 0
683
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
684
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
685
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
686
+ N_COARSE_ELEMENTS * COARSE_BITS;
687
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
688
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
689
+
690
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
691
+
692
+ // process 1 sample: writes the 8 floats produced by elementaryBlock4x2b for code0 to outputStore + CPOS
693
+ static void store(
694
+ const float* const __restrict pqCoarseCentroids0,
695
+ const float* const __restrict pqFineCentroids0,
696
+ const uint8_t* const __restrict code0,
697
+ float* const __restrict outputStore) {
698
+ // coarse quantizer codes are read from the start of the code buffer
699
+ const uint8_t* const __restrict coarse0 = code0;
700
+
701
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into the code buffer
702
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
703
+
704
+ // clang-format off
705
+
706
+ // process chunks, 4 float
707
+ // but 8 floats per loop, hence 2 consecutive fine codes (a, b) per sample
708
+
709
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
710
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
711
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
712
+
713
+ const __m256 storeValue = elementaryBlock4x2b(
714
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
715
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
716
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset);
717
+
718
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
719
+
720
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
721
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::store(
722
+ pqCoarseCentroids0, pqFineCentroids0, code0,
723
+ outputStore);
724
+
725
+ // clang-format on
726
+ }
727
+
728
+ // process 1 sample: updates the 8 floats at outputAccum + CPOS using code0 with weight0
729
+ static void accum(
730
+ const float* const __restrict pqCoarseCentroids0,
731
+ const float* const __restrict pqFineCentroids0,
732
+ const uint8_t* const __restrict code0,
733
+ const float weight0,
734
+ float* const __restrict outputAccum) {
735
+ // coarse quantizer codes are read from the start of the code buffer
736
+ const uint8_t* const __restrict coarse0 = code0;
737
+
738
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into the code buffer
739
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
740
+
741
+ // clang-format off
742
+
743
+ // process chunks, 4 float
744
+ // but 8 floats per loop, hence 2 consecutive fine codes (a, b) per sample
745
+
746
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
747
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
748
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
749
+
750
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
751
+
752
+ existingValue = elementaryBlock4x2bAccum(
753
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
754
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
755
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
756
+ weight0,
757
+ existingValue);
758
+
759
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
760
+
761
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
762
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
763
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
764
+ outputAccum);
765
+
766
+ // clang-format on
767
+ }
768
+
769
+ // Process 2 samples: updates the 8 floats at outputAccum + CPOS using code0, code1 with weight0, weight1.
770
+ // Each code uses its own coarse pq centroids table and fine pq centroids
771
+ // table.
772
+ static void accum(
773
+ const float* const __restrict pqCoarseCentroids0,
774
+ const float* const __restrict pqFineCentroids0,
775
+ const uint8_t* const __restrict code0,
776
+ const float weight0,
777
+ const float* const __restrict pqCoarseCentroids1,
778
+ const float* const __restrict pqFineCentroids1,
779
+ const uint8_t* const __restrict code1,
780
+ const float weight1,
781
+ float* const __restrict outputAccum) {
782
+ // coarse quantizer codes are read from the start of each code buffer
783
+ const uint8_t* const __restrict coarse0 = code0;
784
+ const uint8_t* const __restrict coarse1 = code1;
785
+
786
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into each code buffer
787
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
788
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
789
+
790
+ // clang-format off
791
+
792
+ // process chunks, 4 float
793
+ // but 8 floats per loop, hence 2 consecutive fine codes (a, b) per sample
794
+
795
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
796
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
797
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
798
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
799
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
800
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
801
+
802
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
803
+
804
+ existingValue = elementaryBlock4x2bAccum(
805
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
806
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
807
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
808
+ weight0,
809
+ existingValue);
810
+
811
+ existingValue = elementaryBlock4x2bAccum(
812
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
813
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
814
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
815
+ weight1,
816
+ existingValue);
817
+
818
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
819
+
820
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
821
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
822
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
823
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
824
+ outputAccum);
825
+
826
+ // clang-format on
827
+ }
828
+
829
+ // Process 2 samples: updates the 8 floats at outputAccum + CPOS using code0, code1 with weight0, weight1.
830
+ // Coarse pq centroids table and fine pq centroids table are shared among
831
+ // codes.
832
+ static void accum(
833
+ const float* const __restrict pqCoarseCentroids,
834
+ const float* const __restrict pqFineCentroids,
835
+ const uint8_t* const __restrict code0,
836
+ const float weight0,
837
+ const uint8_t* const __restrict code1,
838
+ const float weight1,
839
+ float* const __restrict outputAccum) {
840
+ // coarse quantizer codes are read from the start of each code buffer
841
+ const uint8_t* const __restrict coarse0 = code0;
842
+ const uint8_t* const __restrict coarse1 = code1;
843
+
844
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into each code buffer
845
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
846
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
847
+
848
+ // clang-format off
849
+
850
+ // process chunks, 4 float
851
+ // but 8 floats per loop, hence 2 consecutive fine codes (a, b) per sample
852
+
853
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
854
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
855
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
856
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
857
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
858
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
859
+
860
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
861
+
862
+ existingValue = elementaryBlock4x2bAccum(
863
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
864
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
865
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
866
+ weight0,
867
+ existingValue);
868
+
869
+ existingValue = elementaryBlock4x2bAccum(
870
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
871
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
872
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
873
+ weight1,
874
+ existingValue);
875
+
876
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
877
+
878
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
879
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
880
+ pqCoarseCentroids, pqFineCentroids,
881
+ code0, weight0,
882
+ code1, weight1,
883
+ outputAccum);
884
+
885
+ // clang-format on
886
+ }
887
+
888
+ // Process 3 samples: updates the 8 floats at outputAccum + CPOS using code0..code2 with weight0..weight2.
889
+ // Each code uses its own coarse pq centroids table and fine pq centroids
890
+ // table.
891
+ static void accum(
892
+ const float* const __restrict pqCoarseCentroids0,
893
+ const float* const __restrict pqFineCentroids0,
894
+ const uint8_t* const __restrict code0,
895
+ const float weight0,
896
+ const float* const __restrict pqCoarseCentroids1,
897
+ const float* const __restrict pqFineCentroids1,
898
+ const uint8_t* const __restrict code1,
899
+ const float weight1,
900
+ const float* const __restrict pqCoarseCentroids2,
901
+ const float* const __restrict pqFineCentroids2,
902
+ const uint8_t* const __restrict code2,
903
+ const float weight2,
904
+ float* const __restrict outputAccum) {
905
+ // coarse quantizer codes are read from the start of each code buffer
906
+ const uint8_t* const __restrict coarse0 = code0;
907
+ const uint8_t* const __restrict coarse1 = code1;
908
+ const uint8_t* const __restrict coarse2 = code2;
909
+
910
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into each code buffer
911
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
912
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
913
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
914
+
915
+ // clang-format off
916
+
917
+ // process chunks, 4 float
918
+ // but 8 floats per loop, hence 2 consecutive fine codes (a, b) per sample
919
+
920
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
921
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
922
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
923
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
924
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
925
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
926
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
927
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
928
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
929
+
930
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
931
+
932
+ existingValue = elementaryBlock4x2bAccum(
933
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
934
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
935
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
936
+ weight0,
937
+ existingValue);
938
+
939
+ existingValue = elementaryBlock4x2bAccum(
940
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
941
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
942
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
943
+ weight1,
944
+ existingValue);
945
+
946
+ existingValue = elementaryBlock4x2bAccum(
947
+ pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
948
+ pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
949
+ pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
950
+ weight2,
951
+ existingValue);
952
+
953
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
954
+
955
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
956
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
957
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
958
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
959
+ pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
960
+ outputAccum);
961
+
962
+ // clang-format on
963
+ }
964
+
965
+ // Process 3 samples: updates the 8 floats at outputAccum + CPOS using code0..code2 with weight0..weight2.
966
+ // Coarse pq centroids table and fine pq centroids table are shared among
967
+ // codes.
968
+ static void accum(
969
+ const float* const __restrict pqCoarseCentroids,
970
+ const float* const __restrict pqFineCentroids,
971
+ const uint8_t* const __restrict code0,
972
+ const float weight0,
973
+ const uint8_t* const __restrict code1,
974
+ const float weight1,
975
+ const uint8_t* const __restrict code2,
976
+ const float weight2,
977
+ float* const __restrict outputAccum) {
978
+ // coarse quantizer codes are read from the start of each code buffer
979
+ const uint8_t* const __restrict coarse0 = code0;
980
+ const uint8_t* const __restrict coarse1 = code1;
981
+ const uint8_t* const __restrict coarse2 = code2;
982
+
983
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into each code buffer
984
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
985
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
986
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
987
+
988
+ // clang-format off
989
+
990
+ // process chunks, 4 float
991
+ // but 8 floats per loop, hence 2 consecutive fine codes (a, b) per sample
992
+
993
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
994
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
995
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
996
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
997
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
998
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
999
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1000
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
1001
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
1002
+
1003
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1004
+
1005
+ existingValue = elementaryBlock4x2bAccum(
1006
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1007
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
1008
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
1009
+ weight0,
1010
+ existingValue);
1011
+
1012
+ existingValue = elementaryBlock4x2bAccum(
1013
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1014
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
1015
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
1016
+ weight1,
1017
+ existingValue);
1018
+
1019
+ existingValue = elementaryBlock4x2bAccum(
1020
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1021
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
1022
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
1023
+ weight2,
1024
+ existingValue);
1025
+
1026
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1027
+
1028
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
1029
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1030
+ pqCoarseCentroids, pqFineCentroids,
1031
+ code0, weight0,
1032
+ code1, weight1,
1033
+ code2, weight2,
1034
+ outputAccum);
1035
+
1036
+ // clang-format on
1037
+ }
1038
+ };
1040
+
1041
+ template <
1042
+ intptr_t DIM,
1043
+ intptr_t COARSE_SIZE,
1044
+ intptr_t FINE_SIZE,
1045
+ intptr_t COARSE_BITS,
1046
+ intptr_t FINE_BITS,
1047
+ intptr_t CPOS>
1048
+ struct Index2LevelDecoderImpl<
1049
+ DIM,
1050
+ COARSE_SIZE,
1051
+ FINE_SIZE,
1052
+ COARSE_BITS,
1053
+ FINE_BITS,
1054
+ CPOS,
1055
+ false,
1056
+ false,
1057
+ true,
1058
+ true,
1059
+ false> {
1060
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
1061
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
1062
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
1063
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
1064
+
1065
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
1066
+
1067
+ // coarse quantizer storage
1068
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
1069
+
1070
+ // coarse quantizer bytes start from 0
1071
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
1072
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
1073
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
1074
+ N_COARSE_ELEMENTS * COARSE_BITS;
1075
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
1076
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
1077
+
1078
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
1079
+
1080
+ // process 1 sample: writes the 8 floats produced by elementaryBlock8x1b for code0 to outputStore + CPOS
1080
+ static void store(
1081
+ const float* const __restrict pqCoarseCentroids0,
1082
+ const float* const __restrict pqFineCentroids0,
1083
+ const uint8_t* const __restrict code0,
1084
+ float* const __restrict outputStore) {
1085
+ // coarse quantizer codes are read from the start of the code buffer
1086
+ const uint8_t* const __restrict coarse0 = code0;
1087
+
1088
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into the code buffer
1089
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1090
+
1091
+ // clang-format off
1092
+
1093
+ // process chunks, 8 float: a single coarse code and a single fine code are read per step
1094
+
1095
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1096
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1097
+
1098
+ const __m256 storeValue = elementaryBlock8x1b(
1099
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1100
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
1101
+
1102
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
1103
+
1104
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
1105
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::store(
1106
+ pqCoarseCentroids0, pqFineCentroids0, code0,
1107
+ outputStore);
1108
+
1109
+ // clang-format on
1110
+ }
1112
+
1113
+ // process 1 sample: updates the 8 floats at outputAccum + CPOS using code0 with weight0
1113
+ static void accum(
1114
+ const float* const __restrict pqCoarseCentroids0,
1115
+ const float* const __restrict pqFineCentroids0,
1116
+ const uint8_t* const __restrict code0,
1117
+ const float weight0,
1118
+ float* const __restrict outputAccum) {
1119
+ // coarse quantizer codes are read from the start of the code buffer
1120
+ const uint8_t* const __restrict coarse0 = code0;
1121
+
1122
+ // fine quantizer codes are read N_COARSE_ELEMENTS_BYTES bytes into the code buffer
1123
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1124
+
1125
+ // clang-format off
1126
+
1127
+ // process chunks, 8 float: a single coarse code and a single fine code are read per step
1128
+
1129
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1130
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1131
+
1132
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1133
+
1134
+ existingValue = elementaryBlock8x1bAccum(
1135
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1136
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1137
+ weight0,
1138
+ existingValue);
1139
+
1140
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1141
+
1142
+ // next: hand over to the CPOS + 8 instantiation, which covers the following 8 floats
1143
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1144
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1145
+ outputAccum);
1146
+
1147
+ // clang-format on
1148
+ }
1150
+
1151
+ // Process 2 samples.
1152
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1153
+ // table.
1154
+ static void accum(
1155
+ const float* const __restrict pqCoarseCentroids0,
1156
+ const float* const __restrict pqFineCentroids0,
1157
+ const uint8_t* const __restrict code0,
1158
+ const float weight0,
1159
+ const float* const __restrict pqCoarseCentroids1,
1160
+ const float* const __restrict pqFineCentroids1,
1161
+ const uint8_t* const __restrict code1,
1162
+ const float weight1,
1163
+ float* const __restrict outputAccum) {
1164
+ // coarse quantizer
1165
+ const uint8_t* const __restrict coarse0 = code0;
1166
+ const uint8_t* const __restrict coarse1 = code1;
1167
+
1168
+ // fine quantizer
1169
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1170
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1171
+
1172
+ // clang-format off
1173
+
1174
+ // process chunks, 8 float
1175
+
1176
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1177
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1178
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1179
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1180
+
1181
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1182
+
1183
+ existingValue = elementaryBlock8x1bAccum(
1184
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1185
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1186
+ weight0,
1187
+ existingValue);
1188
+
1189
+ existingValue = elementaryBlock8x1bAccum(
1190
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1191
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1192
+ weight1,
1193
+ existingValue);
1194
+
1195
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1196
+
1197
+ // next
1198
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1199
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1200
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
1201
+ outputAccum);
1202
+
1203
+ // clang-format on
1204
+ }
1205
+
1206
+ // Process 2 samples.
1207
+ // Coarse pq centroids table and fine pq centroids table are shared among
1208
+ // codes.
1209
+ static void accum(
1210
+ const float* const __restrict pqCoarseCentroids,
1211
+ const float* const __restrict pqFineCentroids,
1212
+ const uint8_t* const __restrict code0,
1213
+ const float weight0,
1214
+ const uint8_t* const __restrict code1,
1215
+ const float weight1,
1216
+ float* const __restrict outputAccum) {
1217
+ // coarse quantizer
1218
+ const uint8_t* const __restrict coarse0 = code0;
1219
+ const uint8_t* const __restrict coarse1 = code1;
1220
+
1221
+ // fine quantizer
1222
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1223
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1224
+
1225
+ // clang-format off
1226
+
1227
+ // process chunks, 8 float
1228
+
1229
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1230
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1231
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1232
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1233
+
1234
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1235
+
1236
+ existingValue = elementaryBlock8x1bAccum(
1237
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1238
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1239
+ weight0,
1240
+ existingValue);
1241
+
1242
+ existingValue = elementaryBlock8x1bAccum(
1243
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1244
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1245
+ weight1,
1246
+ existingValue);
1247
+
1248
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1249
+
1250
+ // next
1251
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1252
+ pqCoarseCentroids, pqFineCentroids,
1253
+ code0, weight0,
1254
+ code1, weight1,
1255
+ outputAccum);
1256
+
1257
+ // clang-format on
1258
+ }
1259
+
1260
+ // Process 3 samples.
1261
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1262
+ // table.
1263
+ static void accum(
1264
+ const float* const __restrict pqCoarseCentroids0,
1265
+ const float* const __restrict pqFineCentroids0,
1266
+ const uint8_t* const __restrict code0,
1267
+ const float weight0,
1268
+ const float* const __restrict pqCoarseCentroids1,
1269
+ const float* const __restrict pqFineCentroids1,
1270
+ const uint8_t* const __restrict code1,
1271
+ const float weight1,
1272
+ const float* const __restrict pqCoarseCentroids2,
1273
+ const float* const __restrict pqFineCentroids2,
1274
+ const uint8_t* const __restrict code2,
1275
+ const float weight2,
1276
+ float* const __restrict outputAccum) {
1277
+ // coarse quantizer
1278
+ const uint8_t* const __restrict coarse0 = code0;
1279
+ const uint8_t* const __restrict coarse1 = code1;
1280
+ const uint8_t* const __restrict coarse2 = code2;
1281
+
1282
+ // fine quantizer
1283
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1284
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1285
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1286
+
1287
+ // clang-format off
1288
+
1289
+ // process chunks, 8 float
1290
+
1291
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1292
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1293
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1294
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1295
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1296
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1297
+
1298
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1299
+
1300
+ existingValue = elementaryBlock8x1bAccum(
1301
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1302
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1303
+ weight0,
1304
+ existingValue);
1305
+
1306
+ existingValue = elementaryBlock8x1bAccum(
1307
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1308
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1309
+ weight1,
1310
+ existingValue);
1311
+
1312
+ existingValue = elementaryBlock8x1bAccum(
1313
+ pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1314
+ pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1315
+ weight2,
1316
+ existingValue);
1317
+
1318
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1319
+
1320
+ // next
1321
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1322
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1323
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
1324
+ pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
1325
+ outputAccum);
1326
+
1327
+ // clang-format on
1328
+ }
1329
+
1330
+ // Process 3 samples.
1331
+ // Coarse pq centroids table and fine pq centroids table are shared among
1332
+ // codes.
1333
+ static void accum(
1334
+ const float* const __restrict pqCoarseCentroids,
1335
+ const float* const __restrict pqFineCentroids,
1336
+ const uint8_t* const __restrict code0,
1337
+ const float weight0,
1338
+ const uint8_t* const __restrict code1,
1339
+ const float weight1,
1340
+ const uint8_t* const __restrict code2,
1341
+ const float weight2,
1342
+ float* const __restrict outputAccum) {
1343
+ // coarse quantizer
1344
+ const uint8_t* const __restrict coarse0 = code0;
1345
+ const uint8_t* const __restrict coarse1 = code1;
1346
+ const uint8_t* const __restrict coarse2 = code2;
1347
+
1348
+ // fine quantizer
1349
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1350
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1351
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1352
+
1353
+ // clang-format off
1354
+
1355
+ // process chunks, 8 float
1356
+
1357
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1358
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1359
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1360
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1361
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1362
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1363
+
1364
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1365
+
1366
+ existingValue = elementaryBlock8x1bAccum(
1367
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1368
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1369
+ weight0,
1370
+ existingValue);
1371
+
1372
+ existingValue = elementaryBlock8x1bAccum(
1373
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1374
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1375
+ weight1,
1376
+ existingValue);
1377
+
1378
+ existingValue = elementaryBlock8x1bAccum(
1379
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1380
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1381
+ weight2,
1382
+ existingValue);
1383
+
1384
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1385
+
1386
+ // next
1387
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1388
+ pqCoarseCentroids, pqFineCentroids,
1389
+ code0, weight0,
1390
+ code1, weight1,
1391
+ code2, weight2,
1392
+ outputAccum);
1393
+
1394
+ // clang-format on
1395
+ }
1396
+ };
1397
+
1398
+ template <
1399
+ intptr_t DIM,
1400
+ intptr_t COARSE_SIZE,
1401
+ intptr_t FINE_SIZE,
1402
+ intptr_t COARSE_BITS,
1403
+ intptr_t FINE_BITS,
1404
+ intptr_t CPOS>
1405
+ struct Index2LevelDecoderImpl<
1406
+ DIM,
1407
+ COARSE_SIZE,
1408
+ FINE_SIZE,
1409
+ COARSE_BITS,
1410
+ FINE_BITS,
1411
+ CPOS,
1412
+ false,
1413
+ false,
1414
+ false,
1415
+ true,
1416
+ false> {
1417
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
1418
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
1419
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
1420
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
1421
+
1422
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
1423
+
1424
+ // coarse quantizer storage
1425
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
1426
+
1427
+ // coarse quantizer bytes start from 0
1428
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
1429
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
1430
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
1431
+ N_COARSE_ELEMENTS * COARSE_BITS;
1432
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
1433
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
1434
+
1435
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
1436
+
1437
+ // process 1 sample
1438
+ static void store(
1439
+ const float* const __restrict pqCoarseCentroids0,
1440
+ const float* const __restrict pqFineCentroids0,
1441
+ const uint8_t* const __restrict code0,
1442
+ float* const __restrict outputStore) {
1443
+ // coarse quantizer
1444
+ const uint8_t* const __restrict coarse0 = code0;
1445
+
1446
+ // fine quantizer
1447
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1448
+
1449
+ // clang-format off
1450
+
1451
+ // process chunks, 4 float
1452
+
1453
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1454
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1455
+
1456
+ const __m128 storeValue = elementaryBlock4x1b(
1457
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1458
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
1459
+
1460
+ _mm_storeu_ps(outputStore + CPOS, storeValue);
1461
+
1462
+ // next
1463
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::store(
1464
+ pqCoarseCentroids0, pqFineCentroids0, code0,
1465
+ outputStore);
1466
+
1467
+ // clang-format on
1468
+ }
1469
+
1470
+ // process 1 sample
1471
+ static void accum(
1472
+ const float* const __restrict pqCoarseCentroids0,
1473
+ const float* const __restrict pqFineCentroids0,
1474
+ const uint8_t* const __restrict code0,
1475
+ const float weight0,
1476
+ float* const __restrict outputAccum) {
1477
+ // coarse quantizer
1478
+ const uint8_t* const __restrict coarse0 = code0;
1479
+
1480
+ // fine quantizer
1481
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1482
+
1483
+ // clang-format off
1484
+
1485
+ // process chunks, 4 float
1486
+
1487
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1488
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS,fineCentroidIdx>::get(fine0);
1489
+
1490
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1491
+
1492
+ existingValue = elementaryBlock4x1bAccum(
1493
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1494
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1495
+ weight0,
1496
+ existingValue);
1497
+
1498
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1499
+
1500
+ // next
1501
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1502
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1503
+ outputAccum);
1504
+
1505
+ // clang-format on
1506
+ }
1507
+
1508
+ // Process 2 samples.
1509
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1510
+ // table.
1511
+ static void accum(
1512
+ const float* const __restrict pqCoarseCentroids0,
1513
+ const float* const __restrict pqFineCentroids0,
1514
+ const uint8_t* const __restrict code0,
1515
+ const float weight0,
1516
+ const float* const __restrict pqCoarseCentroids1,
1517
+ const float* const __restrict pqFineCentroids1,
1518
+ const uint8_t* const __restrict code1,
1519
+ const float weight1,
1520
+ float* const __restrict outputAccum) {
1521
+ // coarse quantizer
1522
+ const uint8_t* const __restrict coarse0 = code0;
1523
+ const uint8_t* const __restrict coarse1 = code1;
1524
+
1525
+ // fine quantizer
1526
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1527
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1528
+
1529
+ // clang-format off
1530
+
1531
+ // process chunks, 4 float
1532
+
1533
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1534
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1535
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1536
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1537
+
1538
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1539
+
1540
+ existingValue = elementaryBlock4x1bAccum(
1541
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1542
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1543
+ weight0,
1544
+ existingValue);
1545
+
1546
+ existingValue = elementaryBlock4x1bAccum(
1547
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1548
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1549
+ weight1,
1550
+ existingValue);
1551
+
1552
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1553
+
1554
+ // next
1555
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1556
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1557
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
1558
+ outputAccum);
1559
+
1560
+ // clang-format on
1561
+ }
1562
+
1563
+ // Process 2 samples.
1564
+ // Coarse pq centroids table and fine pq centroids table are shared among
1565
+ // codes.
1566
+ static void accum(
1567
+ const float* const __restrict pqCoarseCentroids,
1568
+ const float* const __restrict pqFineCentroids,
1569
+ const uint8_t* const __restrict code0,
1570
+ const float weight0,
1571
+ const uint8_t* const __restrict code1,
1572
+ const float weight1,
1573
+ float* const __restrict outputAccum) {
1574
+ // coarse quantizer
1575
+ const uint8_t* const __restrict coarse0 = code0;
1576
+ const uint8_t* const __restrict coarse1 = code1;
1577
+
1578
+ // fine quantizer
1579
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1580
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1581
+
1582
+ // clang-format off
1583
+
1584
+ // process chunks, 4 float
1585
+
1586
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1587
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1588
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1589
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1590
+
1591
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1592
+
1593
+ existingValue = elementaryBlock4x1bAccum(
1594
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1595
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1596
+ weight0,
1597
+ existingValue);
1598
+
1599
+ existingValue = elementaryBlock4x1bAccum(
1600
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1601
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1602
+ weight1,
1603
+ existingValue);
1604
+
1605
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1606
+
1607
+ // next
1608
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1609
+ pqCoarseCentroids, pqFineCentroids,
1610
+ code0, weight0,
1611
+ code1, weight1,
1612
+ outputAccum);
1613
+
1614
+ // clang-format on
1615
+ }
1616
+
1617
+ // Process 3 samples.
1618
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1619
+ // table.
1620
+ static void accum(
1621
+ const float* const __restrict pqCoarseCentroids0,
1622
+ const float* const __restrict pqFineCentroids0,
1623
+ const uint8_t* const __restrict code0,
1624
+ const float weight0,
1625
+ const float* const __restrict pqCoarseCentroids1,
1626
+ const float* const __restrict pqFineCentroids1,
1627
+ const uint8_t* const __restrict code1,
1628
+ const float weight1,
1629
+ const float* const __restrict pqCoarseCentroids2,
1630
+ const float* const __restrict pqFineCentroids2,
1631
+ const uint8_t* const __restrict code2,
1632
+ const float weight2,
1633
+ float* const __restrict outputAccum) {
1634
+ // coarse quantizer
1635
+ const uint8_t* const __restrict coarse0 = code0;
1636
+ const uint8_t* const __restrict coarse1 = code1;
1637
+ const uint8_t* const __restrict coarse2 = code2;
1638
+
1639
+ // fine quantizer
1640
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1641
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1642
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1643
+
1644
+ // clang-format off
1645
+
1646
+ // process chunks, 4 float
1647
+
1648
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1649
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1650
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1651
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1652
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1653
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1654
+
1655
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1656
+
1657
+ existingValue = elementaryBlock4x1bAccum(
1658
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1659
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1660
+ weight0,
1661
+ existingValue);
1662
+
1663
+ existingValue = elementaryBlock4x1bAccum(
1664
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1665
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1666
+ weight1,
1667
+ existingValue);
1668
+
1669
+ existingValue = elementaryBlock4x1bAccum(
1670
+ pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1671
+ pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1672
+ weight2,
1673
+ existingValue);
1674
+
1675
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1676
+
1677
+ // next
1678
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1679
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1680
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
1681
+ pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
1682
+ outputAccum);
1683
+
1684
+ // clang-format on
1685
+ }
1686
+
1687
+ // Process 3 samples.
1688
+ // Coarse pq centroids table and fine pq centroids table are shared among
1689
+ // codes.
1690
+ static void accum(
1691
+ const float* const __restrict pqCoarseCentroids,
1692
+ const float* const __restrict pqFineCentroids,
1693
+ const uint8_t* const __restrict code0,
1694
+ const float weight0,
1695
+ const uint8_t* const __restrict code1,
1696
+ const float weight1,
1697
+ const uint8_t* const __restrict code2,
1698
+ const float weight2,
1699
+ float* const __restrict outputAccum) {
1700
+ // coarse quantizer
1701
+ const uint8_t* const __restrict coarse0 = code0;
1702
+ const uint8_t* const __restrict coarse1 = code1;
1703
+ const uint8_t* const __restrict coarse2 = code2;
1704
+
1705
+ // fine quantizer
1706
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1707
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1708
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1709
+
1710
+ // clang-format off
1711
+
1712
+ // process chunks, 4 float
1713
+
1714
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1715
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1716
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1717
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1718
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1719
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1720
+
1721
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1722
+
1723
+ existingValue = elementaryBlock4x1bAccum(
1724
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1725
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1726
+ weight0,
1727
+ existingValue);
1728
+
1729
+ existingValue = elementaryBlock4x1bAccum(
1730
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1731
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1732
+ weight1,
1733
+ existingValue);
1734
+
1735
+ existingValue = elementaryBlock4x1bAccum(
1736
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1737
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1738
+ weight2,
1739
+ existingValue);
1740
+
1741
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1742
+
1743
+ // next
1744
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1745
+ pqCoarseCentroids, pqFineCentroids,
1746
+ code0, weight0,
1747
+ code1, weight1,
1748
+ code2, weight2,
1749
+ outputAccum);
1750
+
1751
+ // clang-format on
1752
+ }
1753
+ };
1754
+
1755
+ // This partial specialization is expected to do nothing.
1756
+ template <
1757
+ intptr_t DIM,
1758
+ intptr_t COARSE_SIZE,
1759
+ intptr_t FINE_SIZE,
1760
+ intptr_t COARSE_BITS,
1761
+ intptr_t FINE_BITS,
1762
+ bool FINE_SIZE_EQ_2,
1763
+ bool FINE_SIZE_EQ_4,
1764
+ bool QPOS_LEFT_GE_8,
1765
+ bool QPOS_LEFT_GE_4>
1766
+ struct Index2LevelDecoderImpl<
1767
+ DIM,
1768
+ COARSE_SIZE,
1769
+ FINE_SIZE,
1770
+ COARSE_BITS,
1771
+ FINE_BITS,
1772
+ DIM,
1773
+ FINE_SIZE_EQ_2,
1774
+ FINE_SIZE_EQ_4,
1775
+ QPOS_LEFT_GE_8,
1776
+ QPOS_LEFT_GE_4,
1777
+ true> {
1778
+ // clang-format off
1779
+
1780
+ // process 1 sample
1781
+ static void store(
1782
+ const float* const __restrict pqCoarseCentroids0,
1783
+ const float* const __restrict pqFineCentroids0,
1784
+ const uint8_t* const __restrict code0,
1785
+ float* const __restrict outputStore) {}
1786
+
1787
+ // process 1 sample
1788
+ static void accum(
1789
+ const float* const __restrict pqCoarseCentroids0,
1790
+ const float* const __restrict pqFineCentroids0,
1791
+ const uint8_t* const __restrict code0,
1792
+ const float weight0,
1793
+ float* const __restrict outputAccum) {}
1794
+
1795
+ // Process 2 samples.
1796
+ // Each code uses its own coarse pq centroids table and fine pq centroids table.
1797
+ static void accum(
1798
+ const float* const __restrict pqCoarseCentroids0,
1799
+ const float* const __restrict pqFineCentroids0,
1800
+ const uint8_t* const __restrict code0,
1801
+ const float weight0,
1802
+ const float* const __restrict pqCoarseCentroids1,
1803
+ const float* const __restrict pqFineCentroids1,
1804
+ const uint8_t* const __restrict code1,
1805
+ const float weight1,
1806
+ float* const __restrict outputAccum) {}
1807
+
1808
+ // Process 2 samples.
1809
+ // Coarse pq centroids table and fine pq centroids table are shared among codes.
1810
+ static void accum(
1811
+ const float* const __restrict pqCoarseCentroids,
1812
+ const float* const __restrict pqFineCentroids,
1813
+ const uint8_t* const __restrict code0,
1814
+ const float weight0,
1815
+ const uint8_t* const __restrict code1,
1816
+ const float weight1,
1817
+ float* const __restrict outputAccum) {}
1818
+
1819
+ // Process 3 samples.
1820
+ // Each code uses its own coarse pq centroids table and fine pq centroids table.
1821
+ static void accum(
1822
+ const float* const __restrict pqCoarseCentroids0,
1823
+ const float* const __restrict pqFineCentroids0,
1824
+ const uint8_t* const __restrict code0,
1825
+ const float weight0,
1826
+ const float* const __restrict pqCoarseCentroids1,
1827
+ const float* const __restrict pqFineCentroids1,
1828
+ const uint8_t* const __restrict code1,
1829
+ const float weight1,
1830
+ const float* const __restrict pqCoarseCentroids2,
1831
+ const float* const __restrict pqFineCentroids2,
1832
+ const uint8_t* const __restrict code2,
1833
+ const float weight2,
1834
+ float* const __restrict outputAccum) {}
1835
+
1836
+ // Process 3 samples.
1837
+ // Coarse pq centroids table and fine pq centroids table are shared among codes.
1838
+ static void accum(
1839
+ const float* const __restrict pqCoarseCentroids,
1840
+ const float* const __restrict pqFineCentroids,
1841
+ const uint8_t* const __restrict code0,
1842
+ const float weight0,
1843
+ const uint8_t* const __restrict code1,
1844
+ const float weight1,
1845
+ const uint8_t* const __restrict code2,
1846
+ const float weight2,
1847
+ float* const __restrict outputAccum) {}
1848
+
1849
+ // clang-format on
1850
+ };
1851
+ } // namespace
1852
+
1853
+ // Suitable for IVF256,PQ[1]x8
1854
+ // Suitable for Residual[1]x8,PQ[2]x8
1855
+ // Suitable for IVF[9-16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
1856
+ // Suitable for Residual[1]x[9-16 bit],PQ[2]x[3] (such as Residual2x9,PQ8)
1857
+ template < // NOTE(review): the meaning of DIM / COARSE_SIZE / FINE_SIZE is defined by Index2LevelDecoderImpl, which is not fully visible here - confirm there.
1858
+ intptr_t DIM,
1859
+ intptr_t COARSE_SIZE,
1860
+ intptr_t FINE_SIZE,
1861
+ intptr_t COARSE_BITS = 8,
1862
+ intptr_t FINE_BITS = 8>
1863
+ struct Index2LevelDecoder { // Public front end: every static method below forwards its arguments unchanged to Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, 0>.
1864
+ static_assert( // Compile-time guard on COARSE_BITS. NOTE(review): the header comment mentions [9-16 bit] codes while only 8, 10 and 16 are accepted here - confirm how other widths are meant to be mapped.
1865
+ COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 16,
1866
+ "Only 8, 10 or 16 bits are currently supported for COARSE_BITS");
1867
+ static_assert( // Same compile-time guard, applied to FINE_BITS.
1868
+ FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16,
1869
+ "Only 8, 10 or 16 bits are currently supported for FINE_BITS");
1870
+
1871
+ static constexpr intptr_t dim = DIM; // The five constants below re-export the template arguments as named compile-time members.
1872
+ static constexpr intptr_t coarseSize = COARSE_SIZE;
1873
+ static constexpr intptr_t fineSize = FINE_SIZE;
1874
+ static constexpr intptr_t coarseBits = COARSE_BITS;
1875
+ static constexpr intptr_t fineBits = FINE_BITS;
1876
+
1877
+ // Process 1 sample. Forwards (pqCoarseCentroids, pqFineCentroids, code, outputStore) to Index2LevelDecoderImpl<...>::store; presumably overwrites outputStore with decoded(code) rather than accumulating - confirm in the Impl.
1878
+ static void store(
1879
+ const float* const __restrict pqCoarseCentroids,
1880
+ const float* const __restrict pqFineCentroids,
1881
+ const uint8_t* const __restrict code,
1882
+ float* const __restrict outputStore) {
1883
+ Index2LevelDecoderImpl<
1884
+ DIM,
1885
+ COARSE_SIZE,
1886
+ FINE_SIZE,
1887
+ COARSE_BITS,
1888
+ FINE_BITS,
1889
+ 0>:: // NOTE(review): the trailing 0 is presumably the starting offset of the Impl recursion - confirm against Index2LevelDecoderImpl.
1890
+ store(pqCoarseCentroids, pqFineCentroids, code, outputStore);
1891
+ }
1892
+
1893
+ // Process 1 sample.
1894
+ // Performs outputAccum += weight * decoded(code). Implemented by forwarding all arguments, in order, to Index2LevelDecoderImpl<...>::accum.
1895
+ static void accum(
1896
+ const float* const __restrict pqCoarseCentroids,
1897
+ const float* const __restrict pqFineCentroids,
1898
+ const uint8_t* const __restrict code,
1899
+ const float weight,
1900
+ float* const __restrict outputAccum) {
1901
+ Index2LevelDecoderImpl<
1902
+ DIM,
1903
+ COARSE_SIZE,
1904
+ FINE_SIZE,
1905
+ COARSE_BITS,
1906
+ FINE_BITS,
1907
+ 0>::
1908
+ accum(pqCoarseCentroids,
1909
+ pqFineCentroids,
1910
+ code,
1911
+ weight,
1912
+ outputAccum);
1913
+ }
1914
+
1915
+ // Process 2 samples.
1916
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1917
+ // table.
1918
+ //
1919
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1920
+ // decoded(code1). Implemented by forwarding all nine arguments, in order, to the matching Index2LevelDecoderImpl<...>::accum overload.
1921
+ static void accum(
1922
+ const float* const __restrict pqCoarseCentroids0,
1923
+ const float* const __restrict pqFineCentroids0,
1924
+ const uint8_t* const __restrict code0,
1925
+ const float weight0,
1926
+ const float* const __restrict pqCoarseCentroids1,
1927
+ const float* const __restrict pqFineCentroids1,
1928
+ const uint8_t* const __restrict code1,
1929
+ const float weight1,
1930
+ float* const __restrict outputAccum) {
1931
+ Index2LevelDecoderImpl<
1932
+ DIM,
1933
+ COARSE_SIZE,
1934
+ FINE_SIZE,
1935
+ COARSE_BITS,
1936
+ FINE_BITS,
1937
+ 0>::
1938
+ accum(pqCoarseCentroids0,
1939
+ pqFineCentroids0,
1940
+ code0,
1941
+ weight0,
1942
+ pqCoarseCentroids1,
1943
+ pqFineCentroids1,
1944
+ code1,
1945
+ weight1,
1946
+ outputAccum);
1947
+ }
1948
+
1949
+ // Process 2 samples.
1950
+ // Coarse pq centroids table and fine pq centroids table are shared among
1951
+ // codes.
1952
+ //
1953
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1954
+ // decoded(code1). Implemented by forwarding all seven arguments, in order, to the matching Index2LevelDecoderImpl<...>::accum overload.
1955
+ static void accum(
1956
+ const float* const __restrict pqCoarseCentroids,
1957
+ const float* const __restrict pqFineCentroids,
1958
+ const uint8_t* const __restrict code0,
1959
+ const float weight0,
1960
+ const uint8_t* const __restrict code1,
1961
+ const float weight1,
1962
+ float* const __restrict outputAccum) {
1963
+ Index2LevelDecoderImpl<
1964
+ DIM,
1965
+ COARSE_SIZE,
1966
+ FINE_SIZE,
1967
+ COARSE_BITS,
1968
+ FINE_BITS,
1969
+ 0>::
1970
+ accum(pqCoarseCentroids,
1971
+ pqFineCentroids,
1972
+ code0,
1973
+ weight0,
1974
+ code1,
1975
+ weight1,
1976
+ outputAccum);
1977
+ }
1978
+
1979
+ // Process 3 samples.
1980
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1981
+ // table.
1982
+ //
1983
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1984
+ // decoded(code1) + weight2 * decoded(code2). Implemented by forwarding all thirteen arguments, in order, to the matching Index2LevelDecoderImpl<...>::accum overload.
1985
+ static void accum(
1986
+ const float* const __restrict pqCoarseCentroids0,
1987
+ const float* const __restrict pqFineCentroids0,
1988
+ const uint8_t* const __restrict code0,
1989
+ const float weight0,
1990
+ const float* const __restrict pqCoarseCentroids1,
1991
+ const float* const __restrict pqFineCentroids1,
1992
+ const uint8_t* const __restrict code1,
1993
+ const float weight1,
1994
+ const float* const __restrict pqCoarseCentroids2,
1995
+ const float* const __restrict pqFineCentroids2,
1996
+ const uint8_t* const __restrict code2,
1997
+ const float weight2,
1998
+ float* const __restrict outputAccum) {
1999
+ Index2LevelDecoderImpl<
2000
+ DIM,
2001
+ COARSE_SIZE,
2002
+ FINE_SIZE,
2003
+ COARSE_BITS,
2004
+ FINE_BITS,
2005
+ 0>::
2006
+ accum(pqCoarseCentroids0,
2007
+ pqFineCentroids0,
2008
+ code0,
2009
+ weight0,
2010
+ pqCoarseCentroids1,
2011
+ pqFineCentroids1,
2012
+ code1,
2013
+ weight1,
2014
+ pqCoarseCentroids2,
2015
+ pqFineCentroids2,
2016
+ code2,
2017
+ weight2,
2018
+ outputAccum);
2019
+ }
2020
+
2021
+ // Process 3 samples.
2022
+ // Coarse pq centroids table and fine pq centroids table are shared among
2023
+ // codes.
2024
+ //
2025
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2026
+ // decoded(code1) + weight2 * decoded(code2). Implemented by forwarding all nine arguments, in order, to the matching Index2LevelDecoderImpl<...>::accum overload.
2027
+ static void accum(
2028
+ const float* const __restrict pqCoarseCentroids,
2029
+ const float* const __restrict pqFineCentroids,
2030
+ const uint8_t* const __restrict code0,
2031
+ const float weight0,
2032
+ const uint8_t* const __restrict code1,
2033
+ const float weight1,
2034
+ const uint8_t* const __restrict code2,
2035
+ const float weight2,
2036
+ float* const __restrict outputAccum) {
2037
+ Index2LevelDecoderImpl<
2038
+ DIM,
2039
+ COARSE_SIZE,
2040
+ FINE_SIZE,
2041
+ COARSE_BITS,
2042
+ FINE_BITS,
2043
+ 0>::
2044
+ accum(pqCoarseCentroids,
2045
+ pqFineCentroids,
2046
+ code0,
2047
+ weight0,
2048
+ code1,
2049
+ weight1,
2050
+ code2,
2051
+ weight2,
2052
+ outputAccum);
2053
+ }
2054
+ };
2055
+
2056
+ } // namespace cppcontrib
2057
+ } // namespace faiss
2058
+ #endif // LEVEL2_AVX2_INL_H