faiss 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +23 -21
  4. data/ext/faiss/extconf.rb +11 -0
  5. data/ext/faiss/index.cpp +4 -4
  6. data/ext/faiss/index_binary.cpp +6 -6
  7. data/ext/faiss/product_quantizer.cpp +4 -4
  8. data/lib/faiss/version.rb +1 -1
  9. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  10. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  11. data/vendor/faiss/faiss/IVFlib.h +26 -2
  12. data/vendor/faiss/faiss/Index.cpp +36 -3
  13. data/vendor/faiss/faiss/Index.h +43 -6
  14. data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
  15. data/vendor/faiss/faiss/Index2Layer.h +6 -1
  16. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
  17. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  20. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  22. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  23. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  24. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  31. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  32. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  33. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  34. data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
  35. data/vendor/faiss/faiss/IndexFlat.h +7 -4
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  39. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  40. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  41. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  42. data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
  43. data/vendor/faiss/faiss/IndexIVF.h +35 -16
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
  45. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  48. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  49. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  50. data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
  51. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
  53. data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  55. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  56. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  57. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  58. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
  59. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  60. data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
  61. data/vendor/faiss/faiss/IndexLSH.h +2 -1
  62. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  63. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  66. data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
  67. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  68. data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
  69. data/vendor/faiss/faiss/IndexPQ.h +19 -5
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  73. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  74. data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
  75. data/vendor/faiss/faiss/IndexRefine.h +4 -2
  76. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  77. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  78. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  79. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  80. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
  81. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
  82. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  83. data/vendor/faiss/faiss/IndexShards.h +2 -1
  84. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  85. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  86. data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
  87. data/vendor/faiss/faiss/VectorTransform.h +22 -4
  88. data/vendor/faiss/faiss/clone_index.cpp +23 -1
  89. data/vendor/faiss/faiss/clone_index.h +3 -0
  90. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  91. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  92. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  93. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  94. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  95. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  101. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  102. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
  103. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  104. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  105. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  106. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  107. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  108. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  109. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  110. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  111. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  113. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  118. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  119. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  124. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  125. data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
  126. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  127. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  128. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
  131. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
  134. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  138. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  144. data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
  145. data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
  146. data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
  147. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  148. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  149. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  150. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  151. data/vendor/faiss/faiss/index_factory.cpp +196 -7
  152. data/vendor/faiss/faiss/index_io.h +5 -0
  153. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  154. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  155. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  156. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  157. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  158. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  159. data/vendor/faiss/faiss/utils/distances.cpp +380 -56
  160. data/vendor/faiss/faiss/utils/distances.h +113 -15
  161. data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
  162. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  163. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  164. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  165. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  166. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  167. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  168. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  169. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  170. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  171. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  172. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  173. data/vendor/faiss/faiss/utils/random.h +5 -0
  174. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  175. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  176. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  177. metadata +37 -3
@@ -0,0 +1,2058 @@
1
+ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2
+ #ifndef LEVEL2_AVX2_INL_H
3
+ #define LEVEL2_AVX2_INL_H
4
+
5
+ #include <immintrin.h>
6
+
7
+ #include <cstddef>
8
+ #include <cstdint>
9
+
10
+ #include <faiss/cppcontrib/detail/UintReader.h>
11
+
12
+ namespace faiss {
13
+ namespace cppcontrib {
14
+
15
+ ////////////////////////////////////////////////////////////////////////////////////
16
+ /// Index2LevelDecoder
17
+ ////////////////////////////////////////////////////////////////////////////////////
18
+
19
+ namespace {
20
+
21
+ // Processes 8 float values: one 8-float coarse slice plus four 2-float fine slices.
22
+ // Returns {
23
+ // [0..1] = *coarse[0..1] + *fine0[0..1];
24
+ // [2..3] = *coarse[2..3] + *fine1[0..1];
25
+ // [4..5] = *coarse[4..5] + *fine2[0..1];
26
+ // [6..7] = *coarse[6..7] + *fine3[0..1];
27
+ // }
28
+ inline __m256 elementaryBlock2x4b(
29
+ const float* const __restrict coarse,
30
+ const float* const __restrict fine0,
31
+ const float* const __restrict fine1,
32
+ const float* const __restrict fine2,
33
+ const float* const __restrict fine3) {
34
+ // load fine: each 64-bit read picks up 2 adjacent floats through a double-typed pointer (NOTE(review): type-punned access - confirm aliasing/alignment assumptions hold for the target compilers)
35
+ const __m256 fineValue = _mm256_castpd_ps(_mm256_setr_pd(
36
+ *reinterpret_cast<const double*>(fine0),
37
+ *reinterpret_cast<const double*>(fine1),
38
+ *reinterpret_cast<const double*>(fine2),
39
+ *reinterpret_cast<const double*>(fine3)));
40
+ // load coarse: 8 contiguous floats, unaligned load
41
+ const __m256 coarseValue = _mm256_loadu_ps(coarse);
42
+
43
+ // lane-wise sum of coarse and fine
44
+ return _mm256_add_ps(fineValue, coarseValue);
45
+ }
46
+
47
+ // Processes 8 float values.
48
+ // Returns {
49
+ // [0..1] = existingValue[0..1] + weight * (*coarse[0..1] + *fine0[0..1]);
50
+ // [2..3] = existingValue[2..3] + weight * (*coarse[2..3] + *fine1[0..1]);
51
+ // [4..5] = existingValue[4..5] + weight * (*coarse[4..5] + *fine2[0..1]);
52
+ // [6..7] = existingValue[6..7] + weight * (*coarse[6..7] + *fine3[0..1]);
53
+ // }
54
+ inline __m256 elementaryBlock2x4bAccum(
55
+ const float* const __restrict coarse,
56
+ const float* const __restrict fine0,
57
+ const float* const __restrict fine1,
58
+ const float* const __restrict fine2,
59
+ const float* const __restrict fine3,
60
+ const float weight,
61
+ const __m256 existingValue) {
62
+ // add coarse and fine (delegates to elementaryBlock2x4b)
63
+ const __m256 combinedValue =
64
+ elementaryBlock2x4b(coarse, fine0, fine1, fine2, fine3);
65
+
66
+ // broadcast the scalar weight to all 8 lanes; this operation is expected to be optimized by a compiler
67
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
68
+ // do fma: combinedValue * weight + existingValue
69
+ return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue);
70
+ }
71
+
72
+ // Processes 4 float values: one 4-float coarse slice plus one 4-float fine slice.
73
+ // Returns {
74
+ // [0..3] = *coarse[0..3] + *fine[0..3];
75
+ // }
76
+ inline __m128 elementaryBlock4x1b(
77
+ const float* const __restrict coarse,
78
+ const float* const __restrict fine) {
79
+ // load fine: 4 contiguous floats, unaligned load
80
+ const __m128 fineValue = _mm_loadu_ps(fine);
81
+ // load coarse: 4 contiguous floats, unaligned load
82
+ const __m128 coarseValue = _mm_loadu_ps(coarse);
83
+
84
+ // lane-wise sum of coarse and fine
85
+ return _mm_add_ps(fineValue, coarseValue);
86
+ }
87
+
88
+ // Processes 4 float values, accumulating into an existing 128-bit vector.
89
+ // Returns {
90
+ // [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine[0..3]);
91
+ // }
92
+ inline __m128 elementaryBlock4x1bAccum(
93
+ const float* const __restrict coarse,
94
+ const float* const __restrict fine,
95
+ const float weight,
96
+ const __m128 existingValue) {
97
+ // add coarse and fine (delegates to elementaryBlock4x1b)
98
+ const __m128 combinedValue = elementaryBlock4x1b(coarse, fine);
99
+
100
+ // broadcast the scalar weight to all 4 lanes; this operation is expected to be optimized by a compiler
101
+ const __m128 weightAvx = _mm_set1_ps(weight);
102
+ // do fma: combinedValue * weight + existingValue
103
+ return _mm_fmadd_ps(combinedValue, weightAvx, existingValue);
104
+ }
105
+
106
+ // Processes 8 float values: one 8-float coarse slice plus two 4-float fine slices.
107
+ // Returns {
108
+ // [0..3] = *coarse[0..3] + *fine0[0..3];
109
+ // [4..7] = *coarse[4..7] + *fine1[0..3];
110
+ // }
111
+ inline __m256 elementaryBlock4x2b(
112
+ const float* const __restrict coarse,
113
+ const float* const __restrict fine0,
114
+ const float* const __restrict fine1) {
115
+ // load fine: two independent 4-float unaligned loads
116
+ const __m128 fineValue0 = _mm_loadu_ps(fine0);
117
+ const __m128 fineValue1 = _mm_loadu_ps(fine1);
118
+ // load coarse: 8 contiguous floats, unaligned load
119
+ const __m256 coarseValue = _mm256_loadu_ps(coarse);
120
+
121
+ // combine two 4-float halves into a single 8-float vector (fineValue0 in the low half, fineValue1 in the high half)
122
+ const __m256 combinedFineValue = _mm256_set_m128(fineValue1, fineValue0);
123
+ // lane-wise sum of coarse and fine
124
+ return _mm256_add_ps(combinedFineValue, coarseValue);
125
+ }
126
+
127
+ // Processes 8 float values, accumulating into an existing 256-bit vector.
128
+ // Returns {
129
+ // [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine0[0..3]);
130
+ // [4..7] = existingValue[4..7] + weight * (*coarse[4..7] + *fine1[0..3]);
131
+ // }
132
+ inline __m256 elementaryBlock4x2bAccum(
133
+ const float* const __restrict coarse,
134
+ const float* const __restrict fine0,
135
+ const float* const __restrict fine1,
136
+ const float weight,
137
+ const __m256 existingValue) {
138
+ // add coarse and fine (delegates to elementaryBlock4x2b)
139
+ const __m256 combinedValue = elementaryBlock4x2b(coarse, fine0, fine1);
140
+
141
+ // broadcast the scalar weight to all 8 lanes; this operation is expected to be optimized by a compiler
142
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
143
+ // do fma: combinedValue * weight + existingValue
144
+ return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue);
145
+ }
146
+
147
+ // Processes 8 float values: one 8-float coarse slice plus one 8-float fine slice.
148
+ // Returns {
149
+ // [0..7] = *coarse[0..7] + *fine[0..7];
150
+ // }
151
+ inline __m256 elementaryBlock8x1b(
152
+ const float* const __restrict coarse,
153
+ const float* const __restrict fine) {
154
+ // load fine: 8 contiguous floats, unaligned load
155
+ const __m256 fineValue = _mm256_loadu_ps(fine);
156
+ // load coarse: 8 contiguous floats, unaligned load
157
+ const __m256 coarseValue = _mm256_loadu_ps(coarse);
158
+
159
+ // lane-wise sum of coarse and fine
160
+ return _mm256_add_ps(fineValue, coarseValue);
161
+ }
162
+
163
+ // Processes 8 float values, accumulating into an existing 256-bit vector.
164
+ // Returns {
165
+ // [0..7] = existingValue[0..7] + weight * (*coarse[0..7] + *fine[0..7]);
166
+ // }
167
+ inline __m256 elementaryBlock8x1bAccum(
168
+ const float* const __restrict coarse,
169
+ const float* const __restrict fine,
170
+ const float weight,
171
+ const __m256 existingValue) {
172
+ // add coarse and fine (delegates to elementaryBlock8x1b)
173
+ const __m256 combinedValue = elementaryBlock8x1b(coarse, fine);
174
+
175
+ // broadcast the scalar weight to all 8 lanes; this operation is expected to be optimized by a compiler
176
+ const __m256 weightAvx2 = _mm256_set1_ps(weight);
177
+ // do fma: combinedValue * weight + existingValue
178
+ return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue);
179
+ }
180
+
181
+ // The following code uses template-based for-loop unrolling,
182
+ // because the compiler does not unroll these loops on its own as needed.
183
+ // The idea is the following (a recursive step plus a terminating specialization):
184
+ // template<int I, int MAX>
185
+ // struct Foo {
186
+ // static void bar() {
187
+ // doSomething(I);
188
+ // Foo<I + 1, MAX>::bar();
189
+ // }
190
+ // };
191
+ // The specialization for I == MAX ends the recursion:
192
+ // template<int MAX>
193
+ // struct Foo<MAX, MAX> {
194
+ // static void bar() {}
195
+ // };
196
+ //
197
+ // Initiate the loop:
198
+ // Foo<0, MAX>::bar();
199
+
200
+ template <
201
+ intptr_t DIM,
202
+ intptr_t COARSE_SIZE,
203
+ intptr_t FINE_SIZE,
204
+ intptr_t COARSE_BITS,
205
+ intptr_t FINE_BITS,
206
+ intptr_t CPOS,
207
+ bool FINE_SIZE_EQ_2 = FINE_SIZE == 2,
208
+ bool FINE_SIZE_EQ_4 = FINE_SIZE == 4,
209
+ bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8),
210
+ bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4),
211
+ bool DIM_EQ_CPOS = DIM == CPOS>
212
+ struct Index2LevelDecoderImpl;
213
+
214
+ template <
215
+ intptr_t DIM,
216
+ intptr_t COARSE_SIZE,
217
+ intptr_t COARSE_BITS,
218
+ intptr_t FINE_BITS,
219
+ intptr_t CPOS,
220
+ bool QPOS_LEFT_GE_8,
221
+ bool QPOS_LEFT_GE_4>
222
+ struct Index2LevelDecoderImpl<
223
+ DIM,
224
+ COARSE_SIZE,
225
+ 2,
226
+ COARSE_BITS,
227
+ FINE_BITS,
228
+ CPOS,
229
+ true,
230
+ false,
231
+ QPOS_LEFT_GE_8,
232
+ QPOS_LEFT_GE_4,
233
+ false> {
234
+ static constexpr intptr_t FINE_SIZE = 2;
235
+
236
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
237
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
238
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
239
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
240
+
241
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
242
+
243
+ // number of coarse table entries addressable by a COARSE_BITS-bit code (an entry count, not a byte size, despite the name)
244
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
245
+
246
+ // coarse quantizer bytes start from 0
247
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
248
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
249
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
250
+ N_COARSE_ELEMENTS * COARSE_BITS;
251
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
252
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
253
+
254
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
255
+
256
+ // Process 1 sample: writes 8 decoded floats to outputStore[CPOS..CPOS+7], then continues at CPOS + 8.
257
+ static void store(
258
+ const float* const __restrict pqCoarseCentroids0,
259
+ const float* const __restrict pqFineCentroids0,
260
+ const uint8_t* const __restrict code0,
261
+ float* const __restrict outputStore) {
262
+ // coarse quantizer codes start at the beginning of the code
263
+ const uint8_t* const __restrict coarse0 = code0;
264
+
265
+ // fine quantizer codes start after N_COARSE_ELEMENTS_BYTES bytes
266
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
267
+
268
+ // clang-format off
269
+
270
+ // process chunks, 2 float each (FINE_SIZE == 2),
271
+ // but 8 floats per loop, i.e. 4 consecutive fine codes per step
272
+
273
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
274
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
275
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
276
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
277
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
278
+
279
+ const __m256 storeValue = elementaryBlock2x4b(
280
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
281
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
282
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
283
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
284
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset);
285
+
286
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
287
+
288
+ // next step of the compile-time unrolled loop: CPOS + 8
289
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::store(
290
+ pqCoarseCentroids0, pqFineCentroids0, code0,
291
+ outputStore);
292
+
293
+ // clang-format on
294
+ }
295
+
296
+ // Process 1 sample: adds weight0 * (coarse + fine) into outputAccum[CPOS..CPOS+7], then continues at CPOS + 8.
297
+ static void accum(
298
+ const float* const __restrict pqCoarseCentroids0,
299
+ const float* const __restrict pqFineCentroids0,
300
+ const uint8_t* const __restrict code0,
301
+ const float weight0,
302
+ float* const __restrict outputAccum) {
303
+ // coarse quantizer codes start at the beginning of the code
304
+ const uint8_t* const __restrict coarse0 = code0;
305
+
306
+ // fine quantizer codes start after N_COARSE_ELEMENTS_BYTES bytes
307
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
308
+
309
+ // clang-format off
310
+
311
+ // process chunks, 2 float each (FINE_SIZE == 2),
312
+ // but 8 floats per loop, i.e. 4 consecutive fine codes per step
313
+
314
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
315
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
316
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
317
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
318
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
319
+
320
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
321
+
322
+ existingValue = elementaryBlock2x4bAccum(
323
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
324
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
325
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
326
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
327
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, weight0,
328
+ existingValue);
329
+
330
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
331
+
332
+ // next step of the compile-time unrolled loop: CPOS + 8
333
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
334
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
335
+ outputAccum);
336
+
337
+ // clang-format on
338
+ }
339
+
340
+ // Process 2 samples: both weighted contributions are added into outputAccum[CPOS..CPOS+7].
341
+ // Each code uses its own coarse pq centroids table and fine pq centroids
342
+ // table.
343
+ static void accum(
344
+ const float* const __restrict pqCoarseCentroids0,
345
+ const float* const __restrict pqFineCentroids0,
346
+ const uint8_t* const __restrict code0,
347
+ const float weight0,
348
+ const float* const __restrict pqCoarseCentroids1,
349
+ const float* const __restrict pqFineCentroids1,
350
+ const uint8_t* const __restrict code1,
351
+ const float weight1,
352
+ float* const __restrict outputAccum) {
353
+ // coarse quantizer codes start at the beginning of each code
354
+ const uint8_t* const __restrict coarse0 = code0;
355
+ const uint8_t* const __restrict coarse1 = code1;
356
+
357
+ // fine quantizer codes start after N_COARSE_ELEMENTS_BYTES bytes
358
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
359
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
360
+
361
+ // clang-format off
362
+
363
+ // process chunks, 2 float each (FINE_SIZE == 2),
364
+ // but 8 floats per loop, i.e. 4 consecutive fine codes per sample per step
365
+
366
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
367
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
368
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
369
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
370
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
371
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
372
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
373
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
374
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
375
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
376
+
377
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
378
+
379
+ existingValue = elementaryBlock2x4bAccum(
380
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
381
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
382
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
383
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
384
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
385
+ weight0,
386
+ existingValue);
387
+
388
+ existingValue = elementaryBlock2x4bAccum(
389
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
390
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
391
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
392
+ pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
393
+ pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
394
+ weight1,
395
+ existingValue);
396
+
397
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
398
+
399
+ // next step of the compile-time unrolled loop: CPOS + 8
400
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
401
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
402
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
403
+ outputAccum);
404
+
405
+ // clang-format on
406
+ }
407
+
408
+ // Process 2 samples: both weighted contributions are added into outputAccum[CPOS..CPOS+7].
409
+ // Coarse pq centroids table and fine pq centroids table are shared among
410
+ // codes.
411
+ static void accum(
412
+ const float* const __restrict pqCoarseCentroids,
413
+ const float* const __restrict pqFineCentroids,
414
+ const uint8_t* const __restrict code0,
415
+ const float weight0,
416
+ const uint8_t* const __restrict code1,
417
+ const float weight1,
418
+ float* const __restrict outputAccum) {
419
+ // coarse quantizer codes start at the beginning of each code
420
+ const uint8_t* const __restrict coarse0 = code0;
421
+ const uint8_t* const __restrict coarse1 = code1;
422
+
423
+ // fine quantizer codes start after N_COARSE_ELEMENTS_BYTES bytes
424
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
425
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
426
+
427
+ // clang-format off
428
+
429
+ // process chunks, 2 float each (FINE_SIZE == 2),
430
+ // but 8 floats per loop, i.e. 4 consecutive fine codes per sample per step
431
+
432
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
433
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
434
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
435
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
436
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
437
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
438
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
439
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
440
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
441
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
442
+
443
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
444
+
445
+ existingValue = elementaryBlock2x4bAccum(
446
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
447
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
448
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
449
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
450
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
451
+ weight0,
452
+ existingValue);
453
+
454
+ existingValue = elementaryBlock2x4bAccum(
455
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
456
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
457
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
458
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
459
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
460
+ weight1,
461
+ existingValue);
462
+
463
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
464
+
465
+ // next step of the compile-time unrolled loop: CPOS + 8
466
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
467
+ pqCoarseCentroids, pqFineCentroids,
468
+ code0, weight0,
469
+ code1, weight1,
470
+ outputAccum);
471
+
472
+ // clang-format on
473
+ }
474
+
475
+ // Process 3 samples.
476
+ // Each code uses its own coarse pq centroids table and fine pq centroids
477
+ // table.
478
+ static void accum(
479
+ const float* const __restrict pqCoarseCentroids0,
480
+ const float* const __restrict pqFineCentroids0,
481
+ const uint8_t* const __restrict code0,
482
+ const float weight0,
483
+ const float* const __restrict pqCoarseCentroids1,
484
+ const float* const __restrict pqFineCentroids1,
485
+ const uint8_t* const __restrict code1,
486
+ const float weight1,
487
+ const float* const __restrict pqCoarseCentroids2,
488
+ const float* const __restrict pqFineCentroids2,
489
+ const uint8_t* const __restrict code2,
490
+ const float weight2,
491
+ float* const __restrict outputAccum) {
492
+ // coarse quantizer: coarse codes are read from the start of each code buffer
493
+ const uint8_t* const __restrict coarse0 = code0;
494
+ const uint8_t* const __restrict coarse1 = code1;
495
+ const uint8_t* const __restrict coarse2 = code2;
496
+
497
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into each code buffer
498
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
499
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
500
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
501
+
502
+ // clang-format off
503
+
504
+ // process chunks, 2 float per fine code (four fine codes a..d are read per sample),
505
+ // but 8 floats per loop: outputAccum + CPOS is loaded, updated and stored once
506
+
507
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
508
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
509
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
510
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
511
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
512
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
513
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
514
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
515
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
516
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
517
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
518
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
519
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
520
+ const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
521
+ const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
522
+
523
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
524
+
525
+ existingValue = elementaryBlock2x4bAccum(
526
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
527
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
528
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
529
+ pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
530
+ pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
531
+ weight0,
532
+ existingValue);
533
+
534
+ existingValue = elementaryBlock2x4bAccum(
535
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
536
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
537
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
538
+ pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
539
+ pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
540
+ weight1,
541
+ existingValue);
542
+
543
+ existingValue = elementaryBlock2x4bAccum(
544
+ pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
545
+ pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
546
+ pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
547
+ pqFineCentroids2 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
548
+ pqFineCentroids2 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
549
+ weight2,
550
+ existingValue);
551
+
552
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
553
+
554
+ // next: hand the same arguments to the instantiation for CPOS + 8
555
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
556
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
557
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
558
+ pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
559
+ outputAccum);
560
+
561
+ // clang-format on
562
+ }
563
+
564
+ // Process 3 samples.
565
+ // Coarse pq centroids table and fine pq centroids table are shared among
566
+ // codes.
567
+ static void accum(
568
+ const float* const __restrict pqCoarseCentroids,
569
+ const float* const __restrict pqFineCentroids,
570
+ const uint8_t* const __restrict code0,
571
+ const float weight0,
572
+ const uint8_t* const __restrict code1,
573
+ const float weight1,
574
+ const uint8_t* const __restrict code2,
575
+ const float weight2,
576
+ float* const __restrict outputAccum) {
577
+ // coarse quantizer: coarse codes are read from the start of each code buffer
578
+ const uint8_t* const __restrict coarse0 = code0;
579
+ const uint8_t* const __restrict coarse1 = code1;
580
+ const uint8_t* const __restrict coarse2 = code2;
581
+
582
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into each code buffer
583
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
584
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
585
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
586
+
587
+ // clang-format off
588
+
589
+ // process chunks, 2 float per fine code (four fine codes a..d are read per sample),
590
+ // but 8 floats per loop: outputAccum + CPOS is loaded, updated and stored once
591
+
592
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
593
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
594
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
595
+ const intptr_t fineCode0c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine0);
596
+ const intptr_t fineCode0d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine0);
597
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
598
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
599
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
600
+ const intptr_t fineCode1c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine1);
601
+ const intptr_t fineCode1d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine1);
602
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
603
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
604
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
605
+ const intptr_t fineCode2c = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 2>::get(fine2);
606
+ const intptr_t fineCode2d = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 3>::get(fine2);
607
+
608
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
609
+
610
+ existingValue = elementaryBlock2x4bAccum(
611
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
612
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
613
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
614
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset,
615
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset,
616
+ weight0,
617
+ existingValue);
618
+
619
+ existingValue = elementaryBlock2x4bAccum(
620
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
621
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
622
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
623
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset,
624
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset,
625
+ weight1,
626
+ existingValue);
627
+
628
+ existingValue = elementaryBlock2x4bAccum(
629
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
630
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
631
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
632
+ pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset,
633
+ pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset,
634
+ weight2,
635
+ existingValue);
636
+
637
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
638
+
639
+ // next: hand the same arguments to the instantiation for CPOS + 8
640
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
641
+ pqCoarseCentroids, pqFineCentroids,
642
+ code0, weight0,
643
+ code1, weight1,
644
+ code2, weight2,
645
+ outputAccum);
646
+
647
+ // clang-format on
648
+ }
649
+ };
650
+
651
+ template <
652
+ intptr_t DIM,
653
+ intptr_t COARSE_SIZE,
654
+ intptr_t COARSE_BITS,
655
+ intptr_t FINE_BITS,
656
+ intptr_t CPOS,
657
+ bool QPOS_LEFT_GE_8,
658
+ bool QPOS_LEFT_GE_4>
659
+ struct Index2LevelDecoderImpl<
660
+ DIM,
661
+ COARSE_SIZE,
662
+ 4,
663
+ COARSE_BITS,
664
+ FINE_BITS,
665
+ CPOS,
666
+ false,
667
+ true,
668
+ QPOS_LEFT_GE_8,
669
+ QPOS_LEFT_GE_4,
670
+ false> {
671
+ static constexpr intptr_t FINE_SIZE = 4;
672
+
673
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
674
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
675
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
676
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
677
+
678
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
679
+
680
+ // coarse quantizer storage: entries per coarse table (2^COARSE_BITS centroids, not bytes)
681
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
682
+
683
+ // coarse quantizer bytes start from 0
684
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
685
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
686
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
687
+ N_COARSE_ELEMENTS * COARSE_BITS;
688
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
689
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
690
+
691
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
692
+
693
+ // process 1 sample
694
+ static void store(
695
+ const float* const __restrict pqCoarseCentroids0,
696
+ const float* const __restrict pqFineCentroids0,
697
+ const uint8_t* const __restrict code0,
698
+ float* const __restrict outputStore) {
699
+ // coarse quantizer: coarse codes are read from the start of the code buffer
700
+ const uint8_t* const __restrict coarse0 = code0;
701
+
702
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into the code buffer
703
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
704
+
705
+ // clang-format off
706
+
707
+ // process chunks, 4 float per fine code (FINE_SIZE == 4; two fine codes a/b are read),
708
+ // but 8 floats per loop: one 8-float store to outputStore + CPOS
709
+
710
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
711
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
712
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
713
+
714
+ const __m256 storeValue = elementaryBlock4x2b(
715
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
716
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
717
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset);
718
+
719
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
720
+
721
+ // next: hand the same arguments to the instantiation for CPOS + 8
722
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::store(
723
+ pqCoarseCentroids0, pqFineCentroids0, code0,
724
+ outputStore);
725
+
726
+ // clang-format on
727
+ }
728
+
729
+ // process 1 sample
730
+ static void accum(
731
+ const float* const __restrict pqCoarseCentroids0,
732
+ const float* const __restrict pqFineCentroids0,
733
+ const uint8_t* const __restrict code0,
734
+ const float weight0,
735
+ float* const __restrict outputAccum) {
736
+ // coarse quantizer: coarse codes are read from the start of the code buffer
737
+ const uint8_t* const __restrict coarse0 = code0;
738
+
739
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into the code buffer
740
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
741
+
742
+ // clang-format off
743
+
744
+ // process chunks, 4 float per fine code (FINE_SIZE == 4; two fine codes a/b are read),
745
+ // but 8 floats per loop: outputAccum + CPOS is loaded, updated and stored once
746
+
747
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
748
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
749
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
750
+
751
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
752
+
753
+ existingValue = elementaryBlock4x2bAccum(
754
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
755
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
756
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
757
+ weight0,
758
+ existingValue);
759
+
760
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
761
+
762
+ // next: hand the same arguments to the instantiation for CPOS + 8
763
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
764
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
765
+ outputAccum);
766
+
767
+ // clang-format on
768
+ }
769
+
770
+ // Process 2 samples.
771
+ // Each code uses its own coarse pq centroids table and fine pq centroids
772
+ // table.
773
+ static void accum(
774
+ const float* const __restrict pqCoarseCentroids0,
775
+ const float* const __restrict pqFineCentroids0,
776
+ const uint8_t* const __restrict code0,
777
+ const float weight0,
778
+ const float* const __restrict pqCoarseCentroids1,
779
+ const float* const __restrict pqFineCentroids1,
780
+ const uint8_t* const __restrict code1,
781
+ const float weight1,
782
+ float* const __restrict outputAccum) {
783
+ // coarse quantizer: coarse codes are read from the start of each code buffer
784
+ const uint8_t* const __restrict coarse0 = code0;
785
+ const uint8_t* const __restrict coarse1 = code1;
786
+
787
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into each code buffer
788
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
789
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
790
+
791
+ // clang-format off
792
+
793
+ // process chunks, 4 float per fine code (FINE_SIZE == 4; two fine codes a/b per sample),
794
+ // but 8 floats per loop: outputAccum + CPOS is loaded, updated and stored once
795
+
796
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
797
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
798
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
799
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
800
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
801
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
802
+
803
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
804
+
805
+ existingValue = elementaryBlock4x2bAccum(
806
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
807
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
808
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
809
+ weight0,
810
+ existingValue);
811
+
812
+ existingValue = elementaryBlock4x2bAccum(
813
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
814
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
815
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
816
+ weight1,
817
+ existingValue);
818
+
819
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
820
+
821
+ // next: hand the same arguments to the instantiation for CPOS + 8
822
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
823
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
824
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
825
+ outputAccum);
826
+
827
+ // clang-format on
828
+ }
829
+
830
+ // Process 2 samples.
831
+ // Coarse pq centroids table and fine pq centroids table are shared among
832
+ // codes.
833
+ static void accum(
834
+ const float* const __restrict pqCoarseCentroids,
835
+ const float* const __restrict pqFineCentroids,
836
+ const uint8_t* const __restrict code0,
837
+ const float weight0,
838
+ const uint8_t* const __restrict code1,
839
+ const float weight1,
840
+ float* const __restrict outputAccum) {
841
+ // coarse quantizer: coarse codes are read from the start of each code buffer
842
+ const uint8_t* const __restrict coarse0 = code0;
843
+ const uint8_t* const __restrict coarse1 = code1;
844
+
845
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into each code buffer
846
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
847
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
848
+
849
+ // clang-format off
850
+
851
+ // process chunks, 4 float per fine code (FINE_SIZE == 4; two fine codes a/b per sample),
852
+ // but 8 floats per loop: outputAccum + CPOS is loaded, updated and stored once
853
+
854
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
855
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
856
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
857
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
858
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
859
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
860
+
861
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
862
+
863
+ existingValue = elementaryBlock4x2bAccum(
864
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
865
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
866
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
867
+ weight0,
868
+ existingValue);
869
+
870
+ existingValue = elementaryBlock4x2bAccum(
871
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
872
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
873
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
874
+ weight1,
875
+ existingValue);
876
+
877
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
878
+
879
+ // next: hand the same arguments to the instantiation for CPOS + 8
880
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
881
+ pqCoarseCentroids, pqFineCentroids,
882
+ code0, weight0,
883
+ code1, weight1,
884
+ outputAccum);
885
+
886
+ // clang-format on
887
+ }
888
+
889
+ // Process 3 samples.
890
+ // Each code uses its own coarse pq centroids table and fine pq centroids
891
+ // table.
892
+ static void accum(
893
+ const float* const __restrict pqCoarseCentroids0,
894
+ const float* const __restrict pqFineCentroids0,
895
+ const uint8_t* const __restrict code0,
896
+ const float weight0,
897
+ const float* const __restrict pqCoarseCentroids1,
898
+ const float* const __restrict pqFineCentroids1,
899
+ const uint8_t* const __restrict code1,
900
+ const float weight1,
901
+ const float* const __restrict pqCoarseCentroids2,
902
+ const float* const __restrict pqFineCentroids2,
903
+ const uint8_t* const __restrict code2,
904
+ const float weight2,
905
+ float* const __restrict outputAccum) {
906
+ // coarse quantizer: coarse codes are read from the start of each code buffer
907
+ const uint8_t* const __restrict coarse0 = code0;
908
+ const uint8_t* const __restrict coarse1 = code1;
909
+ const uint8_t* const __restrict coarse2 = code2;
910
+
911
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into each code buffer
912
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
913
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
914
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
915
+
916
+ // clang-format off
917
+
918
+ // process chunks, 4 float per fine code (FINE_SIZE == 4; two fine codes a/b per sample),
919
+ // but 8 floats per loop: outputAccum + CPOS is loaded, updated and stored once
920
+
921
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
922
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
923
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
924
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
925
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
926
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
927
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
928
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
929
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
930
+
931
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
932
+
933
+ existingValue = elementaryBlock4x2bAccum(
934
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
935
+ pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
936
+ pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
937
+ weight0,
938
+ existingValue);
939
+
940
+ existingValue = elementaryBlock4x2bAccum(
941
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
942
+ pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
943
+ pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
944
+ weight1,
945
+ existingValue);
946
+
947
+ existingValue = elementaryBlock4x2bAccum(
948
+ pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
949
+ pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
950
+ pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
951
+ weight2,
952
+ existingValue);
953
+
954
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
955
+
956
+ // next: hand the same arguments to the instantiation for CPOS + 8
957
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
958
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
959
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
960
+ pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
961
+ outputAccum);
962
+
963
+ // clang-format on
964
+ }
965
+
966
+ // Process 3 samples.
967
+ // Coarse pq centroids table and fine pq centroids table are shared among
968
+ // codes.
969
+ static void accum(
970
+ const float* const __restrict pqCoarseCentroids,
971
+ const float* const __restrict pqFineCentroids,
972
+ const uint8_t* const __restrict code0,
973
+ const float weight0,
974
+ const uint8_t* const __restrict code1,
975
+ const float weight1,
976
+ const uint8_t* const __restrict code2,
977
+ const float weight2,
978
+ float* const __restrict outputAccum) {
979
+ // coarse quantizer: coarse codes are read from the start of each code buffer
980
+ const uint8_t* const __restrict coarse0 = code0;
981
+ const uint8_t* const __restrict coarse1 = code1;
982
+ const uint8_t* const __restrict coarse2 = code2;
983
+
984
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into each code buffer
985
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
986
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
987
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
988
+
989
+ // clang-format off
990
+
991
+ // process chunks, 4 float per fine code (FINE_SIZE == 4; two fine codes a/b per sample),
992
+ // but 8 floats per loop: outputAccum + CPOS is loaded, updated and stored once
993
+
994
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
995
+ const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
996
+ const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
997
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
998
+ const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
999
+ const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
1000
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1001
+ const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
1002
+ const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
1003
+
1004
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1005
+
1006
+ existingValue = elementaryBlock4x2bAccum(
1007
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1008
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
1009
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
1010
+ weight0,
1011
+ existingValue);
1012
+
1013
+ existingValue = elementaryBlock4x2bAccum(
1014
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1015
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
1016
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
1017
+ weight1,
1018
+ existingValue);
1019
+
1020
+ existingValue = elementaryBlock4x2bAccum(
1021
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1022
+ pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
1023
+ pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
1024
+ weight2,
1025
+ existingValue);
1026
+
1027
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1028
+
1029
+ // next: hand the same arguments to the instantiation for CPOS + 8
1030
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1031
+ pqCoarseCentroids, pqFineCentroids,
1032
+ code0, weight0,
1033
+ code1, weight1,
1034
+ code2, weight2,
1035
+ outputAccum);
1036
+
1037
+ // clang-format on
1038
+ }
1039
+ };
1040
+
1041
+ template <
1042
+ intptr_t DIM,
1043
+ intptr_t COARSE_SIZE,
1044
+ intptr_t FINE_SIZE,
1045
+ intptr_t COARSE_BITS,
1046
+ intptr_t FINE_BITS,
1047
+ intptr_t CPOS>
1048
+ struct Index2LevelDecoderImpl<
1049
+ DIM,
1050
+ COARSE_SIZE,
1051
+ FINE_SIZE,
1052
+ COARSE_BITS,
1053
+ FINE_BITS,
1054
+ CPOS,
1055
+ false,
1056
+ false,
1057
+ true,
1058
+ true,
1059
+ false> {
1060
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
1061
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
1062
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
1063
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
1064
+
1065
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
1066
+
1067
+ // coarse quantizer storage: entries per coarse table (2^COARSE_BITS centroids, not bytes)
1068
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
1069
+
1070
+ // coarse quantizer bytes start from 0
1071
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
1072
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
1073
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
1074
+ N_COARSE_ELEMENTS * COARSE_BITS;
1075
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
1076
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
1077
+
1078
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
1079
+
1080
+ // process 1 sample
1081
+ static void store(
1082
+ const float* const __restrict pqCoarseCentroids0,
1083
+ const float* const __restrict pqFineCentroids0,
1084
+ const uint8_t* const __restrict code0,
1085
+ float* const __restrict outputStore) {
1086
+ // coarse quantizer: coarse codes are read from the start of the code buffer
1087
+ const uint8_t* const __restrict coarse0 = code0;
1088
+
1089
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into the code buffer
1090
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1091
+
1092
+ // clang-format off
1093
+
1094
+ // process chunks, 8 float: one coarse code and one fine code per 8-float store
1095
+
1096
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1097
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1098
+
1099
+ const __m256 storeValue = elementaryBlock8x1b(
1100
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1101
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
1102
+
1103
+ _mm256_storeu_ps(outputStore + CPOS, storeValue);
1104
+
1105
+ // next: hand the same arguments to the instantiation for CPOS + 8
1106
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::store(
1107
+ pqCoarseCentroids0, pqFineCentroids0, code0,
1108
+ outputStore);
1109
+
1110
+ // clang-format on
1111
+ }
1112
+
1113
+ // process 1 sample
1114
+ static void accum(
1115
+ const float* const __restrict pqCoarseCentroids0,
1116
+ const float* const __restrict pqFineCentroids0,
1117
+ const uint8_t* const __restrict code0,
1118
+ const float weight0,
1119
+ float* const __restrict outputAccum) {
1120
+ // coarse quantizer: coarse codes are read from the start of the code buffer
1121
+ const uint8_t* const __restrict coarse0 = code0;
1122
+
1123
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES bytes into the code buffer
1124
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1125
+
1126
+ // clang-format off
1127
+
1128
+ // process chunks, 8 float: one coarse code and one fine code per 8-float update
1129
+
1130
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1131
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1132
+
1133
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1134
+
1135
+ existingValue = elementaryBlock8x1bAccum(
1136
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1137
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1138
+ weight0,
1139
+ existingValue);
1140
+
1141
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1142
+
1143
+ // next: hand the same arguments to the instantiation for CPOS + 8
1144
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1145
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1146
+ outputAccum);
1147
+
1148
+ // clang-format on
1149
+ }
1150
+
1151
+ // Process 2 samples: both codes are folded into the 8 floats at outputAccum + CPOS through elementaryBlock8x1bAccum, code0 first, then code1.
1152
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1153
+ // table.
1154
+ static void accum(
1155
+ const float* const __restrict pqCoarseCentroids0,
1156
+ const float* const __restrict pqFineCentroids0,
1157
+ const uint8_t* const __restrict code0,
1158
+ const float weight0,
1159
+ const float* const __restrict pqCoarseCentroids1,
1160
+ const float* const __restrict pqFineCentroids1,
1161
+ const uint8_t* const __restrict code1,
1162
+ const float weight1,
1163
+ float* const __restrict outputAccum) {
1164
+ // coarse quantizer: codes sit at the very start of each code
1165
+ const uint8_t* const __restrict coarse0 = code0;
1166
+ const uint8_t* const __restrict coarse1 = code1;
1167
+
1168
+ // fine quantizer: codes begin N_COARSE_ELEMENTS_BYTES bytes into each code
1169
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1170
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1171
+
1172
+ // clang-format off
1173
+
1174
+ // process chunks, 8 float: read the coarse and fine code values covering position CPOS for every sample
1175
+
1176
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1177
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1178
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1179
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1180
+
1181
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1182
+
1183
+ existingValue = elementaryBlock8x1bAccum(
1184
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1185
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1186
+ weight0,
1187
+ existingValue);
1188
+
1189
+ existingValue = elementaryBlock8x1bAccum(
1190
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1191
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1192
+ weight1,
1193
+ existingValue);
1194
+
1195
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1196
+
1197
+ // next: pass the same arguments on to the instantiation for CPOS + 8
1198
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1199
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1200
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
1201
+ outputAccum);
1202
+
1203
+ // clang-format on
1204
+ }
1205
+
1206
+ // Process 2 samples: both codes are folded into the 8 floats at outputAccum + CPOS through elementaryBlock8x1bAccum, code0 first, then code1.
1207
+ // Coarse pq centroids table and fine pq centroids table are shared among
1208
+ // codes.
1209
+ static void accum(
1210
+ const float* const __restrict pqCoarseCentroids,
1211
+ const float* const __restrict pqFineCentroids,
1212
+ const uint8_t* const __restrict code0,
1213
+ const float weight0,
1214
+ const uint8_t* const __restrict code1,
1215
+ const float weight1,
1216
+ float* const __restrict outputAccum) {
1217
+ // coarse quantizer: codes sit at the very start of each code
1218
+ const uint8_t* const __restrict coarse0 = code0;
1219
+ const uint8_t* const __restrict coarse1 = code1;
1220
+
1221
+ // fine quantizer: codes begin N_COARSE_ELEMENTS_BYTES bytes into each code
1222
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1223
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1224
+
1225
+ // clang-format off
1226
+
1227
+ // process chunks, 8 float: read the coarse and fine code values covering position CPOS for every sample
1228
+
1229
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1230
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1231
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1232
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1233
+
1234
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1235
+
1236
+ existingValue = elementaryBlock8x1bAccum(
1237
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1238
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1239
+ weight0,
1240
+ existingValue);
1241
+
1242
+ existingValue = elementaryBlock8x1bAccum(
1243
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1244
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1245
+ weight1,
1246
+ existingValue);
1247
+
1248
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1249
+
1250
+ // next: pass the same arguments on to the instantiation for CPOS + 8
1251
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1252
+ pqCoarseCentroids, pqFineCentroids,
1253
+ code0, weight0,
1254
+ code1, weight1,
1255
+ outputAccum);
1256
+
1257
+ // clang-format on
1258
+ }
1259
+
1260
+ // Process 3 samples: all three codes are folded into the 8 floats at outputAccum + CPOS through elementaryBlock8x1bAccum, in the order code0, code1, code2.
1260
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1261
+ // table.
1262
+ static void accum(
1263
+ const float* const __restrict pqCoarseCentroids0,
1264
+ const float* const __restrict pqFineCentroids0,
1265
+ const uint8_t* const __restrict code0,
1266
+ const float weight0,
1267
+ const float* const __restrict pqCoarseCentroids1,
1268
+ const float* const __restrict pqFineCentroids1,
1269
+ const uint8_t* const __restrict code1,
1270
+ const float weight1,
1271
+ const float* const __restrict pqCoarseCentroids2,
1272
+ const float* const __restrict pqFineCentroids2,
1273
+ const uint8_t* const __restrict code2,
1274
+ const float weight2,
1275
+ float* const __restrict outputAccum) {
1276
+ // coarse quantizer: codes sit at the very start of each code
1277
+ const uint8_t* const __restrict coarse0 = code0;
1278
+ const uint8_t* const __restrict coarse1 = code1;
1279
+ const uint8_t* const __restrict coarse2 = code2;
1280
+
1281
+ // fine quantizer: codes begin N_COARSE_ELEMENTS_BYTES bytes into each code
1282
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1283
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1284
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1285
+
1286
+ // clang-format off
1287
+
1288
+ // process chunks, 8 float: read the coarse and fine code values covering position CPOS for every sample
1289
+
1290
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1291
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1292
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1293
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1294
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1295
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1296
+
1297
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1298
+
1299
+ existingValue = elementaryBlock8x1bAccum(
1300
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1301
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1302
+ weight0,
1303
+ existingValue);
1304
+
1305
+ existingValue = elementaryBlock8x1bAccum(
1306
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1307
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1308
+ weight1,
1309
+ existingValue);
1310
+
1311
+ existingValue = elementaryBlock8x1bAccum(
1312
+ pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1313
+ pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1314
+ weight2,
1315
+ existingValue);
1316
+
1317
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1318
+
1319
+ // next: pass the same arguments on to the instantiation for CPOS + 8
1320
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1321
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1322
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
1323
+ pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
1324
+ outputAccum);
1325
+
1326
+ // clang-format on
1327
+ }
1329
+
1330
+ // Process 3 samples: all three codes are folded into the 8 floats at outputAccum + CPOS through elementaryBlock8x1bAccum, in the order code0, code1, code2.
1330
+ // Coarse pq centroids table and fine pq centroids table are shared among
1331
+ // codes.
1332
+ static void accum(
1333
+ const float* const __restrict pqCoarseCentroids,
1334
+ const float* const __restrict pqFineCentroids,
1335
+ const uint8_t* const __restrict code0,
1336
+ const float weight0,
1337
+ const uint8_t* const __restrict code1,
1338
+ const float weight1,
1339
+ const uint8_t* const __restrict code2,
1340
+ const float weight2,
1341
+ float* const __restrict outputAccum) {
1342
+ // coarse quantizer: codes sit at the very start of each code
1343
+ const uint8_t* const __restrict coarse0 = code0;
1344
+ const uint8_t* const __restrict coarse1 = code1;
1345
+ const uint8_t* const __restrict coarse2 = code2;
1346
+
1347
+ // fine quantizer: codes begin N_COARSE_ELEMENTS_BYTES bytes into each code
1348
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1349
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1350
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1351
+
1352
+ // clang-format off
1353
+
1354
+ // process chunks, 8 float: read the coarse and fine code values covering position CPOS for every sample
1355
+
1356
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1357
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1358
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1359
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1360
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1361
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1362
+
1363
+ __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
1364
+
1365
+ existingValue = elementaryBlock8x1bAccum(
1366
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1367
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1368
+ weight0,
1369
+ existingValue);
1370
+
1371
+ existingValue = elementaryBlock8x1bAccum(
1372
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1373
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1374
+ weight1,
1375
+ existingValue);
1376
+
1377
+ existingValue = elementaryBlock8x1bAccum(
1378
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1379
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1380
+ weight2,
1381
+ existingValue);
1382
+
1383
+ _mm256_storeu_ps(outputAccum + CPOS, existingValue);
1384
+
1385
+ // next: pass the same arguments on to the instantiation for CPOS + 8
1386
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
1387
+ pqCoarseCentroids, pqFineCentroids,
1388
+ code0, weight0,
1389
+ code1, weight1,
1390
+ code2, weight2,
1391
+ outputAccum);
1392
+
1393
+ // clang-format on
1394
+ }
1396
+ };
1397
+
1398
+ template <
1399
+ intptr_t DIM,
1400
+ intptr_t COARSE_SIZE,
1401
+ intptr_t FINE_SIZE,
1402
+ intptr_t COARSE_BITS,
1403
+ intptr_t FINE_BITS,
1404
+ intptr_t CPOS>
1405
+ struct Index2LevelDecoderImpl<
1406
+ DIM,
1407
+ COARSE_SIZE,
1408
+ FINE_SIZE,
1409
+ COARSE_BITS,
1410
+ FINE_BITS,
1411
+ CPOS,
1412
+ false,
1413
+ false,
1414
+ false,
1415
+ true,
1416
+ false> {
1417
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
1418
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
1419
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
1420
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
1421
+
1422
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
1423
+
1424
+ // coarse quantizer storage: (1 << COARSE_BITS) entries per coarse sub-table; despite the _BYTES suffix it is used below as an entry count when indexing the float table
1425
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
1426
+
1427
+ // coarse quantizer bytes start from 0
1428
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES (the coarse code bits rounded up to whole bytes)
1429
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
1430
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
1431
+ N_COARSE_ELEMENTS * COARSE_BITS;
1432
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
1433
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
1434
+
1435
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
1436
+
1437
+ // Process 1 sample: decode the 4 floats at offset CPOS from code0, write them to outputStore + CPOS, then continue with CPOS + 4.
1438
+ static void store(
1439
+ const float* const __restrict pqCoarseCentroids0,
1440
+ const float* const __restrict pqFineCentroids0,
1441
+ const uint8_t* const __restrict code0,
1442
+ float* const __restrict outputStore) {
1443
+ // coarse quantizer: its codes sit at the very start of the code
1444
+ const uint8_t* const __restrict coarse0 = code0;
1445
+
1446
+ // fine quantizer: its codes begin N_COARSE_ELEMENTS_BYTES bytes into the code
1447
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1448
+
1449
+ // clang-format off
1450
+
1451
+ // process chunks, 4 float: read the coarse and fine code values that cover position CPOS
1452
+
1453
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1454
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1455
+
1456
+ const __m128 storeValue = elementaryBlock4x1b(
1457
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1458
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset);
1459
+
1460
+ _mm_storeu_ps(outputStore + CPOS, storeValue);
1461
+
1462
+ // next: pass the same arguments on to the instantiation for CPOS + 4
1463
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::store(
1464
+ pqCoarseCentroids0, pqFineCentroids0, code0,
1465
+ outputStore);
1466
+
1467
+ // clang-format on
1468
+ }
1469
+
1470
+ // Process 1 sample: update the 4 floats at outputAccum + CPOS in place through elementaryBlock4x1bAccum with weight0, then continue with CPOS + 4.
1471
+ static void accum(
1472
+ const float* const __restrict pqCoarseCentroids0,
1473
+ const float* const __restrict pqFineCentroids0,
1474
+ const uint8_t* const __restrict code0,
1475
+ const float weight0,
1476
+ float* const __restrict outputAccum) {
1477
+ // coarse quantizer: its codes sit at the very start of the code
1478
+ const uint8_t* const __restrict coarse0 = code0;
1479
+
1480
+ // fine quantizer: its codes begin N_COARSE_ELEMENTS_BYTES bytes into the code
1481
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1482
+
1483
+ // clang-format off
1484
+
1485
+ // process chunks, 4 float: read the coarse and fine code values that cover position CPOS
1486
+
1487
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1488
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS,fineCentroidIdx>::get(fine0);
1489
+
1490
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1491
+
1492
+ existingValue = elementaryBlock4x1bAccum(
1493
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1494
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1495
+ weight0,
1496
+ existingValue);
1497
+
1498
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1499
+
1500
+ // next: pass the same arguments on to the instantiation for CPOS + 4
1501
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1502
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1503
+ outputAccum);
1504
+
1505
+ // clang-format on
1506
+ }
1507
+
1508
+ // Process 2 samples: both codes are folded into the 4 floats at outputAccum + CPOS through elementaryBlock4x1bAccum, code0 first, then code1.
1509
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1510
+ // table.
1511
+ static void accum(
1512
+ const float* const __restrict pqCoarseCentroids0,
1513
+ const float* const __restrict pqFineCentroids0,
1514
+ const uint8_t* const __restrict code0,
1515
+ const float weight0,
1516
+ const float* const __restrict pqCoarseCentroids1,
1517
+ const float* const __restrict pqFineCentroids1,
1518
+ const uint8_t* const __restrict code1,
1519
+ const float weight1,
1520
+ float* const __restrict outputAccum) {
1521
+ // coarse quantizer: codes sit at the very start of each code
1522
+ const uint8_t* const __restrict coarse0 = code0;
1523
+ const uint8_t* const __restrict coarse1 = code1;
1524
+
1525
+ // fine quantizer: codes begin N_COARSE_ELEMENTS_BYTES bytes into each code
1526
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1527
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1528
+
1529
+ // clang-format off
1530
+
1531
+ // process chunks, 4 float: read the coarse and fine code values covering position CPOS for every sample
1532
+
1533
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1534
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1535
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1536
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1537
+
1538
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1539
+
1540
+ existingValue = elementaryBlock4x1bAccum(
1541
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1542
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1543
+ weight0,
1544
+ existingValue);
1545
+
1546
+ existingValue = elementaryBlock4x1bAccum(
1547
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1548
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1549
+ weight1,
1550
+ existingValue);
1551
+
1552
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1553
+
1554
+ // next: pass the same arguments on to the instantiation for CPOS + 4
1555
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1556
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1557
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
1558
+ outputAccum);
1559
+
1560
+ // clang-format on
1561
+ }
1562
+
1563
+ // Process 2 samples: both codes are folded into the 4 floats at outputAccum + CPOS through elementaryBlock4x1bAccum, code0 first, then code1.
1564
+ // Coarse pq centroids table and fine pq centroids table are shared among
1565
+ // codes.
1566
+ static void accum(
1567
+ const float* const __restrict pqCoarseCentroids,
1568
+ const float* const __restrict pqFineCentroids,
1569
+ const uint8_t* const __restrict code0,
1570
+ const float weight0,
1571
+ const uint8_t* const __restrict code1,
1572
+ const float weight1,
1573
+ float* const __restrict outputAccum) {
1574
+ // coarse quantizer: codes sit at the very start of each code
1575
+ const uint8_t* const __restrict coarse0 = code0;
1576
+ const uint8_t* const __restrict coarse1 = code1;
1577
+
1578
+ // fine quantizer: codes begin N_COARSE_ELEMENTS_BYTES bytes into each code
1579
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1580
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1581
+
1582
+ // clang-format off
1583
+
1584
+ // process chunks, 4 float: read the coarse and fine code values covering position CPOS for every sample
1585
+
1586
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1587
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1588
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1589
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1590
+
1591
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1592
+
1593
+ existingValue = elementaryBlock4x1bAccum(
1594
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1595
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1596
+ weight0,
1597
+ existingValue);
1598
+
1599
+ existingValue = elementaryBlock4x1bAccum(
1600
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1601
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1602
+ weight1,
1603
+ existingValue);
1604
+
1605
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1606
+
1607
+ // next: pass the same arguments on to the instantiation for CPOS + 4
1608
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1609
+ pqCoarseCentroids, pqFineCentroids,
1610
+ code0, weight0,
1611
+ code1, weight1,
1612
+ outputAccum);
1613
+
1614
+ // clang-format on
1615
+ }
1616
+
1617
+ // Process 3 samples: all three codes are folded into the 4 floats at outputAccum + CPOS through elementaryBlock4x1bAccum, in the order code0, code1, code2.
1618
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1619
+ // table.
1620
+ static void accum(
1621
+ const float* const __restrict pqCoarseCentroids0,
1622
+ const float* const __restrict pqFineCentroids0,
1623
+ const uint8_t* const __restrict code0,
1624
+ const float weight0,
1625
+ const float* const __restrict pqCoarseCentroids1,
1626
+ const float* const __restrict pqFineCentroids1,
1627
+ const uint8_t* const __restrict code1,
1628
+ const float weight1,
1629
+ const float* const __restrict pqCoarseCentroids2,
1630
+ const float* const __restrict pqFineCentroids2,
1631
+ const uint8_t* const __restrict code2,
1632
+ const float weight2,
1633
+ float* const __restrict outputAccum) {
1634
+ // coarse quantizer: codes sit at the very start of each code
1635
+ const uint8_t* const __restrict coarse0 = code0;
1636
+ const uint8_t* const __restrict coarse1 = code1;
1637
+ const uint8_t* const __restrict coarse2 = code2;
1638
+
1639
+ // fine quantizer: codes begin N_COARSE_ELEMENTS_BYTES bytes into each code
1640
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1641
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1642
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1643
+
1644
+ // clang-format off
1645
+
1646
+ // process chunks, 4 float: read the coarse and fine code values covering position CPOS for every sample
1647
+
1648
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1649
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1650
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1651
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1652
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1653
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1654
+
1655
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1656
+
1657
+ existingValue = elementaryBlock4x1bAccum(
1658
+ pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1659
+ pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1660
+ weight0,
1661
+ existingValue);
1662
+
1663
+ existingValue = elementaryBlock4x1bAccum(
1664
+ pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1665
+ pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1666
+ weight1,
1667
+ existingValue);
1668
+
1669
+ existingValue = elementaryBlock4x1bAccum(
1670
+ pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1671
+ pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1672
+ weight2,
1673
+ existingValue);
1674
+
1675
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1676
+
1677
+ // next: pass the same arguments on to the instantiation for CPOS + 4
1678
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1679
+ pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
1680
+ pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
1681
+ pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
1682
+ outputAccum);
1683
+
1684
+ // clang-format on
1685
+ }
1686
+
1687
+ // Process 3 samples: all three codes are folded into the 4 floats at outputAccum + CPOS through elementaryBlock4x1bAccum, in the order code0, code1, code2.
1688
+ // Coarse pq centroids table and fine pq centroids table are shared among
1689
+ // codes.
1690
+ static void accum(
1691
+ const float* const __restrict pqCoarseCentroids,
1692
+ const float* const __restrict pqFineCentroids,
1693
+ const uint8_t* const __restrict code0,
1694
+ const float weight0,
1695
+ const uint8_t* const __restrict code1,
1696
+ const float weight1,
1697
+ const uint8_t* const __restrict code2,
1698
+ const float weight2,
1699
+ float* const __restrict outputAccum) {
1700
+ // coarse quantizer: codes sit at the very start of each code
1701
+ const uint8_t* const __restrict coarse0 = code0;
1702
+ const uint8_t* const __restrict coarse1 = code1;
1703
+ const uint8_t* const __restrict coarse2 = code2;
1704
+
1705
+ // fine quantizer: codes begin N_COARSE_ELEMENTS_BYTES bytes into each code
1706
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1707
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1708
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1709
+
1710
+ // clang-format off
1711
+
1712
+ // process chunks, 4 float: read the coarse and fine code values covering position CPOS for every sample
1713
+
1714
+ const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
1715
+ const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
1716
+ const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
1717
+ const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
1718
+ const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
1719
+ const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
1720
+
1721
+ __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
1722
+
1723
+ existingValue = elementaryBlock4x1bAccum(
1724
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
1725
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
1726
+ weight0,
1727
+ existingValue);
1728
+
1729
+ existingValue = elementaryBlock4x1bAccum(
1730
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
1731
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
1732
+ weight1,
1733
+ existingValue);
1734
+
1735
+ existingValue = elementaryBlock4x1bAccum(
1736
+ pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
1737
+ pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
1738
+ weight2,
1739
+ existingValue);
1740
+
1741
+ _mm_storeu_ps(outputAccum + CPOS, existingValue);
1742
+
1743
+ // next: pass the same arguments on to the instantiation for CPOS + 4
1744
+ Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
1745
+ pqCoarseCentroids, pqFineCentroids,
1746
+ code0, weight0,
1747
+ code1, weight1,
1748
+ code2, weight2,
1749
+ outputAccum);
1750
+
1751
+ // clang-format on
1752
+ }
1753
+ };
1754
+
1755
+ // This partial specialization is expected to do nothing: its position argument is DIM and every method body is empty, so a chain of CPOS + N calls that reaches DIM stops here.
1756
+ template <
1757
+ intptr_t DIM,
1758
+ intptr_t COARSE_SIZE,
1759
+ intptr_t FINE_SIZE,
1760
+ intptr_t COARSE_BITS,
1761
+ intptr_t FINE_BITS,
1762
+ bool FINE_SIZE_EQ_2,
1763
+ bool FINE_SIZE_EQ_4,
1764
+ bool QPOS_LEFT_GE_8,
1765
+ bool QPOS_LEFT_GE_4>
1766
+ struct Index2LevelDecoderImpl<
1767
+ DIM,
1768
+ COARSE_SIZE,
1769
+ FINE_SIZE,
1770
+ COARSE_BITS,
1771
+ FINE_BITS,
1772
+ DIM,
1773
+ FINE_SIZE_EQ_2,
1774
+ FINE_SIZE_EQ_4,
1775
+ QPOS_LEFT_GE_8,
1776
+ QPOS_LEFT_GE_4,
1777
+ true> {
1778
+ // clang-format off
1779
+
1780
+ // Process 1 sample (store overload): intentionally empty, outputStore is left untouched.
1781
+ static void store(
1782
+ const float* const __restrict pqCoarseCentroids0,
1783
+ const float* const __restrict pqFineCentroids0,
1784
+ const uint8_t* const __restrict code0,
1785
+ float* const __restrict outputStore) {}
1786
+
1787
+ // Process 1 sample (accum overload): intentionally empty, outputAccum is left untouched.
1788
+ static void accum(
1789
+ const float* const __restrict pqCoarseCentroids0,
1790
+ const float* const __restrict pqFineCentroids0,
1791
+ const uint8_t* const __restrict code0,
1792
+ const float weight0,
1793
+ float* const __restrict outputAccum) {}
1794
+
1795
+ // Process 2 samples: intentionally empty, outputAccum is left untouched.
1796
+ // Each code uses its own coarse pq centroids table and fine pq centroids table.
1797
+ static void accum(
1798
+ const float* const __restrict pqCoarseCentroids0,
1799
+ const float* const __restrict pqFineCentroids0,
1800
+ const uint8_t* const __restrict code0,
1801
+ const float weight0,
1802
+ const float* const __restrict pqCoarseCentroids1,
1803
+ const float* const __restrict pqFineCentroids1,
1804
+ const uint8_t* const __restrict code1,
1805
+ const float weight1,
1806
+ float* const __restrict outputAccum) {}
1807
+
1808
+ // Process 2 samples: intentionally empty, outputAccum is left untouched.
1809
+ // Coarse pq centroids table and fine pq centroids table are shared among codes.
1810
+ static void accum(
1811
+ const float* const __restrict pqCoarseCentroids,
1812
+ const float* const __restrict pqFineCentroids,
1813
+ const uint8_t* const __restrict code0,
1814
+ const float weight0,
1815
+ const uint8_t* const __restrict code1,
1816
+ const float weight1,
1817
+ float* const __restrict outputAccum) {}
1818
+
1819
+ // Process 3 samples: intentionally empty, outputAccum is left untouched.
1820
+ // Each code uses its own coarse pq centroids table and fine pq centroids table.
1821
+ static void accum(
1822
+ const float* const __restrict pqCoarseCentroids0,
1823
+ const float* const __restrict pqFineCentroids0,
1824
+ const uint8_t* const __restrict code0,
1825
+ const float weight0,
1826
+ const float* const __restrict pqCoarseCentroids1,
1827
+ const float* const __restrict pqFineCentroids1,
1828
+ const uint8_t* const __restrict code1,
1829
+ const float weight1,
1830
+ const float* const __restrict pqCoarseCentroids2,
1831
+ const float* const __restrict pqFineCentroids2,
1832
+ const uint8_t* const __restrict code2,
1833
+ const float weight2,
1834
+ float* const __restrict outputAccum) {}
1835
+
1836
+ // Process 3 samples: intentionally empty, outputAccum is left untouched.
1837
+ // Coarse pq centroids table and fine pq centroids table are shared among codes.
1838
+ static void accum(
1839
+ const float* const __restrict pqCoarseCentroids,
1840
+ const float* const __restrict pqFineCentroids,
1841
+ const uint8_t* const __restrict code0,
1842
+ const float weight0,
1843
+ const uint8_t* const __restrict code1,
1844
+ const float weight1,
1845
+ const uint8_t* const __restrict code2,
1846
+ const float weight2,
1847
+ float* const __restrict outputAccum) {}
1848
+
1849
+ // clang-format on
1850
+ };
1851
+ } // namespace
1852
+
1853
+ // Suitable for IVF256,PQ[1]x8
1854
+ // Suitable for Residual[1]x8,PQ[2]x8
1855
+ // Suitable for IVF[9-16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
1856
+ // Suitable for Residual[1]x[9-16 bit],PQ[2]x[3] (such as Residual2x9,PQ8)
1857
+ template < // Front-end decoder template; all work is delegated to Index2LevelDecoderImpl with the same five arguments plus a trailing 0
1858
+ intptr_t DIM,
1859
+ intptr_t COARSE_SIZE,
1860
+ intptr_t FINE_SIZE,
1861
+ intptr_t COARSE_BITS = 8, // defaults to 8; restricted to 8, 10 or 16 by the static_assert in the struct body
1862
+ intptr_t FINE_BITS = 8> // defaults to 8; restricted to 8, 10 or 16 by the static_assert in the struct body
1863
+ struct Index2LevelDecoder { // Declares only static members: five constants, one store() and six accum() overloads
1864
+ static_assert( // compile-time rejection of any coarse code width other than 8, 10 or 16
1865
+ COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 16,
1866
+ "Only 8, 10 or 16 bits are currently supported for COARSE_BITS");
1867
+ static_assert( // same restriction applied to the fine code width
1868
+ FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16,
1869
+ "Only 8, 10 or 16 bits are currently supported for FINE_BITS");
1870
+
1871
+ static constexpr intptr_t dim = DIM; // this and the four constants below re-export the template arguments under lower-case names
1872
+ static constexpr intptr_t coarseSize = COARSE_SIZE;
1873
+ static constexpr intptr_t fineSize = FINE_SIZE;
1874
+ static constexpr intptr_t coarseBits = COARSE_BITS;
1875
+ static constexpr intptr_t fineBits = FINE_BITS;
1876
+
1877
+ // Process 1 sample. Forwards all four arguments unchanged to Index2LevelDecoderImpl<..., 0>::store.
1878
+ static void store(
1879
+ const float* const __restrict pqCoarseCentroids,
1880
+ const float* const __restrict pqFineCentroids,
1881
+ const uint8_t* const __restrict code,
1882
+ float* const __restrict outputStore) {
1883
+ Index2LevelDecoderImpl<
1884
+ DIM,
1885
+ COARSE_SIZE,
1886
+ FINE_SIZE,
1887
+ COARSE_BITS,
1888
+ FINE_BITS,
1889
+ 0>:: // fixed last template argument of the Impl. NOTE(review): presumably the starting position of its compile-time recursion - confirm against the Impl definition
1890
+ store(pqCoarseCentroids, pqFineCentroids, code, outputStore);
1891
+ }
1892
+
1893
+ // Process 1 sample. Forwards all five arguments unchanged to Index2LevelDecoderImpl<..., 0>::accum.
1894
+ // Performs outputAccum += weight * decoded(code)
1895
+ static void accum(
1896
+ const float* const __restrict pqCoarseCentroids,
1897
+ const float* const __restrict pqFineCentroids,
1898
+ const uint8_t* const __restrict code,
1899
+ const float weight,
1900
+ float* const __restrict outputAccum) {
1901
+ Index2LevelDecoderImpl<
1902
+ DIM,
1903
+ COARSE_SIZE,
1904
+ FINE_SIZE,
1905
+ COARSE_BITS,
1906
+ FINE_BITS,
1907
+ 0>::
1908
+ accum(pqCoarseCentroids,
1909
+ pqFineCentroids,
1910
+ code,
1911
+ weight,
1912
+ outputAccum);
1913
+ }
1914
+
1915
+ // Process 2 samples. Forwards all nine arguments, in order, to the matching Impl accum overload.
1916
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1917
+ // table.
1918
+ //
1919
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1920
+ // decoded(code1).
1921
+ static void accum(
1922
+ const float* const __restrict pqCoarseCentroids0,
1923
+ const float* const __restrict pqFineCentroids0,
1924
+ const uint8_t* const __restrict code0,
1925
+ const float weight0,
1926
+ const float* const __restrict pqCoarseCentroids1,
1927
+ const float* const __restrict pqFineCentroids1,
1928
+ const uint8_t* const __restrict code1,
1929
+ const float weight1,
1930
+ float* const __restrict outputAccum) {
1931
+ Index2LevelDecoderImpl<
1932
+ DIM,
1933
+ COARSE_SIZE,
1934
+ FINE_SIZE,
1935
+ COARSE_BITS,
1936
+ FINE_BITS,
1937
+ 0>::
1938
+ accum(pqCoarseCentroids0,
1939
+ pqFineCentroids0,
1940
+ code0,
1941
+ weight0,
1942
+ pqCoarseCentroids1,
1943
+ pqFineCentroids1,
1944
+ code1,
1945
+ weight1,
1946
+ outputAccum);
1947
+ }
1948
+
1949
+ // Process 2 samples. Forwards all seven arguments, in order, to the matching Impl accum overload.
1950
+ // Coarse pq centroids table and fine pq centroids table are shared among
1951
+ // codes.
1952
+ //
1953
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1954
+ // decoded(code1)
1955
+ static void accum(
1956
+ const float* const __restrict pqCoarseCentroids,
1957
+ const float* const __restrict pqFineCentroids,
1958
+ const uint8_t* const __restrict code0,
1959
+ const float weight0,
1960
+ const uint8_t* const __restrict code1,
1961
+ const float weight1,
1962
+ float* const __restrict outputAccum) {
1963
+ Index2LevelDecoderImpl<
1964
+ DIM,
1965
+ COARSE_SIZE,
1966
+ FINE_SIZE,
1967
+ COARSE_BITS,
1968
+ FINE_BITS,
1969
+ 0>::
1970
+ accum(pqCoarseCentroids,
1971
+ pqFineCentroids,
1972
+ code0,
1973
+ weight0,
1974
+ code1,
1975
+ weight1,
1976
+ outputAccum);
1977
+ }
1978
+
1979
+ // Process 3 samples. Forwards all thirteen arguments, in order, to the matching Impl accum overload.
1980
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1981
+ // table.
1982
+ //
1983
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1984
+ // decoded(code1) + weight2 * decoded(code2)
1985
+ static void accum(
1986
+ const float* const __restrict pqCoarseCentroids0,
1987
+ const float* const __restrict pqFineCentroids0,
1988
+ const uint8_t* const __restrict code0,
1989
+ const float weight0,
1990
+ const float* const __restrict pqCoarseCentroids1,
1991
+ const float* const __restrict pqFineCentroids1,
1992
+ const uint8_t* const __restrict code1,
1993
+ const float weight1,
1994
+ const float* const __restrict pqCoarseCentroids2,
1995
+ const float* const __restrict pqFineCentroids2,
1996
+ const uint8_t* const __restrict code2,
1997
+ const float weight2,
1998
+ float* const __restrict outputAccum) {
1999
+ Index2LevelDecoderImpl<
2000
+ DIM,
2001
+ COARSE_SIZE,
2002
+ FINE_SIZE,
2003
+ COARSE_BITS,
2004
+ FINE_BITS,
2005
+ 0>::
2006
+ accum(pqCoarseCentroids0,
2007
+ pqFineCentroids0,
2008
+ code0,
2009
+ weight0,
2010
+ pqCoarseCentroids1,
2011
+ pqFineCentroids1,
2012
+ code1,
2013
+ weight1,
2014
+ pqCoarseCentroids2,
2015
+ pqFineCentroids2,
2016
+ code2,
2017
+ weight2,
2018
+ outputAccum);
2019
+ }
2020
+
2021
+ // Process 3 samples. Forwards all nine arguments, in order, to the matching Impl accum overload.
2022
+ // Coarse pq centroids table and fine pq centroids table are shared among
2023
+ // codes.
2024
+ //
2025
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2026
+ // decoded(code1) + weight2 * decoded(code2)
2027
+ static void accum(
2028
+ const float* const __restrict pqCoarseCentroids,
2029
+ const float* const __restrict pqFineCentroids,
2030
+ const uint8_t* const __restrict code0,
2031
+ const float weight0,
2032
+ const uint8_t* const __restrict code1,
2033
+ const float weight1,
2034
+ const uint8_t* const __restrict code2,
2035
+ const float weight2,
2036
+ float* const __restrict outputAccum) {
2037
+ Index2LevelDecoderImpl<
2038
+ DIM,
2039
+ COARSE_SIZE,
2040
+ FINE_SIZE,
2041
+ COARSE_BITS,
2042
+ FINE_BITS,
2043
+ 0>::
2044
+ accum(pqCoarseCentroids,
2045
+ pqFineCentroids,
2046
+ code0,
2047
+ weight0,
2048
+ code1,
2049
+ weight1,
2050
+ code2,
2051
+ weight2,
2052
+ outputAccum);
2053
+ }
2054
+ };
2055
+
2056
+ } // namespace cppcontrib
2057
+ } // namespace faiss
2058
+ #endif // LEVEL2_AVX2_INL_H