faiss 0.2.3 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (189) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +9 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +23 -21
  5. data/ext/faiss/extconf.rb +11 -0
  6. data/ext/faiss/index.cpp +4 -4
  7. data/ext/faiss/index_binary.cpp +6 -6
  8. data/ext/faiss/product_quantizer.cpp +4 -4
  9. data/lib/faiss/version.rb +1 -1
  10. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  11. data/vendor/faiss/faiss/Clustering.cpp +32 -0
  12. data/vendor/faiss/faiss/Clustering.h +14 -0
  13. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  14. data/vendor/faiss/faiss/IVFlib.h +26 -2
  15. data/vendor/faiss/faiss/Index.cpp +36 -3
  16. data/vendor/faiss/faiss/Index.h +43 -6
  17. data/vendor/faiss/faiss/Index2Layer.cpp +24 -93
  18. data/vendor/faiss/faiss/Index2Layer.h +8 -17
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +610 -0
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +253 -0
  21. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  22. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  23. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  24. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  25. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  26. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  27. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  28. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  29. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  30. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  31. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  32. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  33. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  34. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  35. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  36. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  37. data/vendor/faiss/faiss/IndexFlat.cpp +52 -69
  38. data/vendor/faiss/faiss/IndexFlat.h +16 -19
  39. data/vendor/faiss/faiss/IndexFlatCodes.cpp +101 -0
  40. data/vendor/faiss/faiss/IndexFlatCodes.h +59 -0
  41. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  42. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  43. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  44. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  45. data/vendor/faiss/faiss/IndexIVF.cpp +200 -40
  46. data/vendor/faiss/faiss/IndexIVF.h +59 -22
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +393 -0
  48. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +183 -0
  49. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  50. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  51. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  52. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  53. data/vendor/faiss/faiss/IndexIVFFlat.cpp +43 -26
  54. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  55. data/vendor/faiss/faiss/IndexIVFPQ.cpp +238 -53
  56. data/vendor/faiss/faiss/IndexIVFPQ.h +6 -2
  57. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  58. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  59. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  60. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  61. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +63 -40
  62. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +23 -7
  63. data/vendor/faiss/faiss/IndexLSH.cpp +8 -32
  64. data/vendor/faiss/faiss/IndexLSH.h +4 -16
  65. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  66. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  67. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -5
  68. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  69. data/vendor/faiss/faiss/IndexNSG.cpp +37 -5
  70. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  71. data/vendor/faiss/faiss/IndexPQ.cpp +108 -120
  72. data/vendor/faiss/faiss/IndexPQ.h +21 -22
  73. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  74. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  75. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  76. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  77. data/vendor/faiss/faiss/IndexRefine.cpp +36 -4
  78. data/vendor/faiss/faiss/IndexRefine.h +14 -2
  79. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  80. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  81. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  82. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  83. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +28 -43
  84. data/vendor/faiss/faiss/IndexScalarQuantizer.h +8 -23
  85. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  86. data/vendor/faiss/faiss/IndexShards.h +2 -1
  87. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  88. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  89. data/vendor/faiss/faiss/VectorTransform.cpp +45 -1
  90. data/vendor/faiss/faiss/VectorTransform.h +25 -4
  91. data/vendor/faiss/faiss/clone_index.cpp +26 -3
  92. data/vendor/faiss/faiss/clone_index.h +3 -0
  93. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  94. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  95. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  101. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  102. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  103. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  104. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  105. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +2 -6
  106. data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
  107. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  108. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  109. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  110. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  111. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  112. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  113. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  114. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  115. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  116. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  117. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  118. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  119. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  120. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  121. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  122. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  123. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  124. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +331 -29
  125. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +110 -19
  126. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  127. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  128. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  129. data/vendor/faiss/faiss/impl/HNSW.cpp +133 -32
  130. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  131. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  132. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  133. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +378 -217
  134. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +106 -29
  135. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  136. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  137. data/vendor/faiss/faiss/impl/NSG.cpp +1 -4
  138. data/vendor/faiss/faiss/impl/NSG.h +1 -1
  139. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  140. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  141. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  142. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  143. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  144. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +521 -55
  145. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +94 -16
  146. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  147. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +108 -191
  148. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  149. data/vendor/faiss/faiss/impl/index_read.cpp +338 -24
  150. data/vendor/faiss/faiss/impl/index_write.cpp +300 -18
  151. data/vendor/faiss/faiss/impl/io.cpp +1 -1
  152. data/vendor/faiss/faiss/impl/io_macros.h +20 -0
  153. data/vendor/faiss/faiss/impl/kmeans1d.cpp +303 -0
  154. data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
  155. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  156. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  157. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  158. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  159. data/vendor/faiss/faiss/index_factory.cpp +772 -412
  160. data/vendor/faiss/faiss/index_factory.h +3 -0
  161. data/vendor/faiss/faiss/index_io.h +5 -0
  162. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  163. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  164. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  165. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  166. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  167. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  168. data/vendor/faiss/faiss/utils/distances.cpp +384 -58
  169. data/vendor/faiss/faiss/utils/distances.h +149 -18
  170. data/vendor/faiss/faiss/utils/distances_simd.cpp +776 -6
  171. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  172. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  173. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  174. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  175. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  176. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  177. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  178. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  179. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  180. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  181. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  182. data/vendor/faiss/faiss/utils/random.h +5 -0
  183. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  184. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  185. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  186. data/vendor/faiss/faiss/utils/utils.h +1 -1
  187. metadata +46 -5
  188. data/vendor/faiss/faiss/IndexResidual.cpp +0 -291
  189. data/vendor/faiss/faiss/IndexResidual.h +0 -152
@@ -0,0 +1,2147 @@
1
+ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2
+ #ifndef LEVEL2_NEON_INL_H
3
+ #define LEVEL2_NEON_INL_H
4
+
5
+ #include <arm_neon.h>
6
+
7
+ #include <cstddef>
8
+ #include <cstdint>
9
+
10
+ #include <faiss/cppcontrib/detail/UintReader.h>
11
+
12
+ namespace faiss {
13
+ namespace cppcontrib {
14
+
15
+ namespace {
16
+
17
+ // Lane-wise sum of the 4 floats at `coarse` and the 4 floats at `fine`.
18
+ // Returns {
19
+ // [0..3] = coarse[0..3] + fine[0..3];
20
+ // }
21
+ inline float32x4_t elementaryBlock4x1b(
22
+ const float* const __restrict coarse,
23
+ const float* const __restrict fine) {
24
+ // load 4 consecutive floats starting at fine
25
+ const auto fineValue = vld1q_f32(fine);
26
+ // load 4 consecutive floats starting at coarse
27
+ const auto coarseValue = vld1q_f32(coarse);
28
+
29
+ // lane-wise coarse + fine
30
+ return vaddq_f32(fineValue, coarseValue);
31
+ }
32
+
33
+ // Weighted accumulation of 4 floats: adds weight * (coarse + fine) to existingValue.
34
+ // Returns {
35
+ // [0..3] = existingValue[0..3] + weight * (coarse[0..3] + fine[0..3]);
36
+ // }
37
+ inline float32x4_t elementaryBlock4x1bAccum(
38
+ const float* const __restrict coarse,
39
+ const float* const __restrict fine,
40
+ const float weight,
41
+ const float32x4_t existingValue) {
42
+ // lane-wise coarse + fine, via the non-accumulating helper
43
+ const auto combinedValue = elementaryBlock4x1b(coarse, fine);
44
+
45
+ // broadcast the scalar weight to all 4 lanes; the author expects the compiler to optimize this
46
+ const auto weightNeon = vdupq_n_f32(weight);
47
+ // fused multiply-add: existingValue + weightNeon * combinedValue
48
+ return vfmaq_f32(existingValue, weightNeon, combinedValue);
49
+ }
50
+
51
+ // Sums 8 floats at `coarse` with two separate 4-float runs at `fine0` and `fine1`.
52
+ // Returns {
53
+ // [0..3] = coarse[0..3] + fine0[0..3];
54
+ // [4..7] = coarse[4..7] + fine1[0..3];
55
+ // }
56
+ inline float32x4x2_t elementaryBlock4x2b(
57
+ const float* const __restrict coarse,
58
+ const float* const __restrict fine0,
59
+ const float* const __restrict fine1) {
60
+ // load 4 floats from each of the two fine pointers
61
+ const auto fineValue0 = vld1q_f32(fine0);
62
+ const auto fineValue1 = vld1q_f32(fine1);
63
+ // load 8 consecutive floats from coarse, as two 4-float halves
64
+ const auto coarseValue0 = vld1q_f32(coarse);
65
+ const auto coarseValue1 = vld1q_f32(coarse + 4);
66
+
67
+ // lane-wise coarse + fine for each half
68
+ const auto result0 = vaddq_f32(fineValue0, coarseValue0);
69
+ const auto result1 = vaddq_f32(fineValue1, coarseValue1);
70
+
71
+ return {result0, result1};
72
+ }
73
+
74
+ // Weighted accumulation of 8 floats: 8 floats at `coarse`, 4 each at `fine0` and `fine1`.
75
+ // Returns {
76
+ // [0..3] = existingValue[0..3] + weight * (coarse[0..3] + fine0[0..3]);
77
+ // [4..7] = existingValue[4..7] + weight * (coarse[4..7] + fine1[0..3]);
78
+ // }
79
+ inline float32x4x2_t elementaryBlock4x2bAccum(
80
+ const float* const __restrict coarse,
81
+ const float* const __restrict fine0,
82
+ const float* const __restrict fine1,
83
+ const float weight,
84
+ const float32x4x2_t existingValue) {
85
+ // lane-wise coarse + fine for both halves, via the non-accumulating helper
86
+ const auto combinedValue = elementaryBlock4x2b(coarse, fine0, fine1);
87
+
88
+ // broadcast the scalar weight to all 4 lanes; the author expects the compiler to optimize this
89
+ const auto weightNeon = vdupq_n_f32(weight);
90
+ // fused multiply-add per half: existingValue.val[i] + weightNeon * combinedValue.val[i]
91
+ const auto result0 =
92
+ vfmaq_f32(existingValue.val[0], weightNeon, combinedValue.val[0]);
93
+ const auto result1 =
94
+ vfmaq_f32(existingValue.val[1], weightNeon, combinedValue.val[1]);
95
+ return {result0, result1};
96
+ }
97
+
98
+ // Lane-wise sum of the 8 consecutive floats at `coarse` and the 8 consecutive floats at `fine`.
99
+ // Returns {
100
+ // [0..7] = coarse[0..7] + fine[0..7];
101
+ // }
102
+ inline float32x4x2_t elementaryBlock8x1b(
103
+ const float* const __restrict coarse,
104
+ const float* const __restrict fine) {
105
+ // load 8 consecutive floats from fine, as two 4-float halves
106
+ const auto fineValue0 = vld1q_f32(fine);
107
+ const auto fineValue1 = vld1q_f32(fine + 4);
108
+ // load 8 consecutive floats from coarse, as two 4-float halves
109
+ const auto coarseValue0 = vld1q_f32(coarse);
110
+ const auto coarseValue1 = vld1q_f32(coarse + 4);
111
+
112
+ // lane-wise coarse + fine for each half
113
+ return {vaddq_f32(fineValue0, coarseValue0),
114
+ vaddq_f32(fineValue1, coarseValue1)};
115
+ }
116
+
117
+ // Weighted accumulation of 8 floats: adds weight * (coarse + fine) to existingValue.
118
+ // Returns {
119
+ // [0..7] = existingValue[0..7] + weight * (coarse[0..7] + fine[0..7]);
120
+ // }
121
+ inline float32x4x2_t elementaryBlock8x1bAccum(
122
+ const float* const __restrict coarse,
123
+ const float* const __restrict fine,
124
+ const float weight,
125
+ const float32x4x2_t existingValue) {
126
+ // lane-wise coarse + fine for both halves, via the non-accumulating helper
127
+ const auto combinedValue = elementaryBlock8x1b(coarse, fine);
128
+
129
+ // broadcast the scalar weight to all 4 lanes; the author expects the compiler to optimize this
130
+ const auto weightNeon = vdupq_n_f32(weight);
131
+ // fused multiply-add per half: existingValue.val[i] + weightNeon * combinedValue.val[i]
132
+ const auto result0 =
133
+ vfmaq_f32(existingValue.val[0], weightNeon, combinedValue.val[0]);
134
+ const auto result1 =
135
+ vfmaq_f32(existingValue.val[1], weightNeon, combinedValue.val[1]);
136
+ return {result0, result1};
137
+ }
138
+
139
+ // The following code unrolls its loops through recursive template
140
+ // instantiation, because the compiler does not reliably unroll them itself.
141
+ // The idea is the following:
142
+ // template<int I, int MAX>
143
+ // struct Foo {
144
+ // static void bar() {
145
+ // doSomething(I);
146
+ // Foo<I + 1, MAX>::bar();
147
+ // }
148
+ // };
149
+ //
150
+ // template<int MAX>
151
+ // struct Foo<MAX, MAX> {
152
+ // static void bar() {}
153
+ // };
154
+ //
155
+ // Initiate the loop:
156
+ // Foo<0, MAX>::bar();
157
+
158
+ template <
159
+ intptr_t DIM, // NOTE(review): presumably the number of floats per decoded vector -- confirm
160
+ intptr_t COARSE_SIZE, // NOTE(review): presumably floats per coarse sub-vector -- confirm
161
+ intptr_t FINE_SIZE, // NOTE(review): presumably floats per fine sub-vector -- confirm
162
+ intptr_t COARSE_BITS, // NOTE(review): presumably bits per coarse code -- confirm
163
+ intptr_t FINE_BITS, // NOTE(review): presumably bits per fine code -- confirm
164
+ intptr_t CPOS, // compile-time position; the specialization below recurses with CPOS + 8
165
+ bool FINE_SIZE_EQ_4 = FINE_SIZE == 4, // derived flag; callers are not expected to pass it
166
+ bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8), // derived: at least 8 floats left in the current fine sub-vector
167
+ bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4), // derived: at least 4 floats left in the current fine sub-vector
168
+ bool DIM_EQ_CPOS = DIM == CPOS> // derived: true once CPOS reaches DIM (presumably the recursion terminator -- confirm)
169
+ struct Index2LevelDecoderImpl; // primary template is only declared; partial specializations select on the derived flags
170
+
171
+ template <
172
+ intptr_t DIM,
173
+ intptr_t COARSE_SIZE,
174
+ intptr_t COARSE_BITS,
175
+ intptr_t FINE_BITS,
176
+ intptr_t CPOS,
177
+ bool QPOS_LEFT_GE_8,
178
+ bool QPOS_LEFT_GE_4>
179
+ struct Index2LevelDecoderImpl<
180
+ DIM,
181
+ COARSE_SIZE,
182
+ 4,
183
+ COARSE_BITS,
184
+ FINE_BITS,
185
+ CPOS,
186
+ true,
187
+ QPOS_LEFT_GE_8,
188
+ QPOS_LEFT_GE_4,
189
+ false> {
190
+ static constexpr intptr_t FINE_SIZE = 4;
191
+
192
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
193
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
194
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
195
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
196
+
197
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
198
+
199
+ // coarse quantizer storage
200
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
201
+
202
+ // coarse quantizer bytes start from 0
203
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
204
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
205
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
206
+ N_COARSE_ELEMENTS * COARSE_BITS;
207
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
208
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
209
+
210
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
211
+
212
+ // process 1 sample
213
+ static void store(
214
+ const float* const __restrict pqCoarseCentroids0,
215
+ const float* const __restrict pqFineCentroids0,
216
+ const uint8_t* const __restrict code0,
217
+ float* const __restrict outputStore) {
218
+ // coarse quantizer
219
+ const uint8_t* const __restrict coarse0 = code0;
220
+
221
+ // fine quantizer
222
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
223
+
224
+ // process chunks, 4 float
225
+ // but 8 floats per loop
226
+
227
+ const intptr_t coarseCode0 = detail::
228
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
229
+ get(coarse0);
230
+ const intptr_t fineCode0a = detail::
231
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
232
+ fine0);
233
+ const intptr_t fineCode0b = detail::
234
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
235
+ fine0);
236
+
237
+ const auto storeValue = elementaryBlock4x2b(
238
+ pqCoarseCentroids0 +
239
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
240
+ COARSE_SIZE +
241
+ coarseCentroidOffset,
242
+ pqFineCentroids0 +
243
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
244
+ fineCode0a) *
245
+ FINE_SIZE +
246
+ fineCentroidOffset,
247
+ pqFineCentroids0 +
248
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
249
+ fineCode0b) *
250
+ FINE_SIZE +
251
+ fineCentroidOffset);
252
+
253
+ vst1q_f32(outputStore + CPOS, storeValue.val[0]);
254
+ vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]);
255
+
256
+ // next
257
+ Index2LevelDecoderImpl<
258
+ DIM,
259
+ COARSE_SIZE,
260
+ FINE_SIZE,
261
+ COARSE_BITS,
262
+ FINE_BITS,
263
+ CPOS + 8>::
264
+ store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore);
265
+ }
266
+
267
+ // process 1 sample
268
+ static void accum(
269
+ const float* const __restrict pqCoarseCentroids0,
270
+ const float* const __restrict pqFineCentroids0,
271
+ const uint8_t* const __restrict code0,
272
+ const float weight0,
273
+ float* const __restrict outputAccum) {
274
+ // coarse quantizer
275
+ const uint8_t* const __restrict coarse0 = code0;
276
+
277
+ // fine quantizer
278
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
279
+
280
+ // process chunks, 4 float
281
+ // but 8 floats per loop
282
+
283
+ const intptr_t coarseCode0 = detail::
284
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
285
+ get(coarse0);
286
+ const intptr_t fineCode0a = detail::
287
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
288
+ fine0);
289
+ const intptr_t fineCode0b = detail::
290
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
291
+ fine0);
292
+
293
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
294
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
295
+
296
+ auto existingValue = elementaryBlock4x2bAccum(
297
+ pqCoarseCentroids0 +
298
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
299
+ COARSE_SIZE +
300
+ coarseCentroidOffset,
301
+ pqFineCentroids0 +
302
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
303
+ fineCode0a) *
304
+ FINE_SIZE +
305
+ fineCentroidOffset,
306
+ pqFineCentroids0 +
307
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
308
+ fineCode0b) *
309
+ FINE_SIZE +
310
+ fineCentroidOffset,
311
+ weight0,
312
+ {existingValue0, existingValue1});
313
+
314
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
315
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
316
+
317
+ // next
318
+ Index2LevelDecoderImpl<
319
+ DIM,
320
+ COARSE_SIZE,
321
+ FINE_SIZE,
322
+ COARSE_BITS,
323
+ FINE_BITS,
324
+ CPOS + 8>::
325
+ accum(pqCoarseCentroids0,
326
+ pqFineCentroids0,
327
+ code0,
328
+ weight0,
329
+ outputAccum);
330
+ }
331
+
332
+ // Process 2 samples.
333
+ // Each code uses its own coarse pq centroids table and fine pq centroids
334
+ // table.
335
+ static void accum(
336
+ const float* const __restrict pqCoarseCentroids0,
337
+ const float* const __restrict pqFineCentroids0,
338
+ const uint8_t* const __restrict code0,
339
+ const float weight0,
340
+ const float* const __restrict pqCoarseCentroids1,
341
+ const float* const __restrict pqFineCentroids1,
342
+ const uint8_t* const __restrict code1,
343
+ const float weight1,
344
+ float* const __restrict outputAccum) {
345
+ // coarse quantizer
346
+ const uint8_t* const __restrict coarse0 = code0;
347
+ const uint8_t* const __restrict coarse1 = code1;
348
+
349
+ // fine quantizer
350
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
351
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
352
+
353
+ // process chunks, 4 float
354
+ // but 8 floats per loop
355
+
356
+ const intptr_t coarseCode0 = detail::
357
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
358
+ get(coarse0);
359
+ const intptr_t fineCode0a = detail::
360
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
361
+ fine0);
362
+ const intptr_t fineCode0b = detail::
363
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
364
+ fine0);
365
+ const intptr_t coarseCode1 = detail::
366
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
367
+ get(coarse1);
368
+ const intptr_t fineCode1a = detail::
369
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
370
+ fine1);
371
+ const intptr_t fineCode1b = detail::
372
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
373
+ fine1);
374
+
375
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
376
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
377
+
378
+ auto existingValue = elementaryBlock4x2bAccum(
379
+ pqCoarseCentroids0 +
380
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
381
+ COARSE_SIZE +
382
+ coarseCentroidOffset,
383
+ pqFineCentroids0 +
384
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
385
+ fineCode0a) *
386
+ FINE_SIZE +
387
+ fineCentroidOffset,
388
+ pqFineCentroids0 +
389
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
390
+ fineCode0b) *
391
+ FINE_SIZE +
392
+ fineCentroidOffset,
393
+ weight0,
394
+ {existingValue0, existingValue1});
395
+
396
+ existingValue = elementaryBlock4x2bAccum(
397
+ pqCoarseCentroids1 +
398
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
399
+ COARSE_SIZE +
400
+ coarseCentroidOffset,
401
+ pqFineCentroids1 +
402
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
403
+ fineCode1a) *
404
+ FINE_SIZE +
405
+ fineCentroidOffset,
406
+ pqFineCentroids1 +
407
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
408
+ fineCode1b) *
409
+ FINE_SIZE +
410
+ fineCentroidOffset,
411
+ weight1,
412
+ existingValue);
413
+
414
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
415
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
416
+
417
+ // next
418
+ Index2LevelDecoderImpl<
419
+ DIM,
420
+ COARSE_SIZE,
421
+ FINE_SIZE,
422
+ COARSE_BITS,
423
+ FINE_BITS,
424
+ CPOS + 8>::
425
+ accum(pqCoarseCentroids0,
426
+ pqFineCentroids0,
427
+ code0,
428
+ weight0,
429
+ pqCoarseCentroids1,
430
+ pqFineCentroids1,
431
+ code1,
432
+ weight1,
433
+ outputAccum);
434
+ }
435
+
436
+ // Process 2 samples.
437
+ // Coarse pq centroids table and fine pq centroids table are shared among
438
+ // codes.
439
+ static void accum(
440
+ const float* const __restrict pqCoarseCentroids,
441
+ const float* const __restrict pqFineCentroids,
442
+ const uint8_t* const __restrict code0,
443
+ const float weight0,
444
+ const uint8_t* const __restrict code1,
445
+ const float weight1,
446
+ float* const __restrict outputAccum) {
447
+ // coarse quantizer
448
+ const uint8_t* const __restrict coarse0 = code0;
449
+ const uint8_t* const __restrict coarse1 = code1;
450
+
451
+ // fine quantizer
452
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
453
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
454
+
455
+ // process chunks, 4 float
456
+ // but 8 floats per loop
457
+
458
+ const intptr_t coarseCode0 = detail::
459
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
460
+ get(coarse0);
461
+ const intptr_t fineCode0a = detail::
462
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
463
+ fine0);
464
+ const intptr_t fineCode0b = detail::
465
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
466
+ fine0);
467
+ const intptr_t coarseCode1 = detail::
468
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
469
+ get(coarse1);
470
+ const intptr_t fineCode1a = detail::
471
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
472
+ fine1);
473
+ const intptr_t fineCode1b = detail::
474
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
475
+ fine1);
476
+
477
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
478
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
479
+
480
+ auto existingValue = elementaryBlock4x2bAccum(
481
+ pqCoarseCentroids +
482
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
483
+ COARSE_SIZE +
484
+ coarseCentroidOffset,
485
+ pqFineCentroids +
486
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
487
+ fineCode0a) *
488
+ FINE_SIZE +
489
+ fineCentroidOffset,
490
+ pqFineCentroids +
491
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
492
+ fineCode0b) *
493
+ FINE_SIZE +
494
+ fineCentroidOffset,
495
+ weight0,
496
+ {existingValue0, existingValue1});
497
+
498
+ existingValue = elementaryBlock4x2bAccum(
499
+ pqCoarseCentroids +
500
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
501
+ COARSE_SIZE +
502
+ coarseCentroidOffset,
503
+ pqFineCentroids +
504
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
505
+ fineCode1a) *
506
+ FINE_SIZE +
507
+ fineCentroidOffset,
508
+ pqFineCentroids +
509
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
510
+ fineCode1b) *
511
+ FINE_SIZE +
512
+ fineCentroidOffset,
513
+ weight1,
514
+ existingValue);
515
+
516
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
517
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
518
+
519
+ // next
520
+ Index2LevelDecoderImpl<
521
+ DIM,
522
+ COARSE_SIZE,
523
+ FINE_SIZE,
524
+ COARSE_BITS,
525
+ FINE_BITS,
526
+ CPOS + 8>::
527
+ accum(pqCoarseCentroids,
528
+ pqFineCentroids,
529
+ code0,
530
+ weight0,
531
+ code1,
532
+ weight1,
533
+ outputAccum);
534
+ }
535
+
536
+ // Process 3 samples.
537
+ // Each code uses its own coarse pq centroids table and fine pq centroids
538
+ // table.
539
+ static void accum(
540
+ const float* const __restrict pqCoarseCentroids0,
541
+ const float* const __restrict pqFineCentroids0,
542
+ const uint8_t* const __restrict code0,
543
+ const float weight0,
544
+ const float* const __restrict pqCoarseCentroids1,
545
+ const float* const __restrict pqFineCentroids1,
546
+ const uint8_t* const __restrict code1,
547
+ const float weight1,
548
+ const float* const __restrict pqCoarseCentroids2,
549
+ const float* const __restrict pqFineCentroids2,
550
+ const uint8_t* const __restrict code2,
551
+ const float weight2,
552
+ float* const __restrict outputAccum) {
553
+ // coarse quantizer
554
+ const uint8_t* const __restrict coarse0 = code0;
555
+ const uint8_t* const __restrict coarse1 = code1;
556
+ const uint8_t* const __restrict coarse2 = code2;
557
+
558
+ // fine quantizer
559
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
560
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
561
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
562
+
563
+ // process chunks, 4 float
564
+ // but 8 floats per loop
565
+
566
+ const intptr_t coarseCode0 = detail::
567
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
568
+ get(coarse0);
569
+ const intptr_t fineCode0a = detail::
570
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
571
+ fine0);
572
+ const intptr_t fineCode0b = detail::
573
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
574
+ fine0);
575
+ const intptr_t coarseCode1 = detail::
576
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
577
+ get(coarse1);
578
+ const intptr_t fineCode1a = detail::
579
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
580
+ fine1);
581
+ const intptr_t fineCode1b = detail::
582
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
583
+ fine1);
584
+ const intptr_t coarseCode2 = detail::
585
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
586
+ get(coarse2);
587
+ const intptr_t fineCode2a = detail::
588
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
589
+ fine2);
590
+ const intptr_t fineCode2b = detail::
591
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
592
+ fine2);
593
+
594
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
595
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
596
+
597
+ auto existingValue = elementaryBlock4x2bAccum(
598
+ pqCoarseCentroids0 +
599
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
600
+ COARSE_SIZE +
601
+ coarseCentroidOffset,
602
+ pqFineCentroids0 +
603
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
604
+ fineCode0a) *
605
+ FINE_SIZE +
606
+ fineCentroidOffset,
607
+ pqFineCentroids0 +
608
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
609
+ fineCode0b) *
610
+ FINE_SIZE +
611
+ fineCentroidOffset,
612
+ weight0,
613
+ {existingValue0, existingValue1});
614
+
615
+ existingValue = elementaryBlock4x2bAccum(
616
+ pqCoarseCentroids1 +
617
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
618
+ COARSE_SIZE +
619
+ coarseCentroidOffset,
620
+ pqFineCentroids1 +
621
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
622
+ fineCode1a) *
623
+ FINE_SIZE +
624
+ fineCentroidOffset,
625
+ pqFineCentroids1 +
626
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
627
+ fineCode1b) *
628
+ FINE_SIZE +
629
+ fineCentroidOffset,
630
+ weight1,
631
+ existingValue);
632
+
633
+ existingValue = elementaryBlock4x2bAccum(
634
+ pqCoarseCentroids2 +
635
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
636
+ COARSE_SIZE +
637
+ coarseCentroidOffset,
638
+ pqFineCentroids2 +
639
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
640
+ fineCode2a) *
641
+ FINE_SIZE +
642
+ fineCentroidOffset,
643
+ pqFineCentroids2 +
644
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
645
+ fineCode2b) *
646
+ FINE_SIZE +
647
+ fineCentroidOffset,
648
+ weight2,
649
+ existingValue);
650
+
651
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
652
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
653
+
654
+ // next
655
+ Index2LevelDecoderImpl<
656
+ DIM,
657
+ COARSE_SIZE,
658
+ FINE_SIZE,
659
+ COARSE_BITS,
660
+ FINE_BITS,
661
+ CPOS + 8>::
662
+ accum(pqCoarseCentroids0,
663
+ pqFineCentroids0,
664
+ code0,
665
+ weight0,
666
+ pqCoarseCentroids1,
667
+ pqFineCentroids1,
668
+ code1,
669
+ weight1,
670
+ pqCoarseCentroids2,
671
+ pqFineCentroids2,
672
+ code2,
673
+ weight2,
674
+ outputAccum);
675
+ }
676
+
677
+ // Process 3 samples.
678
+ // Coarse pq centroids table and fine pq centroids table are shared among
679
+ // codes.
680
+ static void accum(
681
+ const float* const __restrict pqCoarseCentroids,
682
+ const float* const __restrict pqFineCentroids,
683
+ const uint8_t* const __restrict code0,
684
+ const float weight0,
685
+ const uint8_t* const __restrict code1,
686
+ const float weight1,
687
+ const uint8_t* const __restrict code2,
688
+ const float weight2,
689
+ float* const __restrict outputAccum) {
690
+ // coarse quantizer
691
+ const uint8_t* const __restrict coarse0 = code0;
692
+ const uint8_t* const __restrict coarse1 = code1;
693
+ const uint8_t* const __restrict coarse2 = code2;
694
+
695
+ // fine quantizer
696
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
697
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
698
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
699
+
700
+ // process chunks, 4 float
701
+ // but 8 floats per loop
702
+
703
+ const intptr_t coarseCode0 = detail::
704
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
705
+ get(coarse0);
706
+ const intptr_t fineCode0a = detail::
707
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
708
+ fine0);
709
+ const intptr_t fineCode0b = detail::
710
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
711
+ fine0);
712
+ const intptr_t coarseCode1 = detail::
713
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
714
+ get(coarse1);
715
+ const intptr_t fineCode1a = detail::
716
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
717
+ fine1);
718
+ const intptr_t fineCode1b = detail::
719
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
720
+ fine1);
721
+ const intptr_t coarseCode2 = detail::
722
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
723
+ get(coarse2);
724
+ const intptr_t fineCode2a = detail::
725
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
726
+ fine2);
727
+ const intptr_t fineCode2b = detail::
728
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
729
+ fine2);
730
+
731
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
732
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
733
+
734
+ auto existingValue = elementaryBlock4x2bAccum(
735
+ pqCoarseCentroids +
736
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
737
+ COARSE_SIZE +
738
+ coarseCentroidOffset,
739
+ pqFineCentroids +
740
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
741
+ fineCode0a) *
742
+ FINE_SIZE +
743
+ fineCentroidOffset,
744
+ pqFineCentroids +
745
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
746
+ fineCode0b) *
747
+ FINE_SIZE +
748
+ fineCentroidOffset,
749
+ weight0,
750
+ {existingValue0, existingValue1});
751
+
752
+ existingValue = elementaryBlock4x2bAccum(
753
+ pqCoarseCentroids +
754
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
755
+ COARSE_SIZE +
756
+ coarseCentroidOffset,
757
+ pqFineCentroids +
758
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
759
+ fineCode1a) *
760
+ FINE_SIZE +
761
+ fineCentroidOffset,
762
+ pqFineCentroids +
763
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
764
+ fineCode1b) *
765
+ FINE_SIZE +
766
+ fineCentroidOffset,
767
+ weight1,
768
+ existingValue);
769
+
770
+ existingValue = elementaryBlock4x2bAccum(
771
+ pqCoarseCentroids +
772
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
773
+ COARSE_SIZE +
774
+ coarseCentroidOffset,
775
+ pqFineCentroids +
776
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
777
+ fineCode2a) *
778
+ FINE_SIZE +
779
+ fineCentroidOffset,
780
+ pqFineCentroids +
781
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
782
+ fineCode2b) *
783
+ FINE_SIZE +
784
+ fineCentroidOffset,
785
+ weight2,
786
+ existingValue);
787
+
788
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
789
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
790
+
791
+ // next
792
+ Index2LevelDecoderImpl<
793
+ DIM,
794
+ COARSE_SIZE,
795
+ FINE_SIZE,
796
+ COARSE_BITS,
797
+ FINE_BITS,
798
+ CPOS + 8>::
799
+ accum(pqCoarseCentroids,
800
+ pqFineCentroids,
801
+ code0,
802
+ weight0,
803
+ code1,
804
+ weight1,
805
+ code2,
806
+ weight2,
807
+ outputAccum);
808
+ }
809
+ };
810
+
811
+ template <
812
+ intptr_t DIM,
813
+ intptr_t COARSE_SIZE,
814
+ intptr_t FINE_SIZE,
815
+ intptr_t COARSE_BITS,
816
+ intptr_t FINE_BITS,
817
+ intptr_t CPOS>
818
+ struct Index2LevelDecoderImpl<
819
+ DIM,
820
+ COARSE_SIZE,
821
+ FINE_SIZE,
822
+ COARSE_BITS,
823
+ FINE_BITS,
824
+ CPOS,
825
+ false,
826
+ true,
827
+ true,
828
+ false> {
829
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
830
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
831
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
832
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
833
+
834
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
835
+
836
+ // coarse quantizer storage
837
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
838
+
839
+ // coarse quantizer bytes start from 0
840
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
841
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
842
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
843
+ N_COARSE_ELEMENTS * COARSE_BITS;
844
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
845
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
846
+
847
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
848
+
849
+ // process 1 sample
850
+ static void store(
851
+ const float* const __restrict pqCoarseCentroids0,
852
+ const float* const __restrict pqFineCentroids0,
853
+ const uint8_t* const __restrict code0,
854
+ float* const __restrict outputStore) {
855
+ // coarse quantizer
856
+ const uint8_t* const __restrict coarse0 = code0;
857
+
858
+ // fine quantizer
859
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
860
+
861
+ // process chunks, 8 float
862
+
863
+ const intptr_t coarseCode0 = detail::
864
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
865
+ get(coarse0);
866
+ const intptr_t fineCode0 =
867
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
868
+ get(fine0);
869
+
870
+ const auto storeValue = elementaryBlock8x1b(
871
+ pqCoarseCentroids0 +
872
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
873
+ COARSE_SIZE +
874
+ coarseCentroidOffset,
875
+ pqFineCentroids0 +
876
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
877
+ FINE_SIZE +
878
+ fineCentroidOffset);
879
+
880
+ vst1q_f32(outputStore + CPOS, storeValue.val[0]);
881
+ vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]);
882
+
883
+ // next
884
+ Index2LevelDecoderImpl<
885
+ DIM,
886
+ COARSE_SIZE,
887
+ FINE_SIZE,
888
+ COARSE_BITS,
889
+ FINE_BITS,
890
+ CPOS + 8>::
891
+ store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore);
892
+ }
893
+
894
+ // process 1 sample
895
+ static void accum(
896
+ const float* const __restrict pqCoarseCentroids0,
897
+ const float* const __restrict pqFineCentroids0,
898
+ const uint8_t* const __restrict code0,
899
+ const float weight0,
900
+ float* const __restrict outputAccum) {
901
+ // coarse quantizer
902
+ const uint8_t* const __restrict coarse0 = code0;
903
+
904
+ // fine quantizer
905
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
906
+
907
+ // process chunks, 8 float
908
+
909
+ const intptr_t coarseCode0 = detail::
910
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
911
+ get(coarse0);
912
+ const intptr_t fineCode0 =
913
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
914
+ get(fine0);
915
+
916
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
917
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
918
+
919
+ const auto existingValue = elementaryBlock8x1bAccum(
920
+ pqCoarseCentroids0 +
921
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
922
+ COARSE_SIZE +
923
+ coarseCentroidOffset,
924
+ pqFineCentroids0 +
925
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
926
+ FINE_SIZE +
927
+ fineCentroidOffset,
928
+ weight0,
929
+ {existingValue0, existingValue1});
930
+
931
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
932
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
933
+
934
+ // next
935
+ Index2LevelDecoderImpl<
936
+ DIM,
937
+ COARSE_SIZE,
938
+ FINE_SIZE,
939
+ COARSE_BITS,
940
+ FINE_BITS,
941
+ CPOS + 8>::
942
+ accum(pqCoarseCentroids0,
943
+ pqFineCentroids0,
944
+ code0,
945
+ weight0,
946
+ outputAccum);
947
+ }
948
+
949
+ // Process 2 samples.
950
+ // Each code uses its own coarse pq centroids table and fine pq centroids
951
+ // table.
952
+ static void accum(
953
+ const float* const __restrict pqCoarseCentroids0,
954
+ const float* const __restrict pqFineCentroids0,
955
+ const uint8_t* const __restrict code0,
956
+ const float weight0,
957
+ const float* const __restrict pqCoarseCentroids1,
958
+ const float* const __restrict pqFineCentroids1,
959
+ const uint8_t* const __restrict code1,
960
+ const float weight1,
961
+ float* const __restrict outputAccum) {
962
+ // coarse quantizer
963
+ const uint8_t* const __restrict coarse0 = code0;
964
+ const uint8_t* const __restrict coarse1 = code1;
965
+
966
+ // fine quantizer
967
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
968
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
969
+
970
+ // process chunks, 8 float
971
+
972
+ const intptr_t coarseCode0 = detail::
973
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
974
+ get(coarse0);
975
+ const intptr_t fineCode0 =
976
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
977
+ get(fine0);
978
+ const intptr_t coarseCode1 = detail::
979
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
980
+ get(coarse1);
981
+ const intptr_t fineCode1 =
982
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
983
+ get(fine1);
984
+
985
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
986
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
987
+
988
+ auto existingValue = elementaryBlock8x1bAccum(
989
+ pqCoarseCentroids0 +
990
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
991
+ COARSE_SIZE +
992
+ coarseCentroidOffset,
993
+ pqFineCentroids0 +
994
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
995
+ FINE_SIZE +
996
+ fineCentroidOffset,
997
+ weight0,
998
+ {existingValue0, existingValue1});
999
+
1000
+ existingValue = elementaryBlock8x1bAccum(
1001
+ pqCoarseCentroids1 +
1002
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1003
+ COARSE_SIZE +
1004
+ coarseCentroidOffset,
1005
+ pqFineCentroids1 +
1006
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1007
+ FINE_SIZE +
1008
+ fineCentroidOffset,
1009
+ weight1,
1010
+ existingValue);
1011
+
1012
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
1013
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
1014
+
1015
+ // next
1016
+ Index2LevelDecoderImpl<
1017
+ DIM,
1018
+ COARSE_SIZE,
1019
+ FINE_SIZE,
1020
+ COARSE_BITS,
1021
+ FINE_BITS,
1022
+ CPOS + 8>::
1023
+ accum(pqCoarseCentroids0,
1024
+ pqFineCentroids0,
1025
+ code0,
1026
+ weight0,
1027
+ pqCoarseCentroids1,
1028
+ pqFineCentroids1,
1029
+ code1,
1030
+ weight1,
1031
+ outputAccum);
1032
+ }
1033
+
1034
+ // Process 2 samples.
1035
+ // Coarse pq centroids table and fine pq centroids table are shared among
1036
+ // codes.
1037
+ static void accum(
1038
+ const float* const __restrict pqCoarseCentroids,
1039
+ const float* const __restrict pqFineCentroids,
1040
+ const uint8_t* const __restrict code0,
1041
+ const float weight0,
1042
+ const uint8_t* const __restrict code1,
1043
+ const float weight1,
1044
+ float* const __restrict outputAccum) {
1045
+ // coarse quantizer
1046
+ const uint8_t* const __restrict coarse0 = code0;
1047
+ const uint8_t* const __restrict coarse1 = code1;
1048
+
1049
+ // fine quantizer
1050
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1051
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1052
+
1053
+ // process chunks, 8 float
1054
+
1055
+ const intptr_t coarseCode0 = detail::
1056
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1057
+ get(coarse0);
1058
+ const intptr_t fineCode0 =
1059
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1060
+ get(fine0);
1061
+ const intptr_t coarseCode1 = detail::
1062
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1063
+ get(coarse1);
1064
+ const intptr_t fineCode1 =
1065
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1066
+ get(fine1);
1067
+
1068
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
1069
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
1070
+
1071
+ auto existingValue = elementaryBlock8x1bAccum(
1072
+ pqCoarseCentroids +
1073
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1074
+ COARSE_SIZE +
1075
+ coarseCentroidOffset,
1076
+ pqFineCentroids +
1077
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1078
+ FINE_SIZE +
1079
+ fineCentroidOffset,
1080
+ weight0,
1081
+ {existingValue0, existingValue1});
1082
+
1083
+ existingValue = elementaryBlock8x1bAccum(
1084
+ pqCoarseCentroids +
1085
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1086
+ COARSE_SIZE +
1087
+ coarseCentroidOffset,
1088
+ pqFineCentroids +
1089
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1090
+ FINE_SIZE +
1091
+ fineCentroidOffset,
1092
+ weight1,
1093
+ existingValue);
1094
+
1095
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
1096
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
1097
+
1098
+ // next
1099
+ Index2LevelDecoderImpl<
1100
+ DIM,
1101
+ COARSE_SIZE,
1102
+ FINE_SIZE,
1103
+ COARSE_BITS,
1104
+ FINE_BITS,
1105
+ CPOS + 8>::
1106
+ accum(pqCoarseCentroids,
1107
+ pqFineCentroids,
1108
+ code0,
1109
+ weight0,
1110
+ code1,
1111
+ weight1,
1112
+ outputAccum);
1113
+ }
1114
+
1115
+ // Process 3 samples.
1116
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1117
+ // table.
1118
+ static void accum(
1119
+ const float* const __restrict pqCoarseCentroids0,
1120
+ const float* const __restrict pqFineCentroids0,
1121
+ const uint8_t* const __restrict code0,
1122
+ const float weight0,
1123
+ const float* const __restrict pqCoarseCentroids1,
1124
+ const float* const __restrict pqFineCentroids1,
1125
+ const uint8_t* const __restrict code1,
1126
+ const float weight1,
1127
+ const float* const __restrict pqCoarseCentroids2,
1128
+ const float* const __restrict pqFineCentroids2,
1129
+ const uint8_t* const __restrict code2,
1130
+ const float weight2,
1131
+ float* const __restrict outputAccum) {
1132
+ // coarse quantizer
1133
+ const uint8_t* const __restrict coarse0 = code0;
1134
+ const uint8_t* const __restrict coarse1 = code1;
1135
+ const uint8_t* const __restrict coarse2 = code2;
1136
+
1137
+ // fine quantizer
1138
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1139
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1140
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1141
+
1142
+ // process chunks, 8 float
1143
+
1144
+ const intptr_t coarseCode0 = detail::
1145
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1146
+ get(coarse0);
1147
+ const intptr_t fineCode0 =
1148
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1149
+ get(fine0);
1150
+ const intptr_t coarseCode1 = detail::
1151
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1152
+ get(coarse1);
1153
+ const intptr_t fineCode1 =
1154
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1155
+ get(fine1);
1156
+ const intptr_t coarseCode2 = detail::
1157
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1158
+ get(coarse2);
1159
+ const intptr_t fineCode2 =
1160
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1161
+ get(fine2);
1162
+
1163
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
1164
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
1165
+
1166
+ auto existingValue = elementaryBlock8x1bAccum(
1167
+ pqCoarseCentroids0 +
1168
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1169
+ COARSE_SIZE +
1170
+ coarseCentroidOffset,
1171
+ pqFineCentroids0 +
1172
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1173
+ FINE_SIZE +
1174
+ fineCentroidOffset,
1175
+ weight0,
1176
+ {existingValue0, existingValue1});
1177
+
1178
+ existingValue = elementaryBlock8x1bAccum(
1179
+ pqCoarseCentroids1 +
1180
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1181
+ COARSE_SIZE +
1182
+ coarseCentroidOffset,
1183
+ pqFineCentroids1 +
1184
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1185
+ FINE_SIZE +
1186
+ fineCentroidOffset,
1187
+ weight1,
1188
+ existingValue);
1189
+
1190
+ existingValue = elementaryBlock8x1bAccum(
1191
+ pqCoarseCentroids2 +
1192
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
1193
+ COARSE_SIZE +
1194
+ coarseCentroidOffset,
1195
+ pqFineCentroids2 +
1196
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1197
+ FINE_SIZE +
1198
+ fineCentroidOffset,
1199
+ weight2,
1200
+ existingValue);
1201
+
1202
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
1203
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
1204
+
1205
+ // next
1206
+ Index2LevelDecoderImpl<
1207
+ DIM,
1208
+ COARSE_SIZE,
1209
+ FINE_SIZE,
1210
+ COARSE_BITS,
1211
+ FINE_BITS,
1212
+ CPOS + 8>::
1213
+ accum(pqCoarseCentroids0,
1214
+ pqFineCentroids0,
1215
+ code0,
1216
+ weight0,
1217
+ pqCoarseCentroids1,
1218
+ pqFineCentroids1,
1219
+ code1,
1220
+ weight1,
1221
+ pqCoarseCentroids2,
1222
+ pqFineCentroids2,
1223
+ code2,
1224
+ weight2,
1225
+ outputAccum);
1226
+ }
1227
+
1228
+ // Process 3 samples.
1229
+ // Coarse pq centroids table and fine pq centroids table are shared among
1230
+ // codes.
1231
+ static void accum(
1232
+ const float* const __restrict pqCoarseCentroids,
1233
+ const float* const __restrict pqFineCentroids,
1234
+ const uint8_t* const __restrict code0,
1235
+ const float weight0,
1236
+ const uint8_t* const __restrict code1,
1237
+ const float weight1,
1238
+ const uint8_t* const __restrict code2,
1239
+ const float weight2,
1240
+ float* const __restrict outputAccum) {
1241
+ // coarse quantizer
1242
+ const uint8_t* const __restrict coarse0 = code0;
1243
+ const uint8_t* const __restrict coarse1 = code1;
1244
+ const uint8_t* const __restrict coarse2 = code2;
1245
+
1246
+ // fine quantizer
1247
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1248
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1249
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1250
+
1251
+ // process chunks, 8 float
1252
+
1253
+ const intptr_t coarseCode0 = detail::
1254
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1255
+ get(coarse0);
1256
+ const intptr_t fineCode0 =
1257
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1258
+ get(fine0);
1259
+ const intptr_t coarseCode1 = detail::
1260
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1261
+ get(coarse1);
1262
+ const intptr_t fineCode1 =
1263
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1264
+ get(fine1);
1265
+ const intptr_t coarseCode2 = detail::
1266
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1267
+ get(coarse2);
1268
+ const intptr_t fineCode2 =
1269
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1270
+ get(fine2);
1271
+
1272
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
1273
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
1274
+
1275
+ auto existingValue = elementaryBlock8x1bAccum(
1276
+ pqCoarseCentroids +
1277
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1278
+ COARSE_SIZE +
1279
+ coarseCentroidOffset,
1280
+ pqFineCentroids +
1281
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1282
+ FINE_SIZE +
1283
+ fineCentroidOffset,
1284
+ weight0,
1285
+ {existingValue0, existingValue1});
1286
+
1287
+ existingValue = elementaryBlock8x1bAccum(
1288
+ pqCoarseCentroids +
1289
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1290
+ COARSE_SIZE +
1291
+ coarseCentroidOffset,
1292
+ pqFineCentroids +
1293
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1294
+ FINE_SIZE +
1295
+ fineCentroidOffset,
1296
+ weight1,
1297
+ existingValue);
1298
+
1299
+ existingValue = elementaryBlock8x1bAccum(
1300
+ pqCoarseCentroids +
1301
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
1302
+ COARSE_SIZE +
1303
+ coarseCentroidOffset,
1304
+ pqFineCentroids +
1305
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1306
+ FINE_SIZE +
1307
+ fineCentroidOffset,
1308
+ weight2,
1309
+ existingValue);
1310
+
1311
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
1312
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
1313
+
1314
+ // next
1315
+ Index2LevelDecoderImpl<
1316
+ DIM,
1317
+ COARSE_SIZE,
1318
+ FINE_SIZE,
1319
+ COARSE_BITS,
1320
+ FINE_BITS,
1321
+ CPOS + 8>::
1322
+ accum(pqCoarseCentroids,
1323
+ pqFineCentroids,
1324
+ code0,
1325
+ weight0,
1326
+ code1,
1327
+ weight1,
1328
+ code2,
1329
+ weight2,
1330
+ outputAccum);
1331
+ }
1332
+ };
1333
+
1334
+ template <
1335
+ intptr_t DIM,
1336
+ intptr_t COARSE_SIZE,
1337
+ intptr_t FINE_SIZE,
1338
+ intptr_t COARSE_BITS,
1339
+ intptr_t FINE_BITS,
1340
+ intptr_t CPOS>
1341
+ struct Index2LevelDecoderImpl<
1342
+ DIM,
1343
+ COARSE_SIZE,
1344
+ FINE_SIZE,
1345
+ COARSE_BITS,
1346
+ FINE_BITS,
1347
+ CPOS,
1348
+ false,
1349
+ false,
1350
+ true,
1351
+ false> {
1352
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
1353
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
1354
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
1355
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
1356
+
1357
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
1358
+
1359
+ // coarse quantizer storage
1360
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
1361
+
1362
+ // coarse quantizer bytes start from 0
1363
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
1364
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
1365
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
1366
+ N_COARSE_ELEMENTS * COARSE_BITS;
1367
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
1368
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
1369
+
1370
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
1371
+
1372
+ // process 1 sample
1373
+ static void store(
1374
+ const float* const __restrict pqCoarseCentroids0,
1375
+ const float* const __restrict pqFineCentroids0,
1376
+ const uint8_t* const __restrict code0,
1377
+ float* const __restrict outputStore) {
1378
+ // coarse quantizer
1379
+ const uint8_t* const __restrict coarse0 = code0;
1380
+
1381
+ // fine quantizer
1382
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1383
+
1384
+ // process chunks, 4 float
1385
+
1386
+ const intptr_t coarseCode0 = detail::
1387
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1388
+ get(coarse0);
1389
+ const intptr_t fineCode0 =
1390
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1391
+ get(fine0);
1392
+
1393
+ const auto storeValue = elementaryBlock4x1b(
1394
+ pqCoarseCentroids0 +
1395
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1396
+ COARSE_SIZE +
1397
+ coarseCentroidOffset,
1398
+ pqFineCentroids0 +
1399
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1400
+ FINE_SIZE +
1401
+ fineCentroidOffset);
1402
+
1403
+ vst1q_f32(outputStore + CPOS, storeValue);
1404
+
1405
+ // next
1406
+ Index2LevelDecoderImpl<
1407
+ DIM,
1408
+ COARSE_SIZE,
1409
+ FINE_SIZE,
1410
+ COARSE_BITS,
1411
+ FINE_BITS,
1412
+ CPOS + 4>::
1413
+ store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore);
1414
+ }
1415
+
1416
+ // process 1 sample
1417
+ static void accum(
1418
+ const float* const __restrict pqCoarseCentroids0,
1419
+ const float* const __restrict pqFineCentroids0,
1420
+ const uint8_t* const __restrict code0,
1421
+ const float weight0,
1422
+ float* const __restrict outputAccum) {
1423
+ // coarse quantizer
1424
+ const uint8_t* const __restrict coarse0 = code0;
1425
+
1426
+ // fine quantizer
1427
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1428
+
1429
+ // process chunks, 4 float
1430
+
1431
+ const intptr_t coarseCode0 = detail::
1432
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1433
+ get(coarse0);
1434
+ const intptr_t fineCode0 =
1435
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1436
+ get(fine0);
1437
+
1438
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1439
+
1440
+ existingValue = elementaryBlock4x1bAccum(
1441
+ pqCoarseCentroids0 +
1442
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1443
+ COARSE_SIZE +
1444
+ coarseCentroidOffset,
1445
+ pqFineCentroids0 +
1446
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1447
+ FINE_SIZE +
1448
+ fineCentroidOffset,
1449
+ weight0,
1450
+ existingValue);
1451
+
1452
+ vst1q_f32(outputAccum + CPOS, existingValue);
1453
+
1454
+ // next
1455
+ Index2LevelDecoderImpl<
1456
+ DIM,
1457
+ COARSE_SIZE,
1458
+ FINE_SIZE,
1459
+ COARSE_BITS,
1460
+ FINE_BITS,
1461
+ CPOS + 4>::
1462
+ accum(pqCoarseCentroids0,
1463
+ pqFineCentroids0,
1464
+ code0,
1465
+ weight0,
1466
+ outputAccum);
1467
+ }
1468
+
1469
+ // Process 2 samples.
1470
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1471
+ // table.
1472
+ static void accum(
1473
+ const float* const __restrict pqCoarseCentroids0,
1474
+ const float* const __restrict pqFineCentroids0,
1475
+ const uint8_t* const __restrict code0,
1476
+ const float weight0,
1477
+ const float* const __restrict pqCoarseCentroids1,
1478
+ const float* const __restrict pqFineCentroids1,
1479
+ const uint8_t* const __restrict code1,
1480
+ const float weight1,
1481
+ float* const __restrict outputAccum) {
1482
+ // coarse quantizer
1483
+ const uint8_t* const __restrict coarse0 = code0;
1484
+ const uint8_t* const __restrict coarse1 = code1;
1485
+
1486
+ // fine quantizer
1487
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1488
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1489
+
1490
+ // process chunks, 4 float
1491
+
1492
+ const intptr_t coarseCode0 = detail::
1493
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1494
+ get(coarse0);
1495
+ const intptr_t fineCode0 =
1496
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1497
+ get(fine0);
1498
+ const intptr_t coarseCode1 = detail::
1499
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1500
+ get(coarse1);
1501
+ const intptr_t fineCode1 =
1502
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1503
+ get(fine1);
1504
+
1505
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1506
+
1507
+ existingValue = elementaryBlock4x1bAccum(
1508
+ pqCoarseCentroids0 +
1509
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1510
+ COARSE_SIZE +
1511
+ coarseCentroidOffset,
1512
+ pqFineCentroids0 +
1513
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1514
+ FINE_SIZE +
1515
+ fineCentroidOffset,
1516
+ weight0,
1517
+ existingValue);
1518
+
1519
+ existingValue = elementaryBlock4x1bAccum(
1520
+ pqCoarseCentroids1 +
1521
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1522
+ COARSE_SIZE +
1523
+ coarseCentroidOffset,
1524
+ pqFineCentroids1 +
1525
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1526
+ FINE_SIZE +
1527
+ fineCentroidOffset,
1528
+ weight1,
1529
+ existingValue);
1530
+
1531
+ vst1q_f32(outputAccum + CPOS, existingValue);
1532
+
1533
+ // next
1534
+ Index2LevelDecoderImpl<
1535
+ DIM,
1536
+ COARSE_SIZE,
1537
+ FINE_SIZE,
1538
+ COARSE_BITS,
1539
+ FINE_BITS,
1540
+ CPOS + 4>::
1541
+ accum(pqCoarseCentroids0,
1542
+ pqFineCentroids0,
1543
+ code0,
1544
+ weight0,
1545
+ pqCoarseCentroids1,
1546
+ pqFineCentroids1,
1547
+ code1,
1548
+ weight1,
1549
+ outputAccum);
1550
+ }
1551
+
1552
+ // Process 2 samples.
1553
+ // Coarse pq centroids table and fine pq centroids table are shared among
1554
+ // codes.
1555
+ static void accum(
1556
+ const float* const __restrict pqCoarseCentroids,
1557
+ const float* const __restrict pqFineCentroids,
1558
+ const uint8_t* const __restrict code0,
1559
+ const float weight0,
1560
+ const uint8_t* const __restrict code1,
1561
+ const float weight1,
1562
+ float* const __restrict outputAccum) {
1563
+ // coarse quantizer
1564
+ const uint8_t* const __restrict coarse0 = code0;
1565
+ const uint8_t* const __restrict coarse1 = code1;
1566
+
1567
+ // fine quantizer
1568
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1569
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1570
+
1571
+ // process chunks, 4 float
1572
+
1573
+ const intptr_t coarseCode0 = detail::
1574
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1575
+ get(coarse0);
1576
+ const intptr_t fineCode0 =
1577
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1578
+ get(fine0);
1579
+ const intptr_t coarseCode1 = detail::
1580
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1581
+ get(coarse1);
1582
+ const intptr_t fineCode1 =
1583
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1584
+ get(fine1);
1585
+
1586
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1587
+
1588
+ existingValue = elementaryBlock4x1bAccum(
1589
+ pqCoarseCentroids +
1590
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1591
+ COARSE_SIZE +
1592
+ coarseCentroidOffset,
1593
+ pqFineCentroids +
1594
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1595
+ FINE_SIZE +
1596
+ fineCentroidOffset,
1597
+ weight0,
1598
+ existingValue);
1599
+
1600
+ existingValue = elementaryBlock4x1bAccum(
1601
+ pqCoarseCentroids +
1602
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1603
+ COARSE_SIZE +
1604
+ coarseCentroidOffset,
1605
+ pqFineCentroids +
1606
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1607
+ FINE_SIZE +
1608
+ fineCentroidOffset,
1609
+ weight1,
1610
+ existingValue);
1611
+
1612
+ vst1q_f32(outputAccum + CPOS, existingValue);
1613
+
1614
+ // next
1615
+ Index2LevelDecoderImpl<
1616
+ DIM,
1617
+ COARSE_SIZE,
1618
+ FINE_SIZE,
1619
+ COARSE_BITS,
1620
+ FINE_BITS,
1621
+ CPOS + 4>::
1622
+ accum(pqCoarseCentroids,
1623
+ pqFineCentroids,
1624
+ code0,
1625
+ weight0,
1626
+ code1,
1627
+ weight1,
1628
+ outputAccum);
1629
+ }
1630
+
1631
+ // Process 3 samples.
1632
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1633
+ // table.
1634
+ static void accum(
1635
+ const float* const __restrict pqCoarseCentroids0,
1636
+ const float* const __restrict pqFineCentroids0,
1637
+ const uint8_t* const __restrict code0,
1638
+ const float weight0,
1639
+ const float* const __restrict pqCoarseCentroids1,
1640
+ const float* const __restrict pqFineCentroids1,
1641
+ const uint8_t* const __restrict code1,
1642
+ const float weight1,
1643
+ const float* const __restrict pqCoarseCentroids2,
1644
+ const float* const __restrict pqFineCentroids2,
1645
+ const uint8_t* const __restrict code2,
1646
+ const float weight2,
1647
+ float* const __restrict outputAccum) {
1648
+ // coarse quantizer: the coarse indices are read from the start of each code
1649
+ const uint8_t* const __restrict coarse0 = code0;
1650
+ const uint8_t* const __restrict coarse1 = code1;
1651
+ const uint8_t* const __restrict coarse2 = code2;
1652
+
1653
+ // fine quantizer: the fine indices start N_COARSE_ELEMENTS_BYTES into each code
1654
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1655
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1656
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1657
+
1658
+ // process chunks, 4 float: this step updates outputAccum[CPOS .. CPOS + 3]
1659
+ // read the coarse and fine index of each of the 3 codes (UintReader is defined elsewhere in this header)
1660
+ const intptr_t coarseCode0 = detail::
1661
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1662
+ get(coarse0);
1663
+ const intptr_t fineCode0 =
1664
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1665
+ get(fine0);
1666
+ const intptr_t coarseCode1 = detail::
1667
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1668
+ get(coarse1);
1669
+ const intptr_t fineCode1 =
1670
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1671
+ get(fine1);
1672
+ const intptr_t coarseCode2 = detail::
1673
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1674
+ get(coarse2);
1675
+ const intptr_t fineCode2 =
1676
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1677
+ get(fine2);
1678
+ // load the 4 floats currently stored at outputAccum + CPOS
1679
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1680
+ // sample 0: uses its own tables pqCoarseCentroids0 / pqFineCentroids0; NOTE(review): elementaryBlock4x1bAccum is defined elsewhere, presumably returns existingValue + weight0 * (coarse + fine) - confirm
1681
+ existingValue = elementaryBlock4x1bAccum(
1682
+ pqCoarseCentroids0 +
1683
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1684
+ COARSE_SIZE +
1685
+ coarseCentroidOffset,
1686
+ pqFineCentroids0 +
1687
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1688
+ FINE_SIZE +
1689
+ fineCentroidOffset,
1690
+ weight0,
1691
+ existingValue);
1692
+ // sample 1: same step with pqCoarseCentroids1 / pqFineCentroids1 and weight1
1693
+ existingValue = elementaryBlock4x1bAccum(
1694
+ pqCoarseCentroids1 +
1695
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1696
+ COARSE_SIZE +
1697
+ coarseCentroidOffset,
1698
+ pqFineCentroids1 +
1699
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1700
+ FINE_SIZE +
1701
+ fineCentroidOffset,
1702
+ weight1,
1703
+ existingValue);
1704
+ // sample 2: same step with pqCoarseCentroids2 / pqFineCentroids2 and weight2
1705
+ existingValue = elementaryBlock4x1bAccum(
1706
+ pqCoarseCentroids2 +
1707
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
1708
+ COARSE_SIZE +
1709
+ coarseCentroidOffset,
1710
+ pqFineCentroids2 +
1711
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1712
+ FINE_SIZE +
1713
+ fineCentroidOffset,
1714
+ weight2,
1715
+ existingValue);
1716
+ // store the 4 updated floats back to outputAccum + CPOS
1717
+ vst1q_f32(outputAccum + CPOS, existingValue);
1718
+
1719
+ // next: pass the same arguments on to the instantiation for position CPOS + 4
1720
+ Index2LevelDecoderImpl<
1721
+ DIM,
1722
+ COARSE_SIZE,
1723
+ FINE_SIZE,
1724
+ COARSE_BITS,
1725
+ FINE_BITS,
1726
+ CPOS + 4>::
1727
+ accum(pqCoarseCentroids0,
1728
+ pqFineCentroids0,
1729
+ code0,
1730
+ weight0,
1731
+ pqCoarseCentroids1,
1732
+ pqFineCentroids1,
1733
+ code1,
1734
+ weight1,
1735
+ pqCoarseCentroids2,
1736
+ pqFineCentroids2,
1737
+ code2,
1738
+ weight2,
1739
+ outputAccum);
1740
+ }
1741
+
1742
+ // Process 3 samples.
1743
+ // Coarse pq centroids table and fine pq centroids table are shared among
1744
+ // codes.
1745
+ static void accum(
1746
+ const float* const __restrict pqCoarseCentroids,
1747
+ const float* const __restrict pqFineCentroids,
1748
+ const uint8_t* const __restrict code0,
1749
+ const float weight0,
1750
+ const uint8_t* const __restrict code1,
1751
+ const float weight1,
1752
+ const uint8_t* const __restrict code2,
1753
+ const float weight2,
1754
+ float* const __restrict outputAccum) {
1755
+ // coarse quantizer: the coarse indices are read from the start of each code
1756
+ const uint8_t* const __restrict coarse0 = code0;
1757
+ const uint8_t* const __restrict coarse1 = code1;
1758
+ const uint8_t* const __restrict coarse2 = code2;
1759
+
1760
+ // fine quantizer: the fine indices start N_COARSE_ELEMENTS_BYTES into each code
1761
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1762
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1763
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1764
+
1765
+ // process chunks, 4 float: this step updates outputAccum[CPOS .. CPOS + 3]
1766
+ // read the coarse and fine index of each of the 3 codes (UintReader is defined elsewhere in this header)
1767
+ const intptr_t coarseCode0 = detail::
1768
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1769
+ get(coarse0);
1770
+ const intptr_t fineCode0 =
1771
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1772
+ get(fine0);
1773
+ const intptr_t coarseCode1 = detail::
1774
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1775
+ get(coarse1);
1776
+ const intptr_t fineCode1 =
1777
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1778
+ get(fine1);
1779
+ const intptr_t coarseCode2 = detail::
1780
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1781
+ get(coarse2);
1782
+ const intptr_t fineCode2 =
1783
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1784
+ get(fine2);
1785
+ // load the 4 floats currently stored at outputAccum + CPOS
1786
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1787
+ // sample 0: all three samples index the same shared tables; NOTE(review): elementaryBlock4x1bAccum is defined elsewhere, presumably returns existingValue + weight0 * (coarse + fine) - confirm
1788
+ existingValue = elementaryBlock4x1bAccum(
1789
+ pqCoarseCentroids +
1790
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1791
+ COARSE_SIZE +
1792
+ coarseCentroidOffset,
1793
+ pqFineCentroids +
1794
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1795
+ FINE_SIZE +
1796
+ fineCentroidOffset,
1797
+ weight0,
1798
+ existingValue);
1799
+ // sample 1: same step with coarseCode1 / fineCode1 and weight1
1800
+ existingValue = elementaryBlock4x1bAccum(
1801
+ pqCoarseCentroids +
1802
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1803
+ COARSE_SIZE +
1804
+ coarseCentroidOffset,
1805
+ pqFineCentroids +
1806
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1807
+ FINE_SIZE +
1808
+ fineCentroidOffset,
1809
+ weight1,
1810
+ existingValue);
1811
+ // sample 2: same step with coarseCode2 / fineCode2 and weight2
1812
+ existingValue = elementaryBlock4x1bAccum(
1813
+ pqCoarseCentroids +
1814
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
1815
+ COARSE_SIZE +
1816
+ coarseCentroidOffset,
1817
+ pqFineCentroids +
1818
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1819
+ FINE_SIZE +
1820
+ fineCentroidOffset,
1821
+ weight2,
1822
+ existingValue);
1823
+ // store the 4 updated floats back to outputAccum + CPOS
1824
+ vst1q_f32(outputAccum + CPOS, existingValue);
1825
+
1826
+ // next: pass the same arguments on to the instantiation for position CPOS + 4
1827
+ Index2LevelDecoderImpl<
1828
+ DIM,
1829
+ COARSE_SIZE,
1830
+ FINE_SIZE,
1831
+ COARSE_BITS,
1832
+ FINE_BITS,
1833
+ CPOS + 4>::
1834
+ accum(pqCoarseCentroids,
1835
+ pqFineCentroids,
1836
+ code0,
1837
+ weight0,
1838
+ code1,
1839
+ weight1,
1840
+ code2,
1841
+ weight2,
1842
+ outputAccum);
1843
+ }
1844
+ };
1845
+
1846
+ // This partial specialization is expected to do nothing.
1847
+ template <
1848
+ intptr_t DIM,
1849
+ intptr_t COARSE_SIZE,
1850
+ intptr_t FINE_SIZE,
1851
+ intptr_t COARSE_BITS,
1852
+ intptr_t FINE_BITS,
1853
+ bool FINE_SIZE_EQ_4,
1854
+ bool QPOS_LEFT_GE_8,
1855
+ bool QPOS_LEFT_GE_4>
1856
+ struct Index2LevelDecoderImpl<
1857
+ DIM,
1858
+ COARSE_SIZE,
1859
+ FINE_SIZE,
1860
+ COARSE_BITS,
1861
+ FINE_BITS,
1862
+ DIM, // sixth argument is the position slot (filled with 0 or CPOS + 4 by callers in this file); here it equals DIM
1863
+ FINE_SIZE_EQ_4,
1864
+ QPOS_LEFT_GE_8,
1865
+ QPOS_LEFT_GE_4,
1866
+ true> { // NOTE(review): the meaning of this trailing flag is declared with the primary template, outside this chunk
1867
+ // process 1 sample (no-op: the body is empty, nothing is written to outputStore)
1868
+ static void store(
1869
+ const float* const __restrict pqCoarseCentroids0,
1870
+ const float* const __restrict pqFineCentroids0,
1871
+ const uint8_t* const __restrict code0,
1872
+ float* const __restrict outputStore) {}
1873
+
1874
+ // process 1 sample (no-op: the body is empty, outputAccum is left untouched)
1875
+ static void accum(
1876
+ const float* const __restrict pqCoarseCentroids0,
1877
+ const float* const __restrict pqFineCentroids0,
1878
+ const uint8_t* const __restrict code0,
1879
+ const float weight0,
1880
+ float* const __restrict outputAccum) {}
1881
+
1882
+ // Process 2 samples.
1883
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1884
+ // table. No-op here: the body is empty.
1885
+ static void accum(
1886
+ const float* const __restrict pqCoarseCentroids0,
1887
+ const float* const __restrict pqFineCentroids0,
1888
+ const uint8_t* const __restrict code0,
1889
+ const float weight0,
1890
+ const float* const __restrict pqCoarseCentroids1,
1891
+ const float* const __restrict pqFineCentroids1,
1892
+ const uint8_t* const __restrict code1,
1893
+ const float weight1,
1894
+ float* const __restrict outputAccum) {}
1895
+
1896
+ // Process 2 samples.
1897
+ // Coarse pq centroids table and fine pq centroids table are shared among
1898
+ // codes. No-op here: the body is empty.
1899
+ static void accum(
1900
+ const float* const __restrict pqCoarseCentroids,
1901
+ const float* const __restrict pqFineCentroids,
1902
+ const uint8_t* const __restrict code0,
1903
+ const float weight0,
1904
+ const uint8_t* const __restrict code1,
1905
+ const float weight1,
1906
+ float* const __restrict outputAccum) {}
1907
+
1908
+ // Process 3 samples.
1909
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1910
+ // table. No-op here: the body is empty.
1911
+ static void accum(
1912
+ const float* const __restrict pqCoarseCentroids0,
1913
+ const float* const __restrict pqFineCentroids0,
1914
+ const uint8_t* const __restrict code0,
1915
+ const float weight0,
1916
+ const float* const __restrict pqCoarseCentroids1,
1917
+ const float* const __restrict pqFineCentroids1,
1918
+ const uint8_t* const __restrict code1,
1919
+ const float weight1,
1920
+ const float* const __restrict pqCoarseCentroids2,
1921
+ const float* const __restrict pqFineCentroids2,
1922
+ const uint8_t* const __restrict code2,
1923
+ const float weight2,
1924
+ float* const __restrict outputAccum) {}
1925
+
1926
+ // Process 3 samples.
1927
+ // Coarse pq centroids table and fine pq centroids table are shared among
1928
+ // codes. No-op here: the body is empty.
1929
+ static void accum(
1930
+ const float* const __restrict pqCoarseCentroids,
1931
+ const float* const __restrict pqFineCentroids,
1932
+ const uint8_t* const __restrict code0,
1933
+ const float weight0,
1934
+ const uint8_t* const __restrict code1,
1935
+ const float weight1,
1936
+ const uint8_t* const __restrict code2,
1937
+ const float weight2,
1938
+ float* const __restrict outputAccum) {}
1939
+ };
1940
+ } // namespace
1941
+
1942
+ // Suitable for IVF256,PQ[1]x8
1943
+ // Suitable for Residual[1]x8,PQ[2]x8
1944
+ // Suitable for IVF[9-16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
1945
+ // Suitable for Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8)
1946
+ template <
1947
+ intptr_t DIM,
1948
+ intptr_t COARSE_SIZE,
1949
+ intptr_t FINE_SIZE,
1950
+ intptr_t COARSE_BITS = 8,
1951
+ intptr_t FINE_BITS = 8>
1952
+ struct Index2LevelDecoder {
1953
+ static_assert(
1954
+ COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 16,
1955
+ "Only 8, 10 or 16 bits are currently supported for COARSE_BITS");
1956
+ static_assert(
1957
+ FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16,
1958
+ "Only 8, 10 or 16 bits are currently supported for FINE_BITS");
1959
+ // the template arguments, re-exported as compile-time constants
1960
+ static constexpr intptr_t dim = DIM;
1961
+ static constexpr intptr_t coarseSize = COARSE_SIZE;
1962
+ static constexpr intptr_t fineSize = FINE_SIZE;
1963
+ static constexpr intptr_t coarseBits = COARSE_BITS;
1964
+ static constexpr intptr_t fineBits = FINE_BITS;
1965
+
1966
+ // Process 1 sample. Forwards to Index2LevelDecoderImpl<..., 0>::store, i.e. starts at output position 0.
1967
+ static void store(
1968
+ const float* const __restrict pqCoarseCentroids,
1969
+ const float* const __restrict pqFineCentroids,
1970
+ const uint8_t* const __restrict code,
1971
+ float* const __restrict outputStore) {
1972
+ Index2LevelDecoderImpl<
1973
+ DIM,
1974
+ COARSE_SIZE,
1975
+ FINE_SIZE,
1976
+ COARSE_BITS,
1977
+ FINE_BITS,
1978
+ 0>::
1979
+ store(pqCoarseCentroids, pqFineCentroids, code, outputStore);
1980
+ }
1981
+
1982
+ // Process 1 sample.
1983
+ // Performs outputAccum += weight * decoded(code), by forwarding to Index2LevelDecoderImpl<..., 0>::accum.
1984
+ static void accum(
1985
+ const float* const __restrict pqCoarseCentroids,
1986
+ const float* const __restrict pqFineCentroids,
1987
+ const uint8_t* const __restrict code,
1988
+ const float weight,
1989
+ float* const __restrict outputAccum) {
1990
+ Index2LevelDecoderImpl<
1991
+ DIM,
1992
+ COARSE_SIZE,
1993
+ FINE_SIZE,
1994
+ COARSE_BITS,
1995
+ FINE_BITS,
1996
+ 0>::
1997
+ accum(pqCoarseCentroids,
1998
+ pqFineCentroids,
1999
+ code,
2000
+ weight,
2001
+ outputAccum);
2002
+ }
2003
+
2004
+ // Process 2 samples.
2005
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2006
+ // decoded(code1).
2007
+ //
2008
+ // Each code uses its own coarse pq centroids table and fine pq centroids
2009
+ // table. Forwards to Index2LevelDecoderImpl<..., 0>::accum with the arguments in the same order.
2010
+ static void accum(
2011
+ const float* const __restrict pqCoarseCentroids0,
2012
+ const float* const __restrict pqFineCentroids0,
2013
+ const uint8_t* const __restrict code0,
2014
+ const float weight0,
2015
+ const float* const __restrict pqCoarseCentroids1,
2016
+ const float* const __restrict pqFineCentroids1,
2017
+ const uint8_t* const __restrict code1,
2018
+ const float weight1,
2019
+ float* const __restrict outputAccum) {
2020
+ Index2LevelDecoderImpl<
2021
+ DIM,
2022
+ COARSE_SIZE,
2023
+ FINE_SIZE,
2024
+ COARSE_BITS,
2025
+ FINE_BITS,
2026
+ 0>::
2027
+ accum(pqCoarseCentroids0,
2028
+ pqFineCentroids0,
2029
+ code0,
2030
+ weight0,
2031
+ pqCoarseCentroids1,
2032
+ pqFineCentroids1,
2033
+ code1,
2034
+ weight1,
2035
+ outputAccum);
2036
+ }
2037
+
2038
+ // Process 2 samples.
2039
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2040
+ // decoded(code1)
2041
+ //
2042
+ // Coarse pq centroids table and fine pq centroids table are shared among
2043
+ // codes. Forwards to Index2LevelDecoderImpl<..., 0>::accum with the arguments in the same order.
2044
+ static void accum(
2045
+ const float* const __restrict pqCoarseCentroids,
2046
+ const float* const __restrict pqFineCentroids,
2047
+ const uint8_t* const __restrict code0,
2048
+ const float weight0,
2049
+ const uint8_t* const __restrict code1,
2050
+ const float weight1,
2051
+ float* const __restrict outputAccum) {
2052
+ Index2LevelDecoderImpl<
2053
+ DIM,
2054
+ COARSE_SIZE,
2055
+ FINE_SIZE,
2056
+ COARSE_BITS,
2057
+ FINE_BITS,
2058
+ 0>::
2059
+ accum(pqCoarseCentroids,
2060
+ pqFineCentroids,
2061
+ code0,
2062
+ weight0,
2063
+ code1,
2064
+ weight1,
2065
+ outputAccum);
2066
+ }
2067
+
2068
+ // Process 3 samples.
2069
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2070
+ // decoded(code1) + weight2 * decoded(code2)
2071
+ //
2072
+ // Each code uses its own coarse pq centroids table and fine pq centroids
2073
+ // table. Forwards to Index2LevelDecoderImpl<..., 0>::accum with the arguments in the same order.
2074
+ static void accum(
2075
+ const float* const __restrict pqCoarseCentroids0,
2076
+ const float* const __restrict pqFineCentroids0,
2077
+ const uint8_t* const __restrict code0,
2078
+ const float weight0,
2079
+ const float* const __restrict pqCoarseCentroids1,
2080
+ const float* const __restrict pqFineCentroids1,
2081
+ const uint8_t* const __restrict code1,
2082
+ const float weight1,
2083
+ const float* const __restrict pqCoarseCentroids2,
2084
+ const float* const __restrict pqFineCentroids2,
2085
+ const uint8_t* const __restrict code2,
2086
+ const float weight2,
2087
+ float* const __restrict outputAccum) {
2088
+ Index2LevelDecoderImpl<
2089
+ DIM,
2090
+ COARSE_SIZE,
2091
+ FINE_SIZE,
2092
+ COARSE_BITS,
2093
+ FINE_BITS,
2094
+ 0>::
2095
+ accum(pqCoarseCentroids0,
2096
+ pqFineCentroids0,
2097
+ code0,
2098
+ weight0,
2099
+ pqCoarseCentroids1,
2100
+ pqFineCentroids1,
2101
+ code1,
2102
+ weight1,
2103
+ pqCoarseCentroids2,
2104
+ pqFineCentroids2,
2105
+ code2,
2106
+ weight2,
2107
+ outputAccum);
2108
+ }
2109
+
2110
+ // Process 3 samples.
2111
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2112
+ // decoded(code1) + weight2 * decoded(code2)
2113
+ //
2114
+ // Coarse pq centroids table and fine pq centroids table are shared among
2115
+ // codes. Forwards to Index2LevelDecoderImpl<..., 0>::accum with the arguments in the same order.
2116
+ static void accum(
2117
+ const float* const __restrict pqCoarseCentroids,
2118
+ const float* const __restrict pqFineCentroids,
2119
+ const uint8_t* const __restrict code0,
2120
+ const float weight0,
2121
+ const uint8_t* const __restrict code1,
2122
+ const float weight1,
2123
+ const uint8_t* const __restrict code2,
2124
+ const float weight2,
2125
+ float* const __restrict outputAccum) {
2126
+ Index2LevelDecoderImpl<
2127
+ DIM,
2128
+ COARSE_SIZE,
2129
+ FINE_SIZE,
2130
+ COARSE_BITS,
2131
+ FINE_BITS,
2132
+ 0>::
2133
+ accum(pqCoarseCentroids,
2134
+ pqFineCentroids,
2135
+ code0,
2136
+ weight0,
2137
+ code1,
2138
+ weight1,
2139
+ code2,
2140
+ weight2,
2141
+ outputAccum);
2142
+ }
2143
+ };
2144
+
2145
+ } // namespace cppcontrib
2146
+ } // namespace faiss
2147
+ #endif // LEVEL2_NEON_INL_H