faiss 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +23 -21
  4. data/ext/faiss/extconf.rb +11 -0
  5. data/ext/faiss/index.cpp +4 -4
  6. data/ext/faiss/index_binary.cpp +6 -6
  7. data/ext/faiss/product_quantizer.cpp +4 -4
  8. data/lib/faiss/version.rb +1 -1
  9. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  10. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  11. data/vendor/faiss/faiss/IVFlib.h +26 -2
  12. data/vendor/faiss/faiss/Index.cpp +36 -3
  13. data/vendor/faiss/faiss/Index.h +43 -6
  14. data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
  15. data/vendor/faiss/faiss/Index2Layer.h +6 -1
  16. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
  17. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  20. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  22. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  23. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  24. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  31. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  32. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  33. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  34. data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
  35. data/vendor/faiss/faiss/IndexFlat.h +7 -4
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  39. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  40. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  41. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  42. data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
  43. data/vendor/faiss/faiss/IndexIVF.h +35 -16
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
  45. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  48. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  49. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  50. data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
  51. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
  53. data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  55. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  56. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  57. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  58. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
  59. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  60. data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
  61. data/vendor/faiss/faiss/IndexLSH.h +2 -1
  62. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  63. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  66. data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
  67. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  68. data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
  69. data/vendor/faiss/faiss/IndexPQ.h +19 -5
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  73. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  74. data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
  75. data/vendor/faiss/faiss/IndexRefine.h +4 -2
  76. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  77. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  78. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  79. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  80. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
  81. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
  82. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  83. data/vendor/faiss/faiss/IndexShards.h +2 -1
  84. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  85. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  86. data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
  87. data/vendor/faiss/faiss/VectorTransform.h +22 -4
  88. data/vendor/faiss/faiss/clone_index.cpp +23 -1
  89. data/vendor/faiss/faiss/clone_index.h +3 -0
  90. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  91. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  92. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  93. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  94. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  95. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  101. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  102. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
  103. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  104. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  105. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  106. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  107. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  108. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  109. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  110. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  111. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  113. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  118. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  119. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  124. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  125. data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
  126. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  127. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  128. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
  131. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
  134. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  138. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  144. data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
  145. data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
  146. data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
  147. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  148. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  149. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  150. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  151. data/vendor/faiss/faiss/index_factory.cpp +196 -7
  152. data/vendor/faiss/faiss/index_io.h +5 -0
  153. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  154. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  155. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  156. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  157. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  158. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  159. data/vendor/faiss/faiss/utils/distances.cpp +380 -56
  160. data/vendor/faiss/faiss/utils/distances.h +113 -15
  161. data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
  162. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  163. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  164. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  165. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  166. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  167. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  168. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  169. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  170. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  171. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  172. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  173. data/vendor/faiss/faiss/utils/random.h +5 -0
  174. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  175. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  176. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  177. metadata +37 -3
@@ -0,0 +1,2147 @@
1
+ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2
+ #ifndef LEVEL2_NEON_INL_H
3
+ #define LEVEL2_NEON_INL_H
4
+
5
+ #include <arm_neon.h>
6
+
7
+ #include <cstddef>
8
+ #include <cstdint>
9
+
10
+ #include <faiss/cppcontrib/detail/UintReader.h>
11
+
12
+ namespace faiss {
13
+ namespace cppcontrib {
14
+
15
+ namespace {
16
+
17
+ // Adds 4 consecutive floats read through `fine` to 4 consecutive floats
18
+ // read through `coarse`.
19
+ // Returned lanes: [0..3] = coarse[0..3] + fine[0..3].
20
+ // Only loads are performed; nothing is written through either pointer.
21
+ inline float32x4_t elementaryBlock4x1b(
22
+ const float* const __restrict coarse,
23
+ const float* const __restrict fine) {
24
+ // fine lanes are fetched first, coarse lanes second
25
+ const float32x4_t fineLanes = vld1q_f32(fine);
26
+ const float32x4_t coarseLanes = vld1q_f32(coarse);
27
+
28
+ // lane-wise sum with the operands kept in (fine, coarse) order
29
+ const float32x4_t summed = vaddq_f32(fineLanes, coarseLanes);
30
+ return summed;
31
+ }
32
+
33
+ // Weighted accumulation over 4 float lanes.
34
+ // Returned lanes:
35
+ //   [0..3] = existingValue[0..3] + weight * (coarse[0..3] + fine[0..3])
36
+ // The multiply and the add are issued together as one vfmaq_f32.
37
+ inline float32x4_t elementaryBlock4x1bAccum(
38
+ const float* const __restrict coarse,
39
+ const float* const __restrict fine,
40
+ const float weight,
41
+ const float32x4_t existingValue) {
42
+ // replicate the scalar weight into every lane
43
+ const float32x4_t scale = vdupq_n_f32(weight);
44
+
45
+ // coarse + fine for this block of 4
46
+ const float32x4_t decoded = elementaryBlock4x1b(coarse, fine);
47
+ // existingValue + scale * decoded
48
+ return vfmaq_f32(existingValue, scale, decoded);
49
+ }
50
+
51
+ // Adds 8 consecutive floats read through `coarse` to two separate
52
+ // 4-float blocks read through `fine0` and `fine1`.
53
+ // Returned value:
54
+ //   val[0] = coarse[0..3] + fine0[0..3]
55
+ //   val[1] = coarse[4..7] + fine1[0..3]
56
+ inline float32x4x2_t elementaryBlock4x2b(
57
+ const float* const __restrict coarse,
58
+ const float* const __restrict fine0,
59
+ const float* const __restrict fine1) {
60
+ // fetch both fine blocks
61
+ const float32x4_t fineLo = vld1q_f32(fine0);
62
+ const float32x4_t fineHi = vld1q_f32(fine1);
63
+ // fetch the two halves of the coarse block
64
+ const float32x4_t coarseLo = vld1q_f32(coarse);
65
+ const float32x4_t coarseHi = vld1q_f32(coarse + 4);
66
+
67
+ // lane-wise sums, written straight into the pair that is returned
68
+ float32x4x2_t sums;
69
+ sums.val[0] = vaddq_f32(fineLo, coarseLo);
70
+ sums.val[1] = vaddq_f32(fineHi, coarseHi);
71
+ return sums;
72
+ }
73
+
74
+ // Weighted accumulation over 8 float lanes, handled as two 4-lane halves.
75
+ // Returned value:
76
+ //   val[0] = existingValue.val[0] + weight * (coarse[0..3] + fine0[0..3])
77
+ //   val[1] = existingValue.val[1] + weight * (coarse[4..7] + fine1[0..3])
78
+ // Each half is updated with a single vfmaq_f32.
79
+ inline float32x4x2_t elementaryBlock4x2bAccum(
80
+ const float* const __restrict coarse,
81
+ const float* const __restrict fine0,
82
+ const float* const __restrict fine1,
83
+ const float weight,
84
+ const float32x4x2_t existingValue) {
85
+ // replicate the scalar weight into every lane
86
+ const float32x4_t scale = vdupq_n_f32(weight);
87
+
88
+ // coarse + fine for both 4-lane halves
89
+ const float32x4x2_t decoded = elementaryBlock4x2b(coarse, fine0, fine1);
90
+
91
+ // per half: existing + scale * decoded
92
+ float32x4x2_t updated;
93
+ updated.val[0] = vfmaq_f32(existingValue.val[0], scale, decoded.val[0]);
94
+ updated.val[1] = vfmaq_f32(existingValue.val[1], scale, decoded.val[1]);
95
+ return updated;
96
+ }
97
+
98
+ // Adds 8 consecutive floats read through `fine` to 8 consecutive floats
99
+ // read through `coarse`.
100
+ // Returned value: val[0] holds lanes [0..3], val[1] holds lanes [4..7],
101
+ // where lane i = coarse[i] + fine[i].
102
+ inline float32x4x2_t elementaryBlock8x1b(
103
+ const float* const __restrict coarse,
104
+ const float* const __restrict fine) {
105
+ float32x4x2_t sums;
106
+
107
+ // lanes 0..3
108
+ sums.val[0] = vaddq_f32(vld1q_f32(fine), vld1q_f32(coarse));
109
+
110
+ // lanes 4..7
111
+ sums.val[1] = vaddq_f32(vld1q_f32(fine + 4), vld1q_f32(coarse + 4));
112
+
113
+ // both halves are returned together as one float32x4x2_t
114
+ return sums;
115
+ }
116
+
117
+ // Weighted accumulation over 8 float lanes.
118
+ // Returned value: val[0] holds lanes [0..3], val[1] holds lanes [4..7],
119
+ // where lane i = existingValue[i] + weight * (coarse[i] + fine[i]).
120
+ // Each half is updated with a single vfmaq_f32.
121
+ inline float32x4x2_t elementaryBlock8x1bAccum(
122
+ const float* const __restrict coarse,
123
+ const float* const __restrict fine,
124
+ const float weight,
125
+ const float32x4x2_t existingValue) {
126
+ // replicate the scalar weight into every lane
127
+ const float32x4_t scale = vdupq_n_f32(weight);
128
+
129
+ // coarse + fine for lanes 0..7
130
+ const float32x4x2_t decoded = elementaryBlock8x1b(coarse, fine);
131
+
132
+ // per half: existing + scale * decoded
133
+ float32x4x2_t updated;
134
+ updated.val[0] = vfmaq_f32(existingValue.val[0], scale, decoded.val[0]);
135
+ updated.val[1] = vfmaq_f32(existingValue.val[1], scale, decoded.val[1]);
136
+ return updated;
137
+ }
138
+
139
+ // The following code uses template-based for-loop unrolling,
140
+ // because the compiler does not do that on its own as needed.
141
+ // The idea is the following:
142
+ // template<int I, int MAX>
143
+ // struct Foo {
144
+ // static void bar() {
145
+ // doSomething(I);
146
+ // Foo<I + 1, MAX>::bar();
147
+ // }
148
+ // };
149
+ //
150
+ // template<int MAX>
151
+ // struct Foo<MAX, MAX> {
152
+ // static void bar() {}
153
+ // };
154
+ //
155
+ // Initiate the loop:
156
+ // Foo<0, MAX>::bar();
157
+
158
+ template < // primary template, declared only; the bool parameters default to values derived from the integer ones
159
+ intptr_t DIM,
160
+ intptr_t COARSE_SIZE,
161
+ intptr_t FINE_SIZE,
162
+ intptr_t COARSE_BITS,
163
+ intptr_t FINE_BITS,
164
+ intptr_t CPOS, // position that DIM_EQ_CPOS compares against DIM
165
+ bool FINE_SIZE_EQ_4 = FINE_SIZE == 4, // FINE_SIZE is exactly 4
166
+ bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8), // at least 8 positions from CPOS up to the next multiple of FINE_SIZE
167
+ bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4), // at least 4 positions from CPOS up to the next multiple of FINE_SIZE
168
+ bool DIM_EQ_CPOS = DIM == CPOS> // CPOS has reached DIM
169
+ struct Index2LevelDecoderImpl;
170
+
171
+ template <
172
+ intptr_t DIM,
173
+ intptr_t COARSE_SIZE,
174
+ intptr_t COARSE_BITS,
175
+ intptr_t FINE_BITS,
176
+ intptr_t CPOS,
177
+ bool QPOS_LEFT_GE_8,
178
+ bool QPOS_LEFT_GE_4>
179
+ struct Index2LevelDecoderImpl<
180
+ DIM,
181
+ COARSE_SIZE,
182
+ 4,
183
+ COARSE_BITS,
184
+ FINE_BITS,
185
+ CPOS,
186
+ true,
187
+ QPOS_LEFT_GE_8,
188
+ QPOS_LEFT_GE_4,
189
+ false> {
190
+ static constexpr intptr_t FINE_SIZE = 4;
191
+
192
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
193
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
194
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
195
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
196
+
197
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
198
+
199
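+ // coarse quantizer storage: number of centroids in each coarse table (used as an entry count when indexing floats, not a byte count, despite the name)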
200
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
201
+
202
+ // coarse quantizer bytes start from 0
203
+ // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES
204
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
205
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
206
+ N_COARSE_ELEMENTS * COARSE_BITS;
207
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
208
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
209
+
210
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
211
+
212
+ // Decode 1 sample: writes floats [CPOS, CPOS + 8) of outputStore as (coarse centroid + fine centroid), then recurses with CPOS + 8.
213
+ static void store(
214
+ const float* const __restrict pqCoarseCentroids0,
215
+ const float* const __restrict pqFineCentroids0,
216
+ const uint8_t* const __restrict code0,
217
+ float* const __restrict outputStore) {
218
+ // the coarse codes sit at the very start of the code
219
+ const uint8_t* const __restrict coarse0 = code0;
220
+
221
+ // the fine codes follow the N_COARSE_ELEMENTS_BYTES bytes of coarse codes
222
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
223
+
224
+ // FINE_SIZE is 4, so the 8 floats handled here take one coarse code and two consecutive fine codes.
225
+ // UintReader is defined elsewhere; presumably get() extracts the packed code with the given index (confirm).
226
+
227
+ const intptr_t coarseCode0 = detail::
228
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
229
+ get(coarse0);
230
+ const intptr_t fineCode0a = detail::
231
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
232
+ fine0);
233
+ const intptr_t fineCode0b = detail::
234
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
235
+ fine0);
236
+ // table addressing below: float offset = (centroidIdx * TABLE_BYTES + code) * SIZE + offset inside the centroid
237
+ const auto storeValue = elementaryBlock4x2b(
238
+ pqCoarseCentroids0 +
239
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
240
+ COARSE_SIZE +
241
+ coarseCentroidOffset,
242
+ pqFineCentroids0 +
243
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
244
+ fineCode0a) *
245
+ FINE_SIZE +
246
+ fineCentroidOffset,
247
+ pqFineCentroids0 +
248
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
249
+ fineCode0b) *
250
+ FINE_SIZE +
251
+ fineCentroidOffset);
252
+
253
+ vst1q_f32(outputStore + CPOS, storeValue.val[0]); // floats CPOS .. CPOS + 3
254
+ vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]); // floats CPOS + 4 .. CPOS + 7
255
+
256
+ // hand over to the decoder instantiated for the block that starts 8 floats further on
257
+ Index2LevelDecoderImpl<
258
+ DIM,
259
+ COARSE_SIZE,
260
+ FINE_SIZE,
261
+ COARSE_BITS,
262
+ FINE_BITS,
263
+ CPOS + 8>::
264
+ store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore);
265
+ }
266
+
267
+ // Accumulate 1 sample: adds weight0 * (coarse centroid + fine centroid) to floats [CPOS, CPOS + 8) of outputAccum, then recurses with CPOS + 8.
268
+ static void accum(
269
+ const float* const __restrict pqCoarseCentroids0,
270
+ const float* const __restrict pqFineCentroids0,
271
+ const uint8_t* const __restrict code0,
272
+ const float weight0,
273
+ float* const __restrict outputAccum) {
274
+ // the coarse codes sit at the very start of the code
275
+ const uint8_t* const __restrict coarse0 = code0;
276
+
277
+ // the fine codes follow the N_COARSE_ELEMENTS_BYTES bytes of coarse codes
278
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
279
+
280
+ // FINE_SIZE is 4, so the 8 floats handled here take one coarse code and two consecutive fine codes.
281
+ // UintReader is defined elsewhere; presumably get() extracts the packed code with the given index (confirm).
282
+
283
+ const intptr_t coarseCode0 = detail::
284
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
285
+ get(coarse0);
286
+ const intptr_t fineCode0a = detail::
287
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
288
+ fine0);
289
+ const intptr_t fineCode0b = detail::
290
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
291
+ fine0);
292
+ // current contents of the 8 output floats
293
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
294
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
295
+ // existing + weight0 * (coarse + fine); float offset = (centroidIdx * TABLE_BYTES + code) * SIZE + offset inside the centroid
296
+ auto existingValue = elementaryBlock4x2bAccum(
297
+ pqCoarseCentroids0 +
298
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
299
+ COARSE_SIZE +
300
+ coarseCentroidOffset,
301
+ pqFineCentroids0 +
302
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
303
+ fineCode0a) *
304
+ FINE_SIZE +
305
+ fineCentroidOffset,
306
+ pqFineCentroids0 +
307
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
308
+ fineCode0b) *
309
+ FINE_SIZE +
310
+ fineCentroidOffset,
311
+ weight0,
312
+ {existingValue0, existingValue1});
313
+
314
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]); // floats CPOS .. CPOS + 3
315
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); // floats CPOS + 4 .. CPOS + 7
316
+
317
+ // hand over to the decoder instantiated for the block that starts 8 floats further on
318
+ Index2LevelDecoderImpl<
319
+ DIM,
320
+ COARSE_SIZE,
321
+ FINE_SIZE,
322
+ COARSE_BITS,
323
+ FINE_BITS,
324
+ CPOS + 8>::
325
+ accum(pqCoarseCentroids0,
326
+ pqFineCentroids0,
327
+ code0,
328
+ weight0,
329
+ outputAccum);
330
+ }
331
+
332
+ // Process 2 samples: floats [CPOS, CPOS + 8) of outputAccum receive both weighted contributions, then the call recurses with CPOS + 8.
333
+ // Each code is decoded with its own coarse and fine pq centroids tables (code0 with tables 0, code1 with tables 1)
334
+ // and scaled by its own weight; the output block is loaded once, updated for code0 then code1, and stored once.
335
+ static void accum(
336
+ const float* const __restrict pqCoarseCentroids0,
337
+ const float* const __restrict pqFineCentroids0,
338
+ const uint8_t* const __restrict code0,
339
+ const float weight0,
340
+ const float* const __restrict pqCoarseCentroids1,
341
+ const float* const __restrict pqFineCentroids1,
342
+ const uint8_t* const __restrict code1,
343
+ const float weight1,
344
+ float* const __restrict outputAccum) {
345
+ // the coarse codes sit at the very start of each code
346
+ const uint8_t* const __restrict coarse0 = code0;
347
+ const uint8_t* const __restrict coarse1 = code1;
348
+
349
+ // the fine codes follow the N_COARSE_ELEMENTS_BYTES bytes of coarse codes
350
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
351
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
352
+
353
+ // FINE_SIZE is 4, so the 8 floats handled here take, per sample, one coarse code and two consecutive fine codes.
354
+ // UintReader is defined elsewhere; presumably get() extracts the packed code with the given index (confirm).
355
+
356
+ const intptr_t coarseCode0 = detail::
357
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
358
+ get(coarse0);
359
+ const intptr_t fineCode0a = detail::
360
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
361
+ fine0);
362
+ const intptr_t fineCode0b = detail::
363
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
364
+ fine0);
365
+ const intptr_t coarseCode1 = detail::
366
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
367
+ get(coarse1);
368
+ const intptr_t fineCode1a = detail::
369
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
370
+ fine1);
371
+ const intptr_t fineCode1b = detail::
372
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
373
+ fine1);
374
+ // current contents of the 8 output floats
375
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
376
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
377
+ // contribution of sample 0: existing + weight0 * (coarse + fine)
378
+ auto existingValue = elementaryBlock4x2bAccum(
379
+ pqCoarseCentroids0 +
380
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
381
+ COARSE_SIZE +
382
+ coarseCentroidOffset,
383
+ pqFineCentroids0 +
384
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
385
+ fineCode0a) *
386
+ FINE_SIZE +
387
+ fineCentroidOffset,
388
+ pqFineCentroids0 +
389
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
390
+ fineCode0b) *
391
+ FINE_SIZE +
392
+ fineCentroidOffset,
393
+ weight0,
394
+ {existingValue0, existingValue1});
395
+ // contribution of sample 1, added on top of the running value
396
+ existingValue = elementaryBlock4x2bAccum(
397
+ pqCoarseCentroids1 +
398
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
399
+ COARSE_SIZE +
400
+ coarseCentroidOffset,
401
+ pqFineCentroids1 +
402
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
403
+ fineCode1a) *
404
+ FINE_SIZE +
405
+ fineCentroidOffset,
406
+ pqFineCentroids1 +
407
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
408
+ fineCode1b) *
409
+ FINE_SIZE +
410
+ fineCentroidOffset,
411
+ weight1,
412
+ existingValue);
413
+
414
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]); // floats CPOS .. CPOS + 3
415
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); // floats CPOS + 4 .. CPOS + 7
416
+
417
+ // hand over to the decoder instantiated for the block that starts 8 floats further on
418
+ Index2LevelDecoderImpl<
419
+ DIM,
420
+ COARSE_SIZE,
421
+ FINE_SIZE,
422
+ COARSE_BITS,
423
+ FINE_BITS,
424
+ CPOS + 8>::
425
+ accum(pqCoarseCentroids0,
426
+ pqFineCentroids0,
427
+ code0,
428
+ weight0,
429
+ pqCoarseCentroids1,
430
+ pqFineCentroids1,
431
+ code1,
432
+ weight1,
433
+ outputAccum);
434
+ }
435
+
436
+ // Process 2 samples: floats [CPOS, CPOS + 8) of outputAccum receive both weighted contributions, then the call recurses with CPOS + 8.
437
+ // Coarse pq centroids table and fine pq centroids table are shared among codes; each code keeps its own weight.
438
+ // The output block is loaded once, updated for code0 then code1, and stored once.
439
+ static void accum(
440
+ const float* const __restrict pqCoarseCentroids,
441
+ const float* const __restrict pqFineCentroids,
442
+ const uint8_t* const __restrict code0,
443
+ const float weight0,
444
+ const uint8_t* const __restrict code1,
445
+ const float weight1,
446
+ float* const __restrict outputAccum) {
447
+ // the coarse codes sit at the very start of each code
448
+ const uint8_t* const __restrict coarse0 = code0;
449
+ const uint8_t* const __restrict coarse1 = code1;
450
+
451
+ // the fine codes follow the N_COARSE_ELEMENTS_BYTES bytes of coarse codes
452
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
453
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
454
+
455
+ // FINE_SIZE is 4, so the 8 floats handled here take, per sample, one coarse code and two consecutive fine codes.
456
+ // UintReader is defined elsewhere; presumably get() extracts the packed code with the given index (confirm).
457
+
458
+ const intptr_t coarseCode0 = detail::
459
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
460
+ get(coarse0);
461
+ const intptr_t fineCode0a = detail::
462
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
463
+ fine0);
464
+ const intptr_t fineCode0b = detail::
465
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
466
+ fine0);
467
+ const intptr_t coarseCode1 = detail::
468
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
469
+ get(coarse1);
470
+ const intptr_t fineCode1a = detail::
471
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
472
+ fine1);
473
+ const intptr_t fineCode1b = detail::
474
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
475
+ fine1);
476
+ // current contents of the 8 output floats
477
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
478
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
479
+ // contribution of sample 0: existing + weight0 * (coarse + fine)
480
+ auto existingValue = elementaryBlock4x2bAccum(
481
+ pqCoarseCentroids +
482
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
483
+ COARSE_SIZE +
484
+ coarseCentroidOffset,
485
+ pqFineCentroids +
486
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
487
+ fineCode0a) *
488
+ FINE_SIZE +
489
+ fineCentroidOffset,
490
+ pqFineCentroids +
491
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
492
+ fineCode0b) *
493
+ FINE_SIZE +
494
+ fineCentroidOffset,
495
+ weight0,
496
+ {existingValue0, existingValue1});
497
+ // contribution of sample 1, added on top of the running value
498
+ existingValue = elementaryBlock4x2bAccum(
499
+ pqCoarseCentroids +
500
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
501
+ COARSE_SIZE +
502
+ coarseCentroidOffset,
503
+ pqFineCentroids +
504
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
505
+ fineCode1a) *
506
+ FINE_SIZE +
507
+ fineCentroidOffset,
508
+ pqFineCentroids +
509
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
510
+ fineCode1b) *
511
+ FINE_SIZE +
512
+ fineCentroidOffset,
513
+ weight1,
514
+ existingValue);
515
+
516
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]); // floats CPOS .. CPOS + 3
517
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); // floats CPOS + 4 .. CPOS + 7
518
+
519
+ // hand over to the decoder instantiated for the block that starts 8 floats further on
520
+ Index2LevelDecoderImpl<
521
+ DIM,
522
+ COARSE_SIZE,
523
+ FINE_SIZE,
524
+ COARSE_BITS,
525
+ FINE_BITS,
526
+ CPOS + 8>::
527
+ accum(pqCoarseCentroids,
528
+ pqFineCentroids,
529
+ code0,
530
+ weight0,
531
+ code1,
532
+ weight1,
533
+ outputAccum);
534
+ }
535
+
536
+ // Process 3 samples: floats [CPOS, CPOS + 8) of outputAccum receive all three weighted contributions, then the call recurses with CPOS + 8.
537
+ // Each code is decoded with its own coarse and fine pq centroids tables (code0 with tables 0, code1 with tables 1, code2 with tables 2)
538
+ // and scaled by its own weight; the output block is loaded once, updated for code0, code1, code2 in that order, and stored once.
539
+ static void accum(
540
+ const float* const __restrict pqCoarseCentroids0,
541
+ const float* const __restrict pqFineCentroids0,
542
+ const uint8_t* const __restrict code0,
543
+ const float weight0,
544
+ const float* const __restrict pqCoarseCentroids1,
545
+ const float* const __restrict pqFineCentroids1,
546
+ const uint8_t* const __restrict code1,
547
+ const float weight1,
548
+ const float* const __restrict pqCoarseCentroids2,
549
+ const float* const __restrict pqFineCentroids2,
550
+ const uint8_t* const __restrict code2,
551
+ const float weight2,
552
+ float* const __restrict outputAccum) {
553
+ // the coarse codes sit at the very start of each code
554
+ const uint8_t* const __restrict coarse0 = code0;
555
+ const uint8_t* const __restrict coarse1 = code1;
556
+ const uint8_t* const __restrict coarse2 = code2;
557
+
558
+ // the fine codes follow the N_COARSE_ELEMENTS_BYTES bytes of coarse codes
559
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
560
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
561
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
562
+
563
+ // FINE_SIZE is 4, so the 8 floats handled here take, per sample, one coarse code and two consecutive fine codes.
564
+ // UintReader is defined elsewhere; presumably get() extracts the packed code with the given index (confirm).
565
+
566
+ const intptr_t coarseCode0 = detail::
567
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
568
+ get(coarse0);
569
+ const intptr_t fineCode0a = detail::
570
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
571
+ fine0);
572
+ const intptr_t fineCode0b = detail::
573
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
574
+ fine0);
575
+ const intptr_t coarseCode1 = detail::
576
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
577
+ get(coarse1);
578
+ const intptr_t fineCode1a = detail::
579
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
580
+ fine1);
581
+ const intptr_t fineCode1b = detail::
582
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
583
+ fine1);
584
+ const intptr_t coarseCode2 = detail::
585
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
586
+ get(coarse2);
587
+ const intptr_t fineCode2a = detail::
588
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
589
+ fine2);
590
+ const intptr_t fineCode2b = detail::
591
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
592
+ fine2);
593
+ // current contents of the 8 output floats
594
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
595
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
596
+ // contribution of sample 0: existing + weight0 * (coarse + fine)
597
+ auto existingValue = elementaryBlock4x2bAccum(
598
+ pqCoarseCentroids0 +
599
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
600
+ COARSE_SIZE +
601
+ coarseCentroidOffset,
602
+ pqFineCentroids0 +
603
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
604
+ fineCode0a) *
605
+ FINE_SIZE +
606
+ fineCentroidOffset,
607
+ pqFineCentroids0 +
608
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
609
+ fineCode0b) *
610
+ FINE_SIZE +
611
+ fineCentroidOffset,
612
+ weight0,
613
+ {existingValue0, existingValue1});
614
+ // contribution of sample 1, added on top of the running value
615
+ existingValue = elementaryBlock4x2bAccum(
616
+ pqCoarseCentroids1 +
617
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
618
+ COARSE_SIZE +
619
+ coarseCentroidOffset,
620
+ pqFineCentroids1 +
621
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
622
+ fineCode1a) *
623
+ FINE_SIZE +
624
+ fineCentroidOffset,
625
+ pqFineCentroids1 +
626
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
627
+ fineCode1b) *
628
+ FINE_SIZE +
629
+ fineCentroidOffset,
630
+ weight1,
631
+ existingValue);
632
+ // contribution of sample 2, added on top of the running value
633
+ existingValue = elementaryBlock4x2bAccum(
634
+ pqCoarseCentroids2 +
635
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
636
+ COARSE_SIZE +
637
+ coarseCentroidOffset,
638
+ pqFineCentroids2 +
639
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
640
+ fineCode2a) *
641
+ FINE_SIZE +
642
+ fineCentroidOffset,
643
+ pqFineCentroids2 +
644
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
645
+ fineCode2b) *
646
+ FINE_SIZE +
647
+ fineCentroidOffset,
648
+ weight2,
649
+ existingValue);
650
+
651
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]); // floats CPOS .. CPOS + 3
652
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); // floats CPOS + 4 .. CPOS + 7
653
+
654
+ // hand over to the decoder instantiated for the block that starts 8 floats further on
655
+ Index2LevelDecoderImpl<
656
+ DIM,
657
+ COARSE_SIZE,
658
+ FINE_SIZE,
659
+ COARSE_BITS,
660
+ FINE_BITS,
661
+ CPOS + 8>::
662
+ accum(pqCoarseCentroids0,
663
+ pqFineCentroids0,
664
+ code0,
665
+ weight0,
666
+ pqCoarseCentroids1,
667
+ pqFineCentroids1,
668
+ code1,
669
+ weight1,
670
+ pqCoarseCentroids2,
671
+ pqFineCentroids2,
672
+ code2,
673
+ weight2,
674
+ outputAccum);
675
+ }
676
+
677
+ // Process 3 samples: fold code0, code1 and code2 (with weight0..2) into the 8 floats at outputAccum + CPOS, then recurse with CPOS + 8.
678
+ // The coarse pq centroids table and the fine pq centroids table are shared by
679
+ // all three codes.
680
+ static void accum(
681
+ const float* const __restrict pqCoarseCentroids,
682
+ const float* const __restrict pqFineCentroids,
683
+ const uint8_t* const __restrict code0,
684
+ const float weight0,
685
+ const uint8_t* const __restrict code1,
686
+ const float weight1,
687
+ const uint8_t* const __restrict code2,
688
+ const float weight2,
689
+ float* const __restrict outputAccum) {
690
+ // coarse quantizer: coarse codes sit at the start of each code buffer
691
+ const uint8_t* const __restrict coarse0 = code0;
692
+ const uint8_t* const __restrict coarse1 = code1;
693
+ const uint8_t* const __restrict coarse2 = code2;
694
+
695
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into each buffer
696
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
697
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
698
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
699
+
700
+ // process chunks of 4 floats,
701
+ // two per step (fineCentroidIdx + 0 and + 1), i.e. 8 floats per step
702
+
703
+ const intptr_t coarseCode0 = detail::
704
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
705
+ get(coarse0);
706
+ const intptr_t fineCode0a = detail::
707
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
708
+ fine0);
709
+ const intptr_t fineCode0b = detail::
710
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
711
+ fine0);
712
+ const intptr_t coarseCode1 = detail::
713
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
714
+ get(coarse1);
715
+ const intptr_t fineCode1a = detail::
716
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
717
+ fine1);
718
+ const intptr_t fineCode1b = detail::
719
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
720
+ fine1);
721
+ const intptr_t coarseCode2 = detail::
722
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
723
+ get(coarse2);
724
+ const intptr_t fineCode2a = detail::
725
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
726
+ fine2);
727
+ const intptr_t fineCode2b = detail::
728
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
729
+ fine2);
730
+
731
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
732
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
733
+
734
+ auto existingValue = elementaryBlock4x2bAccum(
735
+ pqCoarseCentroids +
736
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
737
+ COARSE_SIZE +
738
+ coarseCentroidOffset,
739
+ pqFineCentroids +
740
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
741
+ fineCode0a) *
742
+ FINE_SIZE +
743
+ fineCentroidOffset,
744
+ pqFineCentroids +
745
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
746
+ fineCode0b) *
747
+ FINE_SIZE +
748
+ fineCentroidOffset,
749
+ weight0,
750
+ {existingValue0, existingValue1});
751
+
752
+ existingValue = elementaryBlock4x2bAccum(
753
+ pqCoarseCentroids +
754
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
755
+ COARSE_SIZE +
756
+ coarseCentroidOffset,
757
+ pqFineCentroids +
758
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
759
+ fineCode1a) *
760
+ FINE_SIZE +
761
+ fineCentroidOffset,
762
+ pqFineCentroids +
763
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
764
+ fineCode1b) *
765
+ FINE_SIZE +
766
+ fineCentroidOffset,
767
+ weight1,
768
+ existingValue);
769
+
770
+ existingValue = elementaryBlock4x2bAccum(
771
+ pqCoarseCentroids +
772
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
773
+ COARSE_SIZE +
774
+ coarseCentroidOffset,
775
+ pqFineCentroids +
776
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
777
+ fineCode2a) *
778
+ FINE_SIZE +
779
+ fineCentroidOffset,
780
+ pqFineCentroids +
781
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
782
+ fineCode2b) *
783
+ FINE_SIZE +
784
+ fineCentroidOffset,
785
+ weight2,
786
+ existingValue);
787
+
788
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
789
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
790
+
791
+ // next: continue with the decoder step for CPOS + 8
792
+ Index2LevelDecoderImpl<
793
+ DIM,
794
+ COARSE_SIZE,
795
+ FINE_SIZE,
796
+ COARSE_BITS,
797
+ FINE_BITS,
798
+ CPOS + 8>::
799
+ accum(pqCoarseCentroids,
800
+ pqFineCentroids,
801
+ code0,
802
+ weight0,
803
+ code1,
804
+ weight1,
805
+ code2,
806
+ weight2,
807
+ outputAccum);
808
+ }
809
+ };
810
+
811
+ template <
812
+ intptr_t DIM,
813
+ intptr_t COARSE_SIZE,
814
+ intptr_t FINE_SIZE,
815
+ intptr_t COARSE_BITS,
816
+ intptr_t FINE_BITS,
817
+ intptr_t CPOS>
818
+ struct Index2LevelDecoderImpl<
819
+ DIM,
820
+ COARSE_SIZE,
821
+ FINE_SIZE,
822
+ COARSE_BITS,
823
+ FINE_BITS,
824
+ CPOS,
825
+ false,
826
+ true,
827
+ true,
828
+ false> { // decoder step that handles 8 floats at position CPOS using one coarse code and one fine code (elementaryBlock8x1b helpers), then recurses with CPOS + 8
829
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE;
830
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE;
831
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
832
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
833
+
834
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; // not referenced by any method of this struct
835
+
836
+ // number of rows in each coarse sub-table (2^COARSE_BITS); used below as a row count, not a byte count
837
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
838
+
839
+ // coarse quantizer bytes start from offset 0 of a code
840
+ // fine quantizer bytes start from offset N_COARSE_ELEMENTS_BYTES of a code
841
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
842
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
843
+ N_COARSE_ELEMENTS * COARSE_BITS;
844
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
845
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
846
+
847
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); // rows per fine sub-table (2^FINE_BITS)
848
+
849
+ // Process 1 sample: write the elementaryBlock8x1b result for code0 to the 8 floats at outputStore + CPOS, then recurse with CPOS + 8.
850
+ static void store(
851
+ const float* const __restrict pqCoarseCentroids0,
852
+ const float* const __restrict pqFineCentroids0,
853
+ const uint8_t* const __restrict code0,
854
+ float* const __restrict outputStore) {
855
+ // coarse quantizer: coarse codes sit at the start of the code buffer
856
+ const uint8_t* const __restrict coarse0 = code0;
857
+
858
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into the buffer
859
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
860
+
861
+ // process one chunk of 8 floats: one coarse code and one fine code select the rows
862
+
863
+ const intptr_t coarseCode0 = detail::
864
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
865
+ get(coarse0);
866
+ const intptr_t fineCode0 =
867
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
868
+ get(fine0);
869
+
870
+ const auto storeValue = elementaryBlock8x1b(
871
+ pqCoarseCentroids0 +
872
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
873
+ COARSE_SIZE +
874
+ coarseCentroidOffset,
875
+ pqFineCentroids0 +
876
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
877
+ FINE_SIZE +
878
+ fineCentroidOffset);
879
+
880
+ vst1q_f32(outputStore + CPOS, storeValue.val[0]);
881
+ vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]);
882
+
883
+ // next: continue with the decoder step for CPOS + 8
884
+ Index2LevelDecoderImpl<
885
+ DIM,
886
+ COARSE_SIZE,
887
+ FINE_SIZE,
888
+ COARSE_BITS,
889
+ FINE_BITS,
890
+ CPOS + 8>::
891
+ store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore);
892
+ }
893
+
894
+ // Process 1 sample: fold code0 (with weight0) into the 8 floats at outputAccum + CPOS, then recurse with CPOS + 8.
895
+ static void accum(
896
+ const float* const __restrict pqCoarseCentroids0,
897
+ const float* const __restrict pqFineCentroids0,
898
+ const uint8_t* const __restrict code0,
899
+ const float weight0,
900
+ float* const __restrict outputAccum) {
901
+ // coarse quantizer: coarse codes sit at the start of the code buffer
902
+ const uint8_t* const __restrict coarse0 = code0;
903
+
904
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into the buffer
905
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
906
+
907
+ // process one chunk of 8 floats: one coarse code and one fine code select the rows
908
+
909
+ const intptr_t coarseCode0 = detail::
910
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
911
+ get(coarse0);
912
+ const intptr_t fineCode0 =
913
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
914
+ get(fine0);
915
+
916
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
917
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
918
+
919
+ const auto existingValue = elementaryBlock8x1bAccum(
920
+ pqCoarseCentroids0 +
921
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
922
+ COARSE_SIZE +
923
+ coarseCentroidOffset,
924
+ pqFineCentroids0 +
925
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
926
+ FINE_SIZE +
927
+ fineCentroidOffset,
928
+ weight0,
929
+ {existingValue0, existingValue1});
930
+
931
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
932
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
933
+
934
+ // next: continue with the decoder step for CPOS + 8
935
+ Index2LevelDecoderImpl<
936
+ DIM,
937
+ COARSE_SIZE,
938
+ FINE_SIZE,
939
+ COARSE_BITS,
940
+ FINE_BITS,
941
+ CPOS + 8>::
942
+ accum(pqCoarseCentroids0,
943
+ pqFineCentroids0,
944
+ code0,
945
+ weight0,
946
+ outputAccum);
947
+ }
948
+
949
+ // Process 2 samples: fold code0 and code1 (with weight0/weight1) into the 8 floats at outputAccum + CPOS, then recurse with CPOS + 8.
950
+ // Each code uses its own coarse pq centroids table and its own fine pq
951
+ // centroids table.
952
+ static void accum(
953
+ const float* const __restrict pqCoarseCentroids0,
954
+ const float* const __restrict pqFineCentroids0,
955
+ const uint8_t* const __restrict code0,
956
+ const float weight0,
957
+ const float* const __restrict pqCoarseCentroids1,
958
+ const float* const __restrict pqFineCentroids1,
959
+ const uint8_t* const __restrict code1,
960
+ const float weight1,
961
+ float* const __restrict outputAccum) {
962
+ // coarse quantizer: coarse codes sit at the start of each code buffer
963
+ const uint8_t* const __restrict coarse0 = code0;
964
+ const uint8_t* const __restrict coarse1 = code1;
965
+
966
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into each buffer
967
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
968
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
969
+
970
+ // process one chunk of 8 floats: one coarse code and one fine code per sample
971
+
972
+ const intptr_t coarseCode0 = detail::
973
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
974
+ get(coarse0);
975
+ const intptr_t fineCode0 =
976
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
977
+ get(fine0);
978
+ const intptr_t coarseCode1 = detail::
979
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
980
+ get(coarse1);
981
+ const intptr_t fineCode1 =
982
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
983
+ get(fine1);
984
+
985
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
986
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
987
+
988
+ auto existingValue = elementaryBlock8x1bAccum(
989
+ pqCoarseCentroids0 +
990
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
991
+ COARSE_SIZE +
992
+ coarseCentroidOffset,
993
+ pqFineCentroids0 +
994
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
995
+ FINE_SIZE +
996
+ fineCentroidOffset,
997
+ weight0,
998
+ {existingValue0, existingValue1});
999
+
1000
+ existingValue = elementaryBlock8x1bAccum(
1001
+ pqCoarseCentroids1 +
1002
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1003
+ COARSE_SIZE +
1004
+ coarseCentroidOffset,
1005
+ pqFineCentroids1 +
1006
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1007
+ FINE_SIZE +
1008
+ fineCentroidOffset,
1009
+ weight1,
1010
+ existingValue);
1011
+
1012
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
1013
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
1014
+
1015
+ // next: continue with the decoder step for CPOS + 8
1016
+ Index2LevelDecoderImpl<
1017
+ DIM,
1018
+ COARSE_SIZE,
1019
+ FINE_SIZE,
1020
+ COARSE_BITS,
1021
+ FINE_BITS,
1022
+ CPOS + 8>::
1023
+ accum(pqCoarseCentroids0,
1024
+ pqFineCentroids0,
1025
+ code0,
1026
+ weight0,
1027
+ pqCoarseCentroids1,
1028
+ pqFineCentroids1,
1029
+ code1,
1030
+ weight1,
1031
+ outputAccum);
1032
+ }
1033
+
1034
+ // Process 2 samples: fold code0 and code1 (with weight0/weight1) into the 8 floats at outputAccum + CPOS, then recurse with CPOS + 8.
1035
+ // The coarse pq centroids table and the fine pq centroids table are shared by
1036
+ // both codes.
1037
+ static void accum(
1038
+ const float* const __restrict pqCoarseCentroids,
1039
+ const float* const __restrict pqFineCentroids,
1040
+ const uint8_t* const __restrict code0,
1041
+ const float weight0,
1042
+ const uint8_t* const __restrict code1,
1043
+ const float weight1,
1044
+ float* const __restrict outputAccum) {
1045
+ // coarse quantizer: coarse codes sit at the start of each code buffer
1046
+ const uint8_t* const __restrict coarse0 = code0;
1047
+ const uint8_t* const __restrict coarse1 = code1;
1048
+
1049
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into each buffer
1050
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1051
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1052
+
1053
+ // process one chunk of 8 floats: one coarse code and one fine code per sample
1054
+
1055
+ const intptr_t coarseCode0 = detail::
1056
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1057
+ get(coarse0);
1058
+ const intptr_t fineCode0 =
1059
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1060
+ get(fine0);
1061
+ const intptr_t coarseCode1 = detail::
1062
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1063
+ get(coarse1);
1064
+ const intptr_t fineCode1 =
1065
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1066
+ get(fine1);
1067
+
1068
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
1069
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
1070
+
1071
+ auto existingValue = elementaryBlock8x1bAccum(
1072
+ pqCoarseCentroids +
1073
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1074
+ COARSE_SIZE +
1075
+ coarseCentroidOffset,
1076
+ pqFineCentroids +
1077
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1078
+ FINE_SIZE +
1079
+ fineCentroidOffset,
1080
+ weight0,
1081
+ {existingValue0, existingValue1});
1082
+
1083
+ existingValue = elementaryBlock8x1bAccum(
1084
+ pqCoarseCentroids +
1085
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1086
+ COARSE_SIZE +
1087
+ coarseCentroidOffset,
1088
+ pqFineCentroids +
1089
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1090
+ FINE_SIZE +
1091
+ fineCentroidOffset,
1092
+ weight1,
1093
+ existingValue);
1094
+
1095
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
1096
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
1097
+
1098
+ // next: continue with the decoder step for CPOS + 8
1099
+ Index2LevelDecoderImpl<
1100
+ DIM,
1101
+ COARSE_SIZE,
1102
+ FINE_SIZE,
1103
+ COARSE_BITS,
1104
+ FINE_BITS,
1105
+ CPOS + 8>::
1106
+ accum(pqCoarseCentroids,
1107
+ pqFineCentroids,
1108
+ code0,
1109
+ weight0,
1110
+ code1,
1111
+ weight1,
1112
+ outputAccum);
1113
+ }
1114
+
1115
+ // Process 3 samples: fold code0, code1 and code2 (with weight0..2) into the 8 floats at outputAccum + CPOS, then recurse with CPOS + 8.
1116
+ // Each code uses its own coarse pq centroids table and its own fine pq
1117
+ // centroids table.
1118
+ static void accum(
1119
+ const float* const __restrict pqCoarseCentroids0,
1120
+ const float* const __restrict pqFineCentroids0,
1121
+ const uint8_t* const __restrict code0,
1122
+ const float weight0,
1123
+ const float* const __restrict pqCoarseCentroids1,
1124
+ const float* const __restrict pqFineCentroids1,
1125
+ const uint8_t* const __restrict code1,
1126
+ const float weight1,
1127
+ const float* const __restrict pqCoarseCentroids2,
1128
+ const float* const __restrict pqFineCentroids2,
1129
+ const uint8_t* const __restrict code2,
1130
+ const float weight2,
1131
+ float* const __restrict outputAccum) {
1132
+ // coarse quantizer: coarse codes sit at the start of each code buffer
1133
+ const uint8_t* const __restrict coarse0 = code0;
1134
+ const uint8_t* const __restrict coarse1 = code1;
1135
+ const uint8_t* const __restrict coarse2 = code2;
1136
+
1137
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into each buffer
1138
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1139
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1140
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1141
+
1142
+ // process one chunk of 8 floats: one coarse code and one fine code per sample
1143
+
1144
+ const intptr_t coarseCode0 = detail::
1145
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1146
+ get(coarse0);
1147
+ const intptr_t fineCode0 =
1148
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1149
+ get(fine0);
1150
+ const intptr_t coarseCode1 = detail::
1151
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1152
+ get(coarse1);
1153
+ const intptr_t fineCode1 =
1154
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1155
+ get(fine1);
1156
+ const intptr_t coarseCode2 = detail::
1157
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1158
+ get(coarse2);
1159
+ const intptr_t fineCode2 =
1160
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1161
+ get(fine2);
1162
+
1163
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
1164
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
1165
+
1166
+ auto existingValue = elementaryBlock8x1bAccum(
1167
+ pqCoarseCentroids0 +
1168
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1169
+ COARSE_SIZE +
1170
+ coarseCentroidOffset,
1171
+ pqFineCentroids0 +
1172
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1173
+ FINE_SIZE +
1174
+ fineCentroidOffset,
1175
+ weight0,
1176
+ {existingValue0, existingValue1});
1177
+
1178
+ existingValue = elementaryBlock8x1bAccum(
1179
+ pqCoarseCentroids1 +
1180
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1181
+ COARSE_SIZE +
1182
+ coarseCentroidOffset,
1183
+ pqFineCentroids1 +
1184
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1185
+ FINE_SIZE +
1186
+ fineCentroidOffset,
1187
+ weight1,
1188
+ existingValue);
1189
+
1190
+ existingValue = elementaryBlock8x1bAccum(
1191
+ pqCoarseCentroids2 +
1192
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
1193
+ COARSE_SIZE +
1194
+ coarseCentroidOffset,
1195
+ pqFineCentroids2 +
1196
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1197
+ FINE_SIZE +
1198
+ fineCentroidOffset,
1199
+ weight2,
1200
+ existingValue);
1201
+
1202
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
1203
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
1204
+
1205
+ // next: continue with the decoder step for CPOS + 8
1206
+ Index2LevelDecoderImpl<
1207
+ DIM,
1208
+ COARSE_SIZE,
1209
+ FINE_SIZE,
1210
+ COARSE_BITS,
1211
+ FINE_BITS,
1212
+ CPOS + 8>::
1213
+ accum(pqCoarseCentroids0,
1214
+ pqFineCentroids0,
1215
+ code0,
1216
+ weight0,
1217
+ pqCoarseCentroids1,
1218
+ pqFineCentroids1,
1219
+ code1,
1220
+ weight1,
1221
+ pqCoarseCentroids2,
1222
+ pqFineCentroids2,
1223
+ code2,
1224
+ weight2,
1225
+ outputAccum);
1226
+ }
1227
+
1228
+ // Process 3 samples: fold code0, code1 and code2 (with weight0..2) into the 8 floats at outputAccum + CPOS, then recurse with CPOS + 8.
1229
+ // The coarse pq centroids table and the fine pq centroids table are shared by
1230
+ // all three codes.
1231
+ static void accum(
1232
+ const float* const __restrict pqCoarseCentroids,
1233
+ const float* const __restrict pqFineCentroids,
1234
+ const uint8_t* const __restrict code0,
1235
+ const float weight0,
1236
+ const uint8_t* const __restrict code1,
1237
+ const float weight1,
1238
+ const uint8_t* const __restrict code2,
1239
+ const float weight2,
1240
+ float* const __restrict outputAccum) {
1241
+ // coarse quantizer: coarse codes sit at the start of each code buffer
1242
+ const uint8_t* const __restrict coarse0 = code0;
1243
+ const uint8_t* const __restrict coarse1 = code1;
1244
+ const uint8_t* const __restrict coarse2 = code2;
1245
+
1246
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into each buffer
1247
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1248
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1249
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1250
+
1251
+ // process one chunk of 8 floats: one coarse code and one fine code per sample
1252
+
1253
+ const intptr_t coarseCode0 = detail::
1254
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1255
+ get(coarse0);
1256
+ const intptr_t fineCode0 =
1257
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1258
+ get(fine0);
1259
+ const intptr_t coarseCode1 = detail::
1260
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1261
+ get(coarse1);
1262
+ const intptr_t fineCode1 =
1263
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1264
+ get(fine1);
1265
+ const intptr_t coarseCode2 = detail::
1266
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1267
+ get(coarse2);
1268
+ const intptr_t fineCode2 =
1269
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1270
+ get(fine2);
1271
+
1272
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
1273
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
1274
+
1275
+ auto existingValue = elementaryBlock8x1bAccum(
1276
+ pqCoarseCentroids +
1277
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1278
+ COARSE_SIZE +
1279
+ coarseCentroidOffset,
1280
+ pqFineCentroids +
1281
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1282
+ FINE_SIZE +
1283
+ fineCentroidOffset,
1284
+ weight0,
1285
+ {existingValue0, existingValue1});
1286
+
1287
+ existingValue = elementaryBlock8x1bAccum(
1288
+ pqCoarseCentroids +
1289
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1290
+ COARSE_SIZE +
1291
+ coarseCentroidOffset,
1292
+ pqFineCentroids +
1293
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1294
+ FINE_SIZE +
1295
+ fineCentroidOffset,
1296
+ weight1,
1297
+ existingValue);
1298
+
1299
+ existingValue = elementaryBlock8x1bAccum(
1300
+ pqCoarseCentroids +
1301
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
1302
+ COARSE_SIZE +
1303
+ coarseCentroidOffset,
1304
+ pqFineCentroids +
1305
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1306
+ FINE_SIZE +
1307
+ fineCentroidOffset,
1308
+ weight2,
1309
+ existingValue);
1310
+
1311
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
1312
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
1313
+
1314
+ // next: continue with the decoder step for CPOS + 8
1315
+ Index2LevelDecoderImpl<
1316
+ DIM,
1317
+ COARSE_SIZE,
1318
+ FINE_SIZE,
1319
+ COARSE_BITS,
1320
+ FINE_BITS,
1321
+ CPOS + 8>::
1322
+ accum(pqCoarseCentroids,
1323
+ pqFineCentroids,
1324
+ code0,
1325
+ weight0,
1326
+ code1,
1327
+ weight1,
1328
+ code2,
1329
+ weight2,
1330
+ outputAccum);
1331
+ }
1332
+ };
1333
+
1334
+ template <
1335
+ intptr_t DIM,
1336
+ intptr_t COARSE_SIZE,
1337
+ intptr_t FINE_SIZE,
1338
+ intptr_t COARSE_BITS,
1339
+ intptr_t FINE_BITS,
1340
+ intptr_t CPOS>
1341
+ struct Index2LevelDecoderImpl<
1342
+ DIM,
1343
+ COARSE_SIZE,
1344
+ FINE_SIZE,
1345
+ COARSE_BITS,
1346
+ FINE_BITS,
1347
+ CPOS,
1348
+ false,
1349
+ false,
1350
+ true,
1351
+ false> {
1352
+ static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE; // which coarse sub-table covers position CPOS
1353
+ static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE; // float offset of CPOS inside that coarse row
1354
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; // which fine sub-table covers position CPOS
1355
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; // float offset of CPOS inside that fine row
1356
+
1357
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; // floats left in the current fine row starting at CPOS
1358
+
1359
+ // number of rows in each coarse sub-table (2^COARSE_BITS); the methods use it as a row count, not a byte count
1360
+ static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS);
1361
+
1362
+ // coarse quantizer bytes start from offset 0 of a code
1363
+ // fine quantizer bytes start from offset N_COARSE_ELEMENTS_BYTES of a code
1364
+ static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE;
1365
+ static constexpr intptr_t N_COARSE_ELEMENTS_BITS =
1366
+ N_COARSE_ELEMENTS * COARSE_BITS;
1367
+ static constexpr intptr_t N_COARSE_ELEMENTS_BYTES =
1368
+ (N_COARSE_ELEMENTS_BITS + 7) / 8;
1369
+
1370
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); // rows per fine sub-table (2^FINE_BITS)
1371
+
1372
+ // Process 1 sample: write the elementaryBlock4x1b result for code0 to the 4 floats at outputStore + CPOS, then recurse with CPOS + 4.
1373
+ static void store(
1374
+ const float* const __restrict pqCoarseCentroids0,
1375
+ const float* const __restrict pqFineCentroids0,
1376
+ const uint8_t* const __restrict code0,
1377
+ float* const __restrict outputStore) {
1378
+ // coarse quantizer: coarse codes sit at the start of the code buffer
1379
+ const uint8_t* const __restrict coarse0 = code0;
1380
+
1381
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into the buffer
1382
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1383
+
1384
+ // process one chunk of 4 floats: one coarse code and one fine code select the rows
1385
+
1386
+ const intptr_t coarseCode0 = detail::
1387
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1388
+ get(coarse0);
1389
+ const intptr_t fineCode0 =
1390
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1391
+ get(fine0);
1392
+
1393
+ const auto storeValue = elementaryBlock4x1b(
1394
+ pqCoarseCentroids0 +
1395
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1396
+ COARSE_SIZE +
1397
+ coarseCentroidOffset,
1398
+ pqFineCentroids0 +
1399
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1400
+ FINE_SIZE +
1401
+ fineCentroidOffset);
1402
+
1403
+ vst1q_f32(outputStore + CPOS, storeValue);
1404
+
1405
+ // next: continue with the decoder step for CPOS + 4
1406
+ Index2LevelDecoderImpl<
1407
+ DIM,
1408
+ COARSE_SIZE,
1409
+ FINE_SIZE,
1410
+ COARSE_BITS,
1411
+ FINE_BITS,
1412
+ CPOS + 4>::
1413
+ store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore);
1414
+ }
1415
+
1416
+ // Process 1 sample: fold code0 (with weight0) into the 4 floats at outputAccum + CPOS, then recurse with CPOS + 4.
1417
+ static void accum(
1418
+ const float* const __restrict pqCoarseCentroids0,
1419
+ const float* const __restrict pqFineCentroids0,
1420
+ const uint8_t* const __restrict code0,
1421
+ const float weight0,
1422
+ float* const __restrict outputAccum) {
1423
+ // coarse quantizer: coarse codes sit at the start of the code buffer
1424
+ const uint8_t* const __restrict coarse0 = code0;
1425
+
1426
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into the buffer
1427
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1428
+
1429
+ // process one chunk of 4 floats: one coarse code and one fine code select the rows
1430
+
1431
+ const intptr_t coarseCode0 = detail::
1432
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1433
+ get(coarse0);
1434
+ const intptr_t fineCode0 =
1435
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1436
+ get(fine0);
1437
+
1438
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1439
+
1440
+ existingValue = elementaryBlock4x1bAccum(
1441
+ pqCoarseCentroids0 +
1442
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1443
+ COARSE_SIZE +
1444
+ coarseCentroidOffset,
1445
+ pqFineCentroids0 +
1446
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1447
+ FINE_SIZE +
1448
+ fineCentroidOffset,
1449
+ weight0,
1450
+ existingValue);
1451
+
1452
+ vst1q_f32(outputAccum + CPOS, existingValue);
1453
+
1454
+ // next: continue with the decoder step for CPOS + 4
1455
+ Index2LevelDecoderImpl<
1456
+ DIM,
1457
+ COARSE_SIZE,
1458
+ FINE_SIZE,
1459
+ COARSE_BITS,
1460
+ FINE_BITS,
1461
+ CPOS + 4>::
1462
+ accum(pqCoarseCentroids0,
1463
+ pqFineCentroids0,
1464
+ code0,
1465
+ weight0,
1466
+ outputAccum);
1467
+ }
1468
+
1469
+ // Process 2 samples: fold code0 and code1 (with weight0/weight1) into the 4 floats at outputAccum + CPOS, then recurse with CPOS + 4.
1470
+ // Each code uses its own coarse pq centroids table and its own fine pq
1471
+ // centroids table.
1472
+ static void accum(
1473
+ const float* const __restrict pqCoarseCentroids0,
1474
+ const float* const __restrict pqFineCentroids0,
1475
+ const uint8_t* const __restrict code0,
1476
+ const float weight0,
1477
+ const float* const __restrict pqCoarseCentroids1,
1478
+ const float* const __restrict pqFineCentroids1,
1479
+ const uint8_t* const __restrict code1,
1480
+ const float weight1,
1481
+ float* const __restrict outputAccum) {
1482
+ // coarse quantizer: coarse codes sit at the start of each code buffer
1483
+ const uint8_t* const __restrict coarse0 = code0;
1484
+ const uint8_t* const __restrict coarse1 = code1;
1485
+
1486
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into each buffer
1487
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1488
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1489
+
1490
+ // process one chunk of 4 floats: one coarse code and one fine code per sample
1491
+
1492
+ const intptr_t coarseCode0 = detail::
1493
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1494
+ get(coarse0);
1495
+ const intptr_t fineCode0 =
1496
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1497
+ get(fine0);
1498
+ const intptr_t coarseCode1 = detail::
1499
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1500
+ get(coarse1);
1501
+ const intptr_t fineCode1 =
1502
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1503
+ get(fine1);
1504
+
1505
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1506
+
1507
+ existingValue = elementaryBlock4x1bAccum(
1508
+ pqCoarseCentroids0 +
1509
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1510
+ COARSE_SIZE +
1511
+ coarseCentroidOffset,
1512
+ pqFineCentroids0 +
1513
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1514
+ FINE_SIZE +
1515
+ fineCentroidOffset,
1516
+ weight0,
1517
+ existingValue);
1518
+
1519
+ existingValue = elementaryBlock4x1bAccum(
1520
+ pqCoarseCentroids1 +
1521
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1522
+ COARSE_SIZE +
1523
+ coarseCentroidOffset,
1524
+ pqFineCentroids1 +
1525
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1526
+ FINE_SIZE +
1527
+ fineCentroidOffset,
1528
+ weight1,
1529
+ existingValue);
1530
+
1531
+ vst1q_f32(outputAccum + CPOS, existingValue);
1532
+
1533
+ // next: continue with the decoder step for CPOS + 4
1534
+ Index2LevelDecoderImpl<
1535
+ DIM,
1536
+ COARSE_SIZE,
1537
+ FINE_SIZE,
1538
+ COARSE_BITS,
1539
+ FINE_BITS,
1540
+ CPOS + 4>::
1541
+ accum(pqCoarseCentroids0,
1542
+ pqFineCentroids0,
1543
+ code0,
1544
+ weight0,
1545
+ pqCoarseCentroids1,
1546
+ pqFineCentroids1,
1547
+ code1,
1548
+ weight1,
1549
+ outputAccum);
1550
+ }
1551
+
1552
+ // Process 2 samples: fold code0 and code1 (with weight0/weight1) into the 4 floats at outputAccum + CPOS, then recurse with CPOS + 4.
1553
+ // The coarse pq centroids table and the fine pq centroids table are shared by
1554
+ // both codes.
1555
+ static void accum(
1556
+ const float* const __restrict pqCoarseCentroids,
1557
+ const float* const __restrict pqFineCentroids,
1558
+ const uint8_t* const __restrict code0,
1559
+ const float weight0,
1560
+ const uint8_t* const __restrict code1,
1561
+ const float weight1,
1562
+ float* const __restrict outputAccum) {
1563
+ // coarse quantizer: coarse codes sit at the start of each code buffer
1564
+ const uint8_t* const __restrict coarse0 = code0;
1565
+ const uint8_t* const __restrict coarse1 = code1;
1566
+
1567
+ // fine quantizer: fine codes start N_COARSE_ELEMENTS_BYTES into each buffer
1568
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1569
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1570
+
1571
+ // process one chunk of 4 floats: one coarse code and one fine code per sample
1572
+
1573
+ const intptr_t coarseCode0 = detail::
1574
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1575
+ get(coarse0);
1576
+ const intptr_t fineCode0 =
1577
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1578
+ get(fine0);
1579
+ const intptr_t coarseCode1 = detail::
1580
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1581
+ get(coarse1);
1582
+ const intptr_t fineCode1 =
1583
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1584
+ get(fine1);
1585
+
1586
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1587
+
1588
+ existingValue = elementaryBlock4x1bAccum(
1589
+ pqCoarseCentroids +
1590
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1591
+ COARSE_SIZE +
1592
+ coarseCentroidOffset,
1593
+ pqFineCentroids +
1594
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1595
+ FINE_SIZE +
1596
+ fineCentroidOffset,
1597
+ weight0,
1598
+ existingValue);
1599
+
1600
+ existingValue = elementaryBlock4x1bAccum(
1601
+ pqCoarseCentroids +
1602
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1603
+ COARSE_SIZE +
1604
+ coarseCentroidOffset,
1605
+ pqFineCentroids +
1606
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1607
+ FINE_SIZE +
1608
+ fineCentroidOffset,
1609
+ weight1,
1610
+ existingValue);
1611
+
1612
+ vst1q_f32(outputAccum + CPOS, existingValue);
1613
+
1614
+ // next: continue with the decoder step for CPOS + 4
1615
+ Index2LevelDecoderImpl<
1616
+ DIM,
1617
+ COARSE_SIZE,
1618
+ FINE_SIZE,
1619
+ COARSE_BITS,
1620
+ FINE_BITS,
1621
+ CPOS + 4>::
1622
+ accum(pqCoarseCentroids,
1623
+ pqFineCentroids,
1624
+ code0,
1625
+ weight0,
1626
+ code1,
1627
+ weight1,
1628
+ outputAccum);
1629
+ }
1630
+
1631
+ // Process 3 samples.
1632
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1633
+ // table.
1634
+ static void accum(
1635
+ const float* const __restrict pqCoarseCentroids0,
1636
+ const float* const __restrict pqFineCentroids0,
1637
+ const uint8_t* const __restrict code0,
1638
+ const float weight0,
1639
+ const float* const __restrict pqCoarseCentroids1,
1640
+ const float* const __restrict pqFineCentroids1,
1641
+ const uint8_t* const __restrict code1,
1642
+ const float weight1,
1643
+ const float* const __restrict pqCoarseCentroids2,
1644
+ const float* const __restrict pqFineCentroids2,
1645
+ const uint8_t* const __restrict code2,
1646
+ const float weight2,
1647
+ float* const __restrict outputAccum) {
1648
+ // coarse quantizer
1649
+ const uint8_t* const __restrict coarse0 = code0;
1650
+ const uint8_t* const __restrict coarse1 = code1;
1651
+ const uint8_t* const __restrict coarse2 = code2;
1652
+
1653
+ // fine quantizer
1654
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1655
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1656
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1657
+
1658
+ // process chunks, 4 float
1659
+
1660
+ const intptr_t coarseCode0 = detail::
1661
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1662
+ get(coarse0);
1663
+ const intptr_t fineCode0 =
1664
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1665
+ get(fine0);
1666
+ const intptr_t coarseCode1 = detail::
1667
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1668
+ get(coarse1);
1669
+ const intptr_t fineCode1 =
1670
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1671
+ get(fine1);
1672
+ const intptr_t coarseCode2 = detail::
1673
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1674
+ get(coarse2);
1675
+ const intptr_t fineCode2 =
1676
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1677
+ get(fine2);
1678
+
1679
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1680
+
1681
+ existingValue = elementaryBlock4x1bAccum(
1682
+ pqCoarseCentroids0 +
1683
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1684
+ COARSE_SIZE +
1685
+ coarseCentroidOffset,
1686
+ pqFineCentroids0 +
1687
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1688
+ FINE_SIZE +
1689
+ fineCentroidOffset,
1690
+ weight0,
1691
+ existingValue);
1692
+
1693
+ existingValue = elementaryBlock4x1bAccum(
1694
+ pqCoarseCentroids1 +
1695
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1696
+ COARSE_SIZE +
1697
+ coarseCentroidOffset,
1698
+ pqFineCentroids1 +
1699
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1700
+ FINE_SIZE +
1701
+ fineCentroidOffset,
1702
+ weight1,
1703
+ existingValue);
1704
+
1705
+ existingValue = elementaryBlock4x1bAccum(
1706
+ pqCoarseCentroids2 +
1707
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
1708
+ COARSE_SIZE +
1709
+ coarseCentroidOffset,
1710
+ pqFineCentroids2 +
1711
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1712
+ FINE_SIZE +
1713
+ fineCentroidOffset,
1714
+ weight2,
1715
+ existingValue);
1716
+
1717
+ vst1q_f32(outputAccum + CPOS, existingValue);
1718
+
1719
+ // next
1720
+ Index2LevelDecoderImpl<
1721
+ DIM,
1722
+ COARSE_SIZE,
1723
+ FINE_SIZE,
1724
+ COARSE_BITS,
1725
+ FINE_BITS,
1726
+ CPOS + 4>::
1727
+ accum(pqCoarseCentroids0,
1728
+ pqFineCentroids0,
1729
+ code0,
1730
+ weight0,
1731
+ pqCoarseCentroids1,
1732
+ pqFineCentroids1,
1733
+ code1,
1734
+ weight1,
1735
+ pqCoarseCentroids2,
1736
+ pqFineCentroids2,
1737
+ code2,
1738
+ weight2,
1739
+ outputAccum);
1740
+ }
1741
+
1742
+ // Process 3 samples.
1743
+ // Coarse pq centroids table and fine pq centroids table are shared among
1744
+ // codes.
1745
+ static void accum(
1746
+ const float* const __restrict pqCoarseCentroids,
1747
+ const float* const __restrict pqFineCentroids,
1748
+ const uint8_t* const __restrict code0,
1749
+ const float weight0,
1750
+ const uint8_t* const __restrict code1,
1751
+ const float weight1,
1752
+ const uint8_t* const __restrict code2,
1753
+ const float weight2,
1754
+ float* const __restrict outputAccum) {
1755
+ // coarse quantizer
1756
+ const uint8_t* const __restrict coarse0 = code0;
1757
+ const uint8_t* const __restrict coarse1 = code1;
1758
+ const uint8_t* const __restrict coarse2 = code2;
1759
+
1760
+ // fine quantizer
1761
+ const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
1762
+ const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
1763
+ const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
1764
+
1765
+ // process chunks, 4 float
1766
+
1767
+ const intptr_t coarseCode0 = detail::
1768
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1769
+ get(coarse0);
1770
+ const intptr_t fineCode0 =
1771
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1772
+ get(fine0);
1773
+ const intptr_t coarseCode1 = detail::
1774
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1775
+ get(coarse1);
1776
+ const intptr_t fineCode1 =
1777
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1778
+ get(fine1);
1779
+ const intptr_t coarseCode2 = detail::
1780
+ UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
1781
+ get(coarse2);
1782
+ const intptr_t fineCode2 =
1783
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1784
+ get(fine2);
1785
+
1786
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1787
+
1788
+ existingValue = elementaryBlock4x1bAccum(
1789
+ pqCoarseCentroids +
1790
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
1791
+ COARSE_SIZE +
1792
+ coarseCentroidOffset,
1793
+ pqFineCentroids +
1794
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1795
+ FINE_SIZE +
1796
+ fineCentroidOffset,
1797
+ weight0,
1798
+ existingValue);
1799
+
1800
+ existingValue = elementaryBlock4x1bAccum(
1801
+ pqCoarseCentroids +
1802
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
1803
+ COARSE_SIZE +
1804
+ coarseCentroidOffset,
1805
+ pqFineCentroids +
1806
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1807
+ FINE_SIZE +
1808
+ fineCentroidOffset,
1809
+ weight1,
1810
+ existingValue);
1811
+
1812
+ existingValue = elementaryBlock4x1bAccum(
1813
+ pqCoarseCentroids +
1814
+ (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
1815
+ COARSE_SIZE +
1816
+ coarseCentroidOffset,
1817
+ pqFineCentroids +
1818
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1819
+ FINE_SIZE +
1820
+ fineCentroidOffset,
1821
+ weight2,
1822
+ existingValue);
1823
+
1824
+ vst1q_f32(outputAccum + CPOS, existingValue);
1825
+
1826
+ // next
1827
+ Index2LevelDecoderImpl<
1828
+ DIM,
1829
+ COARSE_SIZE,
1830
+ FINE_SIZE,
1831
+ COARSE_BITS,
1832
+ FINE_BITS,
1833
+ CPOS + 4>::
1834
+ accum(pqCoarseCentroids,
1835
+ pqFineCentroids,
1836
+ code0,
1837
+ weight0,
1838
+ code1,
1839
+ weight1,
1840
+ code2,
1841
+ weight2,
1842
+ outputAccum);
1843
+ }
1844
+ };
1845
+
1846
+ // This partial specialization is expected to do nothing.
1847
+ template <
1848
+ intptr_t DIM,
1849
+ intptr_t COARSE_SIZE,
1850
+ intptr_t FINE_SIZE,
1851
+ intptr_t COARSE_BITS,
1852
+ intptr_t FINE_BITS,
1853
+ bool FINE_SIZE_EQ_4,
1854
+ bool QPOS_LEFT_GE_8,
1855
+ bool QPOS_LEFT_GE_4>
1856
+ struct Index2LevelDecoderImpl<
1857
+ DIM,
1858
+ COARSE_SIZE,
1859
+ FINE_SIZE,
1860
+ COARSE_BITS,
1861
+ FINE_BITS,
1862
+ DIM,
1863
+ FINE_SIZE_EQ_4,
1864
+ QPOS_LEFT_GE_8,
1865
+ QPOS_LEFT_GE_4,
1866
+ true> {
1867
+ // process 1 sample
1868
+ static void store(
1869
+ const float* const __restrict pqCoarseCentroids0,
1870
+ const float* const __restrict pqFineCentroids0,
1871
+ const uint8_t* const __restrict code0,
1872
+ float* const __restrict outputStore) {}
1873
+
1874
+ // process 1 sample
1875
+ static void accum(
1876
+ const float* const __restrict pqCoarseCentroids0,
1877
+ const float* const __restrict pqFineCentroids0,
1878
+ const uint8_t* const __restrict code0,
1879
+ const float weight0,
1880
+ float* const __restrict outputAccum) {}
1881
+
1882
+ // Process 2 samples.
1883
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1884
+ // table.
1885
+ static void accum(
1886
+ const float* const __restrict pqCoarseCentroids0,
1887
+ const float* const __restrict pqFineCentroids0,
1888
+ const uint8_t* const __restrict code0,
1889
+ const float weight0,
1890
+ const float* const __restrict pqCoarseCentroids1,
1891
+ const float* const __restrict pqFineCentroids1,
1892
+ const uint8_t* const __restrict code1,
1893
+ const float weight1,
1894
+ float* const __restrict outputAccum) {}
1895
+
1896
+ // Process 2 samples.
1897
+ // Coarse pq centroids table and fine pq centroids table are shared among
1898
+ // codes.
1899
+ static void accum(
1900
+ const float* const __restrict pqCoarseCentroids,
1901
+ const float* const __restrict pqFineCentroids,
1902
+ const uint8_t* const __restrict code0,
1903
+ const float weight0,
1904
+ const uint8_t* const __restrict code1,
1905
+ const float weight1,
1906
+ float* const __restrict outputAccum) {}
1907
+
1908
+ // Process 3 samples.
1909
+ // Each code uses its own coarse pq centroids table and fine pq centroids
1910
+ // table.
1911
+ static void accum(
1912
+ const float* const __restrict pqCoarseCentroids0,
1913
+ const float* const __restrict pqFineCentroids0,
1914
+ const uint8_t* const __restrict code0,
1915
+ const float weight0,
1916
+ const float* const __restrict pqCoarseCentroids1,
1917
+ const float* const __restrict pqFineCentroids1,
1918
+ const uint8_t* const __restrict code1,
1919
+ const float weight1,
1920
+ const float* const __restrict pqCoarseCentroids2,
1921
+ const float* const __restrict pqFineCentroids2,
1922
+ const uint8_t* const __restrict code2,
1923
+ const float weight2,
1924
+ float* const __restrict outputAccum) {}
1925
+
1926
+ // Process 3 samples.
1927
+ // Coarse pq centroids table and fine pq centroids table are shared among
1928
+ // codes.
1929
+ static void accum(
1930
+ const float* const __restrict pqCoarseCentroids,
1931
+ const float* const __restrict pqFineCentroids,
1932
+ const uint8_t* const __restrict code0,
1933
+ const float weight0,
1934
+ const uint8_t* const __restrict code1,
1935
+ const float weight1,
1936
+ const uint8_t* const __restrict code2,
1937
+ const float weight2,
1938
+ float* const __restrict outputAccum) {}
1939
+ };
1940
+ } // namespace
1941
+
1942
+ // Suitable for IVF256,PQ[1]x8
1943
+ // Suitable for Residual[1]x8,PQ[2]x8
1944
+ // Suitable for IVF[9-16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
1945
+ // Suitable for Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8)
1946
+ template <
1947
+ intptr_t DIM,
1948
+ intptr_t COARSE_SIZE,
1949
+ intptr_t FINE_SIZE,
1950
+ intptr_t COARSE_BITS = 8,
1951
+ intptr_t FINE_BITS = 8>
1952
+ struct Index2LevelDecoder {
1953
+ static_assert(
1954
+ COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 16,
1955
+ "Only 8, 10 or 16 bits are currently supported for COARSE_BITS");
1956
+ static_assert(
1957
+ FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16,
1958
+ "Only 8, 10 or 16 bits are currently supported for FINE_BITS");
1959
+
1960
+ static constexpr intptr_t dim = DIM;
1961
+ static constexpr intptr_t coarseSize = COARSE_SIZE;
1962
+ static constexpr intptr_t fineSize = FINE_SIZE;
1963
+ static constexpr intptr_t coarseBits = COARSE_BITS;
1964
+ static constexpr intptr_t fineBits = FINE_BITS;
1965
+
1966
+ // Process 1 sample.
1967
+ static void store(
1968
+ const float* const __restrict pqCoarseCentroids,
1969
+ const float* const __restrict pqFineCentroids,
1970
+ const uint8_t* const __restrict code,
1971
+ float* const __restrict outputStore) {
1972
+ Index2LevelDecoderImpl<
1973
+ DIM,
1974
+ COARSE_SIZE,
1975
+ FINE_SIZE,
1976
+ COARSE_BITS,
1977
+ FINE_BITS,
1978
+ 0>::
1979
+ store(pqCoarseCentroids, pqFineCentroids, code, outputStore);
1980
+ }
1981
+
1982
+ // Process 1 sample.
1983
+ // Performs outputAccum += weight * decoded(code)
1984
+ static void accum(
1985
+ const float* const __restrict pqCoarseCentroids,
1986
+ const float* const __restrict pqFineCentroids,
1987
+ const uint8_t* const __restrict code,
1988
+ const float weight,
1989
+ float* const __restrict outputAccum) {
1990
+ Index2LevelDecoderImpl<
1991
+ DIM,
1992
+ COARSE_SIZE,
1993
+ FINE_SIZE,
1994
+ COARSE_BITS,
1995
+ FINE_BITS,
1996
+ 0>::
1997
+ accum(pqCoarseCentroids,
1998
+ pqFineCentroids,
1999
+ code,
2000
+ weight,
2001
+ outputAccum);
2002
+ }
2003
+
2004
+ // Process 2 samples.
2005
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2006
+ // decoded(code1).
2007
+ //
2008
+ // Each code uses its own coarse pq centroids table and fine pq centroids
2009
+ // table.
2010
+ static void accum(
2011
+ const float* const __restrict pqCoarseCentroids0,
2012
+ const float* const __restrict pqFineCentroids0,
2013
+ const uint8_t* const __restrict code0,
2014
+ const float weight0,
2015
+ const float* const __restrict pqCoarseCentroids1,
2016
+ const float* const __restrict pqFineCentroids1,
2017
+ const uint8_t* const __restrict code1,
2018
+ const float weight1,
2019
+ float* const __restrict outputAccum) {
2020
+ Index2LevelDecoderImpl<
2021
+ DIM,
2022
+ COARSE_SIZE,
2023
+ FINE_SIZE,
2024
+ COARSE_BITS,
2025
+ FINE_BITS,
2026
+ 0>::
2027
+ accum(pqCoarseCentroids0,
2028
+ pqFineCentroids0,
2029
+ code0,
2030
+ weight0,
2031
+ pqCoarseCentroids1,
2032
+ pqFineCentroids1,
2033
+ code1,
2034
+ weight1,
2035
+ outputAccum);
2036
+ }
2037
+
2038
+ // Process 2 samples.
2039
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2040
+ // decoded(code1)
2041
+ //
2042
+ // Coarse pq centroids table and fine pq centroids table are shared among
2043
+ // codes.
2044
+ static void accum(
2045
+ const float* const __restrict pqCoarseCentroids,
2046
+ const float* const __restrict pqFineCentroids,
2047
+ const uint8_t* const __restrict code0,
2048
+ const float weight0,
2049
+ const uint8_t* const __restrict code1,
2050
+ const float weight1,
2051
+ float* const __restrict outputAccum) {
2052
+ Index2LevelDecoderImpl<
2053
+ DIM,
2054
+ COARSE_SIZE,
2055
+ FINE_SIZE,
2056
+ COARSE_BITS,
2057
+ FINE_BITS,
2058
+ 0>::
2059
+ accum(pqCoarseCentroids,
2060
+ pqFineCentroids,
2061
+ code0,
2062
+ weight0,
2063
+ code1,
2064
+ weight1,
2065
+ outputAccum);
2066
+ }
2067
+
2068
+ // Process 3 samples.
2069
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2070
+ // decoded(code1) + weight2 * decoded(code2)
2071
+ //
2072
+ // Each code uses its own coarse pq centroids table and fine pq centroids
2073
+ // table.
2074
+ static void accum(
2075
+ const float* const __restrict pqCoarseCentroids0,
2076
+ const float* const __restrict pqFineCentroids0,
2077
+ const uint8_t* const __restrict code0,
2078
+ const float weight0,
2079
+ const float* const __restrict pqCoarseCentroids1,
2080
+ const float* const __restrict pqFineCentroids1,
2081
+ const uint8_t* const __restrict code1,
2082
+ const float weight1,
2083
+ const float* const __restrict pqCoarseCentroids2,
2084
+ const float* const __restrict pqFineCentroids2,
2085
+ const uint8_t* const __restrict code2,
2086
+ const float weight2,
2087
+ float* const __restrict outputAccum) {
2088
+ Index2LevelDecoderImpl<
2089
+ DIM,
2090
+ COARSE_SIZE,
2091
+ FINE_SIZE,
2092
+ COARSE_BITS,
2093
+ FINE_BITS,
2094
+ 0>::
2095
+ accum(pqCoarseCentroids0,
2096
+ pqFineCentroids0,
2097
+ code0,
2098
+ weight0,
2099
+ pqCoarseCentroids1,
2100
+ pqFineCentroids1,
2101
+ code1,
2102
+ weight1,
2103
+ pqCoarseCentroids2,
2104
+ pqFineCentroids2,
2105
+ code2,
2106
+ weight2,
2107
+ outputAccum);
2108
+ }
2109
+
2110
+ // Process 3 samples.
2111
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
2112
+ // decoded(code1) + weight2 * decoded(code2)
2113
+ //
2114
+ // Coarse pq centroids table and fine pq centroids table are shared among
2115
+ // codes.
2116
+ static void accum(
2117
+ const float* const __restrict pqCoarseCentroids,
2118
+ const float* const __restrict pqFineCentroids,
2119
+ const uint8_t* const __restrict code0,
2120
+ const float weight0,
2121
+ const uint8_t* const __restrict code1,
2122
+ const float weight1,
2123
+ const uint8_t* const __restrict code2,
2124
+ const float weight2,
2125
+ float* const __restrict outputAccum) {
2126
+ Index2LevelDecoderImpl<
2127
+ DIM,
2128
+ COARSE_SIZE,
2129
+ FINE_SIZE,
2130
+ COARSE_BITS,
2131
+ FINE_BITS,
2132
+ 0>::
2133
+ accum(pqCoarseCentroids,
2134
+ pqFineCentroids,
2135
+ code0,
2136
+ weight0,
2137
+ code1,
2138
+ weight1,
2139
+ code2,
2140
+ weight2,
2141
+ outputAccum);
2142
+ }
2143
+ };
2144
+
2145
+ } // namespace cppcontrib
2146
+ } // namespace faiss
2147
+ #endif // LEVEL2_NEON_INL_H