faiss 0.2.4 → 0.2.6

Files changed (178)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +23 -21
  5. data/ext/faiss/extconf.rb +11 -0
  6. data/ext/faiss/index.cpp +17 -4
  7. data/ext/faiss/index_binary.cpp +6 -6
  8. data/ext/faiss/product_quantizer.cpp +4 -4
  9. data/lib/faiss/version.rb +1 -1
  10. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  11. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  12. data/vendor/faiss/faiss/IVFlib.h +26 -2
  13. data/vendor/faiss/faiss/Index.cpp +36 -3
  14. data/vendor/faiss/faiss/Index.h +43 -6
  15. data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
  16. data/vendor/faiss/faiss/Index2Layer.h +6 -1
  17. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  21. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  22. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  23. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  24. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  26. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  28. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  29. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  30. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  31. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  32. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  33. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  34. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  35. data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
  36. data/vendor/faiss/faiss/IndexFlat.h +7 -4
  37. data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
  38. data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
  39. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  40. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  41. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  42. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  43. data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
  44. data/vendor/faiss/faiss/IndexIVF.h +35 -16
  45. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  48. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  49. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  50. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  51. data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
  52. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  53. data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
  54. data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
  55. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  56. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  57. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  58. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  59. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
  60. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  61. data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
  62. data/vendor/faiss/faiss/IndexLSH.h +2 -1
  63. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  64. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  65. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
  66. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  67. data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
  68. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  69. data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
  70. data/vendor/faiss/faiss/IndexPQ.h +19 -5
  71. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  72. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  73. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  74. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  75. data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
  76. data/vendor/faiss/faiss/IndexRefine.h +4 -2
  77. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  78. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  79. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  80. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  81. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
  82. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
  83. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  84. data/vendor/faiss/faiss/IndexShards.h +2 -1
  85. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  86. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  87. data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
  88. data/vendor/faiss/faiss/VectorTransform.h +22 -4
  89. data/vendor/faiss/faiss/clone_index.cpp +23 -1
  90. data/vendor/faiss/faiss/clone_index.h +3 -0
  91. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  92. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  93. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  94. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  95. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  101. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  102. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  103. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
  104. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  105. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  106. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  107. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  108. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  109. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  110. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  111. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  112. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  113. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  114. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  116. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  118. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  119. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  120. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
  122. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  124. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  125. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  126. data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
  127. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  128. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  129. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
  131. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
  132. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  133. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  134. data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  136. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  138. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  139. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
  141. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
  142. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
  144. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  145. data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
  146. data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
  147. data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
  148. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  149. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  150. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  151. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  152. data/vendor/faiss/faiss/index_factory.cpp +196 -7
  153. data/vendor/faiss/faiss/index_io.h +5 -0
  154. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  155. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  156. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  157. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  158. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  159. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  160. data/vendor/faiss/faiss/utils/distances.cpp +380 -56
  161. data/vendor/faiss/faiss/utils/distances.h +113 -15
  162. data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
  163. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  164. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  165. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  166. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  167. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  168. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  169. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  170. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  171. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  172. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  173. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  174. data/vendor/faiss/faiss/utils/random.h +5 -0
  175. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  176. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  177. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  178. metadata +37 -3
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h
@@ -0,0 +1,1452 @@
1
+ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2
+ #ifndef PQ_NEON_INL_H
3
+ #define PQ_NEON_INL_H
4
+
5
+ #include <arm_neon.h>
6
+
7
+ #include <cstddef>
8
+ #include <cstdint>
9
+
10
+ #include <faiss/cppcontrib/detail/UintReader.h>
11
+
12
+ namespace faiss {
13
+ namespace cppcontrib {
14
+
15
+ ////////////////////////////////////////////////////////////////////////////////////
16
+ /// IndexPQDecoder
17
+ ////////////////////////////////////////////////////////////////////////////////////
18
+
19
+ namespace {
20
+
21
+ // Although the following functions are somewhat redundant, I'd like to keep the
22
+ // overall basic blocks similar to the ones from Index2LevelDecoder.
23
+ // A compiler will optimize away the redundant code.
24
+
25
+ // Processes 4 float values.
26
+ // Returns {
27
+ // [0..3] = *fine[0..3];
28
+ // }
29
+ inline float32x4_t elementaryBlock4x1b(const float* const __restrict fine) {
30
+ // load fine
31
+ const auto fineValue = vld1q_f32(fine);
32
+ return fineValue;
33
+ }
34
+
35
+ // Processes 4 float values.
36
+ // Returns {
37
+ // [0..3] = existingValue[0..3] + weight * (*fine[0..3]);
38
+ // }
39
+ inline float32x4_t elementaryBlock4x1bAccum(
40
+ const float* const __restrict fine,
41
+ const float weight,
42
+ const float32x4_t existingValue) {
43
+ const auto fineValue = elementaryBlock4x1b(fine);
44
+
45
+ // this operation is expected to be optimized by a compiler
46
+ const auto weightNeon = vdupq_n_f32(weight);
47
+ // do fma
48
+ return vfmaq_f32(existingValue, weightNeon, fineValue);
49
+ }
50
+
51
+ // Processes 8 float values.
52
+ // Returns {
53
+ // [0..3] = *fine0[0..3];
54
+ // [4..7] = *fine1[0..3];
55
+ // }
56
+ inline float32x4x2_t elementaryBlock4x2b(
57
+ const float* const __restrict fine0,
58
+ const float* const __restrict fine1) {
59
+ // load fine
60
+ const auto fineValue0 = vld1q_f32(fine0);
61
+ const auto fineValue1 = vld1q_f32(fine1);
62
+
63
+ return {fineValue0, fineValue1};
64
+ }
65
+
66
+ // Processes 8 float values.
67
+ // Returns {
68
+ // [0..3] = existingValue[0..3] + weight * (*fine0[0..3]);
69
+ // [4..7] = existingValue[4..7] + weight * (*fine1[0..3]);
70
+ // }
71
+ inline float32x4x2_t elementaryBlock4x2bAccum(
72
+ const float* const __restrict fine0,
73
+ const float* const __restrict fine1,
74
+ const float weight,
75
+ const float32x4x2_t existingValue) {
76
+ const auto fineValue = elementaryBlock4x2b(fine0, fine1);
77
+
78
+ // this operation is expected to be optimized by a compiler
79
+ const auto weightNeon = vdupq_n_f32(weight);
80
+ // do fma
81
+ const auto result0 =
82
+ vfmaq_f32(existingValue.val[0], weightNeon, fineValue.val[0]);
83
+ const auto result1 =
84
+ vfmaq_f32(existingValue.val[1], weightNeon, fineValue.val[1]);
85
+ return {result0, result1};
86
+ }
87
+
88
+ // Processes 8 float values.
89
+ // Returns {
90
+ // [0..7] = *fine[0..7];
91
+ // }
92
+ inline float32x4x2_t elementaryBlock8x1b(const float* const __restrict fine) {
93
+ // load fine
94
+ const auto fineValue0 = vld1q_f32(fine);
95
+ const auto fineValue1 = vld1q_f32(fine + 4);
96
+ return {fineValue0, fineValue1};
97
+ }
98
+
99
+ // Processes 8 float values.
100
+ // Returns {
101
+ // [0..7] = existingValue[0..7] + weight * (*fine[0..7]);
102
+ // }
103
+ inline float32x4x2_t elementaryBlock8x1bAccum(
104
+ const float* const __restrict fine,
105
+ const float weight,
106
+ const float32x4x2_t existingValue) {
107
+ const auto fineValue = elementaryBlock8x1b(fine);
108
+
109
+ // this operation is expected to be optimized by a compiler
110
+ const auto weightNeon = vdupq_n_f32(weight);
111
+ // do fma
112
+ const auto result0 =
113
+ vfmaq_f32(existingValue.val[0], weightNeon, fineValue.val[0]);
114
+ const auto result1 =
115
+ vfmaq_f32(existingValue.val[1], weightNeon, fineValue.val[1]);
116
+ return {result0, result1};
117
+ }
118
+
119
+ // The following code uses template-based for-loop unrolling,
120
+ // because the compiler does not do that on its own as needed.
121
+ // The idea is the following:
122
+ // template<int I, int MAX>
123
+ // struct Foo {
124
+ // static void bar() {
125
+ // doSomething(I);
126
+ // Foo<I + 1, MAX>::bar();
127
+ // }
128
+ // };
129
+ //
130
+ // template<int MAX>
131
+ // struct Foo<MAX, MAX> {
132
+ // static void bar() {}
133
+ // };
134
+ //
135
+ // Initiate the loop:
136
+ // Foo<0, MAX>::bar();
137
+
138
+ template <
139
+ intptr_t DIM,
140
+ intptr_t FINE_SIZE,
141
+ intptr_t FINE_BITS,
142
+ intptr_t CPOS,
143
+ bool FINE_SIZE_EQ_4 = FINE_SIZE == 4,
144
+ bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8),
145
+ bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4),
146
+ bool DIM_EQ_CPOS = DIM == CPOS>
147
+ struct IndexPQDecoderImpl;
148
+
149
+ template <
150
+ intptr_t DIM,
151
+ intptr_t CPOS,
152
+ intptr_t FINE_BITS,
153
+ bool QPOS_LEFT_GE_8,
154
+ bool QPOS_LEFT_GE_4>
155
+ struct IndexPQDecoderImpl<
156
+ DIM,
157
+ 4,
158
+ FINE_BITS,
159
+ CPOS,
160
+ true,
161
+ QPOS_LEFT_GE_8,
162
+ QPOS_LEFT_GE_4,
163
+ false> {
164
+ static constexpr intptr_t FINE_SIZE = 4;
165
+
166
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
167
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
168
+
169
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
170
+
171
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
172
+
173
+ // process 1 sample
174
+ static void store(
175
+ const float* const __restrict pqFineCentroids0,
176
+ const uint8_t* const __restrict code0,
177
+ float* const __restrict outputStore) {
178
+ // fine quantizer
179
+ const uint8_t* const __restrict fine0 = code0;
180
+
181
+ // process chunks, 4 float
182
+ // but 8 floats per loop
183
+
184
+ const intptr_t fineCode0a = detail::
185
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
186
+ fine0);
187
+ const intptr_t fineCode0b = detail::
188
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
189
+ fine0);
190
+
191
+ const auto storeValue = elementaryBlock4x2b(
192
+ pqFineCentroids0 +
193
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
194
+ fineCode0a) *
195
+ FINE_SIZE +
196
+ fineCentroidOffset,
197
+ pqFineCentroids0 +
198
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
199
+ fineCode0b) *
200
+ FINE_SIZE +
201
+ fineCentroidOffset);
202
+
203
+ vst1q_f32(outputStore + CPOS, storeValue.val[0]);
204
+ vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]);
205
+
206
+ // next
207
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
208
+ pqFineCentroids0, code0, outputStore);
209
+ }
210
+
211
+ // process 1 sample
212
+ static void accum(
213
+ const float* const __restrict pqFineCentroids0,
214
+ const uint8_t* const __restrict code0,
215
+ const float weight0,
216
+ float* const __restrict outputAccum) {
217
+ // fine quantizer
218
+ const uint8_t* const __restrict fine0 = code0;
219
+
220
+ // process chunks, 4 float
221
+ // but 8 floats per loop
222
+
223
+ const intptr_t fineCode0a = detail::
224
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
225
+ fine0);
226
+ const intptr_t fineCode0b = detail::
227
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
228
+ fine0);
229
+
230
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
231
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
232
+
233
+ auto existingValue = elementaryBlock4x2bAccum(
234
+ pqFineCentroids0 +
235
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
236
+ fineCode0a) *
237
+ FINE_SIZE +
238
+ fineCentroidOffset,
239
+ pqFineCentroids0 +
240
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
241
+ fineCode0b) *
242
+ FINE_SIZE +
243
+ fineCentroidOffset,
244
+ weight0,
245
+ {existingValue0, existingValue1});
246
+
247
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
248
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
249
+
250
+ // next
251
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
252
+ pqFineCentroids0, code0, weight0, outputAccum);
253
+ }
254
+
255
+ // Process 2 samples.
256
+ // Each code uses its own fine pq centroids table.
257
+ static void accum(
258
+ const float* const __restrict pqFineCentroids0,
259
+ const uint8_t* const __restrict code0,
260
+ const float weight0,
261
+ const float* const __restrict pqFineCentroids1,
262
+ const uint8_t* const __restrict code1,
263
+ const float weight1,
264
+ float* const __restrict outputAccum) {
265
+ // fine quantizer
266
+ const uint8_t* const __restrict fine0 = code0;
267
+ const uint8_t* const __restrict fine1 = code1;
268
+
269
+ // process chunks, 4 float
270
+ // but 8 floats per loop
271
+
272
+ const intptr_t fineCode0a = detail::
273
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
274
+ fine0);
275
+ const intptr_t fineCode0b = detail::
276
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
277
+ fine0);
278
+ const intptr_t fineCode1a = detail::
279
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
280
+ fine1);
281
+ const intptr_t fineCode1b = detail::
282
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
283
+ fine1);
284
+
285
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
286
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
287
+
288
+ auto existingValue = elementaryBlock4x2bAccum(
289
+ pqFineCentroids0 +
290
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
291
+ fineCode0a) *
292
+ FINE_SIZE +
293
+ fineCentroidOffset,
294
+ pqFineCentroids0 +
295
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
296
+ fineCode0b) *
297
+ FINE_SIZE +
298
+ fineCentroidOffset,
299
+ weight0,
300
+ {existingValue0, existingValue1});
301
+
302
+ existingValue = elementaryBlock4x2bAccum(
303
+ pqFineCentroids1 +
304
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
305
+ fineCode1a) *
306
+ FINE_SIZE +
307
+ fineCentroidOffset,
308
+ pqFineCentroids1 +
309
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
310
+ fineCode1b) *
311
+ FINE_SIZE +
312
+ fineCentroidOffset,
313
+ weight1,
314
+ existingValue);
315
+
316
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
317
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
318
+
319
+ // next
320
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
321
+ pqFineCentroids0,
322
+ code0,
323
+ weight0,
324
+ pqFineCentroids1,
325
+ code1,
326
+ weight1,
327
+ outputAccum);
328
+ }
329
+
330
+ // Process 2 samples.
331
+ // Fine pq centroids table is shared among codes.
332
+ static void accum(
333
+ const float* const __restrict pqFineCentroids,
334
+ const uint8_t* const __restrict code0,
335
+ const float weight0,
336
+ const uint8_t* const __restrict code1,
337
+ const float weight1,
338
+ float* const __restrict outputAccum) {
339
+ // fine quantizer
340
+ const uint8_t* const __restrict fine0 = code0;
341
+ const uint8_t* const __restrict fine1 = code1;
342
+
343
+ // process chunks, 4 float
344
+ // but 8 floats per loop
345
+
346
+ const intptr_t fineCode0a = detail::
347
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
348
+ fine0);
349
+ const intptr_t fineCode0b = detail::
350
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
351
+ fine0);
352
+ const intptr_t fineCode1a = detail::
353
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
354
+ fine1);
355
+ const intptr_t fineCode1b = detail::
356
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
357
+ fine1);
358
+
359
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
360
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
361
+
362
+ auto existingValue = elementaryBlock4x2bAccum(
363
+ pqFineCentroids +
364
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
365
+ fineCode0a) *
366
+ FINE_SIZE +
367
+ fineCentroidOffset,
368
+ pqFineCentroids +
369
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
370
+ fineCode0b) *
371
+ FINE_SIZE +
372
+ fineCentroidOffset,
373
+ weight0,
374
+ {existingValue0, existingValue1});
375
+
376
+ existingValue = elementaryBlock4x2bAccum(
377
+ pqFineCentroids +
378
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
379
+ fineCode1a) *
380
+ FINE_SIZE +
381
+ fineCentroidOffset,
382
+ pqFineCentroids +
383
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
384
+ fineCode1b) *
385
+ FINE_SIZE +
386
+ fineCentroidOffset,
387
+ weight1,
388
+ existingValue);
389
+
390
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
391
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
392
+
393
+ // next
394
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
395
+ pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
396
+ }
397
+
398
+ // Process 3 samples.
399
+ // Each code uses its own fine pq centroids table.
400
+ static void accum(
401
+ const float* const __restrict pqFineCentroids0,
402
+ const uint8_t* const __restrict code0,
403
+ const float weight0,
404
+ const float* const __restrict pqFineCentroids1,
405
+ const uint8_t* const __restrict code1,
406
+ const float weight1,
407
+ const float* const __restrict pqFineCentroids2,
408
+ const uint8_t* const __restrict code2,
409
+ const float weight2,
410
+ float* const __restrict outputAccum) {
411
+ // fine quantizer
412
+ const uint8_t* const __restrict fine0 = code0;
413
+ const uint8_t* const __restrict fine1 = code1;
414
+ const uint8_t* const __restrict fine2 = code2;
415
+
416
+ // process chunks, 4 float
417
+ // but 8 floats per loop
418
+
419
+ const intptr_t fineCode0a = detail::
420
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
421
+ fine0);
422
+ const intptr_t fineCode0b = detail::
423
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
424
+ fine0);
425
+ const intptr_t fineCode1a = detail::
426
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
427
+ fine1);
428
+ const intptr_t fineCode1b = detail::
429
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
430
+ fine1);
431
+ const intptr_t fineCode2a = detail::
432
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
433
+ fine2);
434
+ const intptr_t fineCode2b = detail::
435
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
436
+ fine2);
437
+
438
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
439
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
440
+
441
+ auto existingValue = elementaryBlock4x2bAccum(
442
+ pqFineCentroids0 +
443
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
444
+ fineCode0a) *
445
+ FINE_SIZE +
446
+ fineCentroidOffset,
447
+ pqFineCentroids0 +
448
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
449
+ fineCode0b) *
450
+ FINE_SIZE +
451
+ fineCentroidOffset,
452
+ weight0,
453
+ {existingValue0, existingValue1});
454
+
455
+ existingValue = elementaryBlock4x2bAccum(
456
+ pqFineCentroids1 +
457
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
458
+ fineCode1a) *
459
+ FINE_SIZE +
460
+ fineCentroidOffset,
461
+ pqFineCentroids1 +
462
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
463
+ fineCode1b) *
464
+ FINE_SIZE +
465
+ fineCentroidOffset,
466
+ weight1,
467
+ existingValue);
468
+
469
+ existingValue = elementaryBlock4x2bAccum(
470
+ pqFineCentroids2 +
471
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
472
+ fineCode2a) *
473
+ FINE_SIZE +
474
+ fineCentroidOffset,
475
+ pqFineCentroids2 +
476
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
477
+ fineCode2b) *
478
+ FINE_SIZE +
479
+ fineCentroidOffset,
480
+ weight2,
481
+ existingValue);
482
+
483
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
484
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
485
+
486
+ // next
487
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
488
+ pqFineCentroids0,
489
+ code0,
490
+ weight0,
491
+ pqFineCentroids1,
492
+ code1,
493
+ weight1,
494
+ pqFineCentroids2,
495
+ code2,
496
+ weight2,
497
+ outputAccum);
498
+ }
499
+
500
+ // Process 3 samples.
501
+ // Fine pq centroids table is shared among codes.
502
+ static void accum(
503
+ const float* const __restrict pqFineCentroids,
504
+ const uint8_t* const __restrict code0,
505
+ const float weight0,
506
+ const uint8_t* const __restrict code1,
507
+ const float weight1,
508
+ const uint8_t* const __restrict code2,
509
+ const float weight2,
510
+ float* const __restrict outputAccum) {
511
+ // fine quantizer
512
+ const uint8_t* const __restrict fine0 = code0;
513
+ const uint8_t* const __restrict fine1 = code1;
514
+ const uint8_t* const __restrict fine2 = code2;
515
+
516
+ // process chunks, 4 float
517
+ // but 8 floats per loop
518
+
519
+ const intptr_t fineCode0a = detail::
520
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
521
+ fine0);
522
+ const intptr_t fineCode0b = detail::
523
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
524
+ fine0);
525
+ const intptr_t fineCode1a = detail::
526
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
527
+ fine1);
528
+ const intptr_t fineCode1b = detail::
529
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
530
+ fine1);
531
+ const intptr_t fineCode2a = detail::
532
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
533
+ fine2);
534
+ const intptr_t fineCode2b = detail::
535
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
536
+ fine2);
537
+
538
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
539
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
540
+
541
+ auto existingValue = elementaryBlock4x2bAccum(
542
+ pqFineCentroids +
543
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
544
+ fineCode0a) *
545
+ FINE_SIZE +
546
+ fineCentroidOffset,
547
+ pqFineCentroids +
548
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
549
+ fineCode0b) *
550
+ FINE_SIZE +
551
+ fineCentroidOffset,
552
+ weight0,
553
+ {existingValue0, existingValue1});
554
+
555
+ existingValue = elementaryBlock4x2bAccum(
556
+ pqFineCentroids +
557
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
558
+ fineCode1a) *
559
+ FINE_SIZE +
560
+ fineCentroidOffset,
561
+ pqFineCentroids +
562
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
563
+ fineCode1b) *
564
+ FINE_SIZE +
565
+ fineCentroidOffset,
566
+ weight1,
567
+ existingValue);
568
+
569
+ existingValue = elementaryBlock4x2bAccum(
570
+ pqFineCentroids +
571
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
572
+ fineCode2a) *
573
+ FINE_SIZE +
574
+ fineCentroidOffset,
575
+ pqFineCentroids +
576
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
577
+ fineCode2b) *
578
+ FINE_SIZE +
579
+ fineCentroidOffset,
580
+ weight2,
581
+ existingValue);
582
+
583
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
584
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
585
+
586
+ // next
587
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
588
+ pqFineCentroids,
589
+ code0,
590
+ weight0,
591
+ code1,
592
+ weight1,
593
+ code2,
594
+ weight2,
595
+ outputAccum);
596
+ }
597
+ };
598
+
599
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
600
+ struct IndexPQDecoderImpl<
601
+ DIM,
602
+ FINE_SIZE,
603
+ FINE_BITS,
604
+ CPOS,
605
+ false,
606
+ true,
607
+ true,
608
+ false> {
609
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
610
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
611
+
612
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
613
+
614
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
615
+
616
+ // process 1 sample
617
+ static void store(
618
+ const float* const __restrict pqFineCentroids0,
619
+ const uint8_t* const __restrict code0,
620
+ float* const __restrict outputStore) {
621
+ // fine quantizer
622
+ const uint8_t* const __restrict fine0 = code0;
623
+
624
+ // process chunks, 8 float
625
+
626
+ const intptr_t fineCode0 =
627
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
628
+ get(fine0);
629
+
630
+ const auto storeValue = elementaryBlock8x1b(
631
+ pqFineCentroids0 +
632
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE +
633
+ fineCentroidOffset);
634
+
635
+ vst1q_f32(outputStore + CPOS, storeValue.val[0]);
636
+ vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]);
637
+
638
+ // next
639
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
640
+ pqFineCentroids0, code0, outputStore);
641
+ }
642
+
643
+ // process 1 sample
644
+ static void accum(
645
+ const float* const __restrict pqFineCentroids0,
646
+ const uint8_t* const __restrict code0,
647
+ const float weight0,
648
+ float* const __restrict outputAccum) {
649
+ // fine quantizer
650
+ const uint8_t* const __restrict fine0 = code0;
651
+
652
+ // process chunks, 8 float
653
+
654
+ const intptr_t fineCode0 =
655
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
656
+ get(fine0);
657
+
658
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
659
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
660
+
661
+ const auto existingValue = elementaryBlock8x1bAccum(
662
+ pqFineCentroids0 +
663
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
664
+ FINE_SIZE +
665
+ fineCentroidOffset,
666
+ weight0,
667
+ {existingValue0, existingValue1});
668
+
669
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
670
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
671
+
672
+ // next
673
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
674
+ pqFineCentroids0, code0, weight0, outputAccum);
675
+ }
676
+
677
+ // Process 2 samples.
678
+ // Each code uses its own fine pq centroids table.
679
+ static void accum(
680
+ const float* const __restrict pqFineCentroids0,
681
+ const uint8_t* const __restrict code0,
682
+ const float weight0,
683
+ const float* const __restrict pqFineCentroids1,
684
+ const uint8_t* const __restrict code1,
685
+ const float weight1,
686
+ float* const __restrict outputAccum) {
687
+ // fine quantizer
688
+ const uint8_t* const __restrict fine0 = code0;
689
+ const uint8_t* const __restrict fine1 = code1;
690
+
691
+ // process chunks, 8 float
692
+
693
+ const intptr_t fineCode0 =
694
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
695
+ get(fine0);
696
+ const intptr_t fineCode1 =
697
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
698
+ get(fine1);
699
+
700
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
701
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
702
+
703
+ auto existingValue = elementaryBlock8x1bAccum(
704
+ pqFineCentroids0 +
705
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
706
+ FINE_SIZE +
707
+ fineCentroidOffset,
708
+ weight0,
709
+ {existingValue0, existingValue1});
710
+
711
+ existingValue = elementaryBlock8x1bAccum(
712
+ pqFineCentroids1 +
713
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
714
+ FINE_SIZE +
715
+ fineCentroidOffset,
716
+ weight1,
717
+ existingValue);
718
+
719
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
720
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
721
+
722
+ // next
723
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
724
+ pqFineCentroids0,
725
+ code0,
726
+ weight0,
727
+ pqFineCentroids1,
728
+ code1,
729
+ weight1,
730
+ outputAccum);
731
+ }
732
+
733
+ // Process 2 samples.
734
+ // Fine pq centroids table is shared among codes.
735
+ static void accum(
736
+ const float* const __restrict pqFineCentroids,
737
+ const uint8_t* const __restrict code0,
738
+ const float weight0,
739
+ const uint8_t* const __restrict code1,
740
+ const float weight1,
741
+ float* const __restrict outputAccum) {
742
+ // fine quantizer
743
+ const uint8_t* const __restrict fine0 = code0;
744
+ const uint8_t* const __restrict fine1 = code1;
745
+
746
+ // process chunks, 8 float
747
+
748
+ const intptr_t fineCode0 =
749
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
750
+ get(fine0);
751
+ const intptr_t fineCode1 =
752
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
753
+ get(fine1);
754
+
755
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
756
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
757
+
758
+ auto existingValue = elementaryBlock8x1bAccum(
759
+ pqFineCentroids +
760
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
761
+ FINE_SIZE +
762
+ fineCentroidOffset,
763
+ weight0,
764
+ {existingValue0, existingValue1});
765
+
766
+ existingValue = elementaryBlock8x1bAccum(
767
+ pqFineCentroids +
768
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
769
+ FINE_SIZE +
770
+ fineCentroidOffset,
771
+ weight1,
772
+ existingValue);
773
+
774
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
775
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
776
+
777
+ // next
778
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
779
+ pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
780
+ }
781
+
782
+ // Process 3 samples.
783
+ // Each code uses its own fine pq centroids table.
784
+ static void accum(
785
+ const float* const __restrict pqFineCentroids0,
786
+ const uint8_t* const __restrict code0,
787
+ const float weight0,
788
+ const float* const __restrict pqFineCentroids1,
789
+ const uint8_t* const __restrict code1,
790
+ const float weight1,
791
+ const float* const __restrict pqFineCentroids2,
792
+ const uint8_t* const __restrict code2,
793
+ const float weight2,
794
+ float* const __restrict outputAccum) {
795
+ // fine quantizer
796
+ const uint8_t* const __restrict fine0 = code0;
797
+ const uint8_t* const __restrict fine1 = code1;
798
+ const uint8_t* const __restrict fine2 = code2;
799
+
800
+ // process chunks, 8 float
801
+
802
+ const intptr_t fineCode0 =
803
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
804
+ get(fine0);
805
+ const intptr_t fineCode1 =
806
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
807
+ get(fine1);
808
+ const intptr_t fineCode2 =
809
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
810
+ get(fine2);
811
+
812
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
813
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
814
+
815
+ auto existingValue = elementaryBlock8x1bAccum(
816
+ pqFineCentroids0 +
817
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
818
+ FINE_SIZE +
819
+ fineCentroidOffset,
820
+ weight0,
821
+ {existingValue0, existingValue1});
822
+
823
+ existingValue = elementaryBlock8x1bAccum(
824
+ pqFineCentroids1 +
825
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
826
+ FINE_SIZE +
827
+ fineCentroidOffset,
828
+ weight1,
829
+ existingValue);
830
+
831
+ existingValue = elementaryBlock8x1bAccum(
832
+ pqFineCentroids2 +
833
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
834
+ FINE_SIZE +
835
+ fineCentroidOffset,
836
+ weight2,
837
+ existingValue);
838
+
839
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
840
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
841
+
842
+ // next
843
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
844
+ pqFineCentroids0,
845
+ code0,
846
+ weight0,
847
+ pqFineCentroids1,
848
+ code1,
849
+ weight1,
850
+ pqFineCentroids2,
851
+ code2,
852
+ weight2,
853
+ outputAccum);
854
+ }
855
+
856
+ // Process 3 samples.
857
+ // Fine pq centroids table is shared among codes.
858
+ static void accum(
859
+ const float* const __restrict pqFineCentroids,
860
+ const uint8_t* const __restrict code0,
861
+ const float weight0,
862
+ const uint8_t* const __restrict code1,
863
+ const float weight1,
864
+ const uint8_t* const __restrict code2,
865
+ const float weight2,
866
+ float* const __restrict outputAccum) {
867
+ // fine quantizer
868
+ const uint8_t* const __restrict fine0 = code0;
869
+ const uint8_t* const __restrict fine1 = code1;
870
+ const uint8_t* const __restrict fine2 = code2;
871
+
872
+ // process chunks, 8 float
873
+
874
+ const intptr_t fineCode0 =
875
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
876
+ get(fine0);
877
+ const intptr_t fineCode1 =
878
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
879
+ get(fine1);
880
+ const intptr_t fineCode2 =
881
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
882
+ get(fine2);
883
+
884
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
885
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
886
+
887
+ auto existingValue = elementaryBlock8x1bAccum(
888
+ pqFineCentroids +
889
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
890
+ FINE_SIZE +
891
+ fineCentroidOffset,
892
+ weight0,
893
+ {existingValue0, existingValue1});
894
+
895
+ existingValue = elementaryBlock8x1bAccum(
896
+ pqFineCentroids +
897
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
898
+ FINE_SIZE +
899
+ fineCentroidOffset,
900
+ weight1,
901
+ existingValue);
902
+
903
+ existingValue = elementaryBlock8x1bAccum(
904
+ pqFineCentroids +
905
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
906
+ FINE_SIZE +
907
+ fineCentroidOffset,
908
+ weight2,
909
+ existingValue);
910
+
911
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
912
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
913
+
914
+ // next
915
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
916
+ pqFineCentroids,
917
+ code0,
918
+ weight0,
919
+ code1,
920
+ weight1,
921
+ code2,
922
+ weight2,
923
+ outputAccum);
924
+ }
925
+ };
926
+
927
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
928
+ struct IndexPQDecoderImpl<
929
+ DIM,
930
+ FINE_SIZE,
931
+ FINE_BITS,
932
+ CPOS,
933
+ false,
934
+ false,
935
+ true,
936
+ false> {
937
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
938
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
939
+
940
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
941
+
942
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
943
+
944
+ // process 1 sample
945
+ static void store(
946
+ const float* const __restrict pqFineCentroids0,
947
+ const uint8_t* const __restrict code0,
948
+ float* const __restrict outputStore) {
949
+ // fine quantizer
950
+ const uint8_t* const __restrict fine0 = code0;
951
+
952
+ // process chunks, 4 float
953
+
954
+ const intptr_t fineCode0 =
955
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
956
+ get(fine0);
957
+
958
+ const auto storeValue = elementaryBlock4x1b(
959
+ pqFineCentroids0 +
960
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE +
961
+ fineCentroidOffset);
962
+
963
+ vst1q_f32(outputStore + CPOS, storeValue);
964
+
965
+ // next
966
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::store(
967
+ pqFineCentroids0, code0, outputStore);
968
+ }
969
+
970
+ // process 1 sample
971
+ static void accum(
972
+ const float* const __restrict pqFineCentroids0,
973
+ const uint8_t* const __restrict code0,
974
+ const float weight0,
975
+ float* const __restrict outputAccum) {
976
+ // fine quantizer
977
+ const uint8_t* const __restrict fine0 = code0;
978
+
979
+ // process chunks, 4 float
980
+
981
+ const intptr_t fineCode0 =
982
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
983
+ get(fine0);
984
+
985
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
986
+
987
+ existingValue = elementaryBlock4x1bAccum(
988
+ pqFineCentroids0 +
989
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
990
+ FINE_SIZE +
991
+ fineCentroidOffset,
992
+ weight0,
993
+ existingValue);
994
+
995
+ vst1q_f32(outputAccum + CPOS, existingValue);
996
+
997
+ // next
998
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
999
+ pqFineCentroids0, code0, weight0, outputAccum);
1000
+ }
1001
+
1002
+ // Process 2 samples.
1003
+ // Each code uses its own fine pq centroids table.
1004
+ static void accum(
1005
+ const float* const __restrict pqFineCentroids0,
1006
+ const uint8_t* const __restrict code0,
1007
+ const float weight0,
1008
+ const float* const __restrict pqFineCentroids1,
1009
+ const uint8_t* const __restrict code1,
1010
+ const float weight1,
1011
+ float* const __restrict outputAccum) {
1012
+ // fine quantizer
1013
+ const uint8_t* const __restrict fine0 = code0;
1014
+ const uint8_t* const __restrict fine1 = code1;
1015
+
1016
+ // process chunks, 4 float
1017
+
1018
+ const intptr_t fineCode0 =
1019
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1020
+ get(fine0);
1021
+ const intptr_t fineCode1 =
1022
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1023
+ get(fine1);
1024
+
1025
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1026
+
1027
+ existingValue = elementaryBlock4x1bAccum(
1028
+ pqFineCentroids0 +
1029
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1030
+ FINE_SIZE +
1031
+ fineCentroidOffset,
1032
+ weight0,
1033
+ existingValue);
1034
+
1035
+ existingValue = elementaryBlock4x1bAccum(
1036
+ pqFineCentroids1 +
1037
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1038
+ FINE_SIZE +
1039
+ fineCentroidOffset,
1040
+ weight1,
1041
+ existingValue);
1042
+
1043
+ vst1q_f32(outputAccum + CPOS, existingValue);
1044
+
1045
+ // next
1046
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1047
+ pqFineCentroids0,
1048
+ code0,
1049
+ weight0,
1050
+ pqFineCentroids1,
1051
+ code1,
1052
+ weight1,
1053
+ outputAccum);
1054
+ }
1055
+
1056
+ // Process 2 samples.
1057
+ // Fine pq centroids table is shared among codes.
1058
+ static void accum(
1059
+ const float* const __restrict pqFineCentroids,
1060
+ const uint8_t* const __restrict code0,
1061
+ const float weight0,
1062
+ const uint8_t* const __restrict code1,
1063
+ const float weight1,
1064
+ float* const __restrict outputAccum) {
1065
+ // fine quantizer
1066
+ const uint8_t* const __restrict fine0 = code0;
1067
+ const uint8_t* const __restrict fine1 = code1;
1068
+
1069
+ // process chunks, 4 float
1070
+
1071
+ const intptr_t fineCode0 =
1072
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1073
+ get(fine0);
1074
+ const intptr_t fineCode1 =
1075
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1076
+ get(fine1);
1077
+
1078
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1079
+
1080
+ existingValue = elementaryBlock4x1bAccum(
1081
+ pqFineCentroids +
1082
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1083
+ FINE_SIZE +
1084
+ fineCentroidOffset,
1085
+ weight0,
1086
+ existingValue);
1087
+
1088
+ existingValue = elementaryBlock4x1bAccum(
1089
+ pqFineCentroids +
1090
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1091
+ FINE_SIZE +
1092
+ fineCentroidOffset,
1093
+ weight1,
1094
+ existingValue);
1095
+
1096
+ vst1q_f32(outputAccum + CPOS, existingValue);
1097
+
1098
+ // next
1099
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1100
+ pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
1101
+ }
1102
+
1103
+ // Process 3 samples.
1104
+ // Each code uses its own fine pq centroids table.
1105
+ static void accum(
1106
+ const float* const __restrict pqFineCentroids0,
1107
+ const uint8_t* const __restrict code0,
1108
+ const float weight0,
1109
+ const float* const __restrict pqFineCentroids1,
1110
+ const uint8_t* const __restrict code1,
1111
+ const float weight1,
1112
+ const float* const __restrict pqFineCentroids2,
1113
+ const uint8_t* const __restrict code2,
1114
+ const float weight2,
1115
+ float* const __restrict outputAccum) {
1116
+ // fine quantizer
1117
+ const uint8_t* const __restrict fine0 = code0;
1118
+ const uint8_t* const __restrict fine1 = code1;
1119
+ const uint8_t* const __restrict fine2 = code2;
1120
+
1121
+ // process chunks, 4 float
1122
+
1123
+ const intptr_t fineCode0 =
1124
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1125
+ get(fine0);
1126
+ const intptr_t fineCode1 =
1127
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1128
+ get(fine1);
1129
+ const intptr_t fineCode2 =
1130
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1131
+ get(fine2);
1132
+
1133
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1134
+
1135
+ existingValue = elementaryBlock4x1bAccum(
1136
+ pqFineCentroids0 +
1137
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1138
+ FINE_SIZE +
1139
+ fineCentroidOffset,
1140
+ weight0,
1141
+ existingValue);
1142
+
1143
+ existingValue = elementaryBlock4x1bAccum(
1144
+ pqFineCentroids1 +
1145
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1146
+ FINE_SIZE +
1147
+ fineCentroidOffset,
1148
+ weight1,
1149
+ existingValue);
1150
+
1151
+ existingValue = elementaryBlock4x1bAccum(
1152
+ pqFineCentroids2 +
1153
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1154
+ FINE_SIZE +
1155
+ fineCentroidOffset,
1156
+ weight2,
1157
+ existingValue);
1158
+
1159
+ vst1q_f32(outputAccum + CPOS, existingValue);
1160
+
1161
+ // next
1162
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1163
+ pqFineCentroids0,
1164
+ code0,
1165
+ weight0,
1166
+ pqFineCentroids1,
1167
+ code1,
1168
+ weight1,
1169
+ pqFineCentroids2,
1170
+ code2,
1171
+ weight2,
1172
+ outputAccum);
1173
+ }
1174
+
1175
+ // Process 3 samples.
1176
+ // Fine pq centroids table is shared among codes.
1177
+ static void accum(
1178
+ const float* const __restrict pqFineCentroids,
1179
+ const uint8_t* const __restrict code0,
1180
+ const float weight0,
1181
+ const uint8_t* const __restrict code1,
1182
+ const float weight1,
1183
+ const uint8_t* const __restrict code2,
1184
+ const float weight2,
1185
+ float* const __restrict outputAccum) {
1186
+ // fine quantizer
1187
+ const uint8_t* const __restrict fine0 = code0;
1188
+ const uint8_t* const __restrict fine1 = code1;
1189
+ const uint8_t* const __restrict fine2 = code2;
1190
+
1191
+ // process chunks, 4 float
1192
+
1193
+ const intptr_t fineCode0 =
1194
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1195
+ get(fine0);
1196
+ const intptr_t fineCode1 =
1197
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1198
+ get(fine1);
1199
+ const intptr_t fineCode2 =
1200
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
1201
+ get(fine2);
1202
+
1203
+ auto existingValue = vld1q_f32(outputAccum + CPOS);
1204
+
1205
+ existingValue = elementaryBlock4x1bAccum(
1206
+ pqFineCentroids +
1207
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
1208
+ FINE_SIZE +
1209
+ fineCentroidOffset,
1210
+ weight0,
1211
+ existingValue);
1212
+
1213
+ existingValue = elementaryBlock4x1bAccum(
1214
+ pqFineCentroids +
1215
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
1216
+ FINE_SIZE +
1217
+ fineCentroidOffset,
1218
+ weight1,
1219
+ existingValue);
1220
+
1221
+ existingValue = elementaryBlock4x1bAccum(
1222
+ pqFineCentroids +
1223
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
1224
+ FINE_SIZE +
1225
+ fineCentroidOffset,
1226
+ weight2,
1227
+ existingValue);
1228
+
1229
+ vst1q_f32(outputAccum + CPOS, existingValue);
1230
+
1231
+ // next
1232
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
1233
+ pqFineCentroids,
1234
+ code0,
1235
+ weight0,
1236
+ code1,
1237
+ weight1,
1238
+ code2,
1239
+ weight2,
1240
+ outputAccum);
1241
+ }
1242
+ };
1243
+
1244
+ // This partial specialization is expected to do nothing.
1245
+ template <
1246
+ intptr_t DIM,
1247
+ intptr_t FINE_SIZE,
1248
+ intptr_t FINE_BITS,
1249
+ bool FINE_SIZE_EQ_4,
1250
+ bool QPOS_LEFT_GE_8,
1251
+ bool QPOS_LEFT_GE_4>
1252
+ struct IndexPQDecoderImpl<
1253
+ DIM,
1254
+ FINE_SIZE,
1255
+ FINE_BITS,
1256
+ DIM,
1257
+ FINE_SIZE_EQ_4,
1258
+ QPOS_LEFT_GE_8,
1259
+ QPOS_LEFT_GE_4,
1260
+ true> {
1261
+ // process 1 sample
1262
+ static void store(
1263
+ const float* const __restrict pqFineCentroids0,
1264
+ const uint8_t* const __restrict code0,
1265
+ float* const __restrict outputStore) {}
1266
+
1267
+ // process 1 sample
1268
+ static void accum(
1269
+ const float* const __restrict pqFineCentroids0,
1270
+ const uint8_t* const __restrict code0,
1271
+ const float weight0,
1272
+ float* const __restrict outputAccum) {}
1273
+
1274
+ // Process 2 samples.
1275
+ // Each code uses its own fine pq centroids table.
1276
+ static void accum(
1277
+ const float* const __restrict pqFineCentroids0,
1278
+ const uint8_t* const __restrict code0,
1279
+ const float weight0,
1280
+ const float* const __restrict pqFineCentroids1,
1281
+ const uint8_t* const __restrict code1,
1282
+ const float weight1,
1283
+ float* const __restrict outputAccum) {}
1284
+
1285
+ // Process 2 samples.
1286
+ // Fine pq centroids table is shared among codes.
1287
+ static void accum(
1288
+ const float* const __restrict pqFineCentroids,
1289
+ const uint8_t* const __restrict code0,
1290
+ const float weight0,
1291
+ const uint8_t* const __restrict code1,
1292
+ const float weight1,
1293
+ float* const __restrict outputAccum) {}
1294
+
1295
+ // Process 3 samples.
1296
+ // Each code uses its own fine pq centroids table.
1297
+ static void accum(
1298
+ const float* const __restrict pqFineCentroids0,
1299
+ const uint8_t* const __restrict code0,
1300
+ const float weight0,
1301
+ const float* const __restrict pqFineCentroids1,
1302
+ const uint8_t* const __restrict code1,
1303
+ const float weight1,
1304
+ const float* const __restrict pqFineCentroids2,
1305
+ const uint8_t* const __restrict code2,
1306
+ const float weight2,
1307
+ float* const __restrict outputAccum) {}
1308
+
1309
+ // Process 3 samples.
1310
+ // Fine pq centroids table is shared among codes.
1311
+ static void accum(
1312
+ const float* const __restrict pqFineCentroids,
1313
+ const uint8_t* const __restrict code0,
1314
+ const float weight0,
1315
+ const uint8_t* const __restrict code1,
1316
+ const float weight1,
1317
+ const uint8_t* const __restrict code2,
1318
+ const float weight2,
1319
+ float* const __restrict outputAccum) {}
1320
+ };
1321
+ } // namespace
1322
+
1323
+ // Suitable for PQ[1]x8
1324
+ // Suitable for PQ[1]x10
1325
+ // Suitable for PQ[1]x16
1326
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS = 8>
1327
+ struct IndexPQDecoder {
1328
+ static_assert(
1329
+ FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16,
1330
+ "Only 8, 10 or 16 bits are currently supported for FINE_BITS");
1331
+
1332
+ static constexpr intptr_t dim = DIM;
1333
+ static constexpr intptr_t fineSize = FINE_SIZE;
1334
+ static constexpr intptr_t fineBits = FINE_BITS;
1335
+
1336
+ // Process 1 sample.
1337
+ static void store(
1338
+ const float* const __restrict pqFineCentroids,
1339
+ const uint8_t* const __restrict code,
1340
+ float* const __restrict outputStore) {
1341
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::store(
1342
+ pqFineCentroids, code, outputStore);
1343
+ }
1344
+
1345
+ // Process 1 sample.
1346
+ // Performs outputAccum += weight * decoded(code)
1347
+ static void accum(
1348
+ const float* const __restrict pqFineCentroids,
1349
+ const uint8_t* const __restrict code,
1350
+ const float weight,
1351
+ float* const __restrict outputAccum) {
1352
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1353
+ pqFineCentroids, code, weight, outputAccum);
1354
+ }
1355
+
1356
+ // Process 2 samples.
1357
+ // Each code uses its own fine pq centroids table.
1358
+ //
1359
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1360
+ // decoded(code1)
1361
+ static void accum(
1362
+ const float* const __restrict pqFineCentroids0,
1363
+ const uint8_t* const __restrict code0,
1364
+ const float weight0,
1365
+ const float* const __restrict pqFineCentroids1,
1366
+ const uint8_t* const __restrict code1,
1367
+ const float weight1,
1368
+ float* const __restrict outputAccum) {
1369
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1370
+ pqFineCentroids0,
1371
+ code0,
1372
+ weight0,
1373
+ pqFineCentroids1,
1374
+ code1,
1375
+ weight1,
1376
+ outputAccum);
1377
+ }
1378
+
1379
+ // Process 2 samples.
1380
+ // Fine pq centroids table is shared among codes.
1381
+ //
1382
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1383
+ // decoded(code1)
1384
+ static void accum(
1385
+ const float* const __restrict pqFineCentroids,
1386
+ const uint8_t* const __restrict code0,
1387
+ const float weight0,
1388
+ const uint8_t* const __restrict code1,
1389
+ const float weight1,
1390
+ float* const __restrict outputAccum) {
1391
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1392
+ pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
1393
+ }
1394
+
1395
+ // Process 3 samples.
1396
+ // Each code uses its own fine pq centroids table.
1397
+ //
1398
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1399
+ // decoded(code1) + weight2 * decoded(code2)
1400
+ static void accum(
1401
+ const float* const __restrict pqFineCentroids0,
1402
+ const uint8_t* const __restrict code0,
1403
+ const float weight0,
1404
+ const float* const __restrict pqFineCentroids1,
1405
+ const uint8_t* const __restrict code1,
1406
+ const float weight1,
1407
+ const float* const __restrict pqFineCentroids2,
1408
+ const uint8_t* const __restrict code2,
1409
+ const float weight2,
1410
+ float* const __restrict outputAccum) {
1411
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1412
+ pqFineCentroids0,
1413
+ code0,
1414
+ weight0,
1415
+ pqFineCentroids1,
1416
+ code1,
1417
+ weight1,
1418
+ pqFineCentroids2,
1419
+ code2,
1420
+ weight2,
1421
+ outputAccum);
1422
+ }
1423
+
1424
+ // Process 3 samples.
1425
+ // Fine pq centroids table is shared among codes.
1426
+ //
1427
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1428
+ // decoded(code1) + weight2 * decoded(code2)
1429
+ static void accum(
1430
+ const float* const __restrict pqFineCentroids,
1431
+ const uint8_t* const __restrict code0,
1432
+ const float weight0,
1433
+ const uint8_t* const __restrict code1,
1434
+ const float weight1,
1435
+ const uint8_t* const __restrict code2,
1436
+ const float weight2,
1437
+ float* const __restrict outputAccum) {
1438
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1439
+ pqFineCentroids,
1440
+ code0,
1441
+ weight0,
1442
+ code1,
1443
+ weight1,
1444
+ code2,
1445
+ weight2,
1446
+ outputAccum);
1447
+ }
1448
+ };
1449
+
1450
+ } // namespace cppcontrib
1451
+ } // namespace faiss
1452
+ #endif // PQ_NEON_INL_H
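
The decoder above is header-only and driven entirely by its template parameters, so it can be exercised without building a faiss index. Below is a minimal usage sketch (not part of the diff): it assumes a 64-dimensional vector split into 16 subquantizers of 4 floats each with the default 8-bit codes, a dummy all-zero centroid table laid out subquantizer-major as [(subquantizer * 256 + code) * 4 + offset] (the indexing used by IndexPQDecoderImpl), and that the umbrella header faiss/cppcontrib/SaDecodeKernels.h added in this release pulls in the matching platform implementation.

// Minimal sketch: reconstructing and accumulating PQ codes with
// faiss::cppcontrib::IndexPQDecoder. The dimensions, codes, and the all-zero
// centroid table below are illustrative assumptions, not values from faiss.
#include <cstdint>
#include <vector>

#include <faiss/cppcontrib/SaDecodeKernels.h>

int main() {
    constexpr intptr_t DIM = 64;             // vector dimensionality
    constexpr intptr_t FINE_SIZE = 4;        // floats per subquantizer
    constexpr intptr_t M = DIM / FINE_SIZE;  // 16 subquantizers
    constexpr intptr_t KSUB = 256;           // centroids per subquantizer (FINE_BITS = 8)

    using Decoder = faiss::cppcontrib::IndexPQDecoder<DIM, FINE_SIZE>;

    // Fine centroid table, subquantizer-major: M * KSUB * FINE_SIZE floats.
    std::vector<float> centroids(M * KSUB * FINE_SIZE, 0.0f);

    // Two encoded vectors, one uint8_t code per subquantizer.
    std::vector<uint8_t> code0(M, 0);
    std::vector<uint8_t> code1(M, 1);

    // store(): reconstruct code0 into a DIM-dimensional float vector.
    std::vector<float> decoded(DIM);
    Decoder::store(centroids.data(), code0.data(), decoded.data());

    // accum() with a shared centroid table:
    // accum += 0.5 * decoded(code0) + 0.5 * decoded(code1).
    std::vector<float> accum(DIM, 0.0f);
    Decoder::accum(
            centroids.data(),
            code0.data(), 0.5f,
            code1.data(), 0.5f,
            accum.data());

    return 0;
}

The overloads that take a separate fine-centroid table per code follow the same pattern and are intended for mixing codes produced by different quantizers.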