faiss 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +23 -21
  4. data/ext/faiss/extconf.rb +11 -0
  5. data/ext/faiss/index.cpp +4 -4
  6. data/ext/faiss/index_binary.cpp +6 -6
  7. data/ext/faiss/product_quantizer.cpp +4 -4
  8. data/lib/faiss/version.rb +1 -1
  9. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  10. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  11. data/vendor/faiss/faiss/IVFlib.h +26 -2
  12. data/vendor/faiss/faiss/Index.cpp +36 -3
  13. data/vendor/faiss/faiss/Index.h +43 -6
  14. data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
  15. data/vendor/faiss/faiss/Index2Layer.h +6 -1
  16. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
  17. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  20. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  22. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  23. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  24. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  31. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  32. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  33. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  34. data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
  35. data/vendor/faiss/faiss/IndexFlat.h +7 -4
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  39. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  40. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  41. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  42. data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
  43. data/vendor/faiss/faiss/IndexIVF.h +35 -16
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
  45. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  48. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  49. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  50. data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
  51. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
  53. data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  55. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  56. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  57. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  58. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
  59. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  60. data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
  61. data/vendor/faiss/faiss/IndexLSH.h +2 -1
  62. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  63. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  66. data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
  67. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  68. data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
  69. data/vendor/faiss/faiss/IndexPQ.h +19 -5
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  73. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  74. data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
  75. data/vendor/faiss/faiss/IndexRefine.h +4 -2
  76. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  77. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  78. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  79. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  80. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
  81. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
  82. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  83. data/vendor/faiss/faiss/IndexShards.h +2 -1
  84. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  85. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  86. data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
  87. data/vendor/faiss/faiss/VectorTransform.h +22 -4
  88. data/vendor/faiss/faiss/clone_index.cpp +23 -1
  89. data/vendor/faiss/faiss/clone_index.h +3 -0
  90. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  91. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  92. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  93. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  94. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  95. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  101. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  102. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
  103. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  104. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  105. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  106. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  107. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  108. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  109. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  110. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  111. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  113. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  118. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  119. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  124. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  125. data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
  126. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  127. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  128. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
  131. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
  134. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  138. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  144. data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
  145. data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
  146. data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
  147. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  148. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  149. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  150. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  151. data/vendor/faiss/faiss/index_factory.cpp +196 -7
  152. data/vendor/faiss/faiss/index_io.h +5 -0
  153. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  154. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  155. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  156. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  157. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  158. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  159. data/vendor/faiss/faiss/utils/distances.cpp +380 -56
  160. data/vendor/faiss/faiss/utils/distances.h +113 -15
  161. data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
  162. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  163. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  164. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  165. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  166. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  167. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  168. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  169. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  170. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  171. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  172. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  173. data/vendor/faiss/faiss/utils/random.h +5 -0
  174. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  175. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  176. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  177. metadata +37 -3
@@ -0,0 +1,1452 @@
1
+ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2
+ #ifndef PQ_NEON_INL_H
3
+ #define PQ_NEON_INL_H
4
+
5
+ #include <arm_neon.h>
6
+
7
+ #include <cstddef>
8
+ #include <cstdint>
9
+
10
+ #include <faiss/cppcontrib/detail/UintReader.h>
11
+
12
+ namespace faiss {
13
+ namespace cppcontrib {
14
+
15
+ ////////////////////////////////////////////////////////////////////////////////////
16
+ /// IndexPQDecoder
17
+ ////////////////////////////////////////////////////////////////////////////////////
18
+
19
+ namespace {
20
+
21
+ // Despite the following functions are somewhat redundant, I'd like to keep the
22
+ // overall basic blocks similar to ones from Index2LevelDecoder.
23
+ // A compiler will optimize away the redundant code.
24
+
25
+ // Processes 4 float values.
26
+ // Returns {
27
+ // [0..3] = *fine[0..3];
28
+ // }
29
+ inline float32x4_t elementaryBlock4x1b(const float* const __restrict fine) {
30
+ // load fine
31
+ const auto fineValue = vld1q_f32(fine);
32
+ return fineValue;
33
+ }
34
+
35
+ // Processes 4 float values.
36
+ // Returns {
37
+ // [0..3] = existingValue[0..3] + weight * (*fine[0..3]);
38
+ // }
39
+ inline float32x4_t elementaryBlock4x1bAccum(
40
+ const float* const __restrict fine,
41
+ const float weight,
42
+ const float32x4_t existingValue) {
43
+ const auto fineValue = elementaryBlock4x1b(fine);
44
+
45
+ // this operation is expected to be optimized by a compiler
46
+ const auto weightNeon = vdupq_n_f32(weight);
47
+ // do fma
48
+ return vfmaq_f32(existingValue, weightNeon, fineValue);
49
+ }
50
+
51
+ // Processes 8 float values.
52
+ // Returns {
53
+ // [0..3] = *fine0[0..3];
54
+ // [4..7] = *fine1[0..3];
55
+ // }
56
+ inline float32x4x2_t elementaryBlock4x2b(
57
+ const float* const __restrict fine0,
58
+ const float* const __restrict fine1) {
59
+ // load fine
60
+ const auto fineValue0 = vld1q_f32(fine0);
61
+ const auto fineValue1 = vld1q_f32(fine1);
62
+
63
+ return {fineValue0, fineValue1};
64
+ }
65
+
66
+ // Processes 8 float values.
67
+ // Returns {
68
+ // [0..3] = existingValue[0..3] + weight * (*fine0[0..3]);
69
+ // [4..7] = existingValue[4..7] + weight * (*fine1[0..3]);
70
+ // }
71
+ inline float32x4x2_t elementaryBlock4x2bAccum(
72
+ const float* const __restrict fine0,
73
+ const float* const __restrict fine1,
74
+ const float weight,
75
+ const float32x4x2_t existingValue) {
76
+ const auto fineValue = elementaryBlock4x2b(fine0, fine1);
77
+
78
+ // this operation is expected to be optimized by a compiler
79
+ const auto weightNeon = vdupq_n_f32(weight);
80
+ // do fma
81
+ const auto result0 =
82
+ vfmaq_f32(existingValue.val[0], weightNeon, fineValue.val[0]);
83
+ const auto result1 =
84
+ vfmaq_f32(existingValue.val[1], weightNeon, fineValue.val[1]);
85
+ return {result0, result1};
86
+ }
87
+
88
+ // Processes 8 float values.
89
+ // Returns {
90
+ // [0..7] = *fine[0..7];
91
+ // }
92
+ inline float32x4x2_t elementaryBlock8x1b(const float* const __restrict fine) {
93
+ // load fine
94
+ const auto fineValue0 = vld1q_f32(fine);
95
+ const auto fineValue1 = vld1q_f32(fine + 4);
96
+ return {fineValue0, fineValue1};
97
+ }
98
+
99
+ // Processes 8 float values.
100
+ // Returns {
101
+ // [0..7] = existingValue[0..7] + weight * (*fine[0..7]);
102
+ // }
103
+ inline float32x4x2_t elementaryBlock8x1bAccum(
104
+ const float* const __restrict fine,
105
+ const float weight,
106
+ const float32x4x2_t existingValue) {
107
+ const auto fineValue = elementaryBlock8x1b(fine);
108
+
109
+ // this operation is expected to be optimized by a compiler
110
+ const auto weightNeon = vdupq_n_f32(weight);
111
+ // do fma
112
+ const auto result0 =
113
+ vfmaq_f32(existingValue.val[0], weightNeon, fineValue.val[0]);
114
+ const auto result1 =
115
+ vfmaq_f32(existingValue.val[1], weightNeon, fineValue.val[1]);
116
+ return {result0, result1};
117
+ }
118
+
119
+ // The following code uses template-based for-loop unrolling,
120
+ // because the compiler does not do that on its own as needed.
121
+ // The idea is the following:
122
+ // template<int I, int MAX>
123
+ // struct Foo {
124
+ // static void bar() {
125
+ // doSomething(I);
126
+ // Foo<I + 1, MAX>::bar();
127
+ // }
128
+ // };
129
+ //
130
+ // template<int MAX>
131
+ // struct Foo<MAX, MAX> {
132
+ // static void bar() {}
133
+ // };
134
+ //
135
+ // Initiate the loop:
136
+ // Foo<0, MAX>::bar();
137
+
138
+ template <
139
+ intptr_t DIM,
140
+ intptr_t FINE_SIZE,
141
+ intptr_t FINE_BITS,
142
+ intptr_t CPOS,
143
+ bool FINE_SIZE_EQ_4 = FINE_SIZE == 4,
144
+ bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8),
145
+ bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4),
146
+ bool DIM_EQ_CPOS = DIM == CPOS>
147
+ struct IndexPQDecoderImpl;
148
+
149
+ template <
150
+ intptr_t DIM,
151
+ intptr_t CPOS,
152
+ intptr_t FINE_BITS,
153
+ bool QPOS_LEFT_GE_8,
154
+ bool QPOS_LEFT_GE_4>
155
+ struct IndexPQDecoderImpl<
156
+ DIM,
157
+ 4,
158
+ FINE_BITS,
159
+ CPOS,
160
+ true,
161
+ QPOS_LEFT_GE_8,
162
+ QPOS_LEFT_GE_4,
163
+ false> {
164
+ static constexpr intptr_t FINE_SIZE = 4;
165
+
166
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
167
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
168
+
169
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
170
+
171
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
172
+
173
+ // process 1 sample
174
+ static void store(
175
+ const float* const __restrict pqFineCentroids0,
176
+ const uint8_t* const __restrict code0,
177
+ float* const __restrict outputStore) {
178
+ // fine quantizer
179
+ const uint8_t* const __restrict fine0 = code0;
180
+
181
+ // process chunks, 4 float
182
+ // but 8 floats per loop
183
+
184
+ const intptr_t fineCode0a = detail::
185
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
186
+ fine0);
187
+ const intptr_t fineCode0b = detail::
188
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
189
+ fine0);
190
+
191
+ const auto storeValue = elementaryBlock4x2b(
192
+ pqFineCentroids0 +
193
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
194
+ fineCode0a) *
195
+ FINE_SIZE +
196
+ fineCentroidOffset,
197
+ pqFineCentroids0 +
198
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
199
+ fineCode0b) *
200
+ FINE_SIZE +
201
+ fineCentroidOffset);
202
+
203
+ vst1q_f32(outputStore + CPOS, storeValue.val[0]);
204
+ vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]);
205
+
206
+ // next
207
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
208
+ pqFineCentroids0, code0, outputStore);
209
+ }
210
+
211
+ // process 1 sample
212
+ static void accum(
213
+ const float* const __restrict pqFineCentroids0,
214
+ const uint8_t* const __restrict code0,
215
+ const float weight0,
216
+ float* const __restrict outputAccum) {
217
+ // fine quantizer
218
+ const uint8_t* const __restrict fine0 = code0;
219
+
220
+ // process chunks, 4 float
221
+ // but 8 floats per loop
222
+
223
+ const intptr_t fineCode0a = detail::
224
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
225
+ fine0);
226
+ const intptr_t fineCode0b = detail::
227
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
228
+ fine0);
229
+
230
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
231
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
232
+
233
+ auto existingValue = elementaryBlock4x2bAccum(
234
+ pqFineCentroids0 +
235
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
236
+ fineCode0a) *
237
+ FINE_SIZE +
238
+ fineCentroidOffset,
239
+ pqFineCentroids0 +
240
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
241
+ fineCode0b) *
242
+ FINE_SIZE +
243
+ fineCentroidOffset,
244
+ weight0,
245
+ {existingValue0, existingValue1});
246
+
247
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
248
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
249
+
250
+ // next
251
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
252
+ pqFineCentroids0, code0, weight0, outputAccum);
253
+ }
254
+
255
+ // Process 2 samples.
256
+ // Each code uses its own fine pq centroids table.
257
+ static void accum(
258
+ const float* const __restrict pqFineCentroids0,
259
+ const uint8_t* const __restrict code0,
260
+ const float weight0,
261
+ const float* const __restrict pqFineCentroids1,
262
+ const uint8_t* const __restrict code1,
263
+ const float weight1,
264
+ float* const __restrict outputAccum) {
265
+ // fine quantizer
266
+ const uint8_t* const __restrict fine0 = code0;
267
+ const uint8_t* const __restrict fine1 = code1;
268
+
269
+ // process chunks, 4 float
270
+ // but 8 floats per loop
271
+
272
+ const intptr_t fineCode0a = detail::
273
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
274
+ fine0);
275
+ const intptr_t fineCode0b = detail::
276
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
277
+ fine0);
278
+ const intptr_t fineCode1a = detail::
279
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
280
+ fine1);
281
+ const intptr_t fineCode1b = detail::
282
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
283
+ fine1);
284
+
285
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
286
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
287
+
288
+ auto existingValue = elementaryBlock4x2bAccum(
289
+ pqFineCentroids0 +
290
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
291
+ fineCode0a) *
292
+ FINE_SIZE +
293
+ fineCentroidOffset,
294
+ pqFineCentroids0 +
295
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
296
+ fineCode0b) *
297
+ FINE_SIZE +
298
+ fineCentroidOffset,
299
+ weight0,
300
+ {existingValue0, existingValue1});
301
+
302
+ existingValue = elementaryBlock4x2bAccum(
303
+ pqFineCentroids1 +
304
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
305
+ fineCode1a) *
306
+ FINE_SIZE +
307
+ fineCentroidOffset,
308
+ pqFineCentroids1 +
309
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
310
+ fineCode1b) *
311
+ FINE_SIZE +
312
+ fineCentroidOffset,
313
+ weight1,
314
+ existingValue);
315
+
316
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
317
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
318
+
319
+ // next
320
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
321
+ pqFineCentroids0,
322
+ code0,
323
+ weight0,
324
+ pqFineCentroids1,
325
+ code1,
326
+ weight1,
327
+ outputAccum);
328
+ }
329
+
330
+ // Process 2 samples.
331
+ // Fine pq centroids table is shared among codes.
332
+ static void accum(
333
+ const float* const __restrict pqFineCentroids,
334
+ const uint8_t* const __restrict code0,
335
+ const float weight0,
336
+ const uint8_t* const __restrict code1,
337
+ const float weight1,
338
+ float* const __restrict outputAccum) {
339
+ // fine quantizer
340
+ const uint8_t* const __restrict fine0 = code0;
341
+ const uint8_t* const __restrict fine1 = code1;
342
+
343
+ // process chunks, 4 float
344
+ // but 8 floats per loop
345
+
346
+ const intptr_t fineCode0a = detail::
347
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
348
+ fine0);
349
+ const intptr_t fineCode0b = detail::
350
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
351
+ fine0);
352
+ const intptr_t fineCode1a = detail::
353
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
354
+ fine1);
355
+ const intptr_t fineCode1b = detail::
356
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
357
+ fine1);
358
+
359
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
360
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
361
+
362
+ auto existingValue = elementaryBlock4x2bAccum(
363
+ pqFineCentroids +
364
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
365
+ fineCode0a) *
366
+ FINE_SIZE +
367
+ fineCentroidOffset,
368
+ pqFineCentroids +
369
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
370
+ fineCode0b) *
371
+ FINE_SIZE +
372
+ fineCentroidOffset,
373
+ weight0,
374
+ {existingValue0, existingValue1});
375
+
376
+ existingValue = elementaryBlock4x2bAccum(
377
+ pqFineCentroids +
378
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
379
+ fineCode1a) *
380
+ FINE_SIZE +
381
+ fineCentroidOffset,
382
+ pqFineCentroids +
383
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
384
+ fineCode1b) *
385
+ FINE_SIZE +
386
+ fineCentroidOffset,
387
+ weight1,
388
+ existingValue);
389
+
390
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
391
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
392
+
393
+ // next
394
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
395
+ pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
396
+ }
397
+
398
+ // Process 3 samples.
399
+ // Each code uses its own fine pq centroids table.
400
+ static void accum(
401
+ const float* const __restrict pqFineCentroids0,
402
+ const uint8_t* const __restrict code0,
403
+ const float weight0,
404
+ const float* const __restrict pqFineCentroids1,
405
+ const uint8_t* const __restrict code1,
406
+ const float weight1,
407
+ const float* const __restrict pqFineCentroids2,
408
+ const uint8_t* const __restrict code2,
409
+ const float weight2,
410
+ float* const __restrict outputAccum) {
411
+ // fine quantizer
412
+ const uint8_t* const __restrict fine0 = code0;
413
+ const uint8_t* const __restrict fine1 = code1;
414
+ const uint8_t* const __restrict fine2 = code2;
415
+
416
+ // process chunks, 4 float
417
+ // but 8 floats per loop
418
+
419
+ const intptr_t fineCode0a = detail::
420
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
421
+ fine0);
422
+ const intptr_t fineCode0b = detail::
423
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
424
+ fine0);
425
+ const intptr_t fineCode1a = detail::
426
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
427
+ fine1);
428
+ const intptr_t fineCode1b = detail::
429
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
430
+ fine1);
431
+ const intptr_t fineCode2a = detail::
432
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
433
+ fine2);
434
+ const intptr_t fineCode2b = detail::
435
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
436
+ fine2);
437
+
438
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
439
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
440
+
441
+ auto existingValue = elementaryBlock4x2bAccum(
442
+ pqFineCentroids0 +
443
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
444
+ fineCode0a) *
445
+ FINE_SIZE +
446
+ fineCentroidOffset,
447
+ pqFineCentroids0 +
448
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
449
+ fineCode0b) *
450
+ FINE_SIZE +
451
+ fineCentroidOffset,
452
+ weight0,
453
+ {existingValue0, existingValue1});
454
+
455
+ existingValue = elementaryBlock4x2bAccum(
456
+ pqFineCentroids1 +
457
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
458
+ fineCode1a) *
459
+ FINE_SIZE +
460
+ fineCentroidOffset,
461
+ pqFineCentroids1 +
462
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
463
+ fineCode1b) *
464
+ FINE_SIZE +
465
+ fineCentroidOffset,
466
+ weight1,
467
+ existingValue);
468
+
469
+ existingValue = elementaryBlock4x2bAccum(
470
+ pqFineCentroids2 +
471
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
472
+ fineCode2a) *
473
+ FINE_SIZE +
474
+ fineCentroidOffset,
475
+ pqFineCentroids2 +
476
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
477
+ fineCode2b) *
478
+ FINE_SIZE +
479
+ fineCentroidOffset,
480
+ weight2,
481
+ existingValue);
482
+
483
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
484
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
485
+
486
+ // next
487
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
488
+ pqFineCentroids0,
489
+ code0,
490
+ weight0,
491
+ pqFineCentroids1,
492
+ code1,
493
+ weight1,
494
+ pqFineCentroids2,
495
+ code2,
496
+ weight2,
497
+ outputAccum);
498
+ }
499
+
500
+ // Process 3 samples.
501
+ // Fine pq centroids table is shared among codes.
502
+ static void accum(
503
+ const float* const __restrict pqFineCentroids,
504
+ const uint8_t* const __restrict code0,
505
+ const float weight0,
506
+ const uint8_t* const __restrict code1,
507
+ const float weight1,
508
+ const uint8_t* const __restrict code2,
509
+ const float weight2,
510
+ float* const __restrict outputAccum) {
511
+ // fine quantizer
512
+ const uint8_t* const __restrict fine0 = code0;
513
+ const uint8_t* const __restrict fine1 = code1;
514
+ const uint8_t* const __restrict fine2 = code2;
515
+
516
+ // process chunks, 4 float
517
+ // but 8 floats per loop
518
+
519
+ const intptr_t fineCode0a = detail::
520
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
521
+ fine0);
522
+ const intptr_t fineCode0b = detail::
523
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
524
+ fine0);
525
+ const intptr_t fineCode1a = detail::
526
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
527
+ fine1);
528
+ const intptr_t fineCode1b = detail::
529
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
530
+ fine1);
531
+ const intptr_t fineCode2a = detail::
532
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
533
+ fine2);
534
+ const intptr_t fineCode2b = detail::
535
+ UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
536
+ fine2);
537
+
538
+ auto existingValue0 = vld1q_f32(outputAccum + CPOS);
539
+ auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
540
+
541
+ auto existingValue = elementaryBlock4x2bAccum(
542
+ pqFineCentroids +
543
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
544
+ fineCode0a) *
545
+ FINE_SIZE +
546
+ fineCentroidOffset,
547
+ pqFineCentroids +
548
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
549
+ fineCode0b) *
550
+ FINE_SIZE +
551
+ fineCentroidOffset,
552
+ weight0,
553
+ {existingValue0, existingValue1});
554
+
555
+ existingValue = elementaryBlock4x2bAccum(
556
+ pqFineCentroids +
557
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
558
+ fineCode1a) *
559
+ FINE_SIZE +
560
+ fineCentroidOffset,
561
+ pqFineCentroids +
562
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
563
+ fineCode1b) *
564
+ FINE_SIZE +
565
+ fineCentroidOffset,
566
+ weight1,
567
+ existingValue);
568
+
569
+ existingValue = elementaryBlock4x2bAccum(
570
+ pqFineCentroids +
571
+ ((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
572
+ fineCode2a) *
573
+ FINE_SIZE +
574
+ fineCentroidOffset,
575
+ pqFineCentroids +
576
+ ((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
577
+ fineCode2b) *
578
+ FINE_SIZE +
579
+ fineCentroidOffset,
580
+ weight2,
581
+ existingValue);
582
+
583
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
584
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
585
+
586
+ // next
587
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
588
+ pqFineCentroids,
589
+ code0,
590
+ weight0,
591
+ code1,
592
+ weight1,
593
+ code2,
594
+ weight2,
595
+ outputAccum);
596
+ }
597
+ };
598
+
599
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
600
+ struct IndexPQDecoderImpl<
601
+ DIM,
602
+ FINE_SIZE,
603
+ FINE_BITS,
604
+ CPOS,
605
+ false,
606
+ true,
607
+ true,
608
+ false> {
609
+ static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
610
+ static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;
611
+
612
+ static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;
613
+
614
+ static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);
615
+
616
+ // process 1 sample
617
+ static void store(
618
+ const float* const __restrict pqFineCentroids0,
619
+ const uint8_t* const __restrict code0,
620
+ float* const __restrict outputStore) {
621
+ // fine quantizer
622
+ const uint8_t* const __restrict fine0 = code0;
623
+
624
+ // process chunks, 8 float
625
+
626
+ const intptr_t fineCode0 =
627
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
628
+ get(fine0);
629
+
630
+ const auto storeValue = elementaryBlock8x1b(
631
+ pqFineCentroids0 +
632
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE +
633
+ fineCentroidOffset);
634
+
635
+ vst1q_f32(outputStore + CPOS, storeValue.val[0]);
636
+ vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]);
637
+
638
+ // next
639
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::store(
640
+ pqFineCentroids0, code0, outputStore);
641
+ }
642
+
643
+ // process 1 sample
644
+ static void accum(
645
+ const float* const __restrict pqFineCentroids0,
646
+ const uint8_t* const __restrict code0,
647
+ const float weight0,
648
+ float* const __restrict outputAccum) {
649
+ // fine quantizer
650
+ const uint8_t* const __restrict fine0 = code0;
651
+
652
+ // process chunks, 8 float
653
+
654
+ const intptr_t fineCode0 =
655
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
656
+ get(fine0);
657
+
658
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
659
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
660
+
661
+ const auto existingValue = elementaryBlock8x1bAccum(
662
+ pqFineCentroids0 +
663
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
664
+ FINE_SIZE +
665
+ fineCentroidOffset,
666
+ weight0,
667
+ {existingValue0, existingValue1});
668
+
669
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
670
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
671
+
672
+ // next
673
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
674
+ pqFineCentroids0, code0, weight0, outputAccum);
675
+ }
676
+
677
+ // Process 2 samples.
678
+ // Each code uses its own fine pq centroids table.
679
+ static void accum(
680
+ const float* const __restrict pqFineCentroids0,
681
+ const uint8_t* const __restrict code0,
682
+ const float weight0,
683
+ const float* const __restrict pqFineCentroids1,
684
+ const uint8_t* const __restrict code1,
685
+ const float weight1,
686
+ float* const __restrict outputAccum) {
687
+ // fine quantizer
688
+ const uint8_t* const __restrict fine0 = code0;
689
+ const uint8_t* const __restrict fine1 = code1;
690
+
691
+ // process chunks, 8 float
692
+
693
+ const intptr_t fineCode0 =
694
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
695
+ get(fine0);
696
+ const intptr_t fineCode1 =
697
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
698
+ get(fine1);
699
+
700
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
701
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
702
+
703
+ auto existingValue = elementaryBlock8x1bAccum(
704
+ pqFineCentroids0 +
705
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
706
+ FINE_SIZE +
707
+ fineCentroidOffset,
708
+ weight0,
709
+ {existingValue0, existingValue1});
710
+
711
+ existingValue = elementaryBlock8x1bAccum(
712
+ pqFineCentroids1 +
713
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
714
+ FINE_SIZE +
715
+ fineCentroidOffset,
716
+ weight1,
717
+ existingValue);
718
+
719
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
720
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
721
+
722
+ // next
723
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
724
+ pqFineCentroids0,
725
+ code0,
726
+ weight0,
727
+ pqFineCentroids1,
728
+ code1,
729
+ weight1,
730
+ outputAccum);
731
+ }
732
+
733
+ // Process 2 samples.
734
+ // Fine pq centroids table is shared among codes.
735
+ static void accum(
736
+ const float* const __restrict pqFineCentroids,
737
+ const uint8_t* const __restrict code0,
738
+ const float weight0,
739
+ const uint8_t* const __restrict code1,
740
+ const float weight1,
741
+ float* const __restrict outputAccum) {
742
+ // fine quantizer
743
+ const uint8_t* const __restrict fine0 = code0;
744
+ const uint8_t* const __restrict fine1 = code1;
745
+
746
+ // process chunks, 8 float
747
+
748
+ const intptr_t fineCode0 =
749
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
750
+ get(fine0);
751
+ const intptr_t fineCode1 =
752
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
753
+ get(fine1);
754
+
755
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
756
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
757
+
758
+ auto existingValue = elementaryBlock8x1bAccum(
759
+ pqFineCentroids +
760
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
761
+ FINE_SIZE +
762
+ fineCentroidOffset,
763
+ weight0,
764
+ {existingValue0, existingValue1});
765
+
766
+ existingValue = elementaryBlock8x1bAccum(
767
+ pqFineCentroids +
768
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
769
+ FINE_SIZE +
770
+ fineCentroidOffset,
771
+ weight1,
772
+ existingValue);
773
+
774
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
775
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
776
+
777
+ // next
778
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
779
+ pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
780
+ }
781
+
782
+ // Process 3 samples.
783
+ // Each code uses its own fine pq centroids table.
784
+ static void accum(
785
+ const float* const __restrict pqFineCentroids0,
786
+ const uint8_t* const __restrict code0,
787
+ const float weight0,
788
+ const float* const __restrict pqFineCentroids1,
789
+ const uint8_t* const __restrict code1,
790
+ const float weight1,
791
+ const float* const __restrict pqFineCentroids2,
792
+ const uint8_t* const __restrict code2,
793
+ const float weight2,
794
+ float* const __restrict outputAccum) {
795
+ // fine quantizer
796
+ const uint8_t* const __restrict fine0 = code0;
797
+ const uint8_t* const __restrict fine1 = code1;
798
+ const uint8_t* const __restrict fine2 = code2;
799
+
800
+ // process chunks, 8 float
801
+
802
+ const intptr_t fineCode0 =
803
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
804
+ get(fine0);
805
+ const intptr_t fineCode1 =
806
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
807
+ get(fine1);
808
+ const intptr_t fineCode2 =
809
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
810
+ get(fine2);
811
+
812
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
813
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
814
+
815
+ auto existingValue = elementaryBlock8x1bAccum(
816
+ pqFineCentroids0 +
817
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
818
+ FINE_SIZE +
819
+ fineCentroidOffset,
820
+ weight0,
821
+ {existingValue0, existingValue1});
822
+
823
+ existingValue = elementaryBlock8x1bAccum(
824
+ pqFineCentroids1 +
825
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
826
+ FINE_SIZE +
827
+ fineCentroidOffset,
828
+ weight1,
829
+ existingValue);
830
+
831
+ existingValue = elementaryBlock8x1bAccum(
832
+ pqFineCentroids2 +
833
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
834
+ FINE_SIZE +
835
+ fineCentroidOffset,
836
+ weight2,
837
+ existingValue);
838
+
839
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
840
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
841
+
842
+ // next
843
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
844
+ pqFineCentroids0,
845
+ code0,
846
+ weight0,
847
+ pqFineCentroids1,
848
+ code1,
849
+ weight1,
850
+ pqFineCentroids2,
851
+ code2,
852
+ weight2,
853
+ outputAccum);
854
+ }
855
+
856
+ // Process 3 samples.
857
+ // Fine pq centroids table is shared among codes.
858
+ static void accum(
859
+ const float* const __restrict pqFineCentroids,
860
+ const uint8_t* const __restrict code0,
861
+ const float weight0,
862
+ const uint8_t* const __restrict code1,
863
+ const float weight1,
864
+ const uint8_t* const __restrict code2,
865
+ const float weight2,
866
+ float* const __restrict outputAccum) {
867
+ // fine quantizer
868
+ const uint8_t* const __restrict fine0 = code0;
869
+ const uint8_t* const __restrict fine1 = code1;
870
+ const uint8_t* const __restrict fine2 = code2;
871
+
872
+ // process chunks, 8 float
873
+
874
+ const intptr_t fineCode0 =
875
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
876
+ get(fine0);
877
+ const intptr_t fineCode1 =
878
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
879
+ get(fine1);
880
+ const intptr_t fineCode2 =
881
+ detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
882
+ get(fine2);
883
+
884
+ const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
885
+ const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
886
+
887
+ auto existingValue = elementaryBlock8x1bAccum(
888
+ pqFineCentroids +
889
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
890
+ FINE_SIZE +
891
+ fineCentroidOffset,
892
+ weight0,
893
+ {existingValue0, existingValue1});
894
+
895
+ existingValue = elementaryBlock8x1bAccum(
896
+ pqFineCentroids +
897
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
898
+ FINE_SIZE +
899
+ fineCentroidOffset,
900
+ weight1,
901
+ existingValue);
902
+
903
+ existingValue = elementaryBlock8x1bAccum(
904
+ pqFineCentroids +
905
+ (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
906
+ FINE_SIZE +
907
+ fineCentroidOffset,
908
+ weight2,
909
+ existingValue);
910
+
911
+ vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
912
+ vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
913
+
914
+ // next
915
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
916
+ pqFineCentroids,
917
+ code0,
918
+ weight0,
919
+ code1,
920
+ weight1,
921
+ code2,
922
+ weight2,
923
+ outputAccum);
924
+ }
925
+ };
926
+
927
// Partial specialization that decodes 4 floats per step using a single
// float32x4_t NEON register. Selected when the trailing bool arguments are
// FINE_SIZE_EQ_4 == false, QPOS_LEFT_GE_8 == false, QPOS_LEFT_GE_4 == true
// and the done flag is false (CPOS != DIM); see the sibling "do nothing"
// specialization for the parameter names. Each call handles dimensions
// [CPOS, CPOS + 4) and tail-recurses at compile time into CPOS + 4.
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
struct IndexPQDecoderImpl<
        DIM,
        FINE_SIZE,
        FINE_BITS,
        CPOS,
        false,
        false,
        true,
        false> {
    // index of the fine PQ sub-quantizer that covers dimension CPOS
    static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE;
    // offset of dimension CPOS inside that sub-quantizer's centroid
    static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE;

    // floats remaining in the current fine centroid starting at CPOS
    static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset;

    // Entries per fine sub-quantizer table (2^FINE_BITS). NOTE(review):
    // despite the name this is used below as a count of centroids (the
    // stride between per-subquantizer tables), not a byte size.
    static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS);

    // Process 1 sample: outputStore[CPOS .. CPOS+3] = decoded chunk.
    static void store(
            const float* const __restrict pqFineCentroids0,
            const uint8_t* const __restrict code0,
            float* const __restrict outputStore) {
        // fine quantizer
        const uint8_t* const __restrict fine0 = code0;

        // process chunks, 4 float

        // extract the FINE_BITS-wide code for this sub-quantizer
        const intptr_t fineCode0 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine0);

        // gather 4 floats from the selected fine centroid
        const auto storeValue = elementaryBlock4x1b(
                pqFineCentroids0 +
                (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE +
                fineCentroidOffset);

        vst1q_f32(outputStore + CPOS, storeValue);

        // next: recurse into the following 4-float chunk
        IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::store(
                pqFineCentroids0, code0, outputStore);
    }

    // Process 1 sample: outputAccum[CPOS .. CPOS+3] += weight0 * decoded chunk.
    static void accum(
            const float* const __restrict pqFineCentroids0,
            const uint8_t* const __restrict code0,
            const float weight0,
            float* const __restrict outputAccum) {
        // fine quantizer
        const uint8_t* const __restrict fine0 = code0;

        // process chunks, 4 float

        const intptr_t fineCode0 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine0);

        // load the running sum, fuse in weight0 * centroid, store it back
        auto existingValue = vld1q_f32(outputAccum + CPOS);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids0 +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight0,
                existingValue);

        vst1q_f32(outputAccum + CPOS, existingValue);

        // next
        IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
                pqFineCentroids0, code0, weight0, outputAccum);
    }

    // Process 2 samples.
    // Each code uses its own fine pq centroids table.
    // outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1),
    // with a single load/store of the accumulator per chunk.
    static void accum(
            const float* const __restrict pqFineCentroids0,
            const uint8_t* const __restrict code0,
            const float weight0,
            const float* const __restrict pqFineCentroids1,
            const uint8_t* const __restrict code1,
            const float weight1,
            float* const __restrict outputAccum) {
        // fine quantizer
        const uint8_t* const __restrict fine0 = code0;
        const uint8_t* const __restrict fine1 = code1;

        // process chunks, 4 float

        const intptr_t fineCode0 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine0);
        const intptr_t fineCode1 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine1);

        auto existingValue = vld1q_f32(outputAccum + CPOS);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids0 +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight0,
                existingValue);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids1 +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight1,
                existingValue);

        vst1q_f32(outputAccum + CPOS, existingValue);

        // next
        IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
                pqFineCentroids0,
                code0,
                weight0,
                pqFineCentroids1,
                code1,
                weight1,
                outputAccum);
    }

    // Process 2 samples.
    // Fine pq centroids table is shared among codes.
    static void accum(
            const float* const __restrict pqFineCentroids,
            const uint8_t* const __restrict code0,
            const float weight0,
            const uint8_t* const __restrict code1,
            const float weight1,
            float* const __restrict outputAccum) {
        // fine quantizer
        const uint8_t* const __restrict fine0 = code0;
        const uint8_t* const __restrict fine1 = code1;

        // process chunks, 4 float

        const intptr_t fineCode0 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine0);
        const intptr_t fineCode1 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine1);

        auto existingValue = vld1q_f32(outputAccum + CPOS);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight0,
                existingValue);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight1,
                existingValue);

        vst1q_f32(outputAccum + CPOS, existingValue);

        // next
        IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
                pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
    }

    // Process 3 samples.
    // Each code uses its own fine pq centroids table.
    static void accum(
            const float* const __restrict pqFineCentroids0,
            const uint8_t* const __restrict code0,
            const float weight0,
            const float* const __restrict pqFineCentroids1,
            const uint8_t* const __restrict code1,
            const float weight1,
            const float* const __restrict pqFineCentroids2,
            const uint8_t* const __restrict code2,
            const float weight2,
            float* const __restrict outputAccum) {
        // fine quantizer
        const uint8_t* const __restrict fine0 = code0;
        const uint8_t* const __restrict fine1 = code1;
        const uint8_t* const __restrict fine2 = code2;

        // process chunks, 4 float

        const intptr_t fineCode0 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine0);
        const intptr_t fineCode1 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine1);
        const intptr_t fineCode2 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine2);

        auto existingValue = vld1q_f32(outputAccum + CPOS);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids0 +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight0,
                existingValue);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids1 +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight1,
                existingValue);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids2 +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight2,
                existingValue);

        vst1q_f32(outputAccum + CPOS, existingValue);

        // next
        IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
                pqFineCentroids0,
                code0,
                weight0,
                pqFineCentroids1,
                code1,
                weight1,
                pqFineCentroids2,
                code2,
                weight2,
                outputAccum);
    }

    // Process 3 samples.
    // Fine pq centroids table is shared among codes.
    static void accum(
            const float* const __restrict pqFineCentroids,
            const uint8_t* const __restrict code0,
            const float weight0,
            const uint8_t* const __restrict code1,
            const float weight1,
            const uint8_t* const __restrict code2,
            const float weight2,
            float* const __restrict outputAccum) {
        // fine quantizer
        const uint8_t* const __restrict fine0 = code0;
        const uint8_t* const __restrict fine1 = code1;
        const uint8_t* const __restrict fine2 = code2;

        // process chunks, 4 float

        const intptr_t fineCode0 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine0);
        const intptr_t fineCode1 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine1);
        const intptr_t fineCode2 =
                detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                        get(fine2);

        auto existingValue = vld1q_f32(outputAccum + CPOS);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight0,
                existingValue);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight1,
                existingValue);

        existingValue = elementaryBlock4x1bAccum(
                pqFineCentroids +
                        (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
                                FINE_SIZE +
                        fineCentroidOffset,
                weight2,
                existingValue);

        vst1q_f32(outputAccum + CPOS, existingValue);

        // next
        IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
                pqFineCentroids,
                code0,
                weight0,
                code1,
                weight1,
                code2,
                weight2,
                outputAccum);
    }
};
1243
+
1244
+ // This partial specialization is expected to do nothing.
1245
+ template <
1246
+ intptr_t DIM,
1247
+ intptr_t FINE_SIZE,
1248
+ intptr_t FINE_BITS,
1249
+ bool FINE_SIZE_EQ_4,
1250
+ bool QPOS_LEFT_GE_8,
1251
+ bool QPOS_LEFT_GE_4>
1252
+ struct IndexPQDecoderImpl<
1253
+ DIM,
1254
+ FINE_SIZE,
1255
+ FINE_BITS,
1256
+ DIM,
1257
+ FINE_SIZE_EQ_4,
1258
+ QPOS_LEFT_GE_8,
1259
+ QPOS_LEFT_GE_4,
1260
+ true> {
1261
+ // process 1 sample
1262
+ static void store(
1263
+ const float* const __restrict pqFineCentroids0,
1264
+ const uint8_t* const __restrict code0,
1265
+ float* const __restrict outputStore) {}
1266
+
1267
+ // process 1 sample
1268
+ static void accum(
1269
+ const float* const __restrict pqFineCentroids0,
1270
+ const uint8_t* const __restrict code0,
1271
+ const float weight0,
1272
+ float* const __restrict outputAccum) {}
1273
+
1274
+ // Process 2 samples.
1275
+ // Each code uses its own fine pq centroids table.
1276
+ static void accum(
1277
+ const float* const __restrict pqFineCentroids0,
1278
+ const uint8_t* const __restrict code0,
1279
+ const float weight0,
1280
+ const float* const __restrict pqFineCentroids1,
1281
+ const uint8_t* const __restrict code1,
1282
+ const float weight1,
1283
+ float* const __restrict outputAccum) {}
1284
+
1285
+ // Process 2 samples.
1286
+ // Fine pq centroids table is shared among codes.
1287
+ static void accum(
1288
+ const float* const __restrict pqFineCentroids,
1289
+ const uint8_t* const __restrict code0,
1290
+ const float weight0,
1291
+ const uint8_t* const __restrict code1,
1292
+ const float weight1,
1293
+ float* const __restrict outputAccum) {}
1294
+
1295
+ // Process 3 samples.
1296
+ // Each code uses its own fine pq centroids table.
1297
+ static void accum(
1298
+ const float* const __restrict pqFineCentroids0,
1299
+ const uint8_t* const __restrict code0,
1300
+ const float weight0,
1301
+ const float* const __restrict pqFineCentroids1,
1302
+ const uint8_t* const __restrict code1,
1303
+ const float weight1,
1304
+ const float* const __restrict pqFineCentroids2,
1305
+ const uint8_t* const __restrict code2,
1306
+ const float weight2,
1307
+ float* const __restrict outputAccum) {}
1308
+
1309
+ // Process 3 samples.
1310
+ // Fine pq centroids table is shared among codes.
1311
+ static void accum(
1312
+ const float* const __restrict pqFineCentroids,
1313
+ const uint8_t* const __restrict code0,
1314
+ const float weight0,
1315
+ const uint8_t* const __restrict code1,
1316
+ const float weight1,
1317
+ const uint8_t* const __restrict code2,
1318
+ const float weight2,
1319
+ float* const __restrict outputAccum) {}
1320
+ };
1321
+ } // namespace
1322
+
1323
+ // Suitable for PQ[1]x8
1324
+ // Suitable for PQ[1]x10
1325
+ // Suitable for PQ[1]x16
1326
+ template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS = 8>
1327
+ struct IndexPQDecoder {
1328
+ static_assert(
1329
+ FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16,
1330
+ "Only 8, 10 or 16 bits are currently supported for FINE_BITS");
1331
+
1332
+ static constexpr intptr_t dim = DIM;
1333
+ static constexpr intptr_t fineSize = FINE_SIZE;
1334
+ static constexpr intptr_t fineBits = FINE_BITS;
1335
+
1336
+ // Process 1 sample.
1337
+ static void store(
1338
+ const float* const __restrict pqFineCentroids,
1339
+ const uint8_t* const __restrict code,
1340
+ float* const __restrict outputStore) {
1341
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::store(
1342
+ pqFineCentroids, code, outputStore);
1343
+ }
1344
+
1345
+ // Process 1 sample.
1346
+ // Performs outputAccum += weight * decoded(code)
1347
+ static void accum(
1348
+ const float* const __restrict pqFineCentroids,
1349
+ const uint8_t* const __restrict code,
1350
+ const float weight,
1351
+ float* const __restrict outputAccum) {
1352
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1353
+ pqFineCentroids, code, weight, outputAccum);
1354
+ }
1355
+
1356
+ // Process 2 samples.
1357
+ // Each code uses its own fine pq centroids table.
1358
+ //
1359
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1360
+ // decoded(code1)
1361
+ static void accum(
1362
+ const float* const __restrict pqFineCentroids0,
1363
+ const uint8_t* const __restrict code0,
1364
+ const float weight0,
1365
+ const float* const __restrict pqFineCentroids1,
1366
+ const uint8_t* const __restrict code1,
1367
+ const float weight1,
1368
+ float* const __restrict outputAccum) {
1369
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1370
+ pqFineCentroids0,
1371
+ code0,
1372
+ weight0,
1373
+ pqFineCentroids1,
1374
+ code1,
1375
+ weight1,
1376
+ outputAccum);
1377
+ }
1378
+
1379
+ // Process 2 samples.
1380
+ // Fine pq centroids table is shared among codes.
1381
+ //
1382
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1383
+ // decoded(code1)
1384
+ static void accum(
1385
+ const float* const __restrict pqFineCentroids,
1386
+ const uint8_t* const __restrict code0,
1387
+ const float weight0,
1388
+ const uint8_t* const __restrict code1,
1389
+ const float weight1,
1390
+ float* const __restrict outputAccum) {
1391
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1392
+ pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
1393
+ }
1394
+
1395
+ // Process 3 samples.
1396
+ // Each code uses its own fine pq centroids table.
1397
+ //
1398
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1399
+ // decoded(code1) + weight2 * decoded(code2)
1400
+ static void accum(
1401
+ const float* const __restrict pqFineCentroids0,
1402
+ const uint8_t* const __restrict code0,
1403
+ const float weight0,
1404
+ const float* const __restrict pqFineCentroids1,
1405
+ const uint8_t* const __restrict code1,
1406
+ const float weight1,
1407
+ const float* const __restrict pqFineCentroids2,
1408
+ const uint8_t* const __restrict code2,
1409
+ const float weight2,
1410
+ float* const __restrict outputAccum) {
1411
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1412
+ pqFineCentroids0,
1413
+ code0,
1414
+ weight0,
1415
+ pqFineCentroids1,
1416
+ code1,
1417
+ weight1,
1418
+ pqFineCentroids2,
1419
+ code2,
1420
+ weight2,
1421
+ outputAccum);
1422
+ }
1423
+
1424
+ // Process 3 samples.
1425
+ // Fine pq centroids table is shared among codes.
1426
+ //
1427
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 *
1428
+ // decoded(code1) + weight2 * decoded(code2)
1429
+ static void accum(
1430
+ const float* const __restrict pqFineCentroids,
1431
+ const uint8_t* const __restrict code0,
1432
+ const float weight0,
1433
+ const uint8_t* const __restrict code1,
1434
+ const float weight1,
1435
+ const uint8_t* const __restrict code2,
1436
+ const float weight2,
1437
+ float* const __restrict outputAccum) {
1438
+ IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
1439
+ pqFineCentroids,
1440
+ code0,
1441
+ weight0,
1442
+ code1,
1443
+ weight1,
1444
+ code2,
1445
+ weight2,
1446
+ outputAccum);
1447
+ }
1448
+ };
1449
+
1450
+ } // namespace cppcontrib
1451
+ } // namespace faiss
1452
+ #endif // PQ_NEON_INL_H