faiss 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/LICENSE.txt +1 -1
  4. data/ext/faiss/extconf.rb +1 -1
  5. data/ext/faiss/index.cpp +10 -14
  6. data/ext/faiss/numo.hpp +957 -833
  7. data/lib/faiss/version.rb +1 -1
  8. data/vendor/faiss/faiss/AutoTune.cpp +2 -2
  9. data/vendor/faiss/faiss/AutoTune.h +2 -2
  10. data/vendor/faiss/faiss/Clustering.cpp +2 -2
  11. data/vendor/faiss/faiss/Clustering.h +2 -2
  12. data/vendor/faiss/faiss/IVFlib.cpp +2 -2
  13. data/vendor/faiss/faiss/IVFlib.h +2 -2
  14. data/vendor/faiss/faiss/Index.cpp +6 -2
  15. data/vendor/faiss/faiss/Index.h +10 -3
  16. data/vendor/faiss/faiss/Index2Layer.cpp +2 -2
  17. data/vendor/faiss/faiss/Index2Layer.h +2 -2
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +7 -7
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +2 -2
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +14 -16
  21. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +2 -2
  22. data/vendor/faiss/faiss/IndexBinary.cpp +13 -2
  23. data/vendor/faiss/faiss/IndexBinary.h +8 -2
  24. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +2 -3
  25. data/vendor/faiss/faiss/IndexBinaryFlat.h +2 -2
  26. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -2
  27. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -2
  28. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +2 -7
  29. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -2
  30. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -3
  31. data/vendor/faiss/faiss/IndexBinaryHash.h +2 -2
  32. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +2 -2
  33. data/vendor/faiss/faiss/IndexBinaryIVF.h +2 -2
  34. data/vendor/faiss/faiss/IndexFastScan.cpp +10 -14
  35. data/vendor/faiss/faiss/IndexFastScan.h +11 -2
  36. data/vendor/faiss/faiss/IndexFlat.cpp +2 -3
  37. data/vendor/faiss/faiss/IndexFlat.h +2 -2
  38. data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -2
  39. data/vendor/faiss/faiss/IndexFlatCodes.h +5 -2
  40. data/vendor/faiss/faiss/IndexHNSW.cpp +13 -6
  41. data/vendor/faiss/faiss/IndexHNSW.h +2 -2
  42. data/vendor/faiss/faiss/IndexIDMap.cpp +19 -3
  43. data/vendor/faiss/faiss/IndexIDMap.h +5 -2
  44. data/vendor/faiss/faiss/IndexIVF.cpp +2 -3
  45. data/vendor/faiss/faiss/IndexIVF.h +5 -4
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +6 -7
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +2 -2
  48. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +3 -14
  49. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +2 -4
  50. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +71 -34
  51. data/vendor/faiss/faiss/IndexIVFFastScan.h +19 -2
  52. data/vendor/faiss/faiss/IndexIVFFlat.cpp +2 -2
  53. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -2
  54. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +2 -2
  55. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +2 -2
  56. data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
  57. data/vendor/faiss/faiss/IndexIVFPQ.h +2 -2
  58. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +7 -33
  59. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +2 -4
  60. data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -2
  61. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -2
  62. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +2 -3
  63. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -2
  64. data/vendor/faiss/faiss/IndexLSH.cpp +2 -3
  65. data/vendor/faiss/faiss/IndexLSH.h +2 -2
  66. data/vendor/faiss/faiss/IndexLattice.cpp +2 -2
  67. data/vendor/faiss/faiss/IndexLattice.h +2 -2
  68. data/vendor/faiss/faiss/IndexNNDescent.cpp +2 -2
  69. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  70. data/vendor/faiss/faiss/IndexNSG.cpp +2 -5
  71. data/vendor/faiss/faiss/IndexNSG.h +2 -2
  72. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +2 -2
  73. data/vendor/faiss/faiss/IndexNeuralNetCodec.h +2 -2
  74. data/vendor/faiss/faiss/IndexPQ.cpp +26 -26
  75. data/vendor/faiss/faiss/IndexPQ.h +2 -2
  76. data/vendor/faiss/faiss/IndexPQFastScan.cpp +2 -5
  77. data/vendor/faiss/faiss/IndexPQFastScan.h +2 -11
  78. data/vendor/faiss/faiss/IndexPreTransform.cpp +2 -2
  79. data/vendor/faiss/faiss/IndexPreTransform.h +2 -2
  80. data/vendor/faiss/faiss/IndexRefine.cpp +41 -4
  81. data/vendor/faiss/faiss/IndexRefine.h +9 -2
  82. data/vendor/faiss/faiss/IndexReplicas.cpp +2 -2
  83. data/vendor/faiss/faiss/IndexReplicas.h +2 -2
  84. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +2 -2
  85. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +2 -2
  86. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -3
  87. data/vendor/faiss/faiss/IndexScalarQuantizer.h +2 -2
  88. data/vendor/faiss/faiss/IndexShards.cpp +2 -2
  89. data/vendor/faiss/faiss/IndexShards.h +2 -2
  90. data/vendor/faiss/faiss/IndexShardsIVF.cpp +2 -2
  91. data/vendor/faiss/faiss/IndexShardsIVF.h +2 -2
  92. data/vendor/faiss/faiss/MatrixStats.cpp +2 -2
  93. data/vendor/faiss/faiss/MatrixStats.h +2 -2
  94. data/vendor/faiss/faiss/MetaIndexes.cpp +2 -3
  95. data/vendor/faiss/faiss/MetaIndexes.h +2 -2
  96. data/vendor/faiss/faiss/MetricType.h +2 -2
  97. data/vendor/faiss/faiss/VectorTransform.cpp +2 -2
  98. data/vendor/faiss/faiss/VectorTransform.h +2 -2
  99. data/vendor/faiss/faiss/clone_index.cpp +2 -2
  100. data/vendor/faiss/faiss/clone_index.h +2 -2
  101. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +2 -2
  102. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +2 -2
  103. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +2 -2
  104. data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +44 -4
  105. data/vendor/faiss/faiss/cppcontrib/factory_tools.h +7 -2
  106. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2 -2
  107. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +2 -2
  108. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2 -2
  109. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +2 -2
  110. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +2 -2
  111. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +2 -2
  112. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +2 -2
  113. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +2 -2
  114. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +2 -5
  115. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +2 -2
  116. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +13 -13
  117. data/vendor/faiss/faiss/gpu/GpuCloner.h +2 -2
  118. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -6
  119. data/vendor/faiss/faiss/gpu/GpuDistance.h +11 -7
  120. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +2 -2
  121. data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +2 -2
  122. data/vendor/faiss/faiss/gpu/GpuIndex.h +8 -7
  123. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -2
  124. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -3
  125. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +2 -2
  126. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +2 -2
  127. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -2
  128. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +3 -3
  129. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +2 -2
  130. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +2 -2
  131. data/vendor/faiss/faiss/gpu/GpuResources.cpp +7 -2
  132. data/vendor/faiss/faiss/gpu/GpuResources.h +11 -4
  133. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +51 -21
  134. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +15 -5
  135. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +2 -2
  136. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +2 -2
  137. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +2 -2
  138. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +2 -2
  139. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +2 -2
  140. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +2 -2
  141. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +2 -2
  142. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +2 -2
  143. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +2 -3
  144. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +2 -2
  145. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +2 -2
  146. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +2 -2
  147. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +54 -54
  148. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +80 -78
  149. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +51 -51
  150. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +2 -2
  151. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +3 -3
  152. data/vendor/faiss/faiss/gpu/test/TestGpuResidualQuantizer.cpp +70 -0
  153. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +74 -4
  154. data/vendor/faiss/faiss/gpu/test/TestUtils.h +2 -2
  155. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  156. data/vendor/faiss/faiss/gpu/utils/{RaftUtils.h → CuvsUtils.h} +12 -11
  157. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +2 -2
  158. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +2 -2
  159. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +2 -2
  160. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +2 -2
  161. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +2 -2
  162. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  163. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +79 -11
  164. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +17 -5
  165. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +2 -2
  166. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +2 -2
  167. data/vendor/faiss/faiss/impl/CodePacker.cpp +2 -2
  168. data/vendor/faiss/faiss/impl/CodePacker.h +2 -2
  169. data/vendor/faiss/faiss/impl/DistanceComputer.h +2 -2
  170. data/vendor/faiss/faiss/impl/FaissAssert.h +2 -2
  171. data/vendor/faiss/faiss/impl/FaissException.cpp +2 -2
  172. data/vendor/faiss/faiss/impl/FaissException.h +2 -3
  173. data/vendor/faiss/faiss/impl/HNSW.cpp +24 -19
  174. data/vendor/faiss/faiss/impl/HNSW.h +12 -2
  175. data/vendor/faiss/faiss/impl/IDSelector.cpp +2 -2
  176. data/vendor/faiss/faiss/impl/IDSelector.h +2 -2
  177. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +2 -2
  178. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +2 -2
  179. data/vendor/faiss/faiss/impl/LookupTableScaler.h +2 -2
  180. data/vendor/faiss/faiss/impl/NNDescent.cpp +2 -2
  181. data/vendor/faiss/faiss/impl/NNDescent.h +2 -2
  182. data/vendor/faiss/faiss/impl/NSG.cpp +27 -21
  183. data/vendor/faiss/faiss/impl/NSG.h +20 -8
  184. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +2 -2
  185. data/vendor/faiss/faiss/impl/PolysemousTraining.h +2 -2
  186. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +2 -4
  187. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +2 -2
  188. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +2 -2
  189. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +2 -2
  190. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -2
  191. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  192. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +2 -36
  193. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +3 -13
  194. data/vendor/faiss/faiss/impl/ResultHandler.h +2 -2
  195. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +2 -2
  196. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +2 -2
  197. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +2 -2
  198. data/vendor/faiss/faiss/impl/ThreadedIndex.h +2 -2
  199. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +2 -2
  200. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +2 -2
  201. data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +2 -2
  202. data/vendor/faiss/faiss/impl/code_distance/code_distance-sve.h +440 -0
  203. data/vendor/faiss/faiss/impl/code_distance/code_distance.h +55 -2
  204. data/vendor/faiss/faiss/impl/index_read.cpp +2 -5
  205. data/vendor/faiss/faiss/impl/index_read_utils.h +2 -2
  206. data/vendor/faiss/faiss/impl/index_write.cpp +2 -6
  207. data/vendor/faiss/faiss/impl/io.cpp +2 -2
  208. data/vendor/faiss/faiss/impl/io.h +2 -2
  209. data/vendor/faiss/faiss/impl/io_macros.h +2 -9
  210. data/vendor/faiss/faiss/impl/kmeans1d.cpp +2 -3
  211. data/vendor/faiss/faiss/impl/kmeans1d.h +2 -2
  212. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +2 -3
  213. data/vendor/faiss/faiss/impl/lattice_Zn.h +2 -2
  214. data/vendor/faiss/faiss/impl/platform_macros.h +12 -2
  215. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +2 -2
  216. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +20 -2
  217. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +2 -2
  218. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +2 -2
  219. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +3 -3
  220. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +2 -2
  221. data/vendor/faiss/faiss/impl/simd_result_handlers.h +18 -18
  222. data/vendor/faiss/faiss/index_factory.cpp +20 -21
  223. data/vendor/faiss/faiss/index_factory.h +2 -2
  224. data/vendor/faiss/faiss/index_io.h +2 -2
  225. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +2 -2
  226. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +2 -2
  227. data/vendor/faiss/faiss/invlists/DirectMap.cpp +2 -2
  228. data/vendor/faiss/faiss/invlists/DirectMap.h +2 -2
  229. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +2 -2
  230. data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
  231. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +2 -2
  232. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +2 -2
  233. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -3
  234. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -2
  235. data/vendor/faiss/faiss/python/python_callbacks.cpp +2 -2
  236. data/vendor/faiss/faiss/python/python_callbacks.h +2 -2
  237. data/vendor/faiss/faiss/utils/AlignedTable.h +5 -3
  238. data/vendor/faiss/faiss/utils/Heap.cpp +2 -2
  239. data/vendor/faiss/faiss/utils/Heap.h +2 -2
  240. data/vendor/faiss/faiss/utils/NeuralNet.cpp +11 -7
  241. data/vendor/faiss/faiss/utils/NeuralNet.h +2 -2
  242. data/vendor/faiss/faiss/utils/WorkerThread.cpp +2 -2
  243. data/vendor/faiss/faiss/utils/WorkerThread.h +2 -2
  244. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +2 -2
  245. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +2 -2
  246. data/vendor/faiss/faiss/utils/approx_topk/generic.h +2 -2
  247. data/vendor/faiss/faiss/utils/approx_topk/mode.h +2 -2
  248. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +2 -2
  249. data/vendor/faiss/faiss/utils/bf16.h +2 -2
  250. data/vendor/faiss/faiss/utils/distances.cpp +191 -2
  251. data/vendor/faiss/faiss/utils/distances.h +3 -3
  252. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +2 -2
  253. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
  254. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
  255. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +2 -2
  256. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +2 -2
  257. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +2 -2
  258. data/vendor/faiss/faiss/utils/distances_simd.cpp +502 -3
  259. data/vendor/faiss/faiss/utils/extra_distances-inl.h +2 -2
  260. data/vendor/faiss/faiss/utils/extra_distances.cpp +2 -3
  261. data/vendor/faiss/faiss/utils/extra_distances.h +2 -2
  262. data/vendor/faiss/faiss/utils/fp16-arm.h +2 -2
  263. data/vendor/faiss/faiss/utils/fp16-fp16c.h +2 -2
  264. data/vendor/faiss/faiss/utils/fp16-inl.h +2 -2
  265. data/vendor/faiss/faiss/utils/fp16.h +2 -2
  266. data/vendor/faiss/faiss/utils/hamming-inl.h +2 -2
  267. data/vendor/faiss/faiss/utils/hamming.cpp +2 -3
  268. data/vendor/faiss/faiss/utils/hamming.h +2 -2
  269. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +2 -2
  270. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +490 -0
  271. data/vendor/faiss/faiss/utils/hamming_distance/common.h +2 -2
  272. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +2 -2
  273. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +5 -2
  274. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +5 -5
  275. data/vendor/faiss/faiss/utils/ordered_key_value.h +2 -2
  276. data/vendor/faiss/faiss/utils/partitioning.cpp +2 -2
  277. data/vendor/faiss/faiss/utils/partitioning.h +2 -2
  278. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  279. data/vendor/faiss/faiss/utils/quantize_lut.cpp +2 -2
  280. data/vendor/faiss/faiss/utils/quantize_lut.h +2 -2
  281. data/vendor/faiss/faiss/utils/random.cpp +2 -2
  282. data/vendor/faiss/faiss/utils/random.h +2 -2
  283. data/vendor/faiss/faiss/utils/simdlib.h +2 -2
  284. data/vendor/faiss/faiss/utils/simdlib_avx2.h +2 -2
  285. data/vendor/faiss/faiss/utils/simdlib_avx512.h +2 -2
  286. data/vendor/faiss/faiss/utils/simdlib_emulated.h +2 -2
  287. data/vendor/faiss/faiss/utils/simdlib_neon.h +2 -2
  288. data/vendor/faiss/faiss/utils/simdlib_ppc64.h +2 -2
  289. data/vendor/faiss/faiss/utils/sorting.cpp +2 -2
  290. data/vendor/faiss/faiss/utils/sorting.h +2 -2
  291. data/vendor/faiss/faiss/utils/transpose/transpose-avx2-inl.h +2 -2
  292. data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +2 -2
  293. data/vendor/faiss/faiss/utils/utils.cpp +7 -7
  294. data/vendor/faiss/faiss/utils/utils.h +4 -3
  295. metadata +9 -10
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -29,6 +29,10 @@
29
29
  #include <faiss/utils/transpose/transpose-avx2-inl.h>
30
30
  #endif
31
31
 
32
+ #ifdef __ARM_FEATURE_SVE
33
+ #include <arm_sve.h>
34
+ #endif
35
+
32
36
  #ifdef __aarch64__
33
37
  #include <arm_neon.h>
34
38
  #endif
@@ -2585,6 +2589,7 @@ size_t fvec_L2sqr_ny_nearest_y_transposed(
2585
2589
 
2586
2590
  float fvec_L1(const float* x, const float* y, size_t d) {
2587
2591
  __m256 msum1 = _mm256_setzero_ps();
2592
+ // signmask used for absolute value
2588
2593
  __m256 signmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffffUL));
2589
2594
 
2590
2595
  while (d >= 8) {
@@ -2592,7 +2597,9 @@ float fvec_L1(const float* x, const float* y, size_t d) {
2592
2597
  x += 8;
2593
2598
  __m256 my = _mm256_loadu_ps(y);
2594
2599
  y += 8;
2600
+ // subtract
2595
2601
  const __m256 a_m_b = _mm256_sub_ps(mx, my);
2602
+ // find sum of absolute value of distances (manhattan distance)
2596
2603
  msum1 = _mm256_add_ps(msum1, _mm256_and_ps(signmask, a_m_b));
2597
2604
  d -= 8;
2598
2605
  }
@@ -2625,6 +2632,7 @@ float fvec_L1(const float* x, const float* y, size_t d) {
2625
2632
 
2626
2633
  float fvec_Linf(const float* x, const float* y, size_t d) {
2627
2634
  __m256 msum1 = _mm256_setzero_ps();
2635
+ // signmask used for absolute value
2628
2636
  __m256 signmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffffUL));
2629
2637
 
2630
2638
  while (d >= 8) {
@@ -2632,7 +2640,9 @@ float fvec_Linf(const float* x, const float* y, size_t d) {
2632
2640
  x += 8;
2633
2641
  __m256 my = _mm256_loadu_ps(y);
2634
2642
  y += 8;
2643
+ // subtract
2635
2644
  const __m256 a_m_b = _mm256_sub_ps(mx, my);
2645
+ // find max of absolute value of distances (chebyshev distance)
2636
2646
  msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b));
2637
2647
  d -= 8;
2638
2648
  }
@@ -2673,6 +2683,441 @@ float fvec_Linf(const float* x, const float* y, size_t d) {
2673
2683
  return fvec_Linf_ref(x, y, d);
2674
2684
  }
2675
2685
 
2686
+ #elif defined(__ARM_FEATURE_SVE)
2687
+
2688
// Element-operation policy for inner products, used as the ElementOp template
// argument of the fvec_op_ny_sve_* kernels in this file.
//   op(pg, x, y)       -> x * y          (starts an accumulation)
//   merge(pg, z, x, y) -> z + x * y      (continues an accumulation)
// Both use the "_x" predication form, so lanes that are inactive in pg hold
// unspecified values; callers only store/reduce lanes selected by a predicate.
+ struct ElementOpIP {
2689
+ static svfloat32_t op(svbool_t pg, svfloat32_t x, svfloat32_t y) {
2690
+ return svmul_f32_x(pg, x, y);
2691
+ }
2692
+ static svfloat32_t merge(
2693
+ svbool_t pg,
2694
+ svfloat32_t z,
2695
+ svfloat32_t x,
2696
+ svfloat32_t y) {
2697
+ return svmla_f32_x(pg, z, x, y);
2698
+ }
2699
+ };
2700
+
2701
// Applies ElementOp::op between the scalar x[0] and each of the ny floats in y,
// writing one result per input float to dis[0..ny).
// Only x[0] is read, i.e. this is the kernel for 1-dimensional vectors.
//   dis : output, ny floats
//   x   : query, only x[0] is used
//   y   : ny contiguous floats
//   ny  : number of outputs
+ template <typename ElementOp>
2702
+ void fvec_op_ny_sve_d1(float* dis, const float* x, const float* y, size_t ny) {
2703
// lanes = number of 32-bit elements in one SVE vector (runtime value).
+ const size_t lanes = svcntw();
2704
+ const size_t lanes2 = lanes * 2;
2705
+ const size_t lanes3 = lanes * 3;
2706
+ const size_t lanes4 = lanes * 4;
2707
+ const svbool_t pg = svptrue_b32();
2708
// Broadcast the single query component to every lane.
+ const svfloat32_t x0 = svdup_n_f32(x[0]);
2709
+ size_t i = 0;
2710
// Main loop: four full vectors (4 * lanes outputs) per iteration.
// The bound is strict, so for ny > 0 it exits with 1..4*lanes outputs left,
// which the predicated tail below always handles.
+ for (; i + lanes4 < ny; i += lanes4) {
2711
+ svfloat32_t y0 = svld1_f32(pg, y);
2712
+ svfloat32_t y1 = svld1_f32(pg, y + lanes);
2713
+ svfloat32_t y2 = svld1_f32(pg, y + lanes2);
2714
+ svfloat32_t y3 = svld1_f32(pg, y + lanes3);
2715
+ y0 = ElementOp::op(pg, x0, y0);
2716
+ y1 = ElementOp::op(pg, x0, y1);
2717
+ y2 = ElementOp::op(pg, x0, y2);
2718
+ y3 = ElementOp::op(pg, x0, y3);
2719
+ svst1_f32(pg, dis, y0);
2720
+ svst1_f32(pg, dis + lanes, y1);
2721
+ svst1_f32(pg, dis + lanes2, y2);
2722
+ svst1_f32(pg, dis + lanes3, y3);
2723
+ y += lanes4;
2724
+ dis += lanes4;
2725
+ }
2726
// Tail: one predicate per vector slot; lane k of pgN is active only while
// (i + N * lanes + k) < ny, so loads and stores never touch elements >= ny.
+ const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
2727
+ const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
2728
+ const svbool_t pg2 = svwhilelt_b32_u64(i + lanes2, ny);
2729
+ const svbool_t pg3 = svwhilelt_b32_u64(i + lanes3, ny);
2730
+ svfloat32_t y0 = svld1_f32(pg0, y);
2731
+ svfloat32_t y1 = svld1_f32(pg1, y + lanes);
2732
+ svfloat32_t y2 = svld1_f32(pg2, y + lanes2);
2733
+ svfloat32_t y3 = svld1_f32(pg3, y + lanes3);
2734
+ y0 = ElementOp::op(pg0, x0, y0);
2735
+ y1 = ElementOp::op(pg1, x0, y1);
2736
+ y2 = ElementOp::op(pg2, x0, y2);
2737
+ y3 = ElementOp::op(pg3, x0, y3);
2738
+ svst1_f32(pg0, dis, y0);
2739
+ svst1_f32(pg1, dis + lanes, y1);
2740
+ svst1_f32(pg2, dis + lanes2, y2);
2741
+ svst1_f32(pg3, dis + lanes3, y3);
2742
+ }
2743
+
2744
// Kernel for 2-dimensional vectors: y holds ny interleaved pairs
// (y[2*j], y[2*j+1]) and, for each pair j, writes
//   dis[j] = merge(op(x[0], y[2*j]), x[1], y[2*j+1]).
//   dis : output, ny floats
//   x   : query, x[0] and x[1] are used
//   y   : 2 * ny contiguous floats
//   ny  : number of outputs
+ template <typename ElementOp>
2745
+ void fvec_op_ny_sve_d2(float* dis, const float* x, const float* y, size_t ny) {
2746
+ const size_t lanes = svcntw();
2747
+ const size_t lanes2 = lanes * 2;
2748
+ const size_t lanes4 = lanes * 4;
2749
+ const svbool_t pg = svptrue_b32();
2750
+ const svfloat32_t x0 = svdup_n_f32(x[0]);
2751
+ const svfloat32_t x1 = svdup_n_f32(x[1]);
2752
+ size_t i = 0;
2753
// Main loop: 2 * lanes outputs per iteration, consuming 4 * lanes floats of y.
// Strict bound: the last 1..2*lanes outputs are left to the predicated tail.
+ for (; i + lanes2 < ny; i += lanes2) {
2754
// svld2 de-interleaves: element 0 gets the first component of each pair,
// element 1 the second component.
+ const svfloat32x2_t y0 = svld2_f32(pg, y);
2755
+ const svfloat32x2_t y1 = svld2_f32(pg, y + lanes2);
2756
+ svfloat32_t y00 = svget2_f32(y0, 0);
2757
+ const svfloat32_t y01 = svget2_f32(y0, 1);
2758
+ svfloat32_t y10 = svget2_f32(y1, 0);
2759
+ const svfloat32_t y11 = svget2_f32(y1, 1);
2760
+ y00 = ElementOp::op(pg, x0, y00);
2761
+ y10 = ElementOp::op(pg, x0, y10);
2762
+ y00 = ElementOp::merge(pg, y00, x1, y01);
2763
+ y10 = ElementOp::merge(pg, y10, x1, y11);
2764
+ svst1_f32(pg, dis, y00);
2765
+ svst1_f32(pg, dis + lanes, y10);
2766
+ y += lanes4;
2767
+ dis += lanes2;
2768
+ }
2769
// Tail: predicates are expressed in output indices (one lane per pair).
+ const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
2770
+ const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
2771
+ const svfloat32x2_t y0 = svld2_f32(pg0, y);
2772
+ const svfloat32x2_t y1 = svld2_f32(pg1, y + lanes2);
2773
+ svfloat32_t y00 = svget2_f32(y0, 0);
2774
+ const svfloat32_t y01 = svget2_f32(y0, 1);
2775
+ svfloat32_t y10 = svget2_f32(y1, 0);
2776
+ const svfloat32_t y11 = svget2_f32(y1, 1);
2777
+ y00 = ElementOp::op(pg0, x0, y00);
2778
+ y10 = ElementOp::op(pg1, x0, y10);
2779
+ y00 = ElementOp::merge(pg0, y00, x1, y01);
2780
+ y10 = ElementOp::merge(pg1, y10, x1, y11);
2781
+ svst1_f32(pg0, dis, y00);
2782
+ svst1_f32(pg1, dis + lanes, y10);
2783
+ }
2784
+
2785
// Kernel for 4-dimensional vectors: y holds ny groups of 4 floats and, for
// each group j with components c0..c3, writes
//   dis[j] = merge(op(x[0], c0), x[1], c1) + merge(op(x[2], c2), x[3], c3).
//   dis : output, ny floats
//   x   : query, x[0..3] are used
//   y   : 4 * ny contiguous floats
//   ny  : number of outputs
+ template <typename ElementOp>
2786
+ void fvec_op_ny_sve_d4(float* dis, const float* x, const float* y, size_t ny) {
2787
+ const size_t lanes = svcntw();
2788
+ const size_t lanes4 = lanes * 4;
2789
+ const svbool_t pg = svptrue_b32();
2790
+ const svfloat32_t x0 = svdup_n_f32(x[0]);
2791
+ const svfloat32_t x1 = svdup_n_f32(x[1]);
2792
+ const svfloat32_t x2 = svdup_n_f32(x[2]);
2793
+ const svfloat32_t x3 = svdup_n_f32(x[3]);
2794
+ size_t i = 0;
2795
// Main loop: `lanes` outputs per iteration, consuming 4 * lanes floats of y.
// Strict bound: the last 1..lanes outputs are left to the predicated tail.
+ for (; i + lanes < ny; i += lanes) {
2796
// svld4 de-interleaves the 4 components into 4 separate vectors.
+ const svfloat32x4_t y0 = svld4_f32(pg, y);
2797
+ svfloat32_t y00 = svget4_f32(y0, 0);
2798
+ const svfloat32_t y01 = svget4_f32(y0, 1);
2799
+ svfloat32_t y02 = svget4_f32(y0, 2);
2800
+ const svfloat32_t y03 = svget4_f32(y0, 3);
2801
// Two independent accumulators (components 0/1 and 2/3), summed at the end.
+ y00 = ElementOp::op(pg, x0, y00);
2802
+ y02 = ElementOp::op(pg, x2, y02);
2803
+ y00 = ElementOp::merge(pg, y00, x1, y01);
2804
+ y02 = ElementOp::merge(pg, y02, x3, y03);
2805
+ y00 = svadd_f32_x(pg, y00, y02);
2806
+ svst1_f32(pg, dis, y00);
2807
+ y += lanes4;
2808
+ dis += lanes;
2809
+ }
2810
// Tail: same computation under a predicate covering outputs i..ny-1.
+ const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
2811
+ const svfloat32x4_t y0 = svld4_f32(pg0, y);
2812
+ svfloat32_t y00 = svget4_f32(y0, 0);
2813
+ const svfloat32_t y01 = svget4_f32(y0, 1);
2814
+ svfloat32_t y02 = svget4_f32(y0, 2);
2815
+ const svfloat32_t y03 = svget4_f32(y0, 3);
2816
+ y00 = ElementOp::op(pg0, x0, y00);
2817
+ y02 = ElementOp::op(pg0, x2, y02);
2818
+ y00 = ElementOp::merge(pg0, y00, x1, y01);
2819
+ y02 = ElementOp::merge(pg0, y02, x3, y03);
2820
+ y00 = svadd_f32_x(pg0, y00, y02);
2821
+ svst1_f32(pg0, dis, y00);
2822
+ }
2823
+
2824
// Kernel for 8-dimensional vectors: y advances by 8 * lanes floats for every
// `lanes` outputs, i.e. each output consumes 8 consecutive floats c0..c7, and
//   dis[j] = (merge(op(x0,c0),x1,c1) + merge(op(x2,c2),x3,c3))
//          + (merge(op(x4,c4),x5,c5) + merge(op(x6,c6),x7,c7)).
//   dis : output, ny floats
//   x   : query, x[0..7] are used
//   y   : 8 * ny contiguous floats
//   ny  : number of outputs
+ template <typename ElementOp>
2825
+ void fvec_op_ny_sve_d8(float* dis, const float* x, const float* y, size_t ny) {
2826
+ const size_t lanes = svcntw();
2827
+ const size_t lanes4 = lanes * 4;
2828
+ const size_t lanes8 = lanes * 8;
2829
+ const svbool_t pg = svptrue_b32();
2830
+ const svfloat32_t x0 = svdup_n_f32(x[0]);
2831
+ const svfloat32_t x1 = svdup_n_f32(x[1]);
2832
+ const svfloat32_t x2 = svdup_n_f32(x[2]);
2833
+ const svfloat32_t x3 = svdup_n_f32(x[3]);
2834
+ const svfloat32_t x4 = svdup_n_f32(x[4]);
2835
+ const svfloat32_t x5 = svdup_n_f32(x[5]);
2836
+ const svfloat32_t x6 = svdup_n_f32(x[6]);
2837
+ const svfloat32_t x7 = svdup_n_f32(x[7]);
2838
+ size_t i = 0;
2839
// Main loop: `lanes` outputs per iteration, consuming 8 * lanes floats of y.
// Strict bound: the last 1..lanes outputs are left to the predicated tail.
+ for (; i + lanes < ny; i += lanes) {
2840
// There is no 8-way structure load here, so two svld4 loads are used. Each
// 8-float vector spans two consecutive 4-float structures: the even
// structure carries c0..c3 and the odd one carries c4..c7.
+ const svfloat32x4_t ya = svld4_f32(pg, y);
2841
+ const svfloat32x4_t yb = svld4_f32(pg, y + lanes4);
2842
+ const svfloat32_t ya0 = svget4_f32(ya, 0);
2843
+ const svfloat32_t ya1 = svget4_f32(ya, 1);
2844
+ const svfloat32_t ya2 = svget4_f32(ya, 2);
2845
+ const svfloat32_t ya3 = svget4_f32(ya, 3);
2846
+ const svfloat32_t yb0 = svget4_f32(yb, 0);
2847
+ const svfloat32_t yb1 = svget4_f32(yb, 1);
2848
+ const svfloat32_t yb2 = svget4_f32(yb, 2);
2849
+ const svfloat32_t yb3 = svget4_f32(yb, 3);
2850
// svuzp1 keeps the even-indexed elements of (a, b) -> components c0..c3;
// svuzp2 keeps the odd-indexed elements            -> components c4..c7.
+ svfloat32_t y0 = svuzp1(ya0, yb0);
2851
+ const svfloat32_t y1 = svuzp1(ya1, yb1);
2852
+ svfloat32_t y2 = svuzp1(ya2, yb2);
2853
+ const svfloat32_t y3 = svuzp1(ya3, yb3);
2854
+ svfloat32_t y4 = svuzp2(ya0, yb0);
2855
+ const svfloat32_t y5 = svuzp2(ya1, yb1);
2856
+ svfloat32_t y6 = svuzp2(ya2, yb2);
2857
+ const svfloat32_t y7 = svuzp2(ya3, yb3);
2858
// Four independent accumulators, reduced pairwise below.
+ y0 = ElementOp::op(pg, x0, y0);
2859
+ y2 = ElementOp::op(pg, x2, y2);
2860
+ y4 = ElementOp::op(pg, x4, y4);
2861
+ y6 = ElementOp::op(pg, x6, y6);
2862
+ y0 = ElementOp::merge(pg, y0, x1, y1);
2863
+ y2 = ElementOp::merge(pg, y2, x3, y3);
2864
+ y4 = ElementOp::merge(pg, y4, x5, y5);
2865
+ y6 = ElementOp::merge(pg, y6, x7, y7);
2866
+ y0 = svadd_f32_x(pg, y0, y2);
2867
+ y4 = svadd_f32_x(pg, y4, y6);
2868
+ y0 = svadd_f32_x(pg, y0, y4);
2869
+ svst1_f32(pg, dis, y0);
2870
+ y += lanes8;
2871
+ dis += lanes;
2872
+ }
2873
// Tail. pg0 is expressed in outputs; pga/pgb are expressed in 4-float
// structures, of which there are two per output, hence the doubled bounds.
+ const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
2874
+ const svbool_t pga = svwhilelt_b32_u64(i * 2, ny * 2);
2875
+ const svbool_t pgb = svwhilelt_b32_u64(i * 2 + lanes, ny * 2);
2876
+ const svfloat32x4_t ya = svld4_f32(pga, y);
2877
+ const svfloat32x4_t yb = svld4_f32(pgb, y + lanes4);
2878
+ const svfloat32_t ya0 = svget4_f32(ya, 0);
2879
+ const svfloat32_t ya1 = svget4_f32(ya, 1);
2880
+ const svfloat32_t ya2 = svget4_f32(ya, 2);
2881
+ const svfloat32_t ya3 = svget4_f32(ya, 3);
2882
+ const svfloat32_t yb0 = svget4_f32(yb, 0);
2883
+ const svfloat32_t yb1 = svget4_f32(yb, 1);
2884
+ const svfloat32_t yb2 = svget4_f32(yb, 2);
2885
+ const svfloat32_t yb3 = svget4_f32(yb, 3);
2886
+ svfloat32_t y0 = svuzp1(ya0, yb0);
2887
+ const svfloat32_t y1 = svuzp1(ya1, yb1);
2888
+ svfloat32_t y2 = svuzp1(ya2, yb2);
2889
+ const svfloat32_t y3 = svuzp1(ya3, yb3);
2890
+ svfloat32_t y4 = svuzp2(ya0, yb0);
2891
+ const svfloat32_t y5 = svuzp2(ya1, yb1);
2892
+ svfloat32_t y6 = svuzp2(ya2, yb2);
2893
+ const svfloat32_t y7 = svuzp2(ya3, yb3);
2894
+ y0 = ElementOp::op(pg0, x0, y0);
2895
+ y2 = ElementOp::op(pg0, x2, y2);
2896
+ y4 = ElementOp::op(pg0, x4, y4);
2897
+ y6 = ElementOp::op(pg0, x6, y6);
2898
+ y0 = ElementOp::merge(pg0, y0, x1, y1);
2899
+ y2 = ElementOp::merge(pg0, y2, x3, y3);
2900
+ y4 = ElementOp::merge(pg0, y4, x5, y5);
2901
+ y6 = ElementOp::merge(pg0, y6, x7, y7);
2902
+ y0 = svadd_f32_x(pg0, y0, y2);
2903
+ y4 = svadd_f32_x(pg0, y4, y6);
2904
+ y0 = svadd_f32_x(pg0, y0, y4);
2905
+ svst1_f32(pg0, dis, y0);
2906
// NOTE(review): the two updates below have no effect -- y and dis are
// by-value parameters and are not read again before the function returns.
+ y += lanes8;
2907
+ dis += lanes;
2908
+ }
2909
+
2910
// Kernel for vectors that fill exactly one SVE register: x is read as one full
// vector of svcntw() floats and y advances by the same amount per output.
// For each of the ny rows of y, writes
//   dis[i] = horizontal sum of op(x, y_row).
//   dis : output, ny floats
//   x   : query, svcntw() floats
//   y   : ny rows of svcntw() contiguous floats
//   ny  : number of rows / outputs
+ template <typename ElementOp>
2911
+ void fvec_op_ny_sve_lanes1(
2912
+ float* dis,
2913
+ const float* x,
2914
+ const float* y,
2915
+ size_t ny) {
2916
+ const size_t lanes = svcntw();
2917
+ const size_t lanes2 = lanes * 2;
2918
+ const size_t lanes3 = lanes * 3;
2919
+ const size_t lanes4 = lanes * 4;
2920
+ const svbool_t pg = svptrue_b32();
2921
+ const svfloat32_t x0 = svld1_f32(pg, x);
2922
+ size_t i = 0;
2923
// Unrolled by 4: handles rows i..i+3 while at least four rows remain.
+ for (; i + 3 < ny; i += 4) {
2924
+ svfloat32_t y0 = svld1_f32(pg, y);
2925
+ svfloat32_t y1 = svld1_f32(pg, y + lanes);
2926
+ svfloat32_t y2 = svld1_f32(pg, y + lanes2);
2927
+ svfloat32_t y3 = svld1_f32(pg, y + lanes3);
2928
+ y += lanes4;
2929
+ y0 = ElementOp::op(pg, x0, y0);
2930
+ y1 = ElementOp::op(pg, x0, y1);
2931
+ y2 = ElementOp::op(pg, x0, y2);
2932
+ y3 = ElementOp::op(pg, x0, y3);
2933
// svaddv reduces all active lanes to a single scalar.
+ dis[i] = svaddv_f32(pg, y0);
2934
+ dis[i + 1] = svaddv_f32(pg, y1);
2935
+ dis[i + 2] = svaddv_f32(pg, y2);
2936
+ dis[i + 3] = svaddv_f32(pg, y3);
2937
+ }
2938
// Remainder: up to three rows, one at a time.
+ for (; i < ny; ++i) {
2939
+ svfloat32_t y0 = svld1_f32(pg, y);
2940
+ y += lanes;
2941
+ y0 = ElementOp::op(pg, x0, y0);
2942
+ dis[i] = svaddv_f32(pg, y0);
2943
+ }
2944
+ }
2945
+
2946
// Kernel for vectors that fill exactly two SVE registers: x is read as two
// full vectors (2 * svcntw() floats) and each row of y has the same length.
// For each of the ny rows, writes
//   dis[i] = horizontal sum of merge(op(x_lo, y_lo), x_hi, y_hi).
//   dis : output, ny floats
//   x   : query, 2 * svcntw() floats
//   y   : ny rows of 2 * svcntw() contiguous floats
//   ny  : number of rows / outputs
+ template <typename ElementOp>
2947
+ void fvec_op_ny_sve_lanes2(
2948
+ float* dis,
2949
+ const float* x,
2950
+ const float* y,
2951
+ size_t ny) {
2952
+ const size_t lanes = svcntw();
2953
+ const size_t lanes2 = lanes * 2;
2954
+ const size_t lanes3 = lanes * 3;
2955
+ const size_t lanes4 = lanes * 4;
2956
+ const svbool_t pg = svptrue_b32();
2957
+ const svfloat32_t x0 = svld1_f32(pg, x);
2958
+ const svfloat32_t x1 = svld1_f32(pg, x + lanes);
2959
+ size_t i = 0;
2960
// Unrolled by 2: y00/y01 are the two halves of row i, y10/y11 of row i + 1.
+ for (; i + 1 < ny; i += 2) {
2961
+ svfloat32_t y00 = svld1_f32(pg, y);
2962
+ const svfloat32_t y01 = svld1_f32(pg, y + lanes);
2963
+ svfloat32_t y10 = svld1_f32(pg, y + lanes2);
2964
+ const svfloat32_t y11 = svld1_f32(pg, y + lanes3);
2965
+ y += lanes4;
2966
+ y00 = ElementOp::op(pg, x0, y00);
2967
+ y10 = ElementOp::op(pg, x0, y10);
2968
+ y00 = ElementOp::merge(pg, y00, x1, y01);
2969
+ y10 = ElementOp::merge(pg, y10, x1, y11);
2970
+ dis[i] = svaddv_f32(pg, y00);
2971
+ dis[i + 1] = svaddv_f32(pg, y10);
2972
+ }
2973
// Remainder: at most one row is left when ny is odd.
+ if (i < ny) {
2974
+ svfloat32_t y0 = svld1_f32(pg, y);
2975
+ const svfloat32_t y1 = svld1_f32(pg, y + lanes);
2976
+ y0 = ElementOp::op(pg, x0, y0);
2977
+ y0 = ElementOp::merge(pg, y0, x1, y1);
2978
+ dis[i] = svaddv_f32(pg, y0);
2979
+ }
2980
+ }
2981
+
2982
// Kernel for vectors that fill exactly three SVE registers: x is read as three
// full vectors (3 * svcntw() floats) and each row of y has the same length.
// For each of the ny rows, a single accumulator is chained across the three
// register-sized pieces and then reduced to a scalar:
//   dis[i] = horizontal sum of merge(merge(op(x0, y0), x1, y1), x2, y2).
//   dis : output, ny floats
//   x   : query, 3 * svcntw() floats
//   y   : ny rows of 3 * svcntw() contiguous floats
//   ny  : number of rows / outputs
+ template <typename ElementOp>
2983
+ void fvec_op_ny_sve_lanes3(
2984
+ float* dis,
2985
+ const float* x,
2986
+ const float* y,
2987
+ size_t ny) {
2988
+ const size_t lanes = svcntw();
2989
+ const size_t lanes2 = lanes * 2;
2990
+ const size_t lanes3 = lanes * 3;
2991
+ const svbool_t pg = svptrue_b32();
2992
+ const svfloat32_t x0 = svld1_f32(pg, x);
2993
+ const svfloat32_t x1 = svld1_f32(pg, x + lanes);
2994
+ const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
2995
// One row per iteration; no unrolling and no tail handling is needed.
+ for (size_t i = 0; i < ny; ++i) {
2996
+ svfloat32_t y0 = svld1_f32(pg, y);
2997
+ const svfloat32_t y1 = svld1_f32(pg, y + lanes);
2998
+ svfloat32_t y2 = svld1_f32(pg, y + lanes2);
2999
+ y += lanes3;
3000
+ y0 = ElementOp::op(pg, x0, y0);
3001
+ y0 = ElementOp::merge(pg, y0, x1, y1);
3002
+ y0 = ElementOp::merge(pg, y0, x2, y2);
3003
+ dis[i] = svaddv_f32(pg, y0);
3004
+ }
3005
+ }
3006
+
3007
+ template <typename ElementOp>
3008
+ void fvec_op_ny_sve_lanes4(
3009
+ float* dis,
3010
+ const float* x,
3011
+ const float* y,
3012
+ size_t ny) {
3013
+ const size_t lanes = svcntw();
3014
+ const size_t lanes2 = lanes * 2;
3015
+ const size_t lanes3 = lanes * 3;
3016
+ const size_t lanes4 = lanes * 4;
3017
+ const svbool_t pg = svptrue_b32();
3018
+ const svfloat32_t x0 = svld1_f32(pg, x);
3019
+ const svfloat32_t x1 = svld1_f32(pg, x + lanes);
3020
+ const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
3021
+ const svfloat32_t x3 = svld1_f32(pg, x + lanes3);
3022
+ for (size_t i = 0; i < ny; ++i) {
3023
+ svfloat32_t y0 = svld1_f32(pg, y);
3024
+ const svfloat32_t y1 = svld1_f32(pg, y + lanes);
3025
+ svfloat32_t y2 = svld1_f32(pg, y + lanes2);
3026
+ const svfloat32_t y3 = svld1_f32(pg, y + lanes3);
3027
+ y += lanes4;
3028
+ y0 = ElementOp::op(pg, x0, y0);
3029
+ y2 = ElementOp::op(pg, x2, y2);
3030
+ y0 = ElementOp::merge(pg, y0, x1, y1);
3031
+ y2 = ElementOp::merge(pg, y2, x3, y3);
3032
+ y0 = svadd_f32_x(pg, y0, y2);
3033
+ dis[i] = svaddv_f32(pg, y0);
3034
+ }
3035
+ }
3036
+
3037
+ void fvec_L2sqr_ny(
3038
+ float* dis,
3039
+ const float* x,
3040
+ const float* y,
3041
+ size_t d,
3042
+ size_t ny) {
3043
+ fvec_L2sqr_ny_ref(dis, x, y, d, ny);
3044
+ }
3045
+
3046
+ void fvec_L2sqr_ny_transposed(
3047
+ float* dis,
3048
+ const float* x,
3049
+ const float* y,
3050
+ const float* y_sqlen,
3051
+ size_t d,
3052
+ size_t d_offset,
3053
+ size_t ny) {
3054
+ return fvec_L2sqr_ny_y_transposed_ref(dis, x, y, y_sqlen, d, d_offset, ny);
3055
+ }
3056
+
3057
+ size_t fvec_L2sqr_ny_nearest(
3058
+ float* distances_tmp_buffer,
3059
+ const float* x,
3060
+ const float* y,
3061
+ size_t d,
3062
+ size_t ny) {
3063
+ return fvec_L2sqr_ny_nearest_ref(distances_tmp_buffer, x, y, d, ny);
3064
+ }
3065
+
3066
+ size_t fvec_L2sqr_ny_nearest_y_transposed(
3067
+ float* distances_tmp_buffer,
3068
+ const float* x,
3069
+ const float* y,
3070
+ const float* y_sqlen,
3071
+ size_t d,
3072
+ size_t d_offset,
3073
+ size_t ny) {
3074
+ return fvec_L2sqr_ny_nearest_y_transposed_ref(
3075
+ distances_tmp_buffer, x, y, y_sqlen, d, d_offset, ny);
3076
+ }
3077
+
3078
+ float fvec_L1(const float* x, const float* y, size_t d) {
3079
+ return fvec_L1_ref(x, y, d);
3080
+ }
3081
+
3082
+ float fvec_Linf(const float* x, const float* y, size_t d) {
3083
+ return fvec_Linf_ref(x, y, d);
3084
+ }
3085
+
3086
+ void fvec_inner_products_ny(
3087
+ float* dis,
3088
+ const float* x,
3089
+ const float* y,
3090
+ size_t d,
3091
+ size_t ny) {
3092
+ const size_t lanes = svcntw();
3093
+ switch (d) {
3094
+ case 1:
3095
+ fvec_op_ny_sve_d1<ElementOpIP>(dis, x, y, ny);
3096
+ break;
3097
+ case 2:
3098
+ fvec_op_ny_sve_d2<ElementOpIP>(dis, x, y, ny);
3099
+ break;
3100
+ case 4:
3101
+ fvec_op_ny_sve_d4<ElementOpIP>(dis, x, y, ny);
3102
+ break;
3103
+ case 8:
3104
+ fvec_op_ny_sve_d8<ElementOpIP>(dis, x, y, ny);
3105
+ break;
3106
+ default:
3107
+ if (d == lanes)
3108
+ fvec_op_ny_sve_lanes1<ElementOpIP>(dis, x, y, ny);
3109
+ else if (d == lanes * 2)
3110
+ fvec_op_ny_sve_lanes2<ElementOpIP>(dis, x, y, ny);
3111
+ else if (d == lanes * 3)
3112
+ fvec_op_ny_sve_lanes3<ElementOpIP>(dis, x, y, ny);
3113
+ else if (d == lanes * 4)
3114
+ fvec_op_ny_sve_lanes4<ElementOpIP>(dis, x, y, ny);
3115
+ else
3116
+ fvec_inner_products_ny_ref(dis, x, y, d, ny);
3117
+ break;
3118
+ }
3119
+ }
3120
+
2676
3121
  #elif defined(__aarch64__)
2677
3122
 
2678
3123
  // not optimized for ARM
@@ -2934,6 +3379,60 @@ void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c) {
2934
3379
  #endif
2935
3380
  }
2936
3381
 
3382
+ #elif defined(__ARM_FEATURE_SVE)
3383
+
3384
+ void fvec_madd(
3385
+ const size_t n,
3386
+ const float* __restrict a,
3387
+ const float bf,
3388
+ const float* __restrict b,
3389
+ float* __restrict c) {
3390
+ const size_t lanes = static_cast<size_t>(svcntw());
3391
+ const size_t lanes2 = lanes * 2;
3392
+ const size_t lanes3 = lanes * 3;
3393
+ const size_t lanes4 = lanes * 4;
3394
+ size_t i = 0;
3395
+ for (; i + lanes4 < n; i += lanes4) {
3396
+ const auto mask = svptrue_b32();
3397
+ const auto ai0 = svld1_f32(mask, a + i);
3398
+ const auto ai1 = svld1_f32(mask, a + i + lanes);
3399
+ const auto ai2 = svld1_f32(mask, a + i + lanes2);
3400
+ const auto ai3 = svld1_f32(mask, a + i + lanes3);
3401
+ const auto bi0 = svld1_f32(mask, b + i);
3402
+ const auto bi1 = svld1_f32(mask, b + i + lanes);
3403
+ const auto bi2 = svld1_f32(mask, b + i + lanes2);
3404
+ const auto bi3 = svld1_f32(mask, b + i + lanes3);
3405
+ const auto ci0 = svmla_n_f32_x(mask, ai0, bi0, bf);
3406
+ const auto ci1 = svmla_n_f32_x(mask, ai1, bi1, bf);
3407
+ const auto ci2 = svmla_n_f32_x(mask, ai2, bi2, bf);
3408
+ const auto ci3 = svmla_n_f32_x(mask, ai3, bi3, bf);
3409
+ svst1_f32(mask, c + i, ci0);
3410
+ svst1_f32(mask, c + i + lanes, ci1);
3411
+ svst1_f32(mask, c + i + lanes2, ci2);
3412
+ svst1_f32(mask, c + i + lanes3, ci3);
3413
+ }
3414
+ const auto mask0 = svwhilelt_b32_u64(i, n);
3415
+ const auto mask1 = svwhilelt_b32_u64(i + lanes, n);
3416
+ const auto mask2 = svwhilelt_b32_u64(i + lanes2, n);
3417
+ const auto mask3 = svwhilelt_b32_u64(i + lanes3, n);
3418
+ const auto ai0 = svld1_f32(mask0, a + i);
3419
+ const auto ai1 = svld1_f32(mask1, a + i + lanes);
3420
+ const auto ai2 = svld1_f32(mask2, a + i + lanes2);
3421
+ const auto ai3 = svld1_f32(mask3, a + i + lanes3);
3422
+ const auto bi0 = svld1_f32(mask0, b + i);
3423
+ const auto bi1 = svld1_f32(mask1, b + i + lanes);
3424
+ const auto bi2 = svld1_f32(mask2, b + i + lanes2);
3425
+ const auto bi3 = svld1_f32(mask3, b + i + lanes3);
3426
+ const auto ci0 = svmla_n_f32_x(mask0, ai0, bi0, bf);
3427
+ const auto ci1 = svmla_n_f32_x(mask1, ai1, bi1, bf);
3428
+ const auto ci2 = svmla_n_f32_x(mask2, ai2, bi2, bf);
3429
+ const auto ci3 = svmla_n_f32_x(mask3, ai3, bi3, bf);
3430
+ svst1_f32(mask0, c + i, ci0);
3431
+ svst1_f32(mask1, c + i + lanes, ci1);
3432
+ svst1_f32(mask2, c + i + lanes2, ci2);
3433
+ svst1_f32(mask3, c + i + lanes3, ci3);
3434
+ }
3435
+
2937
3436
  #elif defined(__aarch64__)
2938
3437
 
2939
3438
  void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c) {
@@ -3266,7 +3765,7 @@ void fvec_add(size_t d, const float* a, float b, float* c) {
3266
3765
  size_t i;
3267
3766
  simd8float32 bv(b);
3268
3767
  for (i = 0; i + 7 < d; i += 8) {
3269
- simd8float32 ci, ai, bi;
3768
+ simd8float32 ci, ai;
3270
3769
  ai.loadu(a + i);
3271
3770
  ci = ai + bv;
3272
3771
  ci.storeu(c + i);
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -15,7 +15,6 @@
15
15
 
16
16
  #include <faiss/impl/AuxIndexStructures.h>
17
17
  #include <faiss/impl/DistanceComputer.h>
18
- #include <faiss/impl/FaissAssert.h>
19
18
  #include <faiss/utils/utils.h>
20
19
 
21
20
  namespace faiss {
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -24,7 +24,6 @@
24
24
  #include <faiss/utils/hamming.h>
25
25
 
26
26
  #include <algorithm>
27
- #include <cmath>
28
27
  #include <cstdio>
29
28
  #include <memory>
30
29
  #include <vector>
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
@@ -1,5 +1,5 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  *
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.