@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/examples/rpc/rpc-server.cpp
@@ -1,171 +0,0 @@
- #include "ggml-cpu.h"
-
- #ifdef GGML_USE_CUDA
- #include "ggml-cuda.h"
- #endif
-
- #ifdef GGML_USE_METAL
- #include "ggml-metal.h"
- #endif
-
- #ifdef GGML_USE_VULKAN
- #include "ggml-vulkan.h"
- #endif
-
- #ifdef GGML_USE_SYCL
- #include "ggml-sycl.h"
- #endif
-
- #include "ggml-rpc.h"
- #ifdef _WIN32
- #  include <windows.h>
- #else
- #  include <unistd.h>
- #endif
- #include <string>
- #include <stdio.h>
-
- struct rpc_server_params {
-     std::string host = "127.0.0.1";
-     int port = 50052;
-     size_t backend_mem = 0;
- };
-
- static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
-     fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
-     fprintf(stderr, "options:\n");
-     fprintf(stderr, "  -h, --help            show this help message and exit\n");
-     fprintf(stderr, "  -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
-     fprintf(stderr, "  -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
-     fprintf(stderr, "  -m MEM, --mem MEM     backend memory size (in MB)\n");
-     fprintf(stderr, "\n");
- }
-
- static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
-     std::string arg;
-     for (int i = 1; i < argc; i++) {
-         arg = argv[i];
-         if (arg == "-H" || arg == "--host") {
-             if (++i >= argc) {
-                 return false;
-             }
-             params.host = argv[i];
-         } else if (arg == "-p" || arg == "--port") {
-             if (++i >= argc) {
-                 return false;
-             }
-             params.port = std::stoi(argv[i]);
-             if (params.port <= 0 || params.port > 65535) {
-                 return false;
-             }
-         } else if (arg == "-m" || arg == "--mem") {
-             if (++i >= argc) {
-                 return false;
-             }
-             params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
-         } else if (arg == "-h" || arg == "--help") {
-             print_usage(argc, argv, params);
-             exit(0);
-         } else {
-             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-             print_usage(argc, argv, params);
-             exit(0);
-         }
-     }
-     return true;
- }
-
- static ggml_backend_t create_backend() {
-     ggml_backend_t backend = NULL;
- #ifdef GGML_USE_CUDA
-     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-     backend = ggml_backend_cuda_init(0); // init device 0
-     if (!backend) {
-         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-     }
- #elif GGML_USE_METAL
-     fprintf(stderr, "%s: using Metal backend\n", __func__);
-     backend = ggml_backend_metal_init();
-     if (!backend) {
-         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-     }
- #elif GGML_USE_VULKAN
-     fprintf(stderr, "%s: using Vulkan backend\n", __func__);
-     backend = ggml_backend_vk_init(0); // init device 0
-     if (!backend) {
-         fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
-     }
- #elif GGML_USE_SYCL
-     fprintf(stderr, "%s: using SYCL backend\n", __func__);
-     backend = ggml_backend_sycl_init(0); // init device 0
-     if (!backend) {
-         fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
-     }
- #endif
-
-     // if there are no GPU backends, fall back to the CPU backend
-     if (!backend) {
-         fprintf(stderr, "%s: using CPU backend\n", __func__);
-         backend = ggml_backend_cpu_init();
-     }
-     return backend;
- }
-
- static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
- #ifdef GGML_USE_CUDA
-     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
- #elif GGML_USE_VULKAN
-     ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
- #elif GGML_USE_SYCL
-     ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
- #else
-     #ifdef _WIN32
-         MEMORYSTATUSEX status;
-         status.dwLength = sizeof(status);
-         GlobalMemoryStatusEx(&status);
-         *total_mem = status.ullTotalPhys;
-         *free_mem = status.ullAvailPhys;
-     #else
-         long pages = sysconf(_SC_PHYS_PAGES);
-         long page_size = sysconf(_SC_PAGE_SIZE);
-         *total_mem = pages * page_size;
-         *free_mem = *total_mem;
-     #endif
- #endif
- }
-
- int main(int argc, char * argv[]) {
-     rpc_server_params params;
-     if (!rpc_server_params_parse(argc, argv, params)) {
-         fprintf(stderr, "Invalid parameters\n");
-         return 1;
-     }
-
-     if (params.host != "127.0.0.1") {
-         fprintf(stderr, "\n");
-         fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
-         fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
-         fprintf(stderr, "         Never expose the RPC server to an open network!\n");
-         fprintf(stderr, "         This is an experimental feature and is not secure!\n");
-         fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
-         fprintf(stderr, "\n");
-     }
-
-     ggml_backend_t backend = create_backend();
-     if (!backend) {
-         fprintf(stderr, "Failed to create backend\n");
-         return 1;
-     }
-     std::string endpoint = params.host + ":" + std::to_string(params.port);
-     size_t free_mem, total_mem;
-     if (params.backend_mem > 0) {
-         free_mem = params.backend_mem;
-         total_mem = params.backend_mem;
-     } else {
-         get_backend_memory(&free_mem, &total_mem);
-     }
-     printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-     ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
-     ggml_backend_free(backend);
-     return 0;
- }
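Note that, per the file list above, the standalone RPC server was not dropped but relocated and expanded as tools/rpc/rpc-server.cpp (+322 lines, entry 222). For orientation, below is a minimal client-side sketch, assuming the ggml_backend_rpc_init() and ggml_backend_rpc_get_device_memory() declarations from ggml-rpc.h; the endpoint value is illustrative and matches the default host:port built by the server above.

// Hedged sketch, not part of this diff: attaching a host process to a
// running rpc-server instance. Assumes the ggml-rpc.h client API.
#include "ggml-rpc.h"

#include <cstdio>
#include <string>

int main() {
    // default host/port of the server shown above (illustrative endpoint)
    const std::string endpoint = "127.0.0.1:50052";

    // query how much memory the remote backend advertises
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free_mem, &total_mem);
    printf("remote memory: %zu/%zu MB free\n",
           free_mem / (1024 * 1024), total_mem / (1024 * 1024));

    // the returned handle is a regular ggml_backend_t and can be used
    // anywhere a local backend is accepted
    ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
    if (!backend) {
        fprintf(stderr, "failed to connect to %s\n", endpoint.c_str());
        return 1;
    }

    ggml_backend_free(backend);
    return 0;
}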
package/src/llama.cpp/examples/run/CMakeLists.txt
@@ -1,5 +0,0 @@
- set(TARGET llama-run)
- add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
- install(TARGETS ${TARGET} RUNTIME)
- target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -1,30 +0,0 @@
- file(GLOB SRC_FILES
-     get_row_f32.cpp
-     get_row_f16.cpp
-     get_row_q4_0.cpp
-     get_row_q8_0.cpp
-     quantize_f32_q8_0.cpp
-     quantize_f16_q8_0.cpp
-     quantize_float_to_q4_0.cpp
-     dup.cpp
- )
-
- set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
- set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
-
- if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
-     set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
- elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
-     set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
- else()
-     message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
- endif()
- include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
-
- ascendc_library(ascendc_kernels STATIC
-     ${SRC_FILES}
- )
-
- message(STATUS "CANN: compile ascend kernels with SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
- ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
- # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@@ -1,19 +0,0 @@
- #ifndef ASCENDC_KERNELS_H
- #define ASCENDC_KERNELS_H
-
- #include "aclrtlaunch_ascendc_get_row_f32.h"
- #include "aclrtlaunch_ascendc_get_row_f16.h"
- #include "aclrtlaunch_ascendc_get_row_q8_0.h"
- #include "aclrtlaunch_ascendc_get_row_q4_0.h"
-
- #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
- #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
- #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
- #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
-
- #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
- #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
- #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
- #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
-
- #endif // ASCENDC_KERNELS_H
package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp
@@ -1,234 +0,0 @@
- #include "kernel_operator.h"
-
- using namespace AscendC;
-
- #define BUFFER_NUM 2
- const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supported by the dup kernel is 65535
-
- template <typename SRC_T, typename DST_T>
- class DupByRows {
-   public:
-     __aicore__ inline DupByRows() {}
-     __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
-                                 size_t *input_nb_ub) {
-         /* Dup by rows when src is contiguous on the first dimension and dst
-            is contiguous; each kernel processes one row.
-         */
-
-         // Input has four dims.
-         int64_t op_block_num = GetBlockNum();
-         int64_t op_block_idx = GetBlockIdx();
-
-         // param
-         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
-         num_elem = input_ne_ub[0];
-
-         // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
-         idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
-         idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
-                   / (input_ne_ub[1]);
-         idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
-                   - idx_ne2 * input_ne_ub[1];
-
-         // src may not be contiguous in dims [1,2,3], so the stride is decided by ne & nb
-         src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
-                      + input_nb_ub[1] * idx_ne1;
-
-         // dst is contiguous
-         dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
-
-         src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
-                                                                 src_stride));
-         dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
-                                                                 dst_stride));
-
-         pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
-                                                 32 - 1) / 32 * 32);
-         pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
-                                                 32 - 1) / 32 * 32);
-     }
-
-     __aicore__ inline void copy_in() {
-         LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-         const size_t elem_per_block = 32 / sizeof(SRC_T);
-         size_t tail = num_elem % elem_per_block;
-         size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
-         DataCopy(src_local, src_gm, cpy_elements_len);
-         src_queue.EnQue(src_local);
-     }
-
-     __aicore__ inline void copy_out() {
-         LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
- #ifdef ASCEND_310P
-         const size_t elem_per_block = 32 / sizeof(DST_T);
-         size_t tail = num_elem % elem_per_block;
-         size_t len = num_elem & ~(elem_per_block - 1);
-         if (len > 0) {
-             DataCopy(dst_gm, dst_local, len);
-         }
-         if (tail != 0) {
-             for (size_t i = tail; i < elem_per_block; i++) {
-                 dst_local[len + i].SetValue(0, 0);
-             }
-             SetAtomicAdd<float>();
-             DataCopy(dst_gm[len], dst_local[len], elem_per_block);
-             SetAtomicNone();
-         }
- #else
-         DataCopyExtParams dataCopyParams;
-         dataCopyParams.blockCount = 1;
-         dataCopyParams.blockLen = num_elem * sizeof(DST_T);
-         DataCopyPad(dst_gm, dst_local, dataCopyParams);
- #endif
-         dst_queue.FreeTensor(dst_local);
-     }
-
-     __aicore__ inline void dup() {
-         // main process: copy one row of data from src to dst.
-         copy_in();
-
-         LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
-         LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
-         int32_t BLOCK_NUM = 32 / sizeof(DST_T);
-         DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
-                                        / BLOCK_NUM * BLOCK_NUM);
-         dst_queue.EnQue<DST_T>(dst_local);
-
-         src_queue.FreeTensor(src_local);
-         copy_out();
-     }
-
-     __aicore__ inline void dup_with_cast() {
-         // main process: copy one row of data from src to dst,
-         // casting the dtype from src to dst.
-         copy_in();
-
-         LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
-         LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
-         Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
-         dst_queue.EnQue<DST_T>(dst_local);
-
-         src_queue.FreeTensor(src_local);
-         copy_out();
-     }
-
-   private:
-
-     TPipe pipe;
-     GlobalTensor<SRC_T> src_gm;
-     GlobalTensor<DST_T> dst_gm;
-
-     int64_t num_rows;
-     int64_t num_elem;
-     int64_t idx_ne3;
-     int64_t idx_ne2;
-     int64_t idx_ne1;
-     int64_t src_stride;
-     int64_t dst_stride;
-
-     TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
-     TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
- };
-
- template <typename T>
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-     auto gm_ptr = (__gm__ uint8_t *)gm;
-     auto ub_ptr = (uint8_t *)(ub);
-     for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-         *ub_ptr = *gm_ptr;
-     }
- }
-
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
-         GM_ADDR src_gm,
-         GM_ADDR dst_gm,
-         GM_ADDR input_ne_gm,
-         GM_ADDR input_nb_gm,
-         GM_ADDR output_ne_gm,
-         GM_ADDR output_nb_gm) {
-
-     int64_t input_ne_ub[4];
-     size_t input_nb_ub[4];
-     int64_t output_ne_ub[4];
-     size_t output_nb_ub[4];
-
-     copy_to_ub(input_ne_gm, input_ne_ub, 32);
-     copy_to_ub(input_nb_gm, input_nb_ub, 32);
-     copy_to_ub(output_ne_gm, output_ne_ub, 32);
-     copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-     DupByRows<half, half> op;
-     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-     op.dup();
- }
-
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
-         GM_ADDR src_gm,
-         GM_ADDR dst_gm,
-         GM_ADDR input_ne_gm,
-         GM_ADDR input_nb_gm,
-         GM_ADDR output_ne_gm,
-         GM_ADDR output_nb_gm) {
-     int64_t input_ne_ub[4];
-     size_t input_nb_ub[4];
-     int64_t output_ne_ub[4];
-     size_t output_nb_ub[4];
-
-     copy_to_ub(input_ne_gm, input_ne_ub, 32);
-     copy_to_ub(input_nb_gm, input_nb_ub, 32);
-     copy_to_ub(output_ne_gm, output_ne_ub, 32);
-     copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-     DupByRows<float, float> op;
-     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-     op.dup();
- }
-
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
-         GM_ADDR src_gm,
-         GM_ADDR dst_gm,
-         GM_ADDR input_ne_gm,
-         GM_ADDR input_nb_gm,
-         GM_ADDR output_ne_gm,
-         GM_ADDR output_nb_gm) {
-
-     int64_t input_ne_ub[4];
-     size_t input_nb_ub[4];
-     int64_t output_ne_ub[4];
-     size_t output_nb_ub[4];
-
-     copy_to_ub(input_ne_gm, input_ne_ub, 32);
-     copy_to_ub(input_nb_gm, input_nb_ub, 32);
-     copy_to_ub(output_ne_gm, output_ne_ub, 32);
-     copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-     DupByRows<float, half> op;
-     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-     op.dup_with_cast();
- }
-
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
-         GM_ADDR src_gm,
-         GM_ADDR dst_gm,
-         GM_ADDR input_ne_gm,
-         GM_ADDR input_nb_gm,
-         GM_ADDR output_ne_gm,
-         GM_ADDR output_nb_gm) {
-
-     // copy params from gm to ub.
-     int64_t input_ne_ub[4];
-     size_t input_nb_ub[4];
-     int64_t output_ne_ub[4];
-     size_t output_nb_ub[4];
-
-     copy_to_ub(input_ne_gm, input_ne_ub, 32);
-     copy_to_ub(input_nb_gm, input_nb_ub, 32);
-     copy_to_ub(output_ne_gm, output_ne_ub, 32);
-     copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-     DupByRows<half, float> op;
-     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-     op.dup_with_cast();
- }
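For readers following the kernel above: the core of DupByRows::init is the decomposition of a flat block index into (idx_ne1, idx_ne2, idx_ne3) plus a byte offset derived from the nb strides. Below is a host-side C++ reference of that same index math, using a hypothetical shape; the names mirror the kernel, and the assert only documents that for a fully contiguous src the computed source and destination offsets coincide.

// Hedged host-side sketch of the DupByRows index math (illustrative only).
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne[4] = {8, 3, 4, 2};      // elements per dim (hypothetical shape)
    const size_t  nb[4] = {4, 32, 96, 384};  // byte strides per dim (contiguous float tensor)

    const int64_t num_rows = ne[1] * ne[2] * ne[3]; // one kernel block per row

    for (int64_t block_idx = 0; block_idx < num_rows; ++block_idx) {
        // same decomposition as in DupByRows::init
        const int64_t idx_ne3 = block_idx / (ne[1] * ne[2]);
        const int64_t idx_ne2 = (block_idx - idx_ne3 * ne[1] * ne[2]) / ne[1];
        const int64_t idx_ne1 = block_idx - idx_ne3 * ne[1] * ne[2] - idx_ne2 * ne[1];

        // src may be non-contiguous in dims [1,2,3]: offset comes from nb
        const size_t src_offset = nb[3] * idx_ne3 + nb[2] * idx_ne2 + nb[1] * idx_ne1;

        // dst is contiguous: row index times row size in bytes
        const size_t dst_offset = block_idx * (ne[0] * sizeof(float));

        assert(src_offset == dst_offset); // holds only because src here is fully contiguous
        (void)src_offset; (void)dst_offset;
    }
    printf("decomposed %lld rows\n", (long long)num_rows);
    return 0;
}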
package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@@ -1,197 +0,0 @@
- #include "kernel_operator.h"
-
- // optimize me. Use a template to avoid duplicated code.
- using namespace AscendC;
-
- #define BUFFER_NUM 2
-
- class GET_ROW_F16 {
-   public:
-     __aicore__ inline GET_ROW_F16() {}
-     __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
-                                 int64_t *input_ne_ub, size_t *input_nb_ub,
-                                 int64_t *indices_ne_ub, size_t *indices_nb_ub,
-                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
-         // TODO: use a template for F16/F32
-         int64_t op_block_num = GetBlockNum();
-         op_block_idx = GetBlockIdx();
-
-         for (int i = 0; i < 4; i++) {
-             input_ne[i] = input_ne_ub[i];
-             input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
-             indices_ne[i] = indices_ne_ub[i];
-             indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-
-             output_ne[i] = output_ne_ub[i];
-             output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
-         }
-
-         // Indices has two dims. n_elements = total number of rows to get.
-         // dr = number of rows this thread should get.
-         uint64_t n_elements =
-             indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
-         dr = n_elements / op_block_num;
-
-         uint64_t tails = n_elements % op_block_num;
-         if (op_block_idx < tails) {
-             dr += 1;
-             ir = dr * op_block_idx;
-         } else {
-             ir = dr * op_block_idx + tails;
-         }
-
-         input_gm.SetGlobalBuffer((__gm__ half *)input);
-         indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
-         output_gm.SetGlobalBuffer((__gm__ float *)output);
-
-         uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
-                                             & ~31);
-         uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
-                                              & ~31);
-
-         local_buffer_elems = input_local_buffer_size / sizeof(half);
-
-         // TODO: consider long rows that can't fit in the UB.
-         // All data should be aligned to 32; it's OK because all data is aligned to 32.
-         pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
-         pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
-     }
-
-     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
-         size_t origin_len = len;
-         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-         const size_t elem_per_block = 32 / sizeof(half);
-         size_t tail = len % elem_per_block;
-         len = len & ~(elem_per_block - 1);
-         if (tail != 0) {
-             len += elem_per_block;
-         }
-         DataCopy(input_local, input_gm[offset], len);
-         input_queue.EnQue(input_local);
-     }
-
-     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
-         LocalTensor<float> output_local = output_queue.DeQue<float>();
-         const size_t elem_per_block = 32 / sizeof(float);
-         size_t tail = len % elem_per_block;
-         len = len & ~(elem_per_block - 1);
-         if (len > 0) {
-             DataCopy(output_gm[offset], output_local, len);
-         }
-
-         if (tail != 0) {
- #ifdef ASCEND_310P
-             for (size_t i = tail; i < elem_per_block; i++) {
-                 output_local[len + i].SetValue(0, 0);
-             }
-             SetAtomicAdd<float>();
-             DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
-             SetAtomicNone();
- #else
-             DataCopyExtParams dataCopyParams;
-             dataCopyParams.blockCount = 1;
-             dataCopyParams.blockLen = tail * sizeof(float);
-             DataCopyPad(output_gm[offset + len], output_local[len],
-                         dataCopyParams);
- #endif
-         }
-         output_queue.FreeTensor(output_local);
-     }
-
-     __aicore__ inline void calculate_row(int64_t idx) {
-         const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
-         const int64_t indices_ne1_idx =
-             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
-             indices_ne[0];
-         const int64_t indices_ne0_idx =
-             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
-              indices_ne1_idx * indices_ne[0]);
-
-         const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
-                                        indices_ne1_idx * indices_stride[1] +
-                                        indices_ne2_idx * indices_stride[2];
-         const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
-         const int64_t input_offset = selected_row_idx * input_stride[1] +
-                                      indices_ne1_idx * input_stride[2] +
-                                      indices_ne2_idx * input_stride[3];
-
-         const int64_t output_offset = indices_ne0_idx * output_stride[1] +
-                                       indices_ne1_idx * output_stride[2] +
-                                       indices_ne2_idx * output_stride[3];
-
-         copy_in(input_offset, input_ne[0]);
-         LocalTensor<half> input_local = input_queue.DeQue<half>();
-         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
-         Cast(output_local, input_local, RoundMode::CAST_NONE,
-              local_buffer_elems);
-         output_queue.EnQue(output_local);
-         copy_out(output_offset, input_ne[0]);
-
-         input_queue.FreeTensor(input_local);
-     }
-
-     __aicore__ inline void calculate() {
-         for (int64_t i = ir; i < ir + dr; i++) {
-             calculate_row(i);
-         }
-     }
-
-   private:
-     int64_t input_ne[4];
-     size_t input_stride[4];
-
-     int64_t indices_ne[4];
-     size_t indices_stride[4];
-
-     int64_t output_ne[4];
-     size_t output_stride[4];
-
-     size_t local_buffer_elems;
-
-     int64_t ir;
-     int64_t dr;
-
-     TPipe pipe;
-     GlobalTensor<half> input_gm;
-     GlobalTensor<int32_t> indices_gm;
-     GlobalTensor<float> output_gm;
-     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-     int64_t op_block_idx;
- };
-
- template <typename T>
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-     auto gm_ptr = (__gm__ uint8_t *)gm;
-     auto ub_ptr = (uint8_t *)(ub);
-     for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-         *ub_ptr = *gm_ptr;
-     }
- }
-
- extern "C" __global__ __aicore__ void ascendc_get_row_f16(
-     GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-     GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
-     GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-     int64_t input_ne_ub[4];
-     size_t input_nb_ub[4];
-     int64_t indices_ne_ub[4];
-     size_t indices_nb_ub[4];
-     int64_t output_ne_ub[4];
-     size_t output_nb_ub[4];
-
-     copy_to_ub(input_ne_gm, input_ne_ub, 32);
-     copy_to_ub(input_nb_gm, input_nb_ub, 32);
-     copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
-     copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
-     copy_to_ub(output_ne_gm, output_ne_ub, 32);
-     copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-     GET_ROW_F16 op;
-     op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
-             indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
-     op.calculate();
- }
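The ir/dr computation in GET_ROW_F16::init above is a standard tail-aware work partition: the first n_elements % op_block_num blocks take one extra row each, and every block ends up with the contiguous range [ir, ir + dr). A host-side C++ reference with hypothetical sizes:

// Hedged sketch of the row-partition math (illustrative sizes, not from the diff).
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t n_elements = 10; // total rows to gather (hypothetical)
    const uint64_t n_blocks   = 4;  // number of kernel blocks (hypothetical)

    for (uint64_t block_idx = 0; block_idx < n_blocks; ++block_idx) {
        uint64_t dr    = n_elements / n_blocks; // base rows per block
        uint64_t tails = n_elements % n_blocks; // leftover rows
        uint64_t ir;                            // first row of this block
        if (block_idx < tails) {
            dr += 1;                 // early blocks absorb one extra row
            ir  = dr * block_idx;
        } else {
            ir  = dr * block_idx + tails; // shift past the enlarged blocks
        }
        printf("block %llu: rows [%llu, %llu)\n",
               (unsigned long long)block_idx,
               (unsigned long long)ir,
               (unsigned long long)(ir + dr));
    }
    return 0;
}

With n_elements = 10 and n_blocks = 4 this prints the ranges [0,3), [3,6), [6,8), [8,10): every row is assigned exactly once, which is the invariant the kernel relies on.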