@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281) hide show
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -1,190 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- class GET_ROW_F32 {
9
- public:
10
- __aicore__ inline GET_ROW_F32() {}
11
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
12
- int64_t *input_ne_ub, size_t *input_nb_ub,
13
- int64_t *indices_ne_ub, size_t *indices_nb_ub,
14
- int64_t *output_ne_ub, size_t *output_nb_ub) {
15
- int64_t op_block_num = GetBlockNum();
16
- op_block_idx = GetBlockIdx();
17
-
18
- for (int i = 0; i < 4; i++) {
19
- input_ne[i] = input_ne_ub[i];
20
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
21
-
22
- indices_ne[i] = indices_ne_ub[i];
23
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
24
-
25
- output_ne[i] = output_ne_ub[i];
26
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
27
- }
28
-
29
- // Indices has two dims. n_elements = all rows should get.
30
- // dr, all rows should this thread get.
31
- uint64_t n_elements =
32
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
33
- dr = n_elements / op_block_num;
34
-
35
- uint64_t tails = n_elements % op_block_num;
36
- if (op_block_idx < tails) {
37
- dr += 1;
38
- ir = dr * op_block_idx;
39
- } else {
40
- ir = dr * op_block_idx + tails;
41
- }
42
-
43
- input_gm.SetGlobalBuffer((__gm__ float *)input);
44
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
45
- output_gm.SetGlobalBuffer((__gm__ float *)output);
46
-
47
- uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
48
- local_buffer_elems = local_buffer_size / sizeof(float);
49
-
50
- // TODO, consider long row that can't put in UB.
51
- // All data should asign to 32. It's ok because all data is align to 32.
52
- pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
53
- pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
54
- }
55
-
56
- __aicore__ inline void copy_in(uint32_t offset, size_t len) {
57
- LocalTensor<float> input_local = input_queue.AllocTensor<float>();
58
- const size_t elem_per_block = 32 / sizeof(float);
59
- size_t tail = len % elem_per_block;
60
- len = len & ~(elem_per_block - 1);
61
- if(tail != 0) {
62
- len += elem_per_block;
63
- }
64
- DataCopy(input_local, input_gm[offset], len);
65
- input_queue.EnQue(input_local);
66
- }
67
-
68
- __aicore__ inline void copy_out(uint32_t offset, size_t len) {
69
- LocalTensor<float> output_local = output_queue.DeQue<float>();
70
- const size_t elem_per_block = 32 / sizeof(float);
71
- size_t tail = len % elem_per_block;
72
- len = len & ~(elem_per_block - 1);
73
- if (len > 0) {
74
- DataCopy(output_gm[offset], output_local, len);
75
- }
76
-
77
- if(tail != 0) {
78
- #ifdef ASCEND_310P
79
- for (size_t i = tail; i < elem_per_block; i++) {
80
- output_local[len + i].SetValue(0, 0);
81
- }
82
- SetAtomicAdd<float>();
83
- DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
84
- SetAtomicNone();
85
- #else
86
- DataCopyExtParams dataCopyParams;
87
- dataCopyParams.blockCount = 1;
88
- dataCopyParams.blockLen = tail * sizeof(float);
89
- DataCopyPad(output_gm[offset + len], output_local[len],
90
- dataCopyParams);
91
- #endif
92
- }
93
- output_queue.FreeTensor(output_local);
94
- }
95
-
96
- __aicore__ inline void calculate_row(int64_t idx) {
97
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
98
- const int64_t indices_ne1_idx =
99
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
100
- indices_ne[0];
101
- const int64_t indices_ne0_idx =
102
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
103
- indices_ne1_idx * indices_ne[0]);
104
-
105
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
106
- indices_ne1_idx * indices_stride[1] +
107
- indices_ne2_idx * indices_stride[2];
108
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
109
-
110
- const int64_t input_offset = selected_row_idx * input_stride[1] +
111
- indices_ne1_idx * input_stride[2] +
112
- indices_ne2_idx * input_stride[3];
113
-
114
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
115
- indices_ne1_idx * output_stride[2] +
116
- indices_ne2_idx * output_stride[3];
117
-
118
- copy_in(input_offset, input_ne[0]);
119
- LocalTensor<float> input_local = input_queue.DeQue<float>();
120
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
121
-
122
- DataCopy(output_local, input_local, local_buffer_elems);
123
- output_queue.EnQue(output_local);
124
- copy_out(output_offset, input_ne[0]);
125
-
126
- input_queue.FreeTensor(input_local);
127
- }
128
-
129
- __aicore__ inline void calculate() {
130
- for (int64_t i = ir; i < ir + dr; i++) {
131
- calculate_row(i);
132
- }
133
- }
134
-
135
- private:
136
- int64_t input_ne[4];
137
- size_t input_stride[4];
138
-
139
- int64_t indices_ne[4];
140
- size_t indices_stride[4];
141
-
142
- int64_t output_ne[4];
143
- size_t output_stride[4];
144
-
145
- size_t local_buffer_elems;
146
-
147
- int64_t ir;
148
- int64_t dr;
149
-
150
- TPipe pipe;
151
- GlobalTensor<float> input_gm;
152
- GlobalTensor<int32_t> indices_gm;
153
- GlobalTensor<float> output_gm;
154
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
155
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
156
- int64_t op_block_idx;
157
- };
158
-
159
- template <typename T>
160
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
161
- auto gm_ptr = (__gm__ uint8_t *)gm;
162
- auto ub_ptr = (uint8_t *)(ub);
163
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
164
- *ub_ptr = *gm_ptr;
165
- }
166
- }
167
-
168
- extern "C" __global__ __aicore__ void ascendc_get_row_f32(
169
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
170
- GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
171
- GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
172
- int64_t input_ne_ub[4];
173
- size_t input_nb_ub[4];
174
- int64_t indices_ne_ub[4];
175
- size_t indices_nb_ub[4];
176
- int64_t output_ne_ub[4];
177
- size_t output_nb_ub[4];
178
-
179
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
180
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
181
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
182
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
183
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
184
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
185
-
186
- GET_ROW_F32 op;
187
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
188
- indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
189
- op.calculate();
190
- }
@@ -1,204 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
- #ifdef ASCEND_310P // 310P not support 4bit get row
6
- extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
7
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
8
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
9
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
10
- // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
11
- printf("Ascend310P not support 4bit get row.\n");
12
- }
13
- #else
14
-
15
- #define BUFFER_NUM 2
16
-
17
- #define QK4_0 32
18
-
19
- class GET_ROW_Q4_0 {
20
- public:
21
- __aicore__ inline GET_ROW_Q4_0() {}
22
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
23
- int64_t *input_ne_ub, int64_t *indices_ne_ub,
24
- size_t *indices_nb_ub, int64_t *output_ne_ub,
25
- size_t *output_nb_ub) {
26
- int64_t op_block_num = GetBlockNum();
27
- int64_t op_block_idx = GetBlockIdx();
28
-
29
- for (int i = 0; i < 4; i++) {
30
- input_ne[i] = input_ne_ub[i];
31
- indices_ne[i] = indices_ne_ub[i];
32
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
33
- scale_ne[i] = input_ne_ub[i];
34
- output_ne[i] = output_ne_ub[i];
35
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
36
- }
37
-
38
- // one scale for a group.
39
- scale_ne[0] /= QK4_0;
40
-
41
- input_stride[0] = 1;
42
- scale_stride[0] = 1;
43
- output_stride[0] = 1;
44
- for (int i = 1; i < 4; i++) {
45
- input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
46
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
47
- }
48
-
49
- group_size_in_row = input_ne[0] / QK4_0;
50
- int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
51
- input_ne[3] / 2;
52
-
53
- // Indices has two dims. n_elements = all rows should get.
54
- // dr, all rows should this thread get.
55
- uint64_t n_elements =
56
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
57
- dr = n_elements / op_block_num;
58
-
59
- uint64_t tails = n_elements % op_block_num;
60
- if (op_block_idx < tails) {
61
- dr += 1;
62
- ir = dr * op_block_idx;
63
- } else {
64
- ir = dr * op_block_idx + tails;
65
- }
66
-
67
- input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
68
- scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
69
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
70
- output_gm.SetGlobalBuffer((__gm__ float *)output);
71
-
72
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
73
- pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
74
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
75
- }
76
-
77
- __aicore__ inline void copy_in(uint32_t offset) {
78
- LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
79
- // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
80
- DataCopy(input_local, input_gm[offset], QK4_0);
81
- input_queue.EnQue(input_local);
82
- }
83
-
84
- __aicore__ inline void copy_out(uint32_t offset) {
85
- LocalTensor<float> output_local = output_queue.DeQue<float>();
86
- DataCopy(output_gm[offset], output_local, QK4_0);
87
- output_queue.FreeTensor(output_local);
88
- }
89
-
90
- __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
91
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
92
- const int64_t indices_ne1_idx =
93
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
94
- indices_ne[0];
95
- const int64_t indices_ne0_idx =
96
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
97
- indices_ne1_idx * indices_ne[0]);
98
-
99
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
100
- indices_ne1_idx * indices_stride[1] +
101
- indices_ne2_idx * indices_stride[2];
102
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
103
-
104
- const int64_t input_offset = selected_row_idx * input_stride[1] +
105
- indices_ne1_idx * input_stride[2] +
106
- indices_ne2_idx * input_stride[3] +
107
- group * QK4_0;
108
- const int64_t scale_offset = selected_row_idx * scale_stride[1] +
109
- indices_ne1_idx * scale_stride[2] +
110
- indices_ne2_idx * scale_stride[3] + group;
111
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
112
- indices_ne1_idx * output_stride[2] +
113
- indices_ne2_idx * output_stride[3] +
114
- group * QK4_0;
115
-
116
- copy_in(input_offset);
117
- LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
118
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
119
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
120
-
121
- // TODO: cast more data to speed up.
122
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
123
- Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
124
-
125
- // Only mul need compile by group.
126
- half scale = scale_gm.GetValue(scale_offset);
127
-
128
- Muls(output_local, output_local, (float)scale, QK4_0);
129
-
130
- input_queue.FreeTensor(input_local);
131
- cast_queue.FreeTensor(cast_local);
132
- output_queue.EnQue(output_local);
133
-
134
- copy_out(output_offset);
135
- }
136
-
137
- __aicore__ inline void calculate() {
138
- for (int64_t i = ir; i < ir + dr; i++) {
139
- for (int64_t j = 0; j < group_size_in_row; j++) {
140
- calculate_group(i, j);
141
- }
142
- }
143
- }
144
-
145
- private:
146
- int64_t input_ne[4];
147
- size_t input_stride[4];
148
-
149
- int64_t scale_ne[4];
150
- size_t scale_stride[4];
151
-
152
- int64_t indices_ne[4];
153
- size_t indices_stride[4];
154
-
155
- int64_t output_ne[4];
156
- size_t output_stride[4];
157
-
158
- int64_t ir;
159
- int64_t dr;
160
-
161
- int64_t group_size_in_row;
162
-
163
- TPipe pipe;
164
- GlobalTensor<int4b_t> input_gm;
165
- GlobalTensor<half> scale_gm;
166
- GlobalTensor<int32_t> indices_gm;
167
- GlobalTensor<float> output_gm;
168
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
169
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
170
- TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
171
- };
172
-
173
- template <typename T>
174
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
175
- auto gm_ptr = (__gm__ uint8_t *)gm;
176
- auto ub_ptr = (uint8_t *)(ub);
177
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
178
- *ub_ptr = *gm_ptr;
179
- }
180
- }
181
-
182
- extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
183
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
184
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
185
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
186
- int64_t input_ne_ub[4];
187
- int64_t indices_ne_ub[4];
188
- size_t indices_nb_ub[4];
189
- int64_t output_ne_ub[4];
190
- size_t output_nb_ub[4];
191
-
192
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
193
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
194
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
195
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
196
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
197
-
198
- GET_ROW_Q4_0 op;
199
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
200
- indices_nb_ub, output_ne_ub, output_nb_ub);
201
- op.calculate();
202
- }
203
-
204
- #endif // #ifdef ASCEND_310P
@@ -1,191 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // TODO: optimize. Use a template to avoid duplicated code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- #define QK8_0 32
9
-
10
- class GET_ROW_Q8_0 {
11
- public:
12
- __aicore__ inline GET_ROW_Q8_0() {}
13
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
14
- int64_t *input_ne_ub, int64_t *indices_ne_ub,
15
- size_t *indices_nb_ub, int64_t *output_ne_ub,
16
- size_t *output_nb_ub) {
17
- int64_t op_block_num = GetBlockNum();
18
- int64_t op_block_idx = GetBlockIdx();
19
-
20
- for (int i = 0; i < 4; i++) {
21
- input_ne[i] = input_ne_ub[i];
22
- indices_ne[i] = indices_ne_ub[i];
23
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
24
- scale_ne[i] = input_ne_ub[i];
25
- output_ne[i] = output_ne_ub[i];
26
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
27
- }
28
-
29
- // one scale for a group.
30
- scale_ne[0] /= QK8_0;
31
-
32
- input_stride[0] = 1;
33
- scale_stride[0] = 1;
34
- output_stride[0] = 1;
35
- for (int i = 1; i < 4; i++) {
36
- input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
37
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
38
- }
39
-
40
- group_size_in_row = input_ne[0] / QK8_0;
41
- int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
42
- input_ne[3] * sizeof(int8_t);
43
-
44
- // Indices has two dims. n_elements = all rows should get.
45
- // dr, all rows should this thread get.
46
- uint64_t n_elements =
47
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
48
- dr = n_elements / op_block_num;
49
-
50
- uint64_t tails = n_elements % op_block_num;
51
- if (op_block_idx < tails) {
52
- dr += 1;
53
- ir = dr * op_block_idx;
54
- } else {
55
- ir = dr * op_block_idx + tails;
56
- }
57
-
58
- input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
59
- scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
60
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
61
- output_gm.SetGlobalBuffer((__gm__ float *)output);
62
-
63
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
64
- pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
65
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
66
- }
67
-
68
- __aicore__ inline void copy_in(uint32_t offset) {
69
- LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
70
- DataCopy(input_local, input_gm[offset], QK8_0);
71
- input_queue.EnQue(input_local);
72
- }
73
-
74
- __aicore__ inline void copy_out(uint32_t offset) {
75
- LocalTensor<float> output_local = output_queue.DeQue<float>();
76
- DataCopy(output_gm[offset], output_local, QK8_0);
77
- output_queue.FreeTensor(output_local);
78
- }
79
-
80
- __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
81
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
82
- const int64_t indices_ne1_idx =
83
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
84
- indices_ne[0];
85
- const int64_t indices_ne0_idx =
86
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
87
- indices_ne1_idx * indices_ne[0]);
88
-
89
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
90
- indices_ne1_idx * indices_stride[1] +
91
- indices_ne2_idx * indices_stride[2];
92
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
93
-
94
- const int64_t input_offset = selected_row_idx * input_stride[1] +
95
- indices_ne1_idx * input_stride[2] +
96
- indices_ne2_idx * input_stride[3] +
97
- group * QK8_0;
98
- const int64_t scale_offset = selected_row_idx * scale_stride[1] +
99
- indices_ne1_idx * scale_stride[2] +
100
- indices_ne2_idx * scale_stride[3] + group;
101
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
102
- indices_ne1_idx * output_stride[2] +
103
- indices_ne2_idx * output_stride[3] +
104
- group * QK8_0;
105
-
106
- copy_in(input_offset);
107
- LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
108
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
109
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
110
-
111
- // TODO: cast more data to speed up.
112
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
113
- Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
114
-
115
- // Only mul need compile by group.
116
- half scale = scale_gm.GetValue(scale_offset);
117
- Muls(output_local, output_local, (float)scale, QK8_0);
118
-
119
- input_queue.FreeTensor(input_local);
120
- cast_queue.FreeTensor(cast_local);
121
- output_queue.EnQue(output_local);
122
-
123
- copy_out(output_offset);
124
- }
125
-
126
- __aicore__ inline void calculate() {
127
- for (int64_t i = ir; i < ir + dr; i++) {
128
- for (int64_t j = 0; j < group_size_in_row; j++) {
129
- calculate_group(i, j);
130
- }
131
- }
132
- }
133
-
134
- private:
135
- int64_t input_ne[4];
136
- size_t input_stride[4];
137
-
138
- int64_t scale_ne[4];
139
- size_t scale_stride[4];
140
-
141
- int64_t indices_ne[4];
142
- size_t indices_stride[4];
143
-
144
- int64_t output_ne[4];
145
- size_t output_stride[4];
146
-
147
- int64_t ir;
148
- int64_t dr;
149
-
150
- int64_t group_size_in_row;
151
-
152
- TPipe pipe;
153
- GlobalTensor<int8_t> input_gm;
154
- GlobalTensor<half> scale_gm;
155
- GlobalTensor<int32_t> indices_gm;
156
- GlobalTensor<float> output_gm;
157
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
158
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
159
- TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
160
- };
161
-
162
- template <typename T>
163
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
164
- auto gm_ptr = (__gm__ uint8_t *)gm;
165
- auto ub_ptr = (uint8_t *)(ub);
166
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
167
- *ub_ptr = *gm_ptr;
168
- }
169
- }
170
-
171
- extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
172
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
173
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
174
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
175
- int64_t input_ne_ub[4];
176
- int64_t indices_ne_ub[4];
177
- size_t indices_nb_ub[4];
178
- int64_t output_ne_ub[4];
179
- size_t output_nb_ub[4];
180
-
181
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
182
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
183
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
184
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
185
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
186
-
187
- GET_ROW_Q8_0 op;
188
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
189
- indices_nb_ub, output_ne_ub, output_nb_ub);
190
- op.calculate();
191
- }