@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -1,218 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
- #ifdef ASCEND_310P
5
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
6
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
- // So that subsequent test cases can continue to run, just print an error message here. Of course, any test case that calls this operator will fail.
9
- printf("Ascend310P not support f16->8bit quantization.\n");
10
- }
11
- #else
12
-
13
- #define BUFFER_NUM 2
14
- #define QK8_0 32
15
-
16
- class QUANTIZE_F16_Q8_0 {
17
- public:
18
- __aicore__ inline QUANTIZE_F16_Q8_0() {}
19
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
20
- int64_t *input_ne_ub, size_t *input_nb_ub,
21
- int64_t *output_ne_ub) {
22
- int64_t op_block_num = GetBlockNum();
23
- int64_t op_block_idx = GetBlockIdx();
24
-
25
- for (int i = 0; i < 4; i++) {
26
- input_ne[i] = input_ne_ub[i];
27
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
28
-
29
- output_ne[i] = output_ne_ub[i];
30
- }
31
-
32
- output_stride[0] = 1;
33
- for (int i = 1; i < 4; i++) {
34
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
35
- }
36
-
37
- scale_ne = input_ne;
38
- scale_stride[0] = 1;
39
- scale_stride[1] = input_ne[0] / QK8_0;
40
- for (int i = 2; i < 4; i++) {
41
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
42
- }
43
-
44
- // split input tensor by rows.
45
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
46
- dr = nr / op_block_num;
47
-
48
- uint64_t tails = nr % op_block_num;
49
- if (op_block_idx < tails) {
50
- dr += 1;
51
- ir = dr * op_block_idx;
52
- } else {
53
- ir = dr * op_block_idx + tails;
54
- }
55
-
56
- group_size_in_row = scale_stride[1];
57
- int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
58
- output_ne[3] * sizeof(uint8_t);
59
-
60
- input_gm.SetGlobalBuffer((__gm__ half *)input);
61
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
62
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
63
- group_size_in_row *
64
- sizeof(half)));
65
-
66
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
67
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
68
- pipe.InitBuffer(work_queue, 1, 32);
69
- pipe.InitBuffer(max_queue, 1, 32);
70
- pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
71
- pipe.InitBuffer(scale_queue, 1, 32);
72
- pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float));
73
- }
74
-
75
- __aicore__ inline void copy_in(uint32_t offset) {
76
- LocalTensor<half> input_local = input_queue.AllocTensor<half>();
77
- DataCopy(input_local, input_gm[offset], QK8_0);
78
- input_queue.EnQue(input_local);
79
- }
80
-
81
- __aicore__ inline void copy_out(uint32_t offset) {
82
- LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
83
- DataCopy(output_gm[offset], output_local, QK8_0);
84
- output_queue.FreeTensor(output_local);
85
- }
86
-
87
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
88
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
89
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
90
- const int64_t i1 =
91
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
92
-
93
- const int64_t input_offset = i1 * input_stride[1] +
94
- i2 * input_stride[2] +
95
- i3 * input_stride[3] + QK8_0 * group;
96
-
97
- const int64_t output_offset = i1 * output_stride[1] +
98
- i2 * output_stride[2] +
99
- i3 * output_stride[3] + QK8_0 * group;
100
-
101
- copy_in(input_offset);
102
- LocalTensor<half> input_local = input_queue.DeQue<half>();
103
- LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
104
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
105
- LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
106
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
107
- LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
108
-
109
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
110
- Abs(abs_local, cast_local, QK8_0);
111
- ReduceMax(max_local, abs_local, work_local, QK8_0);
112
-
113
- pipe_barrier(PIPE_ALL);
114
- float d = max_local.GetValue(0);
115
- d = d / ((1 << 7) - 1);
116
- if (d != 0) {
117
- Muls(cast_local, cast_local, 1.0f / d, QK8_0);
118
- }
119
-
120
- Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
121
- Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
122
- Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
123
- output_queue.EnQue(output_local);
124
- copy_out(output_offset);
125
-
126
- input_queue.FreeTensor(input_local);
127
- work_queue.FreeTensor(work_local);
128
- abs_queue.FreeTensor(abs_local);
129
- max_queue.FreeTensor(max_local);
130
- cast_queue.FreeTensor(cast_local);
131
- return (half)d;
132
- }
133
-
134
- __aicore__ inline void calculate() {
135
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
136
- uint32_t scale_local_offset = 0;
137
- uint32_t scale_global_offset = 0;
138
- for (int64_t i = ir; i < ir + dr; i++) {
139
- for (int64_t j = 0; j < group_size_in_row; j++) {
140
- half scale = calculate_group(i, j);
141
- scale_local.SetValue(scale_local_offset++, scale);
142
- if (scale_local_offset == 16) {
143
- scale_local_offset = 0;
144
- // TODO: OPTIMIZE ME
145
- pipe_barrier(PIPE_ALL);
146
- DataCopy(scale_gm[scale_global_offset], scale_local, 16);
147
- pipe_barrier(PIPE_ALL);
148
- scale_global_offset += 16;
149
- }
150
- }
151
- }
152
-
153
- if (scale_local_offset != 0) {
154
- pipe_barrier(PIPE_ALL);
155
- DataCopyExtParams dataCopyParams;
156
- dataCopyParams.blockCount = 1;
157
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
158
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
159
- dataCopyParams);
160
- pipe_barrier(PIPE_ALL);
161
- }
162
- }
163
-
164
- private:
165
- int64_t input_ne[4];
166
- size_t input_stride[4];
167
-
168
- int64_t *scale_ne;
169
- size_t scale_stride[4];
170
-
171
- int64_t output_ne[4];
172
- size_t output_stride[4];
173
-
174
- int64_t group_size_in_row;
175
-
176
- int64_t ir;
177
- int64_t dr;
178
-
179
- TPipe pipe;
180
- GlobalTensor<half> input_gm;
181
- GlobalTensor<half> scale_gm;
182
- GlobalTensor<int8_t> output_gm;
183
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
184
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
185
- TQue<QuePosition::VECIN, 1> work_queue;
186
- TQue<QuePosition::VECOUT, 1> max_queue;
187
- TQue<QuePosition::VECIN, 1> abs_queue;
188
- TQue<QuePosition::VECOUT, 1> scale_queue;
189
- TQue<QuePosition::VECOUT, 1> cast_queue;
190
-
191
- };
192
-
193
- template <typename T>
194
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
195
- auto gm_ptr = (__gm__ uint8_t *)gm;
196
- auto ub_ptr = (uint8_t *)(ub);
197
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
198
- *ub_ptr = *gm_ptr;
199
- }
200
- }
201
-
202
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
203
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
204
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
205
- int64_t input_ne_ub[4];
206
- size_t input_nb_ub[4];
207
- int64_t output_ne_ub[4];
208
-
209
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
210
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
211
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
212
-
213
- QUANTIZE_F16_Q8_0 op;
214
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
215
- op.calculate();
216
- }
217
-
218
- #endif // #ifdef ASCEND_310P
@@ -1,216 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
- #ifdef ASCEND_310P // 310P not support f32->8bit quantization
5
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
6
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
- // So that subsequent test cases can continue to run, just print an error message here. Of course, any test case that calls this operator will fail.
9
- printf("Ascend310P not support f32->8bit quantization.\n");
10
- }
11
- #else
12
-
13
- #define BUFFER_NUM 2
14
- #define QK8_0 32
15
-
16
- class QUANTIZE_F32_Q8_0 {
17
- public:
18
- __aicore__ inline QUANTIZE_F32_Q8_0() {}
19
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
20
- int64_t *input_ne_ub, size_t *input_nb_ub,
21
- int64_t *output_ne_ub) {
22
- int64_t op_block_num = GetBlockNum();
23
- int64_t op_block_idx = GetBlockIdx();
24
-
25
- for (int i = 0; i < 4; i++) {
26
- input_ne[i] = input_ne_ub[i];
27
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
28
-
29
- output_ne[i] = output_ne_ub[i];
30
- }
31
-
32
- output_stride[0] = 1;
33
- for (int i = 1; i < 4; i++) {
34
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
35
- }
36
-
37
- scale_ne = input_ne;
38
- scale_stride[0] = 1;
39
- scale_stride[1] = input_ne[0] / QK8_0;
40
- for (int i = 2; i < 4; i++) {
41
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
42
- }
43
-
44
- // split input tensor by rows.
45
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
46
- dr = nr / op_block_num;
47
-
48
- uint64_t tails = nr % op_block_num;
49
- if (op_block_idx < tails) {
50
- dr += 1;
51
- ir = dr * op_block_idx;
52
- } else {
53
- ir = dr * op_block_idx + tails;
54
- }
55
-
56
- group_size_in_row = scale_stride[1];
57
- int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
58
- output_ne[3] * sizeof(uint8_t);
59
-
60
- input_gm.SetGlobalBuffer((__gm__ float *)input);
61
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
62
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
63
- ir * group_size_in_row *
64
- sizeof(half)));
65
-
66
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
67
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
68
- pipe.InitBuffer(work_queue, 1, 32);
69
- pipe.InitBuffer(max_queue, 1, 32);
70
- pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
71
- pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
72
- pipe.InitBuffer(scale_queue, 1, 32);
73
- }
74
-
75
- __aicore__ inline void copy_in(uint32_t offset) {
76
- LocalTensor<float> input_local = input_queue.AllocTensor<float>();
77
- DataCopy(input_local, input_gm[offset], QK8_0);
78
- input_queue.EnQue(input_local);
79
- }
80
-
81
- __aicore__ inline void copy_out(uint32_t offset) {
82
- LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
83
- DataCopy(output_gm[offset], output_local, QK8_0);
84
- output_queue.FreeTensor(output_local);
85
- }
86
-
87
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
88
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
89
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
90
- const int64_t i1 =
91
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
92
-
93
- const int64_t input_offset = i1 * input_stride[1] +
94
- i2 * input_stride[2] +
95
- i3 * input_stride[3] + QK8_0 * group;
96
-
97
- const int64_t output_offset = i1 * output_stride[1] +
98
- i2 * output_stride[2] +
99
- i3 * output_stride[3] + QK8_0 * group;
100
-
101
- copy_in(input_offset);
102
- LocalTensor<float> input_local = input_queue.DeQue<float>();
103
- LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
104
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
105
- LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
106
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
107
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
108
-
109
- Abs(abs_local, input_local, QK8_0);
110
- ReduceMax(max_local, abs_local, work_local, QK8_0);
111
- pipe_barrier(PIPE_ALL);
112
- float d = max_local.GetValue(0);
113
- d = d / ((1 << 7) - 1);
114
- if (d != 0) {
115
- Muls(input_local, input_local, 1.0f / d, QK8_0);
116
- }
117
-
118
- Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
119
- Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
120
- Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
121
- output_queue.EnQue(output_local);
122
- copy_out(output_offset);
123
-
124
- input_queue.FreeTensor(input_local);
125
- work_queue.FreeTensor(work_local);
126
- abs_queue.FreeTensor(abs_local);
127
- max_queue.FreeTensor(max_local);
128
- cast_queue.FreeTensor(cast_local);
129
-
130
- return (half)d;
131
- }
132
-
133
- __aicore__ inline void calculate() {
134
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
135
- uint32_t scale_local_offset = 0;
136
- uint32_t scale_global_offset = 0;
137
- for (int64_t i = ir; i < ir + dr; i++) {
138
- for (int64_t j = 0; j < group_size_in_row; j++) {
139
- half scale = calculate_group(i, j);
140
- scale_local.SetValue(scale_local_offset++, scale);
141
- if (scale_local_offset == 16) {
142
- scale_local_offset = 0;
143
- // TODO: OPTIMIZE ME
144
- pipe_barrier(PIPE_ALL);
145
- DataCopy(scale_gm[scale_global_offset], scale_local, 16);
146
- pipe_barrier(PIPE_ALL);
147
- scale_global_offset += 16;
148
- }
149
- }
150
- }
151
-
152
- if (scale_local_offset != 0) {
153
- pipe_barrier(PIPE_ALL);
154
- DataCopyExtParams dataCopyParams;
155
- dataCopyParams.blockCount = 1;
156
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
157
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
158
- dataCopyParams);
159
- pipe_barrier(PIPE_ALL);
160
- }
161
- }
162
-
163
- private:
164
- int64_t input_ne[4];
165
- size_t input_stride[4];
166
-
167
- int64_t *scale_ne;
168
- size_t scale_stride[4];
169
-
170
- int64_t output_ne[4];
171
- size_t output_stride[4];
172
-
173
- int64_t group_size_in_row;
174
-
175
- int64_t ir;
176
- int64_t dr;
177
-
178
- TPipe pipe;
179
- GlobalTensor<float> input_gm;
180
- GlobalTensor<half> scale_gm;
181
- GlobalTensor<int8_t> output_gm;
182
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
183
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
184
- TQue<QuePosition::VECIN, 1> work_queue;
185
- TQue<QuePosition::VECOUT, 1> max_queue;
186
- TQue<QuePosition::VECIN, 1> abs_queue;
187
- TQue<QuePosition::VECIN, 1> cast_queue;
188
- TQue<QuePosition::VECOUT, 1> scale_queue;
189
- };
190
-
191
- template <typename T>
192
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
193
- auto gm_ptr = (__gm__ uint8_t *)gm;
194
- auto ub_ptr = (uint8_t *)(ub);
195
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
196
- *ub_ptr = *gm_ptr;
197
- }
198
- }
199
-
200
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
201
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
202
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
203
- int64_t input_ne_ub[4];
204
- size_t input_nb_ub[4];
205
- int64_t output_ne_ub[4];
206
-
207
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
208
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
209
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
210
-
211
- QUANTIZE_F32_Q8_0 op;
212
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
213
- op.calculate();
214
- }
215
-
216
- #endif // #ifdef ASCEND_310P