@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -37,13 +37,16 @@ extern "C" {
37
37
  // ====== Dataset ======
38
38
 
39
39
  GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
40
- int64_t ne_datapoint, // number of elements per datapoint
41
- int64_t ne_label, // number of elements per label
42
- int64_t ndata, // total number of datapoints/labels
43
- int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
40
+ enum ggml_type type_data, // the type for the internal data tensor
41
+ enum ggml_type type_label, // the type for the internal labels tensor
42
+ int64_t ne_datapoint, // number of elements per datapoint
43
+ int64_t ne_label, // number of elements per label
44
+ int64_t ndata, // total number of datapoints/labels
45
+ int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
44
46
  GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
45
47
 
46
48
  // get underlying tensors that store the data
49
+ GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
47
50
  GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
48
51
  GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label, ndata]
49
52
 
@@ -56,13 +59,19 @@ extern "C" {
56
59
  struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
57
60
  struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
58
61
  int64_t ibatch);
62
+ GGML_API void ggml_opt_dataset_get_batch_host(
63
+ ggml_opt_dataset_t dataset,
64
+ void * data_batch,
65
+ size_t nb_data_batch,
66
+ void * labels_batch,
67
+ int64_t ibatch);
59
68
 
60
69
  // ====== Model / Context ======
61
70
 
62
71
  enum ggml_opt_build_type {
63
- GGML_OPT_BUILD_TYPE_FORWARD,
64
- GGML_OPT_BUILD_TYPE_GRAD,
65
- GGML_OPT_BUILD_TYPE_OPT,
72
+ GGML_OPT_BUILD_TYPE_FORWARD = 10,
73
+ GGML_OPT_BUILD_TYPE_GRAD = 20,
74
+ GGML_OPT_BUILD_TYPE_OPT = 30,
66
75
  };
67
76
 
68
77
  // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
@@ -81,20 +90,22 @@ extern "C" {
81
90
  // userdata can be used to pass arbitrary data
82
91
  typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
83
92
 
84
- // returns the default optimizer params (constant)
93
+ // returns the default optimizer params (constant, hard-coded values)
85
94
  // userdata is not used
86
95
  GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
87
96
 
97
+ // casts userdata to ggml_opt_optimizer_params and returns it
98
+ GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
99
+
88
100
  // parameters for initializing a new optimization context
89
101
  struct ggml_opt_params {
90
102
  ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
91
103
 
92
- struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
93
-
94
- // the forward graph is defined by inputs and outputs
95
- // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
96
- struct ggml_tensor * inputs;
97
- struct ggml_tensor * outputs;
104
+ // by default the forward graph needs to be reconstructed for each eval
105
+ // if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
106
+ struct ggml_context * ctx_compute;
107
+ struct ggml_tensor * inputs;
108
+ struct ggml_tensor * outputs;
98
109
 
99
110
  enum ggml_opt_loss_type loss_type;
100
111
  enum ggml_opt_build_type build_type;
@@ -107,12 +118,9 @@ extern "C" {
107
118
 
108
119
  // get parameters for an optimization context with defaults set where possible
109
120
  // parameters for which no sensible defaults exist are supplied as arguments to this function
110
- GGML_API ggml_opt_params ggml_opt_default_params(
111
- ggml_backend_sched_t backend_sched,
112
- struct ggml_context * ctx_compute,
113
- struct ggml_tensor * inputs,
114
- struct ggml_tensor * outputs,
115
- enum ggml_opt_loss_type loss_type);
121
+ GGML_API struct ggml_opt_params ggml_opt_default_params(
122
+ ggml_backend_sched_t backend_sched,
123
+ enum ggml_opt_loss_type loss_type);
116
124
 
117
125
  GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
118
126
  GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
@@ -121,6 +129,7 @@ extern "C" {
121
129
  GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
122
130
 
123
131
  // get underlying tensors that store data
132
+ // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
124
133
  GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
125
134
  GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
126
135
  GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
@@ -128,11 +137,12 @@ extern "C" {
128
137
  GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
129
138
  GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
130
139
 
140
+ // get the gradient accumulator for a node from the forward graph
131
141
  GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
132
142
 
133
143
  // ====== Optimization Result ======
134
144
 
135
- GGML_API ggml_opt_result_t ggml_opt_result_init();
145
+ GGML_API ggml_opt_result_t ggml_opt_result_init(void);
136
146
  GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
137
147
  GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
138
148
 
@@ -144,11 +154,20 @@ extern "C" {
144
154
 
145
155
  // ====== Computation ======
146
156
 
147
- // do forward pass, increment result if not NULL
148
- GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
157
+ // if not using static graphs, this function must be called prior to ggml_opt_alloc
158
+ GGML_API void ggml_opt_prepare_alloc(
159
+ ggml_opt_context_t opt_ctx,
160
+ struct ggml_context * ctx_compute,
161
+ struct ggml_cgraph * gf,
162
+ struct ggml_tensor * inputs,
163
+ struct ggml_tensor * outputs);
164
+
165
+ // allocate the next graph for evaluation, either forward or forward + backward
166
+ // must be called exactly once prior to calling ggml_opt_eval
167
+ GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
149
168
 
150
- // do forward pass, increment result if not NULL, do backward pass
151
- GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
169
+ // do forward pass, increment result if not NULL, do backward pass if allocated
170
+ GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
152
171
 
153
172
  // ############################################################################
154
173
  // ## The high-level functions start here. They do not depend on any private ##
@@ -200,9 +219,9 @@ extern "C" {
200
219
  // fit model defined by inputs and outputs to dataset
201
220
  GGML_API void ggml_opt_fit(
202
221
  ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
203
- ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
204
- ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
205
- ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
222
+ struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
223
+ struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
224
+ struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
206
225
  ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
207
226
  enum ggml_opt_loss_type loss_type, // loss to minimize
208
227
  ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
@@ -7,6 +7,9 @@
7
7
  extern "C" {
8
8
  #endif
9
9
 
10
+ #define RPC_PROTO_MAJOR_VERSION 2
11
+ #define RPC_PROTO_MINOR_VERSION 0
12
+ #define RPC_PROTO_PATCH_VERSION 0
10
13
  #define GGML_RPC_MAX_SERVERS 16
11
14
 
12
15
  // backend API
@@ -17,7 +20,9 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
17
20
 
18
21
  GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
19
22
 
20
- GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
23
+ GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
24
+ const char * cache_dir,
25
+ size_t free_mem, size_t total_mem);
21
26
 
22
27
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
23
28
 
@@ -393,8 +393,8 @@ extern "C" {
393
393
 
394
394
  // precision
395
395
  enum ggml_prec {
396
- GGML_PREC_DEFAULT,
397
- GGML_PREC_F32,
396
+ GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
397
+ GGML_PREC_F32 = 10,
398
398
  };
399
399
 
400
400
  // model file types
@@ -481,6 +481,7 @@ extern "C" {
481
481
  GGML_OP_CONV_TRANSPOSE_1D,
482
482
  GGML_OP_IM2COL,
483
483
  GGML_OP_IM2COL_BACK,
484
+ GGML_OP_CONV_2D_DW,
484
485
  GGML_OP_CONV_TRANSPOSE_2D,
485
486
  GGML_OP_POOL_1D,
486
487
  GGML_OP_POOL_2D,
@@ -507,17 +508,12 @@ extern "C" {
507
508
 
508
509
  GGML_OP_UNARY,
509
510
 
510
- GGML_OP_MAP_UNARY,
511
- GGML_OP_MAP_BINARY,
512
-
513
- GGML_OP_MAP_CUSTOM1_F32,
514
- GGML_OP_MAP_CUSTOM2_F32,
515
- GGML_OP_MAP_CUSTOM3_F32,
516
-
517
511
  GGML_OP_MAP_CUSTOM1,
518
512
  GGML_OP_MAP_CUSTOM2,
519
513
  GGML_OP_MAP_CUSTOM3,
520
514
 
515
+ GGML_OP_CUSTOM,
516
+
521
517
  GGML_OP_CROSS_ENTROPY_LOSS,
522
518
  GGML_OP_CROSS_ENTROPY_LOSS_BACK,
523
519
  GGML_OP_OPT_STEP_ADAMW,
@@ -677,11 +673,18 @@ extern "C" {
677
673
  GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
678
674
  GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
679
675
 
676
+ // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
680
677
  GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
681
678
  GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
682
679
  GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
683
680
  GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
684
681
 
682
+ // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
683
+ GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
684
+
685
+ // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
686
+ GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
687
+
685
688
  GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
686
689
  GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
687
690
 
@@ -765,7 +768,7 @@ extern "C" {
765
768
  // Tensor flags
766
769
  GGML_API void ggml_set_input(struct ggml_tensor * tensor);
767
770
  GGML_API void ggml_set_output(struct ggml_tensor * tensor);
768
- GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
771
+ GGML_API void ggml_set_param(struct ggml_tensor * tensor);
769
772
  GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
770
773
 
771
774
  //
@@ -935,7 +938,7 @@ extern "C" {
935
938
  GGML_API struct ggml_tensor * ggml_repeat_back(
936
939
  struct ggml_context * ctx,
937
940
  struct ggml_tensor * a,
938
- struct ggml_tensor * b);
941
+ struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
939
942
 
940
943
  // concat a and b along dim
941
944
  // used in stable-diffusion
@@ -1665,7 +1668,7 @@ extern "C" {
1665
1668
  struct ggml_tensor * a,
1666
1669
  struct ggml_tensor * b);
1667
1670
 
1668
- // depthwise
1671
+ // depthwise (via im2col and mul_mat)
1669
1672
  GGML_API struct ggml_tensor * ggml_conv_2d_dw(
1670
1673
  struct ggml_context * ctx,
1671
1674
  struct ggml_tensor * a, // convolution kernel
@@ -1677,6 +1680,22 @@ extern "C" {
1677
1680
  int d0, // dilation dimension 0
1678
1681
  int d1); // dilation dimension 1
1679
1682
 
1683
+ // Depthwise 2D convolution
1684
+ // may be faster than ggml_conv_2d_dw, but not available in all backends
1685
+ // a: KW KH 1 C convolution kernel
1686
+ // b: W H C N input data
1687
+ // res: W_out H_out C N
1688
+ GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
1689
+ struct ggml_context * ctx,
1690
+ struct ggml_tensor * a,
1691
+ struct ggml_tensor * b,
1692
+ int stride0,
1693
+ int stride1,
1694
+ int pad0,
1695
+ int pad1,
1696
+ int dilation0,
1697
+ int dilation1);
1698
+
1680
1699
  GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
1681
1700
  struct ggml_context * ctx,
1682
1701
  struct ggml_tensor * a,
@@ -1722,24 +1741,29 @@ extern "C" {
1722
1741
  float p0,
1723
1742
  float p1);
1724
1743
 
1725
- // nearest interpolate
1744
+ enum ggml_scale_mode {
1745
+ GGML_SCALE_MODE_NEAREST = 0,
1746
+ GGML_SCALE_MODE_BILINEAR = 1,
1747
+ };
1748
+
1749
+ // interpolate
1726
1750
  // multiplies ne0 and ne1 by scale factor
1727
- // used in stable-diffusion
1728
1751
  GGML_API struct ggml_tensor * ggml_upscale(
1729
1752
  struct ggml_context * ctx,
1730
1753
  struct ggml_tensor * a,
1731
- int scale_factor);
1754
+ int scale_factor,
1755
+ enum ggml_scale_mode mode);
1732
1756
 
1733
- // nearest interpolate
1734
- // nearest interpolate to specified dimensions
1735
- // used in tortoise.cpp
1757
+ // interpolate
1758
+ // interpolate scale to specified dimensions
1736
1759
  GGML_API struct ggml_tensor * ggml_upscale_ext(
1737
1760
  struct ggml_context * ctx,
1738
1761
  struct ggml_tensor * a,
1739
1762
  int ne0,
1740
1763
  int ne1,
1741
1764
  int ne2,
1742
- int ne3);
1765
+ int ne3,
1766
+ enum ggml_scale_mode mode);
1743
1767
 
1744
1768
  // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
1745
1769
  GGML_API struct ggml_tensor * ggml_pad(
@@ -1791,11 +1815,11 @@ extern "C" {
1791
1815
 
1792
1816
  #define GGML_KQ_MASK_PAD 64
1793
1817
 
1794
- // q: [n_embd, n_batch, n_head, 1]
1795
- // k: [n_embd, n_kv, n_head_kv, 1]
1796
- // v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
1797
- // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
1798
- // res: [n_embd, n_head, n_batch, 1] !! permuted !!
1818
+ // q: [n_embd_k, n_batch, n_head, 1]
1819
+ // k: [n_embd_k, n_kv, n_head_kv, 1]
1820
+ // v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
1821
+ // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
1822
+ // res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
1799
1823
  GGML_API struct ggml_tensor * ggml_flash_attn_ext(
1800
1824
  struct ggml_context * ctx,
1801
1825
  struct ggml_tensor * q,
@@ -1916,83 +1940,6 @@ extern "C" {
1916
1940
 
1917
1941
  // custom operators
1918
1942
 
1919
- typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
1920
- typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1921
-
1922
- typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
1923
- typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1924
- typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1925
-
1926
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
1927
- struct ggml_context * ctx,
1928
- struct ggml_tensor * a,
1929
- ggml_unary_op_f32_t fun),
1930
- "use ggml_map_custom1 instead");
1931
-
1932
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
1933
- struct ggml_context * ctx,
1934
- struct ggml_tensor * a,
1935
- ggml_unary_op_f32_t fun),
1936
- "use ggml_map_custom1_inplace instead");
1937
-
1938
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
1939
- struct ggml_context * ctx,
1940
- struct ggml_tensor * a,
1941
- struct ggml_tensor * b,
1942
- ggml_binary_op_f32_t fun),
1943
- "use ggml_map_custom2 instead");
1944
-
1945
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
1946
- struct ggml_context * ctx,
1947
- struct ggml_tensor * a,
1948
- struct ggml_tensor * b,
1949
- ggml_binary_op_f32_t fun),
1950
- "use ggml_map_custom2_inplace instead");
1951
-
1952
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
1953
- struct ggml_context * ctx,
1954
- struct ggml_tensor * a,
1955
- ggml_custom1_op_f32_t fun),
1956
- "use ggml_map_custom1 instead");
1957
-
1958
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
1959
- struct ggml_context * ctx,
1960
- struct ggml_tensor * a,
1961
- ggml_custom1_op_f32_t fun),
1962
- "use ggml_map_custom1_inplace instead");
1963
-
1964
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
1965
- struct ggml_context * ctx,
1966
- struct ggml_tensor * a,
1967
- struct ggml_tensor * b,
1968
- ggml_custom2_op_f32_t fun),
1969
- "use ggml_map_custom2 instead");
1970
-
1971
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
1972
- struct ggml_context * ctx,
1973
- struct ggml_tensor * a,
1974
- struct ggml_tensor * b,
1975
- ggml_custom2_op_f32_t fun),
1976
- "use ggml_map_custom2_inplace instead");
1977
-
1978
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
1979
- struct ggml_context * ctx,
1980
- struct ggml_tensor * a,
1981
- struct ggml_tensor * b,
1982
- struct ggml_tensor * c,
1983
- ggml_custom3_op_f32_t fun),
1984
- "use ggml_map_custom3 instead");
1985
-
1986
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
1987
- struct ggml_context * ctx,
1988
- struct ggml_tensor * a,
1989
- struct ggml_tensor * b,
1990
- struct ggml_tensor * c,
1991
- ggml_custom3_op_f32_t fun),
1992
- "use ggml_map_custom3_inplace instead");
1993
-
1994
- // custom operators v2
1995
-
1996
1943
  typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
1997
1944
  typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
1998
1945
  typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
@@ -2048,6 +1995,30 @@ extern "C" {
2048
1995
  int n_tasks,
2049
1996
  void * userdata);
2050
1997
 
1998
+ typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata);
1999
+
2000
+ GGML_API struct ggml_tensor * ggml_custom_4d(
2001
+ struct ggml_context * ctx,
2002
+ enum ggml_type type,
2003
+ int64_t ne0,
2004
+ int64_t ne1,
2005
+ int64_t ne2,
2006
+ int64_t ne3,
2007
+ struct ggml_tensor ** args,
2008
+ int n_args,
2009
+ ggml_custom_op_t fun,
2010
+ int n_tasks,
2011
+ void * userdata);
2012
+
2013
+ GGML_API struct ggml_tensor * ggml_custom_inplace(
2014
+ struct ggml_context * ctx,
2015
+ struct ggml_tensor * a,
2016
+ struct ggml_tensor ** args,
2017
+ int n_args,
2018
+ ggml_custom_op_t fun,
2019
+ int n_tasks,
2020
+ void * userdata);
2021
+
2051
2022
  // loss function
2052
2023
 
2053
2024
  GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
@@ -2078,15 +2049,14 @@ extern "C" {
2078
2049
 
2079
2050
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2080
2051
  GGML_API void ggml_build_backward_expand(
2081
- struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
2082
- struct ggml_context * ctx_compute, // context for gradient computation
2083
- struct ggml_cgraph * cgraph,
2084
- bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
2052
+ struct ggml_context * ctx, // context for gradient computation
2053
+ struct ggml_cgraph * cgraph,
2054
+ struct ggml_tensor ** grad_accs);
2085
2055
 
2086
2056
  // graph allocation in a context
2087
2057
  GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
2088
2058
  GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
2089
- GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
2059
+ GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
2090
2060
  GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
2091
2061
  GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
2092
2062
  GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
@@ -65,7 +65,7 @@ if (GGML_LTO)
65
65
  endif()
66
66
  endif()
67
67
 
68
- if (GGML_CCACHE)
68
+ if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
69
69
  find_program(GGML_CCACHE_FOUND ccache)
70
70
  find_program(GGML_SCCACHE_FOUND sccache)
71
71
 
@@ -214,7 +214,7 @@ add_library(ggml
214
214
  target_link_libraries(ggml PUBLIC ggml-base)
215
215
 
216
216
  if (CMAKE_SYSTEM_NAME MATCHES "Linux")
217
- target_link_libraries(ggml PRIVATE dl stdc++fs)
217
+ target_link_libraries(ggml PRIVATE dl)
218
218
  endif()
219
219
 
220
220
  function(ggml_add_backend_library backend)
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
267
267
  set(GGML_CPU_TAG_NAME ${tag_name})
268
268
  # other: OPENMP LLAMAFILE CPU_HBM
269
269
  foreach (feat NATIVE
270
+ SSE42
270
271
  AVX AVX2 BMI2 AVX_VNNI FMA F16C
271
272
  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
272
273
  AMX_TILE AMX_INT8 AMX_BF16)
@@ -286,14 +287,16 @@ if (GGML_CPU_ALL_VARIANTS)
286
287
  if (NOT GGML_BACKEND_DL)
287
288
  message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
288
289
  endif()
289
- ggml_add_cpu_backend_variant(sandybridge AVX)
290
- ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
291
- ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
292
- ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
293
- ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
290
+ ggml_add_cpu_backend_variant(x64)
291
+ ggml_add_cpu_backend_variant(sse42 SSE42)
292
+ ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
293
+ ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
294
+ ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
295
+ ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
296
+ ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
294
297
  if (NOT MSVC)
295
298
  # MSVC doesn't support AMX
296
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
299
+ ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
297
300
  endif()
298
301
  elseif (GGML_CPU)
299
302
  ggml_add_cpu_backend_variant_impl("")
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
816
816
  static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
817
817
  size_t node_size = 0;
818
818
  if (!node->data && !node->view_src) {
819
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
819
+ // If we previously had data but don't now then reallocate
820
+ if (talloc->buffer_id < 0) {
821
+ return false;
822
+ }
820
823
  node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
821
824
  }
822
825
  return talloc->size_max >= node_size;
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
56
56
  return SIZE_MAX;
57
57
  }
58
58
 
59
- size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
59
+ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
60
60
  // get_alloc_size is optional, defaults to ggml_nbytes
61
61
  if (buft->iface.get_alloc_size) {
62
62
  size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
152
152
  return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
153
153
  }
154
154
 
155
- size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
155
+ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
156
156
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
157
157
  }
158
158
 
@@ -674,6 +674,8 @@ struct ggml_backend_sched {
674
674
  char * context_buffer;
675
675
  size_t context_buffer_size;
676
676
 
677
+ bool op_offload;
678
+
677
679
  int debug;
678
680
  };
679
681
 
@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
766
768
  if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
767
769
  int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
768
770
  // check if a backend with higher prio wants to offload the op
769
- if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
771
+ if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
770
772
  for (int b = 0; b < src_backend_id; b++) {
771
773
  if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
772
774
  SET_CAUSE(tensor, "1.off");
@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1109
1111
 
1110
1112
  const int node_backend_id = tensor_backend_id(node);
1111
1113
 
1112
- assert(node_backend_id != -1); // all nodes should be assigned by now
1114
+ assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
1113
1115
 
1114
1116
  // check if we should start a new split based on the sources of the current node
1115
1117
  bool need_new_split = false;
@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
1452
1454
  ggml_backend_buffer_type_t * bufts,
1453
1455
  int n_backends,
1454
1456
  size_t graph_size,
1455
- bool parallel) {
1457
+ bool parallel,
1458
+ bool op_offload) {
1456
1459
  GGML_ASSERT(n_backends > 0);
1457
1460
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
1458
1461
  GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
1497
1500
  }
1498
1501
 
1499
1502
  sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
1503
+ sched->op_offload = op_offload;
1500
1504
 
1501
1505
  ggml_backend_sched_reset(sched);
1502
1506
 
@@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
51
51
  ${CANN_INSTALL_DIR}/acllib/include
52
52
  )
53
53
 
54
- add_subdirectory(kernels)
55
54
  list(APPEND CANN_LIBRARIES
56
55
  ascendcl
57
56
  nnopbase
58
57
  opapi
59
58
  acl_op_compiler
60
- ascendc_kernels
61
59
  )
62
60
 
63
61
  file(GLOB GGML_SOURCES_CANN "*.cpp")
@@ -41,6 +41,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
41
41
  return ACL_INT4;
42
42
  case GGML_TYPE_Q8_0:
43
43
  return ACL_INT8;
44
+ case GGML_TYPE_I64:
45
+ return ACL_INT64;
44
46
  default:
45
47
  return ACL_DT_UNDEFINED;
46
48
  }
@@ -54,9 +56,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
54
56
  // added.
55
57
  int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
56
58
 
57
- int64_t acl_storage_len = 0;
58
59
  if (ne == nullptr) {
59
- acl_storage_len = ggml_nbytes(tensor);
60
60
  for (int i = 0; i < GGML_MAX_DIMS; i++) {
61
61
  acl_ne[i] = tensor->ne[i];
62
62
  // The step size of acl is in elements.
@@ -65,14 +65,18 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
65
65
  } else {
66
66
  // With bcast
67
67
  for (int i = 0; i < dims; i++) {
68
- acl_storage_len += (ne[i] - 1) * nb[i];
69
68
  acl_ne[i] = ne[i];
70
69
  acl_stride[i] = nb[i] / ggml_element_size(tensor);
71
70
  }
72
71
  }
73
72
 
74
- // Reverse ne and stride.
75
73
  int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
74
+ int64_t acl_storage_len = 1;
75
+ for (int i = 0; i < final_dims; i++) {
76
+ acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
77
+ }
78
+
79
+ // Reverse ne and stride.
76
80
  std::reverse(acl_ne, acl_ne + final_dims);
77
81
  std::reverse(acl_stride, acl_stride + final_dims);
78
82
 
@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
101
101
  tmp_stride[i] = nb[i] / type_size;
102
102
  }
103
103
 
104
- std::reverse(tmp_ne, tmp_ne + dims);
105
- std::reverse(tmp_stride, tmp_stride + dims);
106
-
107
- int64_t acl_storage_len = 0;
104
+ int64_t acl_storage_len = 1;
108
105
  for (int i = 0; i < dims; i++) {
109
- acl_storage_len += (ne[i] - 1) * nb[i];
106
+ acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
110
107
  }
111
108
 
109
+ std::reverse(tmp_ne, tmp_ne + dims);
110
+ std::reverse(tmp_stride, tmp_stride + dims);
111
+
112
112
  aclTensor* acl_tensor =
113
113
  aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
114
114
  format, &acl_storage_len, 1, data_ptr);