@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -12,60 +12,30 @@ llama_add_compile_flags()
 
  # examples
 
- include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
  if (EMSCRIPTEN)
  else()
- add_subdirectory(batched-bench)
  add_subdirectory(batched)
  add_subdirectory(embedding)
  add_subdirectory(eval-callback)
 
- if (NOT WIN32)
- # disabled on Windows because it uses internal functions not exported with LLAMA_API
- add_subdirectory(gbnf-validator)
- endif()
-
  add_subdirectory(gguf-hash)
- add_subdirectory(gguf-split)
  add_subdirectory(gguf)
  add_subdirectory(gritlm)
- add_subdirectory(imatrix)
- add_subdirectory(infill)
- add_subdirectory(llama-bench)
  add_subdirectory(lookahead)
  add_subdirectory(lookup)
- add_subdirectory(main)
  add_subdirectory(parallel)
  add_subdirectory(passkey)
- add_subdirectory(perplexity)
- add_subdirectory(quantize)
  add_subdirectory(retrieval)
- if (LLAMA_BUILD_SERVER)
- add_subdirectory(server)
- endif()
  add_subdirectory(save-load-state)
- add_subdirectory(run)
  add_subdirectory(simple)
  add_subdirectory(simple-chat)
  add_subdirectory(speculative)
  add_subdirectory(speculative-simple)
- add_subdirectory(tokenize)
- add_subdirectory(tts)
  add_subdirectory(gen-docs)
+ add_subdirectory(training)
  if (NOT GGML_BACKEND_DL)
- # these examples use the backends directly and cannot be built with dynamic loading
  add_subdirectory(convert-llama2c-to-ggml)
- add_subdirectory(cvector-generator)
- add_subdirectory(export-lora)
- if (NOT WIN32)
- # disabled on Windows because it uses internal functions not exported with LLAMA_API
- add_subdirectory(quantize-stats)
- endif()
- add_subdirectory(llava)
- if (GGML_RPC)
- add_subdirectory(rpc)
- endif()
+ # these examples use the backends directly and cannot be built with dynamic loading
  if (GGML_SYCL)
  add_subdirectory(sycl)
  endif()
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
 
  llama_model_params model_params = common_model_params_to_llama(params);
 
- llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
  if (model == NULL) {
  LOG_ERR("%s: error: unable to load model\n" , __func__);
@@ -35,23 +35,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 
  static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
  const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
- const struct llama_model * model = llama_get_model(ctx);
 
  // clear previous kv_cache values (irrelevant for embeddings)
  llama_kv_self_clear(ctx);
 
  // run model
  LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
- if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
- // encoder-only model
- if (llama_encode(ctx, batch) < 0) {
- LOG_ERR("%s : failed to encode\n", __func__);
- }
- } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
- // decoder-only model
- if (llama_decode(ctx, batch) < 0) {
- LOG_ERR("%s : failed to decode\n", __func__);
- }
+ if (llama_encode(ctx, batch) < 0) {
+ LOG_ERR("%s : failed to encode\n", __func__);
  }
 
  for (int i = 0; i < batch.n_tokens; i++) {
@@ -89,6 +80,13 @@ int main(int argc, char ** argv) {
  common_init();
 
  params.embedding = true;
+
+ // utilize the full context
+ if (params.n_batch < params.n_ctx) {
+ LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
+ params.n_batch = params.n_ctx;
+ }
+
  // For non-causal models, batch size must be equal to ubatch size
  params.n_ubatch = params.n_batch;
 
@@ -134,7 +132,6 @@ int main(int argc, char ** argv) {
 
  // max batch size
  const uint64_t n_batch = params.n_batch;
- GGML_ASSERT(params.n_batch >= params.n_ctx);
 
  // tokenize the prompts and trim
  std::vector<std::vector<int32_t>> inputs;
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
 
  llama_backend_init();
 
- llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
 
  // create generation context
  llama_context * ctx = llama_init_from_model(model, cparams);
@@ -18,6 +18,7 @@ android {
  }
  externalNativeBuild {
  cmake {
+ arguments += "-DLLAMA_CURL=OFF"
  arguments += "-DLLAMA_BUILD_COMMON=ON"
  arguments += "-DGGML_LLAMAFILE=OFF"
  arguments += "-DCMAKE_BUILD_TYPE=Release"
@@ -34,11 +34,61 @@ static std::string k_system =
  R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
  The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
 
- User: Recommend a nice restaurant in the area.
- Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
- User: Who is Richard Feynman?
- Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
- User:)";
+ User:
+ Recommend a nice restaurant in the area.
+ Assistant:
+ I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
+ User:
+ Who is Richard Feynman?
+ Assistant:
+ Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
+ )";
+
+ static std::vector<std::string> k_questions = {
+ "What is the tallest mountain in the world?",
+ "Who was the first person to win two Nobel Prizes?",
+ "Which country invented paper?",
+ "What organ is primarily responsible for pumping blood throughout the body?",
+ "Which planet is known for its prominent ring system?",
+ "Who directed the movie 'Inception'?",
+ "What is the freezing point of water in Fahrenheit?",
+ "Which animal is known to have the longest lifespan?",
+ "What language has the most native speakers worldwide?",
+ "What is the capital city of Canada?",
+ "Who is credited with inventing the World Wide Web?",
+ "Which metal is liquid at room temperature?",
+ "What is the term for an animal that eats both plants and meat?",
+ "Who painted 'The Starry Night'?",
+ "What gas do humans exhale that plants use for photosynthesis?",
+ "What year did World War II end?",
+ "Which continent has the most countries?",
+ "Who wrote the novel 'Frankenstein'?",
+ "What does DNA stand for?",
+ "What is the main ingredient in traditional Japanese miso soup?"
+ };
+
+ static std::vector<std::string> k_answers = {
+ "The tallest mountain in the world is Mount Everest.",
+ "Marie Curie was the first person to win two Nobel Prizes.",
+ "Paper was invented in China.",
+ "The heart is the organ responsible for pumping blood.",
+ "Saturn is known for its prominent ring system.",
+ "Christopher Nolan directed the movie 'Inception'.",
+ "The freezing point of water in Fahrenheit is 32°F.",
+ "The bowhead whale is known to have the longest lifespan among mammals.",
+ "Mandarin Chinese has the most native speakers in the world.",
+ "The capital city of Canada is Ottawa.",
+ "Tim Berners-Lee is credited with inventing the World Wide Web.",
+ "Mercury is the metal that is liquid at room temperature.",
+ "An animal that eats both plants and meat is called an omnivore.",
+ "'The Starry Night' was painted by Vincent van Gogh.",
+ "Humans exhale carbon dioxide, which plants use in photosynthesis.",
+ "World War II ended in 1945.",
+ "Africa is the continent with the most countries.",
+ "The novel 'Frankenstein' was written by Mary Shelley.",
+ "DNA stands for Deoxyribonucleic Acid.",
+ "The main ingredient in traditional Japanese miso soup is fermented soybean paste."
+ };
 
  static std::vector<std::string> k_prompts = {
  "What is the meaning of life?",
@@ -49,7 +99,7 @@ static std::vector<std::string> k_prompts = {
  "What is the best way to learn a new language?",
  "How to get a job at Google?",
  "If you could have any superpower, what would it be?",
- "I want to learn how to play the piano.",
+ "I want to learn how to play the piano. What would be the best way to do it?",
  };
 
  struct client {
@@ -68,6 +118,7 @@ struct client {
  int64_t t_start_prompt;
  int64_t t_start_gen;
 
+ int32_t n_past = 0;
  int32_t n_prompt = 0;
  int32_t n_decoded = 0;
  int32_t i_batch = -1;
@@ -106,6 +157,9 @@
 
  common_params params;
 
+ params.n_predict = 128;
+ params.n_junk = 0;
+
  if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
  return 1;
  }
@@ -126,6 +180,12 @@
 
  const bool dump_kv_cache = params.dump_kv_cache;
 
+ // is the system prompt shared in the cache
+ const bool is_sp_shared = params.is_pp_shared;
+
+ // extra text to insert in each client's prompt in order to make it larger
+ const int32_t n_junk = params.n_junk;
+
  // init llama.cpp
  llama_backend_init();
  llama_numa_init(params.numa);
@@ -167,6 +227,7 @@
  }
 
  std::vector<llama_token> tokens_system;
+
  tokens_system = common_tokenize(ctx, k_system, true);
  const int32_t n_tokens_system = tokens_system.size();
 
@@ -188,7 +249,7 @@
  LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
  LOG_INF("\n");
 
- {
+ if (is_sp_shared) {
  LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
 
  for (int32_t i = 0; i < n_tokens_system; ++i) {
@@ -226,7 +287,7 @@
 
  client.i_batch = batch.n_tokens;
 
- common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+ common_batch_add(batch, client.sampled, client.n_past++, { client.id + 1 }, true);
 
  client.n_decoded += 1;
  }
@@ -252,9 +313,23 @@
  client.t_start_gen = 0;
 
  client.input = k_prompts[rand() % k_prompts.size()];
- client.prompt = client.input + "\nAssistant:";
  client.response = "";
 
+ // construct the prompt:
+ // [system prompt] + [junk] + [user prompt]
+ client.n_past = 0;
+ client.prompt = "";
+ if (is_sp_shared) {
+ client.n_past = n_tokens_system;
+ } else {
+ client.prompt += k_system;
+ }
+ for (int i = 0; i < n_junk; ++i) {
+ const int r = rand() % k_questions.size();
+ client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
+ }
+ client.prompt += "User:\n" + client.input + "\nAssistant:\n";
+
  common_sampler_reset(client.smpl);
 
  // do not prepend BOS because we have a system prompt!
@@ -262,7 +337,7 @@
  tokens_prompt = common_tokenize(ctx, client.prompt, false);
 
  for (size_t i = 0; i < tokens_prompt.size(); ++i) {
- common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+ common_batch_add(batch, tokens_prompt[i], client.n_past++, { client.id + 1 }, false);
  }
 
  // extract the logits only for the last token
@@ -361,10 +436,9 @@
  // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
 
  if (client.n_decoded > 2 &&
- (llama_vocab_is_eog(vocab, id) ||
- (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
- client.response.find("User:") != std::string::npos ||
- client.response.find('\n') != std::string::npos)) {
+ (llama_vocab_is_eog(vocab, id) ||
+ (params.n_predict > 0 && client.n_decoded >= params.n_predict) ||
+ client.response.find("User:") != std::string::npos)) {
  // basic reverse prompt
  const size_t pos = client.response.find("User:");
  if (pos != std::string::npos) {
@@ -405,7 +479,7 @@
  params.prompt_file = "used built-in defaults";
  }
  LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
- LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
+ LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
 
  LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
  LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
 
  llama_model_params model_params = common_model_params_to_llama(params);
 
- llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
  if (model == NULL) {
  LOG_ERR("%s: unable to load model\n" , __func__);
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
 
  common_init();
 
- if (params.speculative.model.empty()) {
+ if (params.speculative.model.path.empty()) {
  LOG_ERR("%s: --model-draft is required\n", __func__);
  return 1;
  }
@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
 
  common_init();
 
- if (params.speculative.model.empty()) {
+ if (params.speculative.model.path.empty()) {
  LOG_ERR("%s: --model-draft is required\n", __func__);
  return 1;
  }
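Note on the recurring `params.model.c_str()` → `params.model.path.c_str()` edits above (batched, gritlm, parallel, passkey, speculative): they reflect the same upstream change, where the model field in `common_params` is no longer a plain `std::string` but a small struct whose `path` member carries the local file path. A minimal sketch of the new calling pattern, not part of the diff; only the `path` member is confirmed by these hunks, everything else here is illustrative:

    #include <cstdio>
    #include <string>

    // Hypothetical stand-in for the reworked model field; only `path` is
    // confirmed by the hunks above.
    struct model_source_sketch {
        std::string path; // local GGUF file path
    };

    int main() {
        model_source_sketch model;
        model.path = "models/7B/ggml-model-q4_0.gguf";
        // downstream code now reads the path through the struct member,
        // e.g. llama_model_load_from_file(model.path.c_str(), mparams);
        std::printf("loading %s\n", model.path.c_str());
        return 0;
    }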
@@ -8,10 +8,10 @@ cd build
  source /opt/intel/oneapi/setvars.sh
 
  #for FP16
- #cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
+ #cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference
 
  #for FP32
- cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+ cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF
 
  #build example/main
  #cmake --build . --config Release --target main
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
 
  :: for FP16
  :: faster for long-prompt inference
- :: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+ :: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
 
  :: for FP32
- cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+ cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
  if %errorlevel% neq 0 goto ERROR
  :: build example/main only
  :: make main
@@ -0,0 +1,5 @@
+ set(TARGET llama-finetune)
+ add_executable(${TARGET} finetune.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -0,0 +1,96 @@
+ #include "arg.h"
+ #include "common.h"
+ #include "log.h"
+ #include "llama.h"
+
+ #include <cmath>
+ #include <cstdio>
+ #include <cstring>
+ #include <ctime>
+ #include <vector>
+
+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif
+
+ int main(int argc, char ** argv) {
+ common_params params;
+
+ params.escape = false;
+
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+ return 1;
+ }
+
+ if (params.use_mmap) {
+ LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
+ params.use_mmap = false;
+ }
+ if (params.cache_type_k != GGML_TYPE_F32) {
+ LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
+ params.cache_type_k = GGML_TYPE_F32;
+ }
+ if (params.cache_type_v != GGML_TYPE_F32) {
+ LOG_INF("%s: force changing v cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
+ params.cache_type_v = GGML_TYPE_F32;
+ }
+
+ common_init();
+ llama_backend_init();
+ llama_numa_init(params.numa);
+
+ // load the model and apply lora adapter, if any
+ common_init_result llama_init = common_init_from_params(params);
+ llama_model_ptr & model = llama_init.model;
+ llama_context_ptr & ctx = llama_init.context;
+
+ if (model == NULL) {
+ LOG_ERR("%s: unable to load model\n", __func__);
+ return 1;
+ }
+
+ // print system information
+ {
+ LOG_INF("\n");
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+ }
+
+ constexpr float val_split = 0.05f;
+
+ std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
+ ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
+
+ struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
+ optimizer_params.adamw.alpha = 1e-7f; // learning rate
+
+ struct llama_opt_params lopt_params {
+ /*n_ctx_train =*/ 0,
+ /*param_filter =*/ llama_opt_param_filter_all,
+ /*param_filter_ud =*/ nullptr,
+ /*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
+ /*get_opt_pars_ud =*/ &optimizer_params,
+ };
+ llama_opt_init(ctx.get(), model.get(), lopt_params);
+
+ const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
+
+ ggml_opt_result_t result_train = ggml_opt_result_init();
+ ggml_opt_result_t result_eval = ggml_opt_result_init();
+
+ for (int epoch = 0; epoch < 2; ++epoch) {
+ llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
+ ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
+ fprintf(stderr, "\n");
+
+ ggml_opt_result_reset(result_train);
+ ggml_opt_result_reset(result_eval);
+ }
+ ggml_opt_result_free(result_train);
+ ggml_opt_result_free(result_eval);
+
+ llama_model_save_to_file(model.get(), "finetuned-model.gguf");
+
+ llama_backend_free();
+
+ return 0;
+ }
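For orientation (not part of the diff): the GGUF written by the new finetune example can be loaded back through the regular C API that other hunks in this release already use. A minimal sketch, assuming the hard-coded output name above and default model parameters:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();
        llama_model_params mparams = llama_model_default_params();
        // "finetuned-model.gguf" is the file name hard-coded by finetune.cpp above
        llama_model * model = llama_model_load_from_file("finetuned-model.gguf", mparams);
        if (model == NULL) {
            std::fprintf(stderr, "failed to load finetuned model\n");
            return 1;
        }
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }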
@@ -100,9 +100,14 @@ else()
  set(INS_ENB ON)
  endif()
 
+ message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}")
+ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
+ message(DEBUG "INS_ENB : ${INS_ENB}")
+
  option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
  option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
  option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
+ option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
  option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
  option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
  option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
@@ -123,10 +128,12 @@ endif()
  option(GGML_LASX "ggml: enable lasx" ON)
  option(GGML_LSX "ggml: enable lsx" ON)
  option(GGML_RVV "ggml: enable rvv" ON)
+ option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
  option(GGML_VXE "ggml: enable vxe" ON)
 
  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
- set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+ set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+ set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
 
 
  if (WIN32)
@@ -164,7 +171,6 @@ option(GGML_HIP "ggml: use HIP"
  option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
- option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
  option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
@@ -187,6 +193,7 @@ option(GGML_RPC "ggml: use RPC"
  option(GGML_SYCL "ggml: use SYCL" OFF)
  option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
  option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
+ option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
  set (GGML_SYCL_TARGET "INTEL" CACHE STRING
  "ggml: sycl target device")
  set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
@@ -354,3 +361,29 @@ write_basic_package_version_file(
  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
  ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
+
+ if (MSVC)
+ set(MSVC_WARNING_FLAGS
+ /wd4005 # Macro redefinition
+ /wd4244 # Conversion from one type to another type, possible loss of data
+ /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
+ /wd4996 # Disable POSIX deprecation warnings
+ /wd4702 # Unreachable code warnings
+ )
+ function(disable_msvc_warnings target_name)
+ if(TARGET ${target_name})
+ target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
+ endif()
+ endfunction()
+
+ disable_msvc_warnings(ggml-base)
+ disable_msvc_warnings(ggml)
+ disable_msvc_warnings(ggml-cpu)
+ disable_msvc_warnings(ggml-cpu-x64)
+ disable_msvc_warnings(ggml-cpu-sse42)
+ disable_msvc_warnings(ggml-cpu-sandybridge)
+ disable_msvc_warnings(ggml-cpu-haswell)
+ disable_msvc_warnings(ggml-cpu-skylakex)
+ disable_msvc_warnings(ggml-cpu-icelake)
+ disable_msvc_warnings(ggml-cpu-alderlake)
+ endif()
@@ -0,0 +1,22 @@
+ find_package(Git)
+
+ # the commit's SHA1
+ execute_process(COMMAND
+ "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+ OUTPUT_VARIABLE GIT_SHA1
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # the date of the commit
+ execute_process(COMMAND
+ "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+ OUTPUT_VARIABLE GIT_DATE
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # the subject of the commit
+ execute_process(COMMAND
+ "${GIT_EXECUTABLE}" log -1 --format=%s
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+ OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -38,7 +38,7 @@ extern "C" {
  GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
  GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
  GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
- GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
  GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
  GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
 
@@ -59,7 +59,7 @@ extern "C" {
  GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
  GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
  GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
  GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
  GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
  GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
@@ -248,7 +248,7 @@ extern "C" {
  // preferrably to run on the same backend as the buffer
  ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
- sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+ sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
 
  // initialize buffers from a max size graph (optional)
  reserve_graph = build_graph(sched, max_batch_size);
@@ -289,7 +289,7 @@ extern "C" {
  typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
 
  // Initialize a backend scheduler, backends with low index are given priority over backends with high index
- GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
  // Initialize backend buffers from a measure graph
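The scheduler constructor now takes a trailing `op_offload` flag, as both the usage comment and the declaration above show. A minimal call-site sketch, assuming `backend_gpu` and `backend_cpu` were initialized elsewhere (they are placeholders, not part of the diff):

    #include "ggml-backend.h"

    // sketch: create a scheduler over two already-initialized backends
    static ggml_backend_sched_t make_sched(ggml_backend_t backend_gpu, ggml_backend_t backend_cpu) {
        ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
        return ggml_backend_sched_new(
            backends, NULL, /*n_backends =*/ 2, GGML_DEFAULT_GRAPH_SIZE,
            /*parallel   =*/ false,
            /*op_offload =*/ true); // new trailing parameter in this version
    }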
@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
 
  struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
 
- typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
+ typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
 
  // ggml-backend
 
@@ -133,6 +133,11 @@ extern "C" {
 
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+ GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+ GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+ GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+ GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
  #ifdef __cplusplus
  }
  #endif
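The four conversion helpers exported above operate on plain buffers with the element count as the last argument. A minimal sketch of a round trip through fp16, following the declared signatures (the buffer contents are arbitrary):

    #include "ggml-cpu.h"
    #include <vector>

    // round-trip a small float buffer through the newly exported fp16 helpers
    static std::vector<float> roundtrip_fp16(const std::vector<float> & src) {
        std::vector<ggml_fp16_t> tmp(src.size());
        std::vector<float>       out(src.size());
        ggml_cpu_fp32_to_fp16(src.data(), tmp.data(), (int64_t) src.size());
        ggml_cpu_fp16_to_fp32(tmp.data(), out.data(), (int64_t) out.size());
        return out;
    }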