@fugood/llama.node 0.6.3 → 1.0.0-beta.1

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. package/CMakeLists.txt +40 -30
  2. package/README.md +4 -1
  3. package/lib/binding.js +41 -29
  4. package/lib/binding.ts +26 -25
  5. package/package.json +45 -7
  6. package/scripts/build.js +47 -0
  7. package/scripts/llama.cpp.patch +109 -0
  8. package/src/anyascii.c +22223 -0
  9. package/src/anyascii.h +42 -0
  10. package/src/tts_utils.cpp +20 -7
  11. package/src/tts_utils.h +2 -0
  12. package/bin/darwin/arm64/llama-node.node +0 -0
  13. package/bin/darwin/x64/llama-node.node +0 -0
  14. package/bin/linux/arm64/llama-node.node +0 -0
  15. package/bin/linux/x64/llama-node.node +0 -0
  16. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  17. package/bin/linux-cuda/x64/llama-node.node +0 -0
  18. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  19. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  20. package/bin/win32/x64/llama-node.node +0 -0
  21. package/bin/win32/x64/node.lib +0 -0
  22. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  23. package/bin/win32-vulkan/arm64/node.lib +0 -0
  24. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  25. package/bin/win32-vulkan/x64/node.lib +0 -0
  26. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +0 -233
  27. package/src/llama.cpp/.github/workflows/build.yml +0 -1078
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +0 -28
  29. package/src/llama.cpp/.github/workflows/docker.yml +0 -178
  30. package/src/llama.cpp/.github/workflows/editorconfig.yml +0 -29
  31. package/src/llama.cpp/.github/workflows/gguf-publish.yml +0 -44
  32. package/src/llama.cpp/.github/workflows/labeler.yml +0 -17
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +0 -33
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +0 -30
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +0 -40
  36. package/src/llama.cpp/.github/workflows/release.yml +0 -739
  37. package/src/llama.cpp/.github/workflows/server.yml +0 -237
  38. package/src/llama.cpp/.github/workflows/winget.yml +0 -42
  39. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +0 -16
  40. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +0 -16
  41. package/src/llama.cpp/cmake/build-info.cmake +0 -64
  42. package/src/llama.cpp/cmake/common.cmake +0 -35
  43. package/src/llama.cpp/cmake/git-vars.cmake +0 -22
  44. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -5
  45. package/src/llama.cpp/common/build-info.cpp.in +0 -4
  46. package/src/llama.cpp/docs/build.md +0 -561
  47. package/src/llama.cpp/examples/CMakeLists.txt +0 -43
  48. package/src/llama.cpp/examples/batched/CMakeLists.txt +0 -5
  49. package/src/llama.cpp/examples/batched/batched.cpp +0 -246
  50. package/src/llama.cpp/examples/chat-13B.bat +0 -57
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +0 -5
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +0 -941
  53. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +0 -35
  54. package/src/llama.cpp/examples/embedding/CMakeLists.txt +0 -5
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +0 -323
  56. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +0 -10
  57. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +0 -194
  58. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +0 -5
  59. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +0 -83
  60. package/src/llama.cpp/examples/gguf/CMakeLists.txt +0 -5
  61. package/src/llama.cpp/examples/gguf/gguf.cpp +0 -265
  62. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +0 -22
  63. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +0 -46
  64. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +0 -295
  65. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +0 -52
  66. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +0 -221
  67. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +0 -24
  68. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +0 -42
  69. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +0 -7093
  70. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +0 -694
  71. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +0 -5
  72. package/src/llama.cpp/examples/gritlm/gritlm.cpp +0 -229
  73. package/src/llama.cpp/examples/jeopardy/questions.txt +0 -100
  74. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +0 -65
  75. package/src/llama.cpp/examples/llama.android/build.gradle.kts +0 -6
  76. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +0 -71
  77. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +0 -53
  78. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +0 -452
  79. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +0 -18
  80. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +0 -5
  81. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -472
  82. package/src/llama.cpp/examples/lookup/CMakeLists.txt +0 -23
  83. package/src/llama.cpp/examples/lookup/lookup-create.cpp +0 -40
  84. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +0 -47
  85. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -157
  86. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -242
  87. package/src/llama.cpp/examples/parallel/CMakeLists.txt +0 -5
  88. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -492
  89. package/src/llama.cpp/examples/passkey/CMakeLists.txt +0 -5
  90. package/src/llama.cpp/examples/passkey/passkey.cpp +0 -277
  91. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +0 -5
  92. package/src/llama.cpp/examples/retrieval/retrieval.cpp +0 -304
  93. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -5
  94. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -246
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +0 -5
  96. package/src/llama.cpp/examples/simple/simple.cpp +0 -206
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +0 -5
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +0 -206
  99. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +0 -11
  100. package/src/llama.cpp/examples/speculative/CMakeLists.txt +0 -5
  101. package/src/llama.cpp/examples/speculative/speculative.cpp +0 -644
  102. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +0 -5
  103. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +0 -261
  104. package/src/llama.cpp/examples/sycl/CMakeLists.txt +0 -9
  105. package/src/llama.cpp/examples/sycl/build.sh +0 -23
  106. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +0 -13
  107. package/src/llama.cpp/examples/sycl/run-llama2.sh +0 -27
  108. package/src/llama.cpp/examples/sycl/run-llama3.sh +0 -28
  109. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +0 -33
  110. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +0 -9
  111. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +0 -9
  112. package/src/llama.cpp/examples/training/CMakeLists.txt +0 -5
  113. package/src/llama.cpp/examples/training/finetune.cpp +0 -96
  114. package/src/llama.cpp/ggml/cmake/GitVars.cmake +0 -22
  115. package/src/llama.cpp/ggml/cmake/common.cmake +0 -26
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1042
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -255
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -586
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +0 -2008
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +0 -87
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +0 -517
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -74
  123. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +0 -179
  124. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +0 -258
  125. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +0 -2863
  126. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +0 -1110
  127. package/src/llama.cpp/ggml/src/ggml-cann/common.h +0 -420
  128. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -2570
  129. package/src/llama.cpp/ggml/src/ggml-common.h +0 -1857
  130. package/src/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +0 -100
  131. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +0 -184
  132. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +0 -15
  133. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +0 -243
  134. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +0 -140
  135. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -131
  136. package/src/llama.cpp/ggml/src/ggml-impl.h +0 -601
  137. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  138. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  139. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +0 -120
  140. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +0 -622
  141. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -113
  142. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +0 -96
  143. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -5124
  144. package/src/llama.cpp/ggml/src/ggml-opt.cpp +0 -1037
  145. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -5232
  146. package/src/llama.cpp/ggml/src/ggml-quants.h +0 -100
  147. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +0 -9
  148. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +0 -1813
  149. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +0 -189
  150. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +0 -37
  151. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +0 -239
  152. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +0 -39
  153. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -83
  154. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +0 -493
  155. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +0 -197
  156. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +0 -20
  157. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +0 -100
  158. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +0 -20
  159. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +0 -623
  160. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +0 -34
  161. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -701
  162. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +0 -11
  163. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +0 -791
  164. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +0 -1160
  165. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +0 -27
  166. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +0 -2957
  167. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -1536
  168. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +0 -75
  169. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +0 -99
  170. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +0 -311
  171. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +0 -20
  172. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -4443
  173. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +0 -105
  174. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +0 -8
  175. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +0 -136
  176. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +0 -21
  177. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -3030
  178. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +0 -33
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +0 -1108
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +0 -27
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +0 -474
  182. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +0 -26
  183. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +0 -46
  184. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +0 -10
  185. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +0 -74
  186. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +0 -83
  187. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +0 -362
  188. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +0 -20
  189. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +0 -264
  190. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +0 -20
  191. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +0 -13
  192. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +0 -23
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +0 -73
  194. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +0 -20
  195. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +0 -1215
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +0 -305
  197. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +0 -10
  198. package/src/llama.cpp/ggml/src/ggml-threading.cpp +0 -12
  199. package/src/llama.cpp/ggml/src/ggml-threading.h +0 -14
  200. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +0 -196
  201. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +0 -10699
  202. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -39
  203. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +0 -751
  204. package/src/llama.cpp/ggml/src/ggml.c +0 -6550
  205. package/src/llama.cpp/ggml/src/gguf.cpp +0 -1330
  206. package/src/llama.cpp/models/.editorconfig +0 -1
  207. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  208. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  209. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  210. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  211. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  212. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  213. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  214. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  215. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  216. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  217. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  219. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  220. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  221. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  222. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  223. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  225. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  227. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  228. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  230. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  231. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  232. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  233. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  236. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  237. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  239. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  240. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  241. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  242. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  245. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  248. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  249. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  256. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  257. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  259. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  260. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  261. package/src/llama.cpp/pocs/CMakeLists.txt +0 -14
  262. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +0 -9
  263. package/src/llama.cpp/pocs/vdot/q8dot.cpp +0 -173
  264. package/src/llama.cpp/pocs/vdot/vdot.cpp +0 -311
  265. package/src/llama.cpp/prompts/LLM-questions.txt +0 -49
  266. package/src/llama.cpp/prompts/alpaca.txt +0 -1
  267. package/src/llama.cpp/prompts/assistant.txt +0 -31
  268. package/src/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  269. package/src/llama.cpp/prompts/chat-with-bob.txt +0 -7
  270. package/src/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  271. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  272. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  273. package/src/llama.cpp/prompts/chat.txt +0 -28
  274. package/src/llama.cpp/prompts/dan-modified.txt +0 -1
  275. package/src/llama.cpp/prompts/dan.txt +0 -1
  276. package/src/llama.cpp/prompts/mnemonics.txt +0 -93
  277. package/src/llama.cpp/prompts/parallel-questions.txt +0 -43
  278. package/src/llama.cpp/prompts/reason-act.txt +0 -18
  279. package/src/llama.cpp/requirements/requirements-all.txt +0 -15
  280. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +0 -2
  281. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +0 -7
  282. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -7
  283. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +0 -5
  284. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +0 -1
  285. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +0 -4
  286. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +0 -3
  287. package/src/llama.cpp/requirements/requirements-pydantic.txt +0 -3
  288. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +0 -1
  289. package/src/llama.cpp/requirements/requirements-tool_bench.txt +0 -12
  290. package/src/llama.cpp/requirements.txt +0 -13
  291. package/src/llama.cpp/scripts/build-info.sh +0 -30
  292. package/src/llama.cpp/scripts/install-oneapi.bat +0 -19
  293. package/src/llama.cpp/scripts/xxd.cmake +0 -16
  294. package/src/llama.cpp/tests/CMakeLists.txt +0 -177
  295. package/src/llama.cpp/tests/get-model.cpp +0 -21
  296. package/src/llama.cpp/tests/get-model.h +0 -2
  297. package/src/llama.cpp/tests/test-arg-parser.cpp +0 -178
  298. package/src/llama.cpp/tests/test-autorelease.cpp +0 -24
  299. package/src/llama.cpp/tests/test-backend-ops.cpp +0 -4793
  300. package/src/llama.cpp/tests/test-barrier.cpp +0 -94
  301. package/src/llama.cpp/tests/test-c.c +0 -7
  302. package/src/llama.cpp/tests/test-chat-template.cpp +0 -417
  303. package/src/llama.cpp/tests/test-chat.cpp +0 -985
  304. package/src/llama.cpp/tests/test-double-float.cpp +0 -57
  305. package/src/llama.cpp/tests/test-gbnf-validator.cpp +0 -109
  306. package/src/llama.cpp/tests/test-gguf.cpp +0 -1338
  307. package/src/llama.cpp/tests/test-grammar-integration.cpp +0 -1308
  308. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +0 -1201
  309. package/src/llama.cpp/tests/test-grammar-parser.cpp +0 -519
  310. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +0 -1304
  311. package/src/llama.cpp/tests/test-llama-grammar.cpp +0 -408
  312. package/src/llama.cpp/tests/test-log.cpp +0 -39
  313. package/src/llama.cpp/tests/test-model-load-cancel.cpp +0 -27
  314. package/src/llama.cpp/tests/test-mtmd-c-api.c +0 -63
  315. package/src/llama.cpp/tests/test-opt.cpp +0 -904
  316. package/src/llama.cpp/tests/test-quantize-fns.cpp +0 -186
  317. package/src/llama.cpp/tests/test-quantize-perf.cpp +0 -365
  318. package/src/llama.cpp/tests/test-quantize-stats.cpp +0 -424
  319. package/src/llama.cpp/tests/test-regex-partial.cpp +0 -288
  320. package/src/llama.cpp/tests/test-rope.cpp +0 -262
  321. package/src/llama.cpp/tests/test-sampling.cpp +0 -399
  322. package/src/llama.cpp/tests/test-tokenizer-0.cpp +0 -312
  323. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +0 -155
  324. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +0 -125
  325. package/src/llama.cpp/tools/CMakeLists.txt +0 -39
  326. package/src/llama.cpp/tools/batched-bench/CMakeLists.txt +0 -5
  327. package/src/llama.cpp/tools/batched-bench/batched-bench.cpp +0 -204
  328. package/src/llama.cpp/tools/cvector-generator/CMakeLists.txt +0 -5
  329. package/src/llama.cpp/tools/cvector-generator/completions.txt +0 -582
  330. package/src/llama.cpp/tools/cvector-generator/cvector-generator.cpp +0 -508
  331. package/src/llama.cpp/tools/cvector-generator/mean.hpp +0 -48
  332. package/src/llama.cpp/tools/cvector-generator/negative.txt +0 -4
  333. package/src/llama.cpp/tools/cvector-generator/pca.hpp +0 -315
  334. package/src/llama.cpp/tools/cvector-generator/positive.txt +0 -4
  335. package/src/llama.cpp/tools/export-lora/CMakeLists.txt +0 -5
  336. package/src/llama.cpp/tools/export-lora/export-lora.cpp +0 -434
  337. package/src/llama.cpp/tools/gguf-split/CMakeLists.txt +0 -5
  338. package/src/llama.cpp/tools/gguf-split/gguf-split.cpp +0 -583
  339. package/src/llama.cpp/tools/imatrix/CMakeLists.txt +0 -5
  340. package/src/llama.cpp/tools/imatrix/imatrix.cpp +0 -667
  341. package/src/llama.cpp/tools/llama-bench/CMakeLists.txt +0 -5
  342. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +0 -2024
  343. package/src/llama.cpp/tools/main/CMakeLists.txt +0 -5
  344. package/src/llama.cpp/tools/main/main.cpp +0 -977
  345. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +0 -58
  346. package/src/llama.cpp/tools/mtmd/clip-impl.h +0 -462
  347. package/src/llama.cpp/tools/mtmd/clip.cpp +0 -4024
  348. package/src/llama.cpp/tools/mtmd/clip.h +0 -101
  349. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +0 -22
  350. package/src/llama.cpp/tools/mtmd/miniaudio.h +0 -93468
  351. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +0 -855
  352. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +0 -62
  353. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +0 -377
  354. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +0 -297
  355. package/src/llama.cpp/tools/mtmd/mtmd.cpp +0 -942
  356. package/src/llama.cpp/tools/mtmd/mtmd.h +0 -362
  357. package/src/llama.cpp/tools/mtmd/requirements.txt +0 -5
  358. package/src/llama.cpp/tools/perplexity/CMakeLists.txt +0 -5
  359. package/src/llama.cpp/tools/perplexity/perplexity.cpp +0 -2063
  360. package/src/llama.cpp/tools/quantize/CMakeLists.txt +0 -6
  361. package/src/llama.cpp/tools/quantize/quantize.cpp +0 -519
  362. package/src/llama.cpp/tools/rpc/CMakeLists.txt +0 -4
  363. package/src/llama.cpp/tools/rpc/rpc-server.cpp +0 -322
  364. package/src/llama.cpp/tools/run/CMakeLists.txt +0 -16
  365. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.cpp +0 -1995
  366. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.h +0 -137
  367. package/src/llama.cpp/tools/run/run.cpp +0 -1261
  368. package/src/llama.cpp/tools/server/CMakeLists.txt +0 -51
  369. package/src/llama.cpp/tools/server/bench/requirements.txt +0 -2
  370. package/src/llama.cpp/tools/server/httplib.h +0 -10506
  371. package/src/llama.cpp/tools/server/server.cpp +0 -4966
  372. package/src/llama.cpp/tools/server/tests/requirements.txt +0 -8
  373. package/src/llama.cpp/tools/server/utils.hpp +0 -1337
  374. package/src/llama.cpp/tools/tokenize/CMakeLists.txt +0 -5
  375. package/src/llama.cpp/tools/tokenize/tokenize.cpp +0 -416
  376. package/src/llama.cpp/tools/tts/CMakeLists.txt +0 -5
  377. package/src/llama.cpp/tools/tts/tts.cpp +0 -1092
package/src/llama.cpp/tools/perplexity/perplexity.cpp
@@ -1,2063 +0,0 @@
1
- #include "arg.h"
2
- #include "common.h"
3
- #include "log.h"
4
- #include "llama.h"
5
-
6
- #include <chrono>
7
- #include <algorithm>
8
- #include <array>
9
- #include <atomic>
10
- #include <cmath>
11
- #include <cstdio>
12
- #include <cstring>
13
- #include <ctime>
14
- #include <fstream>
15
- #include <mutex>
16
- #include <random>
17
- #include <sstream>
18
- #include <thread>
19
- #include <vector>
20
-
21
- #if defined(_MSC_VER)
22
- #pragma warning(disable: 4244 4267) // possible loss of data
23
- #endif
24
-
25
- struct results_perplexity {
26
- std::vector<llama_token> tokens;
27
- double ppl_value;
28
- std::vector<float> logits;
29
- std::vector<float> probs;
30
- };
31
-
32
- struct results_log_softmax {
33
- double log_softmax;
34
- float logit;
35
- float prob;
36
- };
37
-
38
- static std::vector<float> softmax(const std::vector<float>& logits) {
39
- std::vector<float> probs(logits.size());
40
- float max_logit = logits[0];
41
- for (float v : logits) {
42
- max_logit = std::max(max_logit, v);
43
- }
44
- double sum_exp = 0.0;
45
- for (size_t i = 0; i < logits.size(); i++) {
46
- // Subtract the maximum logit value from the current logit value for numerical stability
47
- const float logit = logits[i] - max_logit;
48
- const float exp_logit = expf(logit);
49
- sum_exp += exp_logit;
50
- probs[i] = exp_logit;
51
- }
52
- for (size_t i = 0; i < probs.size(); i++) {
53
- probs[i] /= sum_exp;
54
- }
55
- return probs;
56
- }
57
-
58
- static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
59
- float max_logit = logits[0];
60
- for (int i = 1; i < n_vocab; ++i) {
61
- max_logit = std::max(max_logit, logits[i]);
62
- }
63
- double sum_exp = 0.0;
64
- for (int i = 0; i < n_vocab; ++i) {
65
- sum_exp += expf(logits[i] - max_logit);
66
- }
67
- return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
68
- }
69
-
70
- static inline int nearest_int(float fval) {
71
- //assert(fval <= 4194303.f);
72
- float val = fval + 12582912.f;
73
- int i; memcpy(&i, &val, sizeof(int));
74
- return (i & 0x007fffff) - 0x00400000;
75
- }
76
-
77
- static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
78
- float max_logit = logits[0];
79
- float min_logit = logits[0];
80
- for (int i = 1; i < n_vocab; ++i) {
81
- max_logit = std::max(max_logit, logits[i]);
82
- min_logit = std::min(min_logit, logits[i]);
83
- }
84
- min_logit = std::max(min_logit, max_logit - 16);
85
- double sum_exp = 0.0;
86
- for (int i = 0; i < n_vocab; ++i) {
87
- sum_exp += expf(logits[i] - max_logit);
88
- }
89
- const float log_sum_exp = log(sum_exp);
90
- const float min_log_prob = min_logit - max_logit - log_sum_exp;
91
- const float scale = (max_logit - min_logit)/65535.f;
92
- float * d = (float *)log_prob;
93
- d[0] = scale;
94
- d[1] = min_log_prob;
95
- log_prob += 4;
96
- if (scale) {
97
- const float inv_scale = 1/scale;
98
- for (int i = 0; i < n_vocab; ++i) {
99
- log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0;
100
- }
101
- } else {
102
- std::memset(log_prob, 0, n_vocab*sizeof(uint16_t));
103
- }
104
- return max_logit + log_sum_exp - logits[tok];
105
- }
106
-
107
- static void process_logits(
108
- int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
109
- double & nll, double & nll2, float * logit_history, float * prob_history
110
- ) {
111
- std::mutex mutex;
112
- int counter = 0;
113
- auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
114
- double local_nll = 0;
115
- double local_nll2 = 0;
116
- while (true) {
117
- std::unique_lock<std::mutex> lock(mutex);
118
- int i = counter++;
119
- if (i >= n_token) {
120
- nll += local_nll; nll2 += local_nll2;
121
- break;
122
- }
123
- lock.unlock();
124
- const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
125
- const double v = -results.log_softmax;
126
- local_nll += v;
127
- local_nll2 += v*v;
128
-
129
- logit_history[i] = results.logit;
130
- prob_history[i] = results.prob;
131
- }
132
- };
133
- for (auto & w : workers) {
134
- w = std::thread(compute);
135
- }
136
- compute();
137
- for (auto & w : workers) {
138
- w.join();
139
- }
140
- }
141
-
142
- static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
143
- std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
144
- std::mutex mutex;
145
- const int nv = 2*((n_vocab + 1)/2) + 4;
146
- int counter = 0;
147
- auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () {
148
- double local_nll = 0;
149
- double local_nll2 = 0;
150
- while (true) {
151
- std::unique_lock<std::mutex> lock(mutex);
152
- int i = counter++;
153
- if (i >= n_token) {
154
- nll += local_nll; nll2 += local_nll2;
155
- break;
156
- }
157
- lock.unlock();
158
- const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
159
- local_nll += v;
160
- local_nll2 += v*v;
161
- }
162
- };
163
- for (auto & w : workers) {
164
- w = std::thread(compute);
165
- }
166
- compute();
167
- for (auto & w : workers) {
168
- w.join();
169
- }
170
- out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
171
- }
172
-
173
- struct kl_divergence_result {
174
- double sum_nll = 0.0;
175
- double sum_nll2 = 0.0;
176
- double sum_nll_base = 0.0;
177
- double sum_nll_base2 = 0.0;
178
- double sum_nll_nll_base = 0.0;
179
- double sum_kld = 0.0;
180
- double sum_kld2 = 0.0;
181
- double sum_p_diff = 0.0;
182
- double sum_p_diff2 = 0.0;
183
- double sum_p_diff4 = 0.0;
184
- float max_p_diff = 0.0f;
185
- size_t n_same_top = 0.0;
186
- size_t count = 0.0;
187
- };
188
-
189
- static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
190
- float max_logit = logits[0];
191
- int imax = 0;
192
- for (int i = 1; i < n_vocab; ++i) {
193
- if (logits[i] > max_logit) {
194
- max_logit = logits[i];
195
- imax = i;
196
- }
197
- }
198
- double sum_exp = 0.0;
199
- for (int i = 0; i < n_vocab; ++i) {
200
- sum_exp += expf(logits[i] - max_logit);
201
- }
202
- const float log_sum_exp = log(sum_exp);
203
- const float * d = (const float *)base_log_prob;
204
- const float scale = d[0];
205
- const float min_log_prob = d[1];
206
- base_log_prob += 4;
207
-
208
- const float nll = max_logit + log_sum_exp - logits[tok];
209
- kld.sum_nll += nll;
210
- kld.sum_nll2 += nll*nll;
211
-
212
- const float nll_base = -(scale*base_log_prob[tok] + min_log_prob);
213
- kld.sum_nll_base += nll_base;
214
- kld.sum_nll_base2 += nll_base*nll_base;
215
-
216
- kld.sum_nll_nll_base += nll*nll_base;
217
-
218
- max_logit += log_sum_exp;
219
- double sum = 0;
220
- int imax_base = -1;
221
- float p_log_base_max = 0;
222
- for (int i = 0; i < n_vocab; ++i) {
223
- const float p_log_base = scale*base_log_prob[i] + min_log_prob;
224
- if (i == 0 || p_log_base > p_log_base_max) {
225
- p_log_base_max = p_log_base;
226
- imax_base = i;
227
- }
228
- if (p_log_base > -16.f) {
229
- const float p_base = expf(p_log_base);
230
- sum += p_base * (p_log_base - logits[i] + max_logit);
231
- }
232
- }
233
- kld.sum_kld += sum;
234
- kld.sum_kld2 += sum*sum;
235
- ++kld.count;
236
- if (imax == imax_base) {
237
- ++kld.n_same_top;
238
- }
239
-
240
- const float p_base = expf(-nll_base);
241
- const float p = expf(-nll);
242
- const float p_diff = p - p_base;
243
- kld.sum_p_diff += p_diff;
244
- const double p_diff2 = p_diff*p_diff;
245
- kld.sum_p_diff2 += p_diff2;
246
- kld.sum_p_diff4 += p_diff2*p_diff2;
247
- kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff));
248
-
249
- return std::make_pair(sum, p_diff);
250
- }
251
-
252
- static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
253
- std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
254
- float * kld_values, float * p_diff_values) {
255
- std::mutex mutex;
256
- const int nv = 2*((n_vocab + 1)/2) + 4;
257
- int counter = 0;
258
- auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () {
259
- kl_divergence_result local_kld;
260
- while (true) {
261
- std::unique_lock<std::mutex> lock(mutex);
262
- int i = counter++;
263
- if (i >= n_token) {
264
- kld.sum_nll += local_kld.sum_nll;
265
- kld.sum_nll2 += local_kld.sum_nll2;
266
- kld.sum_nll_base += local_kld.sum_nll_base;
267
- kld.sum_nll_base2 += local_kld.sum_nll_base2;
268
- kld.sum_nll_nll_base += local_kld.sum_nll_nll_base;
269
- kld.sum_kld += local_kld.sum_kld;
270
- kld.sum_kld2 += local_kld.sum_kld2;
271
- kld.sum_p_diff += local_kld.sum_p_diff;
272
- kld.sum_p_diff2 += local_kld.sum_p_diff2;
273
- kld.sum_p_diff4 += local_kld.sum_p_diff4;
274
- kld.n_same_top += local_kld.n_same_top;
275
- kld.max_p_diff = std::max(kld.max_p_diff, local_kld.max_p_diff);
276
- kld.count += local_kld.count;
277
- break;
278
- }
279
- lock.unlock();
280
- std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
281
- kld_values[i] = (float)v.first;
282
- p_diff_values[i] = v.second;
283
- }
284
- };
285
- for (auto & w : workers) {
286
- w = std::thread(compute);
287
- }
288
- compute();
289
- for (auto & w : workers) {
290
- w.join();
291
- }
292
- }
293
-
294
- static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) {
295
- // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
296
- // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
297
- // Output: `perplexity: 13.5106 [114/114]`
298
- // BOS tokens will be added for each chunk before eval
299
-
300
- const llama_model * model = llama_get_model(ctx);
301
- const llama_vocab * vocab = llama_model_get_vocab(model);
302
-
303
- const bool add_bos = llama_vocab_get_add_bos(vocab);
304
- GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
305
-
306
- LOG_INF("%s: tokenizing the input ..\n", __func__);
307
-
308
- std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
309
-
310
- const int n_ctx = llama_n_ctx(ctx);
311
-
312
- if (int(tokens.size()) < 2*n_ctx) {
313
- LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
314
- n_ctx);
315
- LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
316
- return {std::move(tokens), 0., {}, {}};
317
- }
318
-
319
- std::vector<float> logit_history;
320
- std::vector<float> prob_history;
321
-
322
- logit_history.resize(tokens.size());
323
- prob_history.resize(tokens.size());
324
-
325
- if (params.ppl_stride <= 0) {
326
- LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
327
- return {tokens, -1, logit_history, prob_history};
328
- }
329
-
330
- const int calc_chunk = n_ctx;
331
-
332
- LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
333
-
334
- if (int(tokens.size()) <= calc_chunk) {
335
- LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
336
- tokens.size(), n_ctx, params.ppl_stride);
337
- return {tokens, -1, logit_history, prob_history};
338
- }
339
-
340
- const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
341
-
342
- const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
343
- const int n_batch = params.n_batch;
344
-
345
- const int n_vocab = llama_vocab_n_tokens(vocab);
346
-
347
- int count = 0;
348
- double nll = 0.0;
349
-
350
- LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
351
-
352
- for (int i = 0; i < n_chunk; ++i) {
353
- const int start = i * params.ppl_stride;
354
- const int end = start + calc_chunk;
355
-
356
- const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
357
- //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
358
-
359
- std::vector<float> logits;
360
-
361
- const auto t_start = std::chrono::high_resolution_clock::now();
362
-
363
- // clear the KV cache
364
- llama_kv_self_clear(ctx);
365
-
366
- llama_batch batch = llama_batch_init(n_batch, 0, 1);
367
-
368
- for (int j = 0; j < num_batches; ++j) {
369
- const int batch_start = start + j * n_batch;
370
- const int batch_size = std::min(end - batch_start, n_batch);
371
-
372
- common_batch_clear(batch);
373
- for (int i = 0; i < batch_size; i++) {
374
- common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
375
- }
376
-
377
- //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
378
- if (llama_decode(ctx, batch)) {
379
- //LOG_ERR("%s : failed to eval\n", __func__);
380
- llama_batch_free(batch);
381
- return {tokens, -1, logit_history, prob_history};
382
- }
383
-
384
- // save original token and restore it after eval
385
- const auto token_org = tokens[batch_start];
386
-
387
- // add BOS token for the first batch of each chunk
388
- if (add_bos && j == 0) {
389
- tokens[batch_start] = llama_vocab_bos(vocab);
390
- }
391
-
392
- const auto * batch_logits = llama_get_logits(ctx);
393
- logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
394
-
395
- if (j == 0) {
396
- tokens[batch_start] = token_org;
397
- }
398
- }
399
-
400
- llama_batch_free(batch);
401
-
402
- const auto t_end = std::chrono::high_resolution_clock::now();
403
-
404
- if (i == 0) {
405
- const float t_total = std::chrono::duration<float>(t_end - t_start).count();
406
- LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
407
- int total_seconds = (int)(t_total * n_chunk);
408
- if (total_seconds >= 60*60) {
409
- LOG("%d hours ", total_seconds / (60*60));
410
- total_seconds = total_seconds % (60*60);
411
- }
412
- LOG("%.2f minutes\n", total_seconds / 60.0);
413
- }
414
-
415
- //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
416
- for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
417
- // Calculate probability of next token, given the previous ones.
418
- const std::vector<float> tok_logits(
419
- logits.begin() + size_t(j + 0) * n_vocab,
420
- logits.begin() + size_t(j + 1) * n_vocab);
421
-
422
- const float prob = softmax(tok_logits)[tokens[start + j + 1]];
423
- logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
424
- prob_history[start + j + 1] = prob;
425
-
426
- nll += -std::log(prob);
427
- ++count;
428
- }
429
- // perplexity is e^(average negative log-likelihood)
430
- if (params.ppl_output_type == 0) {
431
- LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
432
- } else {
433
- LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
434
- }
435
- }
436
- LOG("\n");
437
-
438
- return {tokens, std::exp(nll / count), logit_history, prob_history};
439
- }
440
-
441
- static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
442
- if (params.ppl_stride > 0) {
443
- return perplexity_v2(ctx, params);
444
- }
445
-
446
- // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
447
- // Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
448
- // Output: `perplexity: 13.5106 [114/114]`
449
- // BOS tokens will be added for each chunk before eval
450
-
451
- const llama_model * model = llama_get_model(ctx);
452
- const llama_vocab * vocab = llama_model_get_vocab(model);
453
-
454
- const bool add_bos = llama_vocab_get_add_bos(vocab);
455
- GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
456
-
457
- std::ofstream logits_stream;
458
- if (!params.logits_file.empty()) {
459
- logits_stream.open(params.logits_file.c_str(), std::ios::binary);
460
- if (!logits_stream.is_open()) {
461
- LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
462
- return {};
463
- }
464
- LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
465
- logits_stream.write("_logits_", 8);
466
- logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
467
- }
468
-
469
- auto tim1 = std::chrono::high_resolution_clock::now();
470
- LOG_INF("%s: tokenizing the input ..\n", __func__);
471
-
472
- std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
473
-
474
- auto tim2 = std::chrono::high_resolution_clock::now();
475
- LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
476
-
477
- if (int(tokens.size()) < 2*n_ctx) {
478
- LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
479
- n_ctx);
480
- LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
481
- return {std::move(tokens), 0., {}, {}};
482
- }
483
-
484
- std::vector<float> logit_history;
485
- logit_history.resize(tokens.size());
486
-
487
- std::vector<float> prob_history;
488
- prob_history.resize(tokens.size());
489
-
490
- const int n_chunk_max = tokens.size() / n_ctx;
491
-
492
- const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
493
- const int n_batch = params.n_batch;
494
-
495
- const int n_vocab = llama_vocab_n_tokens(vocab);
496
-
497
- int count = 0;
498
- double nll = 0.0;
499
- double nll2 = 0.0;
500
-
501
- const int num_batches = (n_ctx + n_batch - 1) / n_batch;
502
- const int n_seq = std::max(1, n_batch / n_ctx);
503
-
504
- GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
505
- GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
506
-
507
- llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);
508
-
509
- std::vector<float> logits;
510
- if (num_batches > 1) {
511
- logits.reserve(size_t(n_ctx) * n_vocab);
512
- }
513
-
514
- LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
515
-
516
- std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
517
-
518
- std::vector<uint16_t> log_probs;
519
- if (!params.logits_file.empty()) {
520
- logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
521
- logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
522
- logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
523
- const int nv = 2*((n_vocab + 1)/2) + 4;
524
- log_probs.resize(n_ctx * nv);
525
- }
526
-
527
- // We get the logits for all the tokens in the context window (params.n_ctx)
528
- // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
529
- // calculate the perplexity over the last half of the window (so the model always has
530
- // some context to predict the token).
531
- //
532
- // We rely on the fact that attention in the forward pass only looks at previous
533
- // tokens here, so the logits returned for each token are an accurate representation
534
- // of what the model would have predicted at that point.
535
- //
536
- // Example, we have a context window of 512, we will compute perplexity for each of the
537
- // last 256 tokens. Then, we split the input up into context window size chunks to
538
- // process the entire prompt.
539
- const int first = n_ctx/2;
540
-
541
- for (int i = 0; i < n_chunk; i += n_seq) {
542
- const int start = i * n_ctx;
543
- const int end = start + n_ctx;
544
-
545
- const int n_seq_batch = std::min(n_seq, n_chunk - i);
546
-
547
- const auto t_start = std::chrono::high_resolution_clock::now();
548
-
549
- // clear the KV cache
550
- llama_kv_self_clear(ctx);
551
-
552
- for (int j = 0; j < num_batches; ++j) {
553
- const int batch_start = start + j * n_batch;
554
- const int batch_size = std::min(end - batch_start, n_batch);
555
-
556
- int n_outputs = 0;
557
-
558
- batch.n_tokens = 0;
559
- for (int seq = 0; seq < n_seq_batch; seq++) {
560
- int seq_start = batch_start + seq*n_ctx;
561
-
562
- // save original token and restore it after eval
563
- const auto token_org = tokens[seq_start];
564
-
565
- // add BOS token for the first batch of each chunk
566
- if (add_bos && j == 0) {
567
- tokens[seq_start] = llama_vocab_bos(vocab);
568
- }
569
-
570
- for (int k = 0; k < batch_size; ++k) {
571
- const int idx = seq*n_ctx + k;
572
- batch.token [idx] = tokens[seq_start + k];
573
- batch.pos [idx] = j*n_batch + k;
574
- batch.n_seq_id[idx] = 1;
575
- batch.seq_id [idx][0] = seq;
576
- batch.logits [idx] = batch.pos[idx] >= first ? 1 : 0;
577
-
578
- n_outputs += batch.logits[idx] != 0;
579
- }
580
- batch.n_tokens += batch_size;
581
-
582
- // restore the original token in case it was set to BOS
583
- tokens[seq_start] = token_org;
584
- }
585
-
586
- if (llama_decode(ctx, batch)) {
587
- LOG_INF("%s : failed to eval\n", __func__);
588
- return {tokens, -1, logit_history, prob_history};
589
- }
590
-
591
- if (num_batches > 1 && n_outputs > 0) {
592
- const auto * batch_logits = llama_get_logits(ctx);
593
- logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
594
- }
595
- }
596
-
597
-
598
- if (i == 0) {
599
- llama_synchronize(ctx);
600
- const auto t_end = std::chrono::high_resolution_clock::now();
601
- const float t_total = std::chrono::duration<float>(t_end - t_start).count();
602
- LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
603
- int total_seconds = (int)(t_total*n_chunk/n_seq);
604
- if (total_seconds >= 60*60) {
605
- LOG("%d hours ", total_seconds / (60*60));
606
- total_seconds = total_seconds % (60*60);
607
- }
608
- LOG("%.2f minutes\n", total_seconds / 60.0);
609
- }
610
-
611
- for (int seq = 0; seq < n_seq_batch; seq++) {
612
- const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
613
-
614
- llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
615
- if (!params.logits_file.empty()) {
616
- process_logits(logits_stream, n_vocab, all_logits,
617
- tokens_data, n_ctx - 1 - first,
618
- workers, log_probs, nll, nll2);
619
- } else {
620
- process_logits(n_vocab, all_logits,
621
- tokens_data, n_ctx - 1 - first,
622
- workers, nll, nll2,
623
- logit_history.data() + start + seq*n_ctx + first,
624
- prob_history.data() + start + seq*n_ctx + first);
625
- }
626
- count += n_ctx - first - 1;
627
-
628
- // perplexity is e^(average negative log-likelihood)
629
- if (params.ppl_output_type == 0) {
630
- LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
631
- } else {
632
- double av = nll/count;
633
- double av2 = nll2/count - av*av;
634
- if (av2 > 0) {
635
- av2 = sqrt(av2/(count-1));
636
- }
637
- LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
638
- }
639
- }
640
-
641
- logits.clear();
642
- }
643
- LOG("\n");
644
-
645
- nll2 /= count;
646
- nll /= count;
647
- const double ppl = exp(nll);
648
- nll2 -= nll * nll;
649
- if (nll2 > 0) {
650
- nll2 = sqrt(nll2/(count-1));
651
- LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
652
- } else {
653
- LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
654
- }
655
-
656
- llama_batch_free(batch);
657
-
658
- return {tokens, ppl, logit_history, prob_history};
659
- }
660
-
661
- static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
662
- int prev_outputs = 0;
663
- for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
664
- const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);
665
-
666
- llama_batch batch_view = {
667
- n_tokens,
668
- batch.token + i,
669
- nullptr,
670
- batch.pos + i,
671
- batch.n_seq_id + i,
672
- batch.seq_id + i,
673
- batch.logits + i,
674
- };
675
-
676
- const int ret = llama_decode(ctx, batch_view);
677
- if (ret != 0) {
678
- LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
679
- return false;
680
- }
681
-
682
- int n_outputs = 0;
683
- for (int i = 0; i < n_tokens; ++i) {
684
- n_outputs += batch_view.logits[i] != 0;
685
- }
686
-
687
- memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));
688
-
689
- prev_outputs += n_outputs;
690
- }
691
-
692
- return true;
693
- }
694
-
695
- #define K_TOKEN_CHUNK 4
696
-
697
- static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
698
- const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
699
- if (eval_results.size() != eval_pairs.size()) {
700
- eval_results.resize(eval_pairs.size());
701
- }
702
- if (eval_pairs.empty()) {
703
- return;
704
- }
705
-
706
- size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
707
-
708
- std::atomic<int> counter(0);
709
- auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
710
- float local_logprobs[K_TOKEN_CHUNK];
711
- while (true) {
712
- const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
713
- if (first >= eval_results.size()) {
714
- break;
715
- }
716
- const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
717
- for (size_t i = first; i < last; ++i) {
718
- const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
719
- float max_logit = logits[0];
720
- for (int j = 1; j < n_vocab; ++j) {
721
- max_logit = std::max(max_logit, logits[j]);
722
- }
723
- float sum_p = 0.f;
724
- for (int j = 0; j < n_vocab; ++j) {
725
- sum_p += expf(logits[j] - max_logit);
726
- }
727
- local_logprobs[i - first] = logits[eval_pairs[i].second] - max_logit - std::log(sum_p);
728
- }
729
- std::memcpy(eval_results.data() + first, local_logprobs, (last - first)*sizeof(float));
730
- }
731
- };
732
-
733
- for (size_t it = 0; it < max_threads; ++it) {
734
- workers[it] = std::thread(compute);
735
- }
736
- for (size_t it = 0; it < max_threads; ++it) {
737
- workers[it].join();
738
- }
739
- }
740
-
741
- static void hellaswag_score(llama_context * ctx, const common_params & params) {
742
- const llama_model * model = llama_get_model(ctx);
743
- const llama_vocab * vocab = llama_model_get_vocab(model);
744
-
745
- // Calculates hellaswag score (acc_norm) from prompt
746
- //
747
- // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
748
- // All used data fields are preprocessed as in https://github.com/EleutherAI/lm-evaluation-harness/blob/df3da98c5405deafd519c2ddca52bb7c3fe36bef/lm_eval/tasks/hellaswag.py#L62-L68
749
- //
750
- // All 10042 tasks should be extracted to keep the results standardized like other implementations.
751
- //
752
- // Datafile layout:
753
- // ['??'] denotes json fields
754
- // 6 lines per task:
755
- // ['activity_label'] + ": " +['ctx'] - The first part of the query, the context
756
- // ['label'] - The index the best common sense ending aka gold ending
757
- // ['endings'][0] - Endings added to the first part of the query
758
- // ['endings'][1]
759
- // ['endings'][2]
760
- // ['endings'][3]
761
-
762
- std::vector<std::string> prompt_lines;
763
- std::istringstream strstream(params.prompt);
764
- std::string line;
765
-
766
- while (std::getline(strstream,line,'\n')) {
767
- prompt_lines.push_back(line);
768
- }
769
-
770
- if (prompt_lines.size() % 6 != 0) {
771
- LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
772
- return;
773
- }
774
-
775
- size_t hs_task_count = prompt_lines.size()/6;
776
- LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
777
-
778
- const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM;
779
- LOG_INF("================================= is_spm = %d\n", is_spm);
780
-
781
- // The tasks should be randomized so the score stabilizes quickly.
782
- bool randomize_tasks = true;
783
-
784
- // Number of tasks to use when computing the score
785
- if (params.hellaswag_tasks < hs_task_count) {
786
- hs_task_count = params.hellaswag_tasks;
787
- }
788
-
789
- // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
790
- std::mt19937 rng(1);
791
-
792
- // Dataholder for hellaswag tasks
793
- struct hs_data_t {
794
- std::string context;
795
- size_t gold_ending_idx;
796
- std::string ending[4];
797
- size_t ending_logprob_count[4];
798
- double ending_logprob[4];
799
-
800
- size_t i_logits; // starting index of logits in the llama_batch
801
- size_t common_prefix; // max number of initial tokens that are the same in all sentences
802
- size_t required_tokens; // needed number of tokens to evaluate all 4 endings
803
- std::vector<llama_token> seq_tokens[4];
804
- };
805
-
806
- LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
807
-
808
- // Select and read data from prompt lines
809
- std::vector<hs_data_t> hs_data(hs_task_count);
810
- for (size_t i = 0; i < hs_task_count; i++) {
811
- size_t idx = i;
812
-
813
- auto & hs_cur = hs_data[i];
814
-
815
- // Select a random example of those left in the prompt
816
- if (randomize_tasks) {
817
- std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
818
- idx = dist(rng);
819
- }
820
-
821
- hs_cur.context = prompt_lines[idx*6];
822
- hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
823
- for (size_t j = 0; j < 4; j++) {
824
- hs_cur.ending[j] = prompt_lines[idx*6+2+j];
825
- hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
826
- }
827
-
828
- // determine the common prefix of the endings
829
- hs_cur.common_prefix = 0;
830
- for (size_t k = 0; k < hs_cur.seq_tokens[0].size(); k++) {
831
- if (hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[1][k] ||
832
- hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[2][k] ||
833
- hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[3][k]) {
834
- break;
835
- }
836
- hs_cur.common_prefix++;
837
- }
838
- hs_cur.required_tokens = hs_cur.common_prefix +
839
- hs_cur.seq_tokens[0].size() - hs_cur.common_prefix +
840
- hs_cur.seq_tokens[1].size() - hs_cur.common_prefix +
841
- hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
842
- hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
843
-
844
- //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
845
-
846
- // Delete the selected random example from the prompt
847
- if (randomize_tasks) {
848
- prompt_lines.erase( std::next(prompt_lines.begin(),idx*6) , std::next(prompt_lines.begin(),idx*6+6) );
849
- }
850
- }
851
-
852
- LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
853
-
854
- LOG("\ntask\tacc_norm\t95%% confidence interval\n");
855
-
856
- double acc = 0.0f;
857
-
858
- const int n_ctx = llama_n_ctx(ctx);
859
- const int n_batch = params.n_batch;
860
-
861
- const int n_vocab = llama_vocab_n_tokens(vocab);
862
-
863
- const int max_tasks_per_batch = 32;
864
- const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
865
-
866
- llama_batch batch = llama_batch_init(n_ctx, 0, 4);
867
-
868
- std::vector<float> tok_logits(n_vocab);
869
- // TODO: this could be made smaller; it's currently the worst-case size
870
- std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
871
-
872
- std::vector<std::pair<size_t, llama_token>> eval_pairs;
873
- std::vector<float> eval_results;
874
- std::vector<std::thread> workers(std::thread::hardware_concurrency());
875
-
876
- for (size_t i0 = 0; i0 < hs_task_count; i0++) {
877
- int n_cur = 0;
878
-
879
- size_t i1 = i0;
880
- size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
881
-
882
- common_batch_clear(batch);
883
-
884
-         // batch as many tasks as possible into the available context
885
- // each task has 4 unique sequence ids - one for each ending
886
- // the common prefix is shared among the 4 sequences to save tokens
887
- // we extract logits only from the last common token and from all ending tokens of each sequence
888
- while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
889
- auto & hs_cur = hs_data[i1];
890
- int n_logits = 0;
891
-
892
- const int s0 = 4*(i1 - i0);
893
- if (s0 + 4 > max_seq) {
894
- break;
895
- }
896
-
897
- for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
898
- common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
899
- }
900
- batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
901
- n_logits += 1;
902
-
903
- for (int s = 0; s < 4; ++s) {
904
- const size_t seq_tokens_size = hs_cur.seq_tokens[s].size();
905
- // TODO: don't evaluate the last token of each sequence
906
- for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
907
- const bool needs_logits = i < seq_tokens_size - 1;
908
- common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
909
- n_logits += needs_logits;
910
- }
911
- }
912
-
913
- hs_cur.i_logits = i_logits;
914
- i_logits += n_logits;
915
-
916
- n_cur += hs_data[i1].required_tokens;
917
- if (++i1 == hs_task_count) {
918
- break;
919
- }
920
- }
921
-
922
- if (i0 == i1) {
923
- LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
924
- return;
925
- }
926
-
927
- llama_kv_self_clear(ctx);
928
-
929
- // decode all tasks [i0, i1)
930
- if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
931
- LOG_ERR("%s: llama_decode() failed\n", __func__);
932
- return;
933
- }
934
-
935
- // Compute log-probs in parallel
936
- // First we collect all tasks
937
- eval_pairs.clear();
938
- for (size_t i = i0; i < i1; ++i) {
939
- auto & hs_cur = hs_data[i];
940
- size_t li = 1; // skip the last logit of the common prefix (computed separately below)
941
- for (int s = 0; s < 4; ++s) {
942
- for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
943
- eval_pairs.emplace_back(hs_cur.i_logits + li++, hs_cur.seq_tokens[s][j + 1]);
944
- }
945
- }
946
- }
947
- // Then we do the actual calculation
948
- compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
949
-
950
- size_t ir = 0;
951
-
952
- // compute the logprobs for each ending of the decoded tasks
953
- for (size_t i = i0; i < i1; ++i) {
954
- auto & hs_cur = hs_data[i];
955
-
956
- // get the logits of the last token of the common prefix
957
- std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));
958
-
959
- const auto first_probs = softmax(tok_logits);
960
-
961
- for (int s = 0; s < 4; ++s) {
962
- hs_cur.ending_logprob_count[s] = 1;
963
- hs_cur.ending_logprob[s] = std::log(first_probs[hs_cur.seq_tokens[s][hs_cur.common_prefix]]);
964
- for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
965
- hs_cur.ending_logprob[s] += eval_results[ir++];
966
- hs_cur.ending_logprob_count[s]++;
967
- }
968
- hs_cur.ending_logprob[s] /= hs_cur.ending_logprob_count[s];
969
- }
970
-
971
- // Find the ending with maximum logprob
972
- size_t ending_logprob_max_idx = 0;
973
- double ending_logprob_max_val = hs_cur.ending_logprob[0];
974
- for (size_t s = 1; s < 4; s++) {
975
- if (hs_cur.ending_logprob[s] > ending_logprob_max_val) {
976
- ending_logprob_max_idx = s;
977
- ending_logprob_max_val = hs_cur.ending_logprob[s];
978
- }
979
- }
980
-
981
- //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
982
-
983
-             // If the gold ending got the maximum logprob, add one accuracy point
984
- if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
985
- acc += 1.0;
986
- }
987
-
988
- double freq = acc / double(i + 1);
989
-
990
- const double za = 1.95996398454;
991
-
992
- // // Wald normal approx
993
- // double conf =za*sqrt(freq*(1-freq)/double(i + 1));
994
- // LOG("%zu\t%.8lf +/- %.8lf\n", i + 1, freq*100.0, conf*100.0);
995
-
996
- // Wilson score interval, more accurate
997
- double z = za * za / double(i + 1);
998
- double cnf = z * sqrt(double(i + 1) * (4.0 * freq * (1 - freq) + z)) / (za + za);
999
- double a = (freq + z * 0.5 - cnf) / (1.0 + z);
1000
- double b = (freq + z * 0.5 + cnf) / (1.0 + z);
1001
-
1002
- // Print the accumulated accuracy mean x 100 and confidence interval
1003
- LOG("%zu\t%3.8lf%%\t[%3.4lf%%, %3.4lf%%]\n", i + 1, freq * 100.0, a * 100.0, b * 100.0);
1004
- }
1005
-
1006
- i0 = i1 - 1;
1007
- }
1008
-
1009
- llama_batch_free(batch);
1010
-
1011
- LOG("\n");
1012
- }
1013
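For reference, the running [a, b] interval printed for each task above is the 95% Wilson score interval for the observed accuracy p̂ = acc / n after n tasks, with z ≈ 1.96:

\[
\frac{\hat p + \frac{z^2}{2n} \;\pm\; z \sqrt{ \dfrac{\hat p (1 - \hat p)}{n} + \dfrac{z^2}{4 n^2} } }{1 + \dfrac{z^2}{n}}
\]

This is exactly what the z, cnf, a and b variables compute in the loop above; the commented-out Wald form p̂ ± z·sqrt(p̂(1−p̂)/n) is the simpler approximation that the code itself notes is less accurate, particularly when the accuracy is close to 0 or 1.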
-
1014
- struct winogrande_entry {
1015
- std::string first;
1016
- std::string second;
1017
- std::array<std::string, 2> choices;
1018
- int answer;
1019
-
1020
- size_t i_logits;
1021
- size_t common_prefix;
1022
- size_t required_tokens;
1023
- size_t n_base1; // number of tokens for context + choice 1
1024
- size_t n_base2; // number of tokens for context + choice 2
1025
- std::vector<llama_token> seq_tokens[2];
1026
- };
1027
-
1028
- static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
1029
- std::vector<winogrande_entry> result;
1030
- std::istringstream in(prompt);
1031
- std::string line;
1032
- std::array<int, 4> comma_pos;
1033
- while (true) {
1034
- std::getline(in, line);
1035
- if (in.fail() || in.eof()) break;
1036
- int ipos = 0;
1037
- bool quote_open = false;
1038
- for (int i = 0; i < int(line.size()); ++i) {
1039
- if (!quote_open) {
1040
- if (line[i] == ',') {
1041
- comma_pos[ipos++] = i;
1042
- if (ipos == 4) break;
1043
- }
1044
- else if (line[i] == '"') {
1045
- quote_open = true;
1046
- }
1047
- }
1048
- else {
1049
- if (line[i] == '"') {
1050
- quote_open = false;
1051
- }
1052
- }
1053
- }
1054
- if (ipos != 4) {
1055
- LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
1056
- continue;
1057
- }
1058
- auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
1059
- : line.substr(comma_pos[0]+1, comma_pos[1] - comma_pos[0] - 1);
1060
- auto choice1 = line.substr(comma_pos[1]+1, comma_pos[2] - comma_pos[1] - 1);
1061
- auto choice2 = line.substr(comma_pos[2]+1, comma_pos[3] - comma_pos[2] - 1);
1062
- auto answer = line.substr(comma_pos[3]+1, line.size() - comma_pos[3] - 1);
1063
- auto index = line.substr(0, comma_pos[0]);
1064
- int where = 0;
1065
- for ( ; where < int(sentence.size()); ++where) {
1066
- if (sentence[where] == '_') break;
1067
- }
1068
- if (where == int(sentence.size())) {
1069
- LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
1070
- continue;
1071
- }
1072
- std::istringstream stream(answer.c_str());
1073
- int i_answer; stream >> i_answer;
1074
- if (stream.fail() || i_answer < 1 || i_answer > 2) {
1075
- LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
1076
- continue;
1077
- }
1078
- result.emplace_back();
1079
- auto& wg = result.back();
1080
- wg.first = sentence.substr(0, where);
1081
- wg.second = sentence.substr(where + 1, sentence.size() - where - 1);
1082
- wg.choices[0] = std::move(choice1);
1083
- wg.choices[1] = std::move(choice2);
1084
- wg.answer = i_answer;
1085
- }
1086
- return result;
1087
- }
1088
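As a usage sketch only (assuming load_winogrande_from_csv and winogrande_entry above are in scope), the sample row quoted in the doc comment below splits at the underscore as follows; the expected field values are spelled out as comments:

```cpp
#include <string>

// Illustrative only: run one CSV row through the loader defined above and show how the
// sentence is split at '_' into the text before and after the blank.
static void example_load_one_row() {
    const std::string prompt =
        "0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2\n";

    const auto data = load_winogrande_from_csv(prompt);

    // data[0].first      == "Sarah was a much better surgeon than Maria so "
    // data[0].second     == " always got the easier cases."
    // data[0].choices[0] == "Sarah"
    // data[0].choices[1] == "Maria"
    // data[0].answer     == 2
    (void) data;
}
```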
-
1089
- /*
1090
- * Evaluates the Winogrande score.
1091
- * Uses a CSV containing task index, sentence, choice 1, choice 2, answer (1 or 2)
1092
- * You can get one such dataset from e.g. https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp
1093
- * As an example, the 1st row in the above dataset is
1094
- *
1095
- * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
1096
- *
1097
- */
1098
- static void winogrande_score(llama_context * ctx, const common_params & params) {
1099
- const llama_model * model = llama_get_model(ctx);
1100
- const llama_vocab * vocab = llama_model_get_vocab(model);
1101
-
1102
- constexpr int k_min_trailing_ctx = 3;
1103
-
1104
- auto data = load_winogrande_from_csv(params.prompt);
1105
- if (data.empty()) {
1106
- LOG_ERR("%s: no tasks\n", __func__);
1107
- return;
1108
- }
1109
-
1110
- LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
1111
-
1112
- if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
1113
- LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
1114
- std::mt19937 rng(1);
1115
- std::vector<int> aux(data.size());
1116
- for (int i = 0; i < int(data.size()); ++i) {
1117
- aux[i] = i;
1118
- }
1119
- float scale = 1/(1.f + (float)rng.max());
1120
- std::vector<winogrande_entry> selected;
1121
- selected.resize(params.winogrande_tasks);
1122
- for (int i = 0; i < int(params.winogrande_tasks); ++i) {
1123
- int j = int(scale*rng()*aux.size());
1124
- selected[i] = std::move(data[aux[j]]);
1125
- aux[j] = aux.back();
1126
- aux.pop_back();
1127
- }
1128
- data = std::move(selected);
1129
- }
1130
-
1131
- LOG_INF("%s : tokenizing selected tasks\n", __func__);
1132
-
1133
- for (auto & task : data) {
1134
- task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
1135
- task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true);
1136
-
1137
- task.common_prefix = 0;
1138
- for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
1139
- if (task.seq_tokens[0][k] != task.seq_tokens[1][k]) {
1140
- break;
1141
- }
1142
- task.common_prefix++;
1143
- }
1144
-
1145
-         // TODO: the last token of each of the sequences doesn't need to be evaluated
1146
- task.required_tokens = task.common_prefix +
1147
- task.seq_tokens[0].size() - task.common_prefix +
1148
- task.seq_tokens[1].size() - task.common_prefix;
1149
-
1150
- task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size();
1151
- task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size();
1152
- }
1153
-
1154
- LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
1155
-
1156
- const int n_ctx = llama_n_ctx(ctx);
1157
- const int n_batch = params.n_batch;
1158
-
1159
- const int n_vocab = llama_vocab_n_tokens(vocab);
1160
-
1161
- const int max_tasks_per_batch = 128;
1162
- const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
1163
-
1164
- llama_batch batch = llama_batch_init(n_ctx, 0, 2);
1165
-
1166
- std::vector<float> tok_logits(n_vocab);
1167
- // TODO: this could be made smaller; it's currently the worst-case size
1168
- std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
1169
-
1170
- std::vector<std::pair<size_t, llama_token>> eval_pairs;
1171
- std::vector<float> eval_results;
1172
- std::vector<std::thread> workers(std::thread::hardware_concurrency());
1173
-
1174
- int n_correct = 0;
1175
- int n_done = 0;
1176
-
1177
- for (size_t i0 = 0; i0 < data.size(); i0++) {
1178
- int n_cur = 0;
1179
-
1180
- size_t i1 = i0;
1181
- size_t i_logits = 0;
1182
-
1183
- common_batch_clear(batch);
1184
-
1185
- while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
1186
- int n_logits = 0;
1187
- const int s0 = 2*(i1 - i0);
1188
- if (s0 + 2 > max_seq) {
1189
- break;
1190
- }
1191
-
1192
- for (size_t i = 0; i < data[i1].common_prefix; ++i) {
1193
- common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
1194
- }
1195
- batch.logits[batch.n_tokens - 1] = true;
1196
- n_logits += 1;
1197
-
1198
- for (int s = 0; s < 2; ++s) {
1199
- // TODO: end before the last token, no need to predict past the end of the sequences
1200
- for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
1201
- common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
1202
- n_logits += 1;
1203
- }
1204
- }
1205
-
1206
- data[i1].i_logits = i_logits;
1207
- i_logits += n_logits;
1208
-
1209
- n_cur += data[i1].required_tokens;
1210
- if (++i1 == data.size()) {
1211
- break;
1212
- }
1213
- }
1214
-
1215
- if (i0 == i1) {
1216
- LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
1217
- return;
1218
- }
1219
-
1220
- llama_kv_self_clear(ctx);
1221
-
1222
- // decode all tasks [i0, i1)
1223
- if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
1224
- LOG_ERR("%s: llama_decode() failed\n", __func__);
1225
- return;
1226
- }
1227
-
1228
- eval_pairs.clear();
1229
- for (size_t i = i0; i < i1; ++i) {
1230
- auto & task = data[i];
1231
-
1232
- const bool skip_choice =
1233
- task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx &&
1234
- task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx;
1235
-
1236
- const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
1237
- const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
1238
- size_t li = n_base1 - task.common_prefix;
1239
- for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
1240
- eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[0][j+1]);
1241
- }
1242
- const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
1243
- const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
1244
- // FIXME: this uses the wrong first logits when not skipping the choice word
1245
- li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - task.common_prefix;
1246
- for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
1247
- eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[1][j+1]);
1248
- }
1249
- }
1250
- compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
1251
-
1252
- size_t ir = 0;
1253
- for (size_t i = i0; i < i1; ++i) {
1254
- auto & task = data[i];
1255
-
1256
- const bool skip_choice =
1257
- task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx &&
1258
- task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx;
1259
-
1260
- float score_1st = 0;
1261
- const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
1262
- const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
1263
- for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
1264
- score_1st += eval_results[ir++];
1265
- }
1266
- score_1st /= (task.seq_tokens[0].size() - n_base1 - last_1st);
1267
-
1268
- float score_2nd = 0;
1269
- const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
1270
- const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
1271
- for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
1272
- score_2nd += eval_results[ir++];
1273
- }
1274
- score_2nd /= (task.seq_tokens[1].size() - n_base2 - last_2nd);
1275
-
1276
- int result = score_1st > score_2nd ? 1 : 2;
1277
-
1278
- if (result == task.answer) {
1279
- ++n_correct;
1280
- }
1281
- ++n_done;
1282
-
1283
- // print the accumulated accuracy mean x 100
1284
- LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
1285
- }
1286
-
1287
- i0 = i1 - 1;
1288
- }
1289
-
1290
- LOG("\n");
1291
-
1292
- if (n_done < 100) return;
1293
-
1294
- const float p = 1.f*n_correct/n_done;
1295
- const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
1296
-
1297
- LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
1298
- }
1299
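For reference, the scoring above averages the token log-probabilities over the evaluated trailing tokens of each completion and predicts whichever of the two scores is higher; the final ± reported for the accuracy is the standard error of a binomial proportion,

\[
p = \frac{n_\text{correct}}{n_\text{done}}, \qquad
\sigma = 100 \sqrt{ \frac{p (1 - p)}{n_\text{done} - 1} } ,
\]

which is only printed once at least 100 tasks have been scored.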
-
1300
- static bool deserialize_string(std::istream & in, std::string & str) {
1301
- uint32_t size;
1302
- if (!in.read((char *)&size, sizeof(size)).fail()) {
1303
- str.resize(size);
1304
- if (!in.read((char *)&str[0], size).fail()) return true;
1305
- }
1306
- return false;
1307
- }
1308
-
1309
- struct multiple_choice_answers {
1310
- std::vector<std::string> answers;
1311
- std::vector<int> labels;
1312
- bool deserialize(std::istream& in) {
1313
- uint32_t n;
1314
- in.read((char *)&n, sizeof(n));
1315
- if (in.fail() || n > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose
1316
- answers.resize(n);
1317
- labels.resize(n);
1318
- for (auto& a : answers) {
1319
- if (!deserialize_string(in, a)) return false;
1320
- }
1321
- in.read((char *)labels.data(), n*sizeof(int));
1322
- return !in.fail();
1323
- }
1324
- };
1325
-
1326
- struct multiple_choice_task {
1327
- std::string question; // the question (or context that needs to be continued)
1328
- multiple_choice_answers mc1; // possible answers (continuations) with a single correct answer
1329
- multiple_choice_answers mc2; // possible answers (continuations) with multiple correct answers - not handled yet
1330
- bool deserialize(std::istream& in) {
1331
- if (!deserialize_string(in, question)) return false;
1332
- return mc1.deserialize(in) && mc2.deserialize(in);
1333
- }
1334
-
1335
- // For evaluation
1336
- size_t i_logits; // starting index of logits in the llama_batch
1337
- size_t common_prefix; // max number of initial tokens that are the same in all sentences
1338
- size_t required_tokens; // needed number of tokens to evaluate all answers
1339
- std::vector<std::vector<llama_token>> seq_tokens;
1340
- std::vector<float> log_probs;
1341
- };
1342
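The three deserializers above fix the binary layout of a task inside the prompt blob. The sketch below is not part of the original sources; it is a hypothetical writer (the serialize_* names are invented for illustration) that would produce data the readers above can parse. The full prompt additionally begins with a uint32 task count and one uint32 offset per task, as read at the start of multiple_choice_score further down.

```cpp
#include <cstdint>
#include <ostream>
#include <string>
#include <vector>

// Every string is a uint32 byte count followed by the raw bytes.
static void serialize_string(std::ostream & out, const std::string & str) {
    const uint32_t size = (uint32_t) str.size();
    out.write((const char *) &size, sizeof(size));
    out.write(str.data(), size);
}

// An answer set is a uint32 count, that many length-prefixed strings, then the same
// number of int labels (the reader above rejects sets with more than 100 answers).
static void serialize_answers(std::ostream & out,
        const std::vector<std::string> & answers, const std::vector<int> & labels) {
    const uint32_t n = (uint32_t) answers.size(); // must match labels.size()
    out.write((const char *) &n, sizeof(n));
    for (const auto & a : answers) {
        serialize_string(out, a);
    }
    if (n > 0) {
        out.write((const char *) labels.data(), n*sizeof(int));
    }
}

// A task is its question followed by the mc1 and mc2 answer sets.
static void serialize_task(std::ostream & out, const std::string & question,
        const std::vector<std::string> & mc1_answers, const std::vector<int> & mc1_labels) {
    serialize_string(out, question);
    serialize_answers(out, mc1_answers, mc1_labels);
    serialize_answers(out, std::vector<std::string>(), std::vector<int>()); // mc2 left empty here
}
```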
-
1343
- static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
1344
- if (task.question.empty() || task.mc1.answers.empty()) {
1345
- if (log_error) {
1346
- LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
1347
- }
1348
- return false;
1349
- }
1350
- task.seq_tokens.reserve(task.mc1.answers.size());
1351
- for (auto& answer : task.mc1.answers) {
1352
- if (answer.empty()) {
1353
- if (log_error) {
1354
- LOG_ERR("%s: found empty answer\n", __func__);
1355
- }
1356
- return false;
1357
- }
1358
- task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
1359
- }
1360
- auto min_len = task.seq_tokens.front().size();
1361
- for (auto& seq : task.seq_tokens) {
1362
- min_len = std::min(min_len, seq.size());
1363
- }
1364
- task.common_prefix = 0;
1365
- for (size_t k = 0; k < min_len; ++k) {
1366
- auto token = task.seq_tokens[0][k];
1367
- bool all_same = true;
1368
- for (size_t i = 1; i < task.seq_tokens.size(); ++i) {
1369
- if (task.seq_tokens[i][k] != token) {
1370
- all_same = false;
1371
- break;
1372
- }
1373
- }
1374
- if (!all_same) {
1375
- break;
1376
- }
1377
- ++task.common_prefix;
1378
- }
1379
- task.required_tokens = task.common_prefix;
1380
- for (auto& seq : task.seq_tokens) {
1381
- task.required_tokens += seq.size() - task.common_prefix;
1382
- }
1383
- return true;
1384
- }
1385
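The bookkeeping above is what makes the shared prefix pay off: required_tokens = common_prefix + Σ_s (len_s − common_prefix). With made-up numbers, four tokenized continuations of 30, 32, 31 and 33 tokens that share a 25-token prefix need 25 + 5 + 7 + 6 + 8 = 51 batch positions, versus the 126 positions that evaluating each continuation independently would cost.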
-
1386
- //
1387
- // Calculates score for multiple choice tasks with single correct answer from prompt.
1388
- // Commonly used LLM evaluation metrics of this type are
1389
- // * ARC
1390
- // * HellaSwag
1391
- // * MMLU
1392
- // * TruthfulQA
1393
- //
1394
- // Validation datasets for these 4 tests can be found at
1395
- // https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp
1396
- // The data for these datasets was extracted from
1397
- // git@hf.co:datasets/allenai/ai2_arc
1398
- // https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
1399
- // git@hf.co:datasets/Stevross/mmlu
1400
- // https://huggingface.co/datasets/truthful_qa
1401
- //
1402
- static void multiple_choice_score(llama_context * ctx, const common_params & params) {
1403
- const llama_model * model = llama_get_model(ctx);
1404
- const llama_vocab * vocab = llama_model_get_vocab(model);
1405
-
1406
- std::istringstream strstream(params.prompt);
1407
- uint32_t n_task;
1408
- strstream.read((char *)&n_task, sizeof(n_task));
1409
- if (strstream.fail() || n_task == 0) {
1410
- LOG_ERR("%s: no tasks\n", __func__);
1411
- return;
1412
- }
1413
- LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
1414
- std::vector<uint32_t> task_pos(n_task);
1415
- strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
1416
- if (strstream.fail()) {
1417
- LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
1418
- return;
1419
- }
1420
-
1421
- std::vector<multiple_choice_task> tasks;
1422
- if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
1423
- // Use all tasks
1424
- tasks.resize(n_task);
1425
- LOG_INF("%s: reading tasks", __func__);
1426
- int n_dot = std::max((int) n_task/100, 1);
1427
- int i = 0;
1428
- for (auto& task : tasks) {
1429
- ++i;
1430
- if (!task.deserialize(strstream)) {
1431
- LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
1432
- return;
1433
- }
1434
- if (i%n_dot == 0) LOG(".");
1435
- }
1436
- LOG("done\n");
1437
- }
1438
- else {
1439
- LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
1440
- std::mt19937 rng(1);
1441
- std::vector<int> aux(n_task);
1442
- for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
1443
- float scale = 1.f/(1.f + (float)std::mt19937::max());
1444
- tasks.resize(params.multiple_choice_tasks);
1445
- for (auto& task : tasks) {
1446
- int j = (int)(scale * rng() * aux.size());
1447
- int idx = aux[j];
1448
- aux[j] = aux.back();
1449
- aux.pop_back();
1450
- strstream.seekg(task_pos[idx], std::ios::beg);
1451
- if (!task.deserialize(strstream)) {
1452
- LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
1453
- return;
1454
- }
1455
- }
1456
- n_task = params.multiple_choice_tasks;
1457
- }
1458
-
1459
- LOG_INF("%s: preparing task data", __func__);
1460
- if (n_task > 500) {
1461
- LOG("...");
1462
- std::atomic<int> counter(0);
1463
- std::atomic<int> n_bad(0);
1464
- auto prepare = [&counter, &n_bad, &tasks, ctx] () {
1465
- int num_tasks = tasks.size();
1466
- int n_bad_local = 0;
1467
- while (true) {
1468
- int first = counter.fetch_add(K_TOKEN_CHUNK);
1469
- if (first >= num_tasks) {
1470
- if (n_bad_local > 0) n_bad += n_bad_local;
1471
- break;
1472
- }
1473
- int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
1474
- for (int i = first; i < last; ++i) {
1475
- if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
1476
- }
1477
- }
1478
- };
1479
- size_t max_thread = std::thread::hardware_concurrency();
1480
- max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK);
1481
- std::vector<std::thread> workers(max_thread-1);
1482
- for (auto& w : workers) w = std::thread(prepare);
1483
- prepare();
1484
- for (auto& w : workers) w.join();
1485
- LOG("done\n");
1486
- int nbad = n_bad;
1487
- if (nbad > 0) {
1488
- LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
1489
- return;
1490
- }
1491
- } else {
1492
- int n_dot = std::max((int) n_task/100, 1);
1493
- int i_task = 0;
1494
- for (auto& task : tasks) {
1495
- ++i_task;
1496
- if (!multiple_choice_prepare_one_task(ctx, task, true)) {
1497
- return;
1498
- }
1499
- if (i_task%n_dot == 0) {
1500
- LOG(".");
1501
- }
1502
- }
1503
- LOG("done\n");
1504
- }
1505
-
1506
- LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
1507
-
1508
- LOG("\ntask\tacc_norm\n");
1509
-
1510
- const int n_ctx = llama_n_ctx(ctx);
1511
- const int n_batch = params.n_batch;
1512
-
1513
- const int n_vocab = llama_vocab_n_tokens(vocab);
1514
-
1515
- const int max_tasks_per_batch = 32;
1516
- const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
1517
-
1518
- llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
1519
-
1520
- std::vector<float> tok_logits(n_vocab);
1521
- std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
1522
-
1523
- std::vector<std::pair<size_t, llama_token>> eval_pairs;
1524
- std::vector<float> eval_results;
1525
- std::vector<std::thread> workers(std::thread::hardware_concurrency());
1526
- std::vector<int> batch_indeces;
1527
-
1528
- int n_done = 0;
1529
- int n_correct = 0;
1530
- int n_tot_answers = 0;
1531
-
1532
- for (size_t i0 = 0; i0 < tasks.size(); i0++) {
1533
- int n_cur = 0;
1534
-
1535
- size_t i1 = i0;
1536
- size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
1537
-
1538
- common_batch_clear(batch);
1539
-
1540
-         // batch as many tasks as possible into the available context
1541
-         // each task gets one unique sequence id per answer
1542
-         // the common prefix is shared among a task's sequences to save tokens
1543
- // we extract logits only from the last common token and from all ending tokens of each sequence
1544
- int s0 = 0;
1545
- while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
1546
- auto& cur_task = tasks[i1];
1547
- int n_logits = 0;
1548
-
1549
- int num_answers = cur_task.seq_tokens.size();
1550
- if (s0 + num_answers > max_seq) {
1551
- break;
1552
- }
1553
-
1554
- if (int(batch_indeces.size()) != num_answers) {
1555
- batch_indeces.resize(num_answers);
1556
- }
1557
-
1558
- for (int s = 0; s < num_answers; ++s) {
1559
- batch_indeces[s] = s0 + s;
1560
- }
1561
-
1562
- for (size_t i = 0; i < cur_task.common_prefix; ++i) {
1563
- //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
1564
- common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
1565
- }
1566
- batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
1567
- n_logits += 1;
1568
-
1569
- for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
1570
- const size_t seq_tokens_size = cur_task.seq_tokens[s].size();
1571
- // TODO: don't evaluate the last token of each sequence
1572
- for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
1573
- const bool needs_logits = i < seq_tokens_size - 1;
1574
- common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
1575
- n_logits += needs_logits;
1576
- }
1577
- }
1578
-
1579
- s0 += num_answers;
1580
-
1581
- cur_task.i_logits = i_logits;
1582
- i_logits += n_logits;
1583
-
1584
- n_cur += cur_task.required_tokens;
1585
- if (++i1 == tasks.size()) {
1586
- break;
1587
- }
1588
- }
1589
-
1590
- if (i0 == i1) {
1591
- LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
1592
- return;
1593
- }
1594
-
1595
- llama_kv_self_clear(ctx);
1596
-
1597
- // decode all tasks [i0, i1)
1598
- if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
1599
- LOG_ERR("%s: llama_decode() failed\n", __func__);
1600
- return;
1601
- }
1602
-
1603
- // Compute log-probs in parallel
1604
- // First we collect all tasks
1605
- eval_pairs.clear();
1606
- for (size_t i = i0; i < i1; ++i) {
1607
- auto& cur_task = tasks[i];
1608
- size_t li = 1; // skip the last logit of the common prefix (computed separately below)
1609
- for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
1610
- for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
1611
- eval_pairs.emplace_back(cur_task.i_logits + li++, cur_task.seq_tokens[s][j + 1]);
1612
- }
1613
- }
1614
- }
1615
- // Then we do the actual calculation
1616
- compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
1617
-
1618
- size_t ir = 0;
1619
-
1620
- // compute the logprobs for each ending of the decoded tasks
1621
- for (size_t i = i0; i < i1; ++i) {
1622
- auto & cur_task = tasks[i];
1623
- //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
1624
- //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
1625
- // if (cur_task.mc1.labels[j] == 1) {
1626
- // LOG("%d", j+1);
1627
- // }
1628
- //}
1629
- //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
1630
-
1631
- // get the logits of the last token of the common prefix
1632
- std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));
1633
-
1634
- const auto first_probs = softmax(tok_logits);
1635
-
1636
- cur_task.log_probs.resize(cur_task.seq_tokens.size());
1637
- for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
1638
- size_t count = 1;
1639
- float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
1640
- for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
1641
- //LOG(" %zu %g\n", ir, eval_results[ir]);
1642
- ++count;
1643
- log_prob += eval_results[ir++];
1644
- }
1645
- cur_task.log_probs[s] = log_prob / count;
1646
- //LOG(" Final: %g\n", log_prob / count);
1647
- //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
1648
- }
1649
-
1650
- // Find the ending with maximum logprob
1651
- size_t logprob_max_idx = 0;
1652
- float logprob_max_val = cur_task.log_probs[0];
1653
- for (size_t s = 1; s < cur_task.log_probs.size(); s++) {
1654
- if (cur_task.log_probs[s] > logprob_max_val) {
1655
- logprob_max_val = cur_task.log_probs[s];
1656
- logprob_max_idx = s;
1657
- }
1658
- }
1659
-
1660
- n_tot_answers += cur_task.log_probs.size();
1661
- if (cur_task.mc1.labels[logprob_max_idx] == 1) {
1662
- ++n_correct;
1663
- }
1664
- ++n_done;
1665
-
1666
- // Print the accumulated accuracy mean x 100
1667
- LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
1668
- }
1669
-
1670
- i0 = i1 - 1;
1671
- }
1672
-
1673
- llama_batch_free(batch);
1674
-
1675
- if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
1676
-
1677
- float p = 1.f*n_correct/n_done;
1678
- float sigma = sqrt(p*(1-p)/(n_done-1));
1679
- LOG("\n");
1680
- LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
1681
- p = 1.f*n_done/n_tot_answers;
1682
- sigma = sqrt(p*(1-p)/(n_done-1));
1683
- LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
1684
-
1685
- LOG_INF("\n");
1686
- }
1687
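The "Random chance" line printed at the end is n_done / n_tot_answers, i.e. the reciprocal of the average number of answers per task; when every task has exactly k answers this reduces to 1/k (for example 25% for four-answer items), which is the accuracy a uniform random guesser would expect.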
-
1688
- static void kl_divergence(llama_context * ctx, const common_params & params) {
1689
- const llama_model * model = llama_get_model(ctx);
1690
- const llama_vocab * vocab = llama_model_get_vocab(model);
1691
-
1692
- if (params.logits_file.empty()) {
1693
- LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
1694
- return;
1695
- }
1696
- std::ifstream in(params.logits_file.c_str(), std::ios::binary);
1697
- if (!in) {
1698
- LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
1699
- return;
1700
- }
1701
- {
1702
- char check[9]; check[8] = 0;
1703
- in.read(check, 8);
1704
- if (in.fail() || strncmp("_logits_", check, 8) != 0) {
1705
- LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
1706
- return;
1707
- }
1708
- }
1709
-
1710
- uint32_t n_ctx;
1711
- in.read((char *)&n_ctx, sizeof(n_ctx));
1712
- if (n_ctx > llama_n_ctx(ctx)) {
1713
- LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
1714
- __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
1715
- }
1716
-
1717
- int n_vocab;
1718
- int n_chunk;
1719
- in.read((char *)&n_vocab, sizeof(n_vocab));
1720
- in.read((char *)&n_chunk, sizeof(n_chunk));
1721
- if (in.fail()) {
1722
- LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
1723
- return;
1724
- }
1725
- if (n_vocab != llama_vocab_n_tokens(vocab)) {
1726
- LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab));
1727
- }
1728
-
1729
- std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
1730
- if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
1731
- LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
1732
- return;
1733
- }
1734
-
1735
- const int n_batch = params.n_batch;
1736
- const int num_batches = (n_ctx + n_batch - 1)/n_batch;
1737
- const int nv = 2*((n_vocab + 1)/2) + 4;
1738
- const bool add_bos = llama_vocab_get_add_bos(vocab);
1739
- GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
1740
-
1741
- std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
1742
- std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
1743
- std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
1744
- std::vector<float> logits;
1745
- if (num_batches > 1) {
1746
- logits.reserve(size_t(n_ctx) * n_vocab);
1747
- }
1748
-
1749
- std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
1750
-
1751
- auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
1752
- if (count < 1) {
1753
- return std::make_pair(0., 0.);
1754
- }
1755
- double f = sum/count;
1756
- double df = sum2/count - f*f;
1757
- df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
1758
- return std::make_pair(f, df);
1759
- };
1760
- auto covariance = [] (double suma, double sumb, double sumab, size_t count) {
1761
- if (count < 10) {
1762
- return 0.0;
1763
- }
1764
- double var = sumab/count - (suma/count)*(sumb/count);
1765
- var /= count - 1;
1766
- return var;
1767
- };
1768
-
1769
- kl_divergence_result kld;
1770
- auto kld_ptr = kld_values.data();
1771
- auto p_diff_ptr = p_diff_values.data();
1772
-
1773
- for (int i = 0; i < n_chunk; ++i) {
1774
- const int start = i * n_ctx;
1775
- const int end = start + n_ctx;
1776
-
1777
- const auto t_start = std::chrono::high_resolution_clock::now();
1778
-
1779
- if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
1780
- LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
1781
- return;
1782
- }
1783
-
1784
- // clear the KV cache
1785
- llama_kv_self_clear(ctx);
1786
-
1787
- llama_batch batch = llama_batch_init(n_batch, 0, 1);
1788
-
1789
- for (int j = 0; j < num_batches; ++j) {
1790
- const int batch_start = start + j * n_batch;
1791
- const int batch_size = std::min(end - batch_start, n_batch);
1792
-
1793
- // save original token and restore it after eval
1794
- const auto token_org = tokens[batch_start];
1795
-
1796
- // add BOS token for the first batch of each chunk
1797
- if (add_bos && j == 0) {
1798
- tokens[batch_start] = llama_vocab_bos(vocab);
1799
- }
1800
-
1801
- common_batch_clear(batch);
1802
- for (int i = 0; i < batch_size; i++) {
1803
- common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
1804
- }
1805
-
1806
- if (llama_decode(ctx, batch)) {
1807
- LOG_ERR("%s : failed to eval\n", __func__);
1808
- llama_batch_free(batch);
1809
- return;
1810
- }
1811
-
1812
- // restore the original token in case it was set to BOS
1813
- tokens[batch_start] = token_org;
1814
-
1815
- if (num_batches > 1) {
1816
- const auto * batch_logits = llama_get_logits(ctx);
1817
- logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
1818
- }
1819
- }
1820
-
1821
- llama_batch_free(batch);
1822
-
1823
- const auto t_end = std::chrono::high_resolution_clock::now();
1824
-
1825
- if (i == 0) {
1826
- const float t_total = std::chrono::duration<float>(t_end - t_start).count();
1827
- LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
1828
- int total_seconds = (int)(t_total * n_chunk);
1829
- if (total_seconds >= 60*60) {
1830
- LOG("%d hours ", total_seconds / (60*60));
1831
- total_seconds = total_seconds % (60*60);
1832
- }
1833
- LOG("%.2f minutes\n", total_seconds / 60.0);
1834
- }
1835
- LOG("\n");
1836
- LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
1837
-
1838
- const int first = n_ctx/2;
1839
- const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
1840
- process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
1841
- workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
1842
- p_diff_ptr += n_ctx - 1 - first;
1843
- kld_ptr += n_ctx - 1 - first;
1844
-
1845
- LOG("%4d", i+1);
1846
-
1847
- auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
1848
- const double ppl_val = exp(log_ppl.first);
1849
- const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
1850
- LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
1851
-
1852
- auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
1853
- const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
1854
- const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
1855
- const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
1856
- LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
1857
-
1858
- auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
1859
- LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
1860
-
1861
- auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
1862
- const double p_diff_rms_val = sqrt(p_diff_mse.first);
1863
- const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
1864
- LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
1865
-
1866
- double p_top_val = 1.*kld.n_same_top/kld.count;
1867
- double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
1868
- LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
1869
-
1870
- LOG("\n");
1871
-
1872
- logits.clear();
1873
- }
1874
- LOG("\n");
1875
-
1876
- if (kld.count < 100) return; // we do not wish to do statistics on so few values
1877
-
1878
- std::sort(kld_values.begin(), kld_values.end());
1879
- std::sort(p_diff_values.begin(), p_diff_values.end());
1880
-
1881
- LOG("====== Perplexity statistics ======\n");
1882
-
1883
- auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
1884
- const double ppl_val = exp(log_ppl.first);
1885
- const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
1886
- LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
1887
-
1888
- auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
1889
- const double ppl_base_val = exp(log_ppl_base.first);
1890
- const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
1891
- LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
1892
-
1893
- const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
1894
- // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
1895
- const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
1896
- LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
1897
-
1898
- const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
1899
- const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
1900
- LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
1901
-
1902
- const double ppl_ratio_val = exp(log_ppl_ratio_val);
1903
- const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
1904
- LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
1905
-
1906
- const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
1907
- const double ppl_diff_val = ppl_val - ppl_base_val;
1908
- const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
1909
- LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
1910
-
1911
- LOG("\n");
1912
-
1913
- LOG("====== KL divergence statistics ======\n");
1914
- auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
1915
- LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
1916
- auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
1917
- : kld_values[kld_values.size()/2];
1918
-
1919
- auto percentile = [] (std::vector<float> values, float fraction) {
1920
- if (fraction <= 0) return values.front();
1921
- if (fraction >= 1) return values.back();
1922
- float p = fraction*(values.size() - 1);
1923
- size_t ip = size_t(p); p -= ip;
1924
- return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
1925
- };
1926
-
1927
- LOG("Maximum KLD: %10.6f\n", kld_values.back());
1928
- LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
1929
- LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
1930
- LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
1931
- LOG("Median KLD: %10.6f\n", kld_median);
1932
- LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
1933
- LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
1934
- LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
1935
- LOG("Minimum KLD: %10.6f\n", kld_values.front());
1936
-
1937
- LOG("\n");
1938
-
1939
- LOG("====== Token probability statistics ======\n");
1940
-
1941
- auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
1942
- LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
1943
-
1944
- auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
1945
- : p_diff_values[p_diff_values.size()/2];
1946
-
1947
- LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
1948
- LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
1949
- LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
1950
- LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
1951
- LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
1952
- LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
1953
- LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
1954
- LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
1955
- LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
1956
- LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
1957
- LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
1958
- LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
1959
- LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
1960
-
1961
- auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
1962
- // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
1963
-
1964
- const double p_diff_rms_val = sqrt(p_diff_mse.first);
1965
- const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
1966
- LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
1967
-
1968
- const double same_top_p = 1.0*kld.n_same_top/kld.count;
1969
- LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
1970
- }
1971
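For reference, the per-token statistics accumulated above (in process_logits, defined earlier in the file) compare the evaluated model Q against the base model P whose log-probabilities were stored in the logits file. The reported KL divergence is

\[
D_{\mathrm{KL}}(P \,\|\, Q) = \sum_i p_i \left( \ln p_i - \ln q_i \right),
\]

and the uncertainty on ln(PPL(Q)/PPL(base)) is the usual propagation for a difference of correlated means,

\[
\sigma = \sqrt{ \sigma_Q^2 + \sigma_\text{base}^2 - 2\,\mathrm{Cov}\!\left( \ln \mathrm{PPL}(Q), \ln \mathrm{PPL}(\text{base}) \right) },
\]

with perplexity uncertainties mapped through the exponential to first order (σ_PPL ≈ PPL · σ_lnPPL), as the inline comments already note.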
-
1972
- int main(int argc, char ** argv) {
1973
- common_params params;
1974
-
1975
- params.n_ctx = 512;
1976
- params.escape = false;
1977
-
1978
- if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
1979
- return 1;
1980
- }
1981
-
1982
- common_init();
1983
-
1984
- const int32_t n_ctx = params.n_ctx;
1985
-
1986
- if (n_ctx <= 0) {
1987
- LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
1988
- return 1;
1989
- }
1990
-
1991
- const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
1992
-
1993
- if (ppl) {
1994
- const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
1995
- const int32_t n_kv = n_seq * n_ctx;
1996
-
1997
- params.n_parallel = n_seq;
1998
- params.n_ctx = n_kv;
1999
-
2000
- params.n_batch = std::min(params.n_batch, n_kv);
2001
- } else {
2002
- params.n_batch = std::min(params.n_batch, params.n_ctx);
2003
- if (params.kl_divergence) {
2004
- params.n_parallel = 1;
2005
- } else {
2006
- // ensure there's at least enough seq_ids for HellaSwag
2007
- params.n_parallel = std::max(4, params.n_parallel);
2008
- }
2009
- }
2010
-
2011
- if (params.ppl_stride > 0) {
2012
- LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
2013
- params.n_ctx, params.n_ctx + params.ppl_stride/2);
2014
- params.n_ctx += params.ppl_stride/2;
2015
- }
2016
-
2017
- llama_backend_init();
2018
- llama_numa_init(params.numa);
2019
-
2020
- // load the model and apply lora adapter, if any
2021
- common_init_result llama_init = common_init_from_params(params);
2022
-
2023
- llama_model * model = llama_init.model.get();
2024
- llama_context * ctx = llama_init.context.get();
2025
-
2026
- if (model == NULL) {
2027
- LOG_ERR("%s: unable to load model\n", __func__);
2028
- return 1;
2029
- }
2030
-
2031
- const int n_ctx_train = llama_model_n_ctx_train(model);
2032
-
2033
- if (params.n_ctx > n_ctx_train) {
2034
- LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
2035
- __func__, n_ctx_train, params.n_ctx);
2036
- }
2037
-
2038
- // print system information
2039
- {
2040
- LOG_INF("\n");
2041
- LOG_INF("%s\n", common_params_get_system_info(params).c_str());
2042
- }
2043
-
2044
- struct results_perplexity results;
2045
- if (params.hellaswag) {
2046
- hellaswag_score(ctx, params);
2047
- } else if (params.winogrande) {
2048
- winogrande_score(ctx, params);
2049
- } else if (params.multiple_choice) {
2050
- multiple_choice_score(ctx, params);
2051
- } else if (params.kl_divergence) {
2052
- kl_divergence(ctx, params);
2053
- } else {
2054
- results = perplexity(ctx, params, n_ctx);
2055
- }
2056
-
2057
- LOG("\n");
2058
- llama_perf_context_print(ctx);
2059
-
2060
- llama_backend_free();
2061
-
2062
- return 0;
2063
- }
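In the plain perplexity path, main() packs several evaluation windows into a single KV cache: n_seq = max(1, n_batch / n_ctx) and n_kv = n_seq * n_ctx. With the --ctx-size 512 default set above and, say, a logical batch size of 2048, that gives four 512-token windows decoded in parallel over a 2048-token KV cache, with n_batch capped at n_kv. The benchmark paths instead cap n_batch at n_ctx, force a single sequence for the KL-divergence run, and raise n_parallel to at least 4 so HellaSwag has enough sequence ids.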