@fugood/llama.node 0.3.17 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0

package/src/llama.cpp/.github/workflows/server.yml
@@ -15,10 +15,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
 
 env:
   LLAMA_LOG_COLORS: 1
@@ -74,7 +74,7 @@ jobs:
       - name: Tests dependencies
         id: test_dependencies
         run: |
-          pip install -r examples/server/tests/requirements.txt
+          pip install -r tools/server/tests/requirements.txt
 
       # Setup nodejs (to be used for verifying bundled index.html)
       - uses: actions/setup-node@v4
@@ -84,14 +84,14 @@
       - name: WebUI - Install dependencies
         id: webui_lint
         run: |
-          cd examples/server/webui
+          cd tools/server/webui
           npm ci
 
       - name: WebUI - Check code format
         id: webui_format
         run: |
           git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
+          cd tools/server/webui
           git status
 
           npm run format
@@ -108,7 +108,7 @@ jobs:
         id: verify_server_index_html
         run: |
           git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
+          cd tools/server/webui
           git status
 
           npm run build
@@ -161,21 +161,21 @@ jobs:
         env:
           GITHUB_ACTIONS: "true"
         run: |
-          cd examples/server/tests
+          cd tools/server/tests
           ./tests.sh
 
       - name: Tests (sanitizers)
         id: server_integration_tests_sanitizers
         if: ${{ matrix.sanitizer != '' }}
         run: |
-          cd examples/server/tests
+          cd tools/server/tests
           LLAMA_SANITIZE=1 ./tests.sh
 
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
-          cd examples/server/tests
+          cd tools/server/tests
           SLOW_TESTS=1 ./tests.sh
 
 
@@ -211,7 +211,7 @@ jobs:
       - name: Tests dependencies
         id: test_dependencies
         run: |
-          pip install -r examples/server/tests/requirements.txt
+          pip install -r tools/server/tests/requirements.txt
 
       - name: Copy Libcurl
         id: prepare_libcurl
@@ -224,7 +224,7 @@
         id: server_integration_tests
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
-          cd examples/server/tests
+          cd tools/server/tests
           $env:PYTHONIOENCODING = ":replace"
           pytest -v -x -m "not slow"
 
@@ -232,6 +232,6 @@
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
-          cd examples/server/tests
+          cd tools/server/tests
           $env:SLOW_TESTS = "1"
           pytest -v -x

package/src/llama.cpp/CMakeLists.txt
@@ -77,6 +77,7 @@ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE
 
 # extra artifacts
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 
@@ -187,6 +188,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
     add_subdirectory(pocs)
 endif()
 
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
+    add_subdirectory(tools)
+endif()
+
 #
 # install
 #
@@ -247,20 +252,3 @@ configure_file(cmake/llama.pc.in
 
 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
-    message(STATUS "Running inside GitHub Actions - copying license files")
-
-    # Copy all files from licenses/ to build/bin/
-    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
-    foreach(LICENSE_FILE ${LICENSE_FILES})
-        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
-        configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
-    endforeach()
-endif()
-
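
The new LLAMA_BUILD_TOOLS switch gates the relocated tools/ tree (server, llama-bench, mtmd, and friends) independently of the examples. A minimal sketch of how an embedding project might set these options before adding llama.cpp as a subdirectory; the chosen values and the subdirectory path are illustrative, not taken from package/CMakeLists.txt:

# Hypothetical embedder configuration: build only the libraries, skip tools/examples/tests.
set(LLAMA_BUILD_COMMON   ON  CACHE BOOL "" FORCE)
set(LLAMA_BUILD_TOOLS    OFF CACHE BOOL "" FORCE)  # new option introduced in this release
set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_TESTS    OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_SERVER   OFF CACHE BOOL "" FORCE)
add_subdirectory(src/llama.cpp)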

package/src/llama.cpp/cmake/build-info.cmake
@@ -41,14 +41,20 @@ endif()
 
 if(MSVC)
     set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+    if (CMAKE_VS_PLATFORM_NAME)
+        set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+    else()
+        set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
+    endif()
 else()
     execute_process(
-        COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
+        COMMAND ${CMAKE_C_COMPILER} --version
         OUTPUT_VARIABLE OUT
         OUTPUT_STRIP_TRAILING_WHITESPACE
     )
+    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
     set(BUILD_COMPILER ${OUT})
+
     execute_process(
         COMMAND ${CMAKE_C_COMPILER} -dumpmachine
         OUTPUT_VARIABLE OUT
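
The build-info step no longer shells out to sh/head to grab the first line of the compiler banner, so it also works on hosts without a POSIX shell. A standalone sketch of the first-line extraction, using a made-up compiler banner string:

# Hypothetical multi-line banner; the REGEX REPLACE leaves only its first line.
set(OUT "clang version 17.0.6\nTarget: x86_64-pc-linux-gnu\nThread model: posix")
string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
message(STATUS "BUILD_COMPILER: ${OUT}")   # -> clang version 17.0.6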

package/src/llama.cpp/cmake/x64-windows-llvm.cmake
@@ -3,9 +3,3 @@ set( CMAKE_SYSTEM_PROCESSOR x86_64 )
 
 set( CMAKE_C_COMPILER clang )
 set( CMAKE_CXX_COMPILER clang++ )
-
-set( arch_c_flags "-march=native" )
-
-set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
-

package/src/llama.cpp/common/CMakeLists.txt
@@ -39,7 +39,9 @@ add_custom_command(
     COMMENT "Generating build details from Git"
     COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
             -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
+            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
     WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
     DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
     VERBATIM
@@ -71,6 +73,8 @@ add_library(${TARGET} STATIC
     minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
+    regex-partial.cpp
+    regex-partial.h
     sampling.cpp
     sampling.h
     speculative.cpp
@@ -117,8 +121,8 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.10:
-        GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
@@ -142,3 +146,27 @@ endif ()
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+
+
+#
+# copy the license files
+#
+
+# Check if running in GitHub Actions
+if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
+    message(STATUS "Running inside GitHub Actions - copying license files")
+
+    # Copy all files from licenses/ to build/bin/
+    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
+    foreach(LICENSE_FILE ${LICENSE_FILES})
+        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
+        add_custom_command(
+            POST_BUILD
+            TARGET ${TARGET}
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${LICENSE_FILE}"
+                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
+            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
+        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
+    endforeach()
+endif()

package/src/llama.cpp/common/arg.cpp
@@ -40,7 +40,7 @@ using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_LLAVA,
-    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+    LLAMA_EXAMPLE_SERVER,
 };
 
 static std::string read_file(const std::string & fname) {
@@ -217,13 +217,11 @@ struct curl_slist_ptr {
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
     int remaining_attempts = max_attempts;
-    char * method = nullptr;
-    curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_METHOD, &method);
 
     while (remaining_attempts > 0) {
-        LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+        LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
 
         CURLcode res = curl_easy_perform(curl);
         if (res == CURLE_OK) {
@@ -343,7 +341,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
 
     // we only allow retrying once for HEAD requests
     // this is for the use case of using running offline (no internet), retrying can be annoying
-    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0);
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
     if (!was_perform_successful) {
         head_request_ok = false;
     }
@@ -425,7 +423,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
     // start the download
     LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
         llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
     if (!was_perform_successful) {
         return false;
     }
@@ -1285,7 +1283,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_color = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
         string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -1418,7 +1416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         string_format(
-            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+            ex == LLAMA_EXAMPLE_MAIN
                 ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                 : "number of tokens to predict (default: %d, -1 = infinity)",
             params.n_predict),
@@ -1657,7 +1655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
@@ -1665,7 +1663,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",
@@ -1682,7 +1680,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.spm_infill = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--samplers"}, "SAMPLERS",
         string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
@@ -2099,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
-    add_opt(common_arg(
-        {"--perplexity", "--all-logits"},
-        string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
-        [](common_params & params) {
-            params.logits_all = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--hellaswag"},
         "compute HellaSwag score over random tasks from datafile supplied with -f",
@@ -2213,32 +2204,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file. see examples/llava/README.md",
+        "path to a multimodal projector file. see tools/mtmd/README.md\n"
+        "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
-        "URL to a multimodal projector file. see examples/llava/README.md",
+        "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
         {"--no-mmproj"},
         "explicitly disable multimodal projector, useful when using -hf",
         [](common_params & params) {
             params.no_mmproj = true;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
     add_opt(common_arg(
         {"--no-mmproj-offload"},
         "do not offload multimodal projector to GPU",
         [](common_params & params) {
             params.mmproj_use_gpu = false;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2445,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
@@ -2586,7 +2585,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_junk = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--pos"}, "N",
         string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -2636,13 +2635,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--parse-special"},
+        string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+        [](common_params & params) {
+            params.parse_special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"-pps"},
         string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
         [](common_params & params) {
             params.is_pp_shared = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",
@@ -2785,7 +2791,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
     add_opt(common_arg(
         {"--cache-reuse"}, "N",
-        string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+        string_format(
+            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+            "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
+        ),
         [](common_params & params, int value) {
             params.n_cache_reuse = value;
         }
@@ -2871,6 +2880,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2891,7 +2910,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.simple_io = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),