@fugood/llama.node 0.3.17 → 0.4.1
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/.github/workflows/server.yml

@@ -15,10 +15,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
 
 env:
   LLAMA_LOG_COLORS: 1

@@ -74,7 +74,7 @@ jobs:
       - name: Tests dependencies
         id: test_dependencies
         run: |
-          pip install -r
+          pip install -r tools/server/tests/requirements.txt
 
       # Setup nodejs (to be used for verifying bundled index.html)
       - uses: actions/setup-node@v4

@@ -84,14 +84,14 @@ jobs:
       - name: WebUI - Install dependencies
         id: webui_lint
         run: |
-          cd
+          cd tools/server/webui
           npm ci
 
       - name: WebUI - Check code format
         id: webui_format
         run: |
           git config --global --add safe.directory $(realpath .)
-          cd
+          cd tools/server/webui
           git status
 
           npm run format

@@ -108,7 +108,7 @@ jobs:
         id: verify_server_index_html
         run: |
           git config --global --add safe.directory $(realpath .)
-          cd
+          cd tools/server/webui
           git status
 
           npm run build

@@ -161,21 +161,21 @@ jobs:
         env:
           GITHUB_ACTIONS: "true"
         run: |
-          cd
+          cd tools/server/tests
           ./tests.sh
 
       - name: Tests (sanitizers)
         id: server_integration_tests_sanitizers
         if: ${{ matrix.sanitizer != '' }}
         run: |
-          cd
+          cd tools/server/tests
           LLAMA_SANITIZE=1 ./tests.sh
 
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
-          cd
+          cd tools/server/tests
           SLOW_TESTS=1 ./tests.sh
 
 

@@ -211,7 +211,7 @@ jobs:
       - name: Tests dependencies
         id: test_dependencies
         run: |
-          pip install -r
+          pip install -r tools/server/tests/requirements.txt
 
       - name: Copy Libcurl
         id: prepare_libcurl

@@ -224,7 +224,7 @@ jobs:
         id: server_integration_tests
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
-          cd
+          cd tools/server/tests
           $env:PYTHONIOENCODING = ":replace"
           pytest -v -x -m "not slow"
 

@@ -232,6 +232,6 @@ jobs:
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
-          cd
+          cd tools/server/tests
           $env:SLOW_TESTS = "1"
           pytest -v -x
package/src/llama.cpp/CMakeLists.txt

@@ -77,6 +77,7 @@ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE
 
 # extra artifacts
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 

@@ -187,6 +188,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
     add_subdirectory(pocs)
 endif()
 
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
+    add_subdirectory(tools)
+endif()
+
 #
 # install
 #

@@ -247,20 +252,3 @@ configure_file(cmake/llama.pc.in
 
 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
-    message(STATUS "Running inside GitHub Actions - copying license files")
-
-    # Copy all files from licenses/ to build/bin/
-    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
-    foreach(LICENSE_FILE ${LICENSE_FILES})
-        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
-        configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
-    endforeach()
-endif()
-
package/src/llama.cpp/cmake/build-info.cmake

@@ -41,14 +41,20 @@ endif()
 
 if(MSVC)
     set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-
+    if (CMAKE_VS_PLATFORM_NAME)
+        set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+    else()
+        set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
+    endif()
 else()
     execute_process(
-        COMMAND
+        COMMAND ${CMAKE_C_COMPILER} --version
         OUTPUT_VARIABLE OUT
         OUTPUT_STRIP_TRAILING_WHITESPACE
     )
+    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
     set(BUILD_COMPILER ${OUT})
+
     execute_process(
         COMMAND ${CMAKE_C_COMPILER} -dumpmachine
         OUTPUT_VARIABLE OUT
package/src/llama.cpp/common/CMakeLists.txt

@@ -39,7 +39,9 @@ add_custom_command(
     COMMENT "Generating build details from Git"
     COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
             -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
+            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
     WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
     DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
     VERBATIM

@@ -71,6 +73,8 @@ add_library(${TARGET} STATIC
     minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
+    regex-partial.cpp
+    regex-partial.h
     sampling.cpp
     sampling.h
     speculative.cpp

@@ -117,8 +121,8 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.
-        GIT_TAG
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE

@@ -142,3 +146,27 @@ endif ()
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+
+
+#
+# copy the license files
+#
+
+# Check if running in GitHub Actions
+if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
+    message(STATUS "Running inside GitHub Actions - copying license files")
+
+    # Copy all files from licenses/ to build/bin/
+    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
+    foreach(LICENSE_FILE ${LICENSE_FILES})
+        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
+        add_custom_command(
+            POST_BUILD
+            TARGET ${TARGET}
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${LICENSE_FILE}"
+                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
+            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
+        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
+    endforeach()
+endif()
package/src/llama.cpp/common/arg.cpp

@@ -40,7 +40,7 @@ using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_LLAVA,
-
+    LLAMA_EXAMPLE_SERVER,
 };
 
 static std::string read_file(const std::string & fname) {

@@ -217,13 +217,11 @@ struct curl_slist_ptr {
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
     int remaining_attempts = max_attempts;
-    char * method = nullptr;
-    curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_METHOD, &method);
 
     while (remaining_attempts > 0) {
-        LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ ,
+        LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
 
         CURLcode res = curl_easy_perform(curl);
         if (res == CURLE_OK) {

@@ -343,7 +341,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
 
         // we only allow retrying once for HEAD requests
         // this is for the use case of using running offline (no internet), retrying can be annoying
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0);
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
         if (!was_perform_successful) {
             head_request_ok = false;
         }

@@ -425,7 +423,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
         // start the download
         LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
             llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
         if (!was_perform_successful) {
             return false;
         }
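Note: in the hunks above, curl_perform_with_retry now receives the HTTP method name as an explicit argument ("HEAD" for the probe request, "GET" for the download) instead of reading it back from the handle with curl_easy_getinfo. A minimal sketch of the same retry-and-log pattern, using a generic callable in place of the real libcurl call (this helper is illustrative only, not the actual common/arg.cpp code):

    #include <chrono>
    #include <cstdio>
    #include <functional>
    #include <string>
    #include <thread>

    // Illustrative stand-in for curl_perform_with_retry: the caller names the
    // HTTP method explicitly, so the log line does not depend on handle state.
    static bool perform_with_retry(const std::string & url,
                                   const std::function<bool()> & perform,
                                   int max_attempts,
                                   int retry_delay_seconds,
                                   const char * method_name) {
        for (int attempt = 1; attempt <= max_attempts; ++attempt) {
            std::printf("%s %s (attempt %d of %d)...\n", method_name, url.c_str(), attempt, max_attempts);
            if (perform()) {
                return true; // success, stop retrying
            }
            if (attempt < max_attempts) {
                std::this_thread::sleep_for(std::chrono::seconds(retry_delay_seconds));
            }
        }
        return false; // all attempts failed
    }

    int main() {
        int calls = 0;
        // Fake "request" that succeeds on the second attempt.
        const bool ok = perform_with_retry("https://example.com/model.gguf",
                                           [&] { return ++calls == 2; },
                                           /*max_attempts=*/3, /*retry_delay_seconds=*/0, "GET");
        std::printf("result: %s after %d call(s)\n", ok ? "ok" : "failed", calls);
        return ok ? 0 : 1;
    }

Passing the method name from the call site keeps the log line accurate without the extra curl_easy_getinfo round-trip that the old code performed.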
package/src/llama.cpp/common/arg.cpp (continued)

@@ -1285,7 +1283,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_color = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN,
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
         string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),

@@ -1418,7 +1416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         string_format(
-            ex == LLAMA_EXAMPLE_MAIN
+            ex == LLAMA_EXAMPLE_MAIN
                 ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                 : "number of tokens to predict (default: %d, -1 = infinity)",
             params.n_predict),

@@ -1657,7 +1655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",

@@ -1665,7 +1663,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",

@@ -1682,7 +1680,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.spm_infill = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--samplers"}, "SAMPLERS",
         string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),

@@ -2099,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
-    add_opt(common_arg(
-        {"--perplexity", "--all-logits"},
-        string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
-        [](common_params & params) {
-            params.logits_all = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--hellaswag"},
         "compute HellaSwag score over random tasks from datafile supplied with -f",

@@ -2213,32 +2204,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file. see
+        "path to a multimodal projector file. see tools/mtmd/README.md\n"
+        "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
-        "URL to a multimodal projector file. see
+        "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
         {"--no-mmproj"},
         "explicitly disable multimodal projector, useful when using -hf",
         [](common_params & params) {
             params.no_mmproj = true;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
     add_opt(common_arg(
         {"--no-mmproj-offload"},
         "do not offload multimodal projector to GPU",
         [](common_params & params) {
             params.mmproj_use_gpu = false;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
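Note: the --mmproj family of flags above now also picks up environment variables via set_env (LLAMA_ARG_MMPROJ, LLAMA_ARG_MMPROJ_URL, LLAMA_ARG_NO_MMPROJ, LLAMA_ARG_NO_MMPROJ_OFFLOAD). A rough sketch, under simplified and hypothetical types rather than the real common_arg class, of how such an option record can fall back to an environment variable when the flag is not given on the command line:

    #include <cstdlib>
    #include <functional>
    #include <optional>
    #include <string>
    #include <vector>

    struct params { std::string mmproj_path; };

    // Simplified option record: CLI names, an optional env var, and a setter.
    struct cli_arg {
        std::vector<std::string>                            names;
        std::optional<std::string>                          env;
        std::function<void(params &, const std::string &)> handler;

        cli_arg & set_env(std::string var) { env = std::move(var); return *this; }

        // Apply the env var (when present and exported) before CLI parsing,
        // so an explicit command-line flag can still override the value.
        void apply_env(params & p) const {
            if (!env) { return; }
            if (const char * v = std::getenv(env->c_str())) {
                handler(p, v);
            }
        }
    };

    int main() {
        params p;
        cli_arg mmproj{{"--mmproj"}, std::nullopt,
                       [](params & pr, const std::string & v) { pr.mmproj_path = v; }};
        mmproj.set_env("LLAMA_ARG_MMPROJ");
        mmproj.apply_env(p); // picks up LLAMA_ARG_MMPROJ when it is exported
        return 0;
    }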
package/src/llama.cpp/common/arg.cpp (continued)

@@ -2445,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",

@@ -2586,7 +2585,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_junk = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--pos"}, "N",
         string_format("position of the passkey in the junk text (default: %d)", params.i_pos),

@@ -2636,13 +2635,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--parse-special"},
+        string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+        [](common_params & params) {
+            params.parse_special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"-pps"},
         string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
         [](common_params & params) {
             params.is_pp_shared = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",

@@ -2785,7 +2791,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
     add_opt(common_arg(
         {"--cache-reuse"}, "N",
-        string_format(
+        string_format(
+            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+            "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
+        ),
         [](common_params & params, int value) {
             params.n_cache_reuse = value;
         }

@@ -2871,6 +2880,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
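Note: the new --no-prefill-assistant flag (params.prefill_assistant) controls whether a trailing assistant message is continued as a partial reply or treated as a complete turn. A compact sketch of that decision, using a hypothetical message type rather than the server's actual data structures:

    #include <cassert>
    #include <string>
    #include <vector>

    struct chat_msg { std::string role, content; };

    // If prefilling is enabled and the conversation ends with an assistant
    // message, its content is returned as the start of the reply to continue;
    // otherwise generation starts a fresh assistant turn.
    static std::string assistant_prefill(const std::vector<chat_msg> & msgs, bool prefill_assistant) {
        if (prefill_assistant && !msgs.empty() && msgs.back().role == "assistant") {
            return msgs.back().content; // continue this partial reply
        }
        return ""; // start a new assistant message
    }

    int main() {
        const std::vector<chat_msg> msgs = {
            {"user", "Write a haiku about spring."},
            {"assistant", "Cherry petals drift"},
        };
        assert(assistant_prefill(msgs, true)  == "Cherry petals drift"); // default behaviour
        assert(assistant_prefill(msgs, false) == "");                    // --no-prefill-assistant
        return 0;
    }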
package/src/llama.cpp/common/arg.cpp (continued)

@@ -2891,7 +2910,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.simple_io = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),