@fugood/llama.node 0.3.7 → 0.3.8

This diff shows the contents of publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +2 -0
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/src/llama.cpp/.github/workflows/server.yml

@@ -112,9 +112,9 @@ jobs:
             -DGGML_OPENMP=OFF ;
           cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
 
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
+      - name: Build (sanitizers)
+        id: cmake_build_sanitizers
+        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
         run: |
           cmake -B build \
             -DGGML_NATIVE=OFF \
@@ -124,12 +124,31 @@ jobs:
             -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
           cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
 
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.sanitizer == '' }}
+        run: |
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DLLAMA_CURL=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
       - name: Tests
         id: server_integration_tests
+        if: ${{ matrix.sanitizer == '' }}
         run: |
           cd examples/server/tests
           ./tests.sh
 
+      - name: Tests (sanitizers)
+        id: server_integration_tests_sanitizers
+        if: ${{ matrix.sanitizer != '' }}
+        run: |
+          cd examples/server/tests
+          LLAMA_SANITIZE=1 ./tests.sh
+
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
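The sanitizer matrix entries now run the same integration suite with LLAMA_SANITIZE=1 set. A rough local reproduction of the new test step, assuming a server binary already built with one of the sanitizer configurations from the CMake changes below:

    cd examples/server/tests
    LLAMA_SANITIZE=1 ./tests.sh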
package/src/llama.cpp/CMakeLists.txt

@@ -83,11 +83,8 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
 
 # override ggml options
-set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
-set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
-set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
-set(GGML_ALL_WARNINGS       ${LLAMA_ALL_WARNINGS})
-set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})
+set(GGML_ALL_WARNINGS   ${LLAMA_ALL_WARNINGS})
+set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
 
 # change the default for these ggml options
 if (NOT DEFINED GGML_LLAMAFILE)
@@ -117,16 +114,62 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
 llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
 llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
 
+if (NOT MSVC)
+    if (LLAMA_SANITIZE_THREAD)
+        message(STATUS "Using -fsanitize=thread")
+
+        add_compile_options(-fsanitize=thread)
+        link_libraries     (-fsanitize=thread)
+    endif()
+
+    if (LLAMA_SANITIZE_ADDRESS)
+        message(STATUS "Using -fsanitize=address")
+
+        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+        link_libraries     (-fsanitize=address)
+    endif()
+
+    if (LLAMA_SANITIZE_UNDEFINED)
+        message(STATUS "Using -fsanitize=undefined")
+
+        add_compile_options(-fsanitize=undefined)
+        link_libraries     (-fsanitize=undefined)
+    endif()
+endif()
+
 #
-# build the library
+# 3rd-party
 #
 
 if (NOT TARGET ggml)
     add_subdirectory(ggml)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
+
+#
+# build the library
+#
+
 add_subdirectory(src)
 
+#
+# utils, programs, examples and tests
+#
+
+if (LLAMA_BUILD_COMMON)
+    add_subdirectory(common)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+    include(CTest)
+    add_subdirectory(tests)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+    add_subdirectory(pocs)
+endif()
+
 #
 # install
 #
@@ -200,21 +243,3 @@ configure_file(cmake/llama.pc.in
 
 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
         DESTINATION lib/pkgconfig)
-
-#
-# utils, programs, examples and tests
-#
-
-if (LLAMA_BUILD_COMMON)
-    add_subdirectory(common)
-endif()
-
-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    include(CTest)
-    add_subdirectory(tests)
-endif()
-
-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-    add_subdirectory(pocs)
-endif()
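With the sanitizer options now handled in the top-level CMakeLists.txt instead of being forwarded as GGML_SANITIZE_*, a sketch of a local AddressSanitizer build (mirroring the CI invocation above; the build directory and build type are arbitrary choices):

    cmake -B build -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_SANITIZE_ADDRESS=ON -DCMAKE_BUILD_TYPE=Debug
    cmake --build build --config Debug -j $(nproc) --target llama-server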
package/src/llama.cpp/common/arg.cpp

@@ -22,6 +22,11 @@ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example>
     return *this;
 }
 
+common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
+    this->excludes = std::move(excludes);
+    return *this;
+}
+
 common_arg & common_arg::set_env(const char * env) {
     help = help + "\n(env: " + env + ")";
     this->env = env;
@@ -37,6 +42,10 @@ bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }
 
+bool common_arg::is_exclude(enum llama_example ex) {
+    return excludes.find(ex) != excludes.end();
+}
+
 bool common_arg::get_value_from_env(std::string & output) {
     if (env == nullptr) return false;
     char * value = std::getenv(env);
@@ -121,17 +130,26 @@ std::string common_arg::to_string() {
 
 static void common_params_handle_model_default(
         std::string & model,
-        std::string & model_url,
+        const std::string & model_url,
         std::string & hf_repo,
-        std::string & hf_file) {
+        std::string & hf_file,
+        const std::string & hf_token) {
     if (!hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (hf_file.empty()) {
             if (model.empty()) {
-                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
+                auto auto_detected = common_get_hf_file(hf_repo, hf_token);
+                if (auto_detected.first.empty() || auto_detected.second.empty()) {
+                    exit(1); // built without CURL, error message already printed
+                }
+                hf_repo = auto_detected.first;
+                hf_file = auto_detected.second;
+            } else {
+                hf_file = model;
             }
-            hf_file = model;
-        } else if (model.empty()) {
+        }
+        // make sure model path is present (for caching purposes)
+        if (model.empty()) {
             // this is to avoid different repo having same file name, or same file name in different subdirs
             std::string filename = hf_repo + "_" + hf_file;
             // to make sure we don't have any slashes in the filename
@@ -281,8 +299,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     }
 
     // TODO: refactor model params in a common struct
-    common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
-    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
+    common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token);
+    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token);
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -358,6 +376,30 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
     return devices;
 }
 
+static void add_rpc_devices(std::string servers) {
+    auto rpc_servers = string_split<std::string>(servers, ',');
+    if (rpc_servers.empty()) {
+        throw std::invalid_argument("no RPC servers specified");
+    }
+    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (!rpc_reg) {
+        throw std::invalid_argument("failed to find RPC backend");
+    }
+    typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+    ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+    if (!ggml_backend_rpc_add_device_fn) {
+        throw std::invalid_argument("failed to find RPC device add function");
+    }
+    for (const auto & server : rpc_servers) {
+        ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+        if (dev) {
+            ggml_backend_device_register(dev);
+        } else {
+            throw std::invalid_argument("failed to register RPC device");
+        }
+    }
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -420,7 +462,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
      * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
      */
     auto add_opt = [&](common_arg arg) {
-        if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
+        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };
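In effect, an option is now registered for an example only if it is listed for that example (or for LLAMA_EXAMPLE_COMMON) and not explicitly excluded from it. A hypothetical session illustrating the exclusions added below (the flag spelling and error text are illustrative, not taken from this diff):

    llama-cli -m model.gguf -p "Hello"       # --prompt remains a common arg for llama-cli
    llama-server -m model.gguf -p "Hello"    # now rejected: --prompt is excluded from the server's option table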
@@ -649,7 +691,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -673,7 +715,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.prompt.pop_back();
             }
         }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -700,7 +742,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.prompt = ss.str();
             fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
         }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-e", "--escape"},
         string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
@@ -759,15 +801,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
-        string_format(
-            "run in conversation mode:\n"
-            "- does not print special tokens and suffix/prefix\n"
-            "- interactive mode is also enabled\n"
-            "(default: %s)",
-            params.conversation ? "true" : "false"
-        ),
+        "run in conversation mode:\n"
+        "- does not print special tokens and suffix/prefix\n"
+        "- interactive mode is also enabled\n"
+        "(default: auto enabled if chat template is available)",
         [](common_params & params) {
-            params.conversation = true;
+            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-no-cnv", "--no-conversation"},
+        "force disable conversation mode (default: false)",
+        [](common_params & params) {
+            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
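Illustrative invocations for the new tri-state conversation mode (the model path is a placeholder): conversation mode is auto-enabled when the model provides a chat template, -cnv forces it on, and the new -no-cnv forces it off:

    llama-cli -m model.gguf           # auto: conversation mode if a chat template is available
    llama-cli -m model.gguf -cnv      # force conversation mode on
    llama-cli -m model.gguf -no-cnv   # force plain completion mode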
@@ -1363,7 +1409,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--rpc"}, "SERVERS",
         "comma separated list of RPC servers",
         [](common_params & params, const std::string & value) {
-            params.rpc_servers = value;
+            add_rpc_devices(value);
+            GGML_UNUSED(params);
         }
     ).set_env("LLAMA_ARG_RPC"));
 }
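--rpc now registers each endpoint as a ggml backend device at argument-parse time via add_rpc_devices() above, rather than storing the raw string in params.rpc_servers. A hypothetical invocation with made-up endpoints:

    llama-cli -m model.gguf --rpc 192.168.0.10:50052,192.168.0.11:50052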
@@ -1512,7 +1559,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0 });
+            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -1520,7 +1567,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale) });
+            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -1574,21 +1621,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     }
     ).set_env("LLAMA_ARG_MODEL_URL"));
     add_opt(common_arg(
-        {"-hfr", "--hf-repo"}, "REPO",
-        "Hugging Face model repository (default: unused)",
+        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
+        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+        "example: unsloth/phi-4-GGUF:q4_k_m\n"
+        "(default: unused)",
         [](common_params & params, const std::string & value) {
             params.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HF_REPO"));
     add_opt(common_arg(
         {"-hff", "--hf-file"}, "FILE",
-        "Hugging Face model file (default: unused)",
+        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
         [](common_params & params, const std::string & value) {
             params.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
     add_opt(common_arg(
-        {"-hfrv", "--hf-repo-v"}, "REPO",
+        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
         "Hugging Face model repository for the vocoder model (default: unused)",
         [](common_params & params, const std::string & value) {
             params.vocoder.hf_repo = value;
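Usage sketch for the extended -hf syntax (repository and tag taken from the help text above; per that text, the quant tag is optional and case-insensitive, defaulting to Q4_K_M):

    llama-cli -hf unsloth/phi-4-GGUF:q4_k_m
    llama-cli -hf unsloth/phi-4-GGUF          # same repo; quant defaults to Q4_K_M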
@@ -2205,6 +2254,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.vocoder.model = value;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--tts-use-guide-tokens"},
+        "Use guide tokens to improve TTS word recall",
+        [](common_params & params) {
+            params.vocoder.use_guide_tokens = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
 
     // model-specific
     add_opt(common_arg(
package/src/llama.cpp/common/arg.h

@@ -12,6 +12,7 @@
 
 struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::set<enum llama_example> excludes = {};
     std::vector<const char *> args;
     const char * value_hint = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
@@ -53,9 +54,11 @@ struct common_arg {
     ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
 
     common_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
     bool in_example(enum llama_example ex);
+    bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output);
     bool has_value_from_env();
     std::string to_string();