@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/examples/server/server.cpp

@@ -131,9 +131,9 @@ struct slot_params {
  lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
  }

- std::vector<std::string> grammar_trigger_words;
- for (const auto & trigger : sampling.grammar_trigger_words) {
- grammar_trigger_words.push_back(trigger.word);
+ auto grammar_triggers = json::array();
+ for (const auto & trigger : sampling.grammar_triggers) {
+ grammar_triggers.push_back(trigger.to_json<json>());
  }

  return json {
@@ -170,8 +170,8 @@ struct slot_params {
  {"n_probs", sampling.n_probs},
  {"min_keep", sampling.min_keep},
  {"grammar", sampling.grammar},
- {"grammar_trigger_words", grammar_trigger_words},
- {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
+ {"grammar_lazy", sampling.grammar_lazy},
+ {"grammar_triggers", grammar_triggers},
  {"preserved_tokens", sampling.preserved_tokens},
  {"chat_format", common_chat_format_name(oaicompat_chat_format)},
  {"samplers", samplers},
@@ -274,7 +274,7 @@ struct server_task {
  params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

  params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
- params.speculative.n_min = std::max(params.speculative.n_min, 2);
+ params.speculative.n_min = std::max(params.speculative.n_min, 0);
  params.speculative.n_max = std::max(params.speculative.n_max, 0);

  // Use OpenAI API logprobs only if n_probs wasn't provided
@@ -329,9 +329,6 @@ struct server_task {
  }

  // process "json_schema" and "grammar"
- if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
- throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
- }
  if (data.contains("json_schema") && !data.contains("grammar")) {
  try {
  auto schema = json_value(data, "json_schema", json::object());
@@ -359,24 +356,6 @@ struct server_task {
  }

  {
- const auto grammar_triggers = data.find("grammar_triggers");
- if (grammar_triggers != data.end()) {
- for (const auto & t : *grammar_triggers) {
- common_grammar_trigger trigger;
- trigger.word = t.at("word");
- trigger.at_start = t.at("at_start");
-
- auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
- if (ids.size() == 1) {
- SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
- params.sampling.grammar_trigger_tokens.push_back(ids[0]);
- params.sampling.preserved_tokens.insert(ids[0]);
- continue;
- }
- SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
- params.sampling.grammar_trigger_words.push_back(trigger);
- }
- }
  const auto preserved_tokens = data.find("preserved_tokens");
  if (preserved_tokens != data.end()) {
  for (const auto & t : *preserved_tokens) {
@@ -386,12 +365,39 @@ struct server_task {
  params.sampling.preserved_tokens.insert(ids[0]);
  } else {
  // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
- SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
+ SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
  }
  }
  }
- if (params.sampling.grammar_lazy) {
- GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
+ const auto grammar_triggers = data.find("grammar_triggers");
+ if (grammar_triggers != data.end()) {
+ for (const auto & t : *grammar_triggers) {
+ auto ct = common_grammar_trigger::from_json(t);
+ if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+ const auto & word = ct.value;
+ auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
+ if (ids.size() == 1) {
+ auto token = ids[0];
+ if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
+ throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
+ }
+ SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
+ common_grammar_trigger trigger;
+ trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
+ trigger.value = word;
+ trigger.token = token;
+ params.sampling.grammar_triggers.push_back(std::move(trigger));
+ } else {
+ SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
+ params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+ }
+ } else {
+ params.sampling.grammar_triggers.push_back(ct);
+ }
+ }
+ }
+ if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
+ throw std::runtime_error("Error: no triggers set for lazy grammar!");
  }
  }

@@ -745,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
  {"name", tc.name},
  {"arguments", tc.arguments},
  }},
- {"id", tc.id},
+ // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+ // We only generate a random id for the ones that don't generate one by themselves
+ // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+ {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
  });
  }
  message["tool_calls"] = tool_calls;
@@ -1307,7 +1316,7 @@ struct server_slot {
  return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
  }

- bool can_batch_with(server_slot & other_slot) {
+ bool can_batch_with(server_slot & other_slot) const {
  return is_non_causal() == other_slot.is_non_causal()
  && are_lora_equal(lora, other_slot.lora);
  }
@@ -1807,7 +1816,7 @@ struct server_context {
  // Necessary similarity of prompt for slot selection
  float slot_prompt_similarity = 0.0f;

- common_chat_templates chat_templates;
+ common_chat_templates_ptr chat_templates;

  ~server_context() {
  // Clear any sampling context
@@ -1863,6 +1872,10 @@ struct server_context {
  params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
  params_dft.n_parallel = 1;

+ // force F16 KV cache for the draft model for extra performance
+ params_dft.cache_type_k = GGML_TYPE_F16;
+ params_dft.cache_type_v = GGML_TYPE_F16;
+
  llama_init_dft = common_init_from_params(params_dft);

  model_dft = llama_init_dft.model.get();
@@ -1883,53 +1896,22 @@ struct server_context {
  cparams_dft = common_context_params_to_llama(params_dft);
  cparams_dft.n_batch = n_ctx_dft;

- // force F16 KV cache for the draft model for extra performance
- cparams_dft.type_k = GGML_TYPE_F16;
- cparams_dft.type_v = GGML_TYPE_F16;
-
  // the context is not needed - we will create one for each slot
  llama_init_dft.context.reset();
  }

- if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
+ chat_templates = common_chat_templates_init(model, params_base.chat_template);
+ try {
+ common_chat_format_example(chat_templates.get(), params.use_jinja);
+ } catch (const std::exception & e) {
+ SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
  SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
- chat_templates = common_chat_templates_from_model(model, "chatml");
- } else {
- chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+ chat_templates = common_chat_templates_init(model, "chatml");
  }
- GGML_ASSERT(chat_templates.template_default.get() != nullptr);

  return true;
  }

- bool validate_builtin_chat_template(bool use_jinja) const {
- llama_chat_message chat[] = {{"user", "test"}};
-
- if (use_jinja) {
- auto templates = common_chat_templates_from_model(model, "");
- common_chat_inputs inputs;
- inputs.messages = json::array({{
- {"role", "user"},
- {"content", "test"},
- }});
- GGML_ASSERT(templates.template_default);
- try {
- common_chat_params_init(*templates.template_default, inputs);
- if (templates.template_tool_use) {
- common_chat_params_init(*templates.template_tool_use, inputs);
- }
- return true;
- } catch (const std::exception & e) {
- SRV_ERR("failed to apply template: %s\n", e.what());
- return false;
- }
- } else {
- const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
- const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
- return chat_res > 0;
- }
- }
-
  void init() {
  const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;

@@ -2058,6 +2040,18 @@ struct server_context {
  return ret;
  }

+ bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+ for (const auto & token : tokens) {
+ if (token < 0 || token >= n_vocab) {
+ return false;
+ }
+ }
+ return true;
+ }
+
  bool launch_slot_with_task(server_slot & slot, const server_task & task) {
  slot.reset();
  slot.id_task = task.id;
@@ -2072,11 +2066,16 @@ struct server_context {
  slot.lora = task.params.lora;
  }

+ bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
+ if (!can_detokenize) {
+ send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
+ return false;
+ }
  SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

  if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
  // Might be better to reject the request with a 400 ?
- SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
+ SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
  slot.params.n_predict = slot.n_predict;
  }

@@ -2114,7 +2113,7 @@ struct server_context {
  SRV_DBG("%s", "clearing KV cache\n");

  // clear the entire KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);
  clean_kv_cache = false;
  }

@@ -2179,14 +2178,6 @@ struct server_context {
  }

  if (slot.has_new_line) {
- // if we have already seen a new line, we stop after a certain time limit
- if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
- slot.stop = STOP_TYPE_LIMIT;
- slot.has_next_token = false;
-
- SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
- }
-
  // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
  if (slot.params.n_indent > 0) {
  // check the current indentation
@@ -2225,6 +2216,14 @@ struct server_context {
  // check if there is a new line in the generated text
  if (result.text_to_send.find('\n') != std::string::npos) {
  slot.has_new_line = true;
+
+ // if we have seen a new line, we stop after a certain time limit, but only upon another new line
+ if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+ slot.stop = STOP_TYPE_LIMIT;
+ slot.has_next_token = false;
+
+ SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+ }
  }

  // if context shift is disabled, we stop when it reaches the context limit
@@ -2656,8 +2655,8 @@ struct server_context {
  res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
  res->t_start = metrics.t_start;

- res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
- res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx);
+ res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+ res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);

  res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
  res->t_prompt_processing_total = metrics.t_prompt_processing_total;
@@ -2773,7 +2772,7 @@ struct server_context {

  // Erase token cache
  const size_t n_erased = slot->cache_tokens.size();
- llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+ llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
  slot->cache_tokens.clear();

  auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2841,8 +2840,8 @@ struct server_context {

  SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

- llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
+ llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

  if (slot.params.cache_prompt) {
  for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -3033,8 +3032,8 @@ struct server_context {

  const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

- llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
- llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
+ llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+ llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

  for (size_t i = 0; i < n_match; i++) {
  slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3072,9 +3071,9 @@ struct server_context {
  }

  // keep only the common part
- if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+ if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
  // could not partially delete (likely using a non-Transformer model)
- llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+ llama_kv_self_seq_rm(ctx, slot.id, -1, -1);

  // there is no common part left
  slot.n_past = 0;
@@ -3314,7 +3313,7 @@ struct server_context {
  slot.cache_tokens.push_back(id);
  slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);

- llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+ llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);

  for (size_t i = 0; i < ids.size(); ++i) {
  completion_token_output result;
@@ -3822,13 +3821,15 @@ int main(int argc, char ** argv) {
  { "default_generation_settings", ctx_server.default_generation_settings_for_props },
  { "total_slots", ctx_server.params_base.n_parallel },
  { "model_path", ctx_server.params_base.model },
- { "chat_template", ctx_server.chat_templates.template_default->source() },
- { "bos_token", ctx_server.chat_templates.template_default->bos_token() },
- { "eos_token", ctx_server.chat_templates.template_default->eos_token() },
+ { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
+ { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
+ { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
  { "build_info", build_info },
  };
- if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
- data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
+ if (ctx_server.params_base.use_jinja) {
+ if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
+ data["chat_template_tool_use"] = tool_use_src;
+ }
  }

  res_ok(res, data);
@@ -4063,7 +4064,7 @@ int main(int argc, char ** argv) {
  }

  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+ json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());

  return handle_completions_impl(
  SERVER_TASK_TYPE_COMPLETION,
@@ -4076,7 +4077,7 @@ int main(int argc, char ** argv) {
  // same with handle_chat_completions, but without inference part
  const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+ json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
  res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
  };

@@ -4263,6 +4264,11 @@ int main(int argc, char ** argv) {
  // return;
  //}

+ // if true, use TEI API format, otherwise use Jina API format
+ // Jina: https://jina.ai/reranker/
+ // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
+ bool is_tei_format = body.contains("texts");
+
  json query;
  if (body.count("query") == 1) {
  query = body.at("query");
@@ -4275,7 +4281,8 @@ int main(int argc, char ** argv) {
  return;
  }

- std::vector<std::string> documents = json_value(body, "documents", std::vector<std::string>());
+ std::vector<std::string> documents = json_value(body, "documents",
+ json_value(body, "texts", std::vector<std::string>()));
  if (documents.empty()) {
  res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
  return;
@@ -4320,7 +4327,12 @@ int main(int argc, char ** argv) {
  }

  // write JSON response
- json root = format_response_rerank(body, responses);
+ json root = format_response_rerank(
+ body,
+ responses,
+ is_tei_format,
+ documents);
+
  res_ok(res, root);
  };

@@ -4482,8 +4494,8 @@ int main(int argc, char ** argv) {

  // print sample chat example to make it clear which template is used
  LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
- ctx_server.chat_templates.template_default->source().c_str(),
- common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
+ common_chat_templates_source(ctx_server.chat_templates.get()),
+ common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());

  ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
  ctx_server.process_single_task(task);
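
The rerank handler in the diff above now accepts a TEI-style request body (a "texts" array) in addition to the existing Jina-style "documents" array, and picks the response format from whichever field is present (is_tei_format). Below is a minimal client sketch in C++ using libcurl, assuming the server listens on localhost:8080 and exposes the handler at /v1/rerank; the host, port, and route are assumptions, not part of this diff.

// Sketch: TEI-style rerank request against the updated server handler.
// Sending "documents" instead of "texts" would select the Jina-style response.
#include <curl/curl.h>
#include <string>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) {
        return 1;
    }

    // "query" and "texts" match the fields read by the handler above.
    const std::string body = R"({
        "query": "what is a panda?",
        "texts": ["hi", "it's a bear", "the giant panda is a bear species endemic to China"]
    })";

    struct curl_slist * headers = nullptr;
    headers = curl_slist_append(headers, "Content-Type: application/json");

    curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:8080/v1/rerank"); // assumed route
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());

    // the JSON response is written to stdout by libcurl's default write handler
    const CURLcode res = curl_easy_perform(curl);

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return res == CURLE_OK ? 0 : 1;
}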