@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
@@ -42,7 +42,7 @@ enum stop_type {
  STOP_TYPE_LIMIT,
  };
 
- // state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
+ // state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
  enum slot_state {
  SLOT_STATE_IDLE,
  SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
@@ -131,9 +131,9 @@ struct slot_params {
  lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
  }
 
- std::vector<std::string> grammar_trigger_words;
- for (const auto & trigger : sampling.grammar_trigger_words) {
- grammar_trigger_words.push_back(trigger.word);
+ auto grammar_triggers = json::array();
+ for (const auto & trigger : sampling.grammar_triggers) {
+ grammar_triggers.push_back(trigger.to_json<json>());
  }
 
  return json {
@@ -170,9 +170,10 @@ struct slot_params {
  {"n_probs", sampling.n_probs},
  {"min_keep", sampling.min_keep},
  {"grammar", sampling.grammar},
- {"grammar_trigger_words", grammar_trigger_words},
- {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
+ {"grammar_lazy", sampling.grammar_lazy},
+ {"grammar_triggers", grammar_triggers},
  {"preserved_tokens", sampling.preserved_tokens},
+ {"chat_format", common_chat_format_name(oaicompat_chat_format)},
  {"samplers", samplers},
  {"speculative.n_max", speculative.n_max},
  {"speculative.n_min", speculative.n_min},
@@ -273,7 +274,7 @@ struct server_task {
  params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
 
  params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
- params.speculative.n_min = std::max(params.speculative.n_min, 2);
+ params.speculative.n_min = std::max(params.speculative.n_min, 0);
  params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
  // Use OpenAI API logprobs only if n_probs wasn't provided
@@ -328,69 +329,75 @@ struct server_task {
  }
 
  // process "json_schema" and "grammar"
- if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
- throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
- }
  if (data.contains("json_schema") && !data.contains("grammar")) {
  try {
  auto schema = json_value(data, "json_schema", json::object());
- LOG_DBG("JSON schema: %s\n", schema.dump(2).c_str());
+ SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
  params.sampling.grammar = json_schema_to_grammar(schema);
- LOG_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
+ SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
  } catch (const std::exception & e) {
  throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
  }
  } else {
  params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
- LOG_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
+ SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
  params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
- LOG_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
+ SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
  }
 
  {
  auto it = data.find("chat_format");
  if (it != data.end()) {
  params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
- LOG_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
+ SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
  } else {
  params.oaicompat_chat_format = defaults.oaicompat_chat_format;
  }
  }
 
  {
- const auto grammar_triggers = data.find("grammar_triggers");
- if (grammar_triggers != data.end()) {
- for (const auto & t : *grammar_triggers) {
- common_grammar_trigger trigger;
- trigger.word = t.at("word");
- trigger.at_start = t.at("at_start");
-
- auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
- if (ids.size() == 1) {
- LOG_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
- params.sampling.grammar_trigger_tokens.push_back(ids[0]);
- params.sampling.preserved_tokens.insert(ids[0]);
- continue;
- }
- LOG_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
- params.sampling.grammar_trigger_words.push_back(trigger);
- }
- }
  const auto preserved_tokens = data.find("preserved_tokens");
  if (preserved_tokens != data.end()) {
  for (const auto & t : *preserved_tokens) {
  auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
  if (ids.size() == 1) {
- LOG_DBG("Preserved token: %d\n", ids[0]);
+ SRV_DBG("Preserved token: %d\n", ids[0]);
  params.sampling.preserved_tokens.insert(ids[0]);
  } else {
  // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
- LOG_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
+ SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
  }
  }
  }
- if (params.sampling.grammar_lazy) {
- GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
+ const auto grammar_triggers = data.find("grammar_triggers");
+ if (grammar_triggers != data.end()) {
+ for (const auto & t : *grammar_triggers) {
+ auto ct = common_grammar_trigger::from_json(t);
+ if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+ const auto & word = ct.value;
+ auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
+ if (ids.size() == 1) {
+ auto token = ids[0];
+ if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
+ throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
+ }
+ SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
+ common_grammar_trigger trigger;
+ trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
+ trigger.value = word;
+ trigger.token = token;
+ params.sampling.grammar_triggers.push_back(std::move(trigger));
+ } else {
+ SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
+ params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+ }
+ } else {
+ params.sampling.grammar_triggers.push_back(ct);
+ }
+ }
+ }
+ if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
+ throw std::runtime_error("Error: no triggers set for lazy grammar!");
  }
  }
 
@@ -717,16 +724,26 @@ struct server_task_result_cmpl_final : server_task_result {
  std::string finish_reason = "length";
  common_chat_msg msg;
  if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
- LOG_DBG("Parsing chat message: %s\n", content.c_str());
+ SRV_DBG("Parsing chat message: %s\n", content.c_str());
  msg = common_chat_parse(content, oaicompat_chat_format);
  finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
  } else {
  msg.content = content;
  }
 
- json tool_calls;
+ json message {
+ {"role", "assistant"},
+ };
+ if (!msg.reasoning_content.empty()) {
+ message["reasoning_content"] = msg.reasoning_content;
+ }
+ if (msg.content.empty() && !msg.tool_calls.empty()) {
+ message["content"] = json();
+ } else {
+ message["content"] = msg.content;
+ }
  if (!msg.tool_calls.empty()) {
- tool_calls = json::array();
+ auto tool_calls = json::array();
  for (const auto & tc : msg.tool_calls) {
  tool_calls.push_back({
  {"type", "function"},
@@ -734,18 +751,13 @@ struct server_task_result_cmpl_final : server_task_result {
  {"name", tc.name},
  {"arguments", tc.arguments},
  }},
- {"id", tc.id},
+ // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+ // We only generate a random id for the ones that don't generate one by themselves
+ // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+ {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
  });
  }
- }
-
- json message {
- {"content", msg.content},
- {"tool_calls", tool_calls},
- {"role", "assistant"},
- };
- if (!msg.tool_plan.empty()) {
- message["tool_plan"] = msg.tool_plan;
+ message["tool_calls"] = tool_calls;
  }
 
  json choice {
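
Note on the two hunks above: the assistant message is now assembled field by field, with `content` set to null when the reply consists only of tool calls, an optional `reasoning_content`, and a tool call id that falls back to `gen_tool_call_id()` when the template did not produce one. Below is a rough sketch of the resulting object, built with the same nlohmann `json` alias the server uses; the tool name, arguments and id are made-up placeholders, not values from this diff.

    // Sketch only: approximate shape of the "message" object assembled above.
    // Assumes `using json = nlohmann::ordered_json;` as in server.cpp.
    json tool_calls = json::array();
    tool_calls.push_back({
        {"type", "function"},
        {"function", {
            {"name",      "get_weather"},                // placeholder tool name
            {"arguments", "{\"location\": \"Paris\"}"},  // arguments arrive as a JSON-encoded string
        }},
        {"id", "call_abc123"},  // template-provided id, or gen_tool_call_id() when the template emits none
    });
    json message {
        {"role",              "assistant"},
        {"content",           json()},     // null, since this reply is tool calls only
        {"reasoning_content", "..."},      // only added when the parsed message has one
        {"tool_calls",        tool_calls},
    };
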
@@ -1304,7 +1316,7 @@ struct server_slot {
  return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
  }
 
- bool can_batch_with(server_slot & other_slot) {
+ bool can_batch_with(server_slot & other_slot) const {
  return is_non_causal() == other_slot.is_non_causal()
  && are_lora_equal(lora, other_slot.lora);
  }
@@ -1600,6 +1612,10 @@ struct server_queue {
 
  while (true) {
  std::unique_lock<std::mutex> lock(mutex_tasks);
+ if (!running) {
+ QUE_DBG("%s", "terminate\n");
+ return;
+ }
  if (queue_tasks.empty()) {
  lock.unlock();
  break;
@@ -1620,11 +1636,11 @@ struct server_queue {
  QUE_DBG("%s", "waiting for new tasks\n");
  {
  std::unique_lock<std::mutex> lock(mutex_tasks);
+ if (!running) {
+ QUE_DBG("%s", "terminate\n");
+ return;
+ }
  if (queue_tasks.empty()) {
- if (!running) {
- QUE_DBG("%s", "terminate\n");
- return;
- }
  condition_tasks.wait(lock, [&]{
  return (!queue_tasks.empty() || !running);
  });
@@ -1800,7 +1816,7 @@ struct server_context {
  // Necessary similarity of prompt for slot selection
  float slot_prompt_similarity = 0.0f;
 
- common_chat_templates chat_templates;
+ common_chat_templates_ptr chat_templates;
 
  ~server_context() {
  // Clear any sampling context
@@ -1884,45 +1900,18 @@ struct server_context {
  llama_init_dft.context.reset();
  }
 
- if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
- LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
- chat_templates = common_chat_templates_from_model(model, "chatml");
- } else {
- chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+ chat_templates = common_chat_templates_init(model, params_base.chat_template);
+ try {
+ common_chat_format_example(chat_templates.get(), params.use_jinja);
+ } catch (const std::exception & e) {
+ SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
+ SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
+ chat_templates = common_chat_templates_init(model, "chatml");
  }
- GGML_ASSERT(chat_templates.template_default.get() != nullptr);
 
  return true;
  }
 
- bool validate_builtin_chat_template(bool use_jinja) const {
- llama_chat_message chat[] = {{"user", "test"}};
-
- if (use_jinja) {
- auto templates = common_chat_templates_from_model(model, "");
- common_chat_inputs inputs;
- inputs.messages = json::array({{
- {"role", "user"},
- {"content", "test"},
- }});
- GGML_ASSERT(templates.template_default);
- try {
- common_chat_params_init(*templates.template_default, inputs);
- if (templates.template_tool_use) {
- common_chat_params_init(*templates.template_tool_use, inputs);
- }
- return true;
- } catch (const std::exception & e) {
- SRV_ERR("failed to apply template: %s\n", e.what());
- return false;
- }
- } else {
- const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
- const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
- return chat_res > 0;
- }
- }
-
  void init() {
  const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
 
@@ -2069,8 +2058,8 @@ struct server_context {
 
  if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
  // Might be better to reject the request with a 400 ?
+ SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
  slot.params.n_predict = slot.n_predict;
- SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
  }
 
  if (slot.params.ignore_eos && has_eos_token) {
@@ -2172,14 +2161,6 @@ struct server_context {
  }
 
  if (slot.has_new_line) {
- // if we have already seen a new line, we stop after a certain time limit
- if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
- slot.stop = STOP_TYPE_LIMIT;
- slot.has_next_token = false;
-
- SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
- }
-
  // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
  if (slot.params.n_indent > 0) {
  // check the current indentation
@@ -2218,6 +2199,14 @@ struct server_context {
  // check if there is a new line in the generated text
  if (result.text_to_send.find('\n') != std::string::npos) {
  slot.has_new_line = true;
+
+ // if we have seen a new line, we stop after a certain time limit, but only upon another new line
+ if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+ slot.stop = STOP_TYPE_LIMIT;
+ slot.has_next_token = false;
+
+ SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+ }
  }
 
  // if context shift is disabled, we stop when it reaches the context limit
@@ -2275,7 +2264,7 @@ struct server_context {
  for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
  result.probs.push_back({
  cur_p->data[i].id,
- common_detokenize(ctx, {cur_p->data[i].id}, special),
+ common_token_to_piece(ctx, cur_p->data[i].id, special),
  cur_p->data[i].p
  });
  }
@@ -2297,7 +2286,7 @@ struct server_context {
  for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
  result.probs.push_back({
  cur[i].id,
- common_detokenize(ctx, {cur[i].id}, special),
+ common_token_to_piece(ctx, cur[i].id, special),
  cur[i].p
  });
  }
@@ -3027,7 +3016,7 @@ struct server_context {
  const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
  llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
- llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
+ llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
  for (size_t i = 0; i < n_match; i++) {
  slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3355,10 +3344,10 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
 
  // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
 
- LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+ SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
 
- LOG_DBG("request: %s\n", req.body.c_str());
- LOG_DBG("response: %s\n", res.body.c_str());
+ SRV_DBG("request: %s\n", req.body.c_str());
+ SRV_DBG("response: %s\n", res.body.c_str());
  }
 
  std::function<void(int)> shutdown_handler;
@@ -3649,7 +3638,7 @@ int main(int argc, char ** argv) {
  }, {
  {"name", "n_busy_slots_per_decode"},
  {"help", "Average number of busy slots per llama_decode() call"},
- {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total}
+ {"value", (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
  }}},
  {"gauge", {{
  {"name", "prompt_tokens_seconds"},
@@ -3815,13 +3804,15 @@ int main(int argc, char ** argv) {
  { "default_generation_settings", ctx_server.default_generation_settings_for_props },
  { "total_slots", ctx_server.params_base.n_parallel },
  { "model_path", ctx_server.params_base.model },
- { "chat_template", ctx_server.chat_templates.template_default->source() },
- { "bos_token", ctx_server.chat_templates.template_default->bos_token() },
- { "eos_token", ctx_server.chat_templates.template_default->eos_token() },
+ { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
+ { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
+ { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
  { "build_info", build_info },
  };
- if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
- data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
+ if (ctx_server.params_base.use_jinja) {
+ if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
+ data["chat_template_tool_use"] = tool_use_src;
+ }
  }
 
  res_ok(res, data);
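
With the change above, /props now sources `chat_template`, `bos_token` and `eos_token` through `common_chat_templates_source()` and `common_token_to_piece()` instead of the old per-template accessors. The sketch below gives an approximate response payload; all values are placeholders, and `chat_template_tool_use` only appears when the server runs with `--jinja` and the model ships a separate tool_use template.

    // Sketch of a /props response; values are placeholders, not real output.
    const char * example_props = R"json({
        "default_generation_settings": { "...": "..." },
        "total_slots": 4,
        "model_path": "models/model.gguf",
        "chat_template": "{%- for message in messages -%} ... {%- endfor -%}",
        "bos_token": "<s>",
        "eos_token": "</s>",
        "build_info": "bNNNN-xxxxxxx",
        "chat_template_tool_use": "..."
    })json";
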
@@ -3860,7 +3851,9 @@ int main(int argc, char ** argv) {
 
  try {
  const auto & prompt = data.at("prompt");
- LOG_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+ // TODO: this log can become very long, put it behind a flag or think about a more compact format
+ //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+
  std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
  tasks.reserve(tokenized_prompts.size());
  for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4054,7 +4047,7 @@ int main(int argc, char ** argv) {
  }
 
  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+ json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
 
  return handle_completions_impl(
  SERVER_TASK_TYPE_COMPLETION,
@@ -4067,7 +4060,7 @@ int main(int argc, char ** argv) {
  // same with handle_chat_completions, but without inference part
  const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+ json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
  res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
  };
 
@@ -4254,6 +4247,11 @@ int main(int argc, char ** argv) {
  // return;
  //}
 
+ // if true, use TEI API format, otherwise use Jina API format
+ // Jina: https://jina.ai/reranker/
+ // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
+ bool is_tei_format = body.contains("texts");
+
  json query;
  if (body.count("query") == 1) {
  query = body.at("query");
@@ -4266,7 +4264,8 @@ int main(int argc, char ** argv) {
  return;
  }
 
- std::vector<std::string> documents = json_value(body, "documents", std::vector<std::string>());
+ std::vector<std::string> documents = json_value(body, "documents",
+ json_value(body, "texts", std::vector<std::string>()));
  if (documents.empty()) {
  res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
  return;
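
As the hunk above shows, the rerank endpoint now reads the candidate passages from either the Jina-style `documents` field or the TEI-style `texts` field, and the `is_tei_format` flag from the earlier hunk is forwarded to `format_response_rerank` (next hunk) to pick the matching response layout. Minimal request sketches follow, using the same nlohmann `json` type as the server; the query and passage strings are placeholders.

    // Jina-style body (https://jina.ai/reranker/): passages under "documents".
    json jina_req {
        {"query",     "What is a panda?"},
        {"documents", {"hi", "it is a bear", "The giant panda is a bear species endemic to China."}},
    };
    // TEI-style body: same passages under "texts", which also selects the TEI response format.
    json tei_req {
        {"query", "What is a panda?"},
        {"texts", {"hi", "it is a bear", "The giant panda is a bear species endemic to China."}},
    };
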
@@ -4311,7 +4310,12 @@ int main(int argc, char ** argv) {
  }
 
  // write JSON response
- json root = format_response_rerank(body, responses);
+ json root = format_response_rerank(
+ body,
+ responses,
+ is_tei_format,
+ documents);
+
  res_ok(res, root);
  };
 
@@ -4376,6 +4380,9 @@ int main(int argc, char ** argv) {
  res.set_content("Error: gzip is not supported by this browser", "text/plain");
  } else {
  res.set_header("Content-Encoding", "gzip");
+ // COEP and COOP headers, required by pyodide (python interpreter)
+ res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+ res.set_header("Cross-Origin-Opener-Policy", "same-origin");
  res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
  }
  return false;
@@ -4425,6 +4432,7 @@ int main(int argc, char ** argv) {
 
  // clean up function, to be called before exit
  auto clean_up = [&svr]() {
+ SRV_INF("%s: cleaning up before exit...\n", __func__);
  svr->stop();
  llama_backend_free();
  };
@@ -4441,10 +4449,6 @@ int main(int argc, char ** argv) {
  }
 
  if (!was_bound) {
- //LOG_ERROR("couldn't bind HTTP server socket", {
- // {"hostname", params.hostname},
- // {"port", params.port},
- //});
  LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
  clean_up();
  return 1;
@@ -4461,7 +4465,7 @@ int main(int argc, char ** argv) {
 
  if (!ctx_server.load_model(params)) {
  clean_up();
- t.join();
+ // t.join(); // FIXME: see below
  LOG_ERR("%s: exiting due to model loading error\n", __func__);
  return 1;
  }
@@ -4473,8 +4477,8 @@ int main(int argc, char ** argv) {
 
  // print sample chat example to make it clear which template is used
  LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
- ctx_server.chat_templates.template_default->source().c_str(),
- common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
+ common_chat_templates_source(ctx_server.chat_templates.get()),
+ common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
 
  ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
  ctx_server.process_single_task(task);
@@ -4485,13 +4489,10 @@ int main(int argc, char ** argv) {
  });
 
  shutdown_handler = [&](int) {
+ // this will unblock start_loop()
  ctx_server.queue_tasks.terminate();
  };
 
- LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
-
- ctx_server.queue_tasks.start_loop();
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
  struct sigaction sigint_action;
  sigint_action.sa_handler = signal_handler;
@@ -4506,8 +4507,13 @@ int main(int argc, char ** argv) {
  SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
+ LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+ // this call blocks the main thread until queue_tasks.terminate() is called
+ ctx_server.queue_tasks.start_loop();
+
  clean_up();
- t.join();
+ // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
 
  return 0;
  }