@fugood/llama.node 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/examples/server/server.cpp
@@ -131,9 +131,9 @@ struct slot_params {
  lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
  }

- std::vector<std::string> grammar_trigger_words;
- for (const auto & trigger : sampling.grammar_trigger_words) {
- grammar_trigger_words.push_back(trigger.word);
+ auto grammar_triggers = json::array();
+ for (const auto & trigger : sampling.grammar_triggers) {
+ grammar_triggers.push_back(trigger.to_json<json>());
  }

  return json {
@@ -170,8 +170,8 @@ struct slot_params {
  {"n_probs", sampling.n_probs},
  {"min_keep", sampling.min_keep},
  {"grammar", sampling.grammar},
- {"grammar_trigger_words", grammar_trigger_words},
- {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
+ {"grammar_lazy", sampling.grammar_lazy},
+ {"grammar_triggers", grammar_triggers},
  {"preserved_tokens", sampling.preserved_tokens},
  {"chat_format", common_chat_format_name(oaicompat_chat_format)},
  {"samplers", samplers},
@@ -274,7 +274,7 @@ struct server_task {
  params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

  params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
- params.speculative.n_min = std::max(params.speculative.n_min, 2);
+ params.speculative.n_min = std::max(params.speculative.n_min, 0);
  params.speculative.n_max = std::max(params.speculative.n_max, 0);

  // Use OpenAI API logprobs only if n_probs wasn't provided
@@ -329,9 +329,6 @@ struct server_task {
  }

  // process "json_schema" and "grammar"
- if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
- throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
- }
  if (data.contains("json_schema") && !data.contains("grammar")) {
  try {
  auto schema = json_value(data, "json_schema", json::object());
@@ -359,24 +356,6 @@ struct server_task {
  }

  {
- const auto grammar_triggers = data.find("grammar_triggers");
- if (grammar_triggers != data.end()) {
- for (const auto & t : *grammar_triggers) {
- common_grammar_trigger trigger;
- trigger.word = t.at("word");
- trigger.at_start = t.at("at_start");
-
- auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
- if (ids.size() == 1) {
- SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
- params.sampling.grammar_trigger_tokens.push_back(ids[0]);
- params.sampling.preserved_tokens.insert(ids[0]);
- continue;
- }
- SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
- params.sampling.grammar_trigger_words.push_back(trigger);
- }
- }
  const auto preserved_tokens = data.find("preserved_tokens");
  if (preserved_tokens != data.end()) {
  for (const auto & t : *preserved_tokens) {
@@ -386,12 +365,39 @@ struct server_task {
  params.sampling.preserved_tokens.insert(ids[0]);
  } else {
  // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
- SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
+ SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
+ }
+ }
+ }
+ const auto grammar_triggers = data.find("grammar_triggers");
+ if (grammar_triggers != data.end()) {
+ for (const auto & t : *grammar_triggers) {
+ auto ct = common_grammar_trigger::from_json(t);
+ if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+ const auto & word = ct.value;
+ auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
+ if (ids.size() == 1) {
+ auto token = ids[0];
+ if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
+ throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
+ }
+ SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
+ common_grammar_trigger trigger;
+ trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
+ trigger.value = word;
+ trigger.token = token;
+ params.sampling.grammar_triggers.push_back(std::move(trigger));
+ } else {
+ SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
+ params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+ }
+ } else {
+ params.sampling.grammar_triggers.push_back(ct);
  }
  }
  }
- if (params.sampling.grammar_lazy) {
- GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
+ if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
+ throw std::runtime_error("Error: no triggers set for lazy grammar!");
  }
  }

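The reworked handling above replaces the old word/at_start pair with typed common_grammar_trigger objects and requires that single-token trigger words are also preserved tokens. For reference, a minimal sketch of building the same two trigger kinds; it reuses only identifiers visible in this hunk (common_grammar_trigger, its type/value/token members, the WORD/TOKEN enum values, common_tokenize), while the include paths are assumptions:

    // Sketch, not a verbatim llama.cpp API reference; includes are assumed.
    #include "common.h"   // assumed home of common_grammar_trigger / common_tokenize
    #include "llama.h"
    #include <string>

    // Build either a token trigger (the word maps to exactly one vocab token)
    // or a plain word trigger, mirroring the server logic above.
    static common_grammar_trigger make_trigger(const llama_vocab * vocab, const std::string & word) {
        const auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
        common_grammar_trigger trigger;
        trigger.value = word;
        if (ids.size() == 1) {
            trigger.type  = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
            trigger.token = ids[0];
        } else {
            trigger.type  = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
        }
        return trigger;
    }
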
@@ -745,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
  {"name", tc.name},
  {"arguments", tc.arguments},
  }},
- {"id", tc.id},
+ // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+ // We only generate a random id for the ones that don't generate one by themselves
+ // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+ {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
  });
  }
  message["tool_calls"] = tool_calls;
@@ -1307,7 +1316,7 @@ struct server_slot {
  return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
  }

- bool can_batch_with(server_slot & other_slot) {
+ bool can_batch_with(server_slot & other_slot) const {
  return is_non_causal() == other_slot.is_non_causal()
  && are_lora_equal(lora, other_slot.lora);
  }
@@ -1807,7 +1816,7 @@ struct server_context {
  // Necessary similarity of prompt for slot selection
  float slot_prompt_similarity = 0.0f;

- common_chat_templates chat_templates;
+ common_chat_templates_ptr chat_templates;

  ~server_context() {
  // Clear any sampling context
@@ -1891,45 +1900,18 @@ struct server_context {
  llama_init_dft.context.reset();
  }

- if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
+ chat_templates = common_chat_templates_init(model, params_base.chat_template);
+ try {
+ common_chat_format_example(chat_templates.get(), params.use_jinja);
+ } catch (const std::exception & e) {
+ SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
  SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
- chat_templates = common_chat_templates_from_model(model, "chatml");
- } else {
- chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+ chat_templates = common_chat_templates_init(model, "chatml");
  }
- GGML_ASSERT(chat_templates.template_default.get() != nullptr);

  return true;
  }

- bool validate_builtin_chat_template(bool use_jinja) const {
- llama_chat_message chat[] = {{"user", "test"}};
-
- if (use_jinja) {
- auto templates = common_chat_templates_from_model(model, "");
- common_chat_inputs inputs;
- inputs.messages = json::array({{
- {"role", "user"},
- {"content", "test"},
- }});
- GGML_ASSERT(templates.template_default);
- try {
- common_chat_params_init(*templates.template_default, inputs);
- if (templates.template_tool_use) {
- common_chat_params_init(*templates.template_tool_use, inputs);
- }
- return true;
- } catch (const std::exception & e) {
- SRV_ERR("failed to apply template: %s\n", e.what());
- return false;
- }
- } else {
- const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
- const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
- return chat_res > 0;
- }
- }
-
  void init() {
  const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;

@@ -2076,7 +2058,7 @@ struct server_context {

  if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
  // Might be better to reject the request with a 400 ?
- SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
+ SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
  slot.params.n_predict = slot.n_predict;
  }

@@ -2179,14 +2161,6 @@ struct server_context {
  }

  if (slot.has_new_line) {
- // if we have already seen a new line, we stop after a certain time limit
- if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
- slot.stop = STOP_TYPE_LIMIT;
- slot.has_next_token = false;
-
- SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
- }
-
  // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
  if (slot.params.n_indent > 0) {
  // check the current indentation
@@ -2225,6 +2199,14 @@ struct server_context {
  // check if there is a new line in the generated text
  if (result.text_to_send.find('\n') != std::string::npos) {
  slot.has_new_line = true;
+
+ // if we have seen a new line, we stop after a certain time limit, but only upon another new line
+ if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+ slot.stop = STOP_TYPE_LIMIT;
+ slot.has_next_token = false;
+
+ SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+ }
  }

  // if context shift is disabled, we stop when it reaches the context limit
@@ -3034,7 +3016,7 @@ struct server_context {
  const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

  llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
- llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
+ llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

  for (size_t i = 0; i < n_match; i++) {
  slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
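The one-line change above narrows the KV-cache shift during prompt-cache reuse: instead of shifting everything from head_c to the end of the sequence, only the n_match reused cells are moved. A small self-contained illustration of the index arithmetic (the example values are made up):

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: the new call above shifts just the n_match reused cells
    // [head_c, head_c + n_match) by kv_shift = head_p - head_c, instead of every
    // cell from head_c to the end of the sequence as the old call did.
    static void print_kv_shift(int64_t head_p, int64_t head_c, int64_t n_match) {
        const int64_t kv_shift = head_p - head_c;
        std::printf("shift [%lld, %lld) by %lld -> reused tokens land at [%lld, %lld)\n",
                    (long long) head_c, (long long) (head_c + n_match), (long long) kv_shift,
                    (long long) head_p, (long long) (head_p + n_match));
    }

    int main() {
        // e.g. 8 cached tokens found at position 160 are reused at position 100
        print_kv_shift(/* head_p = */ 100, /* head_c = */ 160, /* n_match = */ 8);
        return 0;
    }
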
@@ -3822,13 +3804,15 @@ int main(int argc, char ** argv) {
  { "default_generation_settings", ctx_server.default_generation_settings_for_props },
  { "total_slots", ctx_server.params_base.n_parallel },
  { "model_path", ctx_server.params_base.model },
- { "chat_template", ctx_server.chat_templates.template_default->source() },
- { "bos_token", ctx_server.chat_templates.template_default->bos_token() },
- { "eos_token", ctx_server.chat_templates.template_default->eos_token() },
+ { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
+ { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
+ { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
  { "build_info", build_info },
  };
- if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
- data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
+ if (ctx_server.params_base.use_jinja) {
+ if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
+ data["chat_template_tool_use"] = tool_use_src;
+ }
  }

  res_ok(res, data);
@@ -4063,7 +4047,7 @@ int main(int argc, char ** argv) {
  }

  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+ json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());

  return handle_completions_impl(
  SERVER_TASK_TYPE_COMPLETION,
@@ -4076,7 +4060,7 @@ int main(int argc, char ** argv) {
  // same with handle_chat_completions, but without inference part
  const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+ json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
  res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
  };

@@ -4263,6 +4247,11 @@ int main(int argc, char ** argv) {
  // return;
  //}

+ // if true, use TEI API format, otherwise use Jina API format
+ // Jina: https://jina.ai/reranker/
+ // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
+ bool is_tei_format = body.contains("texts");
+
  json query;
  if (body.count("query") == 1) {
  query = body.at("query");
@@ -4275,7 +4264,8 @@ int main(int argc, char ** argv) {
  return;
  }

- std::vector<std::string> documents = json_value(body, "documents", std::vector<std::string>());
+ std::vector<std::string> documents = json_value(body, "documents",
+ json_value(body, "texts", std::vector<std::string>()));
  if (documents.empty()) {
  res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
  return;
@@ -4320,7 +4310,12 @@ int main(int argc, char ** argv) {
  }

  // write JSON response
- json root = format_response_rerank(body, responses);
+ json root = format_response_rerank(
+ body,
+ responses,
+ is_tei_format,
+ documents);
+
  res_ok(res, root);
  };

@@ -4482,8 +4477,8 @@ int main(int argc, char ** argv) {

  // print sample chat example to make it clear which template is used
  LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
- ctx_server.chat_templates.template_default->source().c_str(),
- common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
+ common_chat_templates_source(ctx_server.chat_templates.get()),
+ common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());

  ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
  ctx_server.process_single_task(task);
package/src/llama.cpp/examples/server/utils.hpp
@@ -7,14 +7,14 @@

  // increase max payload length to allow use of larger context size
  #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+ // disable Nagle's algorithm
+ #define CPPHTTPLIB_TCP_NODELAY true
  #include "httplib.h"

  // Change JSON_ASSERT from assert() to GGML_ASSERT:
  #define JSON_ASSERT GGML_ASSERT
  #include "json.hpp"
- #include "minja.hpp"
- #include "chat.hpp"
- #include "chat-template.hpp"
+ #include "chat.h"

  #include <random>
  #include <sstream>
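The new CPPHTTPLIB_TCP_NODELAY define asks cpp-httplib to disable Nagle's algorithm on its connections, which lowers latency for small writes such as streamed SSE token chunks. For orientation only, a plain POSIX sketch of what that option boils down to at the socket level (this is not cpp-httplib's internal code):

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    // Disable Nagle's algorithm on an already-created TCP socket.
    // Returns 0 on success, -1 on error (see errno).
    static int disable_nagle(int sockfd) {
        int flag = 1;
        return setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(flag));
    }
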
@@ -347,41 +347,6 @@ static llama_tokens format_infill(
  return embd_inp;
  }

- // Format given chat. If tmpl is empty, we take the template from model metadata
- inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
- std::vector<common_chat_msg> chat;
-
- for (size_t i = 0; i < messages.size(); ++i) {
- const auto & curr_msg = messages[i];
-
- std::string role = json_value(curr_msg, "role", std::string(""));
-
- std::string content;
- if (curr_msg.contains("content")) {
- if (curr_msg["content"].is_string()) {
- content = curr_msg["content"].get<std::string>();
- } else if (curr_msg["content"].is_array()) {
- for (const auto & part : curr_msg["content"]) {
- if (part.contains("text")) {
- content += "\n" + part["text"].get<std::string>();
- }
- }
- } else {
- throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
- }
- } else {
- throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
- }
-
- chat.push_back({role, content, /* tool_calls= */ {}});
- }
-
- const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
- LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
-
- return formatted_chat;
- }
-
  //
  // base64 utils (TODO: move to common in the future)
  //
@@ -470,6 +435,10 @@ static std::string gen_chatcmplid() {
  return "chatcmpl-" + random_string();
  }

+ static std::string gen_tool_call_id() {
+ return random_string();
+ }
+
  //
  // other common utils
  //
@@ -556,8 +525,13 @@ static json oaicompat_completion_params_parse(const json & body) {
  throw std::runtime_error("Only one completion choice is allowed");
  }

+ // Handle "echo" field
+ if (json_value(body, "echo", false)) {
+ throw std::runtime_error("Only no echo is supported");
+ }
+
  // Params supported by OAI but unsupported by llama.cpp
- static const std::vector<std::string> unsupported_params { "best_of", "echo", "suffix" };
+ static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
  for (const auto & param : unsupported_params) {
  if (body.contains(param)) {
  throw std::runtime_error("Unsupported param: " + param);
@@ -579,12 +553,9 @@ static json oaicompat_completion_params_parse(
  const json & body, /* openai api json semantics */
  bool use_jinja,
  common_reasoning_format reasoning_format,
- const common_chat_templates & chat_templates)
+ const struct common_chat_templates * tmpls)
  {
  json llama_params;
- const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
- ? *chat_templates.template_tool_use
- : *chat_templates.template_default;

  auto tools = json_value(body, "tools", json());
  auto stream = json_value(body, "stream", false);
@@ -610,62 +581,56 @@ static json oaicompat_completion_params_parse(
  llama_params["stop"] = json_value(body, "stop", json::array());
  }

+ auto json_schema = json_value(body, "json_schema", json());
+ auto grammar = json_value(body, "grammar", std::string());
+ if (!json_schema.is_null() && !grammar.empty()) {
+ throw std::runtime_error("Cannot use both json_schema and grammar");
+ }
+
  // Handle "response_format" field
  if (body.contains("response_format")) {
  json response_format = json_value(body, "response_format", json::object());
  std::string response_type = json_value(response_format, "type", std::string());
  if (response_type == "json_object") {
- llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+ json_schema = json_value(response_format, "schema", json::object());
  } else if (response_type == "json_schema") {
- json json_schema = json_value(response_format, "json_schema", json::object());
- llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
+ auto schema_wrapper = json_value(response_format, "json_schema", json::object());
+ json_schema = json_value(schema_wrapper, "schema", json::object());
  } else if (!response_type.empty() && response_type != "text") {
  throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
  }
  }

+ common_chat_templates_inputs inputs;
+ inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+ inputs.tools = common_chat_tools_parse_oaicompat(tools);
+ inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
+ inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
+ inputs.grammar = grammar;
+ inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+ inputs.use_jinja = use_jinja;
+ inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+ inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+ inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+ if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
+ throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+ }
+
  // Apply chat template to the list of messages
- if (use_jinja) {
- auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
- if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
- throw std::runtime_error("Invalid tool_choice: " + tool_choice);
- }
- if (tool_choice != "none" && llama_params.contains("grammar")) {
- throw std::runtime_error("Cannot use custom grammar constraints with tools.");
- }
- common_chat_inputs inputs;
- inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
- inputs.messages = body.at("messages");
- inputs.tools = tools;
- inputs.tool_choice = tool_choice;
- inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
- if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
- LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
- inputs.parallel_tool_calls = false;
- }
- inputs.stream = stream;
- // TODO: support mixing schema w/ tools beyond generic format.
- inputs.json_schema = json_value(llama_params, "json_schema", json());
- auto chat_params = common_chat_params_init(tmpl, inputs);
-
- llama_params["chat_format"] = static_cast<int>(chat_params.format);
- llama_params["prompt"] = chat_params.prompt;
- llama_params["grammar"] = chat_params.grammar;
- llama_params["grammar_lazy"] = chat_params.grammar_lazy;
- auto grammar_triggers = json::array();
- for (const auto & trigger : chat_params.grammar_triggers) {
- grammar_triggers.push_back({
- {"word", trigger.word},
- {"at_start", trigger.at_start},
- });
- }
- llama_params["grammar_triggers"] = grammar_triggers;
- llama_params["preserved_tokens"] = chat_params.preserved_tokens;
- for (const auto & stop : chat_params.additional_stops) {
- llama_params["stop"].push_back(stop);
- }
- } else {
- llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+ auto chat_params = common_chat_templates_apply(tmpls, inputs);
+
+ llama_params["chat_format"] = static_cast<int>(chat_params.format);
+ llama_params["prompt"] = chat_params.prompt;
+ llama_params["grammar"] = chat_params.grammar;
+ llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+ auto grammar_triggers = json::array();
+ for (const auto & trigger : chat_params.grammar_triggers) {
+ grammar_triggers.push_back(trigger.to_json<json>());
+ }
+ llama_params["grammar_triggers"] = grammar_triggers;
+ llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+ for (const auto & stop : chat_params.additional_stops) {
+ llama_params["stop"].push_back(stop);
  }

  // Handle "n" field
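The rewritten parser above folds the jinja and non-jinja paths into one and normalizes both a top-level "json_schema"/"grammar" and the OpenAI-style "response_format" into a single schema handed to common_chat_templates_apply. A hedged illustration of the two "response_format" shapes it accepts, built with nlohmann::json (the field names are taken from the hunk, the message content is invented):

    #include <nlohmann/json.hpp>
    using json = nlohmann::json;

    int main() {
        json messages = json::array();
        messages.push_back({{"role", "user"}, {"content", "List three colors as JSON"}});

        // "json_object": the schema (if any) sits directly under "schema"
        json body_object;
        body_object["messages"] = messages;
        body_object["response_format"] = {{"type", "json_object"}, {"schema", {{"type", "object"}}}};

        // "json_schema": the schema is wrapped one level deeper, under "json_schema.schema"
        json body_schema;
        body_schema["messages"] = messages;
        body_schema["response_format"] = {
            {"type", "json_schema"},
            {"json_schema", {{"schema", {{"type", "object"}}}}},
        };
        return 0;
    }
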
@@ -737,28 +702,50 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
  return res;
  }

- static json format_response_rerank(const json & request, const json & ranks) {
- json data = json::array();
- int32_t n_tokens = 0;
- int i = 0;
- for (const auto & rank : ranks) {
- data.push_back(json{
- {"index", i++},
- {"relevance_score", json_value(rank, "score", 0.0)},
- });
+ static json format_response_rerank(
+ const json & request,
+ const json & ranks,
+ bool is_tei_format,
+ std::vector<std::string> & texts) {
+ json res;
+ if (is_tei_format) {
+ // TEI response format
+ res = json::array();
+ bool return_text = json_value(request, "return_text", false);
+ for (const auto & rank : ranks) {
+ int index = json_value(rank, "index", 0);
+ json elem = json{
+ {"index", index},
+ {"score", json_value(rank, "score", 0.0)},
+ };
+ if (return_text) {
+ elem["text"] = std::move(texts[index]);
+ }
+ res.push_back(elem);
+ }
+ } else {
+ // Jina response format
+ json results = json::array();
+ int32_t n_tokens = 0;
+ for (const auto & rank : ranks) {
+ results.push_back(json{
+ {"index", json_value(rank, "index", 0)},
+ {"relevance_score", json_value(rank, "score", 0.0)},
+ });

- n_tokens += json_value(rank, "tokens_evaluated", 0);
- }
+ n_tokens += json_value(rank, "tokens_evaluated", 0);
+ }

- json res = json {
- {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", "list"},
- {"usage", json {
- {"prompt_tokens", n_tokens},
- {"total_tokens", n_tokens}
- }},
- {"results", data}
- };
+ res = json{
+ {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+ {"object", "list"},
+ {"usage", json{
+ {"prompt_tokens", n_tokens},
+ {"total_tokens", n_tokens}
+ }},
+ {"results", results}
+ };
+ }

  return res;
  }
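Together with the is_tei_format detection added in server.cpp, the rewrite above lets the rerank endpoint accept both request styles and answer in the matching shape. A hedged sketch of the two request bodies, built with nlohmann::json (the field names come from the hunks, the example query and documents are invented):

    #include <nlohmann/json.hpp>
    using json = nlohmann::json;

    int main() {
        // Jina-style request ("documents"); the reply uses the
        // {"results": [{"index", "relevance_score"}, ...], "usage": ...} wrapper.
        json jina_req;
        jina_req["query"] = "What is a panda?";
        jina_req["documents"] = {"hi", "The giant panda is a bear species endemic to China."};

        // TEI-style request ("texts", optional "return_text"); the reply is a bare
        // array of {"index", "score"[, "text"]} objects.
        json tei_req;
        tei_req["query"] = "What is a panda?";
        tei_req["texts"] = {"hi", "The giant panda is a bear species endemic to China."};
        tei_req["return_text"] = true;
        return 0;
    }
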
package/src/llama.cpp/examples/sycl/run-llama2.sh
@@ -3,7 +3,7 @@
  # MIT license
  # Copyright (C) 2024 Intel Corporation
  # SPDX-License-Identifier: MIT
-
+ export ONEAPI_DEVICE_SELECTOR="level_zero:0"
  source /opt/intel/oneapi/setvars.sh

  #export GGML_SYCL_DEBUG=1
@@ -13,7 +13,7 @@ source /opt/intel/oneapi/setvars.sh
  INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
  MODEL_FILE=models/llama-2-7b.Q4_0.gguf
  NGL=33
- CONEXT=8192
+ CONEXT=4096

  if [ $# -gt 0 ]; then
  GGML_SYCL_DEVICE=$1