@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -7,14 +7,14 @@
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+// disable Nagle's algorithm
+#define CPPHTTPLIB_TCP_NODELAY true
 #include "httplib.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
-#include "minja.hpp"
-#include "chat.hpp"
-#include "chat-template.hpp"
+#include "chat.h"
 
 #include <random>
 #include <sstream>
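Note on the new defines: CPPHTTPLIB_TCP_NODELAY disables Nagle's algorithm on the server sockets so small writes (for example streamed completion chunks) are sent immediately instead of being coalesced. Both macros are compile-time switches of cpp-httplib and therefore must appear before the single #include "httplib.h". A minimal standalone sketch of this pattern, not taken from the package (the tiny health-check server is purely illustrative):

    // Sketch: configure cpp-httplib through macros defined before the include.
    #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576  // allow large form payloads
    #define CPPHTTPLIB_TCP_NODELAY true                             // flush small writes immediately
    #include "httplib.h"

    int main() {
        httplib::Server svr;
        svr.Get("/health", [](const httplib::Request &, httplib::Response & res) {
            res.set_content("{\"status\":\"ok\"}", "application/json");
        });
        svr.listen("127.0.0.1", 8080);
        return 0;
    }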
@@ -347,41 +347,6 @@ static llama_tokens format_infill(
     return embd_inp;
 }
 
-// Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
-    std::vector<common_chat_msg> chat;
-
-    for (size_t i = 0; i < messages.size(); ++i) {
-        const auto & curr_msg = messages[i];
-
-        std::string role = json_value(curr_msg, "role", std::string(""));
-
-        std::string content;
-        if (curr_msg.contains("content")) {
-            if (curr_msg["content"].is_string()) {
-                content = curr_msg["content"].get<std::string>();
-            } else if (curr_msg["content"].is_array()) {
-                for (const auto & part : curr_msg["content"]) {
-                    if (part.contains("text")) {
-                        content += "\n" + part["text"].get<std::string>();
-                    }
-                }
-            } else {
-                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
-            }
-        } else {
-            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
-        }
-
-        chat.push_back({role, content, /* tool_calls= */ {}});
-    }
-
-    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
-    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
-
-    return formatted_chat;
-}
-
 //
 // base64 utils (TODO: move to common in the future)
 //
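The removed format_chat helper (manual role/content extraction plus common_chat_apply_template) is superseded by the common_chat_* helpers used later in this diff, in particular common_chat_msgs_parse_oaicompat. A hedged sketch of the replacement parse step; the std::vector<common_chat_msg> return type is an assumption on my part:

    // Sketch: OpenAI-style messages are now parsed by a shared helper instead of the
    // removed format_chat loop; the input shape matches what format_chat accepted.
    json messages = json::array({
        { {"role", "system"}, {"content", "You are a helpful assistant."} },
        { {"role", "user"},   {"content", json::array({ { {"type", "text"}, {"text", "Hi"} } })} },
    });
    std::vector<common_chat_msg> msgs = common_chat_msgs_parse_oaicompat(messages);  // assumed return type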
@@ -470,6 +435,10 @@ static std::string gen_chatcmplid() {
     return "chatcmpl-" + random_string();
 }
 
+static std::string gen_tool_call_id() {
+    return random_string();
+}
+
 //
 // other common utils
 //
@@ -556,8 +525,13 @@ static json oaicompat_completion_params_parse(const json & body) {
         throw std::runtime_error("Only one completion choice is allowed");
     }
 
+    // Handle "echo" field
+    if (json_value(body, "echo", false)) {
+        throw std::runtime_error("Only no echo is supported");
+    }
+
     // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "best_of", "echo", "suffix" };
+    static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
     for (const auto & param : unsupported_params) {
         if (body.contains(param)) {
             throw std::runtime_error("Unsupported param: " + param);
@@ -579,12 +553,9 @@ static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
     common_reasoning_format reasoning_format,
-    const common_chat_templates & chat_templates)
+    const struct common_chat_templates * tmpls)
 {
     json llama_params;
-    const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
-        ? *chat_templates.template_tool_use
-        : *chat_templates.template_default;
 
     auto tools = json_value(body, "tools", json());
     auto stream = json_value(body, "stream", false);
@@ -610,62 +581,58 @@ static json oaicompat_completion_params_parse(
         llama_params["stop"] = json_value(body, "stop", json::array());
     }
 
+    auto json_schema = json_value(body, "json_schema", json());
+    auto grammar = json_value(body, "grammar", std::string());
+    if (!json_schema.is_null() && !grammar.empty()) {
+        throw std::runtime_error("Cannot use both json_schema and grammar");
+    }
+
     // Handle "response_format" field
     if (body.contains("response_format")) {
         json response_format = json_value(body, "response_format", json::object());
         std::string response_type = json_value(response_format, "type", std::string());
         if (response_type == "json_object") {
-            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+            json_schema = json_value(response_format, "schema", json::object());
         } else if (response_type == "json_schema") {
-            json json_schema = json_value(response_format, "json_schema", json::object());
-            llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
+            auto schema_wrapper = json_value(response_format, "json_schema", json::object());
+            json_schema = json_value(schema_wrapper, "schema", json::object());
         } else if (!response_type.empty() && response_type != "text") {
             throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
         }
     }
 
+    common_chat_templates_inputs inputs;
+    inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+    inputs.tools = common_chat_tools_parse_oaicompat(tools);
+    inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
+    inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
+    inputs.grammar = grammar;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    inputs.use_jinja = use_jinja;
+    inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+    inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
+        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+    }
+
     // Apply chat template to the list of messages
-    if (use_jinja) {
-        auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
-        if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
-            throw std::runtime_error("Invalid tool_choice: " + tool_choice);
-        }
-        if (tool_choice != "none" && llama_params.contains("grammar")) {
-            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
-        }
-        common_chat_inputs inputs;
-        inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
-        inputs.messages = body.at("messages");
-        inputs.tools = tools;
-        inputs.tool_choice = tool_choice;
-        inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
-        if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
-            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
-            inputs.parallel_tool_calls = false;
-        }
-        inputs.stream = stream;
-        // TODO: support mixing schema w/ tools beyond generic format.
-        inputs.json_schema = json_value(llama_params, "json_schema", json());
-        auto chat_params = common_chat_params_init(tmpl, inputs);
+    auto chat_params = common_chat_templates_apply(tmpls, inputs);
 
-        llama_params["chat_format"] = static_cast<int>(chat_params.format);
-        llama_params["prompt"] = chat_params.prompt;
+    llama_params["chat_format"] = static_cast<int>(chat_params.format);
+    llama_params["prompt"] = chat_params.prompt;
+    if (!chat_params.grammar.empty()) {
         llama_params["grammar"] = chat_params.grammar;
-        llama_params["grammar_lazy"] = chat_params.grammar_lazy;
-        auto grammar_triggers = json::array();
-        for (const auto & trigger : chat_params.grammar_triggers) {
-            grammar_triggers.push_back({
-                {"word", trigger.word},
-                {"at_start", trigger.at_start},
-            });
-        }
-        llama_params["grammar_triggers"] = grammar_triggers;
-        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-        for (const auto & stop : chat_params.additional_stops) {
-            llama_params["stop"].push_back(stop);
-        }
-    } else {
-        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    }
+    llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+    auto grammar_triggers = json::array();
+    for (const auto & trigger : chat_params.grammar_triggers) {
+        grammar_triggers.push_back(trigger.to_json<json>());
+    }
+    llama_params["grammar_triggers"] = grammar_triggers;
+    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+    for (const auto & stop : chat_params.additional_stops) {
+        llama_params["stop"].push_back(stop);
    }
 
     // Handle "n" field
@@ -737,28 +704,50 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_response_rerank(const json & request, const json & ranks) {
-    json data = json::array();
-    int32_t n_tokens = 0;
-    int i = 0;
-    for (const auto & rank : ranks) {
-        data.push_back(json{
-            {"index", i++},
-            {"relevance_score", json_value(rank, "score", 0.0)},
-        });
+static json format_response_rerank(
+        const json & request,
+        const json & ranks,
+        bool is_tei_format,
+        std::vector<std::string> & texts) {
+    json res;
+    if (is_tei_format) {
+        // TEI response format
+        res = json::array();
+        bool return_text = json_value(request, "return_text", false);
+        for (const auto & rank : ranks) {
+            int index = json_value(rank, "index", 0);
+            json elem = json{
+                {"index", index},
+                {"score", json_value(rank, "score", 0.0)},
+            };
+            if (return_text) {
+                elem["text"] = std::move(texts[index]);
+            }
+            res.push_back(elem);
+        }
+    } else {
+        // Jina response format
+        json results = json::array();
+        int32_t n_tokens = 0;
+        for (const auto & rank : ranks) {
+            results.push_back(json{
+                {"index", json_value(rank, "index", 0)},
+                {"relevance_score", json_value(rank, "score", 0.0)},
+            });
 
-        n_tokens += json_value(rank, "tokens_evaluated", 0);
-    }
+            n_tokens += json_value(rank, "tokens_evaluated", 0);
+        }
 
-    json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-        {"object", "list"},
-        {"usage", json {
-            {"prompt_tokens", n_tokens},
-            {"total_tokens", n_tokens}
-        }},
-        {"results", data}
-    };
+        res = json{
+            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", "list"},
+            {"usage", json{
+                {"prompt_tokens", n_tokens},
+                {"total_tokens", n_tokens}
+            }},
+            {"results", results}
+        };
+    }
 
     return res;
 }
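format_response_rerank now produces either a TEI-style bare array of {index, score[, text]} objects or the previous Jina/OpenAI-style object with usage and relevance_score. A hedged usage sketch with made-up inputs; the json alias and helper come from this same utils.hpp:

    // Sketch: the two response shapes produced by the reworked helper (illustrative values).
    json request = { {"model", "my-reranker"}, {"return_text", true} };
    json ranks   = json::array({
        { {"index", 0}, {"score", 0.91}, {"tokens_evaluated", 12} },
        { {"index", 1}, {"score", 0.07}, {"tokens_evaluated", 15} },
    });
    std::vector<std::string> texts = { "first document", "second document" };

    // TEI shape:  [{"index":0,"score":0.91,"text":"first document"}, ...]
    // (with return_text the entries of `texts` are moved into the response)
    json tei  = format_response_rerank(request, ranks, /* is_tei_format= */ true,  texts);

    // Jina shape: {"model":..., "object":"list", "usage":{...}, "results":[{"index":0,"relevance_score":0.91}, ...]}
    json jina = format_response_rerank(request, ranks, /* is_tei_format= */ false, texts);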
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
        std::string response;
 
-        const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
+        const bool is_first = llama_kv_self_used_cells(ctx) == 0;
 
        // tokenize the prompt
        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
         while (true) {
            // check if we have enough space in the context to evaluate this batch
            int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            int n_ctx_used = llama_kv_self_used_cells(ctx);
            if (n_ctx_used + batch.n_tokens > n_ctx) {
                printf("\033[0m\n");
                fprintf(stderr, "context size exceeded\n");
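llama_get_kv_cache_used_cells is renamed to llama_kv_self_used_cells in this update; the context-space check itself is unchanged. A hedged sketch of the same guard factored into a helper (have_room is an illustrative name; the llama.cpp calls are the ones shown above):

    // Sketch: refuse to decode a batch that would overflow the context window.
    #include "llama.h"

    static bool have_room(llama_context * ctx, const llama_batch & batch) {
        const int n_ctx      = llama_n_ctx(ctx);
        const int n_ctx_used = llama_kv_self_used_cells(ctx);  // renamed API
        return n_ctx_used + batch.n_tokens <= n_ctx;
    }
    // usage: if (!have_room(ctx, batch)) { /* context size exceeded */ } else { llama_decode(ctx, batch); }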
@@ -331,11 +331,11 @@ int main(int argc, char ** argv) {
            }
 
            active_seqs.erase(s);
-            for(int i = 0; i < n_seq_dft; i++) {
+            for (int i = 0; i < n_seq_dft; i++) {
                if (i == s) {
                    continue;
                }
-                if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
+                if (drafts[i].active && drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
                    // synchronize active status for sequences with the same drafted token
                    drafts[i].active = drafts[i].active && accept;
                    if (!drafts[i].active) {
@@ -420,14 +420,14 @@ int main(int argc, char ** argv) {
        {
            LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
 
-            llama_kv_cache_seq_keep(ctx_dft, s_keep);
-            llama_kv_cache_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
-            llama_kv_cache_seq_keep(ctx_dft, 0);
+            llama_kv_self_seq_keep(ctx_dft, s_keep);
+            llama_kv_self_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
+            llama_kv_self_seq_keep(ctx_dft, 0);
 
-            llama_kv_cache_seq_rm  (ctx_tgt, s_keep, n_past_tgt, -1);
-            llama_kv_cache_seq_keep(ctx_tgt, s_keep);
-            llama_kv_cache_seq_cp  (ctx_tgt, s_keep, 0, -1, -1);
-            llama_kv_cache_seq_keep(ctx_tgt, 0);
+            llama_kv_self_seq_rm  (ctx_tgt, s_keep, n_past_tgt, -1);
+            llama_kv_self_seq_keep(ctx_tgt, s_keep);
+            llama_kv_self_seq_cp  (ctx_tgt, s_keep, 0, -1, -1);
+            llama_kv_self_seq_keep(ctx_tgt, 0);
        }
 
        for (int s = 0; s < n_seq_dft; ++s) {
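Every llama_kv_cache_seq_* call becomes llama_kv_self_seq_* with identical arguments. A hedged sketch of the sequence-consolidation pattern from the hunk above under the new names (keep_sequence is an illustrative wrapper; ctx, s_keep and n_past mirror the example's variables):

    // Sketch: keep one accepted draft sequence and mirror it back onto sequence 0.
    #include "llama.h"

    static void keep_sequence(llama_context * ctx, llama_seq_id s_keep, llama_pos n_past) {
        llama_kv_self_seq_rm  (ctx, s_keep, n_past, -1);  // drop the rejected tail of s_keep
        llama_kv_self_seq_keep(ctx, s_keep);              // discard all other sequences
        llama_kv_self_seq_cp  (ctx, s_keep, 0, -1, -1);   // copy s_keep onto sequence 0
        llama_kv_self_seq_keep(ctx, 0);                   // keep only sequence 0
    }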
@@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
        common_batch_clear(batch_dft);
        common_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
 
-        llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
+        llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
        // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
        llama_decode(ctx_dft, batch_dft);
 
@@ -503,8 +503,8 @@ int main(int argc, char ** argv) {
            if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
 
-                llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
-                llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
+                llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
+                llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
 
                // all previous tokens from this branch are now also part of the new branch
                for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -585,9 +585,9 @@ int main(int argc, char ** argv) {
 
        // evaluate the target model on the drafted tokens
        {
-            llama_kv_cache_seq_keep(ctx_tgt, 0);
+            llama_kv_self_seq_keep(ctx_tgt, 0);
            for (int s = 1; s < n_seq_dft; ++s) {
-                llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
+                llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
            }
 
            // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
        {
            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
 
-            llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+            llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
        }
 
        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
@@ -3,7 +3,7 @@
 # MIT license
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT
-
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
 source /opt/intel/oneapi/setvars.sh
 
 #export GGML_SYCL_DEBUG=1
@@ -13,7 +13,7 @@ source /opt/intel/oneapi/setvars.sh
 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
 MODEL_FILE=models/llama-2-7b.Q4_0.gguf
 NGL=33
-CONEXT=8192
+CONEXT=4096
 
 if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1