@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/examples/server/utils.hpp
@@ -7,14 +7,14 @@
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+// disable Nagle's algorithm
+#define CPPHTTPLIB_TCP_NODELAY true
 #include "httplib.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
-#include "minja.hpp"
-#include "chat.hpp"
-#include "chat-template.hpp"
+#include "chat.h"
 
 #include <random>
 #include <sstream>
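These cpp-httplib options are compile-time macros, so they only take effect when defined before httplib.h is included, exactly as the hunk above does. A minimal, illustrative sketch of the same pattern in a standalone program (the /health route, address, and port are invented for the example):

    // Illustrative only: the macros override cpp-httplib's built-in defaults,
    // so they must appear before the #include, as in the diff above.
    #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 // allow larger form payloads
    #define CPPHTTPLIB_TCP_NODELAY true                            // disable Nagle's algorithm
    #include "httplib.h"

    int main() {
        httplib::Server svr;
        svr.Get("/health", [](const httplib::Request &, httplib::Response & res) {
            res.set_content("ok", "text/plain");
        });
        svr.listen("127.0.0.1", 8080); // hypothetical address/port
    }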
@@ -347,41 +347,6 @@ static llama_tokens format_infill(
     return embd_inp;
 }
 
-// Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
-    std::vector<common_chat_msg> chat;
-
-    for (size_t i = 0; i < messages.size(); ++i) {
-        const auto & curr_msg = messages[i];
-
-        std::string role = json_value(curr_msg, "role", std::string(""));
-
-        std::string content;
-        if (curr_msg.contains("content")) {
-            if (curr_msg["content"].is_string()) {
-                content = curr_msg["content"].get<std::string>();
-            } else if (curr_msg["content"].is_array()) {
-                for (const auto & part : curr_msg["content"]) {
-                    if (part.contains("text")) {
-                        content += "\n" + part["text"].get<std::string>();
-                    }
-                }
-            } else {
-                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
-            }
-        } else {
-            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
-        }
-
-        chat.push_back({role, content, /* tool_calls= */ {}});
-    }
-
-    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
-    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
-
-    return formatted_chat;
-}
-
 //
 // base64 utils (TODO: move to common in the future)
 //
@@ -470,6 +435,10 @@ static std::string gen_chatcmplid() {
     return "chatcmpl-" + random_string();
 }
 
+static std::string gen_tool_call_id() {
+    return random_string();
+}
+
 //
 // other common utils
 //
@@ -556,8 +525,13 @@ static json oaicompat_completion_params_parse(const json & body) {
         throw std::runtime_error("Only one completion choice is allowed");
     }
 
+    // Handle "echo" field
+    if (json_value(body, "echo", false)) {
+        throw std::runtime_error("Only no echo is supported");
+    }
+
     // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "best_of", "echo", "suffix" };
+    static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
     for (const auto & param : unsupported_params) {
         if (body.contains(param)) {
             throw std::runtime_error("Unsupported param: " + param);
@@ -578,12 +552,10 @@
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
-    const common_chat_templates & chat_templates)
+    common_reasoning_format reasoning_format,
+    const struct common_chat_templates * tmpls)
 {
     json llama_params;
-    const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
-        ? *chat_templates.template_tool_use
-        : *chat_templates.template_default;
 
     auto tools = json_value(body, "tools", json());
     auto stream = json_value(body, "stream", false);
@@ -609,61 +581,56 @@
         llama_params["stop"] = json_value(body, "stop", json::array());
     }
 
+    auto json_schema = json_value(body, "json_schema", json());
+    auto grammar = json_value(body, "grammar", std::string());
+    if (!json_schema.is_null() && !grammar.empty()) {
+        throw std::runtime_error("Cannot use both json_schema and grammar");
+    }
+
     // Handle "response_format" field
     if (body.contains("response_format")) {
         json response_format = json_value(body, "response_format", json::object());
         std::string response_type = json_value(response_format, "type", std::string());
         if (response_type == "json_object") {
-            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+            json_schema = json_value(response_format, "schema", json::object());
         } else if (response_type == "json_schema") {
-            json json_schema = json_value(response_format, "json_schema", json::object());
-            llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
+            auto schema_wrapper = json_value(response_format, "json_schema", json::object());
+            json_schema = json_value(schema_wrapper, "schema", json::object());
         } else if (!response_type.empty() && response_type != "text") {
             throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
         }
     }
 
+    common_chat_templates_inputs inputs;
+    inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+    inputs.tools = common_chat_tools_parse_oaicompat(tools);
+    inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
+    inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
+    inputs.grammar = grammar;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    inputs.use_jinja = use_jinja;
+    inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+    inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
+        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+    }
+
     // Apply chat template to the list of messages
-    if (use_jinja) {
-        auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
-        if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
-            throw std::runtime_error("Invalid tool_choice: " + tool_choice);
-        }
-        if (tool_choice != "none" && llama_params.contains("grammar")) {
-            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
-        }
-        common_chat_inputs inputs;
-        inputs.messages = body.at("messages");
-        inputs.tools = tools;
-        inputs.tool_choice = tool_choice;
-        inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
-        if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
-            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
-            inputs.parallel_tool_calls = false;
-        }
-        inputs.stream = stream;
-        // TODO: support mixing schema w/ tools beyond generic format.
-        inputs.json_schema = json_value(llama_params, "json_schema", json());
-        auto chat_params = common_chat_params_init(tmpl, inputs);
-
-        llama_params["chat_format"] = static_cast<int>(chat_params.format);
-        llama_params["prompt"] = chat_params.prompt;
-        llama_params["grammar"] = chat_params.grammar;
-        llama_params["grammar_lazy"] = chat_params.grammar_lazy;
-        auto grammar_triggers = json::array();
-        for (const auto & trigger : chat_params.grammar_triggers) {
-            grammar_triggers.push_back({
-                {"word", trigger.word},
-                {"at_start", trigger.at_start},
-            });
-        }
-        llama_params["grammar_triggers"] = grammar_triggers;
-        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-        for (const auto & stop : chat_params.additional_stops) {
-            llama_params["stop"].push_back(stop);
-        }
-    } else {
-        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    auto chat_params = common_chat_templates_apply(tmpls, inputs);
+
+    llama_params["chat_format"] = static_cast<int>(chat_params.format);
+    llama_params["prompt"] = chat_params.prompt;
+    llama_params["grammar"] = chat_params.grammar;
+    llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+    auto grammar_triggers = json::array();
+    for (const auto & trigger : chat_params.grammar_triggers) {
+        grammar_triggers.push_back(trigger.to_json<json>());
+    }
+    llama_params["grammar_triggers"] = grammar_triggers;
+    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+    for (const auto & stop : chat_params.additional_stops) {
+        llama_params["stop"].push_back(stop);
     }
 
     // Handle "n" field
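The rewritten block above replaces the old use_jinja branch with the new common/chat.h API (common_chat_templates_inputs plus common_chat_templates_apply). A hedged sketch of how a caller might drive it, using only names visible in this diff; how tmpls is loaded and the exact member types are assumptions, not taken from the package:

    #include "chat.h"   // common_chat_templates_* (new in this release)
    #include "json.hpp"

    using json = nlohmann::ordered_json;

    // Build a prompt from an OpenAI-style request body; `tmpls` is assumed to have
    // been initialized elsewhere from the model's chat template.
    static std::string build_prompt(const struct common_chat_templates * tmpls, const json & body) {
        common_chat_templates_inputs inputs;
        inputs.messages              = common_chat_msgs_parse_oaicompat(body.at("messages"));
        inputs.tools                 = common_chat_tools_parse_oaicompat(body.contains("tools") ? body.at("tools") : json());
        inputs.tool_choice           = common_chat_tool_choice_parse_oaicompat(body.value("tool_choice", std::string("auto")));
        inputs.add_generation_prompt = true;
        inputs.use_jinja             = true;

        // One call replaces the former use_jinja / format_chat() split.
        auto chat_params = common_chat_templates_apply(tmpls, inputs);
        return chat_params.prompt;
    }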
@@ -735,28 +702,50 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_response_rerank(const json & request, const json & ranks) {
-    json data = json::array();
-    int32_t n_tokens = 0;
-    int i = 0;
-    for (const auto & rank : ranks) {
-        data.push_back(json{
-            {"index", i++},
-            {"relevance_score", json_value(rank, "score", 0.0)},
-        });
+static json format_response_rerank(
+    const json & request,
+    const json & ranks,
+    bool is_tei_format,
+    std::vector<std::string> & texts) {
+    json res;
+    if (is_tei_format) {
+        // TEI response format
+        res = json::array();
+        bool return_text = json_value(request, "return_text", false);
+        for (const auto & rank : ranks) {
+            int index = json_value(rank, "index", 0);
+            json elem = json{
+                {"index", index},
+                {"score", json_value(rank, "score", 0.0)},
+            };
+            if (return_text) {
+                elem["text"] = std::move(texts[index]);
+            }
+            res.push_back(elem);
+        }
+    } else {
+        // Jina response format
+        json results = json::array();
+        int32_t n_tokens = 0;
+        for (const auto & rank : ranks) {
+            results.push_back(json{
+                {"index", json_value(rank, "index", 0)},
+                {"relevance_score", json_value(rank, "score", 0.0)},
+            });
 
-        n_tokens += json_value(rank, "tokens_evaluated", 0);
-    }
+            n_tokens += json_value(rank, "tokens_evaluated", 0);
+        }
 
-    json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-        {"object", "list"},
-        {"usage", json {
-            {"prompt_tokens", n_tokens},
-            {"total_tokens", n_tokens}
-        }},
-        {"results", data}
-    };
+        res = json{
+            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", "list"},
+            {"usage", json{
+                {"prompt_tokens", n_tokens},
+                {"total_tokens", n_tokens}
+            }},
+            {"results", results}
+        };
+    }
 
     return res;
 }
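To make the two branches concrete, here is a hedged sketch of calling the reworked helper from within this header (the json alias and json_value() from utils.hpp are assumed to be in scope; the sample scores and texts are invented):

    static void demo_rerank_formats() {
        // Two documents with reranker scores, shaped like the server's internal results.
        json ranks = json::array({
            { {"index", 0}, {"score", 0.42}, {"tokens_evaluated", 17} },
            { {"index", 1}, {"score", 0.87}, {"tokens_evaluated", 21} },
        });
        std::vector<std::string> texts = { "first document", "second document" };

        // TEI shape: a bare array, e.g. [{"index":0,"score":0.42,"text":"first document"}, ...]
        json tei  = format_response_rerank(json{{"return_text", true}}, ranks, /* is_tei_format= */ true,  texts);

        // Jina shape: {"model":..., "object":"list", "usage":{"prompt_tokens":38, "total_tokens":38}, "results":[...]}
        json jina = format_response_rerank(json{{"model", "some-reranker"}}, ranks, /* is_tei_format= */ false, texts);
    }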
package/src/llama.cpp/examples/sycl/run-llama2.sh
@@ -3,7 +3,7 @@
 # MIT license
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT
-
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
 source /opt/intel/oneapi/setvars.sh
 
 #export GGML_SYCL_DEBUG=1
@@ -13,7 +13,7 @@ source /opt/intel/oneapi/setvars.sh
 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
 MODEL_FILE=models/llama-2-7b.Q4_0.gguf
 NGL=33
-CONEXT=8192
+CONEXT=4096
 
 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1