@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/examples/server/utils.hpp:

```diff
@@ -7,14 +7,14 @@
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+// disable Nagle's algorithm
+#define CPPHTTPLIB_TCP_NODELAY true
 #include "httplib.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
-#include "
-#include "chat.hpp"
-#include "chat-template.hpp"
+#include "chat.h"
 
 #include <random>
 #include <sstream>
```

```diff
@@ -347,41 +347,6 @@ static llama_tokens format_infill(
     return embd_inp;
 }
 
-// Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
-    std::vector<common_chat_msg> chat;
-
-    for (size_t i = 0; i < messages.size(); ++i) {
-        const auto & curr_msg = messages[i];
-
-        std::string role = json_value(curr_msg, "role", std::string(""));
-
-        std::string content;
-        if (curr_msg.contains("content")) {
-            if (curr_msg["content"].is_string()) {
-                content = curr_msg["content"].get<std::string>();
-            } else if (curr_msg["content"].is_array()) {
-                for (const auto & part : curr_msg["content"]) {
-                    if (part.contains("text")) {
-                        content += "\n" + part["text"].get<std::string>();
-                    }
-                }
-            } else {
-                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
-            }
-        } else {
-            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
-        }
-
-        chat.push_back({role, content, /* tool_calls= */ {}});
-    }
-
-    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
-    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
-
-    return formatted_chat;
-}
-
 //
 // base64 utils (TODO: move to common in the future)
 //
```

```diff
@@ -470,6 +435,10 @@ static std::string gen_chatcmplid() {
     return "chatcmpl-" + random_string();
 }
 
+static std::string gen_tool_call_id() {
+    return random_string();
+}
+
 //
 // other common utils
 //
```

```diff
@@ -556,8 +525,13 @@ static json oaicompat_completion_params_parse(const json & body) {
         throw std::runtime_error("Only one completion choice is allowed");
     }
 
+    // Handle "echo" field
+    if (json_value(body, "echo", false)) {
+        throw std::runtime_error("Only no echo is supported");
+    }
+
     // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "best_of", "
+    static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
     for (const auto & param : unsupported_params) {
         if (body.contains(param)) {
             throw std::runtime_error("Unsupported param: " + param);
```

```diff
@@ -578,12 +552,10 @@ static json oaicompat_completion_params_parse(
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
-
+    common_reasoning_format reasoning_format,
+    const struct common_chat_templates * tmpls)
 {
     json llama_params;
-    const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
-        ? *chat_templates.template_tool_use
-        : *chat_templates.template_default;
 
     auto tools = json_value(body, "tools", json());
     auto stream = json_value(body, "stream", false);
```

```diff
@@ -609,61 +581,56 @@
         llama_params["stop"] = json_value(body, "stop", json::array());
     }
 
+    auto json_schema = json_value(body, "json_schema", json());
+    auto grammar = json_value(body, "grammar", std::string());
+    if (!json_schema.is_null() && !grammar.empty()) {
+        throw std::runtime_error("Cannot use both json_schema and grammar");
+    }
+
     // Handle "response_format" field
     if (body.contains("response_format")) {
         json response_format = json_value(body, "response_format", json::object());
         std::string response_type = json_value(response_format, "type", std::string());
         if (response_type == "json_object") {
-
+            json_schema = json_value(response_format, "schema", json::object());
         } else if (response_type == "json_schema") {
-
-
+            auto schema_wrapper = json_value(response_format, "json_schema", json::object());
+            json_schema = json_value(schema_wrapper, "schema", json::object());
         } else if (!response_type.empty() && response_type != "text") {
             throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
         }
     }
 
+    common_chat_templates_inputs inputs;
+    inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+    inputs.tools = common_chat_tools_parse_oaicompat(tools);
+    inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
+    inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
+    inputs.grammar = grammar;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    inputs.use_jinja = use_jinja;
+    inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+    inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
+        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+    }
+
     // Apply chat template to the list of messages
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
-            inputs.parallel_tool_calls = false;
-        }
-        inputs.stream = stream;
-        // TODO: support mixing schema w/ tools beyond generic format.
-        inputs.json_schema = json_value(llama_params, "json_schema", json());
-        auto chat_params = common_chat_params_init(tmpl, inputs);
-
-        llama_params["chat_format"] = static_cast<int>(chat_params.format);
-        llama_params["prompt"] = chat_params.prompt;
-        llama_params["grammar"] = chat_params.grammar;
-        llama_params["grammar_lazy"] = chat_params.grammar_lazy;
-        auto grammar_triggers = json::array();
-        for (const auto & trigger : chat_params.grammar_triggers) {
-            grammar_triggers.push_back({
-                {"word", trigger.word},
-                {"at_start", trigger.at_start},
-            });
-        }
-        llama_params["grammar_triggers"] = grammar_triggers;
-        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-        for (const auto & stop : chat_params.additional_stops) {
-            llama_params["stop"].push_back(stop);
-        }
-    } else {
-        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    auto chat_params = common_chat_templates_apply(tmpls, inputs);
+
+    llama_params["chat_format"] = static_cast<int>(chat_params.format);
+    llama_params["prompt"] = chat_params.prompt;
+    llama_params["grammar"] = chat_params.grammar;
+    llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+    auto grammar_triggers = json::array();
+    for (const auto & trigger : chat_params.grammar_triggers) {
+        grammar_triggers.push_back(trigger.to_json<json>());
+    }
+    llama_params["grammar_triggers"] = grammar_triggers;
+    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+    for (const auto & stop : chat_params.additional_stops) {
+        llama_params["stop"].push_back(stop);
     }
 
     // Handle "n" field
```
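
For orientation, a minimal sketch of driving the refactored chat-template API that the hunk above switches to. Only names visible in this diff (`common_chat_msg`, `common_chat_templates_inputs`, `common_chat_templates_apply`, the `chat.h` header) are used; where the `common_chat_templates` pointer comes from is deliberately left abstract and is an assumption, not part of this package diff:

```cpp
// Sketch only: mirrors the new code path in oaicompat_completion_params_parse().
// Assumes it is compiled alongside llama.cpp's common/chat.h; `tmpls` is whatever
// templates object the caller already holds (e.g. loaded from model metadata).
#include <string>
#include <vector>
#include "chat.h"

std::string render_chat_prompt(const common_chat_templates * tmpls) {
    common_chat_msg user_msg;
    user_msg.role    = "user";
    user_msg.content = "Hello!";

    common_chat_templates_inputs inputs;
    inputs.messages              = { user_msg };
    inputs.add_generation_prompt = true;
    inputs.use_jinja             = true;

    // A single call now returns the rendered prompt together with the grammar,
    // lazy-grammar flag, triggers and format id that the server forwards above.
    auto chat_params = common_chat_templates_apply(tmpls, inputs);
    return chat_params.prompt;
}
```

Compared with the removed `format_chat()` / `common_chat_params_init()` paths, template selection and the jinja/non-jinja split appear to be handled inside `common_chat_templates_apply()` rather than in the server code.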

```diff
@@ -735,28 +702,50 @@ static json format_embeddings_response_oaicompat(
     return res;
 }
 
-static json format_response_rerank(
-
-
-
-
-
-
-
-
+static json format_response_rerank(
+    const json & request,
+    const json & ranks,
+    bool is_tei_format,
+    std::vector<std::string> & texts) {
+    json res;
+    if (is_tei_format) {
+        // TEI response format
+        res = json::array();
+        bool return_text = json_value(request, "return_text", false);
+        for (const auto & rank : ranks) {
+            int index = json_value(rank, "index", 0);
+            json elem = json{
+                {"index", index},
+                {"score", json_value(rank, "score", 0.0)},
+            };
+            if (return_text) {
+                elem["text"] = std::move(texts[index]);
+            }
+            res.push_back(elem);
+        }
+    } else {
+        // Jina response format
+        json results = json::array();
+        int32_t n_tokens = 0;
+        for (const auto & rank : ranks) {
+            results.push_back(json{
+                {"index", json_value(rank, "index", 0)},
+                {"relevance_score", json_value(rank, "score", 0.0)},
+            });
 
-
-
+            n_tokens += json_value(rank, "tokens_evaluated", 0);
+        }
 
-
-
-
-
-
-
-
-
-
+        res = json{
+            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", "list"},
+            {"usage", json{
+                {"prompt_tokens", n_tokens},
+                {"total_tokens", n_tokens}
+            }},
+            {"results", results}
+        };
+    }
 
     return res;
 }
```
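
To make the two output conventions in the rewritten `format_response_rerank()` concrete, a hedged sketch of what it returns for a small made-up `ranks` array. The field names are the ones visible in the hunk; the values, passage texts and the helper function wrapping the calls are invented for illustration and assume the surrounding utils.hpp context (`json`, `json_value`, `DEFAULT_OAICOMPAT_MODEL`):

```cpp
// Sketch only: exercises format_response_rerank() as reworked above.
static void rerank_response_shapes() {
    json ranks = json::array({
        json{ {"index", 0}, {"score", 0.92}, {"tokens_evaluated", 18} },
        json{ {"index", 1}, {"score", 0.13}, {"tokens_evaluated", 21} },
    });
    std::vector<std::string> texts = { "first passage", "second passage" };

    // TEI format: a bare JSON array; "return_text" echoes each input text back.
    json tei = format_response_rerank(json{ {"return_text", true} }, ranks,
                                      /*is_tei_format=*/true, texts);
    // -> [ {"index":0,"score":0.92,"text":"first passage"},
    //      {"index":1,"score":0.13,"text":"second passage"} ]

    // Jina format: an object with "results" plus a "usage" block that sums
    // each rank's "tokens_evaluated" (18 + 21 = 39 here).
    json jina = format_response_rerank(json::object(), ranks,
                                       /*is_tei_format=*/false, texts);
    // -> { "model": DEFAULT_OAICOMPAT_MODEL, "object": "list",
    //      "results": [ {"index":0,"relevance_score":0.92},
    //                   {"index":1,"relevance_score":0.13} ],
    //      "usage": { "prompt_tokens": 39, "total_tokens": 39 } }
    (void) tei; (void) jina;
}
```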

package/src/llama.cpp/examples/sycl/run-llama2.sh:

```diff
@@ -3,7 +3,7 @@
 # MIT license
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT
-
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
 source /opt/intel/oneapi/setvars.sh
 
 #export GGML_SYCL_DEBUG=1
```

```diff
@@ -13,7 +13,7 @@ source /opt/intel/oneapi/setvars.sh
 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
 MODEL_FILE=models/llama-2-7b.Q4_0.gguf
 NGL=33
-CONEXT=
+CONEXT=4096
 
 if [ $# -gt 0 ]; then
   GGML_SYCL_DEVICE=$1
```