@fugood/llama.node 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0

@@ -7,14 +7,14 @@
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+// disable Nagle's algorithm
+#define CPPHTTPLIB_TCP_NODELAY true
 #include "httplib.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
-#include "
-#include "chat.hpp"
-#include "chat-template.hpp"
+#include "chat.h"
 
 #include <random>
 #include <sstream>
@@ -347,41 +347,6 @@ static llama_tokens format_infill(
     return embd_inp;
 }
 
-// Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
-    std::vector<common_chat_msg> chat;
-
-    for (size_t i = 0; i < messages.size(); ++i) {
-        const auto & curr_msg = messages[i];
-
-        std::string role = json_value(curr_msg, "role", std::string(""));
-
-        std::string content;
-        if (curr_msg.contains("content")) {
-            if (curr_msg["content"].is_string()) {
-                content = curr_msg["content"].get<std::string>();
-            } else if (curr_msg["content"].is_array()) {
-                for (const auto & part : curr_msg["content"]) {
-                    if (part.contains("text")) {
-                        content += "\n" + part["text"].get<std::string>();
-                    }
-                }
-            } else {
-                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
-            }
-        } else {
-            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
-        }
-
-        chat.push_back({role, content, /* tool_calls= */ {}});
-    }
-
-    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
-    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
-
-    return formatted_chat;
-}
-
 //
 // base64 utils (TODO: move to common in the future)
 //
@@ -470,6 +435,10 @@ static std::string gen_chatcmplid() {
     return "chatcmpl-" + random_string();
 }
 
+static std::string gen_tool_call_id() {
+    return random_string();
+}
+
 //
 // other common utils
 //
@@ -556,8 +525,13 @@ static json oaicompat_completion_params_parse(const json & body) {
         throw std::runtime_error("Only one completion choice is allowed");
     }
 
+    // Handle "echo" field
+    if (json_value(body, "echo", false)) {
+        throw std::runtime_error("Only no echo is supported");
+    }
+
     // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "best_of", "
+    static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
     for (const auto & param : unsupported_params) {
         if (body.contains(param)) {
             throw std::runtime_error("Unsupported param: " + param);
@@ -579,12 +553,9 @@ static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
     common_reasoning_format reasoning_format,
-    const common_chat_templates
+    const struct common_chat_templates * tmpls)
 {
     json llama_params;
-    const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
-        ? *chat_templates.template_tool_use
-        : *chat_templates.template_default;
 
     auto tools = json_value(body, "tools", json());
     auto stream = json_value(body, "stream", false);
@@ -610,62 +581,58 @@ static json oaicompat_completion_params_parse(
         llama_params["stop"] = json_value(body, "stop", json::array());
     }
 
+    auto json_schema = json_value(body, "json_schema", json());
+    auto grammar = json_value(body, "grammar", std::string());
+    if (!json_schema.is_null() && !grammar.empty()) {
+        throw std::runtime_error("Cannot use both json_schema and grammar");
+    }
+
     // Handle "response_format" field
     if (body.contains("response_format")) {
         json response_format = json_value(body, "response_format", json::object());
         std::string response_type = json_value(response_format, "type", std::string());
         if (response_type == "json_object") {
-
+            json_schema = json_value(response_format, "schema", json::object());
         } else if (response_type == "json_schema") {
-
-
+            auto schema_wrapper = json_value(response_format, "json_schema", json::object());
+            json_schema = json_value(schema_wrapper, "schema", json::object());
         } else if (!response_type.empty() && response_type != "text") {
             throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
         }
     }
 
+    common_chat_templates_inputs inputs;
+    inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+    inputs.tools = common_chat_tools_parse_oaicompat(tools);
+    inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
+    inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
+    inputs.grammar = grammar;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    inputs.use_jinja = use_jinja;
+    inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+    inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
+        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+    }
+
     // Apply chat template to the list of messages
-
-    auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
-    if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
-        throw std::runtime_error("Invalid tool_choice: " + tool_choice);
-    }
-    if (tool_choice != "none" && llama_params.contains("grammar")) {
-        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
-    }
-    common_chat_inputs inputs;
-    inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
-    inputs.messages = body.at("messages");
-    inputs.tools = tools;
-    inputs.tool_choice = tool_choice;
-    inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
-    if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
-        LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
-        inputs.parallel_tool_calls = false;
-    }
-    inputs.stream = stream;
-    // TODO: support mixing schema w/ tools beyond generic format.
-    inputs.json_schema = json_value(llama_params, "json_schema", json());
-    auto chat_params = common_chat_params_init(tmpl, inputs);
+    auto chat_params = common_chat_templates_apply(tmpls, inputs);
 
-
-
+    llama_params["chat_format"] = static_cast<int>(chat_params.format);
+    llama_params["prompt"] = chat_params.prompt;
+    if (!chat_params.grammar.empty()) {
         llama_params["grammar"] = chat_params.grammar;
-
-
-
-
-
-
-
-
-
-        llama_params["
-        for (const auto & stop : chat_params.additional_stops) {
-            llama_params["stop"].push_back(stop);
-        }
-    } else {
-        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    }
+    llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+    auto grammar_triggers = json::array();
+    for (const auto & trigger : chat_params.grammar_triggers) {
+        grammar_triggers.push_back(trigger.to_json<json>());
+    }
+    llama_params["grammar_triggers"] = grammar_triggers;
+    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+    for (const auto & stop : chat_params.additional_stops) {
+        llama_params["stop"].push_back(stop);
     }
 
     // Handle "n" field
@@ -737,28 +704,50 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_response_rerank(
-
-
-
-
-
-
-
-
+static json format_response_rerank(
+    const json & request,
+    const json & ranks,
+    bool is_tei_format,
+    std::vector<std::string> & texts) {
+    json res;
+    if (is_tei_format) {
+        // TEI response format
+        res = json::array();
+        bool return_text = json_value(request, "return_text", false);
+        for (const auto & rank : ranks) {
+            int index = json_value(rank, "index", 0);
+            json elem = json{
+                {"index", index},
+                {"score", json_value(rank, "score", 0.0)},
+            };
+            if (return_text) {
+                elem["text"] = std::move(texts[index]);
+            }
+            res.push_back(elem);
+        }
+    } else {
+        // Jina response format
+        json results = json::array();
+        int32_t n_tokens = 0;
+        for (const auto & rank : ranks) {
+            results.push_back(json{
+                {"index", json_value(rank, "index", 0)},
+                {"relevance_score", json_value(rank, "score", 0.0)},
+            });
 
-
-
+            n_tokens += json_value(rank, "tokens_evaluated", 0);
+        }
 
-
-
-
-
-
-
-
-
-
+        res = json{
+            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", "list"},
+            {"usage", json{
+                {"prompt_tokens", n_tokens},
+                {"total_tokens", n_tokens}
+            }},
+            {"results", results}
+        };
+    }
 
     return res;
 }
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
         std::string response;
 
-        const bool is_first =
+        const bool is_first = llama_kv_self_used_cells(ctx) == 0;
 
         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
         while (true) {
             // check if we have enough space in the context to evaluate this batch
            int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used =
+            int n_ctx_used = llama_kv_self_used_cells(ctx);
             if (n_ctx_used + batch.n_tokens > n_ctx) {
                 printf("\033[0m\n");
                 fprintf(stderr, "context size exceeded\n");
@@ -331,11 +331,11 @@ int main(int argc, char ** argv) {
             }
 
             active_seqs.erase(s);
-            for(int i = 0; i < n_seq_dft; i++) {
+            for (int i = 0; i < n_seq_dft; i++) {
                 if (i == s) {
                     continue;
                 }
-                if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
+                if (drafts[i].active && drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
                     // synchronize active status for sequences with the same drafted token
                     drafts[i].active = drafts[i].active && accept;
                     if (!drafts[i].active) {
@@ -420,14 +420,14 @@ int main(int argc, char ** argv) {
         {
             LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
 
-
-
-
+            llama_kv_self_seq_keep(ctx_dft, s_keep);
+            llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
+            llama_kv_self_seq_keep(ctx_dft, 0);
 
-
-
-
-
+            llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
+            llama_kv_self_seq_keep(ctx_tgt, s_keep);
+            llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
+            llama_kv_self_seq_keep(ctx_tgt, 0);
         }
 
         for (int s = 0; s < n_seq_dft; ++s) {
@@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
             common_batch_clear(batch_dft);
             common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
 
-
+            llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
             // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
             llama_decode(ctx_dft, batch_dft);
 
@@ -503,8 +503,8 @@ int main(int argc, char ** argv) {
                 if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                     LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
 
-
-
+                    llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
+                    llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
 
                     // all previous tokens from this branch are now also part of the new branch
                     for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -585,9 +585,9 @@ int main(int argc, char ** argv) {
 
         // evaluate the target model on the drafted tokens
         {
-
+            llama_kv_self_seq_keep(ctx_tgt, 0);
             for (int s = 1; s < n_seq_dft; ++s) {
-
+                llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
             }
 
             // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
         {
            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
 
-
+            llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
         }
 
         if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
@@ -3,7 +3,7 @@
 # MIT license
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT
-
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
 source /opt/intel/oneapi/setvars.sh
 
 #export GGML_SYCL_DEBUG=1
@@ -13,7 +13,7 @@ source /opt/intel/oneapi/setvars.sh
 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
 MODEL_FILE=models/llama-2-7b.Q4_0.gguf
 NGL=33
-CONEXT=
+CONEXT=4096
 
 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1