@fugood/llama.node 0.0.1-alpha.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +42 -7
- package/README.md +10 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.js +1 -1
- package/lib/binding.ts +16 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/DetokenizeWorker.cpp +22 -0
- package/src/DetokenizeWorker.h +19 -0
- package/src/EmbeddingWorker.cpp +46 -0
- package/src/EmbeddingWorker.h +23 -0
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +80 -1
- package/src/LlamaContext.h +3 -0
- package/src/TokenizeWorker.cpp +26 -0
- package/src/TokenizeWorker.h +23 -0
- package/src/common.hpp +12 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190

package/src/llama.cpp/examples/server/server.cpp

@@ -12,6 +12,8 @@
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 
 // auto generated files (update with ./deps.sh)

@@ -854,12 +856,12 @@ struct server_context {
             slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
             slot.params.n_keep       = json_value(data, "n_keep",      slot.params.n_keep);
             slot.params.n_discard    = json_value(data, "n_discard",   default_params.n_discard);
-            slot.params.seed         = json_value(data, "seed",        default_params.seed);
+            slot.sparams.seed        = json_value(data, "seed",        default_sparams.seed);
             slot.sparams.n_probs     = json_value(data, "n_probs",     default_sparams.n_probs);
             slot.sparams.min_keep    = json_value(data, "min_keep",    default_sparams.min_keep);
 
             // process "json_schema" and "grammar"
-            if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
+            if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
                 send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             } else if (data.contains("json_schema") && !data.contains("grammar")) {

@@ -1028,7 +1030,6 @@
                     send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                     return false;
                 }
-                llama_set_rng_seed(ctx, slot.params.seed);
             }
 
             slot.command = SLOT_COMMAND_LOAD_PROMPT;

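Taken together with the hunk at line 854 above, this drops the context-wide `llama_set_rng_seed` call in favor of a seed stored in each slot's sampling parameters. A minimal sketch of why per-slot seeding matters (std::mt19937 is only a stand-in here for the sampler's RNG, not the package's actual code):

    #include <random>

    // Illustrative only: one RNG per slot makes each request reproducible
    // from its own seed, which a single shared RNG cannot guarantee once
    // parallel slots start interleaving their draws.
    std::mt19937 rng_slot0(42);   // request A: {"seed": 42}
    std::mt19937 rng_slot1(1337); // request B: {"seed": 1337}
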
@@ -1118,7 +1119,7 @@
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
         slot.sampled = result.tok;
 
         // search stop word and delete it

@@ -1208,6 +1209,27 @@
             LOG_VERBOSE("eos token found", {});
         }
 
+        auto n_ctx_train = llama_n_ctx_train(model);
+        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
+                && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+            LOG_WARNING("n_predict is not set and self-context extend is disabled."
+                        " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
+                { "id_slot",              slot.id },
+                { "params.n_predict",     slot.params.n_predict },
+                { "slot.n_prompt_tokens", slot.n_prompt_tokens },
+                { "slot.n_decoded",       slot.n_decoded },
+                { "slot.n_predict",       slot.n_predict },
+                { "n_slots",              params.n_parallel },
+                { "slot.n_ctx",           slot.n_ctx },
+                { "n_ctx",                n_ctx },
+                { "n_ctx_train",          n_ctx_train },
+                { "ga_n",                 slot.ga_n },
+            });
+            slot.truncated      = true;
+            slot.stopped_limit  = true;
+            slot.has_next_token = false; // stop prediction
+        }
+
         LOG_VERBOSE("next token", {
             {"id_slot",   slot.id},
             {"id_task",   slot.id_task},

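The added block caps generation at the model's training context when the client set no `n_predict` and self-extend is disabled (`ga_n == 1`), so a model that never emits EOS cannot loop forever. A condensed restatement of the guard's condition (names here are illustrative):

    // True when generation should be force-stopped at n_ctx_train.
    bool should_force_stop(int n_predict_req, int n_predict_model, int ga_n,
                           int n_prompt, int n_decoded, int n_ctx_train) {
        return n_predict_req < 1 && n_predict_model < 1 && ga_n == 1 &&
               n_prompt + n_decoded >= n_ctx_train;
    }
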
@@ -1363,9 +1385,10 @@
         if (!slot.params.stream && slot.stopped_word) {
             const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
 
+            size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
             probs = std::vector<completion_token_output>(
                 slot.generated_token_probs.begin(),
-                slot.generated_token_probs.end() - stop_word_toks.size());
+                slot.generated_token_probs.end() - safe_offset);
         } else {
             probs = std::vector<completion_token_output>(
                 slot.generated_token_probs.begin(),

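The clamp matters because subtracting more than `size()` from `end()` is undefined behavior: a stop word can tokenize to more tokens than were actually generated. A standalone sketch of the pattern (function name is illustrative):

    #include <algorithm>
    #include <vector>

    // Drop up to n trailing elements without ever walking past begin().
    template <typename T>
    std::vector<T> drop_tail(const std::vector<T> & v, size_t n) {
        const size_t safe = std::min(v.size(), n); // never underflows
        return std::vector<T>(v.begin(), v.end() - safe);
    }
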
@@ -1491,7 +1514,7 @@
         // add subtasks
         for (int i = 0; i < prompt_count; i++) {
             json subtask_data = multiprompt_task.data;
-            subtask_data["prompt"] = subtask_data["prompt"][i];
+            subtask_data["prompt"] = subtask_data.at("prompt")[i];
 
             // subtasks inherit everything else (infill mode, embedding mode, etc.)
             request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);

@@ -1511,7 +1534,7 @@
                 }
 
                 if (task.data.contains("system_prompt")) {
-                    system_prompt_set(task.data["system_prompt"]);
+                    system_prompt_set(task.data.at("system_prompt"));
 
                     for (server_slot & slot : slots) {
                         slot.n_past = 0;

@@ -1623,7 +1646,7 @@
                 } break;
             case SERVER_TASK_TYPE_SLOT_SAVE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);

@@ -1633,8 +1656,8 @@
                     const size_t token_count = slot->cache_tokens.size();
                     const int64_t t_start = ggml_time_us();
 
-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");
 
                     const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
 

@@ -1658,7 +1681,7 @@
                 } break;
             case SERVER_TASK_TYPE_SLOT_RESTORE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);

@@ -1667,8 +1690,8 @@
 
                     const int64_t t_start = ggml_time_us();
 
-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");
 
                     slot->cache_tokens.resize(slot->n_ctx);
                     size_t token_count = 0;

@@ -1700,7 +1723,7 @@
                 } break;
             case SERVER_TASK_TYPE_SLOT_ERASE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);

@@ -2142,7 +2165,7 @@
         });
 
         // process the created batch of tokens
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
            for (auto & slot : slots) {

@@ -2245,17 +2268,31 @@
                 llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
                 result.tok = id;
 
-                const int32_t n_probs = slot.sparams.n_probs;
-                if (slot.sparams.temp <= 0 && n_probs > 0) {
-                    // for llama_sample_token_greedy we need to sort candidates
-                    llama_sample_softmax(ctx, &cur_p);
-                }
+                const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
+                if (n_probs > 0) {
+                    const size_t n_considered = slot.ctx_sampling->n_considered;
 
-                for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
-                    result.probs.push_back({
-                        cur_p.data[i].id,
-                        cur_p.data[i].p
-                    });
+                    // Make sure at least n_probs top tokens are at the front of the vector:
+                    if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
+                        llama_sample_top_k(ctx, &cur_p, n_probs, 0);
+                    }
+
+                    if (slot.sparams.temp == 0.0f) {
+                        // With greedy sampling the probabilities have possibly not been calculated.
+                        for (size_t i = 0; i < n_probs; ++i) {
+                            result.probs.push_back({
+                                cur_p.data[i].id,
+                                i == 0 ? 1.0f : 0.0f
+                            });
+                        }
+                    } else {
+                        for (size_t i = 0; i < n_probs; ++i) {
+                            result.probs.push_back({
+                                cur_p.data[i].id,
+                                i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
+                            });
+                        }
+                    }
+                }
                 }
 
                 if (!process_token(result, slot)) {

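With temperature 0 the sampler takes the argmax without ever running softmax, so the previously reported probabilities could be meaningless; the new code reports 1.0 for the greedy pick, 0.0 for the rest, and 0.0 for anything a sampler such as top-k already filtered out. A condensed restatement of that rule (types and names are illustrative):

    #include <cstddef>

    struct token_prob { int id; float p; };

    token_prob reported(int id, size_t rank, float p, bool greedy, size_t n_considered) {
        if (greedy) {
            return { id, rank == 0 ? 1.0f : 0.0f };      // softmax never ran
        }
        return { id, rank >= n_considered ? 0.0f : p };  // filtered-out tokens get 0
    }
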
@@ -2333,7 +2370,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
         printf("                            disable KV offload\n");
     }
     printf("  -m FNAME, --model FNAME\n");
-    printf("                            model path (default: %s)\n", params.model.c_str());
+    printf("                            model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
     printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
     printf("                            model download url (default: unused)\n");
     printf("  -hfr REPO, --hf-repo REPO\n");

@@ -2357,6 +2394,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
     printf("  --embeddings              enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
     printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
     printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
+    printf("  -fa, --flash-attn         enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
     printf("  -spf FNAME, --system-prompt-file FNAME\n");
     printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
     printf("  -ctk TYPE, --cache-type-k TYPE\n");

@@ -2372,7 +2410,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
     printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                            types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
     printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
     printf("  --chat-template JINJA_TEMPLATE\n");

@@ -2722,6 +2760,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
         params.embedding = true;
     } else if (arg == "-cb" || arg == "--cont-batching") {
         params.cont_batching = true;
+    } else if (arg == "-fa" || arg == "--flash-attn") {
+        params.flash_attn = true;
     } else if (arg == "-np" || arg == "--parallel") {
         if (++i >= argc) {
             invalid_param = true;

@@ -2803,43 +2843,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
             invalid_param = true;
             break;
         }
-
-        if (sep == nullptr || sep - argv[i] >= 128) {
-            fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-            invalid_param = true;
-            break;
-        }
-
-        struct llama_model_kv_override kvo;
-        std::strncpy(kvo.key, argv[i], sep - argv[i]);
-        kvo.key[sep - argv[i]] = 0;
-        sep++;
-        if (strncmp(sep, "int:", 4) == 0) {
-            sep += 4;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.int_value = std::atol(sep);
-        } else if (strncmp(sep, "float:", 6) == 0) {
-            sep += 6;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-            kvo.float_value = std::atof(sep);
-        } else if (strncmp(sep, "bool:", 5) == 0) {
-            sep += 5;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-            if (std::strcmp(sep, "true") == 0) {
-                kvo.bool_value = true;
-            } else if (std::strcmp(sep, "false") == 0) {
-                kvo.bool_value = false;
-            } else {
-                fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-        } else {
+        if (!parse_kv_override(argv[i], params.kv_overrides)) {
             fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
             invalid_param = true;
             break;
         }
-        params.kv_overrides.push_back(kvo);
     } else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
         server_print_usage(argv[0], default_params, default_sparams);

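The open-coded `--override-kv` parser is replaced by a shared `parse_kv_override()` helper from common (which, per the updated usage text above, also understands `str` values). A hedged usage sketch; the signature here is inferred from the call site, not quoted from the package:

    // Assumed shape, inferred from the call site above:
    //   bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
    std::vector<llama_model_kv_override> overrides;
    if (!parse_kv_override("tokenizer.ggml.add_bos_token=bool:false", overrides)) {
        fprintf(stderr, "error: Invalid type for KV override\n");
    }
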
@@ -2847,6 +2855,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
         }
     }
 
+    gpt_params_handle_model_default(params);
+
     if (!params.kv_overrides.empty()) {
         params.kv_overrides.emplace_back();
         params.kv_overrides.back().key[0] = 0;

@@ -3128,8 +3138,8 @@ int main(int argc, char ** argv) {
                 server_task_result result = ctx_server.queue_results.recv(task.id);
                 ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-                const int n_idle_slots       = result.data["idle"];
-                const int n_processing_slots = result.data["processing"];
+                const int n_idle_slots       = result.data.at("idle");
+                const int n_processing_slots = result.data.at("processing");
 
                 json health = {
                     {"status", "ok"},

@@ -3139,7 +3149,7 @@ int main(int argc, char ** argv) {
 
                 res.status = 200; // HTTP OK
                 if (sparams.slots_endpoint && req.has_param("include_slots")) {
-                    health["slots"] = result.data["slots"];
+                    health["slots"] = result.data.at("slots");
                 }
 
                 if (n_idle_slots == 0) {

@@ -3183,7 +3193,7 @@ int main(int argc, char ** argv) {
         server_task_result result = ctx_server.queue_results.recv(task.id);
         ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-        res.set_content(result.data["slots"].dump(), "application/json");
+        res.set_content(result.data.at("slots").dump(), "application/json");
         res.status = 200; // HTTP OK
     };
 

@@ -3210,32 +3220,32 @@ int main(int argc, char ** argv) {
 
         json data = result.data;
 
-        const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
-        const uint64_t t_prompt_processing       = data["t_prompt_processing"];
+        const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
+        const uint64_t t_prompt_processing       = data.at("t_prompt_processing");
 
-        const uint64_t n_tokens_predicted  = data["n_tokens_predicted"];
-        const uint64_t t_tokens_generation = data["t_tokens_generation"];
+        const uint64_t n_tokens_predicted  = data.at("n_tokens_predicted");
+        const uint64_t t_tokens_generation = data.at("t_tokens_generation");
 
-        const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+        const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
 
         // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
         json all_metrics_def = json {
             {"counter", {{
                     {"name",  "prompt_tokens_total"},
                     {"help",  "Number of prompt tokens processed."},
-                    {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+                    {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
             }, {
                     {"name",  "prompt_seconds_total"},
                     {"help",  "Prompt process time"},
-                    {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+                    {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
             }, {
                     {"name",  "tokens_predicted_total"},
                     {"help",  "Number of generation tokens processed."},
-                    {"value", (uint64_t) data["n_tokens_predicted_total"]}
+                    {"value", (uint64_t) data.at("n_tokens_predicted_total")}
             }, {
                     {"name",  "tokens_predicted_seconds_total"},
                     {"help",  "Predict process time"},
-                    {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+                    {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
             }}},
             {"gauge", {{
                     {"name",  "prompt_tokens_seconds"},

@@ -3252,15 +3262,15 @@ int main(int argc, char ** argv) {
             },{
                     {"name",  "kv_cache_tokens"},
                     {"help",  "KV-cache tokens."},
-                    {"value", (uint64_t) data["kv_cache_tokens_count"]}
+                    {"value", (uint64_t) data.at("kv_cache_tokens_count")}
             },{
                     {"name",  "requests_processing"},
                     {"help",  "Number of request processing."},
-                    {"value", (uint64_t) data["processing"]}
+                    {"value", (uint64_t) data.at("processing")}
             },{
                     {"name",  "requests_deferred"},
                     {"help",  "Number of request deferred."},
-                    {"value", (uint64_t) data["deferred"]}
+                    {"value", (uint64_t) data.at("deferred")}
             }}}
         };
 

@@ -3271,8 +3281,8 @@ int main(int argc, char ** argv) {
             const auto & metrics_def = el.value();
 
             for (const auto & metric_def : metrics_def) {
-                const std::string name = metric_def["name"];
-                const std::string help = metric_def["help"];
+                const std::string name = metric_def.at("name");
+                const std::string help = metric_def.at("help");
 
                 auto value = json_value(metric_def, "value", 0.);
                 prometheus << "# HELP llamacpp:" << name << " " << help << "\n"

@@ -3281,7 +3291,7 @@ int main(int argc, char ** argv) {
             }
         }
 
-        const int64_t t_start = data["t_start"];
+        const int64_t t_start = data.at("t_start");
         res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
 
         res.set_content(prometheus.str(), "text/plain; version=0.0.4");

@@ -3290,7 +3300,7 @@ int main(int argc, char ** argv) {
 
     const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
         json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
         if (!validate_file_name(filename)) {
             res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
             return;

@@ -3320,7 +3330,7 @@ int main(int argc, char ** argv) {
 
     const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
         json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
         if (!validate_file_name(filename)) {
             res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
             return;

@@ -3639,7 +3649,8 @@ int main(int argc, char ** argv) {
 
         std::vector<llama_token> tokens;
         if (body.count("content") != 0) {
-            tokens = ctx_server.tokenize(body["content"], false);
+            const bool add_special = json_value(body, "add_special", false);
+            tokens = ctx_server.tokenize(body.at("content"), add_special);
         }
         const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json; charset=utf-8");

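The `/tokenize` endpoint now accepts an `add_special` flag (default `false`) controlling whether tokenizer special tokens such as BOS are added. A sketch of a request body, assuming the server's nlohmann JSON types:

    #include <nlohmann/json.hpp>

    // Hypothetical /tokenize request body using the new flag:
    nlohmann::json body = {
        {"content",     "Hello world"},
        {"add_special", true}, // also emit special tokens such as BOS
    };
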
@@ -3651,7 +3662,7 @@ int main(int argc, char ** argv) {
 
         std::string content;
         if (body.count("tokens") != 0) {
-            const std::vector<llama_token> tokens = body["tokens"];
+            const std::vector<llama_token> tokens = body.at("tokens");
             content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
         }
 

@@ -3674,10 +3685,10 @@ int main(int argc, char ** argv) {
         json prompt;
         if (body.count("input") != 0) {
             is_openai = true;
-            prompt = body["input"];
+            prompt = body.at("input");
         } else if (body.count("content") != 0) {
             // with "content", we only support single prompt
-            prompt = std::vector<std::string>{body["content"]};
+            prompt = std::vector<std::string>{body.at("content")};
         } else {
             res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
             return;

@@ -3696,7 +3707,7 @@ int main(int argc, char ** argv) {
         if (!result.error) {
             if (result.data.count("results")) {
                 // result for multi-task
-                responses = result.data["results"];
+                responses = result.data.at("results");
             } else {
                 // result for single task
                 responses = std::vector<json>{result.data};

package/src/llama.cpp/examples/server/utils.hpp

@@ -3,6 +3,8 @@
 #include "llama.h"
 #include "common.h"
 
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 
 #include <string>

@@ -49,18 +51,18 @@ extern bool server_log_json;
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
 
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
+static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
 
 template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value) {
+static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
-    if (body.contains(key) && !body.at(key).is_null()){
+    if (body.contains(key) && !body.at(key).is_null()) {
         try {
-            return body.value(key, default_value);
-        }
-        catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-            std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + json(default_value).type_name() + "', using default value.";
-            server_log("WARN", __func__, __LINE__, message.c_str(), body);
+            return body.at(key);
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
+            std::stringstream ss;
+            ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
+            LOG_WARNING(ss.str().c_str(), body);
             return default_value;
         }
     } else {

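`json_value` now reads the key with `body.at(key)` and converts to the requested type, logging a warning and falling back to the default on a type mismatch. A self-contained version of the same pattern, with the logging macro omitted (this is a sketch of the idiom, not the package's exact code):

    #include <nlohmann/json.hpp>
    #include <string>

    using json = nlohmann::json; // the server itself uses nlohmann::ordered_json

    template <typename T>
    static T json_value(const json & body, const std::string & key, const T & default_value) {
        if (!body.contains(key) || body.at(key).is_null()) {
            return default_value;
        }
        try {
            return body.at(key).get<T>();
        } catch (const json::type_error &) {
            return default_value; // wrong type supplied: warn (omitted) and fall back
        }
    }
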
@@ -68,16 +70,16 @@ static T json_value(const json &body, const std::string &key, const T &default_value) {
     }
 }
 
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
+static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
     std::stringstream ss_tid;
     ss_tid << std::this_thread::get_id();
-    json log = nlohmann::ordered_json{
+    json log = json{
         {"tid",       ss_tid.str()},
         {"timestamp", time(nullptr)},
     };
 
     if (server_log_json) {
-        log.merge_patch( {
+        log.merge_patch({
             {"level",    level},
             {"function", function},
             {"line",     line},

@@ -98,7 +100,7 @@ static inline void server_log(const char *level, const char *function, int line,
     }
     std::stringstream ss;
     ss << buf << " |";
-    for (const auto& el : log.items())
+    for (const auto & el : log.items())
     {
         const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
         ss << " " << el.key() << "=" << value;

@@ -373,11 +375,11 @@ static json oaicompat_completion_params_parse(
     llama_params["top_p"] = json_value(body, "top_p", 1.0);
 
     // Apply chat template to the list of messages
-    llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
+    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
 
     // Handle "stop" field
-    if (body.contains("stop") && body["stop"].is_string()) {
-        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
+    if (body.contains("stop") && body.at("stop").is_string()) {
+        llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
     } else {
         llama_params["stop"] = json_value(body, "stop", json::array());
     }

package/src/llama.cpp/ggml-backend.c

@@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
-    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+    if (!sched->is_reset) {
+        size_t hash_size = sched->hash_set.size;
+        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+        sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }

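`ggml_backend_sched_reset` now skips its large `memset`s when the scheduler is already in the reset state, making back-to-back resets cheap. A generic restatement of the idempotent-reset pattern (struct and names are illustrative, not ggml's):

    // Clear expensive state only when it is actually dirty.
    struct sched_state {
        bool is_reset = true; // nothing to clear until something is scheduled
        // ... large hash tables cleared on reset ...
    };

    void reset(sched_state & s) {
        if (!s.is_reset) {      // skip the memsets when already clean
            // memset(...) the tables here
            s.is_reset = true;
        }
    }
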