@fugood/llama.node 0.0.1-alpha.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/CMakeLists.txt +42 -7
  2. package/README.md +10 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/lib/binding.js +1 -1
  12. package/lib/binding.ts +16 -2
  13. package/lib/index.ts +2 -2
  14. package/package.json +15 -3
  15. package/src/DetokenizeWorker.cpp +22 -0
  16. package/src/DetokenizeWorker.h +19 -0
  17. package/src/EmbeddingWorker.cpp +46 -0
  18. package/src/EmbeddingWorker.h +23 -0
  19. package/src/LlamaCompletionWorker.cpp +5 -1
  20. package/src/LlamaCompletionWorker.h +4 -0
  21. package/src/LlamaContext.cpp +80 -1
  22. package/src/LlamaContext.h +3 -0
  23. package/src/TokenizeWorker.cpp +26 -0
  24. package/src/TokenizeWorker.h +23 -0
  25. package/src/common.hpp +12 -7
  26. package/src/llama.cpp/CMakeLists.txt +13 -7
  27. package/src/llama.cpp/common/common.cpp +221 -173
  28. package/src/llama.cpp/common/common.h +19 -8
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/log.h +2 -2
  31. package/src/llama.cpp/common/sampling.cpp +17 -1
  32. package/src/llama.cpp/common/sampling.h +28 -20
  33. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  36. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  37. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  39. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  40. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  41. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  42. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  43. package/src/llama.cpp/examples/main/main.cpp +10 -8
  44. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  45. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  47. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  48. package/src/llama.cpp/examples/server/server.cpp +97 -86
  49. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  50. package/src/llama.cpp/ggml-backend.c +7 -5
  51. package/src/llama.cpp/ggml-impl.h +339 -4
  52. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  53. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  54. package/src/llama.cpp/ggml-quants.c +302 -293
  55. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  56. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  57. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  58. package/src/llama.cpp/ggml.c +1469 -116
  59. package/src/llama.cpp/ggml.h +37 -7
  60. package/src/llama.cpp/llama.cpp +969 -432
  61. package/src/llama.cpp/llama.h +46 -14
  62. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  63. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  64. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  65. package/src/llama.cpp/requirements.txt +1 -0
  66. package/src/llama.cpp/sgemm.cpp +134 -103
  67. package/src/llama.cpp/sgemm.h +4 -2
  68. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  69. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  70. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  71. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  72. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  73. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  74. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  75. package/src/llama.cpp/unicode-data.cpp +1188 -656
  76. package/src/llama.cpp/unicode-data.h +4 -3
  77. package/src/llama.cpp/unicode.cpp +590 -49
  78. package/src/llama.cpp/unicode.h +6 -3
  79. package/bin/win32/arm64/llama-node.node +0 -0
  80. package/bin/win32/arm64/node.lib +0 -0
  81. package/bin/win32/x64/llama-node.node +0 -0
  82. package/bin/win32/x64/node.lib +0 -0
  83. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  84. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/src/llama.cpp/examples/server/server.cpp

@@ -12,6 +12,8 @@
  // increase max payload length to allow use of larger context size
  #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
  #include "httplib.h"
+ // Change JSON_ASSERT from assert() to GGML_ASSERT:
+ #define JSON_ASSERT GGML_ASSERT
  #include "json.hpp"

  // auto generated files (update with ./deps.sh)
@@ -854,12 +856,12 @@ struct server_context {
  slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
  slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
  slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
- slot.params.seed = json_value(data, "seed", default_params.seed);
+ slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
  slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
  slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);

  // process "json_schema" and "grammar"
- if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
+ if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
  send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
  return false;
  } else if (data.contains("json_schema") && !data.contains("grammar")) {
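Many of the server.cpp hunks that follow apply the same pattern: read access on nlohmann::json objects switches from operator[] to .at(). A small standalone illustration of why (this reflects the library's documented behavior, not code from the package): operator[] on a mutable json silently inserts a null for a missing key, and on a const json a missing key trips the library's internal assertion (now routed to GGML_ASSERT via the JSON_ASSERT define above), whereas .at() throws json::out_of_range, which the request handlers can surface as a normal invalid-request error.

    #include <nlohmann/json.hpp>
    using json = nlohmann::json;

    void at_vs_brackets() {
        json data = { {"grammar", "root ::= \"a\""} };

        // operator[] on a non-const json inserts null for a missing key,
        // so "was this field provided?" is silently lost.
        json maybe_schema = data["json_schema"]; // data now holds a null "json_schema"

        // .at() throws instead, keeping a missing key a detectable error.
        try {
            json schema = data.at("json_schema");
        } catch (const json::out_of_range &) {
            // server.cpp reports this class of failure as ERROR_TYPE_INVALID_REQUEST
        }
    }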
@@ -1028,7 +1030,6 @@ struct server_context {
  send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
  return false;
  }
- llama_set_rng_seed(ctx, slot.params.seed);
  }

  slot.command = SLOT_COMMAND_LOAD_PROMPT;
@@ -1118,7 +1119,7 @@ struct server_context {

  bool process_token(completion_token_output & result, server_slot & slot) {
  // remember which tokens were sampled - used for repetition penalties during sampling
- const std::string token_str = llama_token_to_piece(ctx, result.tok);
+ const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
  slot.sampled = result.tok;

  // search stop word and delete it
@@ -1208,6 +1209,27 @@ struct server_context {
  LOG_VERBOSE("eos token found", {});
  }

+ auto n_ctx_train = llama_n_ctx_train(model);
+ if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
+ && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+ LOG_WARNING("n_predict is not set and self-context extend is disabled."
+ " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
+ { "id_slot", slot.id },
+ { "params.n_predict", slot.params.n_predict },
+ { "slot.n_prompt_tokens", slot.n_prompt_tokens },
+ { "slot.n_decoded", slot.n_decoded },
+ { "slot.n_predict", slot.n_predict },
+ { "n_slots", params.n_parallel },
+ { "slot.n_ctx", slot.n_ctx },
+ { "n_ctx", n_ctx },
+ { "n_ctx_train", n_ctx_train },
+ { "ga_n", slot.ga_n },
+ });
+ slot.truncated = true;
+ slot.stopped_limit = true;
+ slot.has_next_token = false; // stop prediction
+ }
+
  LOG_VERBOSE("next token", {
  {"id_slot", slot.id},
  {"id_task", slot.id_task},
@@ -1363,9 +1385,10 @@ struct server_context {
  if (!slot.params.stream && slot.stopped_word) {
  const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);

+ size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
  probs = std::vector<completion_token_output>(
  slot.generated_token_probs.begin(),
- slot.generated_token_probs.end() - stop_word_toks.size());
+ slot.generated_token_probs.end() - safe_offset);
  } else {
  probs = std::vector<completion_token_output>(
  slot.generated_token_probs.begin(),
@@ -1491,7 +1514,7 @@ struct server_context {
  // add subtasks
  for (int i = 0; i < prompt_count; i++) {
  json subtask_data = multiprompt_task.data;
- subtask_data["prompt"] = subtask_data["prompt"][i];
+ subtask_data["prompt"] = subtask_data.at("prompt")[i];

  // subtasks inherit everything else (infill mode, embedding mode, etc.)
  request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@@ -1511,7 +1534,7 @@ struct server_context {
  }

  if (task.data.contains("system_prompt")) {
- system_prompt_set(task.data["system_prompt"]);
+ system_prompt_set(task.data.at("system_prompt"));

  for (server_slot & slot : slots) {
  slot.n_past = 0;
@@ -1623,7 +1646,7 @@ struct server_context {
  } break;
  case SERVER_TASK_TYPE_SLOT_SAVE:
  {
- int id_slot = task.data["id_slot"];
+ int id_slot = task.data.at("id_slot");
  server_slot * slot = get_slot(id_slot);
  if (slot == nullptr) {
  send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1633,8 +1656,8 @@ struct server_context {
  const size_t token_count = slot->cache_tokens.size();
  const int64_t t_start = ggml_time_us();

- std::string filename = task.data["filename"];
- std::string filepath = task.data["filepath"];
+ std::string filename = task.data.at("filename");
+ std::string filepath = task.data.at("filepath");

  const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);

@@ -1658,7 +1681,7 @@ struct server_context {
  } break;
  case SERVER_TASK_TYPE_SLOT_RESTORE:
  {
- int id_slot = task.data["id_slot"];
+ int id_slot = task.data.at("id_slot");
  server_slot * slot = get_slot(id_slot);
  if (slot == nullptr) {
  send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1667,8 +1690,8 @@ struct server_context {

  const int64_t t_start = ggml_time_us();

- std::string filename = task.data["filename"];
- std::string filepath = task.data["filepath"];
+ std::string filename = task.data.at("filename");
+ std::string filepath = task.data.at("filepath");

  slot->cache_tokens.resize(slot->n_ctx);
  size_t token_count = 0;
@@ -1700,7 +1723,7 @@ struct server_context {
  } break;
  case SERVER_TASK_TYPE_SLOT_ERASE:
  {
- int id_slot = task.data["id_slot"];
+ int id_slot = task.data.at("id_slot");
  server_slot * slot = get_slot(id_slot);
  if (slot == nullptr) {
  send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -2142,7 +2165,7 @@ struct server_context {
  });

  // process the created batch of tokens
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+ for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
  const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

  for (auto & slot : slots) {
@@ -2245,17 +2268,31 @@ struct server_context {
  llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
  result.tok = id;

- const int32_t n_probs = slot.sparams.n_probs;
- if (slot.sparams.temp <= 0 && n_probs > 0) {
- // for llama_sample_token_greedy we need to sort candidates
- llama_sample_softmax(ctx, &cur_p);
- }
+ const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
+ if (n_probs > 0) {
+ const size_t n_considered = slot.ctx_sampling->n_considered;

- for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
- result.probs.push_back({
- cur_p.data[i].id,
- cur_p.data[i].p
- });
+ // Make sure at least n_probs top tokens are at the front of the vector:
+ if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
+ llama_sample_top_k(ctx, &cur_p, n_probs, 0);
+ }
+
+ if (slot.sparams.temp == 0.0f) {
+ // With greedy sampling the probabilities have possibly not been calculated.
+ for (size_t i = 0; i < n_probs; ++i) {
+ result.probs.push_back({
+ cur_p.data[i].id,
+ i == 0 ? 1.0f : 0.0f
+ });
+ }
+ } else {
+ for (size_t i = 0; i < n_probs; ++i) {
+ result.probs.push_back({
+ cur_p.data[i].id,
+ i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
+ });
+ }
+ }
  }

  if (!process_token(result, slot)) {
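To restate the probability rule this hunk introduces outside of the server's own types (the sketch below uses hypothetical simplified structs, not llama_token_data_array or completion_token_output): with temperature 0 the sampler may not have computed probabilities at all, so the chosen top token is reported with p = 1.0 and the remaining entries with 0.0; with temperature > 0, any candidate at index n_considered or beyond was filtered out by a sampler such as top_k and is reported with p = 0.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct candidate  { int id; float p; };   // assumed already sorted best-first by the sampler
    struct token_prob { int id; float p; };   // stand-in for the entries pushed into result.probs

    std::vector<token_prob> report_probs(const std::vector<candidate> & cur,
                                         size_t n_probs, size_t n_considered, bool greedy) {
        std::vector<token_prob> out;
        const size_t n = std::min(cur.size(), n_probs);
        for (size_t i = 0; i < n; ++i) {
            const float p = greedy ? (i == 0 ? 1.0f : 0.0f)                  // greedy: probs may be unset
                                   : (i >= n_considered ? 0.0f : cur[i].p);  // filtered-out tokens get 0
            out.push_back({cur[i].id, p});
        }
        return out;
    }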
@@ -2333,7 +2370,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
  printf(" disable KV offload\n");
  }
  printf(" -m FNAME, --model FNAME\n");
- printf(" model path (default: %s)\n", params.model.c_str());
+ printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
  printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
  printf(" model download url (default: unused)\n");
  printf(" -hfr REPO, --hf-repo REPO\n");
@@ -2357,6 +2394,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
  printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
  printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
  printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
+ printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
  printf(" -spf FNAME, --system-prompt-file FNAME\n");
  printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
  printf(" -ctk TYPE, --cache-type-k TYPE\n");
@@ -2372,7 +2410,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
  printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
  printf(" --override-kv KEY=TYPE:VALUE\n");
  printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+ printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
  printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
  printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
  printf(" --chat-template JINJA_TEMPLATE\n");
@@ -2722,6 +2760,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
  params.embedding = true;
  } else if (arg == "-cb" || arg == "--cont-batching") {
  params.cont_batching = true;
+ } else if (arg == "-fa" || arg == "--flash-attn") {
+ params.flash_attn = true;
  } else if (arg == "-np" || arg == "--parallel") {
  if (++i >= argc) {
  invalid_param = true;
@@ -2803,43 +2843,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
  invalid_param = true;
  break;
  }
- char * sep = strchr(argv[i], '=');
- if (sep == nullptr || sep - argv[i] >= 128) {
- fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
-
- struct llama_model_kv_override kvo;
- std::strncpy(kvo.key, argv[i], sep - argv[i]);
- kvo.key[sep - argv[i]] = 0;
- sep++;
- if (strncmp(sep, "int:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
- kvo.int_value = std::atol(sep);
- } else if (strncmp(sep, "float:", 6) == 0) {
- sep += 6;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
- kvo.float_value = std::atof(sep);
- } else if (strncmp(sep, "bool:", 5) == 0) {
- sep += 5;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
- if (std::strcmp(sep, "true") == 0) {
- kvo.bool_value = true;
- } else if (std::strcmp(sep, "false") == 0) {
- kvo.bool_value = false;
- } else {
- fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
- } else {
+ if (!parse_kv_override(argv[i], params.kv_overrides)) {
  fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
  invalid_param = true;
  break;
  }
- params.kv_overrides.push_back(kvo);
  } else {
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
  server_print_usage(argv[0], default_params, default_sparams);
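The inline --override-kv parsing removed above is replaced by a call to a parse_kv_override() helper (presumably provided by the updated common.cpp/common.h, which also change in this release), and the usage text gains a str type. The sketch below shows the same key=type:value prefix parsing in isolation; it uses a simplified, hypothetical override struct rather than the real llama_model_kv_override, whose exact fields are not shown in this diff.

    #include <cstdlib>
    #include <cstring>
    #include <string>

    // Simplified stand-in for the override struct (field names are illustrative only).
    struct kv_override {
        enum { KV_INT, KV_FLOAT, KV_BOOL, KV_STR } tag;
        std::string key;
        std::string str_value;
        long        int_value   = 0;
        double      float_value = 0.0;
        bool        bool_value  = false;
    };

    // Parse "key=type:value", as the removed inline code did, plus the new str type.
    static bool parse_override(const char * arg, kv_override & kvo) {
        const char * sep = std::strchr(arg, '=');
        if (sep == nullptr) {
            return false; // malformed: no '=' separator
        }
        kvo.key.assign(arg, sep - arg);
        ++sep;
        if (std::strncmp(sep, "int:", 4) == 0) {
            kvo.tag = kv_override::KV_INT;   kvo.int_value   = std::atol(sep + 4);
        } else if (std::strncmp(sep, "float:", 6) == 0) {
            kvo.tag = kv_override::KV_FLOAT; kvo.float_value = std::atof(sep + 6);
        } else if (std::strncmp(sep, "bool:", 5) == 0) {
            // the real code additionally rejects values other than "true"/"false"
            kvo.tag = kv_override::KV_BOOL;  kvo.bool_value  = std::strcmp(sep + 5, "true") == 0;
        } else if (std::strncmp(sep, "str:", 4) == 0) {
            kvo.tag = kv_override::KV_STR;   kvo.str_value   = sep + 4;
        } else {
            return false; // unknown type prefix
        }
        return true;
    }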
@@ -2847,6 +2855,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
  }
  }

+ gpt_params_handle_model_default(params);
+
  if (!params.kv_overrides.empty()) {
  params.kv_overrides.emplace_back();
  params.kv_overrides.back().key[0] = 0;
@@ -3128,8 +3138,8 @@ int main(int argc, char ** argv) {
  server_task_result result = ctx_server.queue_results.recv(task.id);
  ctx_server.queue_results.remove_waiting_task_id(task.id);

- const int n_idle_slots = result.data["idle"];
- const int n_processing_slots = result.data["processing"];
+ const int n_idle_slots = result.data.at("idle");
+ const int n_processing_slots = result.data.at("processing");

  json health = {
  {"status", "ok"},
@@ -3139,7 +3149,7 @@ int main(int argc, char ** argv) {

  res.status = 200; // HTTP OK
  if (sparams.slots_endpoint && req.has_param("include_slots")) {
- health["slots"] = result.data["slots"];
+ health["slots"] = result.data.at("slots");
  }

  if (n_idle_slots == 0) {
@@ -3183,7 +3193,7 @@ int main(int argc, char ** argv) {
  server_task_result result = ctx_server.queue_results.recv(task.id);
  ctx_server.queue_results.remove_waiting_task_id(task.id);

- res.set_content(result.data["slots"].dump(), "application/json");
+ res.set_content(result.data.at("slots").dump(), "application/json");
  res.status = 200; // HTTP OK
  };

@@ -3210,32 +3220,32 @@ int main(int argc, char ** argv) {

  json data = result.data;

- const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
- const uint64_t t_prompt_processing = data["t_prompt_processing"];
+ const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
+ const uint64_t t_prompt_processing = data.at("t_prompt_processing");

- const uint64_t n_tokens_predicted = data["n_tokens_predicted"];
- const uint64_t t_tokens_generation = data["t_tokens_generation"];
+ const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
+ const uint64_t t_tokens_generation = data.at("t_tokens_generation");

- const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+ const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");

  // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
  json all_metrics_def = json {
  {"counter", {{
  {"name", "prompt_tokens_total"},
  {"help", "Number of prompt tokens processed."},
- {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+ {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
  }, {
  {"name", "prompt_seconds_total"},
  {"help", "Prompt process time"},
- {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+ {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
  }, {
  {"name", "tokens_predicted_total"},
  {"help", "Number of generation tokens processed."},
- {"value", (uint64_t) data["n_tokens_predicted_total"]}
+ {"value", (uint64_t) data.at("n_tokens_predicted_total")}
  }, {
  {"name", "tokens_predicted_seconds_total"},
  {"help", "Predict process time"},
- {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+ {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
  }}},
  {"gauge", {{
  {"name", "prompt_tokens_seconds"},
@@ -3252,15 +3262,15 @@ int main(int argc, char ** argv) {
  },{
  {"name", "kv_cache_tokens"},
  {"help", "KV-cache tokens."},
- {"value", (uint64_t) data["kv_cache_tokens_count"]}
+ {"value", (uint64_t) data.at("kv_cache_tokens_count")}
  },{
  {"name", "requests_processing"},
  {"help", "Number of request processing."},
- {"value", (uint64_t) data["processing"]}
+ {"value", (uint64_t) data.at("processing")}
  },{
  {"name", "requests_deferred"},
  {"help", "Number of request deferred."},
- {"value", (uint64_t) data["deferred"]}
+ {"value", (uint64_t) data.at("deferred")}
  }}}
  };

@@ -3271,8 +3281,8 @@ int main(int argc, char ** argv) {
  const auto & metrics_def = el.value();

  for (const auto & metric_def : metrics_def) {
- const std::string name = metric_def["name"];
- const std::string help = metric_def["help"];
+ const std::string name = metric_def.at("name");
+ const std::string help = metric_def.at("help");

  auto value = json_value(metric_def, "value", 0.);
  prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
@@ -3281,7 +3291,7 @@ int main(int argc, char ** argv) {
  }
  }

- const int64_t t_start = data["t_start"];
+ const int64_t t_start = data.at("t_start");
  res.set_header("Process-Start-Time-Unix", std::to_string(t_start));

  res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@@ -3290,7 +3300,7 @@ int main(int argc, char ** argv) {

  const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
  json request_data = json::parse(req.body);
- std::string filename = request_data["filename"];
+ std::string filename = request_data.at("filename");
  if (!validate_file_name(filename)) {
  res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
  return;
@@ -3320,7 +3330,7 @@ int main(int argc, char ** argv) {

  const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
  json request_data = json::parse(req.body);
- std::string filename = request_data["filename"];
+ std::string filename = request_data.at("filename");
  if (!validate_file_name(filename)) {
  res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
  return;
@@ -3639,7 +3649,8 @@ int main(int argc, char ** argv) {

  std::vector<llama_token> tokens;
  if (body.count("content") != 0) {
- tokens = ctx_server.tokenize(body["content"], false);
+ const bool add_special = json_value(body, "add_special", false);
+ tokens = ctx_server.tokenize(body.at("content"), add_special);
  }
  const json data = format_tokenizer_response(tokens);
  return res.set_content(data.dump(), "application/json; charset=utf-8");
@@ -3651,7 +3662,7 @@ int main(int argc, char ** argv) {

  std::string content;
  if (body.count("tokens") != 0) {
- const std::vector<llama_token> tokens = body["tokens"];
+ const std::vector<llama_token> tokens = body.at("tokens");
  content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
  }

@@ -3674,10 +3685,10 @@ int main(int argc, char ** argv) {
  json prompt;
  if (body.count("input") != 0) {
  is_openai = true;
- prompt = body["input"];
+ prompt = body.at("input");
  } else if (body.count("content") != 0) {
  // with "content", we only support single prompt
- prompt = std::vector<std::string>{body["content"]};
+ prompt = std::vector<std::string>{body.at("content")};
  } else {
  res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
  return;
@@ -3696,7 +3707,7 @@ int main(int argc, char ** argv) {
  if (!result.error) {
  if (result.data.count("results")) {
  // result for multi-task
- responses = result.data["results"];
+ responses = result.data.at("results");
  } else {
  // result for single task
  responses = std::vector<json>{result.data};
package/src/llama.cpp/examples/server/utils.hpp

@@ -3,6 +3,8 @@
  #include "llama.h"
  #include "common.h"

+ // Change JSON_ASSERT from assert() to GGML_ASSERT:
+ #define JSON_ASSERT GGML_ASSERT
  #include "json.hpp"

  #include <string>
@@ -49,18 +51,18 @@ extern bool server_log_json;
  #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
  #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

- static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
+ static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);

  template <typename T>
- static T json_value(const json &body, const std::string &key, const T &default_value) {
+ static T json_value(const json & body, const std::string & key, const T & default_value) {
  // Fallback null to default value
- if (body.contains(key) && !body.at(key).is_null()){
+ if (body.contains(key) && !body.at(key).is_null()) {
  try {
- return body.value(key, default_value);
- }
- catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
- std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
- server_log("WARN", __func__, __LINE__, message.c_str(), body);
+ return body.at(key);
+ } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
+ std::stringstream ss;
+ ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
+ LOG_WARNING(ss.str().c_str(), body);
  return default_value;
  }
  } else {
@@ -68,16 +70,16 @@ static T json_value(const json &body, const std::string &key, const T &default_v
  }
  }

- static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
+ static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
  std::stringstream ss_tid;
  ss_tid << std::this_thread::get_id();
- json log = nlohmann::ordered_json{
+ json log = json{
  {"tid", ss_tid.str()},
  {"timestamp", time(nullptr)},
  };

  if (server_log_json) {
- log.merge_patch( {
+ log.merge_patch({
  {"level", level},
  {"function", function},
  {"line", line},
@@ -98,7 +100,7 @@ static inline void server_log(const char *level, const char *function, int line,
  }
  std::stringstream ss;
  ss << buf << " |";
- for (const auto& el : log.items())
+ for (const auto & el : log.items())
  {
  const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
  ss << " " << el.key() << "=" << value;
@@ -373,11 +375,11 @@ static json oaicompat_completion_params_parse(
  llama_params["top_p"] = json_value(body, "top_p", 1.0);

  // Apply chat template to the list of messages
- llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
+ llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));

  // Handle "stop" field
- if (body.contains("stop") && body["stop"].is_string()) {
- llama_params["stop"] = json::array({body["stop"].get<std::string>()});
+ if (body.contains("stop") && body.at("stop").is_string()) {
+ llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
  } else {
  llama_params["stop"] = json_value(body, "stop", json::array());
  }
package/src/llama.cpp/ggml-backend.c

@@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {

  void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
  // reset state for the next run
- size_t hash_size = sched->hash_set.size;
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+ if (!sched->is_reset) {
+ size_t hash_size = sched->hash_set.size;
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+ memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+ memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

- sched->is_reset = true;
+ sched->is_reset = true;
+ }
  sched->is_alloc = false;
  }
