@fugood/llama.node 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +243 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +14 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  136. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  138. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
  143. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  144. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  145. package/src/llama.cpp/include/llama.h +30 -11
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  147. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  149. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  150. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  151. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  152. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  153. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  154. package/src/llama.cpp/src/llama-arch.cpp +161 -17
  155. package/src/llama.cpp/src/llama-arch.h +16 -0
  156. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  157. package/src/llama.cpp/src/llama-chat.h +6 -2
  158. package/src/llama.cpp/src/llama-context.cpp +108 -92
  159. package/src/llama.cpp/src/llama-context.h +1 -2
  160. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  161. package/src/llama.cpp/src/llama-graph.h +26 -6
  162. package/src/llama.cpp/src/llama-hparams.h +13 -0
  163. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  164. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  165. package/src/llama.cpp/src/llama-memory.h +1 -1
  166. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  167. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  168. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  169. package/src/llama.cpp/src/llama-model.cpp +1544 -291
  170. package/src/llama.cpp/src/llama-model.h +13 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  172. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  173. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  174. package/src/llama.cpp/src/llama.cpp +1 -1
  175. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  176. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  177. package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
  178. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  179. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  180. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  181. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  182. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  183. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  184. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  185. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  186. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  188. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  189. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  190. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  191. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  192. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  193. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  203. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
@@ -133,7 +133,8 @@ struct slot_params {
133
133
 
134
134
  auto grammar_triggers = json::array();
135
135
  for (const auto & trigger : sampling.grammar_triggers) {
136
- grammar_triggers.push_back(trigger.to_json<json>());
136
+ server_grammar_trigger ct(std::move(trigger));
137
+ grammar_triggers.push_back(ct.to_json());
137
138
  }
138
139
 
139
140
  return json {
@@ -372,9 +373,9 @@ struct server_task {
372
373
  const auto grammar_triggers = data.find("grammar_triggers");
373
374
  if (grammar_triggers != data.end()) {
374
375
  for (const auto & t : *grammar_triggers) {
375
- auto ct = common_grammar_trigger::from_json(t);
376
- if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
377
- const auto & word = ct.value;
376
+ server_grammar_trigger ct(t);
377
+ if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
378
+ const auto & word = ct.value.value;
378
379
  auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
379
380
  if (ids.size() == 1) {
380
381
  auto token = ids[0];
@@ -392,7 +393,7 @@ struct server_task {
392
393
  params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
393
394
  }
394
395
  } else {
395
- params.sampling.grammar_triggers.push_back(ct);
396
+ params.sampling.grammar_triggers.push_back(std::move(ct.value));
396
397
  }
397
398
  }
398
399
  }
@@ -489,8 +490,12 @@ struct result_timings {
489
490
  double predicted_per_token_ms;
490
491
  double predicted_per_second;
491
492
 
493
+ // Optional speculative metrics - only included when > 0
494
+ int32_t draft_n = 0;
495
+ int32_t draft_n_accepted = 0;
496
+
492
497
  json to_json() const {
493
- return {
498
+ json base = {
494
499
  {"prompt_n", prompt_n},
495
500
  {"prompt_ms", prompt_ms},
496
501
  {"prompt_per_token_ms", prompt_per_token_ms},
@@ -501,6 +506,13 @@ struct result_timings {
501
506
  {"predicted_per_token_ms", predicted_per_token_ms},
502
507
  {"predicted_per_second", predicted_per_second},
503
508
  };
509
+
510
+ if (draft_n > 0) {
511
+ base["draft_n"] = draft_n;
512
+ base["draft_n_accepted"] = draft_n_accepted;
513
+ }
514
+
515
+ return base;
504
516
  }
505
517
  };
506
518
 
@@ -830,6 +842,11 @@ struct server_task_result_cmpl_final : server_task_result {
830
842
  ret.push_back({"timings", timings.to_json()});
831
843
  }
832
844
 
845
+ // extra fields for debugging purposes
846
+ if (verbose) {
847
+ ret["__verbose"] = to_json_non_oaicompat();
848
+ }
849
+
833
850
  return ret;
834
851
  }
835
852
  };
@@ -1294,6 +1311,10 @@ struct server_slot {
1294
1311
 
1295
1312
  std::function<void(int)> callback_on_release;
1296
1313
 
1314
+ // Speculative decoding stats
1315
+ int32_t n_draft_total = 0; // Total draft tokens generated
1316
+ int32_t n_draft_accepted = 0; // Draft tokens actually accepted
1317
+
1297
1318
  void reset() {
1298
1319
  SLT_DBG(*this, "%s", "\n");
1299
1320
 
@@ -1310,6 +1331,10 @@ struct server_slot {
1310
1331
 
1311
1332
  generated_tokens.clear();
1312
1333
  generated_token_probs.clear();
1334
+
1335
+ // clear speculative decoding stats
1336
+ n_draft_total = 0;
1337
+ n_draft_accepted = 0;
1313
1338
  }
1314
1339
 
1315
1340
  bool is_non_causal() const {
@@ -1376,6 +1401,12 @@ struct server_slot {
1376
1401
  timings.predicted_per_token_ms = t_token_generation / n_decoded;
1377
1402
  timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
1378
1403
 
1404
+ // Add speculative metrics
1405
+ if (n_draft_total > 0) {
1406
+ timings.draft_n = n_draft_total;
1407
+ timings.draft_n_accepted = n_draft_accepted;
1408
+ }
1409
+
1379
1410
  return timings;
1380
1411
  }
1381
1412
 
@@ -1423,6 +1454,15 @@ struct server_slot {
1423
1454
  t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
1424
1455
  t_token_generation, n_decoded, t_gen, n_gen_second,
1425
1456
  t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
1457
+
1458
+ if (n_draft_total > 0) {
1459
+ const float draft_ratio = (float) n_draft_accepted / n_draft_total;
1460
+ SLT_INF(*this,
1461
+ "\n"
1462
+ "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
1463
+ draft_ratio, n_draft_accepted, n_draft_total
1464
+ );
1465
+ }
1426
1466
  }
1427
1467
 
1428
1468
  json to_json() const {
@@ -1512,29 +1552,30 @@ struct server_queue {
1512
1552
  std::condition_variable condition_tasks;
1513
1553
 
1514
1554
  // callback functions
1515
- std::function<void(server_task)> callback_new_task;
1516
- std::function<void(void)> callback_update_slots;
1555
+ std::function<void(server_task &&)> callback_new_task;
1556
+ std::function<void(void)> callback_update_slots;
1517
1557
 
1518
1558
  // Add a new task to the end of the queue
1519
- int post(server_task task, bool front = false) {
1559
+ int post(server_task && task, bool front = false) {
1520
1560
  std::unique_lock<std::mutex> lock(mutex_tasks);
1521
1561
  GGML_ASSERT(task.id != -1);
1522
1562
  // if this is cancel task make sure to clean up pending tasks
1523
1563
  if (task.type == SERVER_TASK_TYPE_CANCEL) {
1524
1564
  cleanup_pending_task(task.id_target);
1525
1565
  }
1526
- QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
1566
+ const int task_id = task.id;
1567
+ QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
1527
1568
  if (front) {
1528
1569
  queue_tasks.push_front(std::move(task));
1529
1570
  } else {
1530
1571
  queue_tasks.push_back(std::move(task));
1531
1572
  }
1532
1573
  condition_tasks.notify_one();
1533
- return task.id;
1574
+ return task_id;
1534
1575
  }
1535
1576
 
1536
1577
  // multi-task version of post()
1537
- int post(std::vector<server_task> & tasks, bool front = false) {
1578
+ int post(std::vector<server_task> && tasks, bool front = false) {
1538
1579
  std::unique_lock<std::mutex> lock(mutex_tasks);
1539
1580
  for (auto & task : tasks) {
1540
1581
  if (task.id == -1) {
@@ -1556,7 +1597,7 @@ struct server_queue {
1556
1597
  }
1557
1598
 
1558
1599
  // Add a new task, but defer until one slot is available
1559
- void defer(server_task task) {
1600
+ void defer(server_task && task) {
1560
1601
  std::unique_lock<std::mutex> lock(mutex_tasks);
1561
1602
  QUE_DBG("defer task, id = %d\n", task.id);
1562
1603
  queue_tasks_deferred.push_back(std::move(task));
@@ -1571,7 +1612,7 @@ struct server_queue {
1571
1612
  }
1572
1613
 
1573
1614
  // Register function to process a new task
1574
- void on_new_task(std::function<void(server_task)> callback) {
1615
+ void on_new_task(std::function<void(server_task &&)> callback) {
1575
1616
  callback_new_task = std::move(callback);
1576
1617
  }
1577
1618
 
@@ -1620,7 +1661,7 @@ struct server_queue {
1620
1661
  lock.unlock();
1621
1662
  break;
1622
1663
  }
1623
- server_task task = queue_tasks.front();
1664
+ server_task task = std::move(queue_tasks.front());
1624
1665
  queue_tasks.pop_front();
1625
1666
  lock.unlock();
1626
1667
 
@@ -1665,6 +1706,8 @@ private:
1665
1706
  };
1666
1707
 
1667
1708
  struct server_response {
1709
+ bool running = true;
1710
+
1668
1711
  // for keeping track of all tasks waiting for the result
1669
1712
  std::unordered_set<int> waiting_task_ids;
1670
1713
 
@@ -1719,6 +1762,10 @@ struct server_response {
1719
1762
  while (true) {
1720
1763
  std::unique_lock<std::mutex> lock(mutex_results);
1721
1764
  condition_results.wait(lock, [&]{
1765
+ if (!running) {
1766
+ SRV_DBG("%s : queue result stop\n", __func__);
1767
+ std::terminate(); // we cannot return here since the caller is HTTP code
1768
+ }
1722
1769
  return !queue_results.empty();
1723
1770
  });
1724
1771
 
@@ -1749,6 +1796,10 @@ struct server_response {
1749
1796
  }
1750
1797
 
1751
1798
  std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
1799
+ if (!running) {
1800
+ SRV_DBG("%s : queue result stop\n", __func__);
1801
+ std::terminate(); // we cannot return here since the caller is HTTP code
1802
+ }
1752
1803
  if (cr_res == std::cv_status::timeout) {
1753
1804
  return nullptr;
1754
1805
  }
@@ -1778,6 +1829,12 @@ struct server_response {
1778
1829
  }
1779
1830
  }
1780
1831
  }
1832
+
1833
+ // terminate the waiting loop
1834
+ void terminate() {
1835
+ running = false;
1836
+ condition_results.notify_all();
1837
+ }
1781
1838
  };
1782
1839
 
1783
1840
  struct server_context {
@@ -1837,7 +1894,7 @@ struct server_context {
1837
1894
  }
1838
1895
 
1839
1896
  bool load_model(const common_params & params) {
1840
- SRV_INF("loading model '%s'\n", params.model.c_str());
1897
+ SRV_INF("loading model '%s'\n", params.model.path.c_str());
1841
1898
 
1842
1899
  params_base = params;
1843
1900
 
@@ -1847,7 +1904,7 @@ struct server_context {
1847
1904
  ctx = llama_init.context.get();
1848
1905
 
1849
1906
  if (model == nullptr) {
1850
- SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
1907
+ SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
1851
1908
  return false;
1852
1909
  }
1853
1910
 
@@ -1858,16 +1915,13 @@ struct server_context {
1858
1915
  add_bos_token = llama_vocab_get_add_bos(vocab);
1859
1916
  has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
1860
1917
 
1861
- if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
1862
- SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
1918
+ if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
1919
+ SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
1863
1920
 
1864
1921
  auto params_dft = params_base;
1865
1922
 
1866
1923
  params_dft.devices = params_base.speculative.devices;
1867
- params_dft.hf_file = params_base.speculative.hf_file;
1868
- params_dft.hf_repo = params_base.speculative.hf_repo;
1869
1924
  params_dft.model = params_base.speculative.model;
1870
- params_dft.model_url = params_base.speculative.model_url;
1871
1925
  params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
1872
1926
  params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
1873
1927
  params_dft.n_parallel = 1;
@@ -1881,12 +1935,12 @@ struct server_context {
1881
1935
  model_dft = llama_init_dft.model.get();
1882
1936
 
1883
1937
  if (model_dft == nullptr) {
1884
- SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
1938
+ SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
1885
1939
  return false;
1886
1940
  }
1887
1941
 
1888
1942
  if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
1889
- SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
1943
+ SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
1890
1944
 
1891
1945
  return false;
1892
1946
  }
@@ -1951,7 +2005,7 @@ struct server_context {
1951
2005
 
1952
2006
  slot.reset();
1953
2007
 
1954
- slots.push_back(slot);
2008
+ slots.push_back(std::move(slot));
1955
2009
  }
1956
2010
 
1957
2011
  default_generation_settings_for_props = slots[0].to_json();
@@ -2052,7 +2106,7 @@ struct server_context {
2052
2106
  return true;
2053
2107
  }
2054
2108
 
2055
- bool launch_slot_with_task(server_slot & slot, const server_task & task) {
2109
+ bool launch_slot_with_task(server_slot & slot, server_task && task) {
2056
2110
  slot.reset();
2057
2111
  slot.id_task = task.id;
2058
2112
  slot.index = task.index;
@@ -2060,10 +2114,10 @@ struct server_context {
2060
2114
  slot.params = std::move(task.params);
2061
2115
  slot.prompt_tokens = std::move(task.prompt_tokens);
2062
2116
 
2063
- if (!are_lora_equal(task.params.lora, slot.lora)) {
2117
+ if (!are_lora_equal(slot.params.lora, slot.lora)) {
2064
2118
  // if lora is changed, we cannot reuse cached tokens
2065
2119
  slot.cache_tokens.clear();
2066
- slot.lora = task.params.lora;
2120
+ slot.lora = slot.params.lora;
2067
2121
  }
2068
2122
 
2069
2123
  bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
@@ -2494,10 +2548,10 @@ struct server_context {
2494
2548
  server_task task(SERVER_TASK_TYPE_CANCEL);
2495
2549
  task.id_target = id_task;
2496
2550
  queue_results.remove_waiting_task_id(id_task);
2497
- cancel_tasks.push_back(task);
2551
+ cancel_tasks.push_back(std::move(task));
2498
2552
  }
2499
2553
  // push to beginning of the queue, so it has highest priority
2500
- queue_tasks.post(cancel_tasks, true);
2554
+ queue_tasks.post(std::move(cancel_tasks), true);
2501
2555
  }
2502
2556
 
2503
2557
  // receive the results from task(s)
@@ -2584,7 +2638,7 @@ struct server_context {
2584
2638
  // Functions to process the task
2585
2639
  //
2586
2640
 
2587
- void process_single_task(server_task task) {
2641
+ void process_single_task(server_task && task) {
2588
2642
  switch (task.type) {
2589
2643
  case SERVER_TASK_TYPE_COMPLETION:
2590
2644
  case SERVER_TASK_TYPE_INFILL:
@@ -2598,17 +2652,17 @@ struct server_context {
2598
2652
  if (slot == nullptr) {
2599
2653
  // if no slot is available, we defer this task for processing later
2600
2654
  SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
2601
- queue_tasks.defer(task);
2655
+ queue_tasks.defer(std::move(task));
2602
2656
  break;
2603
2657
  }
2604
2658
  if (slot->is_processing()) {
2605
2659
  // if requested slot is unavailable, we defer this task for processing later
2606
2660
  SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
2607
- queue_tasks.defer(task);
2661
+ queue_tasks.defer(std::move(task));
2608
2662
  break;
2609
2663
  }
2610
2664
 
2611
- if (!launch_slot_with_task(*slot, task)) {
2665
+ if (!launch_slot_with_task(*slot, std::move(task))) {
2612
2666
  SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
2613
2667
  break;
2614
2668
  }
@@ -2687,7 +2741,7 @@ struct server_context {
2687
2741
  if (slot->is_processing()) {
2688
2742
  // if requested slot is unavailable, we defer this task for processing later
2689
2743
  SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
2690
- queue_tasks.defer(task);
2744
+ queue_tasks.defer(std::move(task));
2691
2745
  break;
2692
2746
  }
2693
2747
 
@@ -2723,7 +2777,7 @@ struct server_context {
2723
2777
  if (slot->is_processing()) {
2724
2778
  // if requested slot is unavailable, we defer this task for processing later
2725
2779
  SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
2726
- queue_tasks.defer(task);
2780
+ queue_tasks.defer(std::move(task));
2727
2781
  break;
2728
2782
  }
2729
2783
 
@@ -2766,7 +2820,7 @@ struct server_context {
2766
2820
  if (slot->is_processing()) {
2767
2821
  // if requested slot is unavailable, we defer this task for processing later
2768
2822
  SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
2769
- queue_tasks.defer(task);
2823
+ queue_tasks.defer(std::move(task));
2770
2824
  break;
2771
2825
  }
2772
2826
 
@@ -2818,7 +2872,7 @@ struct server_context {
2818
2872
 
2819
2873
  server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
2820
2874
  task.id = queue_tasks.get_new_id();
2821
- queue_tasks.post(task);
2875
+ queue_tasks.post(std::move(task));
2822
2876
  }
2823
2877
 
2824
2878
  // apply context-shift if needed
@@ -3285,6 +3339,9 @@ struct server_context {
3285
3339
 
3286
3340
  llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
3287
3341
 
3342
+ // keep track of total number of tokens generated in the draft
3343
+ slot.n_draft_total += draft.size();
3344
+
3288
3345
  // ignore small drafts
3289
3346
  if (slot.params.speculative.n_min > (int) draft.size()) {
3290
3347
  SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
@@ -3310,6 +3367,9 @@ struct server_context {
3310
3367
  slot.n_past += ids.size();
3311
3368
  slot.n_decoded += ids.size();
3312
3369
 
3370
+ // update how many tokens out of draft was accepted
3371
+ slot.n_draft_accepted += ids.size() - 1;
3372
+
3313
3373
  slot.cache_tokens.push_back(id);
3314
3374
  slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
3315
3375
 
@@ -3574,14 +3634,17 @@ int main(int argc, char ** argv) {
3574
3634
  }
3575
3635
 
3576
3636
  // request slots data using task queue
3577
- server_task task(SERVER_TASK_TYPE_METRICS);
3578
- task.id = ctx_server.queue_tasks.get_new_id();
3579
- ctx_server.queue_results.add_waiting_task_id(task.id);
3580
- ctx_server.queue_tasks.post(task, true); // high-priority task
3637
+ int task_id = ctx_server.queue_tasks.get_new_id();
3638
+ {
3639
+ server_task task(SERVER_TASK_TYPE_METRICS);
3640
+ task.id = task_id;
3641
+ ctx_server.queue_results.add_waiting_task_id(task_id);
3642
+ ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
3643
+ }
3581
3644
 
3582
3645
  // get the result
3583
- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
3584
- ctx_server.queue_results.remove_waiting_task_id(task.id);
3646
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
3647
+ ctx_server.queue_results.remove_waiting_task_id(task_id);
3585
3648
 
3586
3649
  if (result->is_error()) {
3587
3650
  res_error(res, result->to_json());
@@ -3610,16 +3673,17 @@ int main(int argc, char ** argv) {
3610
3673
  }
3611
3674
 
3612
3675
  // request slots data using task queue
3613
- server_task task(SERVER_TASK_TYPE_METRICS);
3614
- task.id = ctx_server.queue_tasks.get_new_id();
3615
- task.metrics_reset_bucket = true;
3616
-
3617
- ctx_server.queue_results.add_waiting_task_id(task.id);
3618
- ctx_server.queue_tasks.post(task, true); // high-priority task
3676
+ int task_id = ctx_server.queue_tasks.get_new_id();
3677
+ {
3678
+ server_task task(SERVER_TASK_TYPE_METRICS);
3679
+ task.id = task_id;
3680
+ ctx_server.queue_results.add_waiting_task_id(task_id);
3681
+ ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
3682
+ }
3619
3683
 
3620
3684
  // get the result
3621
- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
3622
- ctx_server.queue_results.remove_waiting_task_id(task.id);
3685
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
3686
+ ctx_server.queue_results.remove_waiting_task_id(task_id);
3623
3687
 
3624
3688
  if (result->is_error()) {
3625
3689
  res_error(res, result->to_json());
@@ -3716,17 +3780,20 @@ int main(int argc, char ** argv) {
3716
3780
  }
3717
3781
  std::string filepath = params.slot_save_path + filename;
3718
3782
 
3719
- server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
3720
- task.id = ctx_server.queue_tasks.get_new_id();
3721
- task.slot_action.slot_id = id_slot;
3722
- task.slot_action.filename = filename;
3723
- task.slot_action.filepath = filepath;
3783
+ int task_id = ctx_server.queue_tasks.get_new_id();
3784
+ {
3785
+ server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
3786
+ task.id = task_id;
3787
+ task.slot_action.slot_id = id_slot;
3788
+ task.slot_action.filename = filename;
3789
+ task.slot_action.filepath = filepath;
3724
3790
 
3725
- ctx_server.queue_results.add_waiting_task_id(task.id);
3726
- ctx_server.queue_tasks.post(task);
3791
+ ctx_server.queue_results.add_waiting_task_id(task_id);
3792
+ ctx_server.queue_tasks.post(std::move(task));
3793
+ }
3727
3794
 
3728
- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
3729
- ctx_server.queue_results.remove_waiting_task_id(task.id);
3795
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
3796
+ ctx_server.queue_results.remove_waiting_task_id(task_id);
3730
3797
 
3731
3798
  if (result->is_error()) {
3732
3799
  res_error(res, result->to_json());
@@ -3745,17 +3812,20 @@ int main(int argc, char ** argv) {
3745
3812
  }
3746
3813
  std::string filepath = params.slot_save_path + filename;
3747
3814
 
3748
- server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
3749
- task.id = ctx_server.queue_tasks.get_new_id();
3750
- task.slot_action.slot_id = id_slot;
3751
- task.slot_action.filename = filename;
3752
- task.slot_action.filepath = filepath;
3815
+ int task_id = ctx_server.queue_tasks.get_new_id();
3816
+ {
3817
+ server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
3818
+ task.id = task_id;
3819
+ task.slot_action.slot_id = id_slot;
3820
+ task.slot_action.filename = filename;
3821
+ task.slot_action.filepath = filepath;
3753
3822
 
3754
- ctx_server.queue_results.add_waiting_task_id(task.id);
3755
- ctx_server.queue_tasks.post(task);
3823
+ ctx_server.queue_results.add_waiting_task_id(task_id);
3824
+ ctx_server.queue_tasks.post(std::move(task));
3825
+ }
3756
3826
 
3757
- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
3758
- ctx_server.queue_results.remove_waiting_task_id(task.id);
3827
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
3828
+ ctx_server.queue_results.remove_waiting_task_id(task_id);
3759
3829
 
3760
3830
  if (result->is_error()) {
3761
3831
  res_error(res, result->to_json());
@@ -3767,15 +3837,18 @@ int main(int argc, char ** argv) {
3767
3837
  };
3768
3838
 
3769
3839
  const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
3770
- server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
3771
- task.id = ctx_server.queue_tasks.get_new_id();
3772
- task.slot_action.slot_id = id_slot;
3840
+ int task_id = ctx_server.queue_tasks.get_new_id();
3841
+ {
3842
+ server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
3843
+ task.id = task_id;
3844
+ task.slot_action.slot_id = id_slot;
3773
3845
 
3774
- ctx_server.queue_results.add_waiting_task_id(task.id);
3775
- ctx_server.queue_tasks.post(task);
3846
+ ctx_server.queue_results.add_waiting_task_id(task_id);
3847
+ ctx_server.queue_tasks.post(std::move(task));
3848
+ }
3776
3849
 
3777
- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
3778
- ctx_server.queue_results.remove_waiting_task_id(task.id);
3850
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
3851
+ ctx_server.queue_results.remove_waiting_task_id(task_id);
3779
3852
 
3780
3853
  if (result->is_error()) {
3781
3854
  res_error(res, result->to_json());
@@ -3820,7 +3893,7 @@ int main(int argc, char ** argv) {
3820
3893
  json data = {
3821
3894
  { "default_generation_settings", ctx_server.default_generation_settings_for_props },
3822
3895
  { "total_slots", ctx_server.params_base.n_parallel },
3823
- { "model_path", ctx_server.params_base.model },
3896
+ { "model_path", ctx_server.params_base.model.path },
3824
3897
  { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
3825
3898
  { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
3826
3899
  { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -3848,6 +3921,21 @@ int main(int argc, char ** argv) {
3848
3921
  res_ok(res, {{ "success", true }});
3849
3922
  };
3850
3923
 
3924
+ const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
3925
+ json data = {
3926
+ {
3927
+ "template", common_chat_templates_source(ctx_server.chat_templates.get()),
3928
+ },
3929
+ {
3930
+ "model_info", {
3931
+ { "llama.context_length", ctx_server.slots.back().n_ctx, },
3932
+ }
3933
+ },
3934
+ };
3935
+
3936
+ res_ok(res, data);
3937
+ };
3938
+
3851
3939
  // handle completion-like requests (completion, chat, infill)
3852
3940
  // we can optionally provide a custom format for partial results and final results
3853
3941
  const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
@@ -3864,9 +3952,10 @@ int main(int argc, char ** argv) {
3864
3952
  }
3865
3953
 
3866
3954
  auto completion_id = gen_chatcmplid();
3867
- std::vector<server_task> tasks;
3868
-
3955
+ std::unordered_set<int> task_ids;
3869
3956
  try {
3957
+ std::vector<server_task> tasks;
3958
+
3870
3959
  const auto & prompt = data.at("prompt");
3871
3960
  // TODO: this log can become very long, put it behind a flag or think about a more compact format
3872
3961
  //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
@@ -3881,9 +3970,9 @@ int main(int argc, char ** argv) {
3881
3970
 
3882
3971
  task.prompt_tokens = std::move(tokenized_prompts[i]);
3883
3972
  task.params = server_task::params_from_json_cmpl(
3884
- ctx_server.ctx,
3885
- ctx_server.params_base,
3886
- data);
3973
+ ctx_server.ctx,
3974
+ ctx_server.params_base,
3975
+ data);
3887
3976
  task.id_selected_slot = json_value(data, "id_slot", -1);
3888
3977
 
3889
3978
  // OAI-compat
@@ -3891,18 +3980,18 @@ int main(int argc, char ** argv) {
3891
3980
  task.params.oaicompat_cmpl_id = completion_id;
3892
3981
  // oaicompat_model is already populated by params_from_json_cmpl
3893
3982
 
3894
- tasks.push_back(task);
3983
+ tasks.push_back(std::move(task));
3895
3984
  }
3985
+
3986
+ task_ids = server_task::get_list_id(tasks);
3987
+ ctx_server.queue_results.add_waiting_tasks(tasks);
3988
+ ctx_server.queue_tasks.post(std::move(tasks));
3896
3989
  } catch (const std::exception & e) {
3897
3990
  res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
3898
3991
  return;
3899
3992
  }
3900
3993
 
3901
- ctx_server.queue_results.add_waiting_tasks(tasks);
3902
- ctx_server.queue_tasks.post(tasks);
3903
-
3904
3994
  bool stream = json_value(data, "stream", false);
3905
- const auto task_ids = server_task::get_list_id(tasks);
3906
3995
 
3907
3996
  if (!stream) {
3908
3997
  ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
@@ -4086,7 +4175,7 @@ int main(int argc, char ** argv) {
4086
4175
  {"object", "list"},
4087
4176
  {"data", {
4088
4177
  {
4089
- {"id", params.model_alias.empty() ? params.model : params.model_alias},
4178
+ {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
4090
4179
  {"object", "model"},
4091
4180
  {"created", std::time(0)},
4092
4181
  {"owned_by", "llamacpp"},
@@ -4194,6 +4283,7 @@ int main(int argc, char ** argv) {
4194
4283
  // create and queue the task
4195
4284
  json responses = json::array();
4196
4285
  bool error = false;
4286
+ std::unordered_set<int> task_ids;
4197
4287
  {
4198
4288
  std::vector<server_task> tasks;
4199
4289
  for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4206,27 +4296,26 @@ int main(int argc, char ** argv) {
4206
4296
  // OAI-compat
4207
4297
  task.params.oaicompat = oaicompat;
4208
4298
 
4209
- tasks.push_back(task);
4299
+ tasks.push_back(std::move(task));
4210
4300
  }
4211
4301
 
4302
+ task_ids = server_task::get_list_id(tasks);
4212
4303
  ctx_server.queue_results.add_waiting_tasks(tasks);
4213
- ctx_server.queue_tasks.post(tasks);
4214
-
4215
- // get the result
4216
- std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
4304
+ ctx_server.queue_tasks.post(std::move(tasks));
4305
+ }
4217
4306
 
4218
- ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
4219
- for (auto & res : results) {
4220
- GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
4221
- responses.push_back(res->to_json());
4222
- }
4223
- }, [&](const json & error_data) {
4224
- res_error(res, error_data);
4225
- error = true;
4226
- }, req.is_connection_closed);
4307
+ // get the result
4308
+ ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
4309
+ for (auto & res : results) {
4310
+ GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
4311
+ responses.push_back(res->to_json());
4312
+ }
4313
+ }, [&](const json & error_data) {
4314
+ res_error(res, error_data);
4315
+ error = true;
4316
+ }, req.is_connection_closed);
4227
4317
 
4228
- ctx_server.queue_results.remove_waiting_task_ids(task_ids);
4229
- }
4318
+ ctx_server.queue_results.remove_waiting_task_ids(task_ids);
4230
4319
 
4231
4320
  if (error) {
4232
4321
  return;
@@ -4293,6 +4382,7 @@ int main(int argc, char ** argv) {
4293
4382
  // create and queue the task
4294
4383
  json responses = json::array();
4295
4384
  bool error = false;
4385
+ std::unordered_set<int> task_ids;
4296
4386
  {
4297
4387
  std::vector<server_task> tasks;
4298
4388
  std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
@@ -4302,26 +4392,24 @@ int main(int argc, char ** argv) {
4302
4392
  task.id = ctx_server.queue_tasks.get_new_id();
4303
4393
  task.index = i;
4304
4394
  task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
4305
- tasks.push_back(task);
4395
+ tasks.push_back(std::move(task));
4306
4396
  }
4307
4397
 
4398
+ task_ids = server_task::get_list_id(tasks);
4308
4399
  ctx_server.queue_results.add_waiting_tasks(tasks);
4309
- ctx_server.queue_tasks.post(tasks);
4310
-
4311
- // get the result
4312
- std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
4313
-
4314
- ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
4315
- for (auto & res : results) {
4316
- GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
4317
- responses.push_back(res->to_json());
4318
- }
4319
- }, [&](const json & error_data) {
4320
- res_error(res, error_data);
4321
- error = true;
4322
- }, req.is_connection_closed);
4400
+ ctx_server.queue_tasks.post(std::move(tasks));
4323
4401
  }
4324
4402
 
4403
+ ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
4404
+ for (auto & res : results) {
4405
+ GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
4406
+ responses.push_back(res->to_json());
4407
+ }
4408
+ }, [&](const json & error_data) {
4409
+ res_error(res, error_data);
4410
+ error = true;
4411
+ }, req.is_connection_closed);
4412
+
4325
4413
  if (error) {
4326
4414
  return;
4327
4415
  }
@@ -4357,14 +4445,19 @@ int main(int argc, char ** argv) {
4357
4445
  res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
4358
4446
  return;
4359
4447
  }
4360
- server_task task(SERVER_TASK_TYPE_SET_LORA);
4361
- task.id = ctx_server.queue_tasks.get_new_id();
4362
- task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
4363
- ctx_server.queue_results.add_waiting_task_id(task.id);
4364
- ctx_server.queue_tasks.post(task);
4365
4448
 
4366
- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
4367
- ctx_server.queue_results.remove_waiting_task_id(task.id);
4449
+ int task_id = ctx_server.queue_tasks.get_new_id();
4450
+ {
4451
+ server_task task(SERVER_TASK_TYPE_SET_LORA);
4452
+ task.id = task_id;
4453
+ task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
4454
+ ctx_server.queue_results.add_waiting_task_id(task_id);
4455
+ ctx_server.queue_tasks.post(std::move(task));
4456
+ }
4457
+
4458
+ // get the result
4459
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
4460
+ ctx_server.queue_results.remove_waiting_task_id(task_id);
4368
4461
 
4369
4462
  if (result->is_error()) {
4370
4463
  res_error(res, result->to_json());
@@ -4412,6 +4505,7 @@ int main(int argc, char ** argv) {
4412
4505
  svr->Get ("/metrics", handle_metrics);
4413
4506
  svr->Get ("/props", handle_props);
4414
4507
  svr->Post("/props", handle_props_change);
4508
+ svr->Post("/api/show", handle_api_show);
4415
4509
  svr->Get ("/models", handle_models); // public endpoint (no API key check)
4416
4510
  svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
4417
4511
  svr->Post("/completion", handle_completions); // legacy
@@ -4448,21 +4542,31 @@ int main(int argc, char ** argv) {
4448
4542
  svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
4449
4543
 
4450
4544
  // clean up function, to be called before exit
4451
- auto clean_up = [&svr]() {
4545
+ auto clean_up = [&svr, &ctx_server]() {
4452
4546
  SRV_INF("%s: cleaning up before exit...\n", __func__);
4453
4547
  svr->stop();
4548
+ ctx_server.queue_results.terminate();
4454
4549
  llama_backend_free();
4455
4550
  };
4456
4551
 
4457
- // bind HTTP listen port
4458
4552
  bool was_bound = false;
4459
- if (params.port == 0) {
4460
- int bound_port = svr->bind_to_any_port(params.hostname);
4461
- if ((was_bound = (bound_port >= 0))) {
4462
- params.port = bound_port;
4463
- }
4553
+ if (string_ends_with(std::string(params.hostname), ".sock")) {
4554
+ LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
4555
+ svr->set_address_family(AF_UNIX);
4556
+ // bind_to_port requires a second arg, any value other than 0 should
4557
+ // simply get ignored
4558
+ was_bound = svr->bind_to_port(params.hostname, 8080);
4464
4559
  } else {
4465
- was_bound = svr->bind_to_port(params.hostname, params.port);
4560
+ LOG_INF("%s: binding port with default address family\n", __func__);
4561
+ // bind HTTP listen port
4562
+ if (params.port == 0) {
4563
+ int bound_port = svr->bind_to_any_port(params.hostname);
4564
+ if ((was_bound = (bound_port >= 0))) {
4565
+ params.port = bound_port;
4566
+ }
4567
+ } else {
4568
+ was_bound = svr->bind_to_port(params.hostname, params.port);
4569
+ }
4466
4570
  }
4467
4571
 
4468
4572
  if (!was_bound) {
@@ -4482,7 +4586,7 @@ int main(int argc, char ** argv) {
4482
4586
 
4483
4587
  if (!ctx_server.load_model(params)) {
4484
4588
  clean_up();
4485
- // t.join(); // FIXME: see below
4589
+ t.join();
4486
4590
  LOG_ERR("%s: exiting due to model loading error\n", __func__);
4487
4591
  return 1;
4488
4592
  }
@@ -4497,8 +4601,8 @@ int main(int argc, char ** argv) {
4497
4601
  common_chat_templates_source(ctx_server.chat_templates.get()),
4498
4602
  common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
4499
4603
 
4500
- ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
4501
- ctx_server.process_single_task(task);
4604
+ ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
4605
+ ctx_server.process_single_task(std::move(task));
4502
4606
  });
4503
4607
 
4504
4608
  ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
@@ -4530,7 +4634,7 @@ int main(int argc, char ** argv) {
4530
4634
  ctx_server.queue_tasks.start_loop();
4531
4635
 
4532
4636
  clean_up();
4533
- // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
4637
+ t.join();
4534
4638
 
4535
4639
  return 0;
4536
4640
  }