@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
@@ -133,7 +133,8 @@ struct slot_params {
 
         auto grammar_triggers = json::array();
         for (const auto & trigger : sampling.grammar_triggers) {
-            grammar_triggers.push_back(trigger.to_json<json>());
+            server_grammar_trigger ct(std::move(trigger));
+            grammar_triggers.push_back(ct.to_json());
         }
 
         return json {
@@ -372,9 +373,9 @@ struct server_task {
         const auto grammar_triggers = data.find("grammar_triggers");
         if (grammar_triggers != data.end()) {
             for (const auto & t : *grammar_triggers) {
-                auto ct = common_grammar_trigger::from_json(t);
-                if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
-                    const auto & word = ct.value;
+                server_grammar_trigger ct(t);
+                if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+                    const auto & word = ct.value.value;
                     auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
                     if (ids.size() == 1) {
                         auto token = ids[0];
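The `server_grammar_trigger` type used in both hunks is added in `examples/server/utils.hpp` (the +51/-2 entry in the file list above), which this diff does not show. A hedged sketch of its likely shape, inferred only from the call sites here: it wraps a `common_grammar_trigger` and keeps the JSON round-trip on the server side now that the serialization helpers have moved out of the common library. The exact JSON keys are an assumption.

```cpp
// Hedged sketch only -- the authoritative definition lives in utils.hpp.
struct server_grammar_trigger {
    common_grammar_trigger value;

    server_grammar_trigger() = default;
    explicit server_grammar_trigger(const json & in) {
        value.type  = (common_grammar_trigger_type) in.at("type").get<int>();
        value.value = in.at("value").get<std::string>();
    }

    json to_json() const {
        return json {
            {"type",  (int) value.type},
            {"value", value.value},
        };
    }
};
```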
@@ -392,7 +393,7 @@ struct server_task {
                         params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
                     }
                 } else {
-                    params.sampling.grammar_triggers.push_back(ct);
+                    params.sampling.grammar_triggers.push_back(std::move(ct.value));
                 }
             }
         }
@@ -489,8 +490,12 @@ struct result_timings {
     double predicted_per_token_ms;
     double predicted_per_second;
 
+    // Optional speculative metrics - only included when > 0
+    int32_t draft_n = 0;
+    int32_t draft_n_accepted = 0;
+
     json to_json() const {
-        return {
+        json base = {
             {"prompt_n", prompt_n},
             {"prompt_ms", prompt_ms},
             {"prompt_per_token_ms", prompt_per_token_ms},
@@ -501,6 +506,13 @@ struct result_timings {
             {"predicted_per_token_ms", predicted_per_token_ms},
             {"predicted_per_second", predicted_per_second},
         };
+
+        if (draft_n > 0) {
+            base["draft_n"] = draft_n;
+            base["draft_n_accepted"] = draft_n_accepted;
+        }
+
+        return base;
     }
 };
 
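The `to_json()` change switches from returning a braced initializer directly to building a named object so optional keys can be attached afterwards. A minimal standalone sketch of the same pattern with nlohmann::json (the `timings_to_json` helper name is hypothetical; the field names are taken from the diff):

```cpp
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

json timings_to_json(int prompt_n, double prompt_ms, int draft_n, int draft_n_accepted) {
    json base = {
        {"prompt_n",  prompt_n},
        {"prompt_ms", prompt_ms},
    };
    // Speculative fields are attached only when a draft model actually ran,
    // so non-speculative responses keep their old shape.
    if (draft_n > 0) {
        base["draft_n"]          = draft_n;
        base["draft_n_accepted"] = draft_n_accepted;
    }
    return base;
}
```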
@@ -1299,6 +1311,10 @@ struct server_slot {
 
     std::function<void(int)> callback_on_release;
 
+    // Speculative decoding stats
+    int32_t n_draft_total = 0;    // Total draft tokens generated
+    int32_t n_draft_accepted = 0; // Draft tokens actually accepted
+
     void reset() {
         SLT_DBG(*this, "%s", "\n");
 
@@ -1315,6 +1331,10 @@ struct server_slot {
 
         generated_tokens.clear();
         generated_token_probs.clear();
+
+        // clear speculative decoding stats
+        n_draft_total = 0;
+        n_draft_accepted = 0;
     }
 
     bool is_non_causal() const {
@@ -1381,6 +1401,12 @@ struct server_slot {
         timings.predicted_per_token_ms = t_token_generation / n_decoded;
         timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
 
+        // Add speculative metrics
+        if (n_draft_total > 0) {
+            timings.draft_n = n_draft_total;
+            timings.draft_n_accepted = n_draft_accepted;
+        }
+
         return timings;
     }
 
@@ -1428,6 +1454,15 @@ struct server_slot {
                 t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
                 t_token_generation, n_decoded, t_gen, n_gen_second,
                 t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
+
+        if (n_draft_total > 0) {
+            const float draft_ratio = (float) n_draft_accepted / n_draft_total;
+            SLT_INF(*this,
+                    "\n"
+                    "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
+                    draft_ratio, n_draft_accepted, n_draft_total
+            );
+        }
     }
 
     json to_json() const {
@@ -1517,29 +1552,30 @@ struct server_queue {
     std::condition_variable condition_tasks;
 
     // callback functions
-    std::function<void(server_task)> callback_new_task;
-    std::function<void(void)> callback_update_slots;
+    std::function<void(server_task &&)> callback_new_task;
+    std::function<void(void)> callback_update_slots;
 
     // Add a new task to the end of the queue
-    int post(server_task task, bool front = false) {
+    int post(server_task && task, bool front = false) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         GGML_ASSERT(task.id != -1);
         // if this is cancel task make sure to clean up pending tasks
         if (task.type == SERVER_TASK_TYPE_CANCEL) {
             cleanup_pending_task(task.id_target);
         }
-        QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
+        const int task_id = task.id;
+        QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
         if (front) {
             queue_tasks.push_front(std::move(task));
         } else {
             queue_tasks.push_back(std::move(task));
         }
         condition_tasks.notify_one();
-        return task.id;
+        return task_id;
     }
 
     // multi-task version of post()
-    int post(std::vector<server_task> & tasks, bool front = false) {
+    int post(std::vector<server_task> && tasks, bool front = false) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         for (auto & task : tasks) {
             if (task.id == -1) {
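This hunk is the core of the move-semantics refactor: `post()` now takes the task by rvalue reference and, crucially, caches `task.id` before the `std::move` — the old `return task.id;` read a member of an already-moved-from object. A standalone sketch of the pattern with simplified types (not the server's own API):

```cpp
#include <deque>
#include <mutex>
#include <string>
#include <utility>

struct Task {
    int id = -1;
    std::string payload; // stands in for prompt tokens etc.; moving avoids a deep copy
};

struct TaskQueue {
    std::mutex mtx;
    std::deque<Task> tasks;

    // Rvalue-reference overload, as in server_queue::post() above.
    int post(Task && task) {
        std::lock_guard<std::mutex> lock(mtx);
        const int task_id = task.id;      // cache BEFORE the move ...
        tasks.push_back(std::move(task)); // ... because `task` is moved-from here
        return task_id;                   // `return task.id;` would rely on moved-from state
    }
};
```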
@@ -1561,7 +1597,7 @@ struct server_queue {
     }
 
     // Add a new task, but defer until one slot is available
-    void defer(server_task task) {
+    void defer(server_task && task) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         QUE_DBG("defer task, id = %d\n", task.id);
         queue_tasks_deferred.push_back(std::move(task));
@@ -1576,7 +1612,7 @@ struct server_queue {
     }
 
     // Register function to process a new task
-    void on_new_task(std::function<void(server_task)> callback) {
+    void on_new_task(std::function<void(server_task &&)> callback) {
         callback_new_task = std::move(callback);
     }
 
@@ -1625,7 +1661,7 @@ struct server_queue {
                 lock.unlock();
                 break;
             }
-            server_task task = queue_tasks.front();
+            server_task task = std::move(queue_tasks.front());
             queue_tasks.pop_front();
             lock.unlock();
 
@@ -1670,6 +1706,8 @@ private:
 };
 
 struct server_response {
+    bool running = true;
+
     // for keeping track of all tasks waiting for the result
     std::unordered_set<int> waiting_task_ids;
 
@@ -1724,6 +1762,10 @@ struct server_response {
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_results);
             condition_results.wait(lock, [&]{
+                if (!running) {
+                    SRV_DBG("%s : queue result stop\n", __func__);
+                    std::terminate(); // we cannot return here since the caller is HTTP code
+                }
                 return !queue_results.empty();
             });
 
@@ -1754,6 +1796,10 @@ struct server_response {
             }
 
             std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+            if (!running) {
+                SRV_DBG("%s : queue result stop\n", __func__);
+                std::terminate(); // we cannot return here since the caller is HTTP code
+            }
             if (cr_res == std::cv_status::timeout) {
                 return nullptr;
             }
@@ -1783,6 +1829,12 @@ struct server_response {
             }
         }
     }
+
+    // terminate the waiting loop
+    void terminate() {
+        running = false;
+        condition_results.notify_all();
+    }
 };
 
 struct server_context {
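The `running` flag plus `terminate()` gives shutdown a way to wake threads parked in `condition_variable::wait` that would otherwise block forever; the server escalates to `std::terminate()` because the waiter cannot unwind cleanly through the httplib handler stack. A generic sketch of the same wake-on-shutdown pattern with simplified types, returning "no result" instead of terminating:

```cpp
#include <condition_variable>
#include <deque>
#include <mutex>
#include <optional>
#include <utility>

template <typename T>
struct ResultQueue {
    std::mutex mtx;
    std::condition_variable cv;
    std::deque<T> results;
    bool running = true;

    std::optional<T> recv() {
        std::unique_lock<std::mutex> lock(mtx);
        // The stop flag is part of the predicate, so terminate() can satisfy the wait.
        cv.wait(lock, [&] { return !running || !results.empty(); });
        if (!running) {
            return std::nullopt;            // shutting down; nothing to deliver
        }
        T out = std::move(results.front());
        results.pop_front();
        return out;
    }

    void terminate() {
        { std::lock_guard<std::mutex> lock(mtx); running = false; }
        cv.notify_all();                    // wake every blocked recv()
    }
};
```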
@@ -1842,7 +1894,7 @@ struct server_context {
     }
 
     bool load_model(const common_params & params) {
-        SRV_INF("loading model '%s'\n", params.model.c_str());
+        SRV_INF("loading model '%s'\n", params.model.path.c_str());
 
         params_base = params;
 
@@ -1852,7 +1904,7 @@ struct server_context {
         ctx = llama_init.context.get();
 
         if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
             return false;
         }
 
@@ -1863,16 +1915,13 @@ struct server_context {
         add_bos_token = llama_vocab_get_add_bos(vocab);
         has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
 
-        if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
-            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+        if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
 
             auto params_dft = params_base;
 
             params_dft.devices = params_base.speculative.devices;
-            params_dft.hf_file = params_base.speculative.hf_file;
-            params_dft.hf_repo = params_base.speculative.hf_repo;
             params_dft.model = params_base.speculative.model;
-            params_dft.model_url = params_base.speculative.model_url;
             params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;
@@ -1886,12 +1935,12 @@ struct server_context {
             model_dft = llama_init_dft.model.get();
 
             if (model_dft == nullptr) {
-                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
                 return false;
             }
 
             if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
-                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
 
                 return false;
             }
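The `.path` accessors throughout these hunks follow from upstream llama.cpp turning `params.model` from a plain `std::string` into a small descriptor struct. A hedged sketch of the shape implied by the call sites (the authoritative definition is in `common/common.h`, which is not shown in this diff); grouping the fields lets `speculative.model` carry its own download source, which is why the separate `params_dft.hf_file`/`hf_repo`/`model_url` copies disappear above:

```cpp
#include <string>

// Assumed layout, inferred from usage in this diff -- field names beyond
// `path` and `hf_repo` (which appear above) are an assumption.
struct common_params_model {
    std::string path;    // local model file
    std::string url;     // direct download URL
    std::string hf_repo; // Hugging Face repo
    std::string hf_file; // file within the repo
};
```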
@@ -1956,7 +2005,7 @@ struct server_context {
 
             slot.reset();
 
-            slots.push_back(slot);
+            slots.push_back(std::move(slot));
         }
 
         default_generation_settings_for_props = slots[0].to_json();
@@ -2057,7 +2106,7 @@ struct server_context {
         return true;
     }
 
-    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
+    bool launch_slot_with_task(server_slot & slot, server_task && task) {
         slot.reset();
         slot.id_task = task.id;
         slot.index = task.index;
@@ -2065,10 +2114,10 @@ struct server_context {
         slot.params = std::move(task.params);
         slot.prompt_tokens = std::move(task.prompt_tokens);
 
-        if (!are_lora_equal(task.params.lora, slot.lora)) {
+        if (!are_lora_equal(slot.params.lora, slot.lora)) {
             // if lora is changed, we cannot reuse cached tokens
             slot.cache_tokens.clear();
-            slot.lora = task.params.lora;
+            slot.lora = slot.params.lora;
         }
 
         bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
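Note what the `are_lora_equal` hunk fixes: the function already did `slot.params = std::move(task.params)` a few lines earlier, so the old code compared against the moved-from `task.params.lora`. Reading the destination (`slot.params.lora`) instead is always correct. A standalone sketch of the hazard with simplified types (not the server's own):

```cpp
#include <string>
#include <utility>
#include <vector>

struct Params { std::vector<std::string> lora; };
struct Task   { Params params; };
struct Slot   { Params params; std::vector<std::string> lora; };

void launch(Slot & slot, Task && task) {
    slot.params = std::move(task.params);
    // BUG (old code): task.params.lora is moved-from here -- typically empty,
    // so the comparison silently used the wrong data:
    //     if (task.params.lora != slot.lora) { ... }
    if (slot.params.lora != slot.lora) { // fixed: compare the live copy
        slot.lora = slot.params.lora;
    }
}
```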
@@ -2499,10 +2548,10 @@ struct server_context {
             server_task task(SERVER_TASK_TYPE_CANCEL);
             task.id_target = id_task;
             queue_results.remove_waiting_task_id(id_task);
-            cancel_tasks.push_back(task);
+            cancel_tasks.push_back(std::move(task));
         }
         // push to beginning of the queue, so it has highest priority
-        queue_tasks.post(cancel_tasks, true);
+        queue_tasks.post(std::move(cancel_tasks), true);
     }
 
     // receive the results from task(s)
@@ -2589,7 +2638,7 @@ struct server_context {
     // Functions to process the task
     //
 
-    void process_single_task(server_task task) {
+    void process_single_task(server_task && task) {
         switch (task.type) {
             case SERVER_TASK_TYPE_COMPLETION:
             case SERVER_TASK_TYPE_INFILL:
@@ -2603,17 +2652,17 @@ struct server_context {
                     if (slot == nullptr) {
                         // if no slot is available, we defer this task for processing later
                         SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
-                        queue_tasks.defer(task);
+                        queue_tasks.defer(std::move(task));
                         break;
                     }
                     if (slot->is_processing()) {
                         // if requested slot is unavailable, we defer this task for processing later
                         SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
-                        queue_tasks.defer(task);
+                        queue_tasks.defer(std::move(task));
                         break;
                     }
 
-                    if (!launch_slot_with_task(*slot, task)) {
+                    if (!launch_slot_with_task(*slot, std::move(task))) {
                         SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
                         break;
                     }
@@ -2692,7 +2741,7 @@ struct server_context {
                     if (slot->is_processing()) {
                         // if requested slot is unavailable, we defer this task for processing later
                         SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
-                        queue_tasks.defer(task);
+                        queue_tasks.defer(std::move(task));
                         break;
                     }
 
@@ -2728,7 +2777,7 @@ struct server_context {
                     if (slot->is_processing()) {
                         // if requested slot is unavailable, we defer this task for processing later
                         SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
-                        queue_tasks.defer(task);
+                        queue_tasks.defer(std::move(task));
                         break;
                     }
 
@@ -2771,7 +2820,7 @@ struct server_context {
                     if (slot->is_processing()) {
                         // if requested slot is unavailable, we defer this task for processing later
                         SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
-                        queue_tasks.defer(task);
+                        queue_tasks.defer(std::move(task));
                         break;
                     }
 
@@ -2823,7 +2872,7 @@ struct server_context {
 
         server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
         task.id = queue_tasks.get_new_id();
-        queue_tasks.post(task);
+        queue_tasks.post(std::move(task));
     }
 
     // apply context-shift if needed
@@ -3290,6 +3339,9 @@ struct server_context {
 
                 llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
 
+                // keep track of total number of tokens generated in the draft
+                slot.n_draft_total += draft.size();
+
                 // ignore small drafts
                 if (slot.params.speculative.n_min > (int) draft.size()) {
                     SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
@@ -3315,6 +3367,9 @@ struct server_context {
                 slot.n_past += ids.size();
                 slot.n_decoded += ids.size();
 
+                // update how many tokens out of draft was accepted
+                slot.n_draft_accepted += ids.size() - 1;
+
                 slot.cache_tokens.push_back(id);
                 slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
 
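The `- 1` in `n_draft_accepted += ids.size() - 1` follows from the contract implied by the surrounding code: `ids` holds the accepted draft prefix plus exactly one token sampled by the target model itself. A hypothetical helper mirroring the two slot counters, with a worked example in the comments:

```cpp
#include <cstddef>

// Assumed contract (from the code above): ids = accepted draft prefix + one
// target-sampled token. E.g. draft = {d0,d1,d2,d3} -> total += 4; target
// accepts d0,d1 and samples t itself -> ids = {d0,d1,t} -> accepted += 2.
struct spec_stats {
    int n_draft_total    = 0;
    int n_draft_accepted = 0;

    void on_draft(std::size_t draft_size) { n_draft_total    += (int) draft_size; }
    void on_verify(std::size_t ids_size)  { n_draft_accepted += (int) ids_size - 1; }

    float acceptance_rate() const {
        return n_draft_total > 0 ? (float) n_draft_accepted / n_draft_total : 0.0f;
    }
};
```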
@@ -3579,14 +3634,17 @@ int main(int argc, char ** argv) {
         }
 
         // request slots data using task queue
-        server_task task(SERVER_TASK_TYPE_METRICS);
-        task.id = ctx_server.queue_tasks.get_new_id();
-        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task, true); // high-priority task
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_METRICS);
+            task.id = task_id;
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
+        }
 
         // get the result
-        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
-        ctx_server.queue_results.remove_waiting_task_id(task.id);
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
 
         if (result->is_error()) {
             res_error(res, result->to_json());
@@ -3615,16 +3673,17 @@ int main(int argc, char ** argv) {
         }
 
         // request slots data using task queue
-        server_task task(SERVER_TASK_TYPE_METRICS);
-        task.id = ctx_server.queue_tasks.get_new_id();
-        task.metrics_reset_bucket = true;
-
-        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task, true); // high-priority task
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_METRICS);
+            task.id = task_id;
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
+        }
 
         // get the result
-        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
-        ctx_server.queue_results.remove_waiting_task_id(task.id);
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
 
         if (result->is_error()) {
             res_error(res, result->to_json());
@@ -3721,17 +3780,20 @@ int main(int argc, char ** argv) {
         }
         std::string filepath = params.slot_save_path + filename;
 
-        server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
-        task.id = ctx_server.queue_tasks.get_new_id();
-        task.slot_action.slot_id = id_slot;
-        task.slot_action.filename = filename;
-        task.slot_action.filepath = filepath;
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
+            task.id = task_id;
+            task.slot_action.slot_id = id_slot;
+            task.slot_action.filename = filename;
+            task.slot_action.filepath = filepath;
 
-        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task);
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task));
+        }
 
-        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
-        ctx_server.queue_results.remove_waiting_task_id(task.id);
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
 
         if (result->is_error()) {
             res_error(res, result->to_json());
@@ -3750,17 +3812,20 @@ int main(int argc, char ** argv) {
         }
         std::string filepath = params.slot_save_path + filename;
 
-        server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
-        task.id = ctx_server.queue_tasks.get_new_id();
-        task.slot_action.slot_id = id_slot;
-        task.slot_action.filename = filename;
-        task.slot_action.filepath = filepath;
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
+            task.id = task_id;
+            task.slot_action.slot_id = id_slot;
+            task.slot_action.filename = filename;
+            task.slot_action.filepath = filepath;
 
-        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task);
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task));
+        }
 
-        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
-        ctx_server.queue_results.remove_waiting_task_id(task.id);
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
 
         if (result->is_error()) {
             res_error(res, result->to_json());
@@ -3772,15 +3837,18 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
-        server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
-        task.id = ctx_server.queue_tasks.get_new_id();
-        task.slot_action.slot_id = id_slot;
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
+            task.id = task_id;
+            task.slot_action.slot_id = id_slot;
 
-        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task);
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task));
+        }
 
-        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
-        ctx_server.queue_results.remove_waiting_task_id(task.id);
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
 
         if (result->is_error()) {
             res_error(res, result->to_json());
@@ -3825,7 +3893,7 @@ int main(int argc, char ** argv) {
         json data = {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params_base.n_parallel },
-            { "model_path", ctx_server.params_base.model },
+            { "model_path", ctx_server.params_base.model.path },
             { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
             { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
             { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -3853,6 +3921,21 @@ int main(int argc, char ** argv) {
         res_ok(res, {{ "success", true }});
     };
 
+    const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+        json data = {
+            {
+                "template", common_chat_templates_source(ctx_server.chat_templates.get()),
+            },
+            {
+                "model_info", {
+                    { "llama.context_length", ctx_server.slots.back().n_ctx, },
+                }
+            },
+        };
+
+        res_ok(res, data);
+    };
+
     // handle completion-like requests (completion, chat, infill)
     // we can optionally provide a custom format for partial results and final results
     const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
@@ -3869,9 +3952,10 @@ int main(int argc, char ** argv) {
         }
 
         auto completion_id = gen_chatcmplid();
-        std::vector<server_task> tasks;
-
+        std::unordered_set<int> task_ids;
         try {
+            std::vector<server_task> tasks;
+
             const auto & prompt = data.at("prompt");
             // TODO: this log can become very long, put it behind a flag or think about a more compact format
             //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
@@ -3886,9 +3970,9 @@ int main(int argc, char ** argv) {
 
                 task.prompt_tokens = std::move(tokenized_prompts[i]);
                 task.params = server_task::params_from_json_cmpl(
-                    ctx_server.ctx,
-                    ctx_server.params_base,
-                    data);
+                        ctx_server.ctx,
+                        ctx_server.params_base,
+                        data);
                 task.id_selected_slot = json_value(data, "id_slot", -1);
 
                 // OAI-compat
@@ -3896,18 +3980,18 @@ int main(int argc, char ** argv) {
                 task.params.oaicompat_cmpl_id = completion_id;
                 // oaicompat_model is already populated by params_from_json_cmpl
 
-                tasks.push_back(task);
+                tasks.push_back(std::move(task));
             }
+
+            task_ids = server_task::get_list_id(tasks);
+            ctx_server.queue_results.add_waiting_tasks(tasks);
+            ctx_server.queue_tasks.post(std::move(tasks));
         } catch (const std::exception & e) {
             res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
             return;
         }
 
-        ctx_server.queue_results.add_waiting_tasks(tasks);
-        ctx_server.queue_tasks.post(tasks);
-
         bool stream = json_value(data, "stream", false);
-        const auto task_ids = server_task::get_list_id(tasks);
 
         if (!stream) {
             ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
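The completion handler now derives the task ids before the batch is moved into the queue, and registers the waiters inside the `try` so a tokenization failure never leaves the vector half-submitted. A sketch of the submit pattern with hypothetical helper names (the real `server_task::get_list_id` collects ids the same way):

```cpp
#include <unordered_set>
#include <utility>
#include <vector>

struct Task { int id; };

std::unordered_set<int> submit(std::vector<Task> && tasks /*, queues ... */) {
    std::unordered_set<int> ids;
    for (const auto & t : tasks) {
        ids.insert(t.id);                        // capture BEFORE the batch is moved
    }
    // queue_results.add_waiting_tasks(tasks);   // tasks still readable here
    // queue_tasks.post(std::move(tasks));       // batch handed off; `tasks` is now moved-from
    return ids;                                  // the handler waits on these ids
}
```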
@@ -4091,7 +4175,7 @@ int main(int argc, char ** argv) {
             {"object", "list"},
             {"data", {
                 {
-                    {"id", params.model_alias.empty() ? params.model : params.model_alias},
+                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},
@@ -4199,6 +4283,7 @@ int main(int argc, char ** argv) {
         // create and queue the task
         json responses = json::array();
         bool error = false;
+        std::unordered_set<int> task_ids;
         {
             std::vector<server_task> tasks;
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4211,27 +4296,26 @@ int main(int argc, char ** argv) {
                 // OAI-compat
                 task.params.oaicompat = oaicompat;
 
-                tasks.push_back(task);
+                tasks.push_back(std::move(task));
             }
 
+            task_ids = server_task::get_list_id(tasks);
             ctx_server.queue_results.add_waiting_tasks(tasks);
-            ctx_server.queue_tasks.post(tasks);
+            ctx_server.queue_tasks.post(std::move(tasks));
+        }
 
-            // get the result
-            std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
+        // get the result
+        ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
+            for (auto & res : results) {
+                GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
+                responses.push_back(res->to_json());
+            }
+        }, [&](const json & error_data) {
+            res_error(res, error_data);
+            error = true;
+        }, req.is_connection_closed);
 
-            ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
-                for (auto & res : results) {
-                    GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
-                    responses.push_back(res->to_json());
-                }
-            }, [&](const json & error_data) {
-                res_error(res, error_data);
-                error = true;
-            }, req.is_connection_closed);
-
-            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
-        }
+        ctx_server.queue_results.remove_waiting_task_ids(task_ids);
 
         if (error) {
             return;
@@ -4298,6 +4382,7 @@ int main(int argc, char ** argv) {
         // create and queue the task
         json responses = json::array();
         bool error = false;
+        std::unordered_set<int> task_ids;
         {
             std::vector<server_task> tasks;
             std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
@@ -4307,26 +4392,24 @@ int main(int argc, char ** argv) {
                 task.id = ctx_server.queue_tasks.get_new_id();
                 task.index = i;
                 task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
-                tasks.push_back(task);
+                tasks.push_back(std::move(task));
             }
 
+            task_ids = server_task::get_list_id(tasks);
             ctx_server.queue_results.add_waiting_tasks(tasks);
-            ctx_server.queue_tasks.post(tasks);
-
-            // get the result
-            std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
-
-            ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
-                for (auto & res : results) {
-                    GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
-                    responses.push_back(res->to_json());
-                }
-            }, [&](const json & error_data) {
-                res_error(res, error_data);
-                error = true;
-            }, req.is_connection_closed);
+            ctx_server.queue_tasks.post(std::move(tasks));
         }
 
+        ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
+            for (auto & res : results) {
+                GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
+                responses.push_back(res->to_json());
+            }
+        }, [&](const json & error_data) {
+            res_error(res, error_data);
+            error = true;
+        }, req.is_connection_closed);
+
         if (error) {
             return;
         }
@@ -4362,14 +4445,19 @@ int main(int argc, char ** argv) {
             res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
             return;
         }
-        server_task task(SERVER_TASK_TYPE_SET_LORA);
-        task.id = ctx_server.queue_tasks.get_new_id();
-        task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
-        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task);
 
-        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
-        ctx_server.queue_results.remove_waiting_task_id(task.id);
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_SET_LORA);
+            task.id = task_id;
+            task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task));
+        }
+
+        // get the result
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
 
         if (result->is_error()) {
             res_error(res, result->to_json());
@@ -4417,6 +4505,7 @@ int main(int argc, char ** argv) {
     svr->Get ("/metrics", handle_metrics);
     svr->Get ("/props", handle_props);
     svr->Post("/props", handle_props_change);
+    svr->Post("/api/show", handle_api_show);
     svr->Get ("/models", handle_models); // public endpoint (no API key check)
     svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
     svr->Post("/completion", handle_completions); // legacy
@@ -4453,21 +4542,31 @@ int main(int argc, char ** argv) {
     svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
 
     // clean up function, to be called before exit
-    auto clean_up = [&svr]() {
+    auto clean_up = [&svr, &ctx_server]() {
         SRV_INF("%s: cleaning up before exit...\n", __func__);
         svr->stop();
+        ctx_server.queue_results.terminate();
         llama_backend_free();
     };
 
-    // bind HTTP listen port
     bool was_bound = false;
-    if (params.port == 0) {
-        int bound_port = svr->bind_to_any_port(params.hostname);
-        if ((was_bound = (bound_port >= 0))) {
-            params.port = bound_port;
-        }
+    if (string_ends_with(std::string(params.hostname), ".sock")) {
+        LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
+        svr->set_address_family(AF_UNIX);
+        // bind_to_port requires a second arg, any value other than 0 should
+        // simply get ignored
+        was_bound = svr->bind_to_port(params.hostname, 8080);
     } else {
-        was_bound = svr->bind_to_port(params.hostname, params.port);
+        LOG_INF("%s: binding port with default address family\n", __func__);
+        // bind HTTP listen port
+        if (params.port == 0) {
+            int bound_port = svr->bind_to_any_port(params.hostname);
+            if ((was_bound = (bound_port >= 0))) {
+                params.port = bound_port;
+            }
+        } else {
+            was_bound = svr->bind_to_port(params.hostname, params.port);
+        }
    }
 
    if (!was_bound) {
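The new branch lets the server listen on a UNIX domain socket whenever the hostname ends in `.sock`, using cpp-httplib's `set_address_family` support; the port passed to `bind_to_port` is a placeholder in that mode. A minimal standalone sketch of the same binding logic (the socket path is hypothetical):

```cpp
#include <sys/socket.h> // AF_UNIX
#include <string>

#include "httplib.h"

int main() {
    httplib::Server svr;
    svr.Get("/health", [](const httplib::Request &, httplib::Response & res) {
        res.set_content("ok", "text/plain");
    });

    const std::string host = "/tmp/llama-server.sock"; // hypothetical path
    if (host.size() >= 5 && host.compare(host.size() - 5, 5, ".sock") == 0) {
        svr.set_address_family(AF_UNIX); // treat `host` as a filesystem path
    }
    if (!svr.bind_to_port(host, 8080)) { // 8080 is ignored for AF_UNIX but must be supplied
        return 1;
    }
    return svr.listen_after_bind() ? 0 : 1;
}
```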
@@ -4487,7 +4586,7 @@ int main(int argc, char ** argv) {
 
     if (!ctx_server.load_model(params)) {
         clean_up();
-        // t.join(); // FIXME: see below
+        t.join();
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
         return 1;
     }
@@ -4502,8 +4601,8 @@ int main(int argc, char ** argv) {
             common_chat_templates_source(ctx_server.chat_templates.get()),
             common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
 
-    ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
-        ctx_server.process_single_task(task);
+    ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
+        ctx_server.process_single_task(std::move(task));
     });
 
     ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
@@ -4535,7 +4634,7 @@ int main(int argc, char ** argv) {
     ctx_server.queue_tasks.start_loop();
 
     clean_up();
-    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
+    t.join();
 
     return 0;
 }