@fugood/llama.node 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +20 -6
  18. package/lib/index.js +41 -17
  19. package/lib/index.ts +50 -23
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +9 -9
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +37 -18
  24. package/src/LlamaContext.h +1 -0
  25. package/src/TokenizeWorker.cpp +16 -12
  26. package/src/TokenizeWorker.h +2 -2
  27. package/src/common.hpp +54 -50
  28. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  29. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  30. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  31. package/src/llama.cpp/common/arg.cpp +14 -13
  32. package/src/llama.cpp/common/common.cpp +4 -75
  33. package/src/llama.cpp/common/common.h +7 -12
  34. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  35. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  37. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  38. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  39. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  40. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  41. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  42. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  43. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  44. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  45. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  48. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  52. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  53. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  54. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  55. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  56. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  57. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  58. package/src/llama.cpp/include/llama.h +24 -124
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  61. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  62. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  63. package/src/llama.cpp/src/llama-context.cpp +60 -110
  64. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  65. package/src/llama.cpp/src/llama-graph.h +49 -7
  66. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  67. package/src/llama.cpp/src/llama-hparams.h +34 -5
  68. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  69. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  70. package/src/llama.cpp/src/llama-memory.h +3 -2
  71. package/src/llama.cpp/src/llama-model.cpp +273 -94
  72. package/src/llama.cpp/src/llama-model.h +4 -1
  73. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  74. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  75. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  76. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  77. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  78. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  79. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  82. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  83. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  84. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  85. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  86. package/src/llama.cpp/tools/run/run.cpp +2 -2
  87. package/src/llama.cpp/tools/server/server.cpp +158 -47
  88. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  89. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
--- a/package/src/llama.cpp/tools/mtmd/mtmd.h
+++ b/package/src/llama.cpp/tools/mtmd/mtmd.h
@@ -39,6 +39,7 @@
 # define MTMD_API
 #endif
 
+// deprecated marker, use mtmd_default_marker() instead
 #define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
 
 #ifdef __cplusplus
@@ -48,6 +49,7 @@ extern "C" {
 enum mtmd_input_chunk_type {
     MTMD_INPUT_CHUNK_TYPE_TEXT,
     MTMD_INPUT_CHUNK_TYPE_IMAGE,
+    MTMD_INPUT_CHUNK_TYPE_AUDIO,
 };
 
 // opaque types
@@ -79,9 +81,12 @@ struct mtmd_context_params {
     bool print_timings;
     int n_threads;
     enum ggml_log_level verbosity;
-    const char * image_marker;
+    const char * image_marker; // deprecated, use media_marker instead
+    const char * media_marker;
 };
 
+MTMD_API const char * mtmd_default_marker(void);
+
 MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
 
 // initialize the mtmd context
@@ -98,18 +103,28 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
 // whether the current model use M-RoPE for llama_decode
 MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
 
+// whether the current model supports vision input
+MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+
+// whether the current model supports audio input
+MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 
 // mtmd_bitmap
 //
-// length of data must be nx * ny * 3
-// the data is in RGBRGBRGB... format
-MTMD_API mtmd_bitmap *         mtmd_bitmap_init    (uint32_t nx,
-                                                    uint32_t ny,
-                                                    const unsigned char * data);
-MTMD_API uint32_t              mtmd_bitmap_get_nx  (const mtmd_bitmap * bitmap);
-MTMD_API uint32_t              mtmd_bitmap_get_ny  (const mtmd_bitmap * bitmap);
-MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap);
-MTMD_API void                  mtmd_bitmap_free    (mtmd_bitmap * bitmap);
+// if bitmap is image:
+//     length of data must be nx * ny * 3
+//     the data is in RGBRGBRGB... format
+// if bitmap is audio:
+//     length of data must be n_samples * sizeof(float)
+//     the data is in float format (PCM F32)
+MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
+MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
+MTMD_API uint32_t              mtmd_bitmap_get_nx         (const mtmd_bitmap * bitmap);
+MTMD_API uint32_t              mtmd_bitmap_get_ny         (const mtmd_bitmap * bitmap);
+MTMD_API const unsigned char * mtmd_bitmap_get_data       (const mtmd_bitmap * bitmap);
+MTMD_API size_t                mtmd_bitmap_get_n_bytes    (const mtmd_bitmap * bitmap);
+MTMD_API bool                  mtmd_bitmap_is_audio       (const mtmd_bitmap * bitmap);
+MTMD_API void                  mtmd_bitmap_free           (mtmd_bitmap * bitmap);
 // bitmap ID is optional, but useful for KV cache tracking
 // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
 MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
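For reference, a minimal sketch (not part of the diff) of the new audio entry points declared above; the sample buffer and the 16 kHz mono rate are illustrative assumptions:

```cpp
// Sketch only: building an audio bitmap from raw PCM F32 samples using the
// functions declared above. The buffer is a placeholder (one second of
// silence at an assumed 16 kHz mono sample rate).
#include <cstdio>
#include <vector>
#include "mtmd.h"

int main() {
    std::vector<float> samples(16000, 0.0f); // assumed: 16 kHz, mono, PCM F32

    mtmd_bitmap * bmp = mtmd_bitmap_init_from_audio(samples.size(), samples.data());
    if (bmp == nullptr) {
        return 1;
    }

    // for audio, n_bytes == n_samples * sizeof(float); is_audio tells the two kinds apart
    std::printf("is_audio=%d n_bytes=%zu\n",
                mtmd_bitmap_is_audio(bmp) ? 1 : 0,
                mtmd_bitmap_get_n_bytes(bmp));

    mtmd_bitmap_free(bmp);
    return 0;
}
```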
@@ -132,6 +147,11 @@ MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chu
 MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type        (const mtmd_input_chunk * chunk);
 MTMD_API const llama_token *        mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
 MTMD_API const mtmd_image_tokens *  mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
+MTMD_API size_t                     mtmd_input_chunk_get_n_tokens    (const mtmd_input_chunk * chunk);
+// returns nullptr for ID on text chunk
+MTMD_API const char *               mtmd_input_chunk_get_id          (const mtmd_input_chunk * chunk);
+// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+MTMD_API llama_pos                  mtmd_input_chunk_get_n_pos       (const mtmd_input_chunk * chunk);
 
 // in case you want to use custom logic to handle the chunk (i.e. KV cache management)
 // you can move the chunk ownership to your own code by copying it
@@ -144,27 +164,28 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
 //
 // the instance will be constructed via mtmd_tokenize()
 // it will be freed along with mtmd_input_chunk
-MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
 MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
 MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
-MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens);
+MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens); // TODO: deprecate
 // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
-MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens);
+MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens); // TODO: deprecate
 
-// tokenize an input text prompt and an image
-// the prompt must have the input image marker (default: "<__image__>") in it
-// the marker will be replaced with the image tokens
+// tokenize an input text prompt and a list of bitmaps (images/audio)
+// the prompt must have the input image marker (default: "<__media__>") in it
+// the default marker is defined by mtmd_default_marker()
+// the marker will be replaced with the image/audio chunk
 // for example:
-//   "here is an image: <__image__>\ndescribe it in detail."
+//   "here is an image: <__media__>\ndescribe it in detail."
 //   this will gives 3 chunks:
 //   1. "here is an image: <start_of_image>"
-//   2. (image tokens)
+//   2. (image/audio tokens)
 //   3. "<end_of_image>\ndescribe it in detail."
-// number of bitmaps must be equal to the number of image markers in the prompt
+// number of bitmaps must be equal to the number of markers in the prompt
 // this function is thread-safe (shared ctx)
 // return values:
 //   0 on success
-//   1 on number of images not matching the number of markers
+//   1 on number of bitmaps not matching the number of markers
 //   2 on image preprocessing error
 MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                                mtmd_input_chunks * output,
@@ -173,9 +194,14 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                                size_t n_bitmaps);
 
 // returns 0 on success
+// TODO: deprecate
 MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
                              const mtmd_image_tokens * image_tokens);
 
+// returns 0 on success
+MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
+                                   const mtmd_input_chunk * chunk);
+
 // get output embeddings from the last encode pass
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
@@ -189,12 +215,16 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 //
 
 // helper function to construct a mtmd_bitmap from a file
+// it calls mtmd_helper_bitmap_init_from_buf() internally
 // returns nullptr on failure
 // this function is thread-safe
 MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
 
 // helper function to construct a mtmd_bitmap from a buffer containing a file
-// the file content must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
+// supported formats:
+//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
+//     audio: formats supported by miniaudio: wav, mp3, flac
+// note: audio files will be auto-detected based on magic bytes
 // returns nullptr on failure
 // this function is thread-safe
 MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
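A rough sketch (not part of the diff) of the updated tokenize/encode flow, combining the helper loader with the new default media marker and mtmd_encode_chunk(); mtmd_input_text and mtmd_input_chunks_init/size/get are assumed from unchanged parts of this header, ctx is an already-initialized mtmd_context, and error handling is abbreviated:

```cpp
// Sketch only: load one media file (image or audio, auto-detected by the helper),
// tokenize a prompt that contains the default media marker, and encode the
// resulting non-text chunk with the new mtmd_encode_chunk().
#include <string>
#include <vector>
#include "mtmd.h"

static int32_t tokenize_and_encode(mtmd_context * ctx, const std::vector<unsigned char> & file_bytes) {
    mtmd_bitmap * bmp = mtmd_helper_bitmap_init_from_buf(file_bytes.data(), file_bytes.size());
    if (bmp == nullptr) {
        return -1; // unsupported or corrupt image/audio data
    }

    const std::string prompt = std::string("describe this: ") + mtmd_default_marker();

    mtmd_input_text text;        // assumed: struct from the unchanged part of this header
    text.text          = prompt.c_str();
    text.add_special   = true;
    text.parse_special = true;

    mtmd_input_chunks * chunks = mtmd_input_chunks_init(); // assumed: existing constructor
    const mtmd_bitmap * bitmaps[] = { bmp };

    int32_t ret = mtmd_tokenize(ctx, chunks, &text, bitmaps, /* n_bitmaps */ 1);
    if (ret == 0) {
        for (size_t i = 0; i < mtmd_input_chunks_size(chunks) && ret == 0; i++) {
            const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
            if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_TEXT) {
                ret = mtmd_encode_chunk(ctx, chunk); // image and audio chunks use the same call
            }
        }
    }

    mtmd_input_chunks_free(chunks);
    mtmd_bitmap_free(bmp);
    return ret;
}
```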
@@ -293,6 +323,7 @@ struct bitmap {
     uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
     uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
     const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
+    size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
     std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
    void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
 };
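A short sketch (not part of the diff) of the new n_bytes() accessor on the C++ convenience wrapper: it returns a size that is valid for both images (nx*ny*3) and audio (n_samples*sizeof(float)), so callers can hash the payload without caring about the media type. std::hash stands in for whatever content hash the caller prefers; the construction from a raw pointer mirrors the server usage further below:

```cpp
// Sketch only: tag a loaded bitmap with an ID derived from its raw payload,
// using the new n_bytes() so the same code covers image and audio data.
#include <cstdio>
#include <functional>
#include <string>
#include <string_view>
#include <vector>
#include "mtmd.h"

static bool tag_with_content_id(const std::vector<unsigned char> & file_bytes) {
    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file_bytes.data(), file_bytes.size()));
    if (!bmp.ptr) {
        return false;
    }

    // hash the full payload, whatever the media type is
    const std::string_view payload(reinterpret_cast<const char *>(bmp.data()), bmp.n_bytes());
    bmp.set_id(std::to_string(std::hash<std::string_view>{}(payload)).c_str());

    std::printf("%s bitmap, %zu bytes, id=%s\n",
                mtmd_bitmap_is_audio(bmp.ptr.get()) ? "audio" : "image",
                bmp.n_bytes(), bmp.id().c_str());
    return true;
}
```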
--- a/package/src/llama.cpp/tools/run/run.cpp
+++ b/package/src/llama.cpp/tools/run/run.cpp
@@ -936,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
 // Function to tokenize the prompt
 static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
                            std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
-    const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;
+    const bool is_first = llama_kv_self_seq_pos_max(llama_data.context.get(), 0) == 0;
 
     const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
     prompt_tokens.resize(n_prompt_tokens);
@@ -952,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
 // Check if we have enough space in the context to evaluate this batch
 static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
     const int n_ctx = llama_n_ctx(ctx.get());
-    const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
+    const int n_ctx_used = llama_kv_self_seq_pos_max(ctx.get(), 0);
     if (n_ctx_used + batch.n_tokens > n_ctx) {
         printf(LOG_COL_DEFAULT "\n");
         printe("context size exceeded\n");
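Both call sites switch from the removed llama_kv_self_used_cells() to llama_kv_self_seq_pos_max() on sequence 0. A minimal sketch (not part of the diff) of the same pattern as a standalone check, assuming a valid llama_context:

```cpp
// Sketch only: for this single-sequence CLI, the max position in sequence 0
// stands in for the removed "used cells" counter as the context-usage estimate.
#include "llama.h"

static bool batch_fits_in_context(llama_context * ctx, int32_t n_incoming_tokens) {
    const int n_ctx      = llama_n_ctx(ctx);
    const int n_ctx_used = llama_kv_self_seq_pos_max(ctx, /* seq_id */ 0);
    return n_ctx_used + n_incoming_tokens <= n_ctx;
}
```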
--- a/package/src/llama.cpp/tools/server/server.cpp
+++ b/package/src/llama.cpp/tools/server/server.cpp
@@ -951,7 +951,7 @@ struct server_task_result_cmpl_partial : server_task_result {
     }
 
     json to_json_oaicompat_chat() {
-        bool first = n_decoded == 0;
+        bool first = n_decoded == 1;
         std::time_t t = std::time(0);
         json choices;
 
@@ -962,15 +962,18 @@ struct server_task_result_cmpl_partial : server_task_result {
                             {"delta", json{{"role", "assistant"}}}}});
         } else {
             // We have to send this as two updates to conform to openai behavior
+            // initial_ret is the role message for stream=True
             json initial_ret = json{{"choices", json::array({json{
                         {"finish_reason", nullptr},
                         {"index", 0},
                         {"delta", json{
-                            {"role", "assistant"}
+                            {"role", "assistant"},
+                            {"content", ""}
                         }}}})},
                 {"created", t},
                 {"id", oaicompat_cmpl_id},
                 {"model", oaicompat_model},
+                {"system_fingerprint", build_info},
                 {"object", "chat.completion.chunk"}};
 
             json second_ret = json{
@@ -982,8 +985,19 @@ struct server_task_result_cmpl_partial : server_task_result {
                 {"created", t},
                 {"id", oaicompat_cmpl_id},
                 {"model", oaicompat_model},
+                {"system_fingerprint", build_info},
                 {"object", "chat.completion.chunk"}};
 
+            if (prob_output.probs.size() > 0) {
+                second_ret["choices"][0]["logprobs"] = json{
+                    {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
+                };
+            }
+
+            if (timings.prompt_n >= 0) {
+                second_ret.push_back({"timings", timings.to_json()});
+            }
+
             return std::vector<json>({initial_ret, second_ret});
         }
     } else {
@@ -1137,9 +1151,6 @@ struct server_task_result_metrics : server_task_result {
     int n_tasks_deferred;
     int64_t t_start;
 
-    int32_t kv_cache_tokens_count;
-    int32_t kv_cache_used_cells;
-
     // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
     uint64_t n_prompt_tokens_processed_total = 0;
     uint64_t t_prompt_processing_total = 0;
@@ -1179,9 +1190,6 @@
             { "n_decode_total",     n_decode_total },
             { "n_busy_slots_total", n_busy_slots_total },
 
-            { "kv_cache_tokens_count", kv_cache_tokens_count },
-            { "kv_cache_used_cells",   kv_cache_used_cells },
-
             { "slots", slots_data },
         };
     }
@@ -1883,6 +1891,7 @@ struct server_context {
     float slot_prompt_similarity = 0.0f;
 
     common_chat_templates_ptr chat_templates;
+    oaicompat_parser_options oai_parser_opt;
 
     ~server_context() {
         mtmd_free(mctx);
@@ -2004,6 +2013,23 @@ struct server_context {
             }
         }
 
+        if (!llama_kv_self_can_shift(ctx)) {
+            if (params_base.ctx_shift) {
+                params_base.ctx_shift = false;
+                SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled");
+            }
+
+            if (params_base.n_cache_reuse) {
+                params_base.n_cache_reuse = 0;
+                SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
+            }
+
+            if (!params_base.speculative.model.path.empty()) {
+                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
+                return false;
+            }
+        }
+
         return true;
     }
 
@@ -2061,6 +2087,15 @@ struct server_context {
         }
 
         metrics.init();
+
+        oai_parser_opt = {
+            /* use_jinja             */ params_base.use_jinja,
+            /* prefill_assistant     */ params_base.prefill_assistant,
+            /* reasoning_format      */ params_base.reasoning_format,
+            /* common_chat_templates */ chat_templates.get(),
+            /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
+            /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
+        };
     }
 
     server_slot * get_slot_by_id(int id) {
@@ -2754,9 +2789,6 @@ struct server_context {
         res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
         res->t_start = metrics.t_start;
 
-        res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
-        res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
-
         res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
         res->t_prompt_processing_total = metrics.t_prompt_processing_total;
         res->n_tokens_predicted_total = metrics.n_tokens_predicted_total;
@@ -3181,7 +3213,15 @@ struct server_context {
                 // if we don't cache the prompt, we have to remove the entire KV cache
                 llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                 slot.n_past = 0;
-                slot.cache_tokens.clear();
+                slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
+            }
+
+            if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
+                if (llama_kv_self_seq_pos_min(ctx, slot.id) > 0) {
+                    SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
+                            "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+                    slot.n_past = 0;
+                }
             }
         }
 
@@ -3311,6 +3351,37 @@ struct server_context {
             common_set_adapter_lora(ctx, slot_batched->lora);
         }
 
+        const bool do_encode = (params_base.embedding || params_base.reranking);
+
+        // pad the batch so that batch.n_tokens >= n_slots
+        // TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
+        if (do_encode) {
+            const int n_slots = slots.size();
+
+            if (batch.n_tokens < n_slots) {
+                std::set<llama_seq_id> seq_ids;
+                for (int j = 0; j < batch.n_tokens; ++j) {
+                    seq_ids.insert(batch.seq_id[j][0]);
+                }
+
+                // find unused sequence id
+                llama_seq_id seq_id = -1;
+                for (int i = 0; i < n_slots; ++i) {
+                    if (seq_ids.find(i) == seq_ids.end()) {
+                        seq_id = i;
+                    }
+                }
+
+                const int n_add = n_slots - batch.n_tokens;
+
+                SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id);
+
+                for (int j = 0; j < n_add; ++j) {
+                    common_batch_add(batch, 0, j, { seq_id }, false);
+                }
+            }
+        }
+
         // process the created batch of tokens
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
@@ -3327,7 +3398,7 @@
 
             int ret = 0;
 
-            if (params_base.embedding || params_base.reranking) {
+            if (do_encode) {
                 ret = llama_encode(ctx, batch_view);
             } else {
                 ret = llama_decode(ctx, batch_view);
@@ -3336,14 +3407,29 @@
             metrics.on_decoded(slots);
 
             if (ret != 0) {
-                if (n_batch == 1 || ret < 0) {
-                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
-                    for (auto & slot : slots) {
-                        slot.release();
-                        send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
+                {
+                    std::string err;
+
+                    if (n_batch == 1 && ret == 1) {
+                        err = "Context size has been exceeded.";
+                    }
+
+                    if (ret == -1) {
+                        err = "Invalid input batch.";
+                    }
+
+                    if (ret < -1) {
+                        err = "Compute error.";
+                    }
+
+                    if (!err.empty()) {
+                        SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret);
+                        for (auto & slot : slots) {
+                            slot.release();
+                            send_error(slot, err);
+                        }
+                        break;
                     }
-                    break; // break loop of n_batch
                 }
 
                 // retry with half the batch size to try to find a free slot in the KV cache
@@ -3677,6 +3763,7 @@ int main(int argc, char ** argv) {
         "/health",
         "/models",
         "/v1/models",
+        "/api/tags"
     };
 
     // If API key is not set, skip validation
@@ -3715,7 +3802,7 @@ int main(int argc, char ** argv) {
        if (req.path == "/" || tmp.back() == "html") {
            res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
            res.status = 503;
-        } else if (req.path == "/models" || req.path == "/v1/models") {
+        } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") {
            // allow the models endpoint to be accessed during loading
            return true;
        } else {
@@ -3858,14 +3945,6 @@ int main(int argc, char ** argv) {
            {"name", "predicted_tokens_seconds"},
            {"help", "Average generation throughput in tokens/s."},
            {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.}
-        },{
-            {"name", "kv_cache_usage_ratio"},
-            {"help", "KV-cache usage. 1 means 100 percent usage."},
-            {"value", 1. * res_metrics->kv_cache_used_cells / params.n_ctx}
-        },{
-            {"name", "kv_cache_tokens"},
-            {"help", "KV-cache tokens."},
-            {"value", (uint64_t) res_metrics->kv_cache_tokens_count}
        },{
            {"name", "requests_processing"},
            {"help", "Number of requests processing."},
@@ -4023,7 +4102,10 @@ int main(int argc, char ** argv) {
        { "default_generation_settings", ctx_server.default_generation_settings_for_props },
        { "total_slots",                 ctx_server.params_base.n_parallel },
        { "model_path",                  ctx_server.params_base.model.path },
-        { "modalities",                  json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future
+        { "modalities",                  json{
+            {"vision", ctx_server.oai_parser_opt.allow_image},
+            {"audio",  ctx_server.oai_parser_opt.allow_audio},
+        } },
        { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) },
        { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
        { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -4061,6 +4143,19 @@ int main(int argc, char ** argv) {
                { "llama.context_length", ctx_server.slots.back().n_ctx, },
            }
        },
+        {"modelfile", ""},
+        {"parameters", ""},
+        {"template", common_chat_templates_source(ctx_server.chat_templates.get())},
+        {"details", {
+            {"parent_model", ""},
+            {"format", "gguf"},
+            {"family", ""},
+            {"families", {""}},
+            {"parameter_size", ""},
+            {"quantization_level", ""}
+        }},
+        {"model_info", ""},
+        {"capabilities", {"completion"}}
    };

    res_ok(res, data);
@@ -4101,10 +4196,10 @@ int main(int argc, char ** argv) {
    for (auto & file : files) {
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
        if (!bmp.ptr) {
-            throw std::runtime_error("Failed to load image");
+            throw std::runtime_error("Failed to load image or audio file");
        }
        // calculate bitmap hash (for KV caching)
-        std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+        std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
        bmp.set_id(hash.c_str());
        bitmaps.entries.push_back(std::move(bmp));
    }
@@ -4336,7 +4431,7 @@ int main(int argc, char ** argv) {
            OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
    };

-    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        LOG_DBG("request: %s\n", req.body.c_str());
        if (ctx_server.params_base.embedding) {
            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
@@ -4345,13 +4440,9 @@ int main(int argc, char ** argv) {

        auto body = json::parse(req.body);
        std::vector<raw_buffer> files;
-        json data = oaicompat_completion_params_parse(
+        json data = oaicompat_chat_params_parse(
            body,
-            params.use_jinja,
-            params.prefill_assistant,
-            params.reasoning_format,
-            ctx_server.chat_templates.get(),
-            ctx_server.mctx,
+            ctx_server.oai_parser_opt,
            files);

        handle_completions_impl(
@@ -4364,16 +4455,12 @@ int main(int argc, char ** argv) {
    };

    // same with handle_chat_completions, but without inference part
-    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_apply_template = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
        auto body = json::parse(req.body);
        std::vector<raw_buffer> files; // dummy, unused
-        json data = oaicompat_completion_params_parse(
+        json data = oaicompat_chat_params_parse(
            body,
-            params.use_jinja,
-            params.prefill_assistant,
-            params.reasoning_format,
-            ctx_server.chat_templates.get(),
-            ctx_server.mctx,
+            ctx_server.oai_parser_opt,
            files);
        res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
    };
@@ -4386,6 +4473,28 @@ int main(int argc, char ** argv) {
        }

        json models = {
+            {"models", {
+                {
+                    {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"modified_at", ""},
+                    {"size", ""},
+                    {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
+                    {"type", "model"},
+                    {"description", ""},
+                    {"tags", {""}},
+                    {"capabilities", {"completion"}},
+                    {"parameters", ""},
+                    {"details", {
+                        {"parent_model", ""},
+                        {"format", "gguf"},
+                        {"family", ""},
+                        {"families", {""}},
+                        {"parameter_size", ""},
+                        {"quantization_level", ""}
+                    }}
+                }
+            }},
            {"object", "list"},
            {"data", {
                {
@@ -4395,7 +4504,7 @@ int main(int argc, char ** argv) {
                    {"owned_by", "llamacpp"},
                    {"meta", model_meta},
                },
-            }}
+            }}
        };

        res_ok(res, models);
@@ -4723,11 +4832,13 @@ int main(int argc, char ** argv) {
    svr->Post("/api/show",            handle_api_show);
    svr->Get ("/models",              handle_models); // public endpoint (no API key check)
    svr->Get ("/v1/models",           handle_models); // public endpoint (no API key check)
+    svr->Get ("/api/tags",            handle_models); // ollama specific endpoint. public endpoint (no API key check)
    svr->Post("/completion",          handle_completions); // legacy
    svr->Post("/completions",         handle_completions);
    svr->Post("/v1/completions",      handle_completions_oai);
    svr->Post("/chat/completions",    handle_chat_completions);
    svr->Post("/v1/chat/completions", handle_chat_completions);
+    svr->Post("/api/chat",            handle_chat_completions); // ollama specific endpoint
    svr->Post("/infill",              handle_infill);
    svr->Post("/embedding",           handle_embeddings); // legacy
    svr->Post("/embeddings",          handle_embeddings);
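A usage sketch (not part of the diff) of the new ollama-style routes, written with cpp-httplib, the same HTTP library the server is built on; host, port and the request body are illustrative, and /api/chat accepts the OpenAI-compatible chat schema because it is wired to the same handler as /v1/chat/completions:

```cpp
// Sketch only: query the ollama-style endpoints added in this release.
#include <iostream>
#include <string>
#include "httplib.h"

int main() {
    httplib::Client cli("localhost", 8080); // assumed host/port of a running llama-server

    // ollama-style model listing; registered without an API key check, like /models
    if (auto res = cli.Get("/api/tags")) {
        std::cout << res->status << "\n" << res->body << "\n";
    }

    // ollama-style chat endpoint, served by handle_chat_completions
    const std::string body = R"({"messages":[{"role":"user","content":"hello"}]})";
    if (auto res = cli.Post("/api/chat", body, "application/json")) {
        std::cout << res->status << "\n" << res->body << "\n";
    }

    return 0;
}
```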