@fugood/llama.node 0.4.7 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +4 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.ts +66 -6
- package/lib/index.js +59 -17
- package/lib/index.ts +74 -23
- package/package.json +1 -1
- package/src/DecodeAudioTokenWorker.cpp +40 -0
- package/src/DecodeAudioTokenWorker.h +22 -0
- package/src/EmbeddingWorker.cpp +7 -5
- package/src/LlamaCompletionWorker.cpp +68 -54
- package/src/LlamaCompletionWorker.h +7 -8
- package/src/LlamaContext.cpp +551 -235
- package/src/LlamaContext.h +26 -4
- package/src/LoadSessionWorker.cpp +4 -2
- package/src/SaveSessionWorker.cpp +10 -6
- package/src/TokenizeWorker.cpp +23 -14
- package/src/TokenizeWorker.h +2 -2
- package/src/addons.cc +8 -11
- package/src/common.hpp +129 -126
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
- package/src/tts_utils.cpp +342 -0
- package/src/tts_utils.h +62 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
package/src/llama.cpp/tools/mtmd/mtmd.h

@@ -39,6 +39,7 @@
 #    define MTMD_API
 #endif
 
+// deprecated marker, use mtmd_default_marker() instead
 #define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
 
 #ifdef __cplusplus

@@ -48,6 +49,7 @@ extern "C" {
 enum mtmd_input_chunk_type {
     MTMD_INPUT_CHUNK_TYPE_TEXT,
     MTMD_INPUT_CHUNK_TYPE_IMAGE,
+    MTMD_INPUT_CHUNK_TYPE_AUDIO,
 };
 
 // opaque types

@@ -79,9 +81,12 @@ struct mtmd_context_params {
     bool print_timings;
     int n_threads;
     enum ggml_log_level verbosity;
-    const char * image_marker;
+    const char * image_marker; // deprecated, use media_marker instead
+    const char * media_marker;
 };
 
+MTMD_API const char * mtmd_default_marker(void);
+
 MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
 
 // initialize the mtmd context

@@ -98,18 +103,28 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
 // whether the current model use M-RoPE for llama_decode
 MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
 
+// whether the current model supports vision input
+MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+
+// whether the current model supports audio input
+MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 
 // mtmd_bitmap
 //
-//
-//
-
-
-
-MTMD_API uint32_t
-MTMD_API
-MTMD_API
+// if bitmap is image:
+//     length of data must be nx * ny * 3
+//     the data is in RGBRGBRGB... format
+// if bitmap is audio:
+//     length of data must be n_samples * sizeof(float)
+//     the data is in float format (PCM F32)
+MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
+MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
+MTMD_API uint32_t              mtmd_bitmap_get_nx         (const mtmd_bitmap * bitmap);
+MTMD_API uint32_t              mtmd_bitmap_get_ny         (const mtmd_bitmap * bitmap);
+MTMD_API const unsigned char * mtmd_bitmap_get_data       (const mtmd_bitmap * bitmap);
+MTMD_API size_t                mtmd_bitmap_get_n_bytes    (const mtmd_bitmap * bitmap);
+MTMD_API bool                  mtmd_bitmap_is_audio       (const mtmd_bitmap * bitmap);
+MTMD_API void                  mtmd_bitmap_free           (mtmd_bitmap * bitmap);
 // bitmap ID is optional, but useful for KV cache tracking
 // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
 MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);

@@ -132,6 +147,11 @@ MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chu
 MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type        (const mtmd_input_chunk * chunk);
 MTMD_API const llama_token *        mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
 MTMD_API const mtmd_image_tokens *  mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
+MTMD_API size_t                     mtmd_input_chunk_get_n_tokens    (const mtmd_input_chunk * chunk);
+// returns nullptr for ID on text chunk
+MTMD_API const char *               mtmd_input_chunk_get_id          (const mtmd_input_chunk * chunk);
+// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+MTMD_API llama_pos                  mtmd_input_chunk_get_n_pos       (const mtmd_input_chunk * chunk);
 
 // in case you want to use custom logic to handle the chunk (i.e. KV cache management)
 // you can move the chunk ownership to your own code by copying it

@@ -144,27 +164,28 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
 //
 // the instance will be constructed via mtmd_tokenize()
 // it will be freed along with mtmd_input_chunk
-MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
 MTMD_API size_t mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
 MTMD_API size_t mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
-MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens);
+MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
 // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
-MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens);
+MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
 
-// tokenize an input text prompt and
-// the prompt must have the input image marker (default: "<
-// the marker
+// tokenize an input text prompt and a list of bitmaps (images/audio)
+// the prompt must have the input image marker (default: "<__media__>") in it
+// the default marker is defined by mtmd_default_marker()
+// the marker will be replaced with the image/audio chunk
 // for example:
-// "here is an image: <
+// "here is an image: <__media__>\ndescribe it in detail."
 // this will gives 3 chunks:
 // 1. "here is an image: <start_of_image>"
-// 2. (image tokens)
+// 2. (image/audio tokens)
 // 3. "<end_of_image>\ndescribe it in detail."
-// number of bitmaps must be equal to the number of
+// number of bitmaps must be equal to the number of markers in the prompt
 // this function is thread-safe (shared ctx)
 // return values:
 // 0 on success
-// 1 on number of
+// 1 on number of bitmaps not matching the number of markers
 // 2 on image preprocessing error
 MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                                mtmd_input_chunks * output,

@@ -173,9 +194,14 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                                size_t n_bitmaps);
 
 // returns 0 on success
+// TODO: deprecate
 MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
                              const mtmd_image_tokens * image_tokens);
 
+// returns 0 on success
+MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
+                                   const mtmd_input_chunk * chunk);
+
 // get output embeddings from the last encode pass
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 

@@ -189,12 +215,16 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 //
 
 // helper function to construct a mtmd_bitmap from a file
+// it calls mtmd_helper_bitmap_init_from_buf() internally
 // returns nullptr on failure
 // this function is thread-safe
 MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
 
 // helper function to construct a mtmd_bitmap from a buffer containing a file
-//
+// supported formats:
+//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
+//     audio: formats supported by miniaudio: wav, mp3, flac
+// note: audio files will be auto-detected based on magic bytes
 // returns nullptr on failure
 // this function is thread-safe
 MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);

@@ -293,6 +323,7 @@ struct bitmap {
     uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
     uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
     const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
+    size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
    std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
     void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
 };
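The mtmd.h changes above promote audio to a first-class input alongside images: a bitmap can now wrap raw PCM F32 samples, the prompt marker is generalized from `<__image__>` to `mtmd_default_marker()`, and `mtmd_encode_chunk()` encodes any non-text chunk. The sketch below is illustrative only and is not part of the package; it assumes an already-initialized `mtmd_context` plus the pre-existing `mtmd_input_chunks_size()`/`mtmd_input_chunks_get()` accessors, which are not shown in this hunk.

```cpp
// Illustrative sketch (not from the diff): exercising the audio-capable mtmd API
// added in this release. Assumes `ctx` is an initialized mtmd_context and `chunks`
// was produced by mtmd_tokenize() from a prompt containing mtmd_default_marker()
// once per bitmap. mtmd_input_chunks_size()/get() are pre-existing accessors.
#include <cstdio>
#include <vector>
#include "mtmd.h"

static mtmd_bitmap * make_audio_bitmap(const std::vector<float> & pcm_f32) {
    // wraps PCM F32 samples; pass the result to mtmd_tokenize() alongside the prompt
    mtmd_bitmap * bmp = mtmd_bitmap_init_from_audio(pcm_f32.size(), pcm_f32.data());
    if (bmp) {
        printf("audio=%d, bytes=%zu\n", mtmd_bitmap_is_audio(bmp), mtmd_bitmap_get_n_bytes(bmp));
    }
    return bmp; // caller frees with mtmd_bitmap_free()
}

static bool encode_media_chunks(mtmd_context * ctx, mtmd_input_chunks * chunks) {
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); ++i) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            continue; // text chunks go straight to llama_decode()
        }
        // new in this version: one encode path for both image and audio chunks
        if (mtmd_encode_chunk(ctx, chunk) != 0) {
            return false;
        }
        float * embd = mtmd_get_output_embd(ctx);
        (void) embd; // feed these embeddings to the LLM via a custom batch
        printf("chunk %zu: %zu tokens, %d positions\n", i,
               mtmd_input_chunk_get_n_tokens(chunk),
               (int) mtmd_input_chunk_get_n_pos(chunk));
    }
    return true;
}
```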
package/src/llama.cpp/tools/run/run.cpp

@@ -936,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
 // Function to tokenize the prompt
 static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
                            std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
-    const bool is_first =
+    const bool is_first = llama_kv_self_seq_pos_max(llama_data.context.get(), 0) == 0;
 
     const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
     prompt_tokens.resize(n_prompt_tokens);

@@ -952,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
 // Check if we have enough space in the context to evaluate this batch
 static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
     const int n_ctx = llama_n_ctx(ctx.get());
-    const int n_ctx_used =
+    const int n_ctx_used = llama_kv_self_seq_pos_max(ctx.get(), 0);
     if (n_ctx_used + batch.n_tokens > n_ctx) {
         printf(LOG_COL_DEFAULT "\n");
         printe("context size exceeded\n");
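Both run.cpp changes swap a KV-cache token count for a per-sequence position query, mirroring the removal of `llama_kv_self_n_tokens()`/`llama_kv_self_used_cells()` from the server metrics further down. A minimal, hedged sketch of that idiom, using only calls that appear in this diff (the surrounding setup is assumed):

```cpp
// Illustrative sketch (not from the package): deriving context usage from
// per-sequence KV positions, as the updated run.cpp does. Assumes `ctx` is a
// valid llama_context whose conversation lives in sequence 0.
#include "llama.h"

static bool context_has_room(llama_context * ctx, int n_incoming_tokens) {
    // highest cached position in sequence 0; the updated tokenize_prompt()
    // treats a value of 0 as the first turn when deciding whether to add BOS
    const int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
    const int n_ctx      = llama_n_ctx(ctx);
    return n_ctx_used + n_incoming_tokens <= n_ctx;
}
```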
package/src/llama.cpp/tools/server/server.cpp

@@ -951,7 +951,7 @@ struct server_task_result_cmpl_partial : server_task_result {
     }
 
     json to_json_oaicompat_chat() {
-        bool first = n_decoded ==
+        bool first = n_decoded == 1;
         std::time_t t = std::time(0);
         json choices;
 

@@ -962,15 +962,18 @@ struct server_task_result_cmpl_partial : server_task_result {
                      {"delta", json{{"role", "assistant"}}}}});
         } else {
             // We have to send this as two updates to conform to openai behavior
+            // initial_ret is the role message for stream=True
             json initial_ret = json{{"choices", json::array({json{
                         {"finish_reason", nullptr},
                         {"index", 0},
                         {"delta", json{
-                            {"role", "assistant"}
+                            {"role", "assistant"},
+                            {"content", ""}
                         }}}})},
                     {"created", t},
                     {"id", oaicompat_cmpl_id},
                     {"model", oaicompat_model},
+                    {"system_fingerprint", build_info},
                     {"object", "chat.completion.chunk"}};
 
             json second_ret = json{

@@ -982,8 +985,19 @@ struct server_task_result_cmpl_partial : server_task_result {
                     {"created", t},
                     {"id", oaicompat_cmpl_id},
                     {"model", oaicompat_model},
+                    {"system_fingerprint", build_info},
                     {"object", "chat.completion.chunk"}};
 
+            if (prob_output.probs.size() > 0) {
+                second_ret["choices"][0]["logprobs"] = json{
+                    {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
+                };
+            }
+
+            if (timings.prompt_n >= 0) {
+                second_ret.push_back({"timings", timings.to_json()});
+            }
+
             return std::vector<json>({initial_ret, second_ret});
         }
     } else {

@@ -1137,9 +1151,6 @@ struct server_task_result_metrics : server_task_result {
     int n_tasks_deferred;
     int64_t t_start;
 
-    int32_t kv_cache_tokens_count;
-    int32_t kv_cache_used_cells;
-
     // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
     uint64_t n_prompt_tokens_processed_total = 0;
     uint64_t t_prompt_processing_total = 0;

@@ -1179,9 +1190,6 @@ struct server_task_result_metrics : server_task_result {
             { "n_decode_total",             n_decode_total },
             { "n_busy_slots_total",         n_busy_slots_total },
 
-            { "kv_cache_tokens_count",      kv_cache_tokens_count },
-            { "kv_cache_used_cells",        kv_cache_used_cells },
-
             { "slots",                      slots_data },
         };
     }

@@ -1883,6 +1891,7 @@ struct server_context {
     float slot_prompt_similarity = 0.0f;
 
     common_chat_templates_ptr chat_templates;
+    oaicompat_parser_options oai_parser_opt;
 
     ~server_context() {
         mtmd_free(mctx);

@@ -2004,6 +2013,23 @@ struct server_context {
             }
         }
 
+        if (!llama_kv_self_can_shift(ctx)) {
+            if (params_base.ctx_shift) {
+                params_base.ctx_shift = false;
+                SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled");
+            }
+
+            if (params_base.n_cache_reuse) {
+                params_base.n_cache_reuse = 0;
+                SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
+            }
+
+            if (!params_base.speculative.model.path.empty()) {
+                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
+                return false;
+            }
+        }
+
         return true;
     }
 

@@ -2061,6 +2087,15 @@ struct server_context {
         }
 
         metrics.init();
+
+        oai_parser_opt = {
+            /* use_jinja             */ params_base.use_jinja,
+            /* prefill_assistant     */ params_base.prefill_assistant,
+            /* reasoning_format      */ params_base.reasoning_format,
+            /* common_chat_templates */ chat_templates.get(),
+            /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
+            /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
+        };
     }
 
     server_slot * get_slot_by_id(int id) {

@@ -2754,9 +2789,6 @@ struct server_context {
                 res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
                 res->t_start          = metrics.t_start;
 
-                res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
-                res->kv_cache_used_cells   = llama_kv_self_used_cells(ctx);
-
                 res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
                 res->t_prompt_processing_total       = metrics.t_prompt_processing_total;
                 res->n_tokens_predicted_total        = metrics.n_tokens_predicted_total;

@@ -3181,7 +3213,15 @@ struct server_context {
                         // if we don't cache the prompt, we have to remove the entire KV cache
                         llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                         slot.n_past = 0;
-                        slot.cache_tokens.clear();
+                        slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
+                    }
+
+                    if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
+                        if (llama_kv_self_seq_pos_min(ctx, slot.id) > 0) {
+                            SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
+                                    "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+                            slot.n_past = 0;
+                        }
                     }
                 }
 

@@ -3311,6 +3351,37 @@ struct server_context {
             common_set_adapter_lora(ctx, slot_batched->lora);
         }
 
+        const bool do_encode = (params_base.embedding || params_base.reranking);
+
+        // pad the batch so that batch.n_tokens >= n_slots
+        // TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
+        if (do_encode) {
+            const int n_slots = slots.size();
+
+            if (batch.n_tokens < n_slots) {
+                std::set<llama_seq_id> seq_ids;
+                for (int j = 0; j < batch.n_tokens; ++j) {
+                    seq_ids.insert(batch.seq_id[j][0]);
+                }
+
+                // find unused sequence id
+                llama_seq_id seq_id = -1;
+                for (int i = 0; i < n_slots; ++i) {
+                    if (seq_ids.find(i) == seq_ids.end()) {
+                        seq_id = i;
+                    }
+                }
+
+                const int n_add = n_slots - batch.n_tokens;
+
+                SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id);
+
+                for (int j = 0; j < n_add; ++j) {
+                    common_batch_add(batch, 0, j, { seq_id }, false);
+                }
+            }
+        }
+
         // process the created batch of tokens
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

@@ -3327,7 +3398,7 @@ struct server_context {
 
             int ret = 0;
 
-            if (
+            if (do_encode) {
                 ret = llama_encode(ctx, batch_view);
             } else {
                 ret = llama_decode(ctx, batch_view);

@@ -3336,14 +3407,29 @@ struct server_context {
             metrics.on_decoded(slots);
 
             if (ret != 0) {
-
-
-
-
-
-
+                {
+                    std::string err;
+
+                    if (n_batch == 1 && ret == 1) {
+                        err = "Context size has been exceeded.";
+                    }
+
+                    if (ret == -1) {
+                        err = "Invalid input batch.";
+                    }
+
+                    if (ret < -1) {
+                        err = "Compute error.";
+                    }
+
+                    if (!err.empty()) {
+                        SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret);
+                        for (auto & slot : slots) {
+                            slot.release();
+                            send_error(slot, err);
+                        }
+                        break;
                 }
-                break; // break loop of n_batch
             }
 
             // retry with half the batch size to try to find a free slot in the KV cache

@@ -3677,6 +3763,7 @@ int main(int argc, char ** argv) {
         "/health",
         "/models",
         "/v1/models",
+        "/api/tags"
     };
 
     // If API key is not set, skip validation

@@ -3715,7 +3802,7 @@ int main(int argc, char ** argv) {
         if (req.path == "/" || tmp.back() == "html") {
             res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
             res.status = 503;
-        } else if (req.path == "/models" || req.path == "/v1/models") {
+        } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") {
             // allow the models endpoint to be accessed during loading
             return true;
         } else {

@@ -3858,14 +3945,6 @@ int main(int argc, char ** argv) {
                     {"name",  "predicted_tokens_seconds"},
                     {"help",  "Average generation throughput in tokens/s."},
                     {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.}
-            },{
-                    {"name",  "kv_cache_usage_ratio"},
-                    {"help",  "KV-cache usage. 1 means 100 percent usage."},
-                    {"value", 1. * res_metrics->kv_cache_used_cells / params.n_ctx}
-            },{
-                    {"name",  "kv_cache_tokens"},
-                    {"help",  "KV-cache tokens."},
-                    {"value", (uint64_t) res_metrics->kv_cache_tokens_count}
             },{
                     {"name",  "requests_processing"},
                     {"help",  "Number of requests processing."},

@@ -4023,7 +4102,10 @@ int main(int argc, char ** argv) {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params_base.n_parallel },
             { "model_path",                  ctx_server.params_base.model.path },
-            { "modalities",                  json{
+            { "modalities",                  json{
+                {"vision", ctx_server.oai_parser_opt.allow_image},
+                {"audio",  ctx_server.oai_parser_opt.allow_audio},
+            } },
             { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) },
             { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
             { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},

@@ -4061,6 +4143,19 @@ int main(int argc, char ** argv) {
                 { "llama.context_length", ctx_server.slots.back().n_ctx, },
             }
         },
+        {"modelfile", ""},
+        {"parameters", ""},
+        {"template", common_chat_templates_source(ctx_server.chat_templates.get())},
+        {"details", {
+            {"parent_model", ""},
+            {"format", "gguf"},
+            {"family", ""},
+            {"families", {""}},
+            {"parameter_size", ""},
+            {"quantization_level", ""}
+        }},
+        {"model_info", ""},
+        {"capabilities", {"completion"}}
     };
 
     res_ok(res, data);

@@ -4101,10 +4196,10 @@ int main(int argc, char ** argv) {
         for (auto & file : files) {
             mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
             if (!bmp.ptr) {
-                throw std::runtime_error("Failed to load image");
+                throw std::runtime_error("Failed to load image or audio file");
             }
             // calculate bitmap hash (for KV caching)
-            std::string hash = fnv_hash(bmp.data(), bmp.
+            std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
             bmp.set_id(hash.c_str());
             bitmaps.entries.push_back(std::move(bmp));
         }

@@ -4336,7 +4431,7 @@ int main(int argc, char ** argv) {
             OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
     };
 
-    const auto handle_chat_completions = [&ctx_server, &
+    const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
         LOG_DBG("request: %s\n", req.body.c_str());
         if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));

@@ -4345,13 +4440,9 @@ int main(int argc, char ** argv) {
 
         auto body = json::parse(req.body);
         std::vector<raw_buffer> files;
-        json data =
+        json data = oaicompat_chat_params_parse(
             body,
-
-            params.prefill_assistant,
-            params.reasoning_format,
-            ctx_server.chat_templates.get(),
-            ctx_server.mctx,
+            ctx_server.oai_parser_opt,
             files);
 
         handle_completions_impl(

@@ -4364,16 +4455,12 @@ int main(int argc, char ** argv) {
     };
 
     // same with handle_chat_completions, but without inference part
-    const auto handle_apply_template = [&ctx_server, &
+    const auto handle_apply_template = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
         std::vector<raw_buffer> files; // dummy, unused
-        json data =
+        json data = oaicompat_chat_params_parse(
            body,
-
-            params.prefill_assistant,
-            params.reasoning_format,
-            ctx_server.chat_templates.get(),
-            ctx_server.mctx,
+            ctx_server.oai_parser_opt,
             files);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };

@@ -4386,6 +4473,28 @@ int main(int argc, char ** argv) {
     }
 
     json models = {
+        {"models", {
+            {
+                {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
+                {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                {"modified_at", ""},
+                {"size", ""},
+                {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
+                {"type", "model"},
+                {"description", ""},
+                {"tags", {""}},
+                {"capabilities", {"completion"}},
+                {"parameters", ""},
+                {"details", {
+                    {"parent_model", ""},
+                    {"format", "gguf"},
+                    {"family", ""},
+                    {"families", {""}},
+                    {"parameter_size", ""},
+                    {"quantization_level", ""}
+                }}
+            }
+        }},
         {"object", "list"},
         {"data", {
             {

@@ -4395,7 +4504,7 @@ int main(int argc, char ** argv) {
                 {"owned_by", "llamacpp"},
                 {"meta", model_meta},
             },
-
+        }}
     };
 
     res_ok(res, models);

@@ -4723,11 +4832,13 @@ int main(int argc, char ** argv) {
     svr->Post("/api/show",            handle_api_show);
     svr->Get ("/models",              handle_models); // public endpoint (no API key check)
     svr->Get ("/v1/models",           handle_models); // public endpoint (no API key check)
+    svr->Get ("/api/tags",            handle_models); // ollama specific endpoint. public endpoint (no API key check)
     svr->Post("/completion",          handle_completions); // legacy
     svr->Post("/completions",         handle_completions);
     svr->Post("/v1/completions",      handle_completions_oai);
     svr->Post("/chat/completions",    handle_chat_completions);
     svr->Post("/v1/chat/completions", handle_chat_completions);
+    svr->Post("/api/chat",            handle_chat_completions); // ollama specific endpoint
     svr->Post("/infill",              handle_infill);
     svr->Post("/embedding",           handle_embeddings); // legacy
     svr->Post("/embeddings",          handle_embeddings);
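The new routes make llama-server answer Ollama-style discovery and chat requests alongside the existing OpenAI-compatible ones, and the props payload (served at /props in llama-server) now advertises which modalities the loaded mtmd projector supports. A hedged client-side sketch using cpp-httplib, the same HTTP library the server already embeds; the host, port, and JSON payload are assumptions for illustration:

```cpp
// Illustrative client sketch (not part of the package): probing the endpoints
// added in this release. Host/port and the request payload are assumptions.
#include <iostream>
#include "httplib.h"

int main() {
    httplib::Client cli("localhost", 8080);

    // Ollama-style model listing, served by the same handler as /v1/models
    if (auto res = cli.Get("/api/tags")) {
        std::cout << "/api/tags -> " << res->status << "\n" << res->body << "\n";
    }

    // /props now includes {"modalities": {"vision": ..., "audio": ...}}
    if (auto res = cli.Get("/props")) {
        std::cout << "/props -> " << res->body << "\n";
    }

    // Ollama-style chat alias for the OpenAI-compatible chat handler
    const char * payload = R"({"messages":[{"role":"user","content":"hello"}]})";
    if (auto res = cli.Post("/api/chat", payload, "application/json")) {
        std::cout << "/api/chat -> " << res->status << "\n";
    }
    return 0;
}
```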