@fugood/llama.node 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. package/CMakeLists.txt +15 -0
  2. package/README.md +3 -2
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +12 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/DetokenizeWorker.cpp +22 -0
  23. package/src/DetokenizeWorker.h +19 -0
  24. package/src/EmbeddingWorker.cpp +46 -0
  25. package/src/EmbeddingWorker.h +23 -0
  26. package/src/LlamaContext.cpp +62 -0
  27. package/src/LlamaContext.h +3 -0
  28. package/src/TokenizeWorker.cpp +26 -0
  29. package/src/TokenizeWorker.h +23 -0
  30. package/src/common.hpp +3 -2
  31. package/src/llama.cpp/CMakeLists.txt +14 -12
  32. package/src/llama.cpp/common/common.cpp +19 -5
  33. package/src/llama.cpp/common/common.h +2 -0
  34. package/src/llama.cpp/common/grammar-parser.cpp +9 -0
  35. package/src/llama.cpp/common/sampling.cpp +3 -3
  36. package/src/llama.cpp/common/sampling.h +1 -1
  37. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
  39. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
  40. package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
  41. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
  42. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  43. package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
  44. package/src/llama.cpp/examples/main/main.cpp +5 -1
  45. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  46. package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
  47. package/src/llama.cpp/examples/server/server.cpp +12 -16
  48. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  49. package/src/llama.cpp/ggml-backend.c +2 -2
  50. package/src/llama.cpp/ggml-kompute.cpp +9 -3
  51. package/src/llama.cpp/ggml-quants.c +6 -0
  52. package/src/llama.cpp/ggml-rpc.cpp +1023 -0
  53. package/src/llama.cpp/ggml-rpc.h +24 -0
  54. package/src/llama.cpp/ggml-sycl.cpp +20 -143
  55. package/src/llama.cpp/ggml-vulkan.cpp +4 -2
  56. package/src/llama.cpp/ggml.c +116 -271
  57. package/src/llama.cpp/ggml.h +12 -15
  58. package/src/llama.cpp/llama.cpp +451 -265
  59. package/src/llama.cpp/llama.h +3 -0
  60. package/src/llama.cpp/requirements.txt +0 -1
  61. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
  63. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  64. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  65. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2

package/src/llama.cpp/examples/CMakeLists.txt

@@ -49,4 +49,7 @@ else()
         add_subdirectory(server)
     endif()
     add_subdirectory(export-lora)
+    if (LLAMA_RPC)
+        add_subdirectory(rpc)
+    endif()
 endif()

package/src/llama.cpp/examples/embedding/embedding.cpp

@@ -49,6 +49,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }

         float * out = output + batch.seq_id[i][0] * n_embd;
+        //TODO: I would also add a parameter here to enable normalization or not.
+        /*fprintf(stdout, "unnormalized_embedding:");
+        for (int hh = 0; hh < n_embd; hh++) {
+            fprintf(stdout, "%9.6f ", embd[hh]);
+        }
+        fprintf(stdout, "\n");*/
         llama_embd_normalize(embd, out, n_embd);
     }
 }
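
The TODO added above concerns making the normalization step optional; as called here, llama_embd_normalize is expected to L2-normalize each sequence embedding before it is written to the output buffer. A minimal, illustrative sketch of that operation (not the library's actual implementation):

#include <cmath>

// Illustrative only: L2-normalize an embedding vector of length n_embd.
// This mirrors what the llama_embd_normalize() call above is expected to do;
// the commented-out fprintf block in the diff would dump the raw values first.
static void l2_normalize(const float * inp, float * out, int n_embd) {
    double sum = 0.0;
    for (int i = 0; i < n_embd; i++) {
        sum += (double) inp[i] * inp[i];
    }
    const float norm = sum > 0.0 ? (float) std::sqrt(sum) : 0.0f;
    for (int i = 0; i < n_embd; i++) {
        out[i] = norm > 0.0f ? inp[i] / norm : 0.0f;
    }
}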

@@ -123,10 +129,12 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }

-    // add SEP if not present
+    // check if the last token is SEP
+    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            inp.push_back(llama_token_sep(model));
+            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }


package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
     }
 }

+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
 struct cmd_params {
     std::vector<std::string> model;
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
+    std::vector<std::pair<int, int>> n_pg;
     std::vector<int> n_batch;
     std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;

@@ -188,6 +195,7 @@ static const cmd_params cmd_params_defaults = {
     /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
     /* n_gen */ {128},
+    /* n_pg */ {{512, 128}},
     /* n_batch */ {2048},
     /* n_ubatch */ {512},
     /* type_k */ {GGML_TYPE_F16},

@@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
     printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());

@@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-pg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], ',');
+            if (p.size() != 2) {
+                invalid_param = true;
+                break;
+            }
+            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
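
The new -pg branch in llama-bench.cpp reuses the tool's existing split<T>() helper to break the "pp,tg" argument at the comma before std::stoi converts the two halves. That helper is not part of this hunk; a rough, self-contained stand-in (hypothetical, for illustration only) could look like:

#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for llama-bench's split<T>() as used by the -pg parser above:
// splits "512,128" on ',' into {"512", "128"}; std::stoi then turns each piece into an int.
static std::vector<std::string> split_csv(const std::string & input, char delim) {
    std::vector<std::string> parts;
    std::stringstream ss(input);
    std::string token;
    while (std::getline(ss, token, delim)) {
        parts.push_back(token);
    }
    return parts;
}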

@@ -493,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.model.empty()) { params.model = cmd_params_defaults.model; }
     if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
     if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
     if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
     if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }

@@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            };
            instances.push_back(instance);
        }
+
+        for (const auto & n_pg : params.n_pg) {
+            if (n_pg.first == 0 && n_pg.second == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model = */ m,
+                /* .n_prompt = */ n_pg.first,
+                /* .n_gen = */ n_pg.second,
+                /* .n_batch = */ nb,
+                /* .n_ubatch = */ nub,
+                /* .type_k = */ tk,
+                /* .type_v = */ tv,
+                /* .n_threads = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .split_mode = */ sm,
+                /* .main_gpu = */ mg,
+                /* .no_kv_offload= */ nkvo,
+                /* .flash_attn = */ fa,
+                /* .tensor_split = */ ts,
+                /* .use_mmap = */ mmp,
+                /* .embeddings = */ embd,
+            };
+            instances.push_back(instance);
+        }
    }

    return instances;

@@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
        if (field == "n_gpu_layers") {
            return 3;
        }
+       if (field == "test") {
+           return 13;
+       }

        int width = std::max((int)field.length(), 10);

@@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
            value = test::get_backend();
        } else if (field == "test") {
            if (t.n_prompt > 0 && t.n_gen == 0) {
-               snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+               snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
            } else if (t.n_gen > 0 && t.n_prompt == 0) {
-               snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+               snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
            } else {
-               assert(false);
-               exit(1);
+               snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
            }
            value = buf;
        } else if (field == "t/s") {
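
With the printer change above, a combined prompt-processing + text-generation run now gets its own row label instead of tripping the old assert. A tiny worked example of the new naming, using the {512, 128} default added for -pg earlier in this diff:

#include <cstdio>

int main() {
    char buf[64];
    int n_prompt = 512, n_gen = 128;              // the {512, 128} default for -pg
    snprintf(buf, sizeof(buf), "pp%d+tg%d", n_prompt, n_gen);
    printf("%s\n", buf);                          // prints "pp512+tg128"
    return 0;
}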

@@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
        llama_kv_cache_clear(ctx);

        uint64_t t_start = get_time_ns();
+
        if (t.n_prompt > 0) {
            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
        }

package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt

@@ -37,7 +37,7 @@ FetchContent_MakeAvailable(llama)
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
     # List C/C++ source files with relative paths to this CMakeLists.txt.
-    llama-android.cpp)
+    llama-android.cpp)

 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this

package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt

@@ -0,0 +1,49 @@
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+include(FetchContent)
+FetchContent_Declare(
+        llama
+        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+        GIT_TAG master
+)
+
+# Also provides "common"
+FetchContent_MakeAvailable(llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+        # List C/C++ source files with relative paths to this CMakeLists.txt.
+        llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+        # List libraries link to the target library
+        llama
+        common
+        android
+        log)

package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp

@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {

 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
+Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
     llama_model_params model_params = llama_model_default_params();

     auto path_to_model = env->GetStringUTFChars(filename, 0);

@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {

 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
+Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
     llama_free_model(reinterpret_cast<llama_model *>(model));
 }

 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
     auto model = reinterpret_cast<llama_model *>(jmodel);

     if (!model) {

@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {

 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
     llama_free(reinterpret_cast<llama_context *>(context));
 }

 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
     llama_backend_free();
 }

 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
     llama_log_set(log_callback, NULL);
 }

 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_bench_1model(
+Java_android_llama_cpp_LLamaAndroid_bench_1model(
         JNIEnv *env,
         jobject,
         jlong context_pointer,

@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(

 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
     llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
 }

 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {

     // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.

@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb

 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
     llama_backend_init();
 }

 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
+Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
     return env->NewStringUTF(llama_print_system_info());
 }

 extern "C"
 JNIEXPORT jint JNICALL
-Java_com_example_llama_Llm_completion_1init(
+Java_android_llama_cpp_LLamaAndroid_completion_1init(
         JNIEnv *env,
         jobject,
         jlong context_pointer,

@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(

 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_completion_1loop(
+Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,

@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(

 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
     llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
 }
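
These renames are mechanical: the native methods were moved from the Kotlin/Java class com.example.llama.Llm to android.llama.cpp.LLamaAndroid, and JNI symbol names are derived from the fully qualified class name ('.' becomes '_', and a literal '_' in a method name is escaped as '_1'). A minimal illustration of the mangling (declaration only, not the project's actual code):

#include <jni.h>

// JNI symbol for a native method `load_model(filename)` declared on class
// android.llama.cpp.LLamaAndroid:
//   "Java_" + "android_llama_cpp_LLamaAndroid" + "_load_1model"
// ('_1' is the JNI escape for the literal '_' in the method name "load_model").
extern "C" JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv * env, jobject, jstring filename);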

package/src/llama.cpp/examples/llava/llava-cli.cpp

@@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    LOG_TEE("\n");

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+   if (!ctx_sampling) {
+       fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+       exit(1);
+   }
+
    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);

@@ -295,14 +300,10 @@ int main(int argc, char ** argv) {
        return 1;
    }

-   for (auto & image : params.image) {
+   if (prompt_contains_image(params.prompt)) {
        auto ctx_llava = llava_init_context(&params, model);

-       auto image_embed = load_image(ctx_llava, &params, image);
-       if (!image_embed) {
-           std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
-           return 1;
-       }
+       auto image_embed = load_image(ctx_llava, &params, "");

        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);

@@ -311,7 +312,26 @@ int main(int argc, char ** argv) {
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
+   } else {
+       for (auto & image : params.image) {
+           auto ctx_llava = llava_init_context(&params, model);
+
+           auto image_embed = load_image(ctx_llava, &params, image);
+           if (!image_embed) {
+               std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+               return 1;
+           }
+
+           // process the prompt
+           process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+           llama_print_timings(ctx_llava->ctx_llama);
+           llava_image_embed_free(image_embed);
+           ctx_llava->model = NULL;
+           llava_free(ctx_llava);
+       }
    }
+
    llama_free_model(model);

    return 0;

package/src/llama.cpp/examples/main/main.cpp

@@ -523,6 +523,10 @@ int main(int argc, char ** argv) {
    }

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+   if (!ctx_sampling) {
+       fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+       exit(1);
+   }

    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict

@@ -879,7 +883,7 @@ int main(int argc, char ** argv) {
            }

            const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-           const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
+           const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
            const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

            LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

package/src/llama.cpp/examples/rpc/CMakeLists.txt

@@ -0,0 +1,2 @@
+add_executable(rpc-server rpc-server.cpp)
+target_link_libraries(rpc-server PRIVATE ggml llama)

package/src/llama.cpp/examples/rpc/rpc-server.cpp

@@ -0,0 +1,70 @@
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include "ggml-rpc.h"
+#include <string>
+#include <stdio.h>
+
+static ggml_backend_t create_backend() {
+    ggml_backend_t backend = NULL;
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#elif GGML_USE_METAL
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#endif
+
+    // if there aren't GPU Backends fallback to CPU backend
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+    return backend;
+}
+
+static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
+#ifdef GGML_USE_CUDA
+    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#else
+    // TODO: implement for other backends
+    *free_mem = 1;
+    *total_mem = 1;
+#endif
+}
+
+int main(int argc, char * argv[]) {
+    if (argc < 3) {
+        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
+        return 1;
+    }
+    const char * host = argv[1];
+    int port = std::stoi(argv[2]);
+    if (port <= 0 || port > 65535) {
+        fprintf(stderr, "Invalid port number: %d\n", port);
+        return 1;
+    }
+    ggml_backend_t backend = create_backend();
+    if (!backend) {
+        fprintf(stderr, "Failed to create backend\n");
+        return 1;
+    }
+    printf("Starting RPC server on %s:%d\n", host, port);
+    size_t free_mem, total_mem;
+    get_backend_memory(&free_mem, &total_mem);
+    std::string endpoint = std::string(host) + ":" + std::to_string(port);
+    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_free(backend);
+    return 0;
+}
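
On the client side, the new RPC backend is consumed through ggml-rpc.h (added in this release with 24 lines, not shown in this diff). A hedged sketch of how a client process might attach to the server started above, assuming the header exposes ggml_backend_rpc_init(const char * endpoint) as in upstream llama.cpp; verify the exact signature against the shipped header:

#include "ggml-rpc.h"   // assumed to pull in ggml-backend.h, as rpc-server.cpp above suggests
#include <string>
#include <stdio.h>

// Hypothetical client-side usage of the RPC backend served by rpc-server above.
int main() {
    const std::string endpoint = "127.0.0.1:50052"; // host:port where rpc-server listens
    ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
    if (!backend) {
        fprintf(stderr, "failed to connect to RPC server at %s\n", endpoint.c_str());
        return 1;
    }
    // ... build a ggml graph and schedule it on `backend` as with any other ggml backend ...
    ggml_backend_free(backend);
    return 0;
}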

package/src/llama.cpp/examples/server/server.cpp

@@ -651,9 +651,6 @@ struct server_context {
    std::string system_prompt;
    std::vector<llama_token> system_tokens;

-   std::string name_user; // this should be the antiprompt
-   std::string name_assistant;
-
    // slots / clients
    std::vector<server_slot> slots;
    json default_generation_settings_for_props;

@@ -673,6 +670,8 @@ struct server_context {
            llama_free_model(model);
            model = nullptr;
        }
+
+       llama_batch_free(batch);
    }

    bool load_model(const gpt_params & params_) {

@@ -1098,15 +1097,11 @@ struct server_context {
        system_need_update = false;
    }

-   void system_prompt_set(const json & sys_props) {
-       system_prompt = sys_props.value("prompt", "");
-       name_user = sys_props.value("anti_prompt", "");
-       name_assistant = sys_props.value("assistant_name", "");
+   bool system_prompt_set(const std::string & sys_prompt) {
+       system_prompt = sys_prompt;

        LOG_VERBOSE("system prompt process", {
            {"system_prompt", system_prompt},
-           {"name_user", name_user},
-           {"name_assistant", name_assistant},
        });

        // release all slots

@@ -1115,6 +1110,7 @@ struct server_context {
        }

        system_need_update = true;
+       return true;
    }

    bool process_token(completion_token_output & result, server_slot & slot) {

@@ -1534,7 +1530,8 @@ struct server_context {
            }

            if (task.data.contains("system_prompt")) {
-               system_prompt_set(task.data.at("system_prompt"));
+               std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
+               system_prompt_set(sys_prompt);

                for (server_slot & slot : slots) {
                    slot.n_past = 0;

@@ -2270,10 +2267,10 @@ struct server_context {

            const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
            if (n_probs > 0) {
-               const size_t n_considered = slot.ctx_sampling->n_considered;
+               const size_t n_valid = slot.ctx_sampling->n_valid;

                // Make sure at least n_probs top tokens are at the front of the vector:
-               if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
+               if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
                    llama_sample_top_k(ctx, &cur_p, n_probs, 0);
                }

@@ -2289,7 +2286,7 @@ struct server_context {
                for (size_t i = 0; i < n_probs; ++i) {
                    result.probs.push_back({
                        cur_p.data[i].id,
-                       i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
+                       i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
                    });
                }
            }

@@ -2918,7 +2915,7 @@ int main(int argc, char ** argv) {
    server_params_parse(argc, argv, sparams, params);

    if (!sparams.system_prompt.empty()) {
-       ctx_server.system_prompt_set(json::parse(sparams.system_prompt));
+       ctx_server.system_prompt_set(sparams.system_prompt);
    }

    if (params.model_alias == "unknown") {

@@ -3407,8 +3404,7 @@ int main(int argc, char ** argv) {
    const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        json data = {
-           { "user_name", ctx_server.name_user.c_str() },
-           { "assistant_name", ctx_server.name_assistant.c_str() },
+           { "system_prompt", ctx_server.system_prompt.c_str() },
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params.n_parallel }
        };

package/src/llama.cpp/examples/server/utils.hpp

@@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
    llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
    llama_params["seed"]             = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"]           = json_value(body, "stream", false);
-   llama_params["temperature"]      = json_value(body, "temperature", 0.0);
+   llama_params["temperature"]      = json_value(body, "temperature", 1.0);
    llama_params["top_p"]            = json_value(body, "top_p", 1.0);

    // Apply chat template to the list of messages
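
The new default of 1.0 matches the OpenAI API default temperature and only applies when the request omits the field. The line relies on the server's json_value(body, key, default) helper, which falls back to the supplied default when the key is missing; a simplified sketch of that pattern (not the exact helper in utils.hpp):

#include <string>
#include <nlohmann/json.hpp>   // the server vendors this header as json.hpp
using json = nlohmann::json;

// Simplified version of the json_value(body, key, default) pattern used above:
// return body[key] if present and non-null, otherwise the supplied default,
// e.g. temperature falls back to 1.0 when the request does not specify one.
template <typename T>
static T json_value_sketch(const json & body, const std::string & key, const T & default_value) {
    return body.contains(key) && !body.at(key).is_null() ? body.at(key).get<T>() : default_value;
}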

package/src/llama.cpp/ggml-backend.c

@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 static char * fmt_size(size_t size) {
     static char buffer[128];
     if (size >= 1024*1024) {
-        sprintf(buffer, "%zuM", size/1024/1024);
+        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
     } else {
-        sprintf(buffer, "%zuK", size/1024);
+        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
     }
     return buffer;
 }
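
The sprintf to snprintf swap is a defensive change: snprintf never writes more than the given buffer size, always NUL-terminates, and returns the length it would have needed, so oversized values truncate instead of overflowing. A tiny standalone illustration:

#include <cstdio>

int main() {
    char buf[4];
    const unsigned long long bytes = 123456789012ull;           // ~117737 MiB
    // sprintf would write "117737M" plus '\0' (8 bytes) into a 4-byte buffer;
    // snprintf stops after 3 characters plus the terminator and returns the
    // full length it would have needed, so truncation is detectable.
    int needed = snprintf(buf, sizeof(buf), "%zuM", (size_t)(bytes / 1024 / 1024));
    printf("%s (needed %d chars)\n", buf, needed);               // "117 (needed 7 chars)"
    return 0;
}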

package/src/llama.cpp/ggml-kompute.cpp

@@ -1559,12 +1559,18 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             case GGML_OP_SOFT_MAX:
                 {
                     float scale;
-                    memcpy(&scale, dst->op_params, sizeof(float));
+                    float max_bias;

-#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
+                    memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
+                    memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
                     GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
-                    GGML_ASSERT(src2 == nullptr);
+
+#pragma message("TODO: add ALiBi support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
+                    GGML_ASSERT(max_bias == 0.0f);

                     ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                 } break;

package/src/llama.cpp/ggml-quants.c

@@ -14,6 +14,12 @@
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT

+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid warnings for hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
 #define UNUSED GGML_UNUSED

 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7