@fugood/llama.node 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/LlamaContext.cpp +2 -2
  23. package/src/TokenizeWorker.cpp +1 -1
  24. package/src/llama.cpp/CMakeLists.txt +82 -54
  25. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  26. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  27. package/src/llama.cpp/common/common.cpp +748 -754
  28. package/src/llama.cpp/common/common.h +49 -41
  29. package/src/llama.cpp/common/grammar-parser.cpp +10 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  31. package/src/llama.cpp/common/log.h +5 -5
  32. package/src/llama.cpp/common/sampling.cpp +92 -10
  33. package/src/llama.cpp/common/sampling.h +6 -1
  34. package/src/llama.cpp/common/train.cpp +2 -2
  35. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  36. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  37. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
  39. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  40. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  42. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  43. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
  44. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
  45. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
  46. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  47. package/src/llama.cpp/examples/llava/clip.h +1 -1
  48. package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
  49. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  50. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  51. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  52. package/src/llama.cpp/examples/main/main.cpp +29 -17
  53. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  54. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  55. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  56. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  57. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  58. package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
  59. package/src/llama.cpp/examples/server/server.cpp +33 -25
  60. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  61. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  62. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  63. package/src/llama.cpp/ggml-backend.c +2 -3
  64. package/src/llama.cpp/ggml-common.h +0 -54
  65. package/src/llama.cpp/ggml-cuda.h +1 -0
  66. package/src/llama.cpp/ggml-impl.h +51 -0
  67. package/src/llama.cpp/ggml-kompute.cpp +13 -3
  68. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  69. package/src/llama.cpp/ggml-quants.c +3715 -2050
  70. package/src/llama.cpp/ggml-rpc.cpp +1155 -0
  71. package/src/llama.cpp/ggml-rpc.h +24 -0
  72. package/src/llama.cpp/ggml-sycl.cpp +119 -673
  73. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  74. package/src/llama.cpp/ggml-vulkan.cpp +203 -224
  75. package/src/llama.cpp/ggml.c +1208 -1483
  76. package/src/llama.cpp/ggml.h +71 -46
  77. package/src/llama.cpp/llama.cpp +1374 -938
  78. package/src/llama.cpp/llama.h +22 -6
  79. package/src/llama.cpp/requirements.txt +0 -2
  80. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
  82. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  83. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  84. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  85. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  86. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  87. package/src/llama.cpp/unicode-data.h +15 -12
  88. package/src/llama.cpp/unicode.cpp +89 -111
  89. package/src/llama.cpp/unicode.h +44 -12
  90. package/src/llama.cpp/build.zig +0 -172
  91. package/src/llama.cpp/ggml-mpi.c +0 -216
  92. package/src/llama.cpp/ggml-mpi.h +0 -39
  93. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
  94. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/examples/infill/infill.cpp

@@ -50,9 +50,9 @@ static void write_logfile(
  return;
  }

- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();

- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
  if (!success) {
  fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
  __func__, params.logdir.c_str());
@@ -70,7 +70,7 @@ static void write_logfile(
  fprintf(logfile, "binary: infill\n");
  char model_desc[128];
  llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);

  fprintf(logfile, "\n");
  fprintf(logfile, "######################\n");
@@ -78,8 +78,8 @@ static void write_logfile(
  fprintf(logfile, "######################\n");
  fprintf(logfile, "\n");

- dump_string_yaml_multiline(logfile, "output", output.c_str());
- dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+ yaml_dump_string_multiline(logfile, "output", output.c_str());
+ yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

  llama_dump_timing_info_yaml(logfile, ctx);
  fclose(logfile);
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  LOG_TEE("\n");
- LOG_TEE("%s\n", get_system_info(params).c_str());
+ LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
  }
  const bool add_bos = llama_should_add_bos_token(model);
  GGML_ASSERT(llama_add_eos_token(model) != 1);
@@ -621,8 +621,8 @@ int main(int argc, char ** argv) {

  if (params.escape) {
  //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
- process_escapes(params.input_prefix);
- process_escapes(params.input_suffix);
+ string_process_escapes(params.input_prefix);
+ string_process_escapes(params.input_suffix);
  }
  suff_rm_leading_spc = params.escape;
  if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
  }
  }

+ static std::string pair_str(const std::pair<int, int> & p) {
+ static char buf[32];
+ snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+ return buf;
+ }
+
  struct cmd_params {
  std::vector<std::string> model;
  std::vector<int> n_prompt;
  std::vector<int> n_gen;
+ std::vector<std::pair<int, int>> n_pg;
  std::vector<int> n_batch;
  std::vector<int> n_ubatch;
  std::vector<ggml_type> type_k;
@@ -188,11 +195,12 @@ static const cmd_params cmd_params_defaults = {
  /* model */ {"models/7B/ggml-model-q4_0.gguf"},
  /* n_prompt */ {512},
  /* n_gen */ {128},
+ /* n_pg */ {},
  /* n_batch */ {2048},
  /* n_ubatch */ {512},
  /* type_k */ {GGML_TYPE_F16},
  /* type_v */ {GGML_TYPE_F16},
- /* n_threads */ {get_math_cpu_count()},
+ /* n_threads */ {cpu_get_num_math()},
  /* n_gpu_layers */ {99},
  /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
  /* main_gpu */ {0},
@@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
  printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
  printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
  printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+ printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
  printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
- printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
- printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
- printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+ printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+ printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+ printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
  printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
  printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
  printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  }
  auto p = split<int>(argv[i], split_delim);
  params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+ } else if (arg == "-pg") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = split<std::string>(argv[i], ',');
+ if (p.size() != 2) {
+ invalid_param = true;
+ break;
+ }
+ params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
  } else if (arg == "-b" || arg == "--batch-size") {
  if (++i >= argc) {
  invalid_param = true;
@@ -493,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  if (params.model.empty()) { params.model = cmd_params_defaults.model; }
  if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
  if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
+ if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
  if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
  if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
  if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
@@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
  };
  instances.push_back(instance);
  }
+
+ for (const auto & n_pg : params.n_pg) {
+ if (n_pg.first == 0 && n_pg.second == 0) {
+ continue;
+ }
+ cmd_params_instance instance = {
+ /* .model = */ m,
+ /* .n_prompt = */ n_pg.first,
+ /* .n_gen = */ n_pg.second,
+ /* .n_batch = */ nb,
+ /* .n_ubatch = */ nub,
+ /* .type_k = */ tk,
+ /* .type_v = */ tv,
+ /* .n_threads = */ nt,
+ /* .n_gpu_layers = */ nl,
+ /* .split_mode = */ sm,
+ /* .main_gpu = */ mg,
+ /* .no_kv_offload= */ nkvo,
+ /* .flash_attn = */ fa,
+ /* .tensor_split = */ ts,
+ /* .use_mmap = */ mmp,
+ /* .embeddings = */ embd,
+ };
+ instances.push_back(instance);
+ }
  }

  return instances;
@@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
  if (field == "n_gpu_layers") {
  return 3;
  }
+ if (field == "test") {
+ return 13;
+ }

  int width = std::max((int)field.length(), 10);

@@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
  value = test::get_backend();
  } else if (field == "test") {
  if (t.n_prompt > 0 && t.n_gen == 0) {
- snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+ snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
  } else if (t.n_gen > 0 && t.n_prompt == 0) {
- snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+ snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
  } else {
- assert(false);
- exit(1);
+ snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
  }
  value = buf;
  } else if (field == "t/s") {
@@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
  llama_kv_cache_clear(ctx);

  uint64_t t_start = get_time_ns();
+
  if (t.n_prompt > 0) {
  test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
  }
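
Note: taken together, the llama-bench hunks above add a -pg <pp,tg> option that schedules a combined prompt-processing plus text-generation test in a single instance, reported as pp<N>+tg<N> in the markdown table. An invocation would presumably look something like "llama-bench -m model.gguf -pg 512,128"; the binary name and any flags beyond those shown here are assumptions, not part of this diff.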
package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt (new file)

@@ -0,0 +1,55 @@
+
+ # For more information about using CMake with Android Studio, read the
+ # documentation: https://d.android.com/studio/projects/add-native-code.html.
+ # For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+ # Sets the minimum CMake version required for this project.
+ cmake_minimum_required(VERSION 3.22.1)
+
+ # Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+ # Since this is the top level CMakeLists.txt, the project name is also accessible
+ # with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+ # build script scope).
+ project("llama-android")
+
+ ## Fetch latest llama.cpp from GitHub
+ #include(FetchContent)
+ #FetchContent_Declare(
+ # llama
+ # GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+ # GIT_TAG master
+ #)
+ #
+ ## Also provides "common"
+ #FetchContent_MakeAvailable(llama)
+
+ # llama.cpp CI uses the code from the current branch
+ # ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
+ add_subdirectory(../../../../../../ build-llama)
+
+ # Creates and names a library, sets it as either STATIC
+ # or SHARED, and provides the relative paths to its source code.
+ # You can define multiple libraries, and CMake builds them for you.
+ # Gradle automatically packages shared libraries with your APK.
+ #
+ # In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+ # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+ # is preferred for the same purpose.
+ #
+ # In order to load a library into your app from Java/Kotlin, you must call
+ # System.loadLibrary() and pass the name of the library defined here;
+ # for GameActivity/NativeActivity derived applications, the same library name must be
+ # used in the AndroidManifest.xml file.
+ add_library(${CMAKE_PROJECT_NAME} SHARED
+ # List C/C++ source files with relative paths to this CMakeLists.txt.
+ llama-android.cpp)
+
+ # Specifies libraries CMake should link to your target library. You
+ # can link libraries from various origins, such as libraries defined in this
+ # build script, prebuilt third-party libraries, or Android system libraries.
+ target_link_libraries(${CMAKE_PROJECT_NAME}
+ # List libraries link to the target library
+ llama
+ common
+ android
+ log)
package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt

@@ -1,4 +1,3 @@
-
  # For more information about using CMake with Android Studio, read the
  # documentation: https://d.android.com/studio/projects/add-native-code.html.
  # For more examples on how to use CMake, see https://github.com/android/ndk-samples.
@@ -36,15 +35,15 @@ FetchContent_MakeAvailable(llama)
  # for GameActivity/NativeActivity derived applications, the same library name must be
  # used in the AndroidManifest.xml file.
  add_library(${CMAKE_PROJECT_NAME} SHARED
- # List C/C++ source files with relative paths to this CMakeLists.txt.
- llama-android.cpp)
+ # List C/C++ source files with relative paths to this CMakeLists.txt.
+ llama-android.cpp)

  # Specifies libraries CMake should link to your target library. You
  # can link libraries from various origins, such as libraries defined in this
  # build script, prebuilt third-party libraries, or Android system libraries.
  target_link_libraries(${CMAKE_PROJECT_NAME}
- # List libraries link to the target library
- llama
- common
- android
- log)
+ # List libraries link to the target library
+ llama
+ common
+ android
+ log)
package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp

@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {

  extern "C"
  JNIEXPORT jlong JNICALL
- Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
+ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
  llama_model_params model_params = llama_model_default_params();

  auto path_to_model = env->GetStringUTFChars(filename, 0);
@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {

  extern "C"
  JNIEXPORT void JNICALL
- Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
+ Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
  llama_free_model(reinterpret_cast<llama_model *>(model));
  }

  extern "C"
  JNIEXPORT jlong JNICALL
- Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
  auto model = reinterpret_cast<llama_model *>(jmodel);

  if (!model) {
@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {

  extern "C"
  JNIEXPORT void JNICALL
- Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
+ Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
  llama_free(reinterpret_cast<llama_context *>(context));
  }

  extern "C"
  JNIEXPORT void JNICALL
- Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
+ Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
  llama_backend_free();
  }

  extern "C"
  JNIEXPORT void JNICALL
- Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
+ Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
  llama_log_set(log_callback, NULL);
  }

  extern "C"
  JNIEXPORT jstring JNICALL
- Java_com_example_llama_Llm_bench_1model(
+ Java_android_llama_cpp_LLamaAndroid_bench_1model(
  JNIEnv *env,
  jobject,
  jlong context_pointer,
@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(

  extern "C"
  JNIEXPORT void JNICALL
- Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+ Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
  llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
  }

  extern "C"
  JNIEXPORT jlong JNICALL
- Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {

  // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.

@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb

  extern "C"
  JNIEXPORT void JNICALL
- Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
+ Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
  llama_backend_init();
  }

  extern "C"
  JNIEXPORT jstring JNICALL
- Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
+ Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
  return env->NewStringUTF(llama_print_system_info());
  }

  extern "C"
  JNIEXPORT jint JNICALL
- Java_com_example_llama_Llm_completion_1init(
+ Java_android_llama_cpp_LLamaAndroid_completion_1init(
  JNIEnv *env,
  jobject,
  jlong context_pointer,
@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(

  extern "C"
  JNIEXPORT jstring JNICALL
- Java_com_example_llama_Llm_completion_1loop(
+ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
  JNIEnv * env,
  jobject,
  jlong context_pointer,
@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(

  extern "C"
  JNIEXPORT void JNICALL
- Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+ Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
  llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
  }
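
Note: the JNI symbol renames above imply that the Kotlin/Java binding class moved from com.example.llama.Llm to android.llama.cpp.LLamaAndroid. A minimal sketch of the Kotlin declarations those symbols would bind to is shown below; it is not part of this diff, the method names are decoded from the JNI symbols (a "_1" in the symbol decodes to "_"), and the library name is assumed to match project("llama-android") in the new CMakeLists.txt.

package android.llama.cpp

class LLamaAndroid {
    companion object {
        init {
            // Assumption: the shared library name comes from project("llama-android")
            // in the new llama/CMakeLists.txt shown above.
            System.loadLibrary("llama-android")
        }
    }

    // Java_android_llama_cpp_LLamaAndroid_load_1model -> load_model
    external fun load_model(filename: String): Long
    // Java_android_llama_cpp_LLamaAndroid_free_1model
    external fun free_model(model: Long)
    // Java_android_llama_cpp_LLamaAndroid_new_1context
    external fun new_context(model: Long): Long
    // Java_android_llama_cpp_LLamaAndroid_free_1context
    external fun free_context(context: Long)
    // Java_android_llama_cpp_LLamaAndroid_backend_1init / backend_1free
    external fun backend_init()
    external fun backend_free()
    // Java_android_llama_cpp_LLamaAndroid_system_1info
    external fun system_info(): String
    // Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear
    external fun kv_cache_clear(context: Long)
}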
package/src/llama.cpp/examples/llava/clip.h

@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
  /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
  CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

- /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
+ /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
  CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

  CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
package/src/llama.cpp/examples/llava/llava-cli.cpp

@@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
  LOG_TEE("\n");

  struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+ if (!ctx_sampling) {
+ fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+ exit(1);
+ }
+
  std::string response = "";
  for (int i = 0; i < max_tgt_len; i++) {
  const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
@@ -285,7 +290,7 @@ int main(int argc, char ** argv) {
  #endif // LOG_DISABLE_LOGS

  if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
- gpt_print_usage(argc, argv, params);
+ gpt_params_print_usage(argc, argv, params);
  show_additional_info(argc, argv);
  return 1;
  }
@@ -295,14 +300,10 @@ int main(int argc, char ** argv) {
  return 1;
  }

- for (auto & image : params.image) {
+ if (prompt_contains_image(params.prompt)) {
  auto ctx_llava = llava_init_context(&params, model);

- auto image_embed = load_image(ctx_llava, &params, image);
- if (!image_embed) {
- std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
- return 1;
- }
+ auto image_embed = load_image(ctx_llava, &params, "");

  // process the prompt
  process_prompt(ctx_llava, image_embed, &params, params.prompt);
@@ -311,7 +312,26 @@ int main(int argc, char ** argv) {
  llava_image_embed_free(image_embed);
  ctx_llava->model = NULL;
  llava_free(ctx_llava);
+ } else {
+ for (auto & image : params.image) {
+ auto ctx_llava = llava_init_context(&params, model);
+
+ auto image_embed = load_image(ctx_llava, &params, image);
+ if (!image_embed) {
+ std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+ return 1;
+ }
+
+ // process the prompt
+ process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+ llama_print_timings(ctx_llava->ctx_llama);
+ llava_image_embed_free(image_embed);
+ ctx_llava->model = NULL;
+ llava_free(ctx_llava);
+ }
  }
+
  llama_free_model(model);

  return 0;
package/src/llama.cpp/examples/llava/llava.cpp

@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
  // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
  static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
  struct {
- struct ggml_tensor * newline;
  struct ggml_context * ctx;
  } model;

@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

  model.ctx = ggml_init(params);

- ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
- model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
- if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
- if (newline_tmp->buffer == NULL) {
- LOG_TEE("newline_tmp tensor buffer is NULL\n");
- }
- ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
- } else {
- model.newline->data = newline_tmp->data;
- if (model.newline->data == NULL) {
- LOG_TEE("newline_tmp tensor data is NULL\n");
- }
- }
-
  struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
  // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
  // fill it with the image embeddings, ignoring the base
package/src/llama.cpp/examples/lookahead/lookahead.cpp

@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
  // debug
  if (dump_kv_cache) {
  llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
  }

  // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
package/src/llama.cpp/examples/lookup/lookup.cpp

@@ -121,7 +121,7 @@ int main(int argc, char ** argv){
  // debug
  if (dump_kv_cache) {
  llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
  }

  // print current draft sequence
package/src/llama.cpp/examples/main/main.cpp

@@ -60,9 +60,9 @@ static void write_logfile(
  return;
  }

- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();

- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
  if (!success) {
  fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
  __func__, params.logdir.c_str());
@@ -80,7 +80,7 @@ static void write_logfile(
  fprintf(logfile, "binary: main\n");
  char model_desc[128];
  llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);

  fprintf(logfile, "\n");
  fprintf(logfile, "######################\n");
@@ -88,8 +88,8 @@ static void write_logfile(
  fprintf(logfile, "######################\n");
  fprintf(logfile, "\n");

- dump_string_yaml_multiline(logfile, "output", output.c_str());
- dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+ yaml_dump_string_multiline(logfile, "output", output.c_str());
+ yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

  llama_dump_timing_info_yaml(logfile, ctx);
  fclose(logfile);
@@ -181,7 +181,7 @@ int main(int argc, char ** argv) {

  std::mt19937 rng(params.seed);
  if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
  }

  LOG("%s: llama backend init\n", __func__);
@@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  LOG_TEE("\n");
- LOG_TEE("%s\n", get_system_info(params).c_str());
+ LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
  }

  std::string path_session = params.path_prompt_cache;
@@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
  LOG_TEE("\n\n");

  if (params.interactive) {
- const char *control_message;
+ const char * control_message;
  if (params.multiline_input) {
- control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+ control_message = " - To return control to the AI, end your input with '\\'.\n"
  " - To return control without starting a new line, end your input with '/'.\n";
  } else {
- control_message = " - Press Return to return control to LLaMa.\n"
+ control_message = " - Press Return to return control to the AI.\n"
  " - To return control without starting a new line, end your input with '/'.\n"
  " - If you want to submit another line, end your input with '\\'.\n";
  }
@@ -523,6 +523,10 @@ int main(int argc, char ** argv) {
  }

  struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+ if (!ctx_sampling) {
+ fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+ exit(1);
+ }

  while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
  // predict
@@ -703,7 +707,7 @@ int main(int argc, char ** argv) {

  const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

- llama_sampling_accept(ctx_sampling, ctx, id, true);
+ llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);

  LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

@@ -724,7 +728,7 @@ int main(int argc, char ** argv) {

  // push the prompt in the sampling context in order to apply repetition penalties later
  // for the prompt, we don't apply grammar rules
- llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+ llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);

  ++n_consumed;
  if ((int) embd.size() >= params.n_batch) {
@@ -736,18 +740,26 @@ int main(int argc, char ** argv) {
  // display text
  if (input_echo && display) {
  for (auto id : embd) {
- const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
- printf("%s", token_str.c_str());
+ const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+
+ // Console/Stream Output
+ fprintf(stdout, "%s", token_str.c_str());

+ // Record Displayed Tokens To Log
+ // Note: Generated tokens are created one by one hence this check
  if (embd.size() > 1) {
+ // Incoming Requested Tokens
  input_tokens.push_back(id);
  } else {
+ // Outgoing Generated Tokens
  output_tokens.push_back(id);
  output_ss << token_str;
  }
+
+ fflush(stdout);
  }
- fflush(stdout);
  }
+
  // reset color to default if there is no pending user input
  if (input_echo && (int) embd_inp.size() == n_consumed) {
  console::set_display(console::reset);
@@ -875,11 +887,11 @@ int main(int argc, char ** argv) {
  embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
  }
  if (params.escape) {
- process_escapes(buffer);
+ string_process_escapes(buffer);
  }

  const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
- const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
+ const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
  const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

  LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
package/src/llama.cpp/examples/parallel/parallel.cpp

@@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
  while (true) {
  if (dump_kv_cache) {
  llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
  }

  llama_batch_clear(batch);