@fugood/llama.node 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/llama.cpp/CMakeLists.txt +14 -12
  24. package/src/llama.cpp/common/common.cpp +19 -5
  25. package/src/llama.cpp/common/common.h +2 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +9 -0
  27. package/src/llama.cpp/common/sampling.cpp +3 -3
  28. package/src/llama.cpp/common/sampling.h +1 -1
  29. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
  31. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
  32. package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
  33. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
  34. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  35. package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
  36. package/src/llama.cpp/examples/main/main.cpp +5 -1
  37. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  38. package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
  39. package/src/llama.cpp/examples/server/server.cpp +12 -16
  40. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  41. package/src/llama.cpp/ggml-backend.c +2 -2
  42. package/src/llama.cpp/ggml-kompute.cpp +9 -3
  43. package/src/llama.cpp/ggml-quants.c +6 -0
  44. package/src/llama.cpp/ggml-rpc.cpp +1023 -0
  45. package/src/llama.cpp/ggml-rpc.h +24 -0
  46. package/src/llama.cpp/ggml-sycl.cpp +20 -143
  47. package/src/llama.cpp/ggml-vulkan.cpp +4 -2
  48. package/src/llama.cpp/ggml.c +116 -271
  49. package/src/llama.cpp/ggml.h +12 -15
  50. package/src/llama.cpp/llama.cpp +451 -265
  51. package/src/llama.cpp/llama.h +3 -0
  52. package/src/llama.cpp/requirements.txt +0 -1
  53. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
  55. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  56. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  57. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
package/CMakeLists.txt CHANGED
@@ -64,6 +64,15 @@ if (VULKAN_SDK)
   find_package(Vulkan REQUIRED)
 endif()
 
+find_program(PATCH patch REQUIRED)
+
+add_custom_target(
+  patch ALL
+  COMMAND ${PATCH} -p1 -N < ${CMAKE_SOURCE_DIR}/patches/llama.patch || true
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/llama.cpp
+  COMMENT "Applying patches"
+)
+
 set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library")
 add_subdirectory("src/llama.cpp")
 
package/README.md CHANGED
@@ -30,7 +30,7 @@ const context = await loadModel({
 })
 
 // Do completion
-const { text, timings } = await context.completion(
+const { text } = await context.completion(
   {
     prompt: 'This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.\n\nUser: Hello!\nLlama:',
     n_predict: 100,
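For reference, a minimal end-to-end sketch of the completion API the updated README documents. The import path follows the package name, and the "model" option name is assumed from the package README rather than shown in this diff:

  import { loadModel } from '@fugood/llama.node'

  const main = async () => {
    // Assumed option name: pass a local GGUF model path.
    const context = await loadModel({ model: './ggml-model-q4_0.gguf' })

    // As of 0.2.1 the README destructures only `text` from the completion result.
    const { text } = await context.completion({
      prompt: 'This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.\n\nUser: Hello!\nLlama:',
      n_predict: 100,
    })
    console.log(text)
  }

  main()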
Binary files changed (prebuilt llama-node.node binaries, default.metallib, and node.lib for each platform/arch); contents not shown.
package/lib/binding.ts CHANGED
@@ -62,7 +62,7 @@ export interface Module {
   LlamaContext: LlamaContext
 }
 
-export type LibVariant = 'default' | 'opencl'
+export type LibVariant = 'default' | 'vulkan'
 
 const setupEnv = (variant?: string) => {
   const postfix = variant ? `-${variant}` : ''
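The opencl variant is replaced by vulkan, which maps onto the new bin/linux-vulkan and bin/win32-vulkan binaries through the `-${variant}` postfix in setupEnv. A hedged sketch of selecting the variant from application code follows; the "lib_variant" option name is an assumption, and only the LibVariant type itself appears in this diff:

  import { loadModel } from '@fugood/llama.node'

  const initVulkan = async () => {
    // 'vulkan' resolves to bin/<platform>-vulkan/<arch>/llama-node.node;
    // 'opencl' is no longer a valid value as of 0.2.1.
    return loadModel({
      model: './ggml-model-q4_0.gguf',
      lib_variant: 'vulkan', // assumed option name
    })
  }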
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.2.0",
+  "version": "0.2.1",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {
@@ -38,6 +38,7 @@
     ]
   },
   "files": [
+    "patches/*.patch",
     "bin/**/*",
     "src/**/*.{c,cc,cpp,h,hh,hpp,txt,cmake}",
     "lib/*.js",
package/patches/llama.patch ADDED
@@ -0,0 +1,22 @@
+diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
+index b9449be0..cfa0f774 100644
+--- a/ggml-vulkan.cpp
++++ b/ggml-vulkan.cpp
+@@ -525,9 +525,15 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
+         vk::PipelineCreateFlags(),
+         pipeline_shader_create_info,
+         pipeline->layout);
+-    pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+
+-    ctx->device->pipelines.push_back(pipeline);
++    try {
++        pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
++        ctx->device->pipelines.push_back(pipeline);
++    } catch (vk::UnknownError const&) {
++        std::cerr << "ggml_vk_create_pipeline: Failed to create pipeline " << name << std::endl;
++        ggml_vk_destroy_pipeline(ctx->device->device, pipeline);
++        pipeline.reset();
++    }
+ }
+
+ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
package/src/TokenizeWorker.cpp CHANGED
@@ -7,7 +7,7 @@ TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
 
 void TokenizeWorker::Execute() {
   const auto tokens = ::llama_tokenize(_sess->context(), _text, false);
-  _result = {.tokens = std::move(tokens)};
+  _result.tokens = std::move(tokens);
 }
 
 void TokenizeWorker::OnOK() {
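TokenizeWorker now assigns the tokens member directly instead of rebuilding the whole result struct with a designated initializer. A hedged sketch of the JavaScript side this worker backs; the "tokenize" method name and its { tokens } result shape are assumptions, not shown in this diff:

  import { loadModel } from '@fugood/llama.node'

  const countTokens = async (text: string): Promise<number> => {
    const context = await loadModel({ model: './ggml-model-q4_0.gguf' })
    // Assumed method and result shape; _result.tokens in TokenizeWorker carries the token ids.
    const { tokens } = await context.tokenize(text)
    return tokens.length
  }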
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
+option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
@@ -296,7 +297,7 @@ if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
         set(BLA_SIZEOF_INTEGER 8)
     endif()
 
@@ -494,6 +495,17 @@ if (LLAMA_MPI)
     endif()
 endif()
 
+if (LLAMA_RPC)
+    add_compile_definitions(GGML_USE_RPC)
+
+    if (WIN32)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
+    endif()
+
+    set(GGML_HEADERS_RPC ggml-rpc.h)
+    set(GGML_SOURCES_RPC ggml-rpc.cpp)
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -1176,6 +1188,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
@@ -1281,17 +1294,6 @@ install(
         WORLD_READ
         WORLD_EXECUTE
     DESTINATION ${CMAKE_INSTALL_BINDIR})
-install(
-    FILES convert-lora-to-ggml.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
 if (LLAMA_METAL)
     install(
         FILES ggml-metal.metal
package/src/llama.cpp/common/common.cpp CHANGED
@@ -901,6 +901,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1056,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    if (arg == "--rpc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.rpc_servers = argv[i];
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
@@ -1367,14 +1379,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
     }
 
     if (params.prompt_cache_all &&
@@ -1422,6 +1432,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -h, --help                show this help message and exit\n");
     printf("  --version                 show version and build info\n");
     printf("  -i, --interactive         run in interactive mode\n");
+    printf("  --interactive-specials    allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first       run in interactive mode and wait for input right away\n");
     printf("  -cnv, --conversation      run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct          run in instruction mode (use with Alpaca models)\n");
@@ -1554,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -mg i, --main-gpu i       the GPU to use for the model (with split-mode = none),\n");
     printf("                            or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
     }
+    printf("  --rpc SERVERS             comma separated list of RPC servers\n");
     printf("  --verbose-prompt          print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt       don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");
@@ -1827,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    mparams.rpc_servers  = params.rpc_servers.c_str();
     mparams.main_gpu     = params.main_gpu;
     mparams.split_mode   = params.split_mode;
     mparams.tensor_split = params.tensor_split;
@@ -2652,6 +2665,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
package/src/llama.cpp/common/common.h CHANGED
@@ -82,6 +82,7 @@ struct gpt_params {
     float yarn_beta_slow = 1.0f;   // YaRN high correction dim
     int32_t yarn_orig_ctx = 0;     // YaRN original context length
     float defrag_thold = -1.0f;    // KV cache defragmentation threshold
+    std::string rpc_servers = "";  // comma separated list of RPC servers
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -140,6 +141,7 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false;     // use color to distinguish generations and inputs
     bool interactive = false;   // interactive mode
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool conversation = false;  // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false;        // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
package/src/llama.cpp/common/grammar-parser.cpp CHANGED
@@ -142,6 +142,9 @@ namespace grammar_parser {
             pos++;
             last_sym_start = out_elements.size();
             while (*pos != '"') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
             }
             last_sym_start = out_elements.size();
             while (*pos != ']') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {
 
                 out_elements.push_back({type, char_pair.first});
                 if (pos[0] == '-' && pos[1] != ']') {
+                    if (!pos[1]) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
                     auto endchar_pair = parse_char(pos + 1);
                     pos = endchar_pair.second;
                     out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
package/src/llama.cpp/common/sampling.cpp CHANGED
@@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->prev.resize(params.n_prev);
 
-    result->n_considered = 0;
+    result->n_valid = 0;
 
     llama_sampling_set_rng_seed(result, params.seed);
 
@@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
-    ctx->n_considered = 0;
+    ctx->n_valid = 0;
 }
 
 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -256,7 +256,7 @@ static llama_token llama_sampling_sample_impl(
         }
     }
 
-    ctx_sampling->n_considered = cur_p.size;
+    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
 
     return id;
 }
package/src/llama.cpp/common/sampling.h CHANGED
@@ -81,7 +81,7 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token> prev;
     std::vector<llama_token_data> cur;
-    size_t n_considered;
+    size_t n_valid; // Number of correct top tokens with correct probabilities.
 
     std::mt19937 rng;
 };
package/src/llama.cpp/examples/CMakeLists.txt CHANGED
@@ -49,4 +49,7 @@ else()
         add_subdirectory(server)
     endif()
     add_subdirectory(export-lora)
+    if (LLAMA_RPC)
+        add_subdirectory(rpc)
+    endif()
 endif()
package/src/llama.cpp/examples/embedding/embedding.cpp CHANGED
@@ -49,6 +49,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + batch.seq_id[i][0] * n_embd;
+        //TODO: I would also add a parameter here to enable normalization or not.
+        /*fprintf(stdout, "unnormalized_embedding:");
+        for (int hh = 0; hh < n_embd; hh++) {
+            fprintf(stdout, "%9.6f ", embd[hh]);
+        }
+        fprintf(stdout, "\n");*/
         llama_embd_normalize(embd, out, n_embd);
     }
 }
@@ -123,10 +129,12 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // add SEP if not present
+    // check if the last token is SEP
+    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            inp.push_back(llama_token_sep(model));
+            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
 
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp CHANGED
@@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
     }
 }
 
+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
 struct cmd_params {
     std::vector<std::string> model;
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
+    std::vector<std::pair<int, int>> n_pg;
     std::vector<int> n_batch;
     std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
@@ -188,6 +195,7 @@ static const cmd_params cmd_params_defaults = {
    /* model    */ {"models/7B/ggml-model-q4_0.gguf"},
    /* n_prompt */ {512},
    /* n_gen    */ {128},
+   /* n_pg     */ {{512, 128}},
    /* n_batch  */ {2048},
    /* n_ubatch */ {512},
    /* type_k   */ {GGML_TYPE_F16},
@@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -m, --model <filename>        (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf("  -p, --n-prompt <n>            (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n>               (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf("  -pg <pp,tg>                   (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
     printf("  -b, --batch-size <n>          (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ub N, --ubatch-size <n>      (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf("  -ctk <t>, --cache-type-k <t>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv <t>, --cache-type-v <t>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -ub, --ubatch-size <n>        (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf("  -ctk, --cache-type-k <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv, --cache-type-v <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf("  -t, --threads <n>             (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>      (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-pg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], ',');
+            if (p.size() != 2) {
+                invalid_param = true;
+                break;
+            }
+            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
@@ -493,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.model.empty())    { params.model = cmd_params_defaults.model; }
     if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty())    { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_pg.empty())     { params.n_pg = cmd_params_defaults.n_pg; }
     if (params.n_batch.empty())  { params.n_batch = cmd_params_defaults.n_batch; }
     if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
     if (params.type_k.empty())   { params.type_k = cmd_params_defaults.type_k; }
@@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
        };
        instances.push_back(instance);
    }
+
+    for (const auto & n_pg : params.n_pg) {
+        if (n_pg.first == 0 && n_pg.second == 0) {
+            continue;
+        }
+        cmd_params_instance instance = {
+            /* .model        = */ m,
+            /* .n_prompt     = */ n_pg.first,
+            /* .n_gen        = */ n_pg.second,
+            /* .n_batch      = */ nb,
+            /* .n_ubatch     = */ nub,
+            /* .type_k       = */ tk,
+            /* .type_v       = */ tv,
+            /* .n_threads    = */ nt,
+            /* .n_gpu_layers = */ nl,
+            /* .split_mode   = */ sm,
+            /* .main_gpu     = */ mg,
+            /* .no_kv_offload= */ nkvo,
+            /* .flash_attn   = */ fa,
+            /* .tensor_split = */ ts,
+            /* .use_mmap     = */ mmp,
+            /* .embeddings   = */ embd,
+        };
+        instances.push_back(instance);
+    }
    }
 
    return instances;
@@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
        if (field == "n_gpu_layers") {
            return 3;
        }
+        if (field == "test") {
+            return 13;
+        }
 
        int width = std::max((int)field.length(), 10);
 
@@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
            value = test::get_backend();
        } else if (field == "test") {
            if (t.n_prompt > 0 && t.n_gen == 0) {
-                snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+                snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
            } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+                snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
            } else {
-                assert(false);
-                exit(1);
+                snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
            }
            value = buf;
        } else if (field == "t/s") {
@@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
        llama_kv_cache_clear(ctx);
 
        uint64_t t_start = get_time_ns();
+
        if (t.n_prompt > 0) {
            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt RENAMED
@@ -37,7 +37,7 @@ FetchContent_MakeAvailable(llama)
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
     # List C/C++ source files with relative paths to this CMakeLists.txt.
-    llama-android.cpp)
+    llama-android.cpp)
 
 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,49 @@
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+include(FetchContent)
+FetchContent_Declare(
+        llama
+        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+        GIT_TAG master
+)
+
+# Also provides "common"
+FetchContent_MakeAvailable(llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+    # List C/C++ source files with relative paths to this CMakeLists.txt.
+    llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+    # List libraries link to the target library
+    llama
+    common
+    android
+    log)