@fugood/llama.node 0.0.1-alpha.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CMakeLists.txt +36 -7
  2. package/README.md +9 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/lib/binding.js +18 -1
  14. package/lib/binding.ts +22 -2
  15. package/lib/index.ts +2 -2
  16. package/package.json +15 -3
  17. package/src/LlamaCompletionWorker.cpp +5 -1
  18. package/src/LlamaCompletionWorker.h +4 -0
  19. package/src/LlamaContext.cpp +18 -1
  20. package/src/common.hpp +11 -7
  21. package/src/llama.cpp/CMakeLists.txt +13 -7
  22. package/src/llama.cpp/common/common.cpp +221 -173
  23. package/src/llama.cpp/common/common.h +19 -8
  24. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  25. package/src/llama.cpp/common/log.h +2 -2
  26. package/src/llama.cpp/common/sampling.cpp +17 -1
  27. package/src/llama.cpp/common/sampling.h +28 -20
  28. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  29. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  30. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  31. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  32. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  33. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  34. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  35. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  36. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  37. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  38. package/src/llama.cpp/examples/main/main.cpp +10 -8
  39. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  40. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  41. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  42. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  43. package/src/llama.cpp/examples/server/server.cpp +97 -86
  44. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  45. package/src/llama.cpp/ggml-backend.c +7 -5
  46. package/src/llama.cpp/ggml-impl.h +339 -4
  47. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  48. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  49. package/src/llama.cpp/ggml-quants.c +302 -293
  50. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  51. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  52. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  53. package/src/llama.cpp/ggml.c +1469 -116
  54. package/src/llama.cpp/ggml.h +37 -7
  55. package/src/llama.cpp/llama.cpp +969 -432
  56. package/src/llama.cpp/llama.h +46 -14
  57. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  58. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  59. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  60. package/src/llama.cpp/requirements.txt +1 -0
  61. package/src/llama.cpp/sgemm.cpp +134 -103
  62. package/src/llama.cpp/sgemm.h +4 -2
  63. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  64. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  65. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  66. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  67. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  68. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  69. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  70. package/src/llama.cpp/unicode-data.cpp +1188 -656
  71. package/src/llama.cpp/unicode-data.h +4 -3
  72. package/src/llama.cpp/unicode.cpp +590 -49
  73. package/src/llama.cpp/unicode.h +6 -3
  74. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  75. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -174,9 +174,11 @@ struct cmd_params {
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
+    std::vector<bool> flash_attn;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
+    ggml_numa_strategy numa;
     int reps;
     bool verbose;
     output_formats output_format;
@@ -195,9 +197,11 @@ static const cmd_params cmd_params_defaults = {
     /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
     /* main_gpu */ {0},
     /* no_kv_offload */ {false},
+    /* flash_attn */ {false},
     /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap */ {true},
     /* embeddings */ {false},
+    /* numa */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps */ 5,
     /* verbose */ false,
     /* output_format */ MARKDOWN
@@ -220,7 +224,9 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
     printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
     printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@@ -393,6 +399,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
         }
         auto p = split<bool>(argv[i], split_delim);
         params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+    } else if (arg == "--numa") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        } else {
+            std::string value(argv[i]);
+            /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+            else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+            else { invalid_param = true; break; }
+        }
+    } else if (arg == "-fa" || arg == "--flash-attn") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        auto p = split<bool>(argv[i], split_delim);
+        params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
     } else if (arg == "-mmp" || arg == "--mmap") {
         if (++i >= argc) {
             invalid_param = true;
@@ -477,6 +501,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
     if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
+    if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
@@ -498,6 +523,7 @@ struct cmd_params_instance {
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
+    bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
     bool embeddings;
@@ -532,6 +558,7 @@ struct cmd_params_instance {
         cparams.type_k = type_k;
         cparams.type_v = type_v;
         cparams.offload_kqv = !no_kv_offload;
+        cparams.flash_attn = flash_attn;
         cparams.embeddings = embeddings;

         return cparams;
@@ -554,6 +581,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & nkvo : params.no_kv_offload)
+    for (const auto & fa : params.flash_attn)
     for (const auto & nt : params.n_threads) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
@@ -572,6 +600,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode = */ sm,
                 /* .main_gpu = */ mg,
                 /* .no_kv_offload= */ nkvo,
+                /* .flash_attn = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap = */ mmp,
                 /* .embeddings = */ embd,
@@ -596,6 +625,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode = */ sm,
                 /* .main_gpu = */ mg,
                 /* .no_kv_offload= */ nkvo,
+                /* .flash_attn = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap = */ mmp,
                 /* .embeddings = */ embd,
@@ -633,6 +663,7 @@ struct test {
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
+    bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
     bool embeddings;
@@ -657,6 +688,7 @@ struct test {
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
        no_kv_offload = inst.no_kv_offload;
+       flash_attn = inst.flash_attn;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        embeddings = inst.embeddings;
@@ -731,7 +763,7 @@ struct test {
         "n_batch", "n_ubatch",
         "n_threads", "type_k", "type_v",
         "n_gpu_layers", "split_mode",
-        "main_gpu", "no_kv_offload",
+        "main_gpu", "no_kv_offload", "flash_attn",
         "tensor_split", "use_mmap", "embeddings",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",
@@ -753,7 +785,7 @@ struct test {
         }
         if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "use_mmap" || field == "embeddings") {
+            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -787,7 +819,7 @@ struct test {
            std::to_string(n_batch), std::to_string(n_ubatch),
            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
-           std::to_string(main_gpu), std::to_string(no_kv_offload),
+           std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -955,6 +987,9 @@ struct markdown_printer : public printer {
         if (field == "no_kv_offload") {
             return "nkvo";
         }
+        if (field == "flash_attn") {
+            return "fa";
+        }
         if (field == "use_mmap") {
             return "mmap";
         }
@@ -1001,6 +1036,9 @@ struct markdown_printer : public printer {
         if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
             fields.emplace_back("no_kv_offload");
         }
+        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
+            fields.emplace_back("flash_attn");
+        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.emplace_back("tensor_split");
         }
@@ -1191,6 +1229,7 @@ int main(int argc, char ** argv) {
         llama_log_set(llama_null_log_callback, NULL);
     }
     llama_backend_init();
+    llama_numa_init(params.numa);

     // initialize printer
     std::unique_ptr<printer> p;
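
The llama-bench changes above add two new sweep dimensions: -fa/--flash-attn, which sets the new flash_attn field on the context parameters, and --numa, which selects a ggml_numa_strategy passed to llama_numa_init. For orientation only, here is a minimal sketch of driving the same two settings through the C API of the vendored llama.cpp; the flash_attn field, llama_numa_init, and GGML_NUMA_STRATEGY_DISTRIBUTE are taken from the hunks above, while the surrounding program is illustrative and not part of this diff:

// Illustrative sketch only -- not part of the packaged diff.
// Shows the new flash_attn context flag and the NUMA init call that llama-bench now uses.
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }

    llama_backend_init();
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // same call llama-bench now makes with params.numa

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // new context parameter exercised by -fa/--flash-attn
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... evaluate prompts / measure throughput here ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}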
package/src/llama.cpp/examples/llava/clip.cpp

@@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
 #define TN_PATCH_EMBD "v.patch_embd.weight"
+#define TN_PATCH_BIAS "v.patch_embd.bias"
 #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@@ -425,6 +426,7 @@ struct clip_vision_model {
     // embeddings
     struct ggml_tensor * class_embedding;
     struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;

     struct ggml_tensor * pre_ln_w;
@@ -501,6 +503,11 @@ struct clip_ctx {
     bool use_gelu = false;
     int32_t ftype = 1;

+    bool has_class_embedding = true;
+    bool has_pre_norm = true;
+    bool has_post_norm = false;
+    bool has_patch_bias = false;
+
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;

@@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions = num_patches + 1;
+    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
@@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

-    // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-    ggml_set_name(embeddings, "embeddings");
-    ggml_set_input(embeddings);
+    if (ctx->has_patch_bias) {
+        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }

-    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+    // concat class_embeddings and patch_embeddings
+    struct ggml_tensor * embeddings = inp;
+    if (ctx->has_class_embedding) {
+        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+        ggml_set_name(embeddings, "embeddings");
+        ggml_set_input(embeddings);
+        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+        embeddings = ggml_acc(ctx0, embeddings, inp,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    }

-    embeddings = ggml_acc(ctx0, embeddings, inp,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

     // pre-layernorm
-    {
+    if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");

@@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = cur;
     }

+    // post-layernorm
+    if (ctx->has_post_norm) {
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
+
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
+    }
+
     // llava projector
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1148,12 +1170,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

     }

+    try {
+        vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+        new_clip->has_class_embedding = true;
+    } catch (const std::exception& e) {
+        new_clip->has_class_embedding = false;
+    }
+
+    try {
+        vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+        vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+        new_clip->has_pre_norm = true;
+    } catch (std::exception & e) {
+        new_clip->has_pre_norm = false;
+    }
+
+    try {
+        vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
+        vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
+        new_clip->has_post_norm = true;
+    } catch (std::exception & e) {
+        new_clip->has_post_norm = false;
+    }
+
+    try {
+        vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
+        new_clip->has_patch_bias = true;
+    } catch (std::exception & e) {
+        new_clip->has_patch_bias = false;
+    }
+
     try {
         vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
-        vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
         vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-        vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
     } catch(const std::exception& e) {
         LOG_TEE("%s: failed to load vision model tensors\n", __func__);
     }
@@ -1325,7 +1374,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
 }

 // Linear interpolation between two points
-inline float lerp(float s, float e, float t) {
+inline float clip_lerp(float s, float e, float t) {
     return s + (e - s) * t;
 }
 // Bilinear resize function
@@ -1347,17 +1396,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
             float y_lerp = py - y_floor;

             for (int c = 0; c < 3; c++) {
-                float top = lerp(
+                float top = clip_lerp(
                     static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
                     static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
                     x_lerp
                 );
-                float bottom = lerp(
+                float bottom = clip_lerp(
                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
                     x_lerp
                 );
-                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
             }
         }
     }
@@ -1797,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int image_size = hparams.image_size;
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_positions = num_patches + 1;
+    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);

     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1825,12 +1874,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }

     {
-        struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
+        if (ctx->has_class_embedding) {
+            struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");

-        void* zero_mem = malloc(ggml_nbytes(embeddings));
-        memset(zero_mem, 0, ggml_nbytes(embeddings));
-        ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-        free(zero_mem);
+            void* zero_mem = malloc(ggml_nbytes(embeddings));
+            memset(zero_mem, 0, ggml_nbytes(embeddings));
+            ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+            free(zero_mem);
+        }
     }

     {
package/src/llama.cpp/examples/llava/llava-cli.cpp

@@ -113,11 +113,11 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
     LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

-static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
+static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {

     // load and preprocess the image
     llava_image_embed * embed = NULL;
@@ -133,9 +133,9 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
         if (!embed) {
-            LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str());
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
         }
     }
@@ -207,17 +207,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     printf("\n");
 }

-
-static struct llava_context * llava_init(gpt_params * params) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
+static struct llama_model * llava_init(gpt_params * params) {
     llama_backend_init();
     llama_numa_init(params->numa);

@@ -228,6 +218,19 @@ static struct llava_context * llava_init(gpt_params * params) {
         LOG_TEE("%s: error: unable to load model\n" , __func__);
         return NULL;
     }
+    return model;
+}
+
+static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+

     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -286,24 +289,30 @@ int main(int argc, char ** argv) {
         show_additional_info(argc, argv);
         return 1;
     }
-
-    auto ctx_llava = llava_init(&params);
-    if (ctx_llava == NULL) {
-        LOG_TEE("%s: error: failed to init llava\n", __func__);
+    auto model = llava_init(&params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
         return 1;
     }

-    auto image_embed = load_image(ctx_llava, &params);
-    if (!image_embed) {
-        return 1;
-    }
+    for (auto & image : params.image) {
+        auto ctx_llava = llava_init_context(&params, model);

-    // process the prompt
-    process_prompt(ctx_llava, image_embed, &params, params.prompt);
+        auto image_embed = load_image(ctx_llava, &params, image);
+        if (!image_embed) {
+            std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+            return 1;
+        }
+
+        // process the prompt
+        process_prompt(ctx_llava, image_embed, &params, params.prompt);

-    llama_print_timings(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama);
+        llava_image_embed_free(image_embed);
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+    llama_free_model(model);

-    llava_image_embed_free(image_embed);
-    llava_free(ctx_llava);
     return 0;
 }
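
The llava-cli changes above split startup into a one-time model load (llava_init) and a per-image context (llava_init_context), so several --image arguments can be processed in a single run. For orientation only, a rough sketch of the per-image embedding pattern using the public llava.h helpers that appear in the diff; the wrapper function and its parameters are illustrative and not part of this diff:

// Illustrative sketch only -- not part of the packaged diff.
// One llava_image_embed per image file, freed once the prompt has been processed.
#include <cstdio>
#include <string>
#include <vector>
#include "clip.h"
#include "llava.h"

static void embed_images(clip_ctx * ctx_clip, int n_threads, const std::vector<std::string> & images) {
    for (const std::string & image : images) {
        llava_image_embed * embed =
            llava_image_embed_make_with_filename(ctx_clip, n_threads, image.c_str());
        if (embed == NULL) {
            fprintf(stderr, "is %s really an image file?\n", image.c_str());
            return;
        }
        // ... feed the embedding and the text prompt to the llama context here ...
        llava_image_embed_free(embed);
    }
}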
package/src/llama.cpp/examples/lookup/lookup-stats.cpp

@@ -30,7 +30,6 @@ int main(int argc, char ** argv){

     // load the model
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
     GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

     // tokenize the prompt
package/src/llama.cpp/examples/lookup/lookup.cpp

@@ -38,7 +38,6 @@ int main(int argc, char ** argv){

     // load the model
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
     GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

     // tokenize the prompt
package/src/llama.cpp/examples/main/main.cpp

@@ -240,7 +240,6 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
-            llama_set_rng_seed(ctx, params.seed);
             LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
         }
     }
@@ -325,7 +324,7 @@ int main(int argc, char ** argv) {
             log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());

     // if we will use the cache for the full prompt without reaching the end of the cache, force
-    // reevaluation of the last token token to recalculate the cached logits
+    // reevaluation of the last token to recalculate the cached logits
     if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
         LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);

@@ -363,6 +362,9 @@ int main(int argc, char ** argv) {
         params.interactive_first = true;
        params.antiprompt.emplace_back("<|im_start|>user\n");
     }
+    else if (params.conversation) {
+        params.interactive_first = true;
+    }

     // enable interactive mode if interactive start is specified
     if (params.interactive_first) {
@@ -545,7 +547,7 @@ int main(int argc, char ** argv) {
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-           if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+           if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
                if (params.n_predict == -2) {
                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                    break;
@@ -734,7 +736,7 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo && display) {
            for (auto id : embd) {
-               const std::string token_str = llama_token_to_piece(ctx, id);
+               const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
@@ -797,7 +799,7 @@ int main(int argc, char ** argv) {

            // deal with end of generation tokens in interactive mode
            if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
-               LOG("found EOS token\n");
+               LOG("found an EOG token\n");

                if (params.interactive) {
                    if (!params.antiprompt.empty()) {
@@ -817,7 +819,7 @@ int main(int argc, char ** argv) {
            if (n_past > 0 && is_interacting) {
                LOG("waiting for user input\n");

-               if (params.instruct || params.chatml) {
+               if (params.conversation || params.instruct || params.chatml) {
                    printf("\n> ");
                }

@@ -827,7 +829,7 @@ int main(int argc, char ** argv) {
                }

                std::string buffer;
-               if (!params.input_prefix.empty()) {
+               if (!params.input_prefix.empty() && !params.conversation) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    printf("%s", params.input_prefix.c_str());
                }
@@ -851,7 +853,7 @@ int main(int argc, char ** argv) {
                // Entering a empty line lets the user pass control back
                if (buffer.length() > 1) {
                    // append input suffix if any
-                   if (!params.input_suffix.empty()) {
+                   if (!params.input_suffix.empty() && !params.conversation) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        printf("%s", params.input_suffix.c_str());
                    }