llama_cpp 0.2.1 → 0.2.2

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
ext/llama_cpp/src/ggml.h CHANGED
@@ -303,6 +303,7 @@ extern "C" {
     GGML_OP_STEP,
     GGML_OP_RELU,
     GGML_OP_GELU,
+    GGML_OP_GELU_QUICK,
     GGML_OP_SILU,
     GGML_OP_SILU_BACK,
     GGML_OP_NORM, // normalize
@@ -331,12 +332,15 @@ extern "C" {
     GGML_OP_ROPE_BACK,
     GGML_OP_ALIBI,
     GGML_OP_CLAMP,
-    GGML_OP_CONV_1D_1S,
-    GGML_OP_CONV_1D_2S,
+    GGML_OP_CONV_1D_S1_PH,
+    GGML_OP_CONV_1D_S2_PH,
+    GGML_OP_CONV_2D_SK_P0,
 
     GGML_OP_FLASH_ATTN,
     GGML_OP_FLASH_FF,
     GGML_OP_FLASH_ATTN_BACK,
+    GGML_OP_WIN_PART,
+    GGML_OP_WIN_UNPART,
 
     GGML_OP_MAP_UNARY,
     GGML_OP_MAP_BINARY,
@@ -500,8 +504,9 @@ extern "C" {
     GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
-    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
-    GGML_API size_t ggml_get_mem_size  (struct ggml_context * ctx);
+    GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_mem_size       (const struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
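The existing getters are now const-qualified, and the new ggml_get_max_tensor_size appears to report the size in bytes of the largest single tensor allocated in a context; later in this diff it is used to size the Metal "data" buffer. A minimal usage sketch (the arena size and tensor are arbitrary, not from the diff):

    #include <stdio.h>
    #include "ggml.h"

    static void inspect_context(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024, // arbitrary 16 MB arena for this sketch
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024); // largest (and only) tensor

        printf("buffer=%p size=%zu max_tensor=%zu\n",
               ggml_get_mem_buffer(ctx),
               ggml_get_mem_size(ctx),
               ggml_get_max_tensor_size(ctx)); // 1024 * sizeof(float) = 4096 bytes
        ggml_free(ctx);
    }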
@@ -556,8 +561,8 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
-    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API const char *         ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
 
     //
     // operations on tensors with backpropagation
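ggml_set_name now returns the tensor it names instead of void, so creation and naming can be chained in one expression. A small sketch (ctx is an assumed, already-initialized ggml_context):

    // name the tensor at creation time in a single expression
    struct ggml_tensor * cur =
            ggml_set_name(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64), "embeddings");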
@@ -610,24 +615,47 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_sub_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_mul(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_mul_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_div(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_div_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_sqr(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_sqr_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_sqrt(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_sqrt_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_log(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
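Each arithmetic op gains an _inplace twin here. Following the usual ggml convention, the in-place form should write its result into a view of the first operand instead of allocating fresh storage in the context arena, which keeps graph memory flat in tight loops; a sketch of the difference (a and b are assumed existing tensors):

    // out-of-place: the result tensor gets its own storage in the arena
    struct ggml_tensor * y = ggml_mul(ctx, a, b);

    // in-place: the result is a view over a's data, so a's previous
    // contents are overwritten when the graph is computed
    struct ggml_tensor * z = ggml_mul_inplace(ctx, a, b);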
@@ -667,31 +695,67 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_abs_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_sgn(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_sgn_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_neg(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_neg_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_step(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_step_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_relu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // TODO: double-check this computation is correct
     GGML_API struct ggml_tensor * ggml_gelu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_gelu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_silu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_silu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_silu_back(
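Besides _inplace variants for every unary op, this hunk introduces ggml_gelu_quick; in upstream ggml this is the sigmoid-based GELU approximation, roughly x * sigmoid(1.702 * x), which avoids the tanh of the approximation used by ggml_gelu. A sketch (x is an assumed input tensor):

    struct ggml_tensor * g1 = ggml_gelu      (ctx, x); // tanh-based approximation
    struct ggml_tensor * g2 = ggml_gelu_quick(ctx, x); // ~ x * sigmoid(1.702 * x)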
@@ -705,10 +769,18 @@
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -998,16 +1070,55 @@
             float                 min,
             float                 max);
 
-    // padding = 1
+    // TODO: implement general-purpose convolutions
+    // GGML_API struct ggml_tensor * ggml_conv_1d(
+    //        struct ggml_context * ctx,
+    //        struct ggml_tensor * a,
+    //        struct ggml_tensor * b,
+    //        int s0
+    //        int p0,
+    //        int d0);
+    //
+    // GGML_API struct ggml_tensor * ggml_conv_2d(
+    //        struct ggml_context * ctx,
+    //        struct ggml_tensor * a,
+    //        struct ggml_tensor * b,
+    //        int s0,
+    //        int s1,
+    //        int p0,
+    //        int p1,
+    //        int d0,
+    //        int d1);
+
+    // padding = half
     // TODO: we don't support extra parameters for now
     //       that's why we are hard-coding the stride, padding, and dilation
     //       not great ..
-    GGML_API struct ggml_tensor * ggml_conv_1d_1s(
+    // example:
+    // a:      3   80  768    1
+    // b:   3000   80    1    1
+    // res: 3000  768    1    1
+    // used in whisper
+    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    GGML_API struct ggml_tensor * ggml_conv_1d_2s(
+    // used in whisper
+    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16   16    3  768
+    // b:   1024 1024    3    1
+    // res:   64   64  768    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
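The output shapes in the comments follow from the hard-coded stride and padding. For ggml_conv_2d_sk_p0, stride equals the kernel size and padding is zero, so each spatial dimension is simply divided by the kernel: 1024 / 16 = 64 per side in the SAM example. A hypothetical call with those shapes (the tensor types are my assumption; the header does not state them):

    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,   16,   16, 3, 768); // kernel
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1024, 1024, 3,   1); // image
    struct ggml_tensor * r = ggml_conv_2d_sk_p0(ctx, a, b); // ne = { 1024/16, 1024/16, 768, 1 }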
@@ -1035,6 +1146,26 @@
             struct ggml_tensor * c0,
             struct ggml_tensor * c1);
 
+    // partition into non-overlapping windows with padding if needed
+    // example:
+    // a:   768   64   64    1
+    // w:    14
+    // res: 768   14   14   25
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_win_part(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int                   w);
+
+    // reverse of ggml_win_part
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_win_unpart(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int                   w0,
+            int                   h0,
+            int                   w);
+
     // Mapping operations
     typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
     typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
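The 25 in the ggml_win_part example is the window count: 64 is padded up to 70, the next multiple of w = 14, giving (70/14)^2 = 5 * 5 = 25 windows. ggml_win_unpart takes the original width and height so it can trim that padding again; a sketch reusing the shapes from the comments:

    // a: 768 x 64 x 64 x 1, as in the comment above
    struct ggml_tensor * part   = ggml_win_part  (ctx, a, 14);            // 768 x 14 x 14 x 25
    struct ggml_tensor * merged = ggml_win_unpart(ctx, part, 64, 64, 14); // 768 x 64 x 64 x 1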
ext/llama_cpp/src/llama.cpp CHANGED
@@ -19,6 +19,11 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_K_QUANTS
+#ifndef QK_K
+#define QK_K 256
+#endif
+#endif
 
 #include <array>
 #include <ctime>
@@ -40,6 +45,10 @@
 #include <sstream>
 #include <numeric>
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -882,6 +891,7 @@ static bool kv_cache_init(
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.n = 0;
 
     struct ggml_init_params params;
     params.mem_size   = cache.buf.size;
@@ -900,6 +910,7 @@ static bool kv_cache_init(
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");
 
+    (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
     if (n_gpu_layers > n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
@@ -914,21 +925,21 @@ static bool kv_cache_init(
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
+        /*.seed                        =*/ -1,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
-        /*.seed                        =*/ -1,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.embedding                   =*/ false,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
     };
 
     return result;
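Because seed and the progress-callback fields move within the struct, any caller that filled llama_context_params positionally (as this initializer does) would silently misassign values if left unchanged; starting from llama_context_default_params and overriding fields by name sidesteps the hazard. A sketch with hypothetical values and model path:

    struct llama_context_params params = llama_context_default_params();
    params.seed  = 1234; // now the first field rather than sitting among the bools
    params.n_ctx = 2048;
    struct llama_context * lctx = llama_init_from_file("model.bin", params);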
@@ -1249,7 +1260,7 @@ static void llama_model_load_internal(
             vram_scratch = n_batch * MB;
             ggml_cuda_set_scratch_size(vram_scratch);
             if (n_gpu_layers > 0) {
-                fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
                         __func__, vram_scratch / MB);
             }
         }
@@ -1609,7 +1620,7 @@ static bool llama_eval_internal(
                         model.layers[il].w1,
                         cur);
                 offload_func(cur);
-                ggml_set_name(cur, "result_w2");
+                ggml_set_name(cur, "result_w1");
 
                 // SILU activation
                 cur = ggml_silu(ctx0, cur);
@@ -1646,15 +1657,11 @@
         {
             cur = ggml_rms_norm(ctx0, inpL);
             offload_func_nr(cur);
-            ggml_set_name(cur, "rms_norm_inpL");
-
-            cur = ggml_rms_norm(ctx0, cur);
-            offload_func_nr(cur);
-            ggml_set_name(cur, "rms_norm_after");
+            ggml_set_name(cur, "rms_norm_2");
 
             // cur = cur*norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.norm);
-            offload_func_nr(cur);
+            // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
             ggml_set_name(cur, "result_norm");
 
             embeddings = cur;
@@ -2485,8 +2492,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+                    fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+                    fprintf(stderr, "========================================================================================\n\n");
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
             if (tensor.name == "output.weight") {
-                new_type = GGML_TYPE_Q6_K;
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K == 0 && ny % QK_K == 0) {
+                    new_type = GGML_TYPE_Q6_K;
+                }
             } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
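The first new check rejects a k-quant target type outright when a tensor's leading dimensions are not multiples of QK_K, the 256-wide super-block defined earlier in this diff; the second keeps the Q6_K override for output.weight only when its shape qualifies. A worked check under assumed LLaMA-7B shapes (not stated in the diff): output.weight is 4096 x 32000, and 4096 % 256 == 0 and 32000 % 256 == 0, so Q6_K still applies, while a model whose embedding or vocabulary size is not a multiple of 256 now fails fast instead of producing a broken file.

    // mirror of the shape test, with hypothetical 7B dimensions
    int nx = 4096, ny = 32000;
    int ok = (nx % 256 == 0) && (ny % 256 == 0); // 1 here, so k-quants are usable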
@@ -2690,16 +2712,21 @@ struct llama_context * llama_init_from_file(
         // this allocates all Metal resources and memory buffers
         ctx->ctx_metal = ggml_metal_init();
 
-        void *data_ptr = NULL;
+        void * data_ptr  = NULL;
         size_t data_size = 0;
+
         if (params.use_mmap) {
-            data_ptr = ctx->model.mapping->addr;
-            data_size= ctx->model.mapping->size;
+            data_ptr  = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
         } else {
-            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
-            data_size= ggml_get_mem_size(ctx->model.ctx);
+            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size  (ctx->model.ctx);
         }
 
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+        printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
         fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2707,12 +2734,13 @@ struct llama_context * llama_init_from_file(
         return NULL; \
     }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
@@ -3098,9 +3126,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
     if (kv_size) {
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-        char buffer[4096];
-
-        ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
         gf.n_threads = 1;
 
@@ -3206,9 +3232,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     const size_t elt_size = ggml_element_size(kv_self.k);
 
-    char buffer[4096];
-
-    ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+    ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
     ggml_cgraph gf{};
     gf.n_threads = 1;
 
@@ -3443,9 +3467,12 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
-    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
 
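The appended throughput figure is just the reciprocal of the per-token latency: with, say, t_eval_us = 2,000,000 and n_eval = 100 (hypothetical numbers), the line reports 1e-3 * 2000000 / 100 = 20 ms per token and 1e6 / 2000000 * 100 = 50 tokens per second.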
ext/llama_cpp/src/llama.h CHANGED
@@ -71,28 +71,27 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    struct llama_context_params {
+    struct llama_context_params {
+        int seed;                              // RNG seed, -1 for random
         int n_ctx;                             // text context
         int n_batch;                           // prompt processing batch size
         int n_gpu_layers;                      // number of layers to store in VRAM
         int main_gpu;                          // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-        bool low_vram;                         // if true, reduce VRAM usage at the cost of performance
-        int seed;                              // RNG seed, -1 for random
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
 
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
        bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
     };
-
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.2.1'
+  VERSION = '0.2.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-a09f919'
+  LLAMA_CPP_VERSION = 'master-7487137'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-17 00:00:00.000000000 Z
+date: 2023-06-23 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -24,6 +24,7 @@ files:
 - README.md
 - examples/README.md
 - examples/chat.rb
+- examples/embedding.rb
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h