llama_cpp 0.2.1 → 0.2.2

ext/llama_cpp/src/ggml.h CHANGED
@@ -303,6 +303,7 @@ extern "C" {
  GGML_OP_STEP,
  GGML_OP_RELU,
  GGML_OP_GELU,
+ GGML_OP_GELU_QUICK,
  GGML_OP_SILU,
  GGML_OP_SILU_BACK,
  GGML_OP_NORM, // normalize
@@ -331,12 +332,15 @@ extern "C" {
  GGML_OP_ROPE_BACK,
  GGML_OP_ALIBI,
  GGML_OP_CLAMP,
- GGML_OP_CONV_1D_1S,
- GGML_OP_CONV_1D_2S,
+ GGML_OP_CONV_1D_S1_PH,
+ GGML_OP_CONV_1D_S2_PH,
+ GGML_OP_CONV_2D_SK_P0,

  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
  GGML_OP_FLASH_ATTN_BACK,
+ GGML_OP_WIN_PART,
+ GGML_OP_WIN_UNPART,

  GGML_OP_MAP_UNARY,
  GGML_OP_MAP_BINARY,
@@ -500,8 +504,9 @@ extern "C" {
  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

- GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
- GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);

  GGML_API struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,
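
A minimal usage sketch of the accessors declared above; the reporting function itself is illustrative, not part of ggml:

    #include <stdio.h>
    #include "ggml.h"

    // Print where a context's arena lives, how large it is, and the size of its
    // biggest tensor -- the same trio llama.cpp uses further down when
    // registering the "data" buffer with Metal.
    static void report_ctx_memory(const struct ggml_context * ctx) {
        void * buf        = ggml_get_mem_buffer(ctx);
        size_t size       = ggml_get_mem_size(ctx);
        size_t max_tensor = ggml_get_max_tensor_size(ctx);
        printf("buffer %p, %zu bytes total, largest tensor %zu bytes\n", buf, size, max_tensor);
    }
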
@@ -556,8 +561,8 @@ extern "C" {
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
- GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);

  //
  // operations on tensors with backpropagation
@@ -610,24 +615,47 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ GGML_API struct ggml_tensor * ggml_sub_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  GGML_API struct ggml_tensor * ggml_mul(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ GGML_API struct ggml_tensor * ggml_mul_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  GGML_API struct ggml_tensor * ggml_div(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ GGML_API struct ggml_tensor * ggml_div_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  GGML_API struct ggml_tensor * ggml_sqr(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_sqrt(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_log(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
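
For orientation, a hedged sketch of how the new _inplace variants differ from the existing ops when building a graph: the in-place form writes its result into the input tensor's buffer instead of allocating a new tensor in the context. The wrapper function below is illustrative only:

    #include "ggml.h"

    // Same value either way; the second op reuses y's storage instead of
    // allocating another result tensor in ctx.
    static struct ggml_tensor * squared_product(struct ggml_context * ctx,
                                                struct ggml_tensor * a,
                                                struct ggml_tensor * b) {
        struct ggml_tensor * y = ggml_mul(ctx, a, b);  // out-of-place: new tensor
        return ggml_sqr_inplace(ctx, y);               // in-place: result stored in y
    }
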
@@ -667,31 +695,67 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_abs_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_sgn(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_neg(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_neg_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_step(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_step_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_relu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_relu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // TODO: double-check this computation is correct
  GGML_API struct ggml_tensor * ggml_gelu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_silu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_silu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // a - x
  // b - dy
  GGML_API struct ggml_tensor * ggml_silu_back(
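
As background on the new ggml_gelu_quick declared above: the "quick" GELU is commonly defined as the sigmoid approximation x * sigmoid(1.702 * x). A reference-only sketch of that formula (ggml's own kernel may differ in constants or precision):

    #include <math.h>

    // Reference-only: the common "quick" GELU approximation, x * sigmoid(1.702 * x).
    static float gelu_quick_ref(float x) {
        return x / (1.0f + expf(-1.702f * x));
    }
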
@@ -705,10 +769,18 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_rms_norm(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // a - x
  // b - dy
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -998,16 +1070,55 @@ extern "C" {
  float min,
  float max);

- // padding = 1
+ // TODO: implement general-purpose convolutions
+ // GGML_API struct ggml_tensor * ggml_conv_1d(
+ // struct ggml_context * ctx,
+ // struct ggml_tensor * a,
+ // struct ggml_tensor * b,
+ // int s0
+ // int p0,
+ // int d0);
+ //
+ // GGML_API struct ggml_tensor * ggml_conv_2d(
+ // struct ggml_context * ctx,
+ // struct ggml_tensor * a,
+ // struct ggml_tensor * b,
+ // int s0,
+ // int s1,
+ // int p0,
+ // int p1,
+ // int d0,
+ // int d1);
+
+ // padding = half
  // TODO: we don't support extra parameters for now
  // that's why we are hard-coding the stride, padding, and dilation
  // not great ..
- GGML_API struct ggml_tensor * ggml_conv_1d_1s(
+ // example:
+ // a: 3 80 768 1
+ // b: 3000 80 1 1
+ // res: 3000 768 1 1
+ // used in whisper
+ GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b);

- GGML_API struct ggml_tensor * ggml_conv_1d_2s(
+ // used in whisper
+ GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // kernel size is a->ne[0] x a->ne[1]
+ // stride is equal to kernel size
+ // padding is zero
+ // example:
+ // a: 16 16 3 768
+ // b: 1024 1024 3 1
+ // res: 64 64 768 1
+ // used in sam
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b);
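
The example dimensions in the ggml_conv_2d_sk_p0 comment check out: with the stride equal to the kernel size and zero padding, each spatial output dimension is simply the input size divided by the kernel size. The helper below is illustrative, not part of ggml:

    // Illustrative only: output spatial size when stride == kernel size and padding == 0.
    static int conv_sk_p0_out(int input, int kernel) {
        return input / kernel;  // e.g. 1024 / 16 = 64, matching the "res: 64 64 768 1" example
    }
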
@@ -1035,6 +1146,26 @@ extern "C" {
  struct ggml_tensor * c0,
  struct ggml_tensor * c1);

+ // partition into non-overlapping windows with padding if needed
+ // example:
+ // a: 768 64 64 1
+ // w: 14
+ // res: 768 14 14 25
+ // used in sam
+ GGML_API struct ggml_tensor * ggml_win_part(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int w);
+
+ // reverse of ggml_win_part
+ // used in sam
+ GGML_API struct ggml_tensor * ggml_win_unpart(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int w0,
+ int h0,
+ int w);
+
  // Mapping operations
  typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
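
The ggml_win_part example above works out as follows: a 64x64 grid with window size w = 14 is padded up to 70x70, giving 5 x 5 = 25 windows of 14x14 each, hence the result shape 768 14 14 25. A small illustrative helper (not part of ggml) makes the padding arithmetic explicit:

    // Illustrative only: windows per spatial axis for ggml_win_part, padding the
    // input up to the next multiple of the window size w.
    static int win_count(int n, int w) {
        return (n + w - 1) / w;  // e.g. (64 + 13) / 14 = 5, so 5 * 5 = 25 windows total
    }
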
ext/llama_cpp/src/llama.cpp CHANGED
@@ -19,6 +19,11 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_K_QUANTS
+ #ifndef QK_K
+ #define QK_K 256
+ #endif
+ #endif

  #include <array>
  #include <ctime>
@@ -40,6 +45,10 @@
  #include <sstream>
  #include <numeric>

+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif
+
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16

@@ -882,6 +891,7 @@ static bool kv_cache_init(
  const int64_t n_elements = n_embd*n_mem;

  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+ cache.n = 0;

  struct ggml_init_params params;
  params.mem_size = cache.buf.size;
@@ -900,6 +910,7 @@ static bool kv_cache_init(
  ggml_set_name(cache.k, "cache_k");
  ggml_set_name(cache.v, "cache_v");

+ (void) n_gpu_layers;
  #ifdef GGML_USE_CUBLAS
  if (n_gpu_layers > n_layer + 1) {
  ggml_cuda_assign_buffers_no_scratch(cache.v);
@@ -914,21 +925,21 @@

  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
+ /*.seed =*/ -1,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ {0},
+ /*.progress_callback =*/ nullptr,
+ /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
- /*.seed =*/ -1,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
  /*.use_mlock =*/ false,
  /*.embedding =*/ false,
- /*.progress_callback =*/ nullptr,
- /*.progress_callback_user_data =*/ nullptr,
  };

  return result;
@@ -1249,7 +1260,7 @@ static void llama_model_load_internal(
  vram_scratch = n_batch * MB;
  ggml_cuda_set_scratch_size(vram_scratch);
  if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
  __func__, vram_scratch / MB);
  }
  }
@@ -1609,7 +1620,7 @@ static bool llama_eval_internal(
  model.layers[il].w1,
  cur);
  offload_func(cur);
- ggml_set_name(cur, "result_w2");
+ ggml_set_name(cur, "result_w1");

  // SILU activation
  cur = ggml_silu(ctx0, cur);
@@ -1646,15 +1657,11 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpL);
  offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_inpL");
-
- cur = ggml_rms_norm(ctx0, cur);
- offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_after");
+ ggml_set_name(cur, "rms_norm_2");

  // cur = cur*norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.norm);
- offload_func_nr(cur);
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
  ggml_set_name(cur, "result_norm");

  embeddings = cur;
@@ -2485,8 +2492,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
+ if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+ quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
+ fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+ fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+ fprintf(stderr, "========================================================================================\n\n");
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
  if (tensor.name == "output.weight") {
- new_type = GGML_TYPE_Q6_K;
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K == 0 && ny % QK_K == 0) {
+ new_type = GGML_TYPE_Q6_K;
+ }
  } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
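
A hedged restatement of the shape rule the new code enforces: with QK_K = 256, a tensor's first two dimensions (ne[0] and ne[1]) must both be multiples of 256 before a k-quant type is applied, and the Q6_K upgrade for output.weight now happens only when that holds. The standalone check below is illustrative, not part of llama.cpp:

    #include <stdbool.h>

    // Illustrative only: the divisibility condition for k-quants (QK_K == 256).
    // A 4096 x 32000 output.weight passes (both are multiples of 256); a
    // 4096 x 32001 tensor would hit the error path shown above instead.
    static bool k_quant_shape_ok(int nx, int ny, int qk_k) {
        return (nx % qk_k == 0) && (ny % qk_k == 0);
    }
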
@@ -2690,16 +2712,21 @@ struct llama_context * llama_init_from_file(
  // this allocates all Metal resources and memory buffers
  ctx->ctx_metal = ggml_metal_init();

- void *data_ptr = NULL;
+ void * data_ptr = NULL;
  size_t data_size = 0;
+
  if (params.use_mmap) {
- data_ptr = ctx->model.mapping->addr;
- data_size= ctx->model.mapping->size;
+ data_ptr = ctx->model.mapping->addr;
+ data_size = ctx->model.mapping->size;
  } else {
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
- data_size= ggml_get_mem_size(ctx->model.ctx);
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size = ggml_get_mem_size (ctx->model.ctx);
  }

+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+ printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
  fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2707,12 +2734,13 @@ struct llama_context * llama_init_from_file(
  return NULL; \
  }

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
  #undef LLAMA_METAL_CHECK_BUF
  }
  #endif
@@ -3098,9 +3126,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  if (kv_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);

- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;

@@ -3206,9 +3232,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  const size_t elt_size = ggml_element_size(kv_self.k);

- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;

@@ -3443,9 +3467,12 @@ void llama_print_timings(struct llama_context * ctx) {

  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }

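
Worked through with made-up numbers to show what the new columns report: if ctx->t_eval_us were 2,000,000 µs over n_eval = 100 runs, the eval line would print 20.00 ms per token and 50.00 tokens per second. The snippet below just reproduces that arithmetic with those hypothetical values:

    #include <stdio.h>

    int main(void) {
        const double t_eval_us = 2000000.0;  // hypothetical total eval time in microseconds
        const int    n_eval    = 100;        // hypothetical number of eval runs
        printf("%8.2f ms per token, %8.2f tokens per second\n",
               1e-3 * t_eval_us / n_eval,    // 20.00
               1e6 / t_eval_us * n_eval);    // 50.00
        return 0;
    }
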
ext/llama_cpp/src/llama.h CHANGED
@@ -71,28 +71,27 @@ extern "C" {

  typedef void (*llama_progress_callback)(float progress, void *ctx);

- struct llama_context_params {
+ struct llama_context_params {
+ int seed; // RNG seed, -1 for random
  int n_ctx; // text context
  int n_batch; // prompt processing batch size
  int n_gpu_layers; // number of layers to store in VRAM
  int main_gpu; // the GPU that is used for scratch and small tensors
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
- bool low_vram; // if true, reduce VRAM usage at the cost of performance
- int seed; // RNG seed, -1 for random
+ // called with a progress value between 0 and 1, pass NULL to disable
+ llama_progress_callback progress_callback;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;

+ // Keep the booleans together to avoid misalignment during copy-by-value.
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance
  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
  bool vocab_only; // only load the vocabulary, no weights
  bool use_mmap; // use mmap if possible
  bool use_mlock; // force system to keep model in RAM
  bool embedding; // embedding mode only
-
- // called with a progress value between 0 and 1, pass NULL to disable
- llama_progress_callback progress_callback;
- // context pointer passed to the progress callback
- void * progress_callback_user_data;
  };
-
  // model file types
  enum llama_ftype {
  LLAMA_FTYPE_ALL_F32 = 0,
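
A hedged usage sketch of the reordered llama_context_params: code that starts from llama_context_default_params() and assigns fields by name is unaffected by the field reordering; only brace-initializers that relied on field position would need updating. The model path and the chosen values below are placeholders:

    #include "llama.h"

    // Open a model with a few explicit overrides; every field not set here keeps its default.
    static struct llama_context * open_model(const char * model_path /* placeholder */) {
        struct llama_context_params params = llama_context_default_params();
        params.seed     = 1234;   // RNG seed, -1 for random
        params.n_ctx    = 2048;   // text context size
        params.use_mmap = true;   // map the model file rather than reading it
        return llama_init_from_file(model_path, params);
    }
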
lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.2.1'
+ VERSION = '0.2.2'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-a09f919'
+ LLAMA_CPP_VERSION = 'master-7487137'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.2.1
+ version: 0.2.2
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-17 00:00:00.000000000 Z
+ date: 2023-06-23 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -24,6 +24,7 @@ files:
  - README.md
  - examples/README.md
  - examples/chat.rb
+ - examples/embedding.rb
  - ext/llama_cpp/extconf.rb
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h