llama_cpp 0.0.2 → 0.0.3

@@ -258,11 +258,11 @@ struct ggml_tensor {
     enum ggml_type type;
 
     int n_dims;
-    int     ne[GGML_MAX_DIMS]; // number of elements
-    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                               // nb[0] = sizeof(type)
-                               // nb[1] = nb[0] * ne[0] + padding
-                               // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0] * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]
 
     // compute data
     enum ggml_op op;
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void   ggml_print_object (const struct ggml_object * obj);
 void   ggml_print_objects(const struct ggml_context * ctx);
 
-int     ggml_nelements(const struct ggml_tensor * tensor);
-size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
 
 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
 
 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -531,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);
 
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
 
 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
+        int64_t ne0,
         size_t offset);
 
 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1, // row stride in bytes
         size_t offset);
 
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t  nb1, // row stride in bytes
+        size_t  nb2, // slice stride in bytes
+        size_t  offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -256,8 +256,8 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
@@ -679,7 +679,7 @@ static bool llama_model_load(
         return false;
     }
     if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                 __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
         return false;
     }
@@ -810,37 +810,35 @@ static bool llama_eval_internal(
 
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
 
             // store key and value to memory
-            if (N >= 1) {
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
 
+                // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }
 
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);
 
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
 
             // K * Q
@@ -858,18 +856,23 @@ static bool llama_eval_internal(
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
 
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -955,9 +958,13 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);
 
+    // print timing information per ggml operation (for debugging purposes)
+    // requires GGML_PERF to be defined
+    //ggml_graph_print(&gf);
+
+    // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}
 
     //embd_w.resize(n_vocab*N);
@@ -1194,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
     const auto & logits = lctx.logits;
     const auto * plogits = logits.data() + logits.size() - n_logits;
 
+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
     std::vector<std::pair<float, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);
 
@@ -1215,17 +1236,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }
 
-    sample_top_k(logits_id, top_k);
-
-    float maxl = -std::numeric_limits<float>::infinity();
-    for (const auto & kv : logits_id) {
-        maxl = Max(maxl, kv.first);
-    }
+    sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
 
     // compute probs for the top k tokens
     std::vector<float> probs;
     probs.reserve(logits_id.size());
 
+    float maxl = logits_id[0].first;
     double sum = 0.0;
     for (const auto & kv : logits_id) {
         const float p = expf(kv.first - maxl);
@@ -1248,16 +1265,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
                 break;
             }
         }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
     }
 
     //printf("\n");
     //for (int i = 0; i < (int) 10; i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //    printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
     //}
     //printf("\n\n");
     //exit(0);
@@ -1608,7 +1620,7 @@ struct llama_context * llama_init_from_file(
     }
 
     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -1668,6 +1680,33 @@ int llama_model_quantize(
     return 0;
 }
 
+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+        const uint8_t * kv_cache,
+        size_t n_size,
+        int n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
         const llama_token * tokens,
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
             int itype);
 
+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+            const uint8_t * kv_cache,
+            size_t n_size,
+            int n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.2'
+  VERSION = '0.0.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-5b70e7d'
+  LLAMA_CPP_VERSION = 'master-698f7b5'
 end
data/lib/llama_cpp.rb CHANGED
@@ -5,13 +5,16 @@ require_relative 'llama_cpp/llama_cpp'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
+  # Class alias to match interface of whispercpp gem.
+  Params = ContextParams
+
   module_function
 
   # Generates sentences following the given prompt for operation check.
   #
   # @param context [LLaMACpp::Context]
   # @param prompt [String]
-  # @parma n_threads [Integer]
+  # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
     prompt.insert(0, ' ')
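
For reference, a minimal usage sketch of the new Params alias together with the generate helper shown above; the model path is a placeholder and it assumes ContextParams.new takes no arguments:

    require 'llama_cpp'

    params = LLaMACpp::Params.new # alias of ContextParams introduced in this release
    params.n_ctx = 512
    params.seed = 123

    context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model.bin', params: params)
    puts LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)
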
data/sig/llama_cpp.rbs ADDED
@@ -0,0 +1,52 @@
+module LLaMACpp
+  VERSION: String
+  LLAMA_CPP_VERSION: String
+  LLAMA_FILE_VERSION: String
+  LLAMA_FILE_MAGIC: String
+  LLAMA_FILE_MAGIC_UNVERSIONED: String
+
+  def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
+  def self?.print_system_info: () -> void
+  def self?.token_bos: () -> Integer
+  def self?.token_eos: () -> Integer
+
+  class Context
+    public
+
+    def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+    def embeddings: () -> Array[Float]
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+    def logits: () -> Array[Float]
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def n_vocab: () -> Integer
+    def print_timings: () -> void
+    def reset_timings: () -> void
+    def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+  end
+
+  class ContextParams
+    public
+
+    def embedding: () -> bool
+    def embedding=: (bool) -> bool
+    def f16_kv: () -> bool
+    def f16_kv=: (bool) -> bool
+    def logits_all: () -> bool
+    def logits_all=: (bool) -> bool
+    def n_ctx: () -> Integer
+    def n_ctx=: (Integer) -> Integer
+    def n_parts: () -> Integer
+    def n_parts=: (Integer) -> Integer
+    def seed: () -> Integer
+    def seed=: (Integer) -> Integer
+    def use_mlock: () -> bool
+    def use_mlock=: (bool) -> bool
+    def vocab_only: () -> bool
+    def vocab_only=: (bool) -> bool
+  end
+
+  class Params = ContextParams
+end
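
The signature file above also declares the lower-level Context API. A short sketch of how those methods fit together, based only on the declared signatures (model path and sampling values are placeholders):

    require 'llama_cpp'

    params  = LLaMACpp::ContextParams.new
    context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model.bin', params: params)

    # Tokenize a prompt, evaluate it, then sample one next token.
    tokens = context.tokenize(text: ' Hello', add_bos: true)
    context.eval(tokens: tokens, n_past: 0, n_threads: 4)
    id = context.sample_top_p_top_k(top_k: 40, top_p: 0.95, temp: 0.8, penalty: 1.1)
    print context.token_to_str(id)
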
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-02 00:00:00.000000000 Z
+date: 2023-04-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -32,6 +32,7 @@ files:
 - ext/llama_cpp/src/llama.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
+- sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb
 licenses:
 - MIT