llama_cpp 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -258,11 +258,11 @@ struct ggml_tensor {
     enum ggml_type type;
 
     int     n_dims;
-    int     ne[GGML_MAX_DIMS]; // number of elements
-    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                               // nb[0] = sizeof(type)
-                               // nb[1] = nb[0]   * ne[0] + padding
-                               // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]
 
     // compute data
     enum ggml_op op;
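
Note (not part of the diff): the widened element-count type matters because a tensor's total element count is the product of its ne entries, which can exceed the 32-bit range. A minimal sketch of how the nb strides follow from ne exactly as the comments above describe, assuming GGML_MAX_DIMS is 4 and ignoring the padding term:

    #include <stdint.h>
    #include <stddef.h>

    // illustrative only: derive byte strides from element counts as in the struct comments
    static void compute_strides(size_t type_size, const int64_t ne[4], size_t nb[4]) {
        nb[0] = type_size;                            // nb[0] = sizeof(type)
        nb[1] = nb[0] * (size_t) ne[0];               // nb[1] = nb[0] * ne[0] (+ padding, ignored here)
        for (int i = 2; i < 4; ++i) {
            nb[i] = nb[i - 1] * (size_t) ne[i - 1];   // nb[i] = nb[i-1] * ne[i-1]
        }
    }
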
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);
 
-int     ggml_nelements(const struct ggml_tensor * tensor);
-size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
 
 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
 
 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
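
For illustration only (not from the diff): with the int64_t signatures, ggml_nelements can report counts past the 32-bit limit. The context handle ctx0 below is assumed to be a valid ggml context with enough memory; a shape this large (~16 GiB of F32) is chosen purely to show the arithmetic:

    // hypothetical usage of the widened API
    struct ggml_tensor * t = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2048, 2048, 1024);
    int64_t n = ggml_nelements(t);   // 2048*2048*1024 = 4,294,967,296 — did not fit the old int return type
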
@@ -531,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1);
+        int64_t               ne0,
+        int64_t               ne1);
 
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
-        int                   ne2);
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2);
 
 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
+        int64_t               ne0,
         size_t                offset);
 
 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
+        int64_t               ne0,
+        int64_t               ne1,
         size_t                nb1, // row stride in bytes
         size_t                offset);
 
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1, // row stride in bytes
+        size_t                nb2, // slice stride in bytes
+        size_t                offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
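
A hedged sketch (not from the diff) of how the newly added ggml_view_3d might be called to reinterpret a contiguous tensor as [ne0, ne1, ne2] without copying. It assumes ggml's convention that a view element (i0, i1, i2) is addressed as data + offset + i0*element_size + i1*nb1 + i2*nb2; ctx0, a, and the ne* values are assumed to exist:

    // illustrative: strides for a dense, unquantized (e.g. F32) source tensor
    struct ggml_tensor * view = ggml_view_3d(ctx0, a,
            ne0, ne1, ne2,
            ne0*ggml_element_size(a),        // nb1: row stride in bytes
            ne0*ne1*ggml_element_size(a),    // nb2: slice stride in bytes
            0);                              // offset in bytes from the start of a
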
@@ -256,8 +256,8 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
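
Why the widening is needed: with hypothetical 65B-class hyperparameters (n_layer = 80, n_embd = 8192) and n_ctx = 4096, n_elements = n_embd * n_layer * n_ctx = 8192 * 80 * 4096 = 2,684,354,560, which exceeds INT_MAX (2,147,483,647), so the old int arithmetic would overflow before the buffer is even sized. Casting n_layer to int64_t keeps the whole product, and the resize argument derived from it, in 64-bit arithmetic.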
 
@@ -679,7 +679,7 @@ static bool llama_model_load(
             return false;
         }
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                     __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
             return false;
         }
@@ -810,37 +810,35 @@ static bool llama_eval_internal(
 
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
 
             // store key and value to memory
-            if (N >= 1) {
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
 
+                // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }
 
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);
 
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
 
             // K * Q
@@ -858,18 +856,23 @@ static bool llama_eval_internal(
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
 
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
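
An addressing sketch (not part of the diff) for the V view above, assuming ggml's usual 3-D indexing data + offset + i0*nb0 + i1*nb1 + i2*nb2 with nb0 equal to the element size of an unquantized cache type. The helper is hypothetical; its names simply mirror the strides passed to ggml_view_3d:

    // hypothetical: byte offset of element (i0 = position, i1 = dim within head, i2 = head)
    // of layer il's cached V, matching the ggml_view_3d arguments above
    static size_t v_cache_byte_offset(size_t es, int64_t n_ctx, int64_t n_embd, int64_t n_head,
                                      int64_t il, int64_t i0, int64_t i1, int64_t i2) {
        const size_t nb1 = n_ctx*es;                   // next row of the transposed V cache
        const size_t nb2 = n_ctx*es*(n_embd/n_head);   // next head: skip n_embd/n_head rows
        return il*n_ctx*es*n_embd                      // start of this layer's slab
             + i0*es + i1*nb1 + i2*nb2;
    }
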
@@ -955,9 +958,13 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);
 
+    // print timing information per ggml operation (for debugging purposes)
+    // requires GGML_PERF to be defined
+    //ggml_graph_print(&gf);
+
+    // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}
 
     //embd_w.resize(n_vocab*N);
@@ -1194,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
     const auto & logits = lctx.logits;
     const auto * plogits = logits.data() + logits.size() - n_logits;
 
+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
     std::vector<std::pair<float, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);
 
@@ -1215,17 +1236,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }
 
-    sample_top_k(logits_id, top_k);
-
-    float maxl = -std::numeric_limits<float>::infinity();
-    for (const auto & kv : logits_id) {
-        maxl = Max(maxl, kv.first);
-    }
+    sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
 
     // compute probs for the top k tokens
     std::vector<float> probs;
     probs.reserve(logits_id.size());
 
+    float maxl = logits_id[0].first;
     double sum = 0.0;
     for (const auto & kv : logits_id) {
         const float p = expf(kv.first - maxl);
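
A note on the reordered max computation (assumption about code outside this hunk: sample_top_k leaves logits_id sorted in descending order, as in llama.cpp's partial-sort implementation): after sorting, logits_id[0].first is the largest retained logit, so maxl can be read directly instead of being scanned for. Subtracting it before expf is the usual numerically stable softmax; for example, logits {10, 8} become expf(0) = 1 and expf(-2) ≈ 0.135 instead of expf(10) and expf(8).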
@@ -1248,16 +1265,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
                 break;
             }
         }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
     }
 
     //printf("\n");
     //for (int i = 0; i < (int) 10; i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //    printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
     //}
     //printf("\n\n");
     //exit(0);
@@ -1608,7 +1620,7 @@ struct llama_context * llama_init_from_file(
     }
 
     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -1668,6 +1680,33 @@ int llama_model_quantize(
     return 0;
 }
 
+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+        const uint8_t * kv_cache,
+        size_t n_size,
+        int n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
                    int   itype);
 
+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+                 const uint8_t * kv_cache,
+                        size_t   n_size,
+                           int   n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
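
A usage sketch (not part of the diff) for the KV cache accessors declared above: snapshot the cache, run further evaluations, then roll back. The helper function is hypothetical; only the llama_* calls come from the header, and the restored buffer must be exactly the size reported earlier or the LLAMA_ASSERT in llama_set_kv_cache fires.

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include "llama.h"

    // hypothetical helper: save the current KV cache state and restore it later
    static void kv_cache_roundtrip(struct llama_context * ctx) {
        const size_t size    = llama_get_kv_cache_size(ctx);
        const int    n_token = llama_get_kv_cache_token_count(ctx);

        uint8_t * snapshot = (uint8_t *) malloc(size);
        if (!snapshot) return;
        memcpy(snapshot, llama_get_kv_cache(ctx), size);

        // ... llama_eval calls here would grow/overwrite the cache ...

        // restore the earlier context state
        llama_set_kv_cache(ctx, snapshot, size, n_token);
        free(snapshot);
    }
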
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.2'
+  VERSION = '0.0.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-5b70e7d'
+  LLAMA_CPP_VERSION = 'master-698f7b5'
 end
data/lib/llama_cpp.rb CHANGED
@@ -5,13 +5,16 @@ require_relative 'llama_cpp/llama_cpp'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
+  # Class alias to match interface of whispercpp gem.
+  Params = ContextParams
+
   module_function
 
   # Generates sentences following the given prompt for operation check.
   #
   # @param context [LLaMACpp::Context]
   # @param prompt [String]
-  # @parma n_threads [Integer]
+  # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
     prompt.insert(0, ' ')
data/sig/llama_cpp.rbs ADDED
@@ -0,0 +1,52 @@
+module LLaMACpp
+  VERSION: String
+  LLAMA_CPP_VERSION: String
+  LLAMA_FILE_VERSION: String
+  LLAMA_FILE_MAGIC: String
+  LLAMA_FILE_MAGIC_UNVERSIONED: String
+
+  def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
+  def self?.print_system_info: () -> void
+  def self?.token_bos: () -> Integer
+  def self?.token_eos: () -> Integer
+
+  class Context
+    public
+
+    def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+    def embeddings: () -> Array[Float]
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+    def logits: () -> Array[Float]
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def n_vocab: () -> Integer
+    def print_timings: () -> void
+    def reset_timings: () -> void
+    def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+  end
+
+  class ContextParams
+    public
+
+    def embedding: () -> bool
+    def embedding=: (bool) -> bool
+    def f16_kv: () -> bool
+    def f16_kv=: (bool) -> bool
+    def logits_all: () -> bool
+    def logits_all=: (bool) -> bool
+    def n_ctx: () -> Integer
+    def n_ctx=: (Integer) -> Integer
+    def n_parts: () -> Integer
+    def n_parts=: (Integer) -> Integer
+    def seed: () -> Integer
+    def seed=: (Integer) -> Integer
+    def use_mlock: () -> bool
+    def use_mlock=: (bool) -> bool
+    def vocab_only: () -> bool
+    def vocab_only=: (bool) -> bool
+  end
+
+  class Params = ContextParams
+end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-02 00:00:00.000000000 Z
+date: 2023-04-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -32,6 +32,7 @@ files:
 - ext/llama_cpp/src/llama.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
+- sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb
 licenses:
 - MIT