llama-rb 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -258,11 +258,11 @@ struct ggml_tensor {
     enum ggml_type type;

     int    n_dims;
-    int     ne[GGML_MAX_DIMS]; // number of elements
-    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                               // nb[0] = sizeof(type)
-                               // nb[1] = nb[0]   * ne[0] + padding
-                               // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]

     // compute data
     enum ggml_op op;
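For orientation, the ne/nb comments above fully determine how an element is addressed: nb[0] is the element size and every higher stride is the previous stride times the previous dimension. A minimal illustrative sketch (the helper below is hypothetical, not part of ggml):

#include <cstddef>
#include <cstdint>

// Hypothetical helper, for illustration only: address of element (i0, i1, i2)
// of a 3-D tensor, given its data pointer and the nb strides described above.
static inline void * tensor_element_3d(void * data, const size_t nb[4],
                                        int64_t i0, int64_t i1, int64_t i2) {
    // nb[0] = sizeof(type), nb[1] = nb[0]*ne[0] (+ padding), nb[2] = nb[1]*ne[1]
    return (char *) data + i0*nb[0] + i1*nb[1] + i2*nb[2];
}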
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void    ggml_print_object (const struct ggml_object * obj);
 void    ggml_print_objects(const struct ggml_context * ctx);

-int     ggml_nelements(const struct ggml_tensor * tensor);
-size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int     ggml_blck_size (enum ggml_type type);
 size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
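A quick usage sketch of the widened constructors (the context, type and sizes are invented for illustration; the point is only that dimensions are now 64-bit, so element counts above INT32_MAX are representable):

#include "ggml.h"

// Sketch, assuming a ggml_context obtained from ggml_init() elsewhere.
struct ggml_tensor * make_big_tensor(struct ggml_context * ctx) {
    const int64_t ne0 = 65536;
    const int64_t ne1 = 65536;  // ne0*ne1 is about 4.3e9 elements, more than a 32-bit int can count
    return ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne0, ne1);
}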
@@ -531,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
+        int64_t ne0,
         size_t offset);

 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1, // row stride in bytes
         size_t offset);

+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t  nb1, // row   stride in bytes
+        size_t  nb2, // slice stride in bytes
+        size_t  offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
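ggml_view_3d is new in this release and is what the attention code later in the diff uses to read the value cache without copying: the caller supplies three dimensions, the row stride nb1, the slice stride nb2, and a byte offset into the existing tensor. A hedged sketch of a straightforward call (the identity view of a contiguous 3-D tensor; not taken from the diff):

#include "ggml.h"

// Sketch: view an existing 3-D tensor t in place, reusing its own strides.
// No data is copied; the view shares t's buffer.
struct ggml_tensor * identity_view(struct ggml_context * ctx, struct ggml_tensor * t) {
    return ggml_view_3d(ctx, t,
            t->ne[0], t->ne[1], t->ne[2],
            t->nb[1],   // row stride in bytes
            t->nb[2],   // slice stride in bytes
            0);         // offset in bytes
}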
@@ -256,8 +256,8 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;

-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;

     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

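The cast matters because the intermediate products are now carried out in 64 bits. A worked example with hypothetical hyperparameters (not taken from this package):

// Hypothetical sizes, for illustration only.
const int     n_layer = 80, n_ctx = 8192, n_embd = 8192;
const int64_t n_mem      = (int64_t) n_layer*n_ctx;  // 655,360
const int64_t n_elements = n_embd*n_mem;             // 5,368,709,120 -- would overflow a 32-bit int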
@@ -679,7 +679,7 @@ static bool llama_model_load(
                 return false;
             }
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                         __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                 return false;
             }
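Because tensor->ne is now int64_t, the format string switches to the PRId64 macro, which comes from the standard fixed-width format header (<cinttypes> in C++, <inttypes.h> in C) and must be included. A minimal standalone example:

#include <cinttypes>
#include <cstdio>

int main() {
    // PRId64 expands to the printf conversion for int64_t on the current platform
    std::printf("got [%" PRId64 ", %" PRId64 "]\n", (int64_t) 4096, (int64_t) 32000);
    return 0;
}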
@@ -810,37 +810,35 @@ static bool llama_eval_internal(
 
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

             // store key and value to memory
-            if (N >= 1) {
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

+                // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }

-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);

-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);

             // K * Q
@@ -858,18 +856,23 @@ static bool llama_eval_internal(
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
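The three strides passed to ggml_view_3d above encode how the transposed V cache is laid out: token positions are contiguous, the dimensions within a head are n_ctx elements apart, heads are n_ctx*n_embd/n_head elements apart, and everything is offset to layer il's slab. A sketch of the equivalent byte-offset arithmetic (illustrative helper, not part of the diff):

#include <cstddef>
#include <cstdint>

// Byte offset of V[i, j, h] for layer il, mirroring the strides above.
// es = element size of kv_self.v; i = token position, j = dimension within
// a head, h = head index. Illustrative only.
static size_t v_cache_offset(size_t es, int64_t n_ctx, int64_t n_embd, int64_t n_head,
                             int64_t il, int64_t i, int64_t j, int64_t h) {
    const size_t nb1    = (size_t)(n_ctx*es);                // next dimension within a head
    const size_t nb2    = (size_t)(n_ctx*es*n_embd/n_head);  // next head
    const size_t offset = (size_t)(il*n_ctx*es*n_embd);      // start of layer il's slab
    return offset + h*nb2 + j*nb1 + (size_t)(i*es);          // i is contiguous (stride = es)
}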
@@ -955,9 +958,13 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);

+    // print timing information per ggml operation (for debugging purposes)
+    // requires GGML_PERF to be defined
+    //ggml_graph_print(&gf);
+
+    // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}

     //embd_w.resize(n_vocab*N);
@@ -1194,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
     const auto & logits = lctx.logits;
     const auto * plogits = logits.data() + logits.size() - n_logits;

+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
     std::vector<std::pair<float, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);

@@ -1215,7 +1236,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }

-    sample_top_k(logits_id, top_k);
+    if (top_k > 0 && top_k < n_logits) {
+        sample_top_k(logits_id, top_k);
+    }

    float maxl = -std::numeric_limits<float>::infinity();
    for (const auto & kv : logits_id) {
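The temp <= 0 branch added above is a plain argmax over the final n_logits values, and sample_top_k is now skipped when it would be a no-op. For reference, a hedged standard-library equivalent of that greedy branch (ties resolve to the lowest index, matching the explicit loop in the diff):

#include <algorithm>
#include <iterator>

// Illustrative equivalent of the greedy (temp <= 0) path: index of the largest logit.
static int argmax_logits(const float * plogits, int n_logits) {
    return (int) std::distance(plogits,
                               std::max_element(plogits, plogits + n_logits));
}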
@@ -1608,7 +1631,7 @@ struct llama_context * llama_init_from_file(
     }

     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -1668,6 +1691,33 @@ int llama_model_quantize(
     return 0;
 }

+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+               const uint8_t * kv_cache,
+                      size_t   n_size,
+                         int   n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
                    int   itype);

+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+                   const uint8_t * kv_cache,
+                          size_t   n_size,
+                             int   n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
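Together, these four declarations let a caller snapshot the self-attention state after a prompt has been evaluated and restore it later instead of re-evaluating the prompt. A usage sketch under that assumption (context creation and token handling are not shown and are assumed to follow the normal llama.h flow; the snapshot struct and helper names are invented):

#include "llama.h"

#include <cstdint>
#include <vector>

// Sketch: copy the KV cache out of a context and put it back later.
// llama_set_kv_cache asserts that the byte size matches the target context's cache.
struct kv_snapshot {
    std::vector<uint8_t> buf;
    int n_tokens = 0;
};

static kv_snapshot save_kv(struct llama_context * ctx) {
    kv_snapshot s;
    const uint8_t * kv = llama_get_kv_cache(ctx);
    s.buf.assign(kv, kv + llama_get_kv_cache_size(ctx));
    s.n_tokens = llama_get_kv_cache_token_count(ctx);
    return s;
}

static void restore_kv(struct llama_context * ctx, const kv_snapshot & s) {
    llama_set_kv_cache(ctx, s.buf.data(), s.buf.size(), s.n_tokens);
}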
metadata CHANGED
@@ -1,35 +1,21 @@
 --- !ruby/object:Gem::Specification
 name: llama-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.1
 platform: ruby
 authors:
 - zfletch
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-02 00:00:00.000000000 Z
-dependencies:
-- !ruby/object:Gem::Dependency
-  name: rice
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 4.0.4
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 4.0.4
+date: 2023-04-06 00:00:00.000000000 Z
+dependencies: []
 description: ggerganov/llama.cpp with Ruby hooks
 email:
-- zfletch2@gmail.com
+- zf.rubygems@gmail.com
 executables: []
 extensions:
-- ext/llama/extconf.rb
+- ext/extconf.rb
 extra_rdoc_files: []
 files:
 - Gemfile
@@ -37,18 +23,23 @@ files:
 - LICENSE
 - README.md
 - Rakefile
-- ext/llama/common.cpp
-- ext/llama/common.h
-- ext/llama/extconf.rb
-- ext/llama/ggml.c
-- ext/llama/ggml.h
-- ext/llama/llama.cpp
-- ext/llama/llama.h
-- ext/llama/model.cpp
+- bin/console
+- ext/Makefile
+- ext/extconf.rb
 - lib/llama.rb
 - lib/llama/model.rb
 - lib/llama/version.rb
 - llama-rb.gemspec
+- llama.cpp/LICENSE
+- llama.cpp/Makefile
+- llama.cpp/README.md
+- llama.cpp/examples/common.cpp
+- llama.cpp/examples/common.h
+- llama.cpp/examples/main/main.cpp
+- llama.cpp/ggml.c
+- llama.cpp/ggml.h
+- llama.cpp/llama.cpp
+- llama.cpp/llama.h
 - models/.gitkeep
 homepage: https://github.com/zfletch/llama-rb
 licenses:
data/ext/llama/extconf.rb DELETED
@@ -1,12 +0,0 @@
-require 'mkmf-rice'
-
-# Compile llama.cpp
-# root = File.expand_path(File.join(File.dirname(__FILE__), "..", ".."))
-# llama_cpp = File.join(root, 'llama.cpp')
-#
-# Dir.chdir(llama_cpp) do
-#   system("make", exception: true)
-# end
-
-# Create Makefile for Ruby bindings
-create_makefile 'llama/model'
data/ext/llama/model.cpp DELETED
@@ -1,192 +0,0 @@
-#include <rice/rice.hpp>
-
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-class ModelCpp
-{
-  public:
-    llama_context *ctx;
-    ModelCpp()
-    {
-      ctx = NULL;
-    }
-    void model_initialize(
-        const char *model,
-        const int32_t n_ctx,
-        const int32_t n_parts,
-        const int32_t seed,
-        const bool memory_f16,
-        const bool use_mlock
-    );
-    Rice::Object model_predict(
-        const char *prompt,
-        const int32_t n_predict
-    );
-    ~ModelCpp()
-    {
-
-      if (ctx != NULL) {
-        llama_free(ctx);
-      }
-    }
-};
-
-void ModelCpp::model_initialize(
-    const char *model,     // path to model file, e.g. "models/7B/ggml-model-q4_0.bin"
-    const int32_t n_ctx,   // context size
-    const int32_t n_parts, // amount of model parts (-1 = determine from model dimensions)
-    const int32_t seed,    // RNG seed
-    const bool memory_f16, // use f16 instead of f32 for memory kv
-    const bool use_mlock   // use mlock to keep model in memory
-)
-{
-  auto lparams = llama_context_default_params();
-
-  lparams.n_ctx      = n_ctx;
-  lparams.n_parts    = n_parts;
-  lparams.seed       = seed;
-  lparams.f16_kv     = memory_f16;
-  lparams.use_mlock  = use_mlock;
-
-  ctx = llama_init_from_file(model, lparams);
-}
-
-Rice::Object ModelCpp::model_predict(
-    const char *prompt,     // string used as prompt
-    const int32_t n_predict // number of tokens to predict
-)
-{
-  std::string return_val = "";
-
-  gpt_params params;
-  params.prompt = prompt;
-  params.n_predict = n_predict;
-
-  // add a space in front of the first character to match OG llama tokenizer behavior
-  params.prompt.insert(0, 1, ' ');
-
-  // tokenize the prompt
-  auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-  const int n_ctx = llama_n_ctx(ctx);
-
-  // determine newline token
-  auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
-  // generate output
-  {
-    std::vector<llama_token> last_n_tokens(n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    int n_past     = 0;
-    int n_remain   = params.n_predict;
-    int n_consumed = 0;
-
-    std::vector<llama_token> embd;
-
-    while (n_remain != 0) {
-      if (embd.size() > 0) {
-        // infinite text generation via context swapping
-        // if we run out of context:
-        // - take the n_keep first tokens from the original prompt (via n_past)
-        // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
-        if (n_past + (int) embd.size() > n_ctx) {
-          const int n_left = n_past - params.n_keep;
-
-          n_past = params.n_keep;
-
-          // insert n_left/2 tokens at the start of embd from last_n_tokens
-          embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
-        }
-
-        if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
-          throw Rice::Exception(rb_eRuntimeError, "Failed to eval");
-        }
-      }
-
-
-      n_past += embd.size();
-      embd.clear();
-
-      if ((int) embd_inp.size() <= n_consumed) {
-        // out of user input, sample next token
-        const int32_t top_k          = params.top_k;
-        const float   top_p          = params.top_p;
-        const float   temp           = params.temp;
-        const float   repeat_penalty = params.repeat_penalty;
-
-        llama_token id = 0;
-
-        {
-          auto logits = llama_get_logits(ctx);
-
-          if (params.ignore_eos) {
-            logits[llama_token_eos()] = 0;
-          }
-
-          id = llama_sample_top_p_top_k(ctx,
-              last_n_tokens.data() + n_ctx - params.repeat_last_n,
-              params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
-
-          last_n_tokens.erase(last_n_tokens.begin());
-          last_n_tokens.push_back(id);
-        }
-
-        // replace end of text token with newline token when in interactive mode
-        if (id == llama_token_eos() && params.interactive && !params.instruct) {
-          id = llama_token_newline.front();
-          if (params.antiprompt.size() != 0) {
-            // tokenize and inject first reverse prompt
-            const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
-            embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-          }
-        }
-
-        // add it to the context
-        embd.push_back(id);
-
-        // decrement remaining sampling budget
-        --n_remain;
-      } else {
-        // some user input remains from prompt or interaction, forward it to processing
-        while ((int) embd_inp.size() > n_consumed) {
-          embd.push_back(embd_inp[n_consumed]);
-          last_n_tokens.erase(last_n_tokens.begin());
-          last_n_tokens.push_back(embd_inp[n_consumed]);
-          ++n_consumed;
-          if ((int) embd.size() >= params.n_batch) {
-            break;
-          }
-        }
-      }
-
-      for (auto id : embd) {
-        return_val += llama_token_to_str(ctx, id);
-      }
-    }
-  }
-
-  Rice::String ruby_return_val(return_val);
-  return ruby_return_val;
-}
-
-extern "C"
-void Init_model()
-{
-  Rice::Module rb_mLlama = Rice::define_module("Llama");
-  Rice::Data_Type<ModelCpp> rb_cModel = Rice::define_class_under<ModelCpp>(rb_mLlama, "Model");
-
-  rb_cModel.define_constructor(Rice::Constructor<ModelCpp>());
-  rb_cModel.define_method("initialize_cpp", &ModelCpp::model_initialize);
-  rb_cModel.define_method("predict_cpp", &ModelCpp::model_predict);
-}