llama-rb 0.1.0 → 0.2.1

@@ -258,11 +258,11 @@ struct ggml_tensor {
     enum ggml_type type;

     int     n_dims;
-    int     ne[GGML_MAX_DIMS]; // number of elements
-    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                               // nb[0] = sizeof(type)
-                               // nb[1] = nb[0] * ne[0] + padding
-                               // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0] * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]

     // compute data
     enum ggml_op op;
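
The nb[] comments above are the whole stride model of ggml: nb[0] is the element (or block) size, and each further stride is the previous stride times the previous dimension. A minimal sketch, not part of the diff, working the recurrence through for a hypothetical 4x3 F32 tensor with no padding:

    // Sketch only: stride layout for a hypothetical 2-D GGML_TYPE_F32 tensor
    // with ne = {4, 3} (F32 has block size 1, so there is no padding).
    #include <cstdio>

    int main() {
        long long ne[2] = {4, 3};          // elements per dimension
        size_t    nb[3];
        nb[0] = sizeof(float);             // nb[0] = sizeof(type)     -> 4
        nb[1] = nb[0] * ne[0];             // nb[1] = nb[0]*ne[0]      -> 16
        nb[2] = nb[1] * ne[1];             // nb[i] = nb[i-1]*ne[i-1]  -> 48
        // element (i0, i1) lives at data + i0*nb[0] + i1*nb[1]
        printf("%zu %zu %zu\n", nb[0], nb[1], nb[2]);
    }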
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);

-int     ggml_nelements(const struct ggml_tensor * tensor);
-size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
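
The switch from int to int64_t in these constructors matters once a single tensor holds more than INT32_MAX elements. A minimal sketch (the sizes below are hypothetical, chosen only to show where 32-bit counts break down):

    // Sketch only: an element count a 32-bit int cannot represent.
    #include <cstdint>

    int64_t example_element_count() {
        const int64_t ne0 = 8192;          // hypothetical number of columns
        const int64_t ne1 = 400000;        // hypothetical number of rows
        return ne0 * ne1;                  // 3,276,800,000 > INT32_MAX
        // describing this shape, e.g. via ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne0, ne1),
        // only works now that ne0/ne1 are int64_t.
    }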
@@ -531,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
+        int64_t ne0,
         size_t offset);

 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1, // row stride in bytes
         size_t offset);

+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t  nb1, // row   stride in bytes
+        size_t  nb2, // slice stride in bytes
+        size_t  offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
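
ggml_view_3d is new in the vendored ggml: it exposes a 3-D window onto an existing tensor through explicit row and slice strides plus a byte offset, with no copy. A hedged sketch of the call, mirroring how the llama.cpp hunk further below views one layer's cached V (all identifiers besides esz come from that hunk; esz is shorthand introduced here):

    // Sketch only: view layer il's slice of the transposed V cache as
    // a [n_past + N, n_embd/n_head, n_head] tensor without copying.
    const size_t esz = ggml_element_size(kv_self.v);

    struct ggml_tensor * V =
        ggml_view_3d(ctx0, kv_self.v,
            n_past + N,                  // ne0: cached positions along a row
            n_embd/n_head,               // ne1: per-head dimension
            n_head,                      // ne2: number of heads
            n_ctx*esz,                   // nb1: row stride (one full context row)
            n_ctx*esz*n_embd/n_head,     // nb2: slice stride (one head's block)
            il*n_ctx*esz*n_embd);        // offset: start of layer il's slice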
@@ -256,8 +256,8 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;

-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;

     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
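
Widening n_mem and n_elements matters because the byte count handed to cache.buf.resize() can exceed 32 bits even when the element count itself still fits in an int. A rough worked example with hypothetical 65B-class hyperparameters (numbers chosen for illustration only):

    // Sketch only: illustrative sizes, not taken from the diff.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_layer = 80, n_ctx = 2048, n_embd = 8192; // hypothetical
        const int64_t n_mem      = n_layer*n_ctx;                //       163,840
        const int64_t n_elements = n_embd*n_mem;                 // 1,342,177,280 (still fits in int32_t)
        const int64_t n_bytes    = 2*n_elements*2;               // K and V caches, 2-byte f16 cells
        // 5,368,709,120 bytes no longer fits in 32-bit arithmetic, so the
        // resize() argument must be computed in 64 bits to avoid wrapping.
        printf("%lld\n", (long long) n_bytes);
    }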
 
@@ -679,7 +679,7 @@ static bool llama_model_load(
                 return false;
             }
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                         __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                 return false;
             }
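
The format string changes along with the field type: passing an int64_t to %d is undefined behavior, so the PRId64 macro from <cinttypes> (C++) or <inttypes.h> (C) supplies the right length modifier. A standalone sketch of the pattern:

    // Sketch only: printing int64_t portably with PRId64.
    #include <cinttypes>
    #include <cstdio>

    int main() {
        int64_t ne0 = 4096, ne1 = 32000;   // hypothetical tensor shape
        printf("got [%" PRId64 ", %" PRId64 "]\n", ne0, ne1);
    }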
@@ -810,37 +810,35 @@ static bool llama_eval_internal(

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

             // store key and value to memory
-            if (N >= 1) {
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

+                // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }

-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);

-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);

             // K * Q
@@ -858,18 +856,23 @@ static bool llama_eval_internal(
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -955,9 +958,13 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);

+    // print timing information per ggml operation (for debugging purposes)
+    // requires GGML_PERF to be defined
+    //ggml_graph_print(&gf);
+
+    // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}

     //embd_w.resize(n_vocab*N);
@@ -1194,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
     const auto & logits = lctx.logits;
     const auto * plogits = logits.data() + logits.size() - n_logits;

+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
     std::vector<std::pair<float, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);
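
With this addition, a non-positive temperature turns sampling into a plain argmax over the logits, so generation becomes deterministic for a given context (top_k, top_p and the repeat penalty are never reached on that path). A hedged usage sketch; ctx, last_n_tokens and repeat_last_n are assumed to be set up as in a normal generation loop:

    // Sketch only: greedy decoding via the public sampling entry point.
    llama_token id = llama_sample_top_p_top_k(
        ctx,
        last_n_tokens.data() + last_n_tokens.size() - repeat_last_n,
        repeat_last_n,
        /*top_k=*/ 40,
        /*top_p=*/ 0.95f,
        /*temp=*/  0.0f,               // <= 0 now means "take the single best token"
        /*repeat_penalty=*/ 1.10f);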
 
@@ -1215,7 +1236,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }

-    sample_top_k(logits_id, top_k);
+    if (top_k > 0 && top_k < n_logits) {
+        sample_top_k(logits_id, top_k);
+    }

     float maxl = -std::numeric_limits<float>::infinity();
     for (const auto & kv : logits_id) {
@@ -1608,7 +1631,7 @@ struct llama_context * llama_init_from_file(
     }

     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -1668,6 +1691,33 @@ int llama_model_quantize(
     return 0;
 }

+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+        const uint8_t * kv_cache,
+        size_t n_size,
+        int n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
         const llama_token * tokens,
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
             int   itype);

+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+            const uint8_t * kv_cache,
+            size_t n_size,
+            int n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
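
Taken together, the four new entry points let a caller snapshot the self-attention KV cache and restore it later, for example to resume a conversation without re-evaluating the prompt. A hedged sketch; keeping the copy in a std::vector is this example's choice, not something the API prescribes:

    // Sketch only: save and later restore the KV cache of an existing context.
    #include <cstdint>
    #include <vector>
    #include "llama.h"

    static std::vector<uint8_t> saved_kv;
    static int saved_token_count = 0;

    // ctx is assumed to come from llama_init_from_file()
    void save_kv(struct llama_context * ctx) {
        const size_t    size = llama_get_kv_cache_size(ctx);
        const uint8_t * data = llama_get_kv_cache(ctx);
        saved_kv.assign(data, data + size);
        saved_token_count = llama_get_kv_cache_token_count(ctx);
    }

    void restore_kv(struct llama_context * ctx) {
        // llama_set_kv_cache asserts that the size matches the context's own cache
        llama_set_kv_cache(ctx, saved_kv.data(), saved_kv.size(), saved_token_count);
    }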
metadata CHANGED
@@ -1,35 +1,21 @@
 --- !ruby/object:Gem::Specification
 name: llama-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.1
 platform: ruby
 authors:
 - zfletch
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-02 00:00:00.000000000 Z
-dependencies:
-- !ruby/object:Gem::Dependency
-  name: rice
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 4.0.4
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 4.0.4
+date: 2023-04-06 00:00:00.000000000 Z
+dependencies: []
 description: ggerganov/llama.cpp with Ruby hooks
 email:
-- zfletch2@gmail.com
+- zf.rubygems@gmail.com
 executables: []
 extensions:
-- ext/llama/extconf.rb
+- ext/extconf.rb
 extra_rdoc_files: []
 files:
 - Gemfile
@@ -37,18 +23,23 @@ files:
 - LICENSE
 - README.md
 - Rakefile
-- ext/llama/common.cpp
-- ext/llama/common.h
-- ext/llama/extconf.rb
-- ext/llama/ggml.c
-- ext/llama/ggml.h
-- ext/llama/llama.cpp
-- ext/llama/llama.h
-- ext/llama/model.cpp
+- bin/console
+- ext/Makefile
+- ext/extconf.rb
 - lib/llama.rb
 - lib/llama/model.rb
 - lib/llama/version.rb
 - llama-rb.gemspec
+- llama.cpp/LICENSE
+- llama.cpp/Makefile
+- llama.cpp/README.md
+- llama.cpp/examples/common.cpp
+- llama.cpp/examples/common.h
+- llama.cpp/examples/main/main.cpp
+- llama.cpp/ggml.c
+- llama.cpp/ggml.h
+- llama.cpp/llama.cpp
+- llama.cpp/llama.h
 - models/.gitkeep
 homepage: https://github.com/zfletch/llama-rb
 licenses:
data/ext/llama/extconf.rb DELETED
@@ -1,12 +0,0 @@
-require 'mkmf-rice'
-
-# Compile llama.cpp
-# root = File.expand_path(File.join(File.dirname(__FILE__), "..", ".."))
-# llama_cpp = File.join(root, 'llama.cpp')
-#
-# Dir.chdir(llama_cpp) do
-#   system("make", exception: true)
-# end
-
-# Create Makefile for Ruby bindings
-create_makefile 'llama/model'
data/ext/llama/model.cpp DELETED
@@ -1,192 +0,0 @@
-#include <rice/rice.hpp>
-
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-class ModelCpp
-{
-  public:
-    llama_context *ctx;
-    ModelCpp()
-    {
-      ctx = NULL;
-    }
-    void model_initialize(
-        const char *model,
-        const int32_t n_ctx,
-        const int32_t n_parts,
-        const int32_t seed,
-        const bool memory_f16,
-        const bool use_mlock
-    );
-    Rice::Object model_predict(
-        const char *prompt,
-        const int32_t n_predict
-    );
-    ~ModelCpp()
-    {
-
-      if (ctx != NULL) {
-        llama_free(ctx);
-      }
-    }
-};
-
-void ModelCpp::model_initialize(
-  const char *model,      // path to model file, e.g. "models/7B/ggml-model-q4_0.bin"
-  const int32_t n_ctx,    // context size
-  const int32_t n_parts,  // amount of model parts (-1 = determine from model dimensions)
-  const int32_t seed,     // RNG seed
-  const bool memory_f16,  // use f16 instead of f32 for memory kv
-  const bool use_mlock    // use mlock to keep model in memory
-)
-{
-  auto lparams = llama_context_default_params();
-
-  lparams.n_ctx = n_ctx;
-  lparams.n_parts = n_parts;
-  lparams.seed = seed;
-  lparams.f16_kv = memory_f16;
-  lparams.use_mlock = use_mlock;
-
-  ctx = llama_init_from_file(model, lparams);
-}
-
-Rice::Object ModelCpp::model_predict(
-  const char *prompt,      // string used as prompt
-  const int32_t n_predict  // number of tokens to predict
-)
-{
-  std::string return_val = "";
-
-  gpt_params params;
-  params.prompt = prompt;
-  params.n_predict = n_predict;
-
-  // add a space in front of the first character to match OG llama tokenizer behavior
-  params.prompt.insert(0, 1, ' ');
-
-  // tokenize the prompt
-  auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-  const int n_ctx = llama_n_ctx(ctx);
-
-  // determine newline token
-  auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
-  // generate output
-  {
-    std::vector<llama_token> last_n_tokens(n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    int n_past = 0;
-    int n_remain = params.n_predict;
-    int n_consumed = 0;
-
-    std::vector<llama_token> embd;
-
-    while (n_remain != 0) {
-      if (embd.size() > 0) {
-        // infinite text generation via context swapping
-        // if we run out of context:
-        // - take the n_keep first tokens from the original prompt (via n_past)
-        // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
-        if (n_past + (int) embd.size() > n_ctx) {
-          const int n_left = n_past - params.n_keep;
-
-          n_past = params.n_keep;
-
-          // insert n_left/2 tokens at the start of embd from last_n_tokens
-          embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
-        }
-
-        if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
-          throw Rice::Exception(rb_eRuntimeError, "Failed to eval");
-        }
-      }
-
-
-      n_past += embd.size();
-      embd.clear();
-
-      if ((int) embd_inp.size() <= n_consumed) {
-        // out of user input, sample next token
-        const int32_t top_k = params.top_k;
-        const float top_p = params.top_p;
-        const float temp = params.temp;
-        const float repeat_penalty = params.repeat_penalty;
-
-        llama_token id = 0;
-
-        {
-          auto logits = llama_get_logits(ctx);
-
-          if (params.ignore_eos) {
-            logits[llama_token_eos()] = 0;
-          }
-
-          id = llama_sample_top_p_top_k(ctx,
-              last_n_tokens.data() + n_ctx - params.repeat_last_n,
-              params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
-
-          last_n_tokens.erase(last_n_tokens.begin());
-          last_n_tokens.push_back(id);
-        }
-
-        // replace end of text token with newline token when in interactive mode
-        if (id == llama_token_eos() && params.interactive && !params.instruct) {
-          id = llama_token_newline.front();
-          if (params.antiprompt.size() != 0) {
-            // tokenize and inject first reverse prompt
-            const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
-            embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-          }
-        }
-
-        // add it to the context
-        embd.push_back(id);
-
-        // decrement remaining sampling budget
-        --n_remain;
-      } else {
-        // some user input remains from prompt or interaction, forward it to processing
-        while ((int) embd_inp.size() > n_consumed) {
-          embd.push_back(embd_inp[n_consumed]);
-          last_n_tokens.erase(last_n_tokens.begin());
-          last_n_tokens.push_back(embd_inp[n_consumed]);
-          ++n_consumed;
-          if ((int) embd.size() >= params.n_batch) {
-            break;
-          }
-        }
-      }
-
-      for (auto id : embd) {
-        return_val += llama_token_to_str(ctx, id);
-      }
-    }
-  }
-
-  Rice::String ruby_return_val(return_val);
-  return ruby_return_val;
-}
-
-extern "C"
-void Init_model()
-{
-  Rice::Module rb_mLlama = Rice::define_module("Llama");
-  Rice::Data_Type<ModelCpp> rb_cModel = Rice::define_class_under<ModelCpp>(rb_mLlama, "Model");
-
-  rb_cModel.define_constructor(Rice::Constructor<ModelCpp>());
-  rb_cModel.define_method("initialize_cpp", &ModelCpp::model_initialize);
-  rb_cModel.define_method("predict_cpp", &ModelCpp::model_predict);
-}