llama-rb 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +1 -3
- data/README.md +2 -8
- data/bin/console +7 -0
- data/ext/Makefile +4 -0
- data/ext/extconf.rb +10 -0
- data/lib/llama/model.rb +36 -64
- data/lib/llama/version.rb +1 -1
- data/lib/llama.rb +0 -1
- data/llama-rb.gemspec +25 -25
- data/llama.cpp/LICENSE +21 -0
- data/llama.cpp/Makefile +175 -0
- data/llama.cpp/README.md +389 -0
- data/{ext/llama → llama.cpp/examples}/common.cpp +10 -3
- data/llama.cpp/examples/main/main.cpp +460 -0
- data/{ext/llama → llama.cpp}/ggml.c +587 -485
- data/{ext/llama → llama.cpp}/ggml.h +36 -26
- data/{ext/llama → llama.cpp}/llama.cpp +85 -35
- data/{ext/llama → llama.cpp}/llama.h +17 -0
- metadata +18 -27
- data/ext/llama/extconf.rb +0 -12
- data/ext/llama/model.cpp +0 -192
- /data/{ext/llama → llama.cpp/examples}/common.h +0 -0
data/{ext/llama → llama.cpp}/ggml.h
CHANGED
@@ -258,11 +258,11 @@ struct ggml_tensor {
     enum ggml_type type;

     int n_dims;
-    int     ne[GGML_MAX_DIMS]; // number of elements
-    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                               // nb[0] = sizeof(type)
-                               // nb[1] = nb[0]   * ne[0] + padding
-                               // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]

     // compute data
     enum ggml_op op;
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);

-int     ggml_nelements(const struct ggml_tensor * tensor);
-size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -531,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
+        int64_t ne0,
         size_t offset);

 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1, // row stride in bytes
         size_t offset);

+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t  nb1, // row stride in bytes
+        size_t  nb2, // slice stride in bytes
+        size_t  offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
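The new ggml_view_3d rounds out the view API: ne0..ne2 set the view's shape, while nb1 and nb2 are explicit row and slice strides in bytes (the innermost stride is implied by the element size). Below is a minimal sketch of the stride arithmetic, assuming only the declarations above plus ggml_element_size from the same header; the helper name view_layer_slice is illustrative and not part of ggml.

#include "ggml.h"

// Hypothetical helper: view one layer's region of a flat cache tensor as
// [n_ctx, n_embd/n_head, n_head] without copying any data.
static struct ggml_tensor * view_layer_slice(
        struct ggml_context * ctx,
        struct ggml_tensor  * cache,   // flat tensor holding all layers back to back
        int64_t il,                    // layer index
        int64_t n_ctx,
        int64_t n_embd,
        int64_t n_head) {
    const size_t es = ggml_element_size(cache);   // bytes per element
    return ggml_view_3d(ctx, cache,
            n_ctx, n_embd/n_head, n_head,         // ne0, ne1, ne2
            n_ctx*es,                             // nb1: row stride in bytes
            n_ctx*es*n_embd/n_head,               // nb2: slice stride in bytes
            il*n_ctx*es*n_embd);                  // byte offset of this layer

}

The llama.cpp attention hunk further down uses exactly this pattern to view one layer's cached V tensor as [n_past + N, n_embd/n_head, n_head] instead of copying it into a new tensor.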
data/{ext/llama → llama.cpp}/llama.cpp
CHANGED
@@ -256,8 +256,8 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;

-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;

     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

@@ -679,7 +679,7 @@ static bool llama_model_load(
             return false;
         }
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                     __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
             return false;
         }
@@ -810,37 +810,35 @@ static bool llama_eval_internal(

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

             // store key and value to memory
-            if (N >= 1) {
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

+                // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }

-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);

-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);

             // K * Q
@@ -858,18 +856,23 @@ static bool llama_eval_internal(
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -955,9 +958,13 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);

+    // print timing information per ggml operation (for debugging purposes)
+    // requires GGML_PERF to be defined
+    //ggml_graph_print(&gf);
+
+    // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}

     //embd_w.resize(n_vocab*N);
@@ -1194,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
     const auto & logits = lctx.logits;
     const auto * plogits = logits.data() + logits.size() - n_logits;

+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
     std::vector<std::pair<float, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);

@@ -1215,7 +1236,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }

-    sample_top_k(logits_id, top_k);
+    if (top_k > 0 && top_k < n_logits) {
+        sample_top_k(logits_id, top_k);
+    }

     float maxl = -std::numeric_limits<float>::infinity();
     for (const auto & kv : logits_id) {
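Together, the two sampling changes above give callers two new knobs: temp <= 0 now short-circuits to a plain argmax over the logits (skipping top-k/top-p and the repeat penalty entirely), and top_k <= 0 or top_k >= n_logits disables top-k truncation. A minimal sketch of a caller leaning on the greedy path, using the public llama_sample_top_p_top_k signature that the removed ext/llama/model.cpp further down calls; the wrapper name pick_greedy is illustrative only.

#include "llama.h"
#include <vector>

// Deterministic next-token selection: with temp <= 0 the new code path returns
// the argmax of the logits, ignoring top_k, top_p and repeat_penalty.
llama_token pick_greedy(struct llama_context * ctx,
                        const std::vector<llama_token> & last_n_tokens) {
    return llama_sample_top_p_top_k(
            ctx,
            last_n_tokens.data(), (int) last_n_tokens.size(),
            /*top_k*/ 0,          // 0 (or >= vocab size) now skips top-k truncation
            /*top_p*/ 1.0f,
            /*temp*/  0.0f,       // <= 0 triggers the new greedy branch
            /*repeat_penalty*/ 1.0f);
}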
@@ -1608,7 +1631,7 @@ struct llama_context * llama_init_from_file(
     }

     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -1668,6 +1691,33 @@ int llama_model_quantize(
     return 0;
 }

+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+        const uint8_t * kv_cache,
+        size_t n_size,
+        int n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,
data/{ext/llama → llama.cpp}/llama.h
CHANGED
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
                    int   itype);

+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+            const uint8_t * kv_cache,
+            size_t n_size,
+            int n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
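These four entry points expose the context's self-attention KV cache to callers, which is enough to snapshot the state of a conversation and roll back to it later. A minimal sketch under that assumption, using only the functions declared above (error handling and persistence to disk omitted):

#include "llama.h"
#include <cstdint>
#include <cstring>
#include <vector>

// Snapshot of the KV cache plus the number of tokens it covers.
struct kv_snapshot {
    std::vector<uint8_t> buf;
    int n_tokens = 0;
};

// Copy the current KV cache out of the context.
kv_snapshot save_kv(struct llama_context * ctx) {
    kv_snapshot snap;
    const size_t size = llama_get_kv_cache_size(ctx);
    snap.buf.resize(size);
    std::memcpy(snap.buf.data(), llama_get_kv_cache(ctx), size);
    snap.n_tokens = llama_get_kv_cache_token_count(ctx);
    return snap;
}

// Restore a previously saved cache into the same context (or one with an identical setup).
void restore_kv(struct llama_context * ctx, const kv_snapshot & snap) {
    llama_set_kv_cache(ctx, snap.buf.data(), snap.buf.size(), snap.n_tokens);
}

Note that the implementation of llama_set_kv_cache shown earlier asserts that the provided size matches the context's own buffer, so a snapshot can only be restored into a context created with the same model and context size.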
metadata
CHANGED
@@ -1,35 +1,21 @@
 --- !ruby/object:Gem::Specification
 name: llama-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.1
 platform: ruby
 authors:
 - zfletch
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
-dependencies:
-- !ruby/object:Gem::Dependency
-  name: rice
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 4.0.4
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 4.0.4
+date: 2023-04-06 00:00:00.000000000 Z
+dependencies: []
 description: ggerganov/llama.cpp with Ruby hooks
 email:
--
+- zf.rubygems@gmail.com
 executables: []
 extensions:
-- ext/llama/extconf.rb
+- ext/extconf.rb
 extra_rdoc_files: []
 files:
 - Gemfile
@@ -37,18 +23,23 @@ files:
 - LICENSE
 - README.md
 - Rakefile
-- ext/llama/common.cpp
-- ext/llama/common.h
-- ext/llama/extconf.rb
-- ext/llama/ggml.c
-- ext/llama/ggml.h
-- ext/llama/llama.cpp
-- ext/llama/llama.h
-- ext/llama/model.cpp
+- bin/console
+- ext/Makefile
+- ext/extconf.rb
 - lib/llama.rb
 - lib/llama/model.rb
 - lib/llama/version.rb
 - llama-rb.gemspec
+- llama.cpp/LICENSE
+- llama.cpp/Makefile
+- llama.cpp/README.md
+- llama.cpp/examples/common.cpp
+- llama.cpp/examples/common.h
+- llama.cpp/examples/main/main.cpp
+- llama.cpp/ggml.c
+- llama.cpp/ggml.h
+- llama.cpp/llama.cpp
+- llama.cpp/llama.h
 - models/.gitkeep
 homepage: https://github.com/zfletch/llama-rb
 licenses:
data/ext/llama/extconf.rb
DELETED
@@ -1,12 +0,0 @@
-require 'mkmf-rice'
-
-# Compile llama.cpp
-# root = File.expand_path(File.join(File.dirname(__FILE__), "..", ".."))
-# llama_cpp = File.join(root, 'llama.cpp')
-#
-# Dir.chdir(llama_cpp) do
-#   system("make", exception: true)
-# end
-
-# Create Makefile for Ruby bindings
-create_makefile 'llama/model'
data/ext/llama/model.cpp
DELETED
@@ -1,192 +0,0 @@
-#include <rice/rice.hpp>
-
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-class ModelCpp
-{
-  public:
-    llama_context *ctx;
-    ModelCpp()
-    {
-      ctx = NULL;
-    }
-    void model_initialize(
-      const char *model,
-      const int32_t n_ctx,
-      const int32_t n_parts,
-      const int32_t seed,
-      const bool memory_f16,
-      const bool use_mlock
-    );
-    Rice::Object model_predict(
-      const char *prompt,
-      const int32_t n_predict
-    );
-    ~ModelCpp()
-    {
-
-      if (ctx != NULL) {
-        llama_free(ctx);
-      }
-    }
-};
-
-void ModelCpp::model_initialize(
-  const char *model,     // path to model file, e.g. "models/7B/ggml-model-q4_0.bin"
-  const int32_t n_ctx,   // context size
-  const int32_t n_parts, // amount of model parts (-1 = determine from model dimensions)
-  const int32_t seed,    // RNG seed
-  const bool memory_f16, // use f16 instead of f32 for memory kv
-  const bool use_mlock   // use mlock to keep model in memory
-)
-{
-  auto lparams = llama_context_default_params();
-
-  lparams.n_ctx     = n_ctx;
-  lparams.n_parts   = n_parts;
-  lparams.seed      = seed;
-  lparams.f16_kv    = memory_f16;
-  lparams.use_mlock = use_mlock;
-
-  ctx = llama_init_from_file(model, lparams);
-}
-
-Rice::Object ModelCpp::model_predict(
-  const char *prompt,     // string used as prompt
-  const int32_t n_predict // number of tokens to predict
-)
-{
-  std::string return_val = "";
-
-  gpt_params params;
-  params.prompt = prompt;
-  params.n_predict = n_predict;
-
-  // add a space in front of the first character to match OG llama tokenizer behavior
-  params.prompt.insert(0, 1, ' ');
-
-  // tokenize the prompt
-  auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-  const int n_ctx = llama_n_ctx(ctx);
-
-  // determine newline token
-  auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
-  // generate output
-  {
-    std::vector<llama_token> last_n_tokens(n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    int n_past     = 0;
-    int n_remain   = params.n_predict;
-    int n_consumed = 0;
-
-    std::vector<llama_token> embd;
-
-    while (n_remain != 0) {
-      if (embd.size() > 0) {
-        // infinite text generation via context swapping
-        // if we run out of context:
-        // - take the n_keep first tokens from the original prompt (via n_past)
-        // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
-        if (n_past + (int) embd.size() > n_ctx) {
-          const int n_left = n_past - params.n_keep;
-
-          n_past = params.n_keep;
-
-          // insert n_left/2 tokens at the start of embd from last_n_tokens
-          embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
-        }
-
-        if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
-          throw Rice::Exception(rb_eRuntimeError, "Failed to eval");
-        }
-      }
-
-
-      n_past += embd.size();
-      embd.clear();
-
-      if ((int) embd_inp.size() <= n_consumed) {
-        // out of user input, sample next token
-        const int32_t top_k          = params.top_k;
-        const float   top_p          = params.top_p;
-        const float   temp           = params.temp;
-        const float   repeat_penalty = params.repeat_penalty;
-
-        llama_token id = 0;
-
-        {
-          auto logits = llama_get_logits(ctx);
-
-          if (params.ignore_eos) {
-            logits[llama_token_eos()] = 0;
-          }
-
-          id = llama_sample_top_p_top_k(ctx,
-              last_n_tokens.data() + n_ctx - params.repeat_last_n,
-              params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
-
-          last_n_tokens.erase(last_n_tokens.begin());
-          last_n_tokens.push_back(id);
-        }
-
-        // replace end of text token with newline token when in interactive mode
-        if (id == llama_token_eos() && params.interactive && !params.instruct) {
-          id = llama_token_newline.front();
-          if (params.antiprompt.size() != 0) {
-            // tokenize and inject first reverse prompt
-            const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
-            embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-          }
-        }
-
-        // add it to the context
-        embd.push_back(id);
-
-        // decrement remaining sampling budget
-        --n_remain;
-      } else {
-        // some user input remains from prompt or interaction, forward it to processing
-        while ((int) embd_inp.size() > n_consumed) {
-          embd.push_back(embd_inp[n_consumed]);
-          last_n_tokens.erase(last_n_tokens.begin());
-          last_n_tokens.push_back(embd_inp[n_consumed]);
-          ++n_consumed;
-          if ((int) embd.size() >= params.n_batch) {
-            break;
-          }
-        }
-      }
-
-      for (auto id : embd) {
-        return_val += llama_token_to_str(ctx, id);
-      }
-    }
-  }
-
-  Rice::String ruby_return_val(return_val);
-  return ruby_return_val;
-}
-
-extern "C"
-void Init_model()
-{
-  Rice::Module rb_mLlama = Rice::define_module("Llama");
-  Rice::Data_Type<ModelCpp> rb_cModel = Rice::define_class_under<ModelCpp>(rb_mLlama, "Model");
-
-  rb_cModel.define_constructor(Rice::Constructor<ModelCpp>());
-  rb_cModel.define_method("initialize_cpp", &ModelCpp::model_initialize);
-  rb_cModel.define_method("predict_cpp", &ModelCpp::model_predict);
-}
data/{ext/llama → llama.cpp/examples}/common.h
File without changes