llama_cpp 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -2
- data/README.md +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +39 -1
- data/ext/llama_cpp/src/ggml.c +587 -485
- data/ext/llama_cpp/src/ggml.h +36 -26
- data/ext/llama_cpp/src/llama.cpp +85 -46
- data/ext/llama_cpp/src/llama.h +17 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -1
- data/sig/llama_cpp.rbs +52 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -258,11 +258,11 @@ struct ggml_tensor {
     enum ggml_type type;

     int     n_dims;
-    int     ne[GGML_MAX_DIMS]; // number of elements
-    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                               // nb[0] = sizeof(type)
-                               // nb[1] = nb[0]   * ne[0] + padding
-                               // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]

     // compute data
     enum ggml_op op;
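Note: the ne/nb comments above define ggml's stride convention. As a minimal, self-contained C++ sketch (not ggml code), this is how those strides map an index to a byte offset for a contiguous 3-D float tensor:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne[3] = {4, 3, 2};   // number of elements per dimension
    std::size_t nb[3];
    nb[0] = sizeof(float);             // nb[0] = sizeof(type)
    nb[1] = nb[0] * ne[0];             // nb[1] = nb[0] * ne[0] (+ padding for blocked types)
    nb[2] = nb[1] * ne[1];             // nb[i] = nb[i-1] * ne[i-1]

    // byte offset of element (i0, i1, i2)
    const int64_t i0 = 1, i1 = 2, i2 = 1;
    const std::size_t offset = i0*nb[0] + i1*nb[1] + i2*nb[2];
    std::printf("offset of (1, 2, 1) = %zu bytes\n", offset); // 4 + 32 + 48 = 84
    return 0;
}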
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);

-int     ggml_nelements(const struct ggml_tensor * tensor);
-size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
-        const int *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum ggml_type type,
-        int ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum ggml_type type,
-        int ne0,
-        int ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum ggml_type type,
-        int ne0,
-        int ne1,
-        int ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum ggml_type type,
-        int ne0,
-        int ne1,
-        int ne2,
-        int ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -531,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int ne0,
-        int ne1);
+        int64_t ne0,
+        int64_t ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int ne0,
-        int ne1,
-        int ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int ne0,
+        int64_t ne0,
         size_t offset);

 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int ne0,
-        int ne1,
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1, // row stride in bytes
         size_t offset);

+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t nb1, // row stride in bytes
+        size_t nb2, // slice stride in bytes
+        size_t offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
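Note: the new ggml_view_3d above takes explicit row (nb1) and slice (nb2) strides plus a byte offset, so a sub-block of an existing tensor can be viewed without copying. A hedged usage sketch; ggml_init, ggml_init_params and ggml_free come from the bundled ggml API but are not part of this diff, and the pool size is arbitrary:

#include <cstdio>
#include "ggml.h"

int main() {
    // small scratch pool for this sketch
    struct ggml_init_params params = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ nullptr };
    struct ggml_context * ctx = ggml_init(params);

    // contiguous 8 x 4 x 2 float tensor
    struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 2);

    // view only the first two rows of each slice: shape {8, 2, 2}, strides reused from t
    struct ggml_tensor * v = ggml_view_3d(ctx, t, 8, 2, 2, t->nb[1], t->nb[2], 0);

    std::printf("view: %lld x %lld x %lld\n",
                (long long) v->ne[0], (long long) v->ne[1], (long long) v->ne[2]);

    ggml_free(ctx);
    return 0;
}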
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -256,8 +256,8 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;

-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;

     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

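Note: promoting n_mem and n_elements to int64_t guards against 32-bit overflow; for large models with long contexts the KV-cache element count no longer fits in an int. A small self-contained illustration (the sizes are made up, not taken from any particular model):

#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
    // illustrative sizes only
    const int n_embd  = 8192;
    const int n_layer = 80;
    const int n_ctx   = 8192;

    // same pattern as the updated kv_cache_init: widen to 64 bits before multiplying
    const int64_t n_mem      = (int64_t) n_layer * n_ctx;
    const int64_t n_elements = (int64_t) n_embd  * n_mem;

    std::printf("n_elements = %lld, INT_MAX = %d\n", (long long) n_elements, INT_MAX);
    return 0;
}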
@@ -679,7 +679,7 @@ static bool llama_model_load(
                 return false;
             }
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                         __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                 return false;
             }
@@ -810,37 +810,35 @@ static bool llama_eval_internal(

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

             // store key and value to memory
-            if (N >= 1) {
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

+                // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }

-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);

-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);

             // K * Q
@@ -858,18 +856,23 @@ static bool llama_eval_internal(
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -955,9 +958,13 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);

+    // print timing information per ggml operation (for debugging purposes)
+    // requires GGML_PERF to be defined
+    //ggml_graph_print(&gf);
+
+    // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}

     //embd_w.resize(n_vocab*N);
@@ -1194,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
     const auto & logits = lctx.logits;
     const auto * plogits = logits.data() + logits.size() - n_logits;

+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
     std::vector<std::pair<float, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);

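Note: the temp <= 0 branch added above makes sampling fully deterministic: it is a plain argmax over the logits, skipping top-k/top-p and the RNG. An equivalent standalone sketch (illustrative values, not library code):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> logits = {0.1f, 2.3f, -0.7f, 1.9f};

    // temp <= 0: pick the index of the highest logit, no randomness involved
    const int max_id = static_cast<int>(
        std::max_element(logits.begin(), logits.end()) - logits.begin());

    std::printf("greedy token id: %d\n", max_id); // prints 1
    return 0;
}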
@@ -1215,17 +1236,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }

-    sample_top_k(logits_id, top_k);
-
-    float maxl = -std::numeric_limits<float>::infinity();
-    for (const auto & kv : logits_id) {
-        maxl = Max(maxl, kv.first);
-    }
+    sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);

     // compute probs for the top k tokens
     std::vector<float> probs;
     probs.reserve(logits_id.size());

+    float maxl = logits_id[0].first;
     double sum = 0.0;
     for (const auto & kv : logits_id) {
         const float p = expf(kv.first - maxl);
@@ -1248,16 +1265,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
                 break;
             }
         }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
     }

     //printf("\n");
     //for (int i = 0; i < (int) 10; i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //    printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
     //}
     //printf("\n\n");
     //exit(0);
@@ -1608,7 +1620,7 @@ struct llama_context * llama_init_from_file(
     }

     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -1668,6 +1680,33 @@ int llama_model_quantize(
     return 0;
 }

+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+              const uint8_t * kv_cache,
+                      size_t   n_size,
+                         int   n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
                    int   itype);

+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+                  const uint8_t * kv_cache,
+                          size_t   n_size,
+                             int   n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
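Note: taken together, the four declarations above let a caller snapshot the KV cache and restore it later, e.g. to rewind a conversation. A hedged C++ sketch that uses only this API; ctx is assumed to come from llama_init_from_file, and the helper names are made up for illustration:

#include <cstddef>
#include <cstdint>
#include <vector>
#include "llama.h"

// Hypothetical helpers built on the functions declared above.
struct kv_snapshot {
    std::vector<uint8_t> data;
    int n_tokens;
};

static kv_snapshot save_kv(struct llama_context * ctx) {
    kv_snapshot snap;
    const uint8_t * buf = llama_get_kv_cache(ctx);
    snap.data.assign(buf, buf + llama_get_kv_cache_size(ctx));
    snap.n_tokens = llama_get_kv_cache_token_count(ctx);
    return snap;
}

static void restore_kv(struct llama_context * ctx, const kv_snapshot & snap) {
    // the context must have been created with the same parameters, so that the
    // cache buffer sizes match (llama_set_kv_cache asserts on this)
    llama_set_kv_cache(ctx, snap.data.data(), snap.data.size(), snap.n_tokens);
}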
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.2'
+  VERSION = '0.0.3'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-698f7b5'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -5,13 +5,16 @@ require_relative 'llama_cpp/llama_cpp'

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
+  # Class alias to match interface of whispercpp gem.
+  Params = ContextParams
+
   module_function

   # Generates sentences following the given prompt for operation check.
   #
   # @param context [LLaMACpp::Context]
   # @param prompt [String]
-  # @
+  # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
     prompt.insert(0, ' ')
data/sig/llama_cpp.rbs
ADDED
@@ -0,0 +1,52 @@
+module LLaMACpp
+  VERSION: String
+  LLAMA_CPP_VERSION: String
+  LLAMA_FILE_VERSION: String
+  LLAMA_FILE_MAGIC: String
+  LLAMA_FILE_MAGIC_UNVERSIONED: String
+
+  def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
+  def self?.print_system_info: () -> void
+  def self?.token_bos: () -> Integer
+  def self?.token_eos: () -> Integer
+
+  class Context
+    public
+
+    def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+    def embeddings: () -> Array[Float]
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+    def logits: () -> Array[Float]
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def n_vocab: () -> Integer
+    def print_timings: () -> void
+    def reset_timings: () -> void
+    def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+  end
+
+  class ContextParams
+    public
+
+    def embedding: () -> bool
+    def embedding=: (bool) -> bool
+    def f16_kv: () -> bool
+    def f16_kv=: (bool) -> bool
+    def logits_all: () -> bool
+    def logits_all=: (bool) -> bool
+    def n_ctx: () -> Integer
+    def n_ctx=: (Integer) -> Integer
+    def n_parts: () -> Integer
+    def n_parts=: (Integer) -> Integer
+    def seed: () -> Integer
+    def seed=: (Integer) -> Integer
+    def use_mlock: () -> bool
+    def use_mlock=: (bool) -> bool
+    def vocab_only: () -> bool
+    def vocab_only=: (bool) -> bool
+  end
+
+  class Params = ContextParams
+end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
+date: 2023-04-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -32,6 +32,7 @@ files:
 - ext/llama_cpp/src/llama.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
+- sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb
 licenses:
 - MIT