llama_cpp 0.3.4 → 0.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -199,6 +199,7 @@
|
|
199
199
|
#define GGML_MAX_CONTEXTS 64
|
200
200
|
#define GGML_MAX_SRC 6
|
201
201
|
#define GGML_MAX_NAME 48
|
202
|
+
#define GGML_MAX_OP_PARAMS 32
|
202
203
|
#define GGML_DEFAULT_N_THREADS 4
|
203
204
|
|
204
205
|
|
@@ -207,6 +208,7 @@
|
|
207
208
|
|
208
209
|
#define GGML_UNUSED(x) (void)(x)
|
209
210
|
|
211
|
+
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
|
210
212
|
|
211
213
|
#define GGML_ASSERT(x) \
|
212
214
|
do { \
|
@@ -329,16 +331,6 @@ extern "C" {
|
|
329
331
|
GGML_OP_ARGMAX,
|
330
332
|
GGML_OP_REPEAT,
|
331
333
|
GGML_OP_REPEAT_BACK,
|
332
|
-
GGML_OP_ABS,
|
333
|
-
GGML_OP_SGN,
|
334
|
-
GGML_OP_NEG,
|
335
|
-
GGML_OP_STEP,
|
336
|
-
GGML_OP_TANH,
|
337
|
-
GGML_OP_ELU,
|
338
|
-
GGML_OP_RELU,
|
339
|
-
GGML_OP_GELU,
|
340
|
-
GGML_OP_GELU_QUICK,
|
341
|
-
GGML_OP_SILU,
|
342
334
|
GGML_OP_SILU_BACK,
|
343
335
|
GGML_OP_NORM, // normalize
|
344
336
|
GGML_OP_RMS_NORM,
|
@@ -377,6 +369,8 @@ extern "C" {
|
|
377
369
|
GGML_OP_WIN_PART,
|
378
370
|
GGML_OP_WIN_UNPART,
|
379
371
|
|
372
|
+
GGML_OP_UNARY,
|
373
|
+
|
380
374
|
GGML_OP_MAP_UNARY,
|
381
375
|
GGML_OP_MAP_BINARY,
|
382
376
|
|
@@ -390,6 +384,24 @@ extern "C" {
|
|
390
384
|
GGML_OP_COUNT,
|
391
385
|
};
|
392
386
|
|
387
|
+
enum ggml_unary_op {
|
388
|
+
GGML_UNARY_OP_ABS,
|
389
|
+
GGML_UNARY_OP_SGN,
|
390
|
+
GGML_UNARY_OP_NEG,
|
391
|
+
GGML_UNARY_OP_STEP,
|
392
|
+
GGML_UNARY_OP_TANH,
|
393
|
+
GGML_UNARY_OP_ELU,
|
394
|
+
GGML_UNARY_OP_RELU,
|
395
|
+
GGML_UNARY_OP_GELU,
|
396
|
+
GGML_UNARY_OP_GELU_QUICK,
|
397
|
+
GGML_UNARY_OP_SILU,
|
398
|
+
};
|
399
|
+
|
400
|
+
enum ggml_object_type {
|
401
|
+
GGML_OBJECT_TENSOR,
|
402
|
+
GGML_OBJECT_GRAPH,
|
403
|
+
GGML_OBJECT_WORK_BUFFER
|
404
|
+
};
|
393
405
|
|
394
406
|
// ggml object
|
395
407
|
struct ggml_object {
|
@@ -398,7 +410,9 @@ extern "C" {
|
|
398
410
|
|
399
411
|
struct ggml_object * next;
|
400
412
|
|
401
|
-
|
413
|
+
enum ggml_object_type type;
|
414
|
+
|
415
|
+
char padding[4];
|
402
416
|
};
|
403
417
|
|
404
418
|
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
@@ -418,6 +432,9 @@ extern "C" {
|
|
418
432
|
// compute data
|
419
433
|
enum ggml_op op;
|
420
434
|
|
435
|
+
// op params - allocated as int32_t for alignment
|
436
|
+
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
437
|
+
|
421
438
|
bool is_param;
|
422
439
|
|
423
440
|
struct ggml_tensor * grad;
|
@@ -434,7 +451,7 @@ extern "C" {
|
|
434
451
|
|
435
452
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
436
453
|
|
437
|
-
char padding[
|
454
|
+
char padding[4];
|
438
455
|
};
|
439
456
|
|
440
457
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
@@ -455,6 +472,11 @@ extern "C" {
|
|
455
472
|
void * abort_callback_data;
|
456
473
|
};
|
457
474
|
|
475
|
+
// next prime after GGML_MAX_NODES
|
476
|
+
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
477
|
+
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
478
|
+
#define GGML_GRAPH_HASHTABLE_SIZE 8273
|
479
|
+
|
458
480
|
// computation graph
|
459
481
|
struct ggml_cgraph {
|
460
482
|
int n_nodes;
|
@@ -464,12 +486,16 @@ extern "C" {
|
|
464
486
|
struct ggml_tensor * grads[GGML_MAX_NODES];
|
465
487
|
struct ggml_tensor * leafs[GGML_MAX_NODES];
|
466
488
|
|
489
|
+
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
490
|
+
|
467
491
|
// performance
|
468
492
|
int perf_runs;
|
469
493
|
int64_t perf_cycles;
|
470
494
|
int64_t perf_time_us;
|
471
495
|
};
|
472
496
|
|
497
|
+
static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
|
498
|
+
|
473
499
|
// scratch buffer
|
474
500
|
struct ggml_scratch {
|
475
501
|
size_t offs;
|
@@ -531,6 +557,7 @@ extern "C" {
|
|
531
557
|
|
532
558
|
GGML_API const char * ggml_type_name(enum ggml_type type);
|
533
559
|
GGML_API const char * ggml_op_name (enum ggml_op op);
|
560
|
+
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
534
561
|
|
535
562
|
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
536
563
|
|
@@ -554,6 +581,7 @@ extern "C" {
|
|
554
581
|
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
555
582
|
|
556
583
|
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
584
|
+
GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
|
557
585
|
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
558
586
|
|
559
587
|
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
|
@@ -613,9 +641,11 @@ extern "C" {
|
|
613
641
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
614
642
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
615
643
|
|
616
|
-
GGML_API
|
617
|
-
|
618
|
-
GGML_API
|
644
|
+
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
645
|
+
|
646
|
+
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
647
|
+
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
648
|
+
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
619
649
|
|
620
650
|
//
|
621
651
|
// operations on tensors with backpropagation
|
@@ -625,6 +655,11 @@ extern "C" {
|
|
625
655
|
struct ggml_context * ctx,
|
626
656
|
struct ggml_tensor * a);
|
627
657
|
|
658
|
+
// in-place, returns view(a)
|
659
|
+
GGML_API struct ggml_tensor * ggml_dup_inplace(
|
660
|
+
struct ggml_context * ctx,
|
661
|
+
struct ggml_tensor * a);
|
662
|
+
|
628
663
|
GGML_API struct ggml_tensor * ggml_add(
|
629
664
|
struct ggml_context * ctx,
|
630
665
|
struct ggml_tensor * a,
|
@@ -849,14 +884,17 @@ extern "C" {
|
|
849
884
|
|
850
885
|
GGML_API struct ggml_tensor * ggml_rms_norm(
|
851
886
|
struct ggml_context * ctx,
|
852
|
-
struct ggml_tensor * a);
|
887
|
+
struct ggml_tensor * a,
|
888
|
+
float eps);
|
853
889
|
|
854
890
|
GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
|
855
891
|
struct ggml_context * ctx,
|
856
|
-
struct ggml_tensor * a);
|
892
|
+
struct ggml_tensor * a,
|
893
|
+
float eps);
|
857
894
|
|
858
895
|
// a - x
|
859
896
|
// b - dy
|
897
|
+
// TODO: update with configurable eps
|
860
898
|
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
861
899
|
struct ggml_context * ctx,
|
862
900
|
struct ggml_tensor * a,
|
@@ -948,11 +986,22 @@ extern "C" {
|
|
948
986
|
struct ggml_tensor * a,
|
949
987
|
struct ggml_tensor * b);
|
950
988
|
|
989
|
+
// a -> b, in-place, return view(b)
|
990
|
+
GGML_API struct ggml_tensor * ggml_cpy_inplace(
|
991
|
+
struct ggml_context * ctx,
|
992
|
+
struct ggml_tensor * a,
|
993
|
+
struct ggml_tensor * b);
|
994
|
+
|
951
995
|
// make contiguous
|
952
996
|
GGML_API struct ggml_tensor * ggml_cont(
|
953
997
|
struct ggml_context * ctx,
|
954
998
|
struct ggml_tensor * a);
|
955
999
|
|
1000
|
+
// make contiguous, in-place
|
1001
|
+
GGML_API struct ggml_tensor * ggml_cont_inplace(
|
1002
|
+
struct ggml_context * ctx,
|
1003
|
+
struct ggml_tensor * a);
|
1004
|
+
|
956
1005
|
// return view(a), b specifies the new shape
|
957
1006
|
// TODO: when we start computing gradient, make a copy instead of view
|
958
1007
|
GGML_API struct ggml_tensor * ggml_reshape(
|
@@ -1121,7 +1170,18 @@ extern "C" {
|
|
1121
1170
|
int mode,
|
1122
1171
|
int n_ctx);
|
1123
1172
|
|
1124
|
-
// custom RoPE, in-place, returns view(a)
|
1173
|
+
// custom RoPE
|
1174
|
+
GGML_API struct ggml_tensor * ggml_rope_custom(
|
1175
|
+
struct ggml_context * ctx,
|
1176
|
+
struct ggml_tensor * a,
|
1177
|
+
int n_past,
|
1178
|
+
int n_dims,
|
1179
|
+
int mode,
|
1180
|
+
int n_ctx,
|
1181
|
+
float freq_base,
|
1182
|
+
float freq_scale);
|
1183
|
+
|
1184
|
+
// in-place, returns view(a)
|
1125
1185
|
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1126
1186
|
struct ggml_context * ctx,
|
1127
1187
|
struct ggml_tensor * a,
|
@@ -1264,6 +1324,16 @@ extern "C" {
|
|
1264
1324
|
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
1265
1325
|
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
1266
1326
|
|
1327
|
+
GGML_API struct ggml_tensor * ggml_unary(
|
1328
|
+
struct ggml_context * ctx,
|
1329
|
+
struct ggml_tensor * a,
|
1330
|
+
enum ggml_unary_op op);
|
1331
|
+
|
1332
|
+
GGML_API struct ggml_tensor * ggml_unary_inplace(
|
1333
|
+
struct ggml_context * ctx,
|
1334
|
+
struct ggml_tensor * a,
|
1335
|
+
enum ggml_unary_op op);
|
1336
|
+
|
1267
1337
|
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
1268
1338
|
struct ggml_context * ctx,
|
1269
1339
|
struct ggml_tensor * a,
|
@@ -1343,11 +1413,17 @@ extern "C" {
|
|
1343
1413
|
struct ggml_context * ctx,
|
1344
1414
|
struct ggml_tensor * tensor);
|
1345
1415
|
|
1416
|
+
|
1346
1417
|
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1347
1418
|
|
1348
1419
|
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
1349
1420
|
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
1350
1421
|
|
1422
|
+
// graph allocation in a context
|
1423
|
+
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
|
1424
|
+
GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
1425
|
+
GGML_API size_t ggml_graph_overhead(void);
|
1426
|
+
|
1351
1427
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
1352
1428
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
1353
1429
|
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|