llama_cpp 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -199,6 +199,7 @@
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 48
+#define GGML_MAX_OP_PARAMS 32
 #define GGML_DEFAULT_N_THREADS 4
 
 
@@ -207,6 +208,7 @@
 
 #define GGML_UNUSED(x) (void)(x)
 
+#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
 #define GGML_ASSERT(x) \
     do { \
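
GGML_PAD rounds `x` up to the next multiple of `n`; the bit-mask form requires `n` to be a power of two. A few worked values:

    GGML_PAD(13, 16)  // == 16: (13 + 15) & ~15
    GGML_PAD(32, 16)  // == 32: already aligned, unchanged
    GGML_PAD(33, 16)  // == 48: rounds up to the next 16-byte boundary
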
@@ -329,16 +331,6 @@ extern "C" {
         GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
-        GGML_OP_ABS,
-        GGML_OP_SGN,
-        GGML_OP_NEG,
-        GGML_OP_STEP,
-        GGML_OP_TANH,
-        GGML_OP_ELU,
-        GGML_OP_RELU,
-        GGML_OP_GELU,
-        GGML_OP_GELU_QUICK,
-        GGML_OP_SILU,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
@@ -377,6 +369,8 @@ extern "C" {
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
 
+        GGML_OP_UNARY,
+
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
 
@@ -390,6 +384,24 @@ extern "C" {
         GGML_OP_COUNT,
     };
 
+    enum ggml_unary_op {
+        GGML_UNARY_OP_ABS,
+        GGML_UNARY_OP_SGN,
+        GGML_UNARY_OP_NEG,
+        GGML_UNARY_OP_STEP,
+        GGML_UNARY_OP_TANH,
+        GGML_UNARY_OP_ELU,
+        GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_GELU,
+        GGML_UNARY_OP_GELU_QUICK,
+        GGML_UNARY_OP_SILU,
+    };
+
+    enum ggml_object_type {
+        GGML_OBJECT_TENSOR,
+        GGML_OBJECT_GRAPH,
+        GGML_OBJECT_WORK_BUFFER
+    };
+
     // ggml object
     struct ggml_object {
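
The ten activation entries removed from `ggml_op` earlier in this diff return here as values of the new `ggml_unary_op` enum; they are dispatched through the single `GGML_OP_UNARY` node added above. A usage sketch accompanies the `ggml_unary` declarations further down.
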
@@ -398,7 +410,9 @@ extern "C" {
 
         struct ggml_object * next;
 
-        char padding[8];
+        enum ggml_object_type type;
+
+        char padding[4];
     };
 
     static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
@@ -418,6 +432,9 @@ extern "C" {
         // compute data
         enum ggml_op op;
 
+        // op params - allocated as int32_t for alignment
+        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+
         bool is_param;
 
         struct ggml_tensor * grad;
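
The `op_params` block lets an operator carry small scalar arguments (for example, the `eps` that `ggml_rms_norm` now takes) inside the tensor itself instead of materializing them as extra source tensors. A hypothetical sketch of the access pattern; the helper names are illustrative, not the actual ggml.c internals:

    // store/load a float in a tensor's op_params (assumes <string.h> for memcpy);
    // GGML_MAX_OP_PARAMS is 32 bytes, i.e. eight int32_t-aligned slots
    static void set_op_param_f32(struct ggml_tensor * t, int i, float v) {
        GGML_ASSERT((i + 1) * sizeof(float) <= GGML_MAX_OP_PARAMS);
        memcpy(&t->op_params[i], &v, sizeof(v)); // type-pun via memcpy, not a cast
    }

    static float get_op_param_f32(const struct ggml_tensor * t, int i) {
        float v;
        memcpy(&v, &t->op_params[i], sizeof(v));
        return v;
    }
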
@@ -434,7 +451,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[8];
+        char padding[4];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -455,6 +472,11 @@ extern "C" {
         void * abort_callback_data;
     };
 
+    // next prime after GGML_MAX_NODES
+    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
+    // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
+    #define GGML_GRAPH_HASHTABLE_SIZE 8273
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -464,12 +486,16 @@ extern "C" {
         struct ggml_tensor * grads[GGML_MAX_NODES];
         struct ggml_tensor * leafs[GGML_MAX_NODES];
 
+        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
         // performance
         int perf_runs;
         int64_t perf_cycles;
         int64_t perf_time_us;
     };
 
+    static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
+
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
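
The `visited_hash_table` gives graph construction an O(1) membership test over already-expanded tensors instead of a linear scan of `nodes`/`leafs`; the table size is a prime (8273) to spread pointer hashes. A hypothetical sketch of the idea, not ggml's exact code:

    // linear-probing visited set over tensor pointers (illustrative only)
    static bool visited_insert(void * table[], const void * p) {
        size_t h = ((size_t) p >> 4) % GGML_GRAPH_HASHTABLE_SIZE; // drop alignment bits
        while (table[h] != NULL) {
            if (table[h] == p) return true;                       // seen before
            h = (h + 1) % GGML_GRAPH_HASHTABLE_SIZE;              // probe next slot
        }
        table[h] = (void *) p;                                    // first visit
        return false;
    }
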
@@ -531,6 +557,7 @@ extern "C" {
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -554,6 +581,7 @@ extern "C" {
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API bool   ggml_get_no_alloc(struct ggml_context * ctx);
     GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
     GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
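
The new getter makes a save/restore pattern possible around code that must build tensor metadata without allocating data buffers. A minimal sketch, assuming an existing context `ctx`:

    // build a tensor's metadata only, then restore the caller's setting
    bool was_no_alloc = ggml_get_no_alloc(ctx);
    ggml_set_no_alloc(ctx, true);
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024); // no data allocated
    ggml_set_no_alloc(ctx, was_no_alloc);
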
@@ -613,9 +641,11 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
-
-    GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_name   (struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
 
     //
     // operations on tensors with backpropagation
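
`ggml_format_name` is a printf-style variant of `ggml_set_name`, and both return the tensor so naming can be chained inline. A small sketch; `il`, `n_in`, and `n_out` are illustrative:

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out);
    ggml_format_name(w, "layers.%d.w", il); // name is truncated to GGML_MAX_NAME
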
@@ -625,6 +655,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_dup_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_add(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -849,14 +884,17 @@ extern "C" {
 
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     // a - x
     // b - dy
+    // TODO: update with configurable eps
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
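
This is a breaking signature change: callers that relied on the previously hard-coded epsilon must now pass it explicitly. A migration sketch; 1e-6f was the old built-in value, but the right constant is model-dependent:

    // before: cur = ggml_rms_norm(ctx, cur);
    cur = ggml_rms_norm(ctx, cur, 1e-6f); // eps is now a required argument
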
@@ -948,11 +986,22 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // a -> b, in-place, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // make contiguous, in-place
+    GGML_API struct ggml_tensor * ggml_cont_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // return view(a), b specifies the new shape
     // TODO: when we start computing gradient, make a copy instead of view
     GGML_API struct ggml_tensor * ggml_reshape(
@@ -1121,7 +1170,18 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
 
-    // custom RoPE
+    // custom RoPE
+    GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale);
+
+    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
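
`ggml_rope_custom` exposes the two knobs used by extended-context RoPE variants: `freq_base` (10000.0f reproduces standard RoPE) and `freq_scale` (1.0f means no position scaling). A sketch of linear position interpolation for a doubled context; the surrounding variables are illustrative:

    // halve the rotation rate so twice the positions fit the trained range
    cur = ggml_rope_custom(ctx, cur, n_past, n_rot, 0 /* mode */, n_ctx,
                           10000.0f /* freq_base */, 0.5f /* freq_scale */);
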
@@ -1264,6 +1324,16 @@ extern "C" {
     typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
     typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
 
+    GGML_API struct ggml_tensor * ggml_unary(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_unary_op    op);
+
+    GGML_API struct ggml_tensor * ggml_unary_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_unary_op    op);
+
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
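
With the consolidation, one graph node type covers all simple activations, and `ggml_get_unary_op` recovers which one a node applies. Assuming the existing convenience wrappers (e.g. `ggml_relu`) now route through this op, the two calls below should build equivalent nodes:

    struct ggml_tensor * t0 = ggml_relu (ctx, x);
    struct ggml_tensor * t1 = ggml_unary(ctx, x, GGML_UNARY_OP_RELU);
    GGML_ASSERT(ggml_get_unary_op(t1) == GGML_UNARY_OP_RELU);
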
@@ -1343,11 +1413,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * tensor);
 
+
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
+    // graph allocation in a context
+    GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx);
+    GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_graph_overhead(void);
+
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
     GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
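
`ggml_new_graph` places the cgraph (now noticeably larger because of the visited hash table) inside a ggml context instead of on the stack, and `ggml_graph_overhead` reports how much context memory to budget for it. A minimal end-to-end sketch, assuming a built output tensor `f`:

    struct ggml_cgraph * gf = ggml_new_graph(ctx);   // lives in ctx, not on the stack
    ggml_build_forward_expand(gf, f);

    struct ggml_cplan plan = ggml_graph_plan(gf, GGML_DEFAULT_N_THREADS);
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size);     // caller owns the work buffer
    }
    ggml_graph_compute(gf, &plan);
    free(plan.work_data);                            // NULL-safe when work_size == 0
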