llama_cpp 0.3.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/examples/chat.rb +4 -6
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +129 -124
- data/ext/llama_cpp/src/ggml-alloc.c +90 -113
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +350 -77
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +226 -121
- data/ext/llama_cpp/src/ggml-metal.metal +157 -35
- data/ext/llama_cpp/src/ggml.c +2724 -584
- data/ext/llama_cpp/src/ggml.h +282 -31
- data/ext/llama_cpp/src/k_quants.c +112 -56
- data/ext/llama_cpp/src/llama.cpp +4857 -2986
- data/ext/llama_cpp/src/llama.h +180 -126
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +12 -11
- metadata +2 -2
--- a/data/ext/llama_cpp/src/ggml-alloc.c
+++ b/data/ext/llama_cpp/src/ggml-alloc.c
@@ -8,6 +8,7 @@
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
 //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,8 +68,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
-    int parse_seq[GGML_MAX_NODES];
-    bool has_parse_seq;
+    int parse_seq[GGML_MAX_CONCUR];
+    int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -76,7 +77,7 @@ struct ggml_allocr {
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == NULL) {
             alloc->allocated_tensors[i] = tensor;
@@ -85,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == tensor ||
             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -106,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+#ifdef GGML_ALLOCATOR_DEBUG
+    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+#endif
     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
@@ -238,15 +243,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     alloc->n_free_blocks++;
 }
 
-void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
-    int pos = 0;
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
     for (int i = 0; i < n; i++) {
-        if (list[i] != -1) {
-            alloc->parse_seq[pos] = list[i];
-            pos++;
-        }
+        alloc->parse_seq[i] = list[i];
     }
-    alloc->has_parse_seq = true;
+    alloc->parse_seq_len = n;
 }
 
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
@@ -269,9 +270,9 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
         /*.max_size = */ 0,
         /*.measure = */ false,
         /*.parse_seq = */ {0},
-        /*.has_parse_seq = */ false,
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ = {0},
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -298,9 +299,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.max_size = */ 0,
         /*.measure = */ true,
         /*.parse_seq = */ {0},
-        /*.has_parse_seq = */ false,
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ = {0},
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -320,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 //////////// compute graph allocator
 
 static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
-           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+    return t->view_src != NULL;
 }
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -339,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     return true;
 }
 
-static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
-    switch (t->op) {
-        case GGML_OP_PERMUTE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_VIEW:
-            return t->src[0];
-        case GGML_OP_CPY:
-            return t->src[1];
-        default:
-            return NULL;
-    }
-}
-
-static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
-    struct ggml_tensor * parent = t;
-    do {
-        parent = get_view_parent(parent);
-    } while (ggml_is_view(parent));
-    return parent;
-}
-
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
@@ -368,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
-        case GGML_OP_ACC:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -378,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_UNARY:
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SET:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_CONT:
             return true;
@@ -392,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
        if (ggml_is_view(node)) {
-            size_t offset;
-            switch(node->op) {
-                case GGML_OP_VIEW:
-                    memcpy(&offset, node->op_params, sizeof(size_t));
-                    node->data = (char *) node->src[0]->data + offset;
-                    break;
-                case GGML_OP_PERMUTE:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_TRANSPOSE:
-                    node->data = node->src[0]->data;
-                    break;
-                case GGML_OP_CPY:
-                    node->data = node->src[1]->data;
-                    break;
-                default:
-                    GGML_ASSERT(!"unknown view op");
-                    break;
-            }
+            assert(node->view_src->data != NULL);
+            node->data = (char *)node->view_src->data + node->view_offs;
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -429,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
                     struct hash_node * p_hn = hash_get(ht, parent);
                     if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                         if (ggml_is_view(parent)) {
-                            struct ggml_tensor * view_src = get_view_source(parent);
+                            struct ggml_tensor * view_src = parent->view_src;
                             struct hash_node * view_src_hn = hash_get(ht, view_src);
                             if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                                 // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -445,8 +405,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
                         else {
                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                             node->data = parent->data;
+                            return;
                         }
-                        return;
                     }
                 }
             }
@@ -471,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             struct ggml_tensor * node = gf->nodes[i];
 
             if (ggml_is_view(node)) {
-                struct ggml_tensor * view_src = get_view_source(node);
+                struct ggml_tensor * view_src = node->view_src;
                 hash_get(ht, view_src)->n_views += 1;
             }
 
@@ -497,69 +457,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 allocate_node(alloc, input);
             }
         }
-        for (int ind = 0; ind < gf->n_nodes; ind++) {
-            int i;
-            if (alloc->has_parse_seq) {
-                i = alloc->parse_seq[ind];
-            } else {
-                i = ind;
-            }
-            struct ggml_tensor * node = gf->nodes[i];
-
-            // allocate parents (leafs)
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+        int last_barrier_pos = 0;
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+        for (int ind = 0; ind < n_nodes; ind++) {
+            // allocate a node if there is no parse_seq or this is not a barrier
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                struct ggml_tensor * node = gf->nodes[i];
+
+                // allocate parents (leafs)
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    allocate_node(alloc, parent);
                 }
-                allocate_node(alloc, parent);
-            }
 
-            // allocate node
-            allocate_node(alloc, node);
+                // allocate node
+                allocate_node(alloc, node);
 
-            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                AT_PRINTF("%s", parent->name);
-                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                    AT_PRINTF(", ");
+                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    AT_PRINTF("%s", parent->name);
+                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                        AT_PRINTF(", ");
+                    }
                 }
+                AT_PRINTF("\n");
             }
-            AT_PRINTF("\n");
+
 
             // update parents
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                struct hash_node * p_hn = hash_get(ht, parent);
-                p_hn->n_children -= 1;
-
-                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                    if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
-                        struct hash_node * view_src_hn = hash_get(ht, view_src);
-                        view_src_hn->n_views -= 1;
-                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
-                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, view_src);
+            // update immediately if there is no parse_seq
+            // update only at barriers if there is parse_seq
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                int update_end = alloc->parse_seq_len ? ind : ind + 1;
+                for (int i = update_start; i < update_end; i++) {
+                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                    struct ggml_tensor * node = gf->nodes[node_i];
+
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * parent = node->src[j];
+                        if (parent == NULL) {
+                            break;
                         }
+                        struct hash_node * p_hn = hash_get(ht, parent);
+                        p_hn->n_children -= 1;
+
+                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                            if (ggml_is_view(parent)) {
+                                struct ggml_tensor * view_src = parent->view_src;
+                                struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                view_src_hn->n_views -= 1;
+                                AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, view_src);
+                                }
+                            }
+                            else {
+                                if (parent->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, parent);
+                                }
+                            }
                         }
                     }
-                    else {
-                        if (parent->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, parent);
-                        }
-                    }
                 }
+                AT_PRINTF("\n");
+                if (alloc->parse_seq_len) {
+                    last_barrier_pos = ind + 1;
+                }
             }
-            AT_PRINTF("\n");
         }
         // free graph outputs here that wouldn't be freed otherwise because they have no children
         if (outputs != NULL && outputs[g] != NULL) {
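A note on the recurring get_view_source(t) → t->view_src change above: ggml now records a view's ultimate source tensor (view_src) and byte offset (view_offs) when the view op is created, so the allocator resolves a view's data pointer directly instead of walking the op chain with the deleted get_view_parent/get_view_source helpers. A minimal self-contained sketch of the new lookup, using a toy struct rather than the real ggml_tensor:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

// Toy stand-in for ggml_tensor (not the real ggml type): a view
// carries its ultimate source and byte offset from creation time.
struct toy_tensor {
    struct toy_tensor * view_src;  // NULL for non-view tensors
    size_t              view_offs; // byte offset into view_src->data
    void              * data;
};

// Same test the new ggml_is_view() performs.
static int toy_is_view(const struct toy_tensor * t) {
    return t->view_src != NULL;
}

int main(void) {
    char buf[64] = {0};
    struct toy_tensor base = { NULL, 0, buf };
    struct toy_tensor view = { &base, 16, NULL };

    // One-line resolution, mirroring the new allocate_node() path:
    if (toy_is_view(&view)) {
        assert(view.view_src->data != NULL);
        view.data = (char *) view.view_src->data + view.view_offs;
    }
    printf("view resolves %zu bytes into its source\n",
           (size_t) ((char *) view.data - buf));
    return 0;
}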
--- a/data/ext/llama_cpp/src/ggml-alloc.h
+++ b/data/ext/llama_cpp/src/ggml-alloc.h
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);