llama_cpp 0.3.8 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/examples/chat.rb +4 -6
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +129 -124
- data/ext/llama_cpp/src/ggml-alloc.c +90 -113
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +350 -77
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +226 -121
- data/ext/llama_cpp/src/ggml-metal.metal +157 -35
- data/ext/llama_cpp/src/ggml.c +2724 -584
- data/ext/llama_cpp/src/ggml.h +282 -31
- data/ext/llama_cpp/src/k_quants.c +112 -56
- data/ext/llama_cpp/src/llama.cpp +4857 -2986
- data/ext/llama_cpp/src/llama.h +180 -126
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +12 -11
- metadata +2 -2
@@ -8,6 +8,7 @@
|
|
8
8
|
|
9
9
|
#define UNUSED(x) (void)(x)
|
10
10
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
11
|
+
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
|
11
12
|
|
12
13
|
//#define GGML_ALLOCATOR_DEBUG
|
13
14
|
|
@@ -67,8 +68,8 @@ struct ggml_allocr {
|
|
67
68
|
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
68
69
|
size_t max_size;
|
69
70
|
bool measure;
|
70
|
-
int parse_seq[
|
71
|
-
|
71
|
+
int parse_seq[GGML_MAX_CONCUR];
|
72
|
+
int parse_seq_len;
|
72
73
|
|
73
74
|
#ifdef GGML_ALLOCATOR_DEBUG
|
74
75
|
struct ggml_tensor * allocated_tensors[1024];
|
@@ -76,7 +77,7 @@ struct ggml_allocr {
|
|
76
77
|
};
|
77
78
|
|
78
79
|
#ifdef GGML_ALLOCATOR_DEBUG
|
79
|
-
static void add_allocated_tensor(struct
|
80
|
+
static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
80
81
|
for (int i = 0; i < 1024; i++) {
|
81
82
|
if (alloc->allocated_tensors[i] == NULL) {
|
82
83
|
alloc->allocated_tensors[i] = tensor;
|
@@ -85,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
|
|
85
86
|
}
|
86
87
|
GGML_ASSERT(!"out of allocated_tensors");
|
87
88
|
}
|
88
|
-
static void remove_allocated_tensor(struct
|
89
|
+
static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
89
90
|
for (int i = 0; i < 1024; i++) {
|
90
91
|
if (alloc->allocated_tensors[i] == tensor ||
|
91
92
|
(alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
|
@@ -106,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g
|
|
106
107
|
}
|
107
108
|
|
108
109
|
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
110
|
+
#ifdef GGML_ALLOCATOR_DEBUG
|
111
|
+
GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
|
112
|
+
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
113
|
+
#endif
|
109
114
|
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
110
115
|
size = aligned_offset(NULL, size, alloc->alignment);
|
111
116
|
|
@@ -238,15 +243,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
|
|
238
243
|
alloc->n_free_blocks++;
|
239
244
|
}
|
240
245
|
|
241
|
-
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
|
242
|
-
int pos = 0;
|
246
|
+
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
|
243
247
|
for (int i = 0; i < n; i++) {
|
244
|
-
|
245
|
-
alloc->parse_seq[pos] = list[i];
|
246
|
-
pos++;
|
247
|
-
}
|
248
|
+
alloc->parse_seq[i] = list[i];
|
248
249
|
}
|
249
|
-
alloc->
|
250
|
+
alloc->parse_seq_len = n;
|
250
251
|
}
|
251
252
|
|
252
253
|
void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
@@ -269,9 +270,9 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
|
269
270
|
/*.max_size = */ 0,
|
270
271
|
/*.measure = */ false,
|
271
272
|
/*.parse_seq = */ {0},
|
272
|
-
/*.
|
273
|
+
/*.parse_seq_len = */ 0,
|
273
274
|
#ifdef GGML_ALLOCATOR_DEBUG
|
274
|
-
/*.allocated_tensors = */
|
275
|
+
/*.allocated_tensors = */ {0},
|
275
276
|
#endif
|
276
277
|
};
|
277
278
|
|
@@ -298,9 +299,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
|
298
299
|
/*.max_size = */ 0,
|
299
300
|
/*.measure = */ true,
|
300
301
|
/*.parse_seq = */ {0},
|
301
|
-
/*.
|
302
|
+
/*.parse_seq_len = */ 0,
|
302
303
|
#ifdef GGML_ALLOCATOR_DEBUG
|
303
|
-
/*.allocated_tensors = */
|
304
|
+
/*.allocated_tensors = */ {0},
|
304
305
|
#endif
|
305
306
|
};
|
306
307
|
|
@@ -320,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
|
320
321
|
//////////// compute graph allocator
|
321
322
|
|
322
323
|
static bool ggml_is_view(struct ggml_tensor * t) {
|
323
|
-
return t->
|
324
|
-
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
|
324
|
+
return t->view_src != NULL;
|
325
325
|
}
|
326
326
|
|
327
327
|
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
@@ -339,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
|
|
339
339
|
return true;
|
340
340
|
}
|
341
341
|
|
342
|
-
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
|
343
|
-
switch (t->op) {
|
344
|
-
case GGML_OP_PERMUTE:
|
345
|
-
case GGML_OP_RESHAPE:
|
346
|
-
case GGML_OP_TRANSPOSE:
|
347
|
-
case GGML_OP_VIEW:
|
348
|
-
return t->src[0];
|
349
|
-
case GGML_OP_CPY:
|
350
|
-
return t->src[1];
|
351
|
-
default:
|
352
|
-
return NULL;
|
353
|
-
}
|
354
|
-
}
|
355
|
-
|
356
|
-
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
|
357
|
-
struct ggml_tensor * parent = t;
|
358
|
-
do {
|
359
|
-
parent = get_view_parent(parent);
|
360
|
-
} while (ggml_is_view(parent));
|
361
|
-
return parent;
|
362
|
-
}
|
363
|
-
|
364
342
|
static bool ggml_op_can_inplace(enum ggml_op op) {
|
365
343
|
switch (op) {
|
366
344
|
case GGML_OP_SCALE:
|
@@ -368,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
|
368
346
|
case GGML_OP_DIAG_MASK_INF:
|
369
347
|
case GGML_OP_ADD:
|
370
348
|
case GGML_OP_ADD1:
|
371
|
-
case GGML_OP_ACC:
|
372
349
|
case GGML_OP_SUB:
|
373
350
|
case GGML_OP_MUL:
|
374
351
|
case GGML_OP_DIV:
|
@@ -378,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
|
378
355
|
case GGML_OP_UNARY:
|
379
356
|
case GGML_OP_ROPE:
|
380
357
|
case GGML_OP_RMS_NORM:
|
381
|
-
case GGML_OP_SET:
|
382
358
|
case GGML_OP_SOFT_MAX:
|
383
359
|
case GGML_OP_CONT:
|
384
360
|
return true;
|
@@ -392,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|
392
368
|
struct hash_node * ht = alloc->hash_table;
|
393
369
|
if (node->data == NULL) {
|
394
370
|
if (ggml_is_view(node)) {
|
395
|
-
|
396
|
-
|
397
|
-
case GGML_OP_VIEW:
|
398
|
-
memcpy(&offset, node->op_params, sizeof(size_t));
|
399
|
-
node->data = (char *) node->src[0]->data + offset;
|
400
|
-
break;
|
401
|
-
case GGML_OP_PERMUTE:
|
402
|
-
case GGML_OP_RESHAPE:
|
403
|
-
case GGML_OP_TRANSPOSE:
|
404
|
-
node->data = node->src[0]->data;
|
405
|
-
break;
|
406
|
-
case GGML_OP_CPY:
|
407
|
-
node->data = node->src[1]->data;
|
408
|
-
break;
|
409
|
-
default:
|
410
|
-
GGML_ASSERT(!"unknown view op");
|
411
|
-
break;
|
412
|
-
}
|
371
|
+
assert(node->view_src->data != NULL);
|
372
|
+
node->data = (char *)node->view_src->data + node->view_offs;
|
413
373
|
} else {
|
414
374
|
// see if we can reuse a parent's buffer (inplace)
|
415
375
|
if (ggml_op_can_inplace(node->op)) {
|
@@ -429,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|
429
389
|
struct hash_node * p_hn = hash_get(ht, parent);
|
430
390
|
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
|
431
391
|
if (ggml_is_view(parent)) {
|
432
|
-
struct ggml_tensor * view_src =
|
392
|
+
struct ggml_tensor * view_src = parent->view_src;
|
433
393
|
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
434
394
|
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
435
395
|
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
|
@@ -445,8 +405,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|
445
405
|
else {
|
446
406
|
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
447
407
|
node->data = parent->data;
|
408
|
+
return;
|
448
409
|
}
|
449
|
-
return;
|
450
410
|
}
|
451
411
|
}
|
452
412
|
}
|
@@ -471,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|
471
431
|
struct ggml_tensor * node = gf->nodes[i];
|
472
432
|
|
473
433
|
if (ggml_is_view(node)) {
|
474
|
-
struct ggml_tensor * view_src =
|
434
|
+
struct ggml_tensor * view_src = node->view_src;
|
475
435
|
hash_get(ht, view_src)->n_views += 1;
|
476
436
|
}
|
477
437
|
|
@@ -497,69 +457,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|
497
457
|
allocate_node(alloc, input);
|
498
458
|
}
|
499
459
|
}
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
460
|
+
// if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
|
461
|
+
int last_barrier_pos = 0;
|
462
|
+
int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
|
463
|
+
|
464
|
+
for (int ind = 0; ind < n_nodes; ind++) {
|
465
|
+
// allocate a node if there is no parse_seq or this is not a barrier
|
466
|
+
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
|
467
|
+
int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
|
468
|
+
struct ggml_tensor * node = gf->nodes[i];
|
469
|
+
|
470
|
+
// allocate parents (leafs)
|
471
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
472
|
+
struct ggml_tensor * parent = node->src[j];
|
473
|
+
if (parent == NULL) {
|
474
|
+
break;
|
475
|
+
}
|
476
|
+
allocate_node(alloc, parent);
|
514
477
|
}
|
515
|
-
allocate_node(alloc, parent);
|
516
|
-
}
|
517
478
|
|
518
|
-
|
519
|
-
|
479
|
+
// allocate node
|
480
|
+
allocate_node(alloc, node);
|
520
481
|
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
482
|
+
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
483
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
484
|
+
struct ggml_tensor * parent = node->src[j];
|
485
|
+
if (parent == NULL) {
|
486
|
+
break;
|
487
|
+
}
|
488
|
+
AT_PRINTF("%s", parent->name);
|
489
|
+
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
490
|
+
AT_PRINTF(", ");
|
491
|
+
}
|
530
492
|
}
|
493
|
+
AT_PRINTF("\n");
|
531
494
|
}
|
532
|
-
|
495
|
+
|
533
496
|
|
534
497
|
// update parents
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
549
|
-
view_src_hn->n_views -= 1;
|
550
|
-
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
|
551
|
-
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
552
|
-
ggml_allocator_free_tensor(alloc, view_src);
|
498
|
+
// update immediately if there is no parse_seq
|
499
|
+
// update only at barriers if there is parse_seq
|
500
|
+
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
|
501
|
+
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
|
502
|
+
int update_end = alloc->parse_seq_len ? ind : ind + 1;
|
503
|
+
for (int i = update_start; i < update_end; i++) {
|
504
|
+
int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
|
505
|
+
struct ggml_tensor * node = gf->nodes[node_i];
|
506
|
+
|
507
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
508
|
+
struct ggml_tensor * parent = node->src[j];
|
509
|
+
if (parent == NULL) {
|
510
|
+
break;
|
553
511
|
}
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
512
|
+
struct hash_node * p_hn = hash_get(ht, parent);
|
513
|
+
p_hn->n_children -= 1;
|
514
|
+
|
515
|
+
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
516
|
+
|
517
|
+
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
518
|
+
if (ggml_is_view(parent)) {
|
519
|
+
struct ggml_tensor * view_src = parent->view_src;
|
520
|
+
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
521
|
+
view_src_hn->n_views -= 1;
|
522
|
+
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
523
|
+
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
524
|
+
ggml_allocator_free_tensor(alloc, view_src);
|
525
|
+
}
|
526
|
+
}
|
527
|
+
else {
|
528
|
+
if (parent->data != node->data) {
|
529
|
+
ggml_allocator_free_tensor(alloc, parent);
|
530
|
+
}
|
531
|
+
}
|
558
532
|
}
|
559
533
|
}
|
560
534
|
}
|
535
|
+
AT_PRINTF("\n");
|
536
|
+
if (alloc->parse_seq_len) {
|
537
|
+
last_barrier_pos = ind + 1;
|
538
|
+
}
|
561
539
|
}
|
562
|
-
AT_PRINTF("\n");
|
563
540
|
}
|
564
541
|
// free graph outputs here that wouldn't be freed otherwise because they have no children
|
565
542
|
if (outputs != NULL && outputs[g] != NULL) {
|
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
|
12
12
|
|
13
13
|
// tell the allocator to parse nodes following the order described in the list
|
14
14
|
// you should call this if your graph is optimized to execute out-of-order
|
15
|
-
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
|
15
|
+
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
|
16
16
|
|
17
17
|
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
|
18
18
|
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|