llama_cpp 0.3.7 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@
8
8
 
9
9
  #define UNUSED(x) (void)(x)
10
10
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
11
+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
11
12
 
12
13
  //#define GGML_ALLOCATOR_DEBUG
13
14
 
@@ -67,6 +68,8 @@ struct ggml_allocr {
67
68
  struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
68
69
  size_t max_size;
69
70
  bool measure;
71
+ int parse_seq[GGML_MAX_CONCUR];
72
+ int parse_seq_len;
70
73
 
71
74
  #ifdef GGML_ALLOCATOR_DEBUG
72
75
  struct ggml_tensor * allocated_tensors[1024];
@@ -74,7 +77,7 @@ struct ggml_allocr {
74
77
  };
75
78
 
76
79
  #ifdef GGML_ALLOCATOR_DEBUG
77
- static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
80
+ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
78
81
  for (int i = 0; i < 1024; i++) {
79
82
  if (alloc->allocated_tensors[i] == NULL) {
80
83
  alloc->allocated_tensors[i] = tensor;
@@ -83,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
83
86
  }
84
87
  GGML_ASSERT(!"out of allocated_tensors");
85
88
  }
86
- static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
89
+ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
87
90
  for (int i = 0; i < 1024; i++) {
88
91
  if (alloc->allocated_tensors[i] == tensor ||
89
92
  (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -111,10 +114,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
111
114
 
112
115
  size_t max_avail = 0;
113
116
 
114
- // find the best fitting free block
117
+ // find the best fitting free block besides the last block
115
118
  int best_fit_block = -1;
116
119
  size_t best_fit_size = SIZE_MAX;
117
- for (int i = 0; i < alloc->n_free_blocks; i++) {
120
+ for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
118
121
  struct free_block * block = &alloc->free_blocks[i];
119
122
  max_avail = MAX(max_avail, block->size);
120
123
  if (block->size >= size && block->size <= best_fit_size) {
@@ -126,10 +129,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
126
129
  AT_PRINTF("block %d\n", best_fit_block);
127
130
 
128
131
  if (best_fit_block == -1) {
129
- fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
130
- __func__, size, max_avail);
131
- GGML_ASSERT(!"not enough space in the buffer");
132
+ // the last block is our last resort
133
+ struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
134
+ if (block->size >= size) {
135
+ best_fit_block = alloc->n_free_blocks - 1;
136
+ max_avail = MAX(max_avail, block->size);
137
+ } else {
138
+ fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
139
+ __func__, size, max_avail);
140
+ GGML_ASSERT(!"not enough space in the buffer");
132
141
  return;
142
+ }
133
143
  }
134
144
  struct free_block * block = &alloc->free_blocks[best_fit_block];
135
145
  void * addr = block->addr;
@@ -229,6 +239,13 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
229
239
  alloc->n_free_blocks++;
230
240
  }
231
241
 
242
+ void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
243
+ for (int i = 0; i < n; i++) {
244
+ alloc->parse_seq[i] = list[i];
245
+ }
246
+ alloc->parse_seq_len = n;
247
+ }
248
+
232
249
  void ggml_allocr_reset(struct ggml_allocr * alloc) {
233
250
  alloc->n_free_blocks = 1;
234
251
  size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@@ -248,6 +265,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
248
265
  /*.hash_table = */ {{0}},
249
266
  /*.max_size = */ 0,
250
267
  /*.measure = */ false,
268
+ /*.parse_seq = */ {0},
269
+ /*.parse_seq_len = */ 0,
251
270
  #ifdef GGML_ALLOCATOR_DEBUG
252
271
  /*.allocated_tensors = */ = {0},
253
272
  #endif
@@ -275,6 +294,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
275
294
  /*.hash_table = */ {{0}},
276
295
  /*.max_size = */ 0,
277
296
  /*.measure = */ true,
297
+ /*.parse_seq = */ {0},
298
+ /*.parse_seq_len = */ 0,
278
299
  #ifdef GGML_ALLOCATOR_DEBUG
279
300
  /*.allocated_tensors = */ = {0},
280
301
  #endif
@@ -421,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
421
442
  else {
422
443
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
423
444
  node->data = parent->data;
445
+ return;
424
446
  }
425
- return;
426
447
  }
427
448
  }
428
449
  }
@@ -473,63 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
473
494
  allocate_node(alloc, input);
474
495
  }
475
496
  }
476
- for (int i = 0; i < gf->n_nodes; i++) {
477
- struct ggml_tensor * node = gf->nodes[i];
478
-
479
- // allocate parents (leafs)
480
- for (int j = 0; j < GGML_MAX_SRC; j++) {
481
- struct ggml_tensor * parent = node->src[j];
482
- if (parent == NULL) {
483
- break;
497
+ // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
498
+ int last_barrier_pos = 0;
499
+ int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
500
+
501
+ for (int ind = 0; ind < n_nodes; ind++) {
502
+ // allocate a node if there is no parse_seq or this is not a barrier
503
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
504
+ int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
505
+ struct ggml_tensor * node = gf->nodes[i];
506
+
507
+ // allocate parents (leafs)
508
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
509
+ struct ggml_tensor * parent = node->src[j];
510
+ if (parent == NULL) {
511
+ break;
512
+ }
513
+ allocate_node(alloc, parent);
484
514
  }
485
- allocate_node(alloc, parent);
486
- }
487
515
 
488
- // allocate node
489
- allocate_node(alloc, node);
516
+ // allocate node
517
+ allocate_node(alloc, node);
490
518
 
491
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
492
- for (int j = 0; j < GGML_MAX_SRC; j++) {
493
- struct ggml_tensor * parent = node->src[j];
494
- if (parent == NULL) {
495
- break;
496
- }
497
- AT_PRINTF("%s", parent->name);
498
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
499
- AT_PRINTF(", ");
519
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
520
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
521
+ struct ggml_tensor * parent = node->src[j];
522
+ if (parent == NULL) {
523
+ break;
524
+ }
525
+ AT_PRINTF("%s", parent->name);
526
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
527
+ AT_PRINTF(", ");
528
+ }
500
529
  }
530
+ AT_PRINTF("\n");
501
531
  }
502
- AT_PRINTF("\n");
532
+
503
533
 
504
534
  // update parents
505
- for (int j = 0; j < GGML_MAX_SRC; j++) {
506
- struct ggml_tensor * parent = node->src[j];
507
- if (parent == NULL) {
508
- break;
509
- }
510
- struct hash_node * p_hn = hash_get(ht, parent);
511
- p_hn->n_children -= 1;
512
-
513
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
514
-
515
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
516
- if (ggml_is_view(parent)) {
517
- struct ggml_tensor * view_src = get_view_source(parent);
518
- struct hash_node * view_src_hn = hash_get(ht, view_src);
519
- view_src_hn->n_views -= 1;
520
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
521
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
522
- ggml_allocator_free_tensor(alloc, view_src);
535
+ // update immediately if there is no parse_seq
536
+ // update only at barriers if there is parse_seq
537
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
538
+ int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
539
+ int update_end = alloc->parse_seq_len ? ind : ind + 1;
540
+ for (int i = update_start; i < update_end; i++) {
541
+ int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
542
+ struct ggml_tensor * node = gf->nodes[node_i];
543
+
544
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
545
+ struct ggml_tensor * parent = node->src[j];
546
+ if (parent == NULL) {
547
+ break;
523
548
  }
524
- }
525
- else {
526
- if (parent->data != node->data) {
527
- ggml_allocator_free_tensor(alloc, parent);
549
+ struct hash_node * p_hn = hash_get(ht, parent);
550
+ p_hn->n_children -= 1;
551
+
552
+ //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
553
+
554
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
555
+ if (ggml_is_view(parent)) {
556
+ struct ggml_tensor * view_src = get_view_source(parent);
557
+ struct hash_node * view_src_hn = hash_get(ht, view_src);
558
+ view_src_hn->n_views -= 1;
559
+ AT_PRINTF("view_src %s\n", view_src->name);
560
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
561
+ ggml_allocator_free_tensor(alloc, view_src);
562
+ }
563
+ }
564
+ else {
565
+ if (parent->data != node->data) {
566
+ ggml_allocator_free_tensor(alloc, parent);
567
+ }
568
+ }
528
569
  }
529
570
  }
530
571
  }
572
+ AT_PRINTF("\n");
573
+ if (alloc->parse_seq_len) {
574
+ last_barrier_pos = ind + 1;
575
+ }
531
576
  }
532
- AT_PRINTF("\n");
533
577
  }
534
578
  // free graph outputs here that wouldn't be freed otherwise because they have no children
535
579
  if (outputs != NULL && outputs[g] != NULL) {
@@ -10,6 +10,10 @@ extern "C" {
10
10
  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
11
11
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
12
12
 
13
+ // tell the allocator to parse nodes following the order described in the list
14
+ // you should call this if your graph is optimized to execute out-of-order
15
+ GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
16
+
13
17
  GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
14
18
  GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
15
19
  GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);