llama_cpp 0.3.7 → 0.4.0

ggml-alloc.c

@@ -8,6 +8,7 @@
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
 //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,6 +68,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_CONCUR];
+    int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -74,7 +77,7 @@ struct ggml_allocr {
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == NULL) {
             alloc->allocated_tensors[i] = tensor;
@@ -83,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == tensor ||
             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -111,10 +114,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     size_t max_avail = 0;
 
-    // find the best fitting free block
+    // find the best fitting free block besides the last block
     int best_fit_block = -1;
     size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         max_avail = MAX(max_avail, block->size);
         if (block->size >= size && block->size <= best_fit_size) {
@@ -126,10 +129,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     AT_PRINTF("block %d\n", best_fit_block);
 
     if (best_fit_block == -1) {
-        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+            max_avail = MAX(max_avail, block->size);
+        } else {
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
             return;
+        }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
@@ -229,6 +239,13 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }
 
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
+    for (int i = 0; i < n; i++) {
+        alloc->parse_seq[i] = list[i];
+    }
+    alloc->parse_seq_len = n;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
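
The new ggml_allocr_set_parse_seq above copies the caller's list into the fixed-size parse_seq array added to struct ggml_allocr (capacity GGML_MAX_CONCUR, i.e. 2*GGML_MAX_NODES per the first hunk) without any bounds check, so keeping n within that limit is the caller's responsibility. A minimal caller-side guard, sketched under the assumption that the caller mirrors the same bound; the wrapper name is illustrative and not part of the gem:

#include "ggml.h"
#include "ggml-alloc.h"

// hypothetical wrapper: GGML_MAX_CONCUR is private to ggml-alloc.c, so the caller
// re-derives the bound as 2*GGML_MAX_NODES before handing the list to the allocator
static void set_parse_seq_checked(struct ggml_allocr * alloc, const int * list, int n) {
    GGML_ASSERT(n <= 2*GGML_MAX_NODES);
    ggml_allocr_set_parse_seq(alloc, list, n);
}
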
@@ -248,6 +265,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +294,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -421,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                         else {
                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                             node->data = parent->data;
+                            return;
                         }
-                        return;
                     }
                 }
             }
@@ -473,63 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 allocate_node(alloc, input);
             }
         }
-        for (int i = 0; i < gf->n_nodes; i++) {
-            struct ggml_tensor * node = gf->nodes[i];
-
-            // allocate parents (leafs)
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+        int last_barrier_pos = 0;
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+        for (int ind = 0; ind < n_nodes; ind++) {
+            // allocate a node if there is no parse_seq or this is not a barrier
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                struct ggml_tensor * node = gf->nodes[i];
+
+                // allocate parents (leafs)
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    allocate_node(alloc, parent);
                 }
-                allocate_node(alloc, parent);
-            }
 
-            // allocate node
-            allocate_node(alloc, node);
+                // allocate node
+                allocate_node(alloc, node);
 
-            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                AT_PRINTF("%s", parent->name);
-                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                    AT_PRINTF(", ");
+                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    AT_PRINTF("%s", parent->name);
+                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                        AT_PRINTF(", ");
+                    }
                 }
+                AT_PRINTF("\n");
             }
-            AT_PRINTF("\n");
+
 
             // update parents
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                struct hash_node * p_hn = hash_get(ht, parent);
-                p_hn->n_children -= 1;
-
-                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                    if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
-                        struct hash_node * view_src_hn = hash_get(ht, view_src);
-                        view_src_hn->n_views -= 1;
-                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
-                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, view_src);
+            // update immediately if there is no parse_seq
+            // update only at barriers if there is parse_seq
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                int update_end   = alloc->parse_seq_len ? ind : ind + 1;
+                for (int i = update_start; i < update_end; i++) {
+                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                    struct ggml_tensor * node = gf->nodes[node_i];
+
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * parent = node->src[j];
+                        if (parent == NULL) {
+                            break;
                         }
-                    }
-                    else {
-                        if (parent->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, parent);
+                        struct hash_node * p_hn = hash_get(ht, parent);
+                        p_hn->n_children -= 1;
+
+                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                            if (ggml_is_view(parent)) {
+                                struct ggml_tensor * view_src = get_view_source(parent);
+                                struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                view_src_hn->n_views -= 1;
+                                AT_PRINTF("view_src %s\n", view_src->name);
+                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, view_src);
+                                }
+                            }
+                            else {
+                                if (parent->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, parent);
+                                }
+                            }
                         }
                     }
                 }
             }
+            AT_PRINTF("\n");
+            if (alloc->parse_seq_len) {
+                last_barrier_pos = ind + 1;
+            }
         }
-        AT_PRINTF("\n");
     }
     // free graph outputs here that wouldn't be freed otherwise because they have no children
     if (outputs != NULL && outputs[g] != NULL) {
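
The reworked loop above walks parse_seq instead of gf->nodes whenever a sequence is set: entries that are node indices are allocated immediately, while a -1 entry acts as a barrier, and only at a barrier are the parents of everything scheduled since the previous barrier considered for freeing (tracked by last_barrier_pos). A standalone illustration of that grouping, assuming a hypothetical helper name and a hand-written sequence:

#include <stdio.h>

// hypothetical sketch, not part of the diff: mirrors the allocate-then-free-at-barrier
// grouping of the loop above for a given parse_seq
static void print_barrier_groups(const int * seq, int n) {
    int group_start = 0;                    // plays the role of last_barrier_pos
    for (int ind = 0; ind < n; ind++) {
        if (seq[ind] == -1) {               // barrier: the group since group_start is done
            printf("barrier: parents of nodes");
            for (int i = group_start; i < ind; i++) {
                printf(" %d", seq[i]);
            }
            printf(" may now be freed\n");
            group_start = ind + 1;
        } else {
            printf("allocate node %d\n", seq[ind]);
        }
    }
}
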
ggml-alloc.h

@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph are optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
+
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
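
A usage sketch for the new declaration, assuming the caller has already computed an out-of-order schedule; the helper name and node indices below are illustrative, not taken from the gem. Entries are positions in the graph's node array and -1 marks a barrier:

#include "ggml-alloc.h"

// hypothetical example: nodes 0 and 1 may run concurrently, then 2 and 3, then 4;
// the trailing -1 lets the allocator release the last group's parents
static void describe_schedule(struct ggml_allocr * alloc) {
    static const int seq[] = { 0, 1, -1, 2, 3, -1, 4, -1 };
    ggml_allocr_set_parse_seq(alloc, seq, (int)(sizeof(seq)/sizeof(seq[0])));
}

In llama.cpp itself this list comes from the Metal backend's concurrency analysis rather than being written by hand.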