llama_cpp 0.3.8 → 0.5.0

ggml-alloc.c

@@ -8,6 +8,7 @@
 
  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
  //#define GGML_ALLOCATOR_DEBUG
 
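Note on the new constant: parse_seq (below) no longer stores only node indices; -1 barrier markers are interleaved with them. In the degenerate case of a barrier after every node the list holds two entries per node, which is why the capacity is 2*GGML_MAX_NODES. A minimal sketch of that worst case (the builder loop and gf are assumptions for illustration, not part of the diff):

    // worst case for parse_seq capacity: a -1 barrier after every node,
    // i.e. a fully serial schedule -> 2 entries per node == GGML_MAX_CONCUR
    int seq[GGML_MAX_CONCUR];
    int len = 0;
    for (int i = 0; i < gf->n_nodes; i++) {  // gf: some ggml_cgraph
        seq[len++] = i;   // run node i
        seq[len++] = -1;  // barrier: free spent parents before the next node
    }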
@@ -67,8 +68,8 @@ struct ggml_allocr {
      struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
      size_t max_size;
      bool measure;
-     int parse_seq[GGML_MAX_NODES];
-     bool has_parse_seq;
+     int parse_seq[GGML_MAX_CONCUR];
+     int parse_seq_len;
 
  #ifdef GGML_ALLOCATOR_DEBUG
      struct ggml_tensor * allocated_tensors[1024];
@@ -76,7 +77,7 @@ struct ggml_allocr {
  };
 
  #ifdef GGML_ALLOCATOR_DEBUG
- static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
      for (int i = 0; i < 1024; i++) {
          if (alloc->allocated_tensors[i] == NULL) {
              alloc->allocated_tensors[i] = tensor;
@@ -85,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
      }
      GGML_ASSERT(!"out of allocated_tensors");
  }
- static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
      for (int i = 0; i < 1024; i++) {
          if (alloc->allocated_tensors[i] == tensor ||
              (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -106,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g
  }
 
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ #ifdef GGML_ALLOCATOR_DEBUG
+     GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+ #endif
      size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
      size = aligned_offset(NULL, size, alloc->alignment);
 
@@ -238,15 +243,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
      alloc->n_free_blocks++;
  }
 
- void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
-     int pos = 0;
+ void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
      for (int i = 0; i < n; i++) {
-         if (list[i] != -1) {
-             alloc->parse_seq[pos] = list[i];
-             pos++;
-         }
+         alloc->parse_seq[i] = list[i];
      }
-     alloc->has_parse_seq = true;
+     alloc->parse_seq_len = n;
  }
 
  void ggml_allocr_reset(struct ggml_allocr * alloc) {
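Behavioral change worth calling out: the old ggml_allocr_set_parse_seq silently dropped -1 entries, keeping only the execution order. The new version copies the list verbatim and records its length, because -1 entries now act as free barriers in the allocation loop further down. A hedged usage sketch (the indices are made up for illustration):

    // nodes 0 and 1 may run concurrently; each -1 marks a point where the
    // allocator may free parent tensors consumed by the nodes before it
    int list[] = { 0, 1, -1, 2, -1 };
    ggml_allocr_set_parse_seq(alloc, list, 5);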
@@ -269,9 +270,9 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
          /*.max_size = */ 0,
          /*.measure = */ false,
          /*.parse_seq = */ {0},
-         /*.has_parse_seq = */ false,
+         /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
-         /*.allocated_tensors = */ = {0},
+         /*.allocated_tensors = */ {0},
  #endif
      };
 
@@ -298,9 +299,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
          /*.max_size = */ 0,
          /*.measure = */ true,
          /*.parse_seq = */ {0},
-         /*.has_parse_seq = */ false,
+         /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
-         /*.allocated_tensors = */ = {0},
+         /*.allocated_tensors = */ {0},
  #endif
      };
 
@@ -320,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
  //////////// compute graph allocator
 
  static bool ggml_is_view(struct ggml_tensor * t) {
-     return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
-            t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+     return t->view_src != NULL;
  }
 
  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -339,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
      return true;
  }
 
- static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
-     switch (t->op) {
-         case GGML_OP_PERMUTE:
-         case GGML_OP_RESHAPE:
-         case GGML_OP_TRANSPOSE:
-         case GGML_OP_VIEW:
-             return t->src[0];
-         case GGML_OP_CPY:
-             return t->src[1];
-         default:
-             return NULL;
-     }
- }
-
- static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
-     struct ggml_tensor * parent = t;
-     do {
-         parent = get_view_parent(parent);
-     } while (ggml_is_view(parent));
-     return parent;
- }
-
  static bool ggml_op_can_inplace(enum ggml_op op) {
      switch (op) {
          case GGML_OP_SCALE:
@@ -368,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
          case GGML_OP_DIAG_MASK_INF:
          case GGML_OP_ADD:
          case GGML_OP_ADD1:
-         case GGML_OP_ACC:
          case GGML_OP_SUB:
          case GGML_OP_MUL:
          case GGML_OP_DIV:
@@ -378,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
          case GGML_OP_UNARY:
          case GGML_OP_ROPE:
          case GGML_OP_RMS_NORM:
-         case GGML_OP_SET:
          case GGML_OP_SOFT_MAX:
          case GGML_OP_CONT:
              return true;
@@ -392,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
      struct hash_node * ht = alloc->hash_table;
      if (node->data == NULL) {
          if (ggml_is_view(node)) {
-             size_t offset;
-             switch(node->op) {
-                 case GGML_OP_VIEW:
-                     memcpy(&offset, node->op_params, sizeof(size_t));
-                     node->data = (char *) node->src[0]->data + offset;
-                     break;
-                 case GGML_OP_PERMUTE:
-                 case GGML_OP_RESHAPE:
-                 case GGML_OP_TRANSPOSE:
-                     node->data = node->src[0]->data;
-                     break;
-                 case GGML_OP_CPY:
-                     node->data = node->src[1]->data;
-                     break;
-                 default:
-                     GGML_ASSERT(!"unknown view op");
-                     break;
-             }
+             assert(node->view_src->data != NULL);
+             node->data = (char *)node->view_src->data + node->view_offs;
          } else {
              // see if we can reuse a parent's buffer (inplace)
              if (ggml_op_can_inplace(node->op)) {
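This collapse of the old op-by-op switch relies on the view_src and view_offs fields that ggml now records on every view tensor when it is created, so the allocator no longer needs to recognize RESHAPE/VIEW/TRANSPOSE/PERMUTE/CPY or walk view chains (the deleted get_view_parent/get_view_source). A sketch of what the new path computes, assuming a context ctx and an allocated tensor t already exist:

    // ggml_view_1d stores the viewed tensor and byte offset on the result,
    // roughly: v->view_src == t and v->view_offs == 8 * ggml_element_size(t)
    struct ggml_tensor * v = ggml_view_1d(ctx, t, 16, 8 * ggml_element_size(t));
    // once t has memory, resolving the view is plain pointer arithmetic:
    v->data = (char *) v->view_src->data + v->view_offs;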
@@ -429,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                      struct hash_node * p_hn = hash_get(ht, parent);
                      if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                          if (ggml_is_view(parent)) {
-                             struct ggml_tensor * view_src = get_view_source(parent);
+                             struct ggml_tensor * view_src = parent->view_src;
                              struct hash_node * view_src_hn = hash_get(ht, view_src);
                              if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                                  // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -445,8 +405,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                          else {
                              AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                              node->data = parent->data;
+                             return;
                          }
-                         return;
                      }
                  }
              }
@@ -471,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
          struct ggml_tensor * node = gf->nodes[i];
 
          if (ggml_is_view(node)) {
-             struct ggml_tensor * view_src = get_view_source(node);
+             struct ggml_tensor * view_src = node->view_src;
              hash_get(ht, view_src)->n_views += 1;
          }
 
@@ -497,69 +457,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                  allocate_node(alloc, input);
              }
          }
-         for (int ind = 0; ind < gf->n_nodes; ind++) {
-             int i;
-             if (alloc->has_parse_seq) {
-                 i = alloc->parse_seq[ind];
-             } else {
-                 i = ind;
-             }
-             struct ggml_tensor * node = gf->nodes[i];
-
-             // allocate parents (leafs)
-             for (int j = 0; j < GGML_MAX_SRC; j++) {
-                 struct ggml_tensor * parent = node->src[j];
-                 if (parent == NULL) {
-                     break;
+         // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+         int last_barrier_pos = 0;
+         int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+         for (int ind = 0; ind < n_nodes; ind++) {
+             // allocate a node if there is no parse_seq or this is not a barrier
+             if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                 int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                 struct ggml_tensor * node = gf->nodes[i];
+
+                 // allocate parents (leafs)
+                 for (int j = 0; j < GGML_MAX_SRC; j++) {
+                     struct ggml_tensor * parent = node->src[j];
+                     if (parent == NULL) {
+                         break;
+                     }
+                     allocate_node(alloc, parent);
                  }
-                 allocate_node(alloc, parent);
-             }
 
-             // allocate node
-             allocate_node(alloc, node);
+                 // allocate node
+                 allocate_node(alloc, node);
 
-             AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-             for (int j = 0; j < GGML_MAX_SRC; j++) {
-                 struct ggml_tensor * parent = node->src[j];
-                 if (parent == NULL) {
-                     break;
-                 }
-                 AT_PRINTF("%s", parent->name);
-                 if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                     AT_PRINTF(", ");
+                 AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                 for (int j = 0; j < GGML_MAX_SRC; j++) {
+                     struct ggml_tensor * parent = node->src[j];
+                     if (parent == NULL) {
+                         break;
+                     }
+                     AT_PRINTF("%s", parent->name);
+                     if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                         AT_PRINTF(", ");
+                     }
                  }
+                 AT_PRINTF("\n");
              }
-             AT_PRINTF("\n");
+
 
              // update parents
-             for (int j = 0; j < GGML_MAX_SRC; j++) {
-                 struct ggml_tensor * parent = node->src[j];
-                 if (parent == NULL) {
-                     break;
-                 }
-                 struct hash_node * p_hn = hash_get(ht, parent);
-                 p_hn->n_children -= 1;
-
-                 //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                 if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                     if (ggml_is_view(parent)) {
-                         struct ggml_tensor * view_src = get_view_source(parent);
-                         struct hash_node * view_src_hn = hash_get(ht, view_src);
-                         view_src_hn->n_views -= 1;
-                         AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
-                         if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                             ggml_allocator_free_tensor(alloc, view_src);
+             // update immediately if there is no parse_seq
+             // update only at barriers if there is parse_seq
+             if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                 int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                 int update_end   = alloc->parse_seq_len ? ind : ind + 1;
+                 for (int i = update_start; i < update_end; i++) {
+                     int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                     struct ggml_tensor * node = gf->nodes[node_i];
+
+                     for (int j = 0; j < GGML_MAX_SRC; j++) {
+                         struct ggml_tensor * parent = node->src[j];
+                         if (parent == NULL) {
+                             break;
                          }
-                     }
-                     else {
-                         if (parent->data != node->data) {
-                             ggml_allocator_free_tensor(alloc, parent);
+                         struct hash_node * p_hn = hash_get(ht, parent);
+                         p_hn->n_children -= 1;
+
+                         //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                         if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                             if (ggml_is_view(parent)) {
+                                 struct ggml_tensor * view_src = parent->view_src;
+                                 struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                 view_src_hn->n_views -= 1;
+                                 AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                     ggml_allocator_free_tensor(alloc, view_src);
+                                 }
+                             }
+                             else {
+                                 if (parent->data != node->data) {
+                                     ggml_allocator_free_tensor(alloc, parent);
+                                 }
+                             }
                          }
                      }
                  }
+                 AT_PRINTF("\n");
+                 if (alloc->parse_seq_len) {
+                     last_barrier_pos = ind + 1;
+                 }
              }
-             AT_PRINTF("\n");
          }
      }
      // free graph outputs here that wouldn't be freed otherwise because they have no children
      if (outputs != NULL && outputs[g] != NULL) {
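The restructured loop is a two-phase scheme: when a parse_seq is set, non-barrier entries only allocate; frees are deferred until a -1 barrier, where the parents of every node between last_barrier_pos and the barrier are released in one sweep. Tracing the hypothetical list { 0, 1, -1, 2, -1 } from the earlier sketch:

    // ind = 0: parse_seq[0] =  0 -> allocate node 0, no frees
    // ind = 1: parse_seq[1] =  1 -> allocate node 1, no frees
    // ind = 2: parse_seq[2] = -1 -> barrier: decrement/free parents of
    //                               nodes 0 and 1, last_barrier_pos = 3
    // ind = 3: parse_seq[3] =  2 -> allocate node 2, no frees
    // ind = 4: parse_seq[4] = -1 -> barrier: decrement/free parents of node 2

Without a parse_seq (parse_seq_len == 0) every iteration takes both branches, so the old allocate-then-free-immediately behavior is preserved.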
ggml-alloc.h

@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
  // tell the allocator to parse nodes following the order described in the list
  // you should call this if your graph is optimized to execute out-of-order
- GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+ GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
  GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
  GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
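A hedged sketch of how a backend is expected to use this API (build_concur_list is a placeholder for a backend's concurrency analysis, along the lines of what the Metal backend computed in llama.cpp at the time; it is not defined in this header):

    // fill `list` with node indices separated by -1 barriers, then hand the
    // schedule to the allocator before allocating the graph
    int list[2*GGML_MAX_NODES];  // GGML_MAX_CONCUR itself is private to ggml-alloc.c
    int n = build_concur_list(graph, list);     // hypothetical helper
    ggml_allocr_set_parse_seq(alloc, list, n);
    ggml_allocr_alloc_graph(alloc, graph);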