llama_cpp 0.3.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- ggml-alloc.c
+++ ggml-alloc.c
@@ -8,6 +8,7 @@
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
 //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,8 +68,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
-    int parse_seq[GGML_MAX_NODES];
-    bool has_parse_seq;
+    int parse_seq[GGML_MAX_CONCUR];
+    int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -76,7 +77,7 @@ struct ggml_allocr {
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == NULL) {
             alloc->allocated_tensors[i] = tensor;
@@ -85,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == tensor ||
             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -106,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+#ifdef GGML_ALLOCATOR_DEBUG
+    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+#endif
     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
@@ -238,15 +243,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }
 
-void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
-    int pos = 0;
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
     for (int i = 0; i < n; i++) {
-        if (list[i] != -1) {
-            alloc->parse_seq[pos] = list[i];
-            pos++;
-        }
+        alloc->parse_seq[i] = list[i];
     }
-    alloc->has_parse_seq = true;
+    alloc->parse_seq_len = n;
 }
 
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
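Note the semantic change in this hunk: 0.3.8 silently stripped -1 entries from the list, while 0.5.0 copies the list verbatim and treats each -1 as a barrier during graph allocation (see the allocation loop further down). A hedged caller-side sketch, with a made-up node order for a five-node graph:

    // hypothetical concurrency order: nodes 0 and 2 may run in parallel,
    // then a -1 barrier, then nodes 1, 3 and 4 as a second group
    static const int order[] = { 0, 2, -1, 1, 3, 4, -1 };
    ggml_allocr_set_parse_seq(alloc, order, (int)(sizeof(order) / sizeof(order[0])));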
@@ -269,9 +270,9 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.max_size      = */ 0,
         /*.measure       = */ false,
         /*.parse_seq     = */ {0},
-        /*.has_parse_seq = */ false,
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ = {0},
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -298,9 +299,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.max_size      = */ 0,
         /*.measure       = */ true,
         /*.parse_seq     = */ {0},
-        /*.has_parse_seq = */ false,
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ = {0},
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -320,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 //////////// compute graph allocator
 
 static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
-           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+    return t->view_src != NULL;
 }
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -339,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
     return true;
 }
 
-static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
-    switch (t->op) {
-        case GGML_OP_PERMUTE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_VIEW:
-            return t->src[0];
-        case GGML_OP_CPY:
-            return t->src[1];
-        default:
-            return NULL;
-    }
-}
-
-static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
-    struct ggml_tensor * parent = t;
-    do {
-        parent = get_view_parent(parent);
-    } while (ggml_is_view(parent));
-    return parent;
-}
-
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
@@ -368,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
-        case GGML_OP_ACC:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -378,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_UNARY:
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SET:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_CONT:
             return true;
@@ -392,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            size_t offset;
-            switch(node->op) {
-                case GGML_OP_VIEW:
-                    memcpy(&offset, node->op_params, sizeof(size_t));
-                    node->data = (char *) node->src[0]->data + offset;
-                    break;
-                case GGML_OP_PERMUTE:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_TRANSPOSE:
-                    node->data = node->src[0]->data;
-                    break;
-                case GGML_OP_CPY:
-                    node->data = node->src[1]->data;
-                    break;
-                default:
-                    GGML_ASSERT(!"unknown view op");
-                    break;
-            }
+            assert(node->view_src->data != NULL);
+            node->data = (char *)node->view_src->data + node->view_offs;
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
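The upshot of the view rework: instead of re-deriving a view's source by walking op chains (the removed get_view_parent/get_view_source above), every view tensor now carries its ultimate source and byte offset directly, and resolving its data pointer is plain pointer arithmetic. A standalone sketch of the invariant, using a hypothetical stripped-down stand-in for ggml_tensor rather than the real struct:

    #include <assert.h>
    #include <stddef.h>

    // hypothetical minimal tensor: just the fields the allocator relies on
    struct toy_tensor {
        struct toy_tensor * view_src;  // ultimate non-view source, or NULL
        size_t              view_offs; // byte offset into view_src->data
        void *              data;
    };

    // mirrors what allocate_node() now does for views
    void resolve_view(struct toy_tensor * t) {
        assert(t->view_src != NULL && t->view_src->data != NULL);
        t->data = (char *)t->view_src->data + t->view_offs;
    }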
@@ -429,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                     struct hash_node * p_hn = hash_get(ht, parent);
                     if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                         if (ggml_is_view(parent)) {
-                            struct ggml_tensor * view_src = get_view_source(parent);
+                            struct ggml_tensor * view_src = parent->view_src;
                             struct hash_node * view_src_hn = hash_get(ht, view_src);
                             if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                                 // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -445,8 +405,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                         else {
                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                             node->data = parent->data;
+                            return;
                         }
-                        return;
                     }
                 }
             }
@@ -471,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
         struct ggml_tensor * node = gf->nodes[i];
 
         if (ggml_is_view(node)) {
-            struct ggml_tensor * view_src = get_view_source(node);
+            struct ggml_tensor * view_src = node->view_src;
             hash_get(ht, view_src)->n_views += 1;
         }
 
@@ -497,69 +457,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 allocate_node(alloc, input);
             }
         }
-        for (int ind = 0; ind < gf->n_nodes; ind++) {
-            int i;
-            if (alloc->has_parse_seq) {
-                i = alloc->parse_seq[ind];
-            } else {
-                i = ind;
-            }
-            struct ggml_tensor * node = gf->nodes[i];
-
-            // allocate parents (leafs)
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+        int last_barrier_pos = 0;
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+        for (int ind = 0; ind < n_nodes; ind++) {
+            // allocate a node if there is no parse_seq or this is not a barrier
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                struct ggml_tensor * node = gf->nodes[i];
+
+                // allocate parents (leafs)
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    allocate_node(alloc, parent);
                 }
-                allocate_node(alloc, parent);
-            }
 
-            // allocate node
-            allocate_node(alloc, node);
+                // allocate node
+                allocate_node(alloc, node);
 
-            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                AT_PRINTF("%s", parent->name);
-                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                    AT_PRINTF(", ");
+                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    AT_PRINTF("%s", parent->name);
+                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                        AT_PRINTF(", ");
+                    }
                 }
+                AT_PRINTF("\n");
             }
-            AT_PRINTF("\n");
+
 
             // update parents
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                struct hash_node * p_hn = hash_get(ht, parent);
-                p_hn->n_children -= 1;
-
-                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                    if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
-                        struct hash_node * view_src_hn = hash_get(ht, view_src);
-                        view_src_hn->n_views -= 1;
-                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
-                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, view_src);
+            // update immediately if there is no parse_seq
+            // update only at barriers if there is parse_seq
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                int update_end   = alloc->parse_seq_len ? ind : ind + 1;
+                for (int i = update_start; i < update_end; i++) {
+                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                    struct ggml_tensor * node = gf->nodes[node_i];
+
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * parent = node->src[j];
+                        if (parent == NULL) {
+                            break;
                         }
-                    }
-                    else {
-                        if (parent->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, parent);
+                        struct hash_node * p_hn = hash_get(ht, parent);
+                        p_hn->n_children -= 1;
+
+                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                            if (ggml_is_view(parent)) {
+                                struct ggml_tensor * view_src = parent->view_src;
+                                struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                view_src_hn->n_views -= 1;
+                                AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, view_src);
+                                }
+                            }
+                            else {
+                                if (parent->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, parent);
+                                }
+                            }
                         }
                     }
                 }
+                AT_PRINTF("\n");
+                if (alloc->parse_seq_len) {
+                    last_barrier_pos = ind + 1;
+                }
             }
-            AT_PRINTF("\n");
         }
         // free graph outputs here that wouldn't be freed otherwise because they have no children
         if (outputs != NULL && outputs[g] != NULL) {
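In effect, the nodes between two -1 barriers form one concurrency group: they are all allocated first, and their parents are only considered for freeing once the barrier is reached, so no tensor that a concurrently-executing sibling still reads can be recycled early. A standalone sketch of just the barrier bookkeeping (hypothetical parse_seq contents, plain C):

    #include <stdio.h>

    int main(void) {
        // hypothetical parse_seq: two concurrency groups separated by -1 barriers
        int parse_seq[] = { 0, 2, -1, 1, 3, -1 };
        int n = sizeof(parse_seq) / sizeof(parse_seq[0]);

        int last_barrier_pos = 0;
        for (int ind = 0; ind < n; ind++) {
            if (parse_seq[ind] == -1) {
                // the "update parents" sweep in the hunk above runs over
                // [last_barrier_pos, ind): only now may tensors used by
                // this whole group be freed
                printf("barrier at %d: free window [%d, %d)\n", ind, last_barrier_pos, ind);
                last_barrier_pos = ind + 1;
            }
        }
        return 0;
    }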
--- ggml-alloc.h
+++ ggml-alloc.h
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph is optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
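The header change constifies the list argument, so a caller can now pass a static const table without a cast. A hedged end-to-end usage sketch: ggml_allocr_alloc_graph is assumed from the rest of ggml-alloc.h (it is not part of this diff), and the node order is invented for illustration:

    // measure pass with an out-of-order schedule; -1 entries mark barriers
    static const int order[] = { 0, 2, -1, 1, 3, 4, -1 };

    struct ggml_allocr * alloc = ggml_allocr_new_measure(/*alignment =*/ 32);
    ggml_allocr_set_parse_seq(alloc, order, (int)(sizeof(order) / sizeof(order[0])));

    size_t mem_size = ggml_allocr_alloc_graph(alloc, graph); // graph built elsewhere
    ggml_allocr_free(alloc);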