llama_cpp 0.16.0 → 0.16.1

Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-alloc.c

@@ -339,6 +339,7 @@ struct hash_node {
 };
 
 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -349,7 +350,6 @@ struct leaf_alloc {
 };
 
 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
@@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
         galloc->buffers[i] = NULL;
-        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+
+        // check if the same buffer type is used multiple times and reuse the same allocator
+        for (int j = 0; j < i; j++) {
+            if (bufts[i] == bufts[j]) {
+                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+                break;
+            }
+        }
+
+        if (galloc->buf_tallocs[i] == NULL) {
+            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+        }
     }
     galloc->n_buffers = n_bufs;
 
@@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers != NULL) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buffers[j] == galloc->buffers[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_backend_buffer_free(galloc->buffers[i]);
+            }
         }
         if (galloc->buf_tallocs != NULL) {
-            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            }
         }
     }
 
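Note: the two hunks above are complementary halves of one change. ggml_gallocr_new_n now shares a single dynamic allocator between slots that use the same buffer type, so ggml_gallocr_free must free each shared allocator (and buffer) exactly once. The same first-occurrence scan reappears below in ggml_gallocr_reserve_n and ggml_gallocr_get_buffer_size. A standalone sketch of the idiom, not gem code, with plain ints standing in for buffer types:

    #include <stdbool.h>
    #include <stdlib.h>

    int main(void) {
        int    keys[4] = { 0, 1, 0, 1 };             // stand-ins for buffer types
        void * res[4]  = { NULL, NULL, NULL, NULL }; // stand-ins for allocators/buffers

        // create: reuse the resource of the first earlier slot with the same key
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < i; j++) {
                if (keys[i] == keys[j]) { res[i] = res[j]; break; }
            }
            if (res[i] == NULL) {
                res[i] = malloc(16);                 // one allocation per distinct key
            }
        }

        // destroy: free a resource only in the first slot that holds it
        for (int i = 0; i < 4; i++) {
            bool seen = false;
            for (int j = 0; j < i; j++) {
                if (res[j] == res[i]) { seen = true; break; }
            }
            if (!seen) {
                free(res[i]);
            }
        }
        return 0;
    }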
@@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     }
 }
 
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }
 
-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                 AT_PRINTF("view_src %s: %d children, %d views\n",
                     view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                    ggml_gallocr_free_node(galloc, view_src);
                 }
             }
             else if (p_hn->allocated) {
-                ggml_gallocr_free_node(galloc, parent, buffer_id);
+                ggml_gallocr_free_node(galloc, parent);
             }
         }
         AT_PRINTF("\n");
@@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
        if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset = hn->offset;
+            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
@@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
@@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
+        // if the buffer type is used multiple times, we reuse the same buffer
+        for (int j = 0; j < i; j++) {
+            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                galloc->buffers[i] = galloc->buffers[j];
+                break;
+            }
+        }
+
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
@@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
@@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
     if (tensor->view_src != NULL) {
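Note: taken together with the struct hunks at the top, the hunks above move buffer ownership from node_alloc down into each tensor_alloc: ggml_gallocr_init_tensor now reads the buffer id from the tensor_alloc itself instead of taking it as a parameter, and a buffer_id of -1 marks tensors the allocator does not own (pre-allocated data or views). A minimal sketch of the convention, assuming only the field layout shown in this diff:

    #include <stdint.h>
    #include <stddef.h>

    struct tensor_alloc {
        int    buffer_id;   // -1 = pre-allocated, view, or unused
        size_t offset;
        size_t size_max;    // 0 = pre-allocated, unused, or view
    };

    // A tensor the allocator does not own. ggml_gallocr_node_needs_realloc
    // (next hunk) computes node_size == 0 for such tensors, so the
    // size_max >= node_size check passes without touching any buffer type.
    static const struct tensor_alloc external_alloc = { -1, SIZE_MAX, 0 };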
@@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }
 
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }
@@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif
@@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
@@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }
 
     return true;
@@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     if (galloc->buffers[buffer_id] == NULL) {
         return 0;
     }
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+            // this buffer is the same as a previous one due to the same buffer type being used multiple times
+            // only return the buffer size the first time it appears to avoid double counting
+            return 0;
+        }
+    }
+
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
@@ -886,7 +942,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
-            ggml_backend_buffer_free(*buffers[i]);
+            ggml_backend_buffer_free((*buffers)[i]);
         }
         free(*buffers);
         return false;
data/vendor/tmp/llama.cpp/ggml-backend-impl.h

@@ -17,13 +17,15 @@ extern "C" {
 
     struct ggml_backend_buffer_type_i {
         const char * (*GGML_CALL get_name)(ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer)(ggml_backend_buffer_type_t buft, size_t size);
-        size_t (*GGML_CALL get_alignment)(ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t (*GGML_CALL get_max_size)(ggml_backend_buffer_type_t buft); // allocation max size
-        size_t (*GGML_CALL get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t (*GGML_CALL get_alignment)(ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t (*GGML_CALL get_max_size)(ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t (*GGML_CALL get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool (*GGML_CALL is_host)(ggml_backend_buffer_type_t buft);
     };
 
@@ -92,27 +94,37 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan (not used currently)
+        // create a new plan for a graph
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create)(ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*GGML_CALL graph_plan_free)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void (*GGML_CALL graph_plan_update)(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
         enum ggml_status (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // check if the backend supports an operation
+        // check if the backend can compute an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
         // even if the weight has to be copied from the CPU temporarily
         bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
         // (optional) event synchronization
+        // create a new event that can record events on this backend instance
         ggml_backend_event_t (*GGML_CALL event_new)(ggml_backend_t backend);
         void (*GGML_CALL event_free)(ggml_backend_event_t event);
+        // record an event on the backend instance that created it
         void (*GGML_CALL event_record)(ggml_backend_event_t event);
+        // wait for an event on a different backend instance
        void (*GGML_CALL event_wait)(ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
         void (*GGML_CALL event_synchronize)(ggml_backend_event_t event);
     };
 
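Note: the buffer-type-side supports_backend hook (removed in the ggml-backend-impl.h hunk above) is replaced by a backend-side supports_buft, inverting the direction of the query: backends now declare which buffer types they can consume. As a hedged sketch of how a host-memory backend could satisfy the new hook — ggml_backend_buft_is_host is a real ggml-backend.h accessor, but the function below is illustrative only, not code from this gem:

    // Illustrative implementation: accept any buffer type whose tensors live
    // in host memory, which a CPU-like backend can always consume.
    static bool GGML_CALL example_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
        GGML_UNUSED(backend);
        return ggml_backend_buft_is_host(buft);
    }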