llama_cpp 0.15.4 → 0.16.1

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (161)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +3 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +17 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +166 -82
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  141. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  142. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
  143. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  144. data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
  145. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
  146. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  147. data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
  148. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  149. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
  150. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
  151. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
  152. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
  153. data/vendor/tmp/llama.cpp/ggml.c +278 -603
  154. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  155. data/vendor/tmp/llama.cpp/llama.cpp +345 -473
  156. data/vendor/tmp/llama.cpp/llama.h +21 -43
  157. metadata +134 -7
  158. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  159. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  160. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  161. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml-alloc.c

@@ -339,6 +339,7 @@ struct hash_node {
 };
 
 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -349,7 +350,6 @@ struct leaf_alloc {
 };
 
 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
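
These two hunks move the buffer index from node_alloc into tensor_alloc, so every recorded allocation (a node's destination, each of its sources, and each leaf) carries its own buffer id instead of inheriting a single per-node one. The resulting layout, assembled from the hunks above (a sketch; the comments on buffer_id reflect how later hunks in this file use it):

    struct tensor_alloc {
        int    buffer_id;  // index into galloc->bufts/buffers; -1 for views and pre-allocated tensors
        size_t offset;
        size_t size_max;   // 0 = pre-allocated, unused, or view
    };

    struct node_alloc {
        struct tensor_alloc dst;
        struct tensor_alloc src[GGML_MAX_SRC];
    };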
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);
 
     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
         galloc->buffers[i] = NULL;
-        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+
+        // check if the same buffer type is used multiple times and reuse the same allocator
+        for (int j = 0; j < i; j++) {
+            if (bufts[i] == bufts[j]) {
+                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+                break;
+            }
+        }
+
+        if (galloc->buf_tallocs[i] == NULL) {
+            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+        }
     }
     galloc->n_buffers = n_bufs;
 
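The reuse check above matters when a caller passes the same buffer type in several slots, which previously created one dynamic allocator per slot and, later, one backing buffer per slot. A minimal sketch of such a call, assuming a hypothetical two-slot setup where both slots use the CPU buffer type:

    // hypothetical: two allocator slots that happen to share one buffer type
    ggml_backend_buffer_type_t bufts[2] = {
        ggml_backend_cpu_buffer_type(),
        ggml_backend_cpu_buffer_type(),  // same buft as slot 0
    };
    // after this change, both slots share a single ggml_dyn_tallocr
    // (and, per the later hunk in ggml_gallocr_reserve_n, a single buffer)
    ggml_gallocr_t galloc = ggml_gallocr_new_n(bufts, 2);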
@@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers != NULL) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buffers[j] == galloc->buffers[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_backend_buffer_free(galloc->buffers[i]);
+            }
         }
         if (galloc->buf_tallocs != NULL) {
-            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            }
         }
     }
 
@@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     }
 }
 
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }
 
-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                     AT_PRINTF("view_src %s: %d children, %d views\n",
                         view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                        ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                        ggml_gallocr_free_node(galloc, view_src);
                     }
                 }
                 else if (p_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, parent, buffer_id);
+                    ggml_gallocr_free_node(galloc, parent);
                 }
             }
             AT_PRINTF("\n");
@@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
            node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset = hn->offset;
+            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
@@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
@@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
+        // if the buffer type is used multiple times, we reuse the same buffer
+        for (int j = 0; j < i; j++) {
+            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                galloc->buffers[i] = galloc->buffers[j];
+                break;
+            }
+        }
+
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
@@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
@@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
     if (tensor->view_src != NULL) {
@@ -750,7 +797,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+            ggml_backend_view_init(tensor);
         }
     } else {
         if (tensor->data == NULL) {
@@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }
 
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }
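
The new ternary means buft can now be NULL, but only when talloc->buffer_id is -1, which ggml_gallocr_reserve_n records solely for views and pre-allocated tensors. Assuming the stored allocation still matches the graph, those tensors have node->data or node->view_src set, so the size expression short-circuits to 0 before buft would ever be used. In condensed, annotated form:

    // talloc->buffer_id == -1 is recorded (in reserve_n) only for views and
    // pre-allocated tensors; for those, (node->data || node->view_src) holds,
    // node_size is forced to 0, and the NULL buft is never dereferenced
    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);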
@@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif
@@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
@@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }
 
     return true;
@@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     if (galloc->buffers[buffer_id] == NULL) {
         return 0;
     }
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+            // this buffer is the same as a previous one due to the same buffer type being used multiple times
+            // only return the buffer size the first time it appears to avoid double counting
+            return 0;
+        }
+    }
+
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
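Without this guard, callers that total memory across all slots would count a shared buffer once per slot. A sketch of that summing pattern (gallocr_total_size is a hypothetical helper, not part of the API):

    // hypothetical helper: total backing memory of a gallocr with n_bufs slots
    static size_t gallocr_total_size(ggml_gallocr_t galloc, int n_bufs) {
        size_t total = 0;
        for (int i = 0; i < n_bufs; i++) {
            // slots that alias an earlier buffer now report 0, so nothing is double counted
            total += ggml_gallocr_get_buffer_size(galloc, i);
        }
        return total;
    }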
@@ -886,7 +942,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
-            ggml_backend_buffer_free(*buffers[i]);
+            ggml_backend_buffer_free((*buffers)[i]);
         }
         free(*buffers);
         return false;
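
The one-line change above fixes an operator-precedence bug: buffers has type ggml_backend_buffer_t **, and *buffers[i] parses as *(buffers[i]), indexing the outer pointer (out of bounds for i > 0) instead of the array it points to. A standalone illustration with hypothetical names:

    #include <stdio.h>

    int main(void) {
        int data[3] = {10, 20, 30};
        int *arr = data;
        int **buffers = &arr;           // pointer to the array pointer, as in the fixed code

        // *buffers[1]   ==  *(buffers[1]) -> indexes the outer pointer: undefined behavior
        // (*buffers)[1] dereferences first, then indexes the array -> 20
        printf("%d\n", (*buffers)[1]);
        return 0;
    }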
@@ -899,12 +955,12 @@
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         } else {
             if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         }
     }
data/vendor/tmp/llama.cpp/ggml-backend-impl.h

@@ -17,13 +17,15 @@ extern "C" {
 
     struct ggml_backend_buffer_type_i {
         const char *          (*GGML_CALL get_name)      (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer)  (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)  (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t                (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t                (*GGML_CALL get_max_size)  (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t                (*GGML_CALL get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool                  (*GGML_CALL is_host)       (ggml_backend_buffer_type_t buft);
     };
 
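Dropping supports_backend from the buffer-type vtable inverts the compatibility query: instead of each buffer type knowing which backends can use it, each backend now reports which buffer types it can read from, via the supports_buft member added in the next hunk. A sketch of what a backend-side implementation might look like, assuming a hypothetical backend that accepts its own buffer type plus anything in host memory (my_backend_buffer_type is invented for illustration):

    // hypothetical implementation of the new supports_buft member
    static bool GGML_CALL my_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
        return buft == my_backend_buffer_type() || ggml_backend_buft_is_host(buft);

        GGML_UNUSED(backend);
    }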
 
@@ -92,27 +94,37 @@ extern "C" {
92
94
  void (*GGML_CALL synchronize)(ggml_backend_t backend);
93
95
 
94
96
  // compute graph with a plan (not used currently)
97
+ // create a new plan for a graph
95
98
  ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
96
99
  void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100
+ // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
101
+ void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
102
+ // compute the graph with the plan
103
+ enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
97
104
 
98
- // compute graph with a plan
99
- enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100
105
  // compute graph without a plan (async)
101
106
  enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
102
107
 
103
- // check if the backend supports an operation
108
+ // check if the backend can compute an operation
104
109
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
105
110
 
111
+ // check if the backend can use tensors allocated in a buffer type
112
+ bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
113
+
106
114
  // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
107
115
  // these should be expensive operations with large batch sizes that may benefit from running on this backend
108
116
  // even if the weight has to be copied from the CPU temporarily
109
117
  bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
110
118
 
111
119
  // (optional) event synchronization
120
+ // create a new event that can record events on this backend instance
112
121
  ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
113
122
  void (*GGML_CALL event_free) (ggml_backend_event_t event);
123
+ // record an event on the backend instance that created it
114
124
  void (*GGML_CALL event_record) (ggml_backend_event_t event);
125
+ // wait for an event on on a different backend instance
115
126
  void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
127
+ // block until an event is recorded
116
128
  void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
117
129
  };
118
130
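These vtable entries back the public ggml_backend_event_* wrappers declared in ggml-backend.h. A hedged sketch of the intended cross-backend synchronization pattern, assuming both backends implement the optional event interface and that the async compute and event wrapper names match this vendored revision:

    // producer computes asynchronously; consumer waits without a full host sync
    ggml_backend_event_t event = ggml_backend_event_new(producer);

    ggml_backend_graph_compute_async(producer, graph);  // enqueue compute on the producer
    ggml_backend_event_record(event);                   // record on the backend that created the event

    ggml_backend_event_wait(consumer, event);           // consumer waits for the event
    // ... enqueue consumer work that depends on the producer's results ...

    ggml_backend_event_free(event);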