llama_cpp 0.16.2 → 0.17.0

Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
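
Entries 9 through 177 above delete the vendored llama.cpp tree under data/vendor/tmp/llama.cpp, and extconf.rb (+2 -43) and the gemspec metadata (+2 -171) shrink accordingly, so 0.17.0 no longer bundles the upstream sources inside the gem. As a purely illustrative sketch of picking up the new release (the Gemfile below is hypothetical; build prerequisites for 0.17.0 are described in data/README.md, not in this diff):

  # Gemfile -- pin the upgrade covered by this diff
  source 'https://rubygems.org'
  gem 'llama_cpp', '~> 0.17.0'

Running `bundle update llama_cpp` then fetches 0.17.0 and rebuilds the native extension. The single full diff shown below is the removal of the vendored allocator source.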
data/vendor/tmp/llama.cpp/ggml-alloc.c DELETED
@@ -1,1041 +0,0 @@
1
- #include "ggml-alloc.h"
2
- #include "ggml-backend-impl.h"
3
- #include "ggml.h"
4
- #include "ggml-impl.h"
5
- #include <assert.h>
6
- #include <limits.h>
7
- #include <stdarg.h>
8
- #include <stdio.h>
9
- #include <stdlib.h>
10
- #include <string.h>
11
-
12
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
13
- #define MAX_FREE_BLOCKS 256
14
-
15
- //#define GGML_ALLOCATOR_DEBUG
16
-
17
- //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
18
- #define AT_PRINTF(...)
19
-
20
-
21
- static bool ggml_is_view(const struct ggml_tensor * t) {
22
- return t->view_src != NULL;
23
- }
24
-
25
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
26
- if (a->type != b->type) {
27
- return false;
28
- }
29
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
30
- if (a->ne[i] != b->ne[i]) {
31
- return false;
32
- }
33
- if (a->nb[i] != b->nb[i]) {
34
- return false;
35
- }
36
- }
37
- return true;
38
- }
39
-
40
- static bool ggml_op_can_inplace(enum ggml_op op) {
41
- switch (op) {
42
- case GGML_OP_SCALE:
43
- case GGML_OP_DIAG_MASK_ZERO:
44
- case GGML_OP_DIAG_MASK_INF:
45
- case GGML_OP_ADD:
46
- case GGML_OP_ADD1:
47
- case GGML_OP_SUB:
48
- case GGML_OP_MUL:
49
- case GGML_OP_DIV:
50
- case GGML_OP_SQR:
51
- case GGML_OP_SQRT:
52
- case GGML_OP_LOG:
53
- case GGML_OP_UNARY:
54
- case GGML_OP_ROPE:
55
- case GGML_OP_RMS_NORM:
56
- case GGML_OP_SOFT_MAX:
57
- return true;
58
-
59
- default:
60
- return false;
61
- }
62
- }
63
-
64
- static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
65
- assert(alignment && !(alignment & (alignment - 1))); // power of 2
66
- size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
67
- return offset + align;
68
- }
69
-
70
- // tallocr
71
-
72
- struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
73
- void * base = ggml_backend_buffer_get_base(buffer);
74
- size_t align = ggml_backend_buffer_get_alignment(buffer);
75
-
76
- assert(align && !(align & (align - 1))); // power of 2
77
-
78
- struct ggml_tallocr talloc = (struct ggml_tallocr) {
79
- /*.buffer = */ buffer,
80
- /*.base = */ base,
81
- /*.alignment = */ align,
82
- /*.offset = */ aligned_offset(base, 0, align),
83
- };
84
- return talloc;
85
- }
86
-
87
- void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
88
- size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
89
- size = GGML_PAD(size, talloc->alignment);
90
-
91
- if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
92
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
93
- __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
94
- GGML_ASSERT(!"not enough space in the buffer");
95
- return;
96
- }
97
-
98
- void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
99
- talloc->offset += size;
100
-
101
- assert(((uintptr_t)addr % talloc->alignment) == 0);
102
-
103
- ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
104
- }
105
-
106
- // dynamic tensor allocator
107
-
108
- struct free_block {
109
- size_t offset;
110
- size_t size;
111
- };
112
-
113
- struct ggml_dyn_tallocr {
114
- size_t alignment;
115
- int n_free_blocks;
116
- struct free_block free_blocks[MAX_FREE_BLOCKS];
117
- size_t max_size;
118
-
119
- #ifdef GGML_ALLOCATOR_DEBUG
120
- struct {
121
- const struct ggml_tensor * tensor;
122
- size_t offset;
123
- } allocated_tensors[1024];
124
- #endif
125
- };
126
-
127
- #ifdef GGML_ALLOCATOR_DEBUG
128
- static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
129
- for (int i = 0; i < 1024; i++) {
130
- if (alloc->allocated_tensors[i].tensor == NULL) {
131
- alloc->allocated_tensors[i].tensor = tensor;
132
- alloc->allocated_tensors[i].offset = offset;
133
- return;
134
- }
135
- }
136
- GGML_ASSERT(!"out of allocated_tensors");
137
- }
138
- static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
139
- for (int i = 0; i < 1024; i++) {
140
- if (alloc->allocated_tensors[i].offset == offset) {
141
- alloc->allocated_tensors[i].tensor = NULL;
142
- return;
143
- }
144
- }
145
- fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
146
- GGML_ASSERT(!"tensor not found");
147
- }
148
- #endif
149
-
150
- static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
151
- size = aligned_offset(NULL, size, alloc->alignment);
152
-
153
- AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
154
-
155
- size_t max_avail = 0;
156
-
157
- // find the best fitting free block besides the last block
158
- int best_fit_block = -1;
159
- size_t best_fit_size = SIZE_MAX;
160
- for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
161
- struct free_block * block = &alloc->free_blocks[i];
162
- max_avail = MAX(max_avail, block->size);
163
- if (block->size >= size && block->size <= best_fit_size) {
164
- best_fit_block = i;
165
- best_fit_size = block->size;
166
- }
167
- }
168
-
169
- if (best_fit_block == -1) {
170
- // the last block is our last resort
171
- struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
172
- max_avail = MAX(max_avail, block->size);
173
- if (block->size >= size) {
174
- best_fit_block = alloc->n_free_blocks - 1;
175
- } else {
176
- // this should never happen
177
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
178
- __func__, size, max_avail);
179
- GGML_ASSERT(!"not enough space in the buffer");
180
- GGML_UNREACHABLE();
181
- }
182
- }
183
-
184
- struct free_block * block = &alloc->free_blocks[best_fit_block];
185
- size_t offset = block->offset;
186
- block->offset = offset + size;
187
- block->size -= size;
188
- if (block->size == 0) {
189
- // remove block if empty
190
- alloc->n_free_blocks--;
191
- for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
192
- alloc->free_blocks[j] = alloc->free_blocks[j+1];
193
- }
194
- }
195
-
196
- AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
197
-
198
- #ifdef GGML_ALLOCATOR_DEBUG
199
- add_allocated_tensor(alloc, offset, tensor);
200
- size_t cur_max = offset + size;
201
- if (cur_max > alloc->max_size) {
202
- // sort allocated_tensors by offset
203
- for (int i = 0; i < 1024; i++) {
204
- for (int j = i + 1; j < 1024; j++) {
205
- if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
206
- const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
207
- size_t tmp_offset = alloc->allocated_tensors[i].offset;
208
- alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
209
- alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
210
- alloc->allocated_tensors[j].tensor = tmp_tensor;
211
- alloc->allocated_tensors[j].offset = tmp_offset;
212
- }
213
- }
214
- }
215
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
216
- for (int i = 0; i < 1024; i++) {
217
- if (alloc->allocated_tensors[i].tensor) {
218
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
219
- alloc->allocated_tensors[i].offset,
220
- alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
221
- ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
222
- }
223
- }
224
- fprintf(stderr, "\n");
225
- }
226
- #endif
227
-
228
- alloc->max_size = MAX(alloc->max_size, offset + size);
229
-
230
- return offset;
231
-
232
- GGML_UNUSED(tensor);
233
- }
234
-
235
- // this is a very naive implementation, but for our case the number of free blocks should be very small
236
- static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
237
- size = aligned_offset(NULL, size, alloc->alignment);
238
-
239
- AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
240
-
241
- #ifdef GGML_ALLOCATOR_DEBUG
242
- remove_allocated_tensor(alloc, offset, tensor);
243
- #endif
244
-
245
- // see if we can merge with an existing block
246
- for (int i = 0; i < alloc->n_free_blocks; i++) {
247
- struct free_block * block = &alloc->free_blocks[i];
248
- // check if ptr is at the end of the block
249
- if (block->offset + block->size == offset) {
250
- block->size += size;
251
- // check if we can merge with the next block
252
- if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
253
- block->size += alloc->free_blocks[i+1].size;
254
- alloc->n_free_blocks--;
255
- for (int j = i+1; j < alloc->n_free_blocks; j++) {
256
- alloc->free_blocks[j] = alloc->free_blocks[j+1];
257
- }
258
- }
259
- return;
260
- }
261
- // check if ptr is at the beginning of the block
262
- if (offset + size == block->offset) {
263
- block->offset = offset;
264
- block->size += size;
265
- // check if we can merge with the previous block
266
- if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
267
- alloc->free_blocks[i-1].size += block->size;
268
- alloc->n_free_blocks--;
269
- for (int j = i; j < alloc->n_free_blocks; j++) {
270
- alloc->free_blocks[j] = alloc->free_blocks[j+1];
271
- }
272
- }
273
- return;
274
- }
275
- }
276
- // otherwise, add a new block
277
- GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
278
- // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
279
- int insert_pos = 0;
280
- while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
281
- insert_pos++;
282
- }
283
- // shift all blocks from insert_pos onward to make room for the new block
284
- for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
285
- alloc->free_blocks[i] = alloc->free_blocks[i-1];
286
- }
287
- // insert the new block
288
- alloc->free_blocks[insert_pos].offset = offset;
289
- alloc->free_blocks[insert_pos].size = size;
290
- alloc->n_free_blocks++;
291
-
292
- GGML_UNUSED(tensor);
293
- }
294
-
295
- static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
296
- alloc->n_free_blocks = 1;
297
- alloc->free_blocks[0].offset = 0;
298
- alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
299
- alloc->max_size = 0;
300
- }
301
-
302
- static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
303
- struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
304
-
305
- *alloc = (struct ggml_dyn_tallocr) {
306
- /*.alignment = */ alignment,
307
- /*.n_free_blocks = */ 0,
308
- /*.free_blocks = */ {{0}},
309
- /*.max_size = */ 0,
310
- #ifdef GGML_ALLOCATOR_DEBUG
311
- /*.allocated_tensors = */ {{0}},
312
- #endif
313
- };
314
-
315
- ggml_dyn_tallocr_reset(alloc);
316
-
317
- return alloc;
318
- }
319
-
320
- static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
321
- free(alloc);
322
- }
323
-
324
- static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
325
- return alloc->max_size;
326
- }
327
-
328
-
329
- /////////////////////////////////////
330
-
331
- // graph allocator
332
-
333
- struct hash_node {
334
- int n_children;
335
- int n_views;
336
- int buffer_id;
337
- size_t offset; // offset within the buffer
338
- bool allocated;
339
- };
340
-
341
- struct tensor_alloc {
342
- int buffer_id;
343
- size_t offset;
344
- size_t size_max; // 0 = pre-allocated, unused, or view
345
- };
346
-
347
- struct leaf_alloc {
348
- int buffer_id;
349
- struct tensor_alloc leaf;
350
- };
351
-
352
- struct node_alloc {
353
- struct tensor_alloc dst;
354
- struct tensor_alloc src[GGML_MAX_SRC];
355
- };
356
-
357
- struct ggml_gallocr {
358
- ggml_backend_buffer_type_t * bufts; // [n_buffers]
359
- ggml_backend_buffer_t * buffers; // [n_buffers]
360
- struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
361
- int n_buffers;
362
-
363
- struct ggml_hash_set hash_set;
364
- struct hash_node * hash_values; // [hash_set.size]
365
-
366
- struct node_alloc * node_allocs; // [n_nodes]
367
- int n_nodes;
368
-
369
- struct leaf_alloc * leaf_allocs; // [n_leafs]
370
- int n_leafs;
371
- };
372
-
373
- ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
374
- ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
375
- GGML_ASSERT(galloc != NULL);
376
-
377
- galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
378
- GGML_ASSERT(galloc->bufts != NULL);
379
-
380
- galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
381
- GGML_ASSERT(galloc->buffers != NULL);
382
-
383
- galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
384
- GGML_ASSERT(galloc->buf_tallocs != NULL);
385
-
386
- for (int i = 0; i < n_bufs; i++) {
387
- galloc->bufts[i] = bufts[i];
388
- galloc->buffers[i] = NULL;
389
-
390
- // check if the same buffer type is used multiple times and reuse the same allocator
391
- for (int j = 0; j < i; j++) {
392
- if (bufts[i] == bufts[j]) {
393
- galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
394
- break;
395
- }
396
- }
397
-
398
- if (galloc->buf_tallocs[i] == NULL) {
399
- size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
400
- galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
401
- }
402
- }
403
- galloc->n_buffers = n_bufs;
404
-
405
- return galloc;
406
- }
407
-
408
- ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
409
- return ggml_gallocr_new_n(&buft, 1);
410
- }
411
-
412
- void ggml_gallocr_free(ggml_gallocr_t galloc) {
413
- if (galloc == NULL) {
414
- return;
415
- }
416
-
417
- for (int i = 0; i < galloc->n_buffers; i++) {
418
- if (galloc->buffers != NULL) {
419
- // skip if already freed
420
- bool freed = false;
421
- for (int j = 0; j < i; j++) {
422
- if (galloc->buffers[j] == galloc->buffers[i]) {
423
- freed = true;
424
- break;
425
- }
426
- }
427
- if (!freed) {
428
- ggml_backend_buffer_free(galloc->buffers[i]);
429
- }
430
- }
431
- if (galloc->buf_tallocs != NULL) {
432
- // skip if already freed
433
- bool freed = false;
434
- for (int j = 0; j < i; j++) {
435
- if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
436
- freed = true;
437
- break;
438
- }
439
- }
440
- if (!freed) {
441
- ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
442
- }
443
- }
444
- }
445
-
446
- free(galloc->hash_set.keys);
447
- free(galloc->hash_values);
448
- free(galloc->bufts);
449
- free(galloc->buffers);
450
- free(galloc->buf_tallocs);
451
- free(galloc->node_allocs);
452
- free(galloc->leaf_allocs);
453
- free(galloc);
454
- }
455
-
456
- typedef struct ggml_gallocr * ggml_gallocr_t;
457
-
458
- static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
459
- size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
460
- return &galloc->hash_values[i];
461
- }
462
-
463
- static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
464
- return ggml_gallocr_hash_get(galloc, t)->allocated;
465
- }
466
-
467
- static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
468
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
469
- hn->buffer_id = buffer_id;
470
- hn->offset = offset;
471
- hn->allocated = true;
472
- }
473
-
474
- static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
475
- return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
476
- }
477
-
478
- static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
479
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
480
-
481
- if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
482
- hn->allocated = true;
483
- assert(hn->offset == 0);
484
-
485
- // try to reuse a parent's buffer (inplace)
486
- if (ggml_op_can_inplace(node->op)) {
487
- for (int i = 0; i < GGML_MAX_SRC; i++) {
488
- struct ggml_tensor * parent = node->src[i];
489
- if (parent == NULL) {
490
- continue;
491
- }
492
-
493
- // if the node's data is external, then we cannot re-use it
494
- if (!ggml_gallocr_is_own(galloc, parent)) {
495
- AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
496
- continue;
497
- }
498
-
499
- // outputs cannot be reused
500
- if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
501
- AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
502
- continue;
503
- }
504
-
505
- if (!ggml_are_same_layout(node, parent)) {
506
- AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
507
- continue;
508
- }
509
-
510
- struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
511
- if (p_hn->n_children == 1 && p_hn->n_views == 0) {
512
- if (ggml_is_view(parent)) {
513
- struct ggml_tensor * view_src = parent->view_src;
514
- struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
515
- if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
516
- AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
517
- assert(view_src_hn->offset == p_hn->offset);
518
- hn->buffer_id = p_hn->buffer_id;
519
- hn->offset = p_hn->offset;
520
- p_hn->allocated = false; // avoid freeing the parent
521
- view_src_hn->allocated = false;
522
- return;
523
- }
524
- } else {
525
- AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
526
- hn->buffer_id = p_hn->buffer_id;
527
- hn->offset = p_hn->offset;
528
- p_hn->allocated = false; // avoid freeing the parent
529
- return;
530
- }
531
- }
532
- }
533
- }
534
- // allocate tensor from the buffer
535
- struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
536
- ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
537
- size_t size = ggml_backend_buft_get_alloc_size(buft, node);
538
- size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
539
- hn->buffer_id = buffer_id;
540
- hn->offset = offset;
541
- return;
542
- }
543
- }
544
-
545
- static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
546
- // graph outputs are never freed
547
- if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
548
- AT_PRINTF("not freeing output %s\n", node->name);
549
- return;
550
- }
551
-
552
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
553
- size_t offset = hn->offset;
554
- int buffer_id = hn->buffer_id;
555
- struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
556
- ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
557
- size_t size = ggml_backend_buft_get_alloc_size(buft, node);
558
- ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
559
- hn->allocated = false;
560
- }
561
-
562
- static int get_node_buffer_id(const int * node_buffer_ids, int i) {
563
- return node_buffer_ids ? node_buffer_ids[i] : 0;
564
- }
565
-
566
- static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
567
- // clear hash tables
568
- memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
569
- memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
570
-
571
- // allocate leafs
572
- // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
573
- for (int i = 0; i < graph->n_leafs; i++) {
574
- struct ggml_tensor * leaf = graph->leafs[i];
575
- ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
576
- }
577
-
578
- // count number of children and views
579
- // allocate other graph inputs and leafs first to avoid overwriting them
580
- for (int i = 0; i < graph->n_nodes; i++) {
581
- struct ggml_tensor * node = graph->nodes[i];
582
-
583
- // TODO: better way to add external dependencies
584
- // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
585
- // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
586
- // itself is never used and should not be considered a dependency
587
- if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
588
- struct ggml_tensor * view_src = node->view_src;
589
- ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
590
- }
591
-
592
- if (node->flags & GGML_TENSOR_FLAG_INPUT) {
593
- ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
594
- }
595
-
596
- for (int j = 0; j < GGML_MAX_SRC; j++) {
597
- struct ggml_tensor * src = node->src[j];
598
- if (src == NULL) {
599
- continue;
600
- }
601
-
602
- ggml_gallocr_hash_get(galloc, src)->n_children += 1;
603
-
604
- // allocate explicit inputs
605
- if (src->flags & GGML_TENSOR_FLAG_INPUT) {
606
- ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
607
- }
608
- }
609
- }
610
-
611
- // allocate tensors
612
- for (int i = 0; i < graph->n_nodes; i++) {
613
- struct ggml_tensor * node = graph->nodes[i];
614
- int buffer_id = get_node_buffer_id(node_buffer_ids, i);
615
-
616
- // allocate parents (only leafs need to be allocated at this point)
617
- for (int j = 0; j < GGML_MAX_SRC; j++) {
618
- struct ggml_tensor * parent = node->src[j];
619
- if (parent == NULL) {
620
- continue;
621
- }
622
- ggml_gallocr_allocate_node(galloc, parent, buffer_id);
623
- }
624
-
625
- // allocate node
626
- ggml_gallocr_allocate_node(galloc, node, buffer_id);
627
-
628
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
629
- for (int j = 0; j < GGML_MAX_SRC; j++) {
630
- struct ggml_tensor * parent = node->src[j];
631
- if (parent == NULL) {
632
- continue;
633
- }
634
- AT_PRINTF("%s", parent->name);
635
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
636
- AT_PRINTF(", ");
637
- }
638
- }
639
- AT_PRINTF("\n");
640
-
641
- // update parents
642
- for (int j = 0; j < GGML_MAX_SRC; j++) {
643
- struct ggml_tensor * parent = node->src[j];
644
- if (parent == NULL) {
645
- continue;
646
- }
647
- struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
648
- p_hn->n_children -= 1;
649
-
650
- AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
651
- parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
652
-
653
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
654
- if (ggml_is_view(parent)) {
655
- struct ggml_tensor * view_src = parent->view_src;
656
- struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
657
- view_src_hn->n_views -= 1;
658
- AT_PRINTF("view_src %s: %d children, %d views\n",
659
- view_src->name, view_src_hn->n_children, view_src_hn->n_views);
660
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
661
- ggml_gallocr_free_node(galloc, view_src);
662
- }
663
- }
664
- else if (p_hn->allocated) {
665
- ggml_gallocr_free_node(galloc, parent);
666
- }
667
- }
668
- AT_PRINTF("\n");
669
- }
670
- }
671
- }
672
-
673
- bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
674
- size_t hash_size = graph->visited_hash_table.size;
675
-
676
- // initialize hash table
677
- if (galloc->hash_set.size < hash_size) {
678
- free(galloc->hash_set.keys);
679
- free(galloc->hash_values);
680
- galloc->hash_set.size = hash_size;
681
- galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
682
- galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
683
- GGML_ASSERT(galloc->hash_set.keys != NULL);
684
- GGML_ASSERT(galloc->hash_values != NULL);
685
- } else {
686
- // reset hash table
687
- memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
688
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
689
- }
690
-
691
- // reset allocators
692
- for (int i = 0; i < galloc->n_buffers; i++) {
693
- ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
694
- }
695
-
696
- // allocate in hash table
697
- ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
698
-
699
- // set the node_allocs from the hash table
700
- if (galloc->n_nodes < graph->n_nodes) {
701
- free(galloc->node_allocs);
702
- galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
703
- GGML_ASSERT(galloc->node_allocs != NULL);
704
- }
705
- galloc->n_nodes = graph->n_nodes;
706
- for (int i = 0; i < graph->n_nodes; i++) {
707
- struct ggml_tensor * node = graph->nodes[i];
708
- struct node_alloc * node_alloc = &galloc->node_allocs[i];
709
- if (node->view_src || node->data) {
710
- node_alloc->dst.buffer_id = -1;
711
- node_alloc->dst.offset = SIZE_MAX;
712
- node_alloc->dst.size_max = 0;
713
- } else {
714
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
715
- node_alloc->dst.buffer_id = hn->buffer_id;
716
- node_alloc->dst.offset = hn->offset;
717
- node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
718
- }
719
- for (int j = 0; j < GGML_MAX_SRC; j++) {
720
- struct ggml_tensor * src = node->src[j];
721
- if (!src || src->view_src || src->data) {
722
- node_alloc->src[j].buffer_id = -1;
723
- node_alloc->src[j].offset = SIZE_MAX;
724
- node_alloc->src[j].size_max = 0;
725
- } else {
726
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
727
- node_alloc->src[j].buffer_id = hn->buffer_id;
728
- node_alloc->src[j].offset = hn->offset;
729
- node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
730
- }
731
- }
732
- }
733
- if (galloc->n_leafs < graph->n_leafs) {
734
- free(galloc->leaf_allocs);
735
- galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
736
- GGML_ASSERT(galloc->leaf_allocs != NULL);
737
- }
738
- galloc->n_leafs = graph->n_leafs;
739
- for (int i = 0; i < graph->n_leafs; i++) {
740
- struct ggml_tensor * leaf = graph->leafs[i];
741
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
742
- galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
743
- if (leaf->view_src || leaf->data) {
744
- galloc->leaf_allocs[i].leaf.buffer_id = -1;
745
- galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
746
- galloc->leaf_allocs[i].leaf.size_max = 0;
747
- } else {
748
- galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
749
- galloc->leaf_allocs[i].leaf.offset = hn->offset;
750
- galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
751
- }
752
- }
753
-
754
- // reallocate buffers if needed
755
- for (int i = 0; i < galloc->n_buffers; i++) {
756
- // if the buffer type is used multiple times, we reuse the same buffer
757
- for (int j = 0; j < i; j++) {
758
- if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
759
- galloc->buffers[i] = galloc->buffers[j];
760
- break;
761
- }
762
- }
763
-
764
- size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
765
- size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
766
-
767
- // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
768
- if (new_size > cur_size || galloc->buffers[i] == NULL) {
769
- #ifndef NDEBUG
770
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
771
- #endif
772
-
773
- ggml_backend_buffer_free(galloc->buffers[i]);
774
- galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
775
- if (galloc->buffers[i] == NULL) {
776
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
777
- return false;
778
- }
779
- }
780
- }
781
-
782
- return true;
783
- }
784
-
785
- bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
786
- return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
787
- }
788
-
789
- static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
790
- int buffer_id = tensor_alloc->buffer_id;
791
- assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
792
-
793
- if (tensor->view_src != NULL) {
794
- if (tensor->buffer == NULL) {
795
- assert(tensor_alloc->offset == SIZE_MAX);
796
- if (tensor->view_src->buffer == NULL) {
797
- // this tensor was allocated without ggml-backend
798
- return;
799
- }
800
- ggml_backend_view_init(tensor);
801
- }
802
- } else {
803
- if (tensor->data == NULL) {
804
- assert(tensor_alloc->offset != SIZE_MAX);
805
- assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
806
- void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
807
- void * addr = (char *)base + tensor_alloc->offset;
808
- ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
809
- } else {
810
- if (tensor->buffer == NULL) {
811
- // this tensor was allocated without ggml-backend
812
- return;
813
- }
814
- }
815
- }
816
- }
817
-
818
- static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
819
- ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
820
- size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
821
- return talloc->size_max >= node_size;
822
- }
823
-
824
- static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
825
- if (galloc->n_nodes != graph->n_nodes) {
826
- #ifndef NDEBUG
827
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
828
- #endif
829
- return true;
830
- }
831
-
832
- if (galloc->n_leafs != graph->n_leafs) {
833
- #ifndef NDEBUG
834
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
835
- #endif
836
- return true;
837
- }
838
-
839
- for (int i = 0; i < graph->n_nodes; i++) {
840
- struct ggml_tensor * node = graph->nodes[i];
841
- struct node_alloc * node_alloc = &galloc->node_allocs[i];
842
-
843
- if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
844
- #ifndef NDEBUG
845
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
846
- #endif
847
- return true;
848
- }
849
-
850
- for (int j = 0; j < GGML_MAX_SRC; j++) {
851
- struct ggml_tensor * src = node->src[j];
852
- if (src == NULL) {
853
- continue;
854
- }
855
- if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
856
- #ifndef NDEBUG
857
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
858
- #endif
859
- return true;
860
- }
861
- }
862
- }
863
-
864
- return false;
865
- }
866
-
867
- bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
868
- if (ggml_gallocr_needs_realloc(galloc, graph)) {
869
- if (galloc->n_buffers == 1) {
870
- #ifndef NDEBUG
871
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
872
- #endif
873
- if (!ggml_gallocr_reserve(galloc, graph)) {
874
- return false;
875
- }
876
- } else {
877
- #ifndef NDEBUG
878
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
879
- #endif
880
- return false;
881
- }
882
- }
883
-
884
- // reset buffers
885
- for (int i = 0; i < galloc->n_buffers; i++) {
886
- if (galloc->buffers[i] != NULL) {
887
- ggml_backend_buffer_reset(galloc->buffers[i]);
888
- }
889
- }
890
-
891
- // allocate the graph tensors from the previous assignments
892
- // leafs
893
- for (int i = 0; i < graph->n_leafs; i++) {
894
- struct ggml_tensor * leaf = graph->leafs[i];
895
- struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
896
- ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
897
- }
898
- // nodes
899
- for (int i = 0; i < graph->n_nodes; i++) {
900
- struct ggml_tensor * node = graph->nodes[i];
901
- struct node_alloc * node_alloc = &galloc->node_allocs[i];
902
- for (int j = 0; j < GGML_MAX_SRC; j++) {
903
- struct ggml_tensor * src = node->src[j];
904
- if (src == NULL) {
905
- continue;
906
- }
907
- ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
908
- }
909
- ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
910
- }
911
-
912
- return true;
913
- }
914
-
915
- size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
916
- GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
917
-
918
- if (galloc->buffers[buffer_id] == NULL) {
919
- return 0;
920
- }
921
-
922
- for (int i = 0; i < buffer_id; i++) {
923
- if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
924
- // this buffer is the same as a previous one due to the same buffer type being used multiple times
925
- // only return the buffer size the first time it appears to avoid double counting
926
- return 0;
927
- }
928
- }
929
-
930
- return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
931
- }
932
-
933
- // utils
934
-
935
- static bool alloc_tensor_range(struct ggml_context * ctx,
936
- struct ggml_tensor * first, struct ggml_tensor * last,
937
- ggml_backend_buffer_type_t buft, size_t size,
938
- ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
939
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
940
- if (buffer == NULL) {
941
- #ifndef NDEBUG
942
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
943
- #endif
944
- for (size_t i = 0; i < *n_buffers; i++) {
945
- ggml_backend_buffer_free((*buffers)[i]);
946
- }
947
- free(*buffers);
948
- return false;
949
- }
950
-
951
- struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
952
-
953
- for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
954
- if (t->data == NULL) {
955
- if (t->view_src == NULL) {
956
- ggml_tallocr_alloc(&tallocr, t);
957
- } else if (t->buffer == NULL) {
958
- ggml_backend_view_init(t);
959
- }
960
- } else {
961
- if (t->view_src != NULL && t->buffer == NULL) {
962
- // view of a pre-allocated tensor
963
- ggml_backend_view_init(t);
964
- }
965
- }
966
- }
967
-
968
- *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
969
- (*buffers)[(*n_buffers)++] = buffer;
970
-
971
- return true;
972
- }
973
-
974
- ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
975
- GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
976
-
977
- size_t alignment = ggml_backend_buft_get_alignment(buft);
978
- size_t max_size = ggml_backend_buft_get_max_size(buft);
979
-
980
- ggml_backend_buffer_t * buffers = NULL;
981
- size_t n_buffers = 0;
982
-
983
- size_t cur_buf_size = 0;
984
- struct ggml_tensor * first = ggml_get_first_tensor(ctx);
985
- for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
986
- size_t this_size = 0;
987
- if (t->data == NULL && t->view_src == NULL) {
988
- this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
989
- }
990
-
991
- if (this_size > max_size) {
992
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
993
- __func__, t->name,
994
- ggml_backend_buft_name(buft),
995
- this_size, max_size);
996
- for (size_t i = 0; i < n_buffers; i++) {
997
- ggml_backend_buffer_free(buffers[i]);
998
- }
999
- free(buffers);
1000
- return NULL;
1001
- }
1002
-
1003
- if ((cur_buf_size + this_size) > max_size) {
1004
- // allocate tensors in the current buffer
1005
- if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
1006
- return NULL;
1007
- }
1008
- first = t;
1009
- cur_buf_size = this_size;
1010
- } else {
1011
- cur_buf_size += this_size;
1012
- }
1013
- }
1014
-
1015
- // allocate remaining tensors
1016
- if (cur_buf_size > 0) {
1017
- if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
1018
- return NULL;
1019
- }
1020
- }
1021
-
1022
- if (n_buffers == 0) {
1023
- #ifndef NDEBUG
1024
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
1025
- #endif
1026
- return NULL;
1027
- }
1028
-
1029
- ggml_backend_buffer_t buffer;
1030
- if (n_buffers == 1) {
1031
- buffer = buffers[0];
1032
- } else {
1033
- buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
1034
- }
1035
- free(buffers);
1036
- return buffer;
1037
- }
1038
-
1039
- ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
1040
- return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
1041
- }