@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
@@ -1,9 +1,5 @@
1
- // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
2
1
  #pragma once
3
2
 
4
- #define GGML_COMMON_DECL_C
5
- #include "ggml-common.h"
6
-
7
3
  #include "ggml.h"
8
4
 
9
5
  // GGML internal header
@@ -12,27 +8,11 @@
12
8
  extern "C" {
13
9
  #endif
14
10
 
15
- // Quantization
16
- void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
17
- void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
18
-
19
- void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
20
-
21
11
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
22
12
  size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
23
13
  size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
24
14
  size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
25
15
 
26
- // GEMV
27
- void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
28
- void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
29
- void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
30
-
31
- // GEMM
32
- void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
33
- void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
34
- void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
35
-
36
16
  #ifdef __cplusplus
37
17
  }
38
18
  #endif
@@ -14,7 +14,7 @@
14
14
 
15
15
  //#define GGML_ALLOCATOR_DEBUG
16
16
 
17
- //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
17
+ //#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
18
18
  #define AT_PRINTF(...)
19
19
 
20
20
 
@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
89
89
  size = GGML_PAD(size, talloc->alignment);
90
90
 
91
91
  if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
92
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
92
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
93
93
  __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
94
94
  GGML_ABORT("not enough space in the buffer");
95
95
  }
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
172
172
  best_fit_block = alloc->n_free_blocks - 1;
173
173
  } else {
174
174
  // this should never happen
175
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
175
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
176
176
  __func__, size, max_avail);
177
177
  GGML_ABORT("not enough space in the buffer");
178
178
  }
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
209
209
  }
210
210
  }
211
211
  }
212
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
212
+ GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
213
213
  for (int i = 0; i < 1024; i++) {
214
214
  if (alloc->allocated_tensors[i].tensor) {
215
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
215
+ GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
216
216
  alloc->allocated_tensors[i].offset,
217
217
  alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
218
218
  ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
219
219
  }
220
220
  }
221
- fprintf(stderr, "\n");
221
+ GGML_LOG_DEBUG("\n");
222
222
  }
223
223
  #endif
224
224
 
@@ -348,7 +348,6 @@ struct tensor_alloc {
348
348
  };
349
349
 
350
350
  struct leaf_alloc {
351
- int buffer_id;
352
351
  struct tensor_alloc leaf;
353
352
  };
354
353
 
@@ -467,18 +466,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
467
466
  return ggml_gallocr_hash_get(galloc, t)->allocated;
468
467
  }
469
468
 
470
- static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
471
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
472
- hn->buffer_id = buffer_id;
473
- hn->offset = offset;
474
- hn->allocated = true;
475
- }
476
-
477
469
  static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
478
470
  return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
479
471
  }
480
472
 
481
473
  static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
474
+ GGML_ASSERT(buffer_id >= 0);
482
475
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
483
476
 
484
477
  if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@@ -740,7 +733,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
740
733
  for (int i = 0; i < graph->n_leafs; i++) {
741
734
  struct ggml_tensor * leaf = graph->leafs[i];
742
735
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
743
- galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
744
736
  if (leaf->view_src || leaf->data) {
745
737
  galloc->leaf_allocs[i].leaf.buffer_id = -1;
746
738
  galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@@ -768,13 +760,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
768
760
  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
769
761
  if (new_size > cur_size || galloc->buffers[i] == NULL) {
770
762
  #ifndef NDEBUG
771
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
763
+ GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
772
764
  #endif
773
765
 
774
766
  ggml_backend_buffer_free(galloc->buffers[i]);
775
767
  galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
776
768
  if (galloc->buffers[i] == NULL) {
777
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
769
+ GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
778
770
  return false;
779
771
  }
780
772
  ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -818,21 +810,25 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
818
810
  }
819
811
 
820
812
  static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
821
- size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
813
+ size_t node_size = 0;
814
+ if (!node->data && !node->view_src) {
815
+ GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
816
+ node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
817
+ }
822
818
  return talloc->size_max >= node_size;
823
819
  }
824
820
 
825
821
  static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
826
822
  if (galloc->n_nodes != graph->n_nodes) {
827
823
  #ifndef NDEBUG
828
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
824
+ GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
829
825
  #endif
830
826
  return true;
831
827
  }
832
828
 
833
829
  if (galloc->n_leafs != graph->n_leafs) {
834
830
  #ifndef NDEBUG
835
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
831
+ GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
836
832
  #endif
837
833
  return true;
838
834
  }
@@ -843,7 +839,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
843
839
 
844
840
  if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
845
841
  #ifndef NDEBUG
846
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
842
+ GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
847
843
  #endif
848
844
  return true;
849
845
  }
@@ -855,7 +851,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
855
851
  }
856
852
  if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
857
853
  #ifndef NDEBUG
858
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
854
+ GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
859
855
  #endif
860
856
  return true;
861
857
  }
@@ -869,14 +865,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
869
865
  if (ggml_gallocr_needs_realloc(galloc, graph)) {
870
866
  if (galloc->n_buffers == 1) {
871
867
  #ifndef NDEBUG
872
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
868
+ GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
873
869
  #endif
874
870
  if (!ggml_gallocr_reserve(galloc, graph)) {
875
871
  return false;
876
872
  }
877
873
  } else {
878
874
  #ifndef NDEBUG
879
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
875
+ GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
880
876
  #endif
881
877
  return false;
882
878
  }
@@ -940,7 +936,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
940
936
  ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
941
937
  if (buffer == NULL) {
942
938
  #ifndef NDEBUG
943
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
939
+ GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
944
940
  #endif
945
941
  for (size_t i = 0; i < *n_buffers; i++) {
946
942
  ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +986,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
990
986
  }
991
987
 
992
988
  if (this_size > max_size) {
993
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
989
+ GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
994
990
  __func__, t->name,
995
991
  ggml_backend_buft_name(buft),
996
992
  this_size, max_size);
@@ -1022,7 +1018,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
1022
1018
 
1023
1019
  if (n_buffers == 0) {
1024
1020
  #ifndef NDEBUG
1025
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
1021
+ GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
1026
1022
  #endif
1027
1023
  return NULL;
1028
1024
  }
@@ -0,0 +1,107 @@
1
+ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
2
+ (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
3
+ CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
4
+ CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
5
+ message(STATUS "Using AMX")
6
+
7
+ file(GLOB GGML_HEADERS_AMX "*.h")
8
+ list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
9
+
10
+ file(GLOB GGML_SOURCES_AMX "*.cpp")
11
+
12
+ add_library(ggml-amx
13
+ ${GGML_HEADERS_AMX}
14
+ ${GGML_SOURCES_AMX})
15
+
16
+ target_link_libraries(ggml-amx PRIVATE ggml-base)
17
+ target_include_directories(ggml-amx PRIVATE . ..)
18
+
19
+ # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
20
+ # TODO: integrate AMX backend into the CPU backend
21
+ if (MSVC)
22
+ # instruction set detection for MSVC only
23
+ if (GGML_NATIVE)
24
+ # TODO: improve, should not reference files from the parent folder
25
+ include(../ggml-cpu/cmake/FindSIMD.cmake)
26
+ endif ()
27
+ if (GGML_AVX512)
28
+ list(APPEND ARCH_FLAGS /arch:AVX512)
29
+ # MSVC has no compile-time flags enabling specific
30
+ # AVX512 extensions, neither it defines the
31
+ # macros corresponding to the extensions.
32
+ # Do it manually.
33
+ if (GGML_AVX512_VBMI)
34
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
35
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
36
+ endif()
37
+ if (GGML_AVX512_VNNI)
38
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
39
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
40
+ endif()
41
+ if (GGML_AVX512_BF16)
42
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
43
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
44
+ endif()
45
+ if (GGML_AMX_TILE)
46
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
47
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
48
+ endif()
49
+ if (GGML_AMX_INT8)
50
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
51
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
52
+ endif()
53
+ if (GGML_AMX_BF16)
54
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
55
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
56
+ endif()
57
+ elseif (GGML_AVX2)
58
+ list(APPEND ARCH_FLAGS /arch:AVX2)
59
+ elseif (GGML_AVX)
60
+ list(APPEND ARCH_FLAGS /arch:AVX)
61
+ endif()
62
+ else()
63
+ if (GGML_NATIVE)
64
+ list(APPEND ARCH_FLAGS -march=native)
65
+ endif()
66
+ if (GGML_F16C)
67
+ list(APPEND ARCH_FLAGS -mf16c)
68
+ endif()
69
+ if (GGML_FMA)
70
+ list(APPEND ARCH_FLAGS -mfma)
71
+ endif()
72
+ if (GGML_AVX)
73
+ list(APPEND ARCH_FLAGS -mavx)
74
+ endif()
75
+ if (GGML_AVX2)
76
+ list(APPEND ARCH_FLAGS -mavx2)
77
+ endif()
78
+ if (GGML_AVX512)
79
+ list(APPEND ARCH_FLAGS -mavx512f)
80
+ list(APPEND ARCH_FLAGS -mavx512dq)
81
+ list(APPEND ARCH_FLAGS -mavx512bw)
82
+ endif()
83
+ if (GGML_AVX512_VBMI)
84
+ list(APPEND ARCH_FLAGS -mavx512vbmi)
85
+ endif()
86
+ if (GGML_AVX512_VNNI)
87
+ list(APPEND ARCH_FLAGS -mavx512vnni)
88
+ endif()
89
+ if (GGML_AVX512_BF16)
90
+ list(APPEND ARCH_FLAGS -mavx512bf16)
91
+ endif()
92
+ if (GGML_AMX_TILE)
93
+ list(APPEND ARCH_FLAGS -mamx-tile)
94
+ endif()
95
+ if (GGML_AMX_INT8)
96
+ list(APPEND ARCH_FLAGS -mamx-int8)
97
+ endif()
98
+ if (GGML_AMX_BF16)
99
+ list(APPEND ARCH_FLAGS -mamx-bf16)
100
+ endif()
101
+ endif()
102
+
103
+ target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
104
+ else()
105
+ set(GGML_AMX OFF PARENT_SCOPE)
106
+ message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
107
+ endif()
@@ -0,0 +1,94 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ // hack until AMX is moved into the CPU backend
5
+ #include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
6
+
7
+ #include <algorithm>
8
+ #include <memory>
9
+ #include <type_traits>
10
+
11
+ #if defined(_OPENMP)
12
+ #include <omp.h>
13
+ #endif
14
+
15
+ #define TILE_M 16
16
+ #define TILE_N 16
17
+ #define TILE_K 32
18
+ #define VNNI_BLK 4
19
+
20
+ #define AMX_BLK_SIZE 32
21
+
22
+ #define TMM0 0
23
+ #define TMM1 1
24
+ #define TMM2 2
25
+ #define TMM3 3
26
+ #define TMM4 4
27
+ #define TMM5 5
28
+ #define TMM6 6
29
+ #define TMM7 7
30
+
31
+ // parallel routines
32
+ template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
33
+ inline T div_up(T x, T y) { return (x + y - 1) / y; }
34
+
35
+ template <typename T>
36
+ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
37
+ #if 0
38
+ // onednn partition pattern
39
+ T& n_my = n_end;
40
+ if (nth <= 1 || n == 0) {
41
+ n_start = 0;
42
+ n_my = n;
43
+ } else {
44
+ T n1 = div_up(n, nth);
45
+ T n2 = n1 - 1;
46
+ T T1 = n - n2 * nth;
47
+ n_my = ith < T1 ? n1 : n2;
48
+ n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
49
+ }
50
+ n_end += n_start;
51
+ #else
52
+ // pytorch aten partition pattern
53
+ T n_my = div_up(n, nth);
54
+ n_start = ith * n_my;
55
+ n_end = std::min(n_start + n_my, n);
56
+ #endif
57
+ }
58
+
59
+ template <typename func_t>
60
+ inline void parallel_for(int nth, int n, const func_t& f) {
61
+ #if defined(_OPENMP)
62
+ #pragma omp parallel num_threads(nth)
63
+ {
64
+ //int nth = omp_get_num_threads();
65
+ int ith = omp_get_thread_num();
66
+ int tbegin, tend;
67
+ balance211(n, nth, ith, tbegin, tend);
68
+ f(tbegin, tend);
69
+ }
70
+ #else
71
+ f(0, n);
72
+
73
+ GGML_UNUSED(nth);
74
+ #endif
75
+ }
76
+
77
+ // quantized types that have AMX support
78
+ inline bool qtype_has_amx_kernels(const enum ggml_type type) {
79
+ // TODO: fix padding for vnni format
80
+ return (type == GGML_TYPE_Q4_0) ||
81
+ (type == GGML_TYPE_Q4_1);
82
+ //(type == GGML_TYPE_Q8_0) ||
83
+ //(type == GGML_TYPE_Q4_K) ||
84
+ //(type == GGML_TYPE_Q5_K) ||
85
+ //(type == GGML_TYPE_Q6_K) ||
86
+ //(type == GGML_TYPE_IQ4_XS);
87
+ }
88
+
89
+ // ggml backend context
90
+ struct ggml_backend_amx_context {
91
+ int n_threads = GGML_DEFAULT_N_THREADS;
92
+ std::unique_ptr<char[]> work_data;
93
+ size_t work_size = 0;
94
+ };