cui-llama.rn 1.4.4 → 1.5.0

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (216)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +54 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1085
  14. package/cpp/chat.h +143 -0
  15. package/cpp/common.cpp +1562 -1996
  16. package/cpp/common.h +677 -744
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-alloc.c +1039 -1030
  19. package/cpp/ggml-alloc.h +1 -1
  20. package/cpp/ggml-backend-impl.h +255 -255
  21. package/cpp/ggml-backend-reg.cpp +586 -582
  22. package/cpp/ggml-backend.cpp +2004 -2002
  23. package/cpp/ggml-backend.h +354 -354
  24. package/cpp/ggml-common.h +1857 -1851
  25. package/cpp/ggml-cpp.h +39 -39
  26. package/cpp/ggml-cpu-aarch64.cpp +5725 -4247
  27. package/cpp/ggml-cpu-aarch64.h +8 -8
  28. package/cpp/ggml-cpu-impl.h +512 -380
  29. package/cpp/ggml-cpu-quants.c +13026 -11517
  30. package/cpp/ggml-cpu-traits.cpp +36 -36
  31. package/cpp/ggml-cpu-traits.h +38 -38
  32. package/cpp/ggml-cpu.c +3438 -14485
  33. package/cpp/ggml-cpu.cpp +655 -633
  34. package/cpp/ggml-cpu.h +138 -135
  35. package/cpp/ggml-impl.h +594 -567
  36. package/cpp/ggml-metal-impl.h +312 -3
  37. package/cpp/ggml-metal.h +66 -66
  38. package/cpp/ggml-metal.m +5360 -5002
  39. package/cpp/ggml-opt.cpp +854 -854
  40. package/cpp/ggml-opt.h +216 -216
  41. package/cpp/ggml-quants.c +5238 -5238
  42. package/cpp/ggml-threading.h +14 -14
  43. package/cpp/ggml.c +6618 -6524
  44. package/cpp/ggml.h +2222 -2194
  45. package/cpp/gguf.cpp +1330 -1329
  46. package/cpp/gguf.h +202 -202
  47. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  48. package/cpp/json-schema-to-grammar.h +21 -22
  49. package/cpp/json.hpp +24766 -24766
  50. package/cpp/llama-adapter.cpp +382 -347
  51. package/cpp/llama-adapter.h +76 -74
  52. package/cpp/llama-arch.cpp +1714 -1492
  53. package/cpp/llama-arch.h +428 -402
  54. package/cpp/llama-batch.cpp +368 -368
  55. package/cpp/llama-batch.h +88 -88
  56. package/cpp/llama-chat.cpp +640 -587
  57. package/cpp/llama-chat.h +56 -53
  58. package/cpp/llama-context.cpp +2831 -1775
  59. package/cpp/llama-context.h +265 -128
  60. package/cpp/llama-cparams.cpp +1 -1
  61. package/cpp/llama-cparams.h +38 -37
  62. package/cpp/llama-cpp.h +30 -30
  63. package/cpp/llama-grammar.cpp +1219 -1219
  64. package/cpp/llama-grammar.h +173 -164
  65. package/cpp/llama-graph.cpp +1695 -0
  66. package/cpp/llama-graph.h +592 -0
  67. package/cpp/llama-hparams.cpp +79 -71
  68. package/cpp/llama-hparams.h +156 -139
  69. package/cpp/llama-impl.cpp +167 -167
  70. package/cpp/llama-impl.h +61 -61
  71. package/cpp/llama-io.cpp +15 -0
  72. package/cpp/llama-io.h +35 -0
  73. package/cpp/llama-kv-cache.cpp +1380 -718
  74. package/cpp/llama-kv-cache.h +213 -218
  75. package/cpp/llama-memory.cpp +1 -0
  76. package/cpp/llama-memory.h +21 -0
  77. package/cpp/llama-mmap.cpp +600 -590
  78. package/cpp/llama-mmap.h +68 -68
  79. package/cpp/llama-model-loader.cpp +1129 -1124
  80. package/cpp/llama-model-loader.h +169 -167
  81. package/cpp/llama-model.cpp +13080 -4023
  82. package/cpp/llama-model.h +409 -370
  83. package/cpp/llama-sampling.cpp +2563 -2525
  84. package/cpp/llama-sampling.h +32 -32
  85. package/cpp/llama-vocab.cpp +3295 -3252
  86. package/cpp/llama-vocab.h +125 -125
  87. package/cpp/llama.cpp +351 -10137
  88. package/cpp/llama.h +1434 -1340
  89. package/cpp/log.cpp +427 -423
  90. package/cpp/log.h +132 -132
  91. package/cpp/{chat-template.hpp → minja/chat-template.hpp} +537 -529
  92. package/cpp/{minja.hpp → minja/minja.hpp} +2941 -2883
  93. package/cpp/ops.cpp +8723 -0
  94. package/cpp/ops.h +128 -0
  95. package/cpp/rn-llama.cpp +45 -71
  96. package/cpp/rn-llama.h +3 -3
  97. package/cpp/sampling.cpp +573 -532
  98. package/cpp/sgemm.cpp +3043 -2598
  99. package/cpp/sgemm.h +14 -14
  100. package/cpp/simd-mappings.h +888 -0
  101. package/cpp/speculative.cpp +278 -277
  102. package/cpp/speculative.h +28 -28
  103. package/cpp/unary-ops.cpp +186 -0
  104. package/cpp/unary-ops.h +28 -0
  105. package/cpp/vec.cpp +258 -0
  106. package/cpp/vec.h +802 -0
  107. package/ios/CMakeLists.txt +5 -2
  108. package/ios/RNLlama.mm +2 -2
  109. package/ios/RNLlamaContext.mm +40 -24
  110. package/package.json +1 -1
  111. package/src/NativeRNLlama.ts +6 -4
  112. package/src/index.ts +3 -1
  113. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  114. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  115. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  116. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  117. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  118. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  119. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  120. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  122. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  124. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  125. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  126. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  127. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  128. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  129. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  130. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  131. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  132. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  133. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  134. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  135. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  136. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  194. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  195. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  196. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  197. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  198. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  199. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  200. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  201. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  202. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  203. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  204. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  205. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  206. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  207. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  208. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  209. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  210. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  211. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  212. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  213. package/android/src/main/build-arm64/Makefile +0 -1862
  214. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  215. package/cpp/chat.hpp +0 -55
  216. package/cpp/rn-llama.hpp +0 -913
package/cpp/llama-model-loader.cpp
@@ -1,1124 +1,1129 @@
1
- #include "llama-model-loader.h"
2
-
3
- #include "ggml.h"
4
-
5
- #include <array>
6
- #include <cinttypes>
7
- #include <cstring>
8
- #include <future>
9
-
10
- static const size_t kiB = 1024;
11
- static const size_t MiB = 1024*kiB;
12
- static const size_t GiB = 1024*MiB;
13
-
14
- const char * llama_file_version_name(llama_fver version) {
15
- switch (version) {
16
- case LM_GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
17
- case LM_GGUF_FILE_VERSION_V2: return "GGUF V2";
18
- case LM_GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
19
- }
20
-
21
- return "unknown";
22
- }
23
-
24
- static std::string llama_model_ftype_name(llama_ftype ftype) {
25
- if (ftype & LLAMA_FTYPE_GUESSED) {
26
- return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
27
- }
28
-
29
- switch (ftype) {
30
- case LLAMA_FTYPE_ALL_F32: return "all F32";
31
- case LLAMA_FTYPE_MOSTLY_F16: return "F16";
32
- case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
33
- case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
34
- case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
35
- case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
36
- case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
37
- case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
38
- case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
39
- case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
40
- case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
41
- case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
42
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
43
- case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
44
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
45
- case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
46
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
47
- case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
48
- case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
49
- case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
50
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
51
- case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
52
- case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
53
- case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
54
- case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
55
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
56
- case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
57
- case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
58
- case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
59
- case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
60
- case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
61
- case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
62
-
63
- default: return "unknown, may not work";
64
- }
65
- }
66
-
67
- // return a list of splits for a given path
68
- // for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
69
- static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
70
- std::vector<std::string> paths;
71
- std::string split_prefix;
72
- std::vector<char> buf(llama_path_max(), 0);
73
-
74
- {
75
- int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
76
- if (!ret) {
77
- throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
78
- }
79
- split_prefix = std::string(buf.data(), ret);
80
- }
81
-
82
- if (split_prefix.empty()) {
83
- throw std::runtime_error(format("invalid split file: %s", path.c_str()));
84
- }
85
-
86
- for (int idx = 0; idx < n_split; ++idx) {
87
- int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
88
- paths.push_back(std::string(buf.data(), ret));
89
- }
90
-
91
- return paths;
92
- }
93
-
94
- namespace GGUFMeta {
95
- template <typename T, lm_gguf_type gt_, T (*gfun)(const lm_gguf_context *, const int64_t)>
96
- struct GKV_Base_Type {
97
- static constexpr lm_gguf_type gt = gt_;
98
-
99
- static T getter(const lm_gguf_context * ctx, const int kid) {
100
- return gfun(ctx, kid);
101
- }
102
- };
103
-
104
- template<typename T> struct GKV_Base;
105
-
106
- template<> struct GKV_Base<bool >: GKV_Base_Type<bool, LM_GGUF_TYPE_BOOL, lm_gguf_get_val_bool> {};
107
- template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, LM_GGUF_TYPE_UINT8, lm_gguf_get_val_u8 > {};
108
- template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, LM_GGUF_TYPE_UINT16, lm_gguf_get_val_u16 > {};
109
- template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, LM_GGUF_TYPE_UINT32, lm_gguf_get_val_u32 > {};
110
- template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, LM_GGUF_TYPE_UINT64, lm_gguf_get_val_u64 > {};
111
- template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, LM_GGUF_TYPE_INT8, lm_gguf_get_val_i8 > {};
112
- template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, LM_GGUF_TYPE_INT16, lm_gguf_get_val_i16 > {};
113
- template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, LM_GGUF_TYPE_INT32, lm_gguf_get_val_i32 > {};
114
- template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, LM_GGUF_TYPE_INT64, lm_gguf_get_val_i64 > {};
115
- template<> struct GKV_Base<float >: GKV_Base_Type<float, LM_GGUF_TYPE_FLOAT32, lm_gguf_get_val_f32 > {};
116
- template<> struct GKV_Base<double >: GKV_Base_Type<double, LM_GGUF_TYPE_FLOAT64, lm_gguf_get_val_f64 > {};
117
- template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, LM_GGUF_TYPE_STRING, lm_gguf_get_val_str > {};
118
-
119
- template<> struct GKV_Base<std::string> {
120
- static constexpr lm_gguf_type gt = LM_GGUF_TYPE_STRING;
121
-
122
- static std::string getter(const lm_gguf_context * ctx, const int kid) {
123
- return lm_gguf_get_val_str(ctx, kid);
124
- }
125
- };
126
-
127
- struct ArrayInfo {
128
- const lm_gguf_type gt;
129
- const size_t length;
130
- const void * data;
131
- };
132
-
133
- template<> struct GKV_Base<ArrayInfo> {
134
- public:
135
- static constexpr lm_gguf_type gt = LM_GGUF_TYPE_ARRAY;
136
- static ArrayInfo getter(const lm_gguf_context *ctx, const int k) {
137
- const enum lm_gguf_type arr_type = lm_gguf_get_arr_type(ctx, k);
138
- return ArrayInfo {
139
- arr_type,
140
- size_t(lm_gguf_get_arr_n(ctx, k)),
141
- arr_type == LM_GGUF_TYPE_STRING ? nullptr : lm_gguf_get_arr_data(ctx, k),
142
- };
143
- }
144
- };
145
-
146
- template<typename T>
147
- class GKV : public GKV_Base<T> {
148
- GKV() = delete;
149
-
150
- public:
151
- static T get_kv(const lm_gguf_context * ctx, const int k) {
152
- const enum lm_gguf_type kt = lm_gguf_get_kv_type(ctx, k);
153
-
154
- if (kt != GKV::gt) {
155
- throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
156
- lm_gguf_get_key(ctx, k), lm_gguf_type_name(kt), lm_gguf_type_name(GKV::gt)));
157
- }
158
- return GKV::getter(ctx, k);
159
- }
160
-
161
- static const char * override_type_to_str(const llama_model_kv_override_type ty) {
162
- switch (ty) {
163
- case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
164
- case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
165
- case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
166
- case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
167
- }
168
- return "unknown";
169
- }
170
-
171
- static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
172
- if (!ovrd) { return false; }
173
- if (ovrd->tag == expected_type) {
174
- LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
175
- __func__, override_type_to_str(ovrd->tag), ovrd->key);
176
- switch (ovrd->tag) {
177
- case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
178
- LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
179
- } break;
180
- case LLAMA_KV_OVERRIDE_TYPE_INT: {
181
- LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
182
- } break;
183
- case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
184
- LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
185
- } break;
186
- case LLAMA_KV_OVERRIDE_TYPE_STR: {
187
- LLAMA_LOG_INFO("%s\n", ovrd->val_str);
188
- } break;
189
- default:
190
- // Shouldn't be possible to end up here, but just in case...
191
- throw std::runtime_error(
192
- format("Unsupported attempt to override %s type for metadata key %s\n",
193
- override_type_to_str(ovrd->tag), ovrd->key));
194
- }
195
- return true;
196
- }
197
- LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
198
- __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
199
- return false;
200
- }
201
-
202
- template<typename OT>
203
- static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
204
- try_override(OT & target, const struct llama_model_kv_override * ovrd) {
205
- if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
206
- target = ovrd->val_bool;
207
- return true;
208
- }
209
- return false;
210
- }
211
-
212
- template<typename OT>
213
- static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
214
- try_override(OT & target, const struct llama_model_kv_override * ovrd) {
215
- if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
216
- target = ovrd->val_i64;
217
- return true;
218
- }
219
- return false;
220
- }
221
-
222
- template<typename OT>
223
- static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
224
- try_override(T & target, const struct llama_model_kv_override * ovrd) {
225
- if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
226
- target = ovrd->val_f64;
227
- return true;
228
- }
229
- return false;
230
- }
231
-
232
- template<typename OT>
233
- static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
234
- try_override(T & target, const struct llama_model_kv_override * ovrd) {
235
- if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
236
- target = ovrd->val_str;
237
- return true;
238
- }
239
- return false;
240
- }
241
-
242
- static bool set(const lm_gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
243
- if (try_override<T>(target, ovrd)) {
244
- return true;
245
- }
246
- if (k < 0) { return false; }
247
- target = get_kv(ctx, k);
248
- return true;
249
- }
250
-
251
- static bool set(const lm_gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
252
- return set(ctx, lm_gguf_find_key(ctx, key), target, ovrd);
253
- }
254
-
255
- static bool set(const lm_gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
256
- return set(ctx, key.c_str(), target, ovrd);
257
- }
258
- };
259
- }
260
-
261
- template<typename T>
262
- typename std::enable_if<std::is_integral<T>::value, bool>::type
263
- llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
264
- const int kid = lm_gguf_find_key(meta.get(), key.c_str());
265
-
266
- if (kid < 0) {
267
- if (required) {
268
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
269
- }
270
- return false;
271
- }
272
-
273
- struct GGUFMeta::ArrayInfo arr_info =
274
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
275
-
276
-
277
- result = arr_info.length;
278
- return true;
279
- }
280
-
281
- template<typename T>
282
- typename std::enable_if<std::is_integral<T>::value, bool>::type
283
- llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) {
284
- return get_arr_n(llm_kv(kid), result, required);
285
- }
286
-
287
- template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required);
288
-
289
- template<typename T>
290
- bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
291
- const int kid = lm_gguf_find_key(meta.get(), key.c_str());
292
-
293
- if (kid < 0 || lm_gguf_get_kv_type(meta.get(), kid) != LM_GGUF_TYPE_ARRAY) {
294
- if (required) {
295
- throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
296
- }
297
- return false;
298
- }
299
-
300
- struct GGUFMeta::ArrayInfo arr_info =
301
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
302
-
303
- switch (arr_info.gt) {
304
- case LM_GGUF_TYPE_FLOAT32: LM_GGML_ASSERT((std::is_same<T, float>::value)); break;
305
- case LM_GGUF_TYPE_INT32: LM_GGML_ASSERT(
306
- (std::is_same<T, int32_t>::value) ||
307
- (std::is_same<T, uint32_t>::value)); break;
308
- default:
309
- throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
310
- }
311
-
312
- result.resize(arr_info.length);
313
- result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
314
-
315
- return true;
316
- }
317
-
318
- template<typename T, size_t N_MAX>
319
- bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
320
- const int kid = lm_gguf_find_key(meta.get(), key.c_str());
321
-
322
- if (kid < 0 || lm_gguf_get_kv_type(meta.get(), kid) != LM_GGUF_TYPE_ARRAY) {
323
- if (required) {
324
- throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
325
- }
326
- return false;
327
- }
328
-
329
- struct GGUFMeta::ArrayInfo arr_info =
330
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
331
-
332
- switch (arr_info.gt) {
333
- case LM_GGUF_TYPE_FLOAT32: LM_GGML_ASSERT((std::is_same<T, float>::value)); break;
334
- case LM_GGUF_TYPE_INT32: LM_GGML_ASSERT(
335
- (std::is_same<T, int32_t>::value) ||
336
- (std::is_same<T, uint32_t>::value)); break;
337
- default:
338
- throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
339
- }
340
-
341
- if (arr_info.length > N_MAX) {
342
- throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
343
- }
344
-
345
- std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
346
-
347
- return true;
348
- }
349
-
350
- template<typename T>
351
- bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) {
352
- return get_arr(llm_kv(kid), result, required);
353
- }
354
-
355
- template<typename T>
356
- bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
357
- auto it = kv_overrides.find(key);
358
-
359
- const struct llama_model_kv_override * override =
360
- it != kv_overrides.end() ? &it->second : nullptr;
361
-
362
- const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
363
-
364
- if (required && !found) {
365
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
366
- }
367
-
368
- return found;
369
- }
370
-
371
- template<typename T>
372
- bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) {
373
- return get_key(llm_kv(kid), result, required);
374
- }
375
-
376
- template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required);
377
- template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required);
378
- template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required);
379
- template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
380
-
381
- template<>
382
- bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) {
383
- uint32_t tmp;
384
- const bool found = get_key(kid, tmp, required);
385
- if (found) {
386
- result = (enum llama_pooling_type) tmp;
387
- } else {
388
- result = LLAMA_POOLING_TYPE_UNSPECIFIED;
389
- }
390
- return found;
391
- }
392
-
393
- // get array of n <= N_MAX elements, or a single element repeated n times
394
- template<typename T, size_t N_MAX>
395
- bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
396
- const int kid = lm_gguf_find_key(meta.get(), key.c_str());
397
-
398
- if (kid < 0) {
399
- if (required) {
400
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
401
- }
402
- return false;
403
- }
404
-
405
- if (n > N_MAX) {
406
- throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
407
- }
408
-
409
- if (lm_gguf_get_kv_type(meta.get(), kid) == LM_GGUF_TYPE_ARRAY) {
410
- struct GGUFMeta::ArrayInfo arr_info =
411
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
412
-
413
- if (n != arr_info.length) {
414
- throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
415
- }
416
-
417
- return get_arr(key, result, required);
418
- }
419
-
420
- T value;
421
-
422
- bool ok = get_key(key, value, required);
423
- if (!ok) {
424
- return false;
425
- }
426
-
427
- for (uint32_t i = 0; i < n; i++) {
428
- result[i] = value;
429
- }
430
-
431
- return true;
432
- }
433
-
434
- template<typename T>
435
- bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) {
436
- return get_key_or_arr(llm_kv(kid), result, n, required);
437
- }
438
-
439
- // TODO: this is not very clever - figure out something better
440
- template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
441
- template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
442
-
443
- llama_model_loader::llama_model_loader(
444
- const std::string & fname,
445
- std::vector<std::string> & splits,
446
- bool use_mmap,
447
- bool check_tensors,
448
- const struct llama_model_kv_override * param_overrides_p) {
449
- int trace = 0;
450
- if (getenv("LLAMA_TRACE")) {
451
- trace = atoi(getenv("LLAMA_TRACE"));
452
- }
453
-
454
- if (param_overrides_p != nullptr) {
455
- for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
456
- kv_overrides.insert({std::string(p->key), *p});
457
- }
458
- }
459
-
460
- // Load the main GGUF
461
- struct lm_ggml_context * ctx = NULL;
462
- struct lm_gguf_init_params params = {
463
- /*.no_alloc = */ true,
464
- /*.ctx = */ &ctx,
465
- };
466
-
467
- meta.reset(lm_gguf_init_from_file(fname.c_str(), params));
468
- if (!meta) {
469
- throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
470
- }
471
-
472
- get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
473
- llm_kv = LLM_KV(llm_arch_from_string(arch_name));
474
-
475
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
476
- contexts.emplace_back(ctx);
477
-
478
- // Save tensors data offset of the main file.
479
- // For subsidiary files, `meta` tensor data offset must not be used,
480
- // so we build a unified tensors index for weights.
481
- for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur; cur = lm_ggml_get_next_tensor(ctx, cur)) {
482
- std::string tensor_name = std::string(cur->name);
483
- // make sure there is no duplicated tensor names
484
- if (weights_map.find(tensor_name) != weights_map.end()) {
485
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", lm_ggml_get_name(cur)));
486
- }
487
- n_elements += lm_ggml_nelements(cur);
488
- n_bytes += lm_ggml_nbytes(cur);
489
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
490
- }
491
- uint16_t n_split = 0;
492
- get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
493
-
494
- // Load additional GGML contexts
495
- if (n_split > 1) {
496
- // make sure the main file is loaded first
497
- uint16_t idx = 0;
498
- const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
499
- get_key(kv_split_no, idx);
500
- if (idx != 0) {
501
- throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
502
- }
503
-
504
- // generate list of splits if needed
505
- if (splits.empty()) {
506
- splits = llama_get_list_splits(fname, idx, n_split);
507
- }
508
-
509
- // in case user give a custom list of splits, check if it matches the expected number
510
- if (n_split != (uint16_t)splits.size()) {
511
- throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
512
- }
513
-
514
- if (trace > 0) {
515
- LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
516
- }
517
-
518
- // load other splits
519
- for (idx = 1; idx < n_split; idx++) {
520
- const char * fname_split = splits[idx].c_str();
521
-
522
- struct lm_gguf_init_params split_params = {
523
- /*.no_alloc = */ true,
524
- /*.ctx = */ &ctx,
525
- };
526
- lm_gguf_context_ptr ctx_gguf { lm_gguf_init_from_file(fname_split, split_params) };
527
- if (!ctx_gguf) {
528
- throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
529
- }
530
-
531
- // check idx
532
- {
533
- const int kid = lm_gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
534
- if (kid < 0) {
535
- throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
536
- }
537
- int idx_gguf = lm_gguf_get_val_u16(ctx_gguf.get(), kid);
538
- if (idx_gguf != idx) {
539
- throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
540
- }
541
- }
542
-
543
- files.emplace_back(new llama_file(fname_split, "rb"));
544
- contexts.emplace_back(ctx);
545
-
546
- // Save tensors data offset info of the shard.
547
- for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur; cur = lm_ggml_get_next_tensor(ctx, cur)) {
548
- std::string tensor_name = std::string(cur->name);
549
- // make sure there is no duplicated tensor names
550
- if (weights_map.find(tensor_name) != weights_map.end()) {
551
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", lm_ggml_get_name(cur)));
552
- }
553
- n_elements += lm_ggml_nelements(cur);
554
- n_bytes += lm_ggml_nbytes(cur);
555
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
556
- }
557
- }
558
-
559
- get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
560
-
561
- // sanity check
562
- {
563
- const int n_tensors_loaded = (int) weights_map.size();
564
- if (n_tensors != n_tensors_loaded) {
565
- throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
566
- }
567
- }
568
-
569
- LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
570
- }
571
-
572
- n_kv = lm_gguf_get_n_kv(meta.get());
573
- n_tensors = weights_map.size();
574
-
575
- fver = (enum llama_fver) lm_gguf_get_version(meta.get());
576
-
577
- LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
578
- __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
579
-
580
- // determine file type based on the number of tensors for each quantization and print meta data
581
- // TODO: make optional
582
- {
583
- std::map<enum lm_ggml_type, uint32_t> n_type;
584
-
585
- uint32_t n_type_max = 0;
586
- enum lm_ggml_type type_max = LM_GGML_TYPE_F32;
587
-
588
- for (const auto & it : weights_map) {
589
- const llama_tensor_weight & w = it.second;
590
- const lm_ggml_tensor * tensor = w.tensor;
591
-
592
- enum lm_ggml_type type = tensor->type;
593
-
594
- n_type[type]++;
595
-
596
- if (n_type_max < n_type[type]) {
597
- n_type_max = n_type[type];
598
- type_max = type;
599
- }
600
-
601
- if (trace > 0) {
602
- const uint16_t sid = w.idx;
603
- LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, lm_ggml_get_name(tensor), lm_ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
604
- }
605
- }
606
-
607
- switch (type_max) {
608
- case LM_GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
609
- case LM_GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
610
- case LM_GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
611
- case LM_GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
612
- case LM_GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
613
- case LM_GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
614
- case LM_GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
615
- case LM_GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
616
- case LM_GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
617
- case LM_GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
618
- case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
619
- case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
620
- case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
621
- case LM_GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
622
- case LM_GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
623
- case LM_GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
624
- case LM_GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
625
- case LM_GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
626
- case LM_GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
627
- case LM_GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
628
- case LM_GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
629
- case LM_GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
630
- case LM_GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
631
- case LM_GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
632
- default:
633
- {
634
- LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max));
635
- ftype = LLAMA_FTYPE_ALL_F32;
636
- } break;
637
- }
638
-
639
- // this is a way to mark that we have "guessed" the file type
640
- ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
641
-
642
- {
643
- const int kid = lm_gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV
644
- if (kid >= 0) {
645
- ftype = (llama_ftype) lm_gguf_get_val_u32(meta.get(), kid);
646
- }
647
- }
648
-
649
- LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
650
-
651
- for (int i = 0; i < n_kv; i++) {
652
- const char * name = lm_gguf_get_key(meta.get(), i);
653
- const enum lm_gguf_type type = lm_gguf_get_kv_type(meta.get(), i);
654
- const std::string type_name =
655
- type == LM_GGUF_TYPE_ARRAY
656
- ? format("%s[%s,%zu]", lm_gguf_type_name(type), lm_gguf_type_name(lm_gguf_get_arr_type(meta.get(), i)), lm_gguf_get_arr_n(meta.get(), i))
657
- : lm_gguf_type_name(type);
658
-
659
- std::string value = lm_gguf_kv_to_str(meta.get(), i);
660
- const size_t MAX_VALUE_LEN = 40;
661
- if (value.size() > MAX_VALUE_LEN) {
662
- value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
663
- }
664
- replace_all(value, "\n", "\\n");
665
-
666
- LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
667
- }
668
-
669
- // print type counts
670
- for (auto & kv : n_type) {
671
- if (kv.second == 0) {
672
- continue;
673
- }
674
-
675
- LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, lm_ggml_type_name(kv.first), kv.second);
676
- }
677
- }
678
-
679
- if (!llama_mmap::SUPPORTED) {
680
- LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
681
- use_mmap = false;
682
- }
683
-
684
- this->use_mmap = use_mmap;
685
- this->check_tensors = check_tensors;
686
- }
687
-
688
- std::string llama_model_loader::get_arch_name() const {
689
- return arch_name;
690
- }
691
-
692
- enum llm_arch llama_model_loader::get_arch() const {
693
- return llm_kv.arch;
694
- }
695
-
696
- const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const {
697
- auto pos = weights_map.find(name);
698
- if (pos != weights_map.end()) {
699
- return &pos->second;
700
- }
701
-
702
- return nullptr;
703
- }
704
-
705
- const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const {
706
- const llama_tensor_weight * weight = get_weight(name);
707
- if (!weight) {
708
- throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
709
- }
710
- return *weight;
711
- }
712
-
713
- struct lm_ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const {
714
- const auto * weight = get_weight(name);
715
- if (!weight) {
716
- return nullptr;
717
- }
718
- return weight->tensor;
719
- }
720
-
721
- struct lm_ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const {
722
- struct lm_ggml_tensor * tensor = get_tensor_meta(name.c_str());
723
- if (!tensor) {
724
- throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
725
- }
726
- return tensor;
727
- }
728
-
729
- const struct lm_ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
730
- const struct lm_ggml_tensor * cur = get_tensor_meta(name.c_str());
731
-
732
- if (cur == NULL) {
733
- if (!required) {
734
- return NULL;
735
- }
736
- throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
737
- }
738
-
739
- {
740
- bool is_ok = true;
741
- for (size_t i = 0; i < LM_GGML_MAX_DIMS; ++i) {
742
- if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
743
- is_ok = false;
744
- break;
745
- }
746
- }
747
- if (!is_ok) {
748
- throw std::runtime_error(
749
- format("%s: tensor '%s' has wrong shape; expected %s, got %s",
750
- __func__, name.c_str(),
751
- llama_format_tensor_shape(ne).c_str(),
752
- llama_format_tensor_shape(cur).c_str()));
753
- }
754
- }
755
-
756
- return cur;
757
- }
758
-
759
- struct lm_ggml_tensor * llama_model_loader::create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
760
- const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
761
-
762
- if (cur == NULL) {
763
- return NULL;
764
- }
765
-
766
- bool duplicated = flags & TENSOR_DUPLICATED;
767
-
768
- struct lm_ggml_tensor * tensor = lm_ggml_dup_tensor(ctx, cur);
769
- lm_ggml_set_name(tensor, lm_ggml_get_name(cur));
770
-
771
- if (duplicated) {
772
- size_data += lm_ggml_nbytes(cur);
773
- } else {
774
- n_created++;
775
- }
776
-
777
- return tensor;
778
-
779
- }
780
-
781
- struct lm_ggml_tensor * llama_model_loader::create_tensor_as_view(struct lm_ggml_context * ctx, struct lm_ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
782
- const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, required);
783
-
784
- if (cur == NULL) {
785
- return NULL;
786
- }
787
-
788
- if (cur->type != base->type) {
789
- throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), lm_ggml_type_name(base->type), lm_ggml_type_name(cur->type)));
790
- }
791
-
792
- std::array<int64_t, LM_GGML_MAX_DIMS> dims;
793
- for (size_t i = 0; i < LM_GGML_MAX_DIMS; ++i) {
794
- dims[i] = i < ne.size() ? ne.begin()[i] : 1;
795
- }
796
-
797
- struct lm_ggml_tensor * tensor = lm_ggml_view_4d(ctx, base,
798
- dims[0], dims[1], dims[2], dims[3],
799
- cur->nb[1], cur->nb[2], cur->nb[3],
800
- offset);
801
-
802
- lm_ggml_set_name(tensor, name.c_str());
803
-
804
- n_created++;
805
-
806
- return tensor;
807
- }
808
-
809
- void llama_model_loader::done_getting_tensors() const {
810
- if (n_created != n_tensors) {
811
- throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
812
- }
813
- }
814
-
815
- void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
816
- if (use_mmap) {
817
- mappings.reserve(files.size());
818
- mmaps_used.reserve(files.size());
819
- for (const auto & file : files) {
820
- auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU));
821
- auto * is_numa_fn = (decltype(lm_ggml_is_numa) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_is_numa");
822
- std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
823
- mmaps_used.emplace_back(mapping->size(), 0);
824
- if (mlock_mmaps) {
825
- std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
826
- mlock_mmap->init(mapping->addr());
827
- mlock_mmaps->emplace_back(std::move(mlock_mmap));
828
- }
829
- mappings.emplace_back(std::move(mapping));
830
- }
831
- }
832
-
833
- // compute the total size of all tensors for progress reporting
834
- for (const auto & it : weights_map) {
835
- size_data += lm_ggml_nbytes(it.second.tensor);
836
- }
837
- }
838
-
839
- void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, lm_ggml_context * ctx) const {
840
- LM_GGML_ASSERT(!mappings.empty());
841
- const auto & mapping = mappings.at(idx);
842
-
843
- *first = mapping->size();
844
- *last = 0;
845
- *addr = mapping->addr();
846
- for (lm_ggml_tensor * tensor = lm_ggml_get_first_tensor(ctx); tensor; tensor = lm_ggml_get_next_tensor(ctx, tensor)) {
847
- const auto * weight = get_weight(lm_ggml_get_name(tensor));
848
- if (!weight || weight->idx != idx) {
849
- continue;
850
- }
851
- *first = std::min(*first, weight->offs);
852
- *last = std::max(*last, weight->offs + lm_ggml_nbytes(tensor));
853
- }
854
- }
855
-
856
- void llama_model_loader::load_data_for(struct lm_ggml_tensor * cur) const {
857
- const auto & w = require_weight(lm_ggml_get_name(cur));
858
-
859
- if (use_mmap) {
860
- const auto & mapping = mappings.at(w.idx);
861
- if (cur->data == nullptr) {
862
- cur->data = (uint8_t *)mapping->addr() + w.offs;
863
- } else {
864
- memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, lm_ggml_nbytes(cur));
865
- }
866
- } else {
867
- LM_GGML_ASSERT(cur->data != nullptr);
868
- LM_GGML_ASSERT(w.idx < files.size());
869
- const auto & file = files.at(w.idx);
870
- file->seek(w.offs, SEEK_SET);
871
- file->read_raw(cur->data, lm_ggml_nbytes(cur));
872
- }
873
-
874
- if (check_tensors && !lm_ggml_validate_row_data(cur->type, cur->data, lm_ggml_nbytes(cur))) {
875
- throw std::runtime_error(format("tensor '%s' has invalid data", lm_ggml_get_name(cur)));
876
- }
877
- }
878
-
879
- bool llama_model_loader::load_all_data(
880
- struct lm_ggml_context * ctx,
881
- llama_buf_map & bufs,
882
- llama_mlocks * lmlocks,
883
- llama_progress_callback progress_callback,
884
- void * progress_callback_user_data) {
885
- LM_GGML_ASSERT(size_data != 0 && "call init_mappings() first");
886
-
887
- std::vector<no_init<uint8_t>> read_buf;
888
- std::vector<std::future<std::pair<lm_ggml_tensor *, bool>>> validation_result;
889
-
890
- // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
891
- // NVMe raid configurations might require more / larger buffers.
892
- constexpr size_t n_buffers = 4;
893
- constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
894
-
895
- std::vector<lm_ggml_backend_buffer_t> host_buffers;
896
- std::vector<lm_ggml_backend_event_t> events;
897
- std::vector<void *> host_ptrs;
898
- size_t buffer_idx = 0; // buffer to use for async loads
899
- lm_ggml_backend_t upload_backend = [&](const char * func) -> lm_ggml_backend_t {
900
- if (use_mmap || check_tensors) {
901
- return nullptr;
902
- }
903
- // When not using mmaped io use async uploads from pinned memory to GPU memory.
904
- // First determine if the backend supports the necessary features for async uploads.
905
- auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
906
- if (!buf) {
907
- LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func);
908
- return nullptr;
909
- }
910
-
911
- auto * buft = lm_ggml_backend_buffer_get_type(buf);
912
- auto * dev = lm_ggml_backend_buft_get_device(buft);
913
- if (!dev) {
914
- LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func,
915
- lm_ggml_backend_buft_name(buft));
916
- return nullptr;
917
- }
918
-
919
- if (buft != lm_ggml_backend_dev_buffer_type(dev)) {
920
- LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func,
921
- lm_ggml_backend_buft_name(buft), lm_ggml_backend_dev_name(dev));
922
- return nullptr;
923
- }
924
-
925
- lm_ggml_backend_dev_props props;
926
- lm_ggml_backend_dev_get_props(dev, &props);
927
- if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
928
- LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func,
929
- lm_ggml_backend_dev_name(dev));
930
- return nullptr;
931
- }
932
-
933
- auto * host_buft = lm_ggml_backend_dev_host_buffer_type(dev);
934
- if (!host_buft) {
935
- LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func,
936
- lm_ggml_backend_dev_name(dev));
937
- return nullptr;
938
- }
939
-
940
- // If the backend is supported, create pinned memory buffers and events for synchronisation.
941
- for (size_t idx = 0; idx < n_buffers; ++idx) {
942
- auto * buf = lm_ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
943
- if (!buf) {
944
- LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
945
- lm_ggml_backend_dev_name(dev));
946
- return nullptr;
947
- }
948
-
949
- host_buffers.emplace_back(buf);
950
- host_ptrs.emplace_back(lm_ggml_backend_buffer_get_base(buf));
951
-
952
- auto * event = lm_ggml_backend_event_new(dev);
953
- if (!event) {
954
- LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func,
955
- lm_ggml_backend_dev_name(dev));
956
- return nullptr;
957
- }
958
-
959
- events.emplace_back(event);
960
- }
961
-
962
- lm_ggml_backend_t backend = lm_ggml_backend_dev_init(dev, nullptr);
963
- if (!backend) {
964
- LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func,
965
- lm_ggml_backend_dev_name(dev));
966
- return nullptr;
967
- }
968
-
969
- return backend;
970
- }(__func__);
971
-
972
- if (upload_backend) {
973
- LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
974
- lm_ggml_backend_dev_name(lm_ggml_backend_get_device(upload_backend)),
975
- lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(bufs.at(0))),
976
- lm_ggml_backend_name(upload_backend));
977
- }
978
-
979
- for (struct lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur != NULL; cur = lm_ggml_get_next_tensor(ctx, cur)) {
980
- const auto * weight = get_weight(lm_ggml_get_name(cur));
981
- if (weight == nullptr) {
982
- // this can happen with split experts models
983
- continue;
984
- }
985
-
986
- if (progress_callback) {
987
- if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
988
- return false;
989
- }
990
- }
991
-
992
- size_t n_size = lm_ggml_nbytes(cur);
993
-
994
- if (use_mmap) {
995
- const auto & mapping = mappings.at(weight->idx);
996
- lm_ggml_backend_buffer_t buf_mmap = nullptr;
997
- if (bufs.count(weight->idx)) {
998
- buf_mmap = bufs.at(weight->idx);
999
- }
1000
- uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
1001
-
1002
- if (check_tensors) {
1003
- validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
1004
- return std::make_pair(cur, lm_ggml_validate_row_data(cur->type, data, n_size));
1005
- }));
1006
- }
1007
-
1008
- LM_GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
1009
- if (buf_mmap && cur->data == nullptr) {
1010
- lm_ggml_backend_tensor_alloc(buf_mmap, cur, data);
1011
- if (lmlocks) {
1012
- const auto & lmlock = lmlocks->at(weight->idx);
1013
- lmlock->grow_to(weight->offs + n_size);
1014
- }
1015
-
1016
- auto & mmap_used = mmaps_used[weight->idx];
1017
- mmap_used.first = std::min(mmap_used.first, weight->offs);
1018
- mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
1019
- } else {
1020
- lm_ggml_backend_tensor_set(cur, data, 0, n_size);
1021
- }
1022
- } else {
1023
- const auto & file = files.at(weight->idx);
1024
- if (lm_ggml_backend_buffer_is_host(cur->buffer)) {
1025
- file->seek(weight->offs, SEEK_SET);
1026
- file->read_raw(cur->data, n_size);
1027
- if (check_tensors) {
1028
- validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
1029
- return std::make_pair(cur, lm_ggml_validate_row_data(cur->type, cur->data, n_size));
1030
- }));
1031
- }
1032
- } else {
1033
- // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
1034
- if (upload_backend) {
1035
- file->seek(weight->offs, SEEK_SET);
1036
-
1037
- size_t bytes_read = 0;
1038
-
1039
- while (bytes_read < n_size) {
1040
- size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
1041
-
1042
- lm_ggml_backend_event_synchronize(events[buffer_idx]);
1043
- file->read_raw(host_ptrs[buffer_idx], read_iteration);
1044
- lm_ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
1045
- lm_ggml_backend_event_record(events[buffer_idx], upload_backend);
1046
-
1047
- bytes_read += read_iteration;
1048
- ++buffer_idx;
1049
- buffer_idx %= n_buffers;
1050
- }
1051
- } else {
1052
- read_buf.resize(n_size);
1053
- file->seek(weight->offs, SEEK_SET);
1054
- file->read_raw(read_buf.data(), n_size);
1055
- lm_ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
1056
- if (check_tensors && !lm_ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
1057
- throw std::runtime_error(format("tensor '%s' has invalid data", lm_ggml_get_name(cur)));
1058
- }
1059
- }
1060
- }
1061
- }
1062
-
1063
- size_done += n_size;
1064
- }
1065
-
1066
- // free temporary resources used for async uploads
1067
- for (auto * event : events) {
1068
- lm_ggml_backend_event_synchronize(event);
1069
- lm_ggml_backend_event_free(event);
1070
- }
1071
- for (auto * buf : host_buffers) {
1072
- lm_ggml_backend_buffer_free(buf);
1073
- }
1074
- lm_ggml_backend_free(upload_backend);
1075
-
1076
- // check validation results
1077
- bool validation_failed = false;
1078
- for (auto & future : validation_result) {
1079
- auto result = future.get();
1080
- if (!result.second) {
1081
- LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, lm_ggml_get_name(result.first));
1082
- validation_failed = true;
1083
- }
1084
- }
1085
- if (validation_failed) {
1086
- throw std::runtime_error("found tensors with invalid data");
1087
- }
1088
-
1089
- // check if this is the last call and do final cleanup
1090
- if (size_done >= size_data) {
1091
- // unmap offloaded tensors and metadata
1092
- if (use_mmap) {
1093
- for (uint32_t idx = 0; idx < mappings.size(); idx++) {
1094
- const auto & mmap_used = mmaps_used.at(idx);
1095
- auto & mapping = mappings.at(idx);
1096
- mapping->unmap_fragment(0, mmap_used.first);
1097
- if (mmap_used.second != 0) {
1098
- mapping->unmap_fragment(mmap_used.second, mapping->size());
1099
- }
1100
- }
1101
- }
1102
- if (progress_callback) {
1103
- // Even though the model is done loading, we still honor
1104
- // cancellation since we need to free allocations.
1105
- return progress_callback(1.0f, progress_callback_user_data);
1106
- }
1107
- }
1108
-
1109
- return true;
1110
- }
1111
-
1112
- std::string llama_model_loader::ftype_name() const {
1113
- return llama_model_ftype_name(ftype);
1114
- }
1115
-
1116
- void llama_model_loader::print_info() const {
1117
- LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
1118
- LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
1119
- if (n_bytes < GiB) {
1120
- LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
1121
- } else {
1122
- LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
1123
- }
1124
- }
1
+ #include "llama-model-loader.h"
2
+
3
+ #include "ggml.h"
4
+
5
+ #include <array>
6
+ #include <cinttypes>
7
+ #include <cstring>
8
+ #include <future>
9
+
10
+ static const size_t kiB = 1024;
11
+ static const size_t MiB = 1024*kiB;
12
+ static const size_t GiB = 1024*MiB;
13
+
14
+ const char * llama_file_version_name(llama_fver version) {
15
+ switch (version) {
16
+ case LM_GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
17
+ case LM_GGUF_FILE_VERSION_V2: return "GGUF V2";
18
+ case LM_GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
19
+ }
20
+
21
+ return "unknown";
22
+ }
23
+
24
+ static std::string llama_model_ftype_name(llama_ftype ftype) {
25
+ if (ftype & LLAMA_FTYPE_GUESSED) {
26
+ return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
27
+ }
28
+
29
+ switch (ftype) {
30
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
31
+ case LLAMA_FTYPE_MOSTLY_F16: return "F16";
32
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
33
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
34
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
35
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
36
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
37
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
38
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
39
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
40
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
41
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
42
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
43
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
44
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
45
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
46
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
47
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
48
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
49
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
50
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
51
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
52
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
53
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
54
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
55
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
56
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
57
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
58
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
59
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
60
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
61
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
62
+
63
+ default: return "unknown, may not work";
64
+ }
65
+ }
66
+
67
+ // return a list of splits for a given path
68
+ // for example, given "<name>-00002-of-00004.gguf", returns the paths of all 4 splits
69
+ static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
70
+ std::vector<std::string> paths;
71
+ std::string split_prefix;
72
+ std::vector<char> buf(llama_path_max(), 0);
73
+
74
+ {
75
+ int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
76
+ if (!ret) {
77
+ throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
78
+ }
79
+ split_prefix = std::string(buf.data(), ret);
80
+ }
81
+
82
+ if (split_prefix.empty()) {
83
+ throw std::runtime_error(format("invalid split file: %s", path.c_str()));
84
+ }
85
+
86
+ for (int idx = 0; idx < n_split; ++idx) {
87
+ int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
88
+ paths.push_back(std::string(buf.data(), ret));
89
+ }
90
+
91
+ return paths;
92
+ }
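For reference, a minimal sketch of the naming round-trip that llama_get_list_splits() above relies on. It assumes the llama.h helpers llama_split_prefix() and llama_split_path() used by the function; the file name, buffer sizes and the example_list_splits name are illustrative only.

    // Sketch only: recover all four split paths from the second split's file name.
    #include "llama.h"   // assumed: declares llama_split_prefix / llama_split_path
    #include <cstdio>

    static void example_list_splits() {
        char prefix[512];
        char path[512];
        const char * split = "my-model-00002-of-00004.gguf"; // illustrative name
        // split_no is 0-based, so the second split is index 1; returns the prefix length, 0 on mismatch
        const int n = llama_split_prefix(prefix, sizeof(prefix), split, /*split_no=*/1, /*split_count=*/4);
        if (n > 0) {
            for (int i = 0; i < 4; ++i) {
                // regenerates "my-model-0000X-of-00004.gguf" for X = 1..4
                llama_split_path(path, sizeof(path), prefix, i, 4);
                printf("%s\n", path);
            }
        }
    }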
93
+
94
+ namespace GGUFMeta {
95
+ template <typename T, lm_gguf_type gt_, T (*gfun)(const lm_gguf_context *, const int64_t)>
96
+ struct GKV_Base_Type {
97
+ static constexpr lm_gguf_type gt = gt_;
98
+
99
+ static T getter(const lm_gguf_context * ctx, const int kid) {
100
+ return gfun(ctx, kid);
101
+ }
102
+ };
103
+
104
+ template<typename T> struct GKV_Base;
105
+
106
+ template<> struct GKV_Base<bool >: GKV_Base_Type<bool, LM_GGUF_TYPE_BOOL, lm_gguf_get_val_bool> {};
107
+ template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, LM_GGUF_TYPE_UINT8, lm_gguf_get_val_u8 > {};
108
+ template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, LM_GGUF_TYPE_UINT16, lm_gguf_get_val_u16 > {};
109
+ template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, LM_GGUF_TYPE_UINT32, lm_gguf_get_val_u32 > {};
110
+ template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, LM_GGUF_TYPE_UINT64, lm_gguf_get_val_u64 > {};
111
+ template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, LM_GGUF_TYPE_INT8, lm_gguf_get_val_i8 > {};
112
+ template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, LM_GGUF_TYPE_INT16, lm_gguf_get_val_i16 > {};
113
+ template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, LM_GGUF_TYPE_INT32, lm_gguf_get_val_i32 > {};
114
+ template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, LM_GGUF_TYPE_INT64, lm_gguf_get_val_i64 > {};
115
+ template<> struct GKV_Base<float >: GKV_Base_Type<float, LM_GGUF_TYPE_FLOAT32, lm_gguf_get_val_f32 > {};
116
+ template<> struct GKV_Base<double >: GKV_Base_Type<double, LM_GGUF_TYPE_FLOAT64, lm_gguf_get_val_f64 > {};
117
+ template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, LM_GGUF_TYPE_STRING, lm_gguf_get_val_str > {};
118
+
119
+ template<> struct GKV_Base<std::string> {
120
+ static constexpr lm_gguf_type gt = LM_GGUF_TYPE_STRING;
121
+
122
+ static std::string getter(const lm_gguf_context * ctx, const int kid) {
123
+ return lm_gguf_get_val_str(ctx, kid);
124
+ }
125
+ };
126
+
127
+ struct ArrayInfo {
128
+ const lm_gguf_type gt;
129
+ const size_t length;
130
+ const void * data;
131
+ };
132
+
133
+ template<> struct GKV_Base<ArrayInfo> {
134
+ public:
135
+ static constexpr lm_gguf_type gt = LM_GGUF_TYPE_ARRAY;
136
+ static ArrayInfo getter(const lm_gguf_context *ctx, const int k) {
137
+ const enum lm_gguf_type arr_type = lm_gguf_get_arr_type(ctx, k);
138
+ return ArrayInfo {
139
+ arr_type,
140
+ size_t(lm_gguf_get_arr_n(ctx, k)),
141
+ arr_type == LM_GGUF_TYPE_STRING ? nullptr : lm_gguf_get_arr_data(ctx, k),
142
+ };
143
+ }
144
+ };
145
+
146
+ template<typename T>
147
+ class GKV : public GKV_Base<T> {
148
+ GKV() = delete;
149
+
150
+ public:
151
+ static T get_kv(const lm_gguf_context * ctx, const int k) {
152
+ const enum lm_gguf_type kt = lm_gguf_get_kv_type(ctx, k);
153
+
154
+ if (kt != GKV::gt) {
155
+ throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
156
+ lm_gguf_get_key(ctx, k), lm_gguf_type_name(kt), lm_gguf_type_name(GKV::gt)));
157
+ }
158
+ return GKV::getter(ctx, k);
159
+ }
160
+
161
+ static const char * override_type_to_str(const llama_model_kv_override_type ty) {
162
+ switch (ty) {
163
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
164
+ case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
165
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
166
+ case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
167
+ }
168
+ return "unknown";
169
+ }
170
+
171
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
172
+ if (!ovrd) { return false; }
173
+ if (ovrd->tag == expected_type) {
174
+ LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
175
+ __func__, override_type_to_str(ovrd->tag), ovrd->key);
176
+ switch (ovrd->tag) {
177
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
178
+ LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
179
+ } break;
180
+ case LLAMA_KV_OVERRIDE_TYPE_INT: {
181
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
182
+ } break;
183
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
184
+ LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
185
+ } break;
186
+ case LLAMA_KV_OVERRIDE_TYPE_STR: {
187
+ LLAMA_LOG_INFO("%s\n", ovrd->val_str);
188
+ } break;
189
+ default:
190
+ // Shouldn't be possible to end up here, but just in case...
191
+ throw std::runtime_error(
192
+ format("Unsupported attempt to override %s type for metadata key %s\n",
193
+ override_type_to_str(ovrd->tag), ovrd->key));
194
+ }
195
+ return true;
196
+ }
197
+ LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
198
+ __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
199
+ return false;
200
+ }
201
+
202
+ template<typename OT>
203
+ static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
204
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
205
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
206
+ target = ovrd->val_bool;
207
+ return true;
208
+ }
209
+ return false;
210
+ }
211
+
212
+ template<typename OT>
213
+ static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
214
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
215
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
216
+ target = ovrd->val_i64;
217
+ return true;
218
+ }
219
+ return false;
220
+ }
221
+
222
+ template<typename OT>
223
+ static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
224
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
225
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
226
+ target = ovrd->val_f64;
227
+ return true;
228
+ }
229
+ return false;
230
+ }
231
+
232
+ template<typename OT>
233
+ static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
234
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
235
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
236
+ target = ovrd->val_str;
237
+ return true;
238
+ }
239
+ return false;
240
+ }
241
+
242
+ static bool set(const lm_gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
243
+ if (try_override<T>(target, ovrd)) {
244
+ return true;
245
+ }
246
+ if (k < 0) { return false; }
247
+ target = get_kv(ctx, k);
248
+ return true;
249
+ }
250
+
251
+ static bool set(const lm_gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
252
+ return set(ctx, lm_gguf_find_key(ctx, key), target, ovrd);
253
+ }
254
+
255
+ static bool set(const lm_gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
256
+ return set(ctx, key.c_str(), target, ovrd);
257
+ }
258
+ };
259
+ }
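A quick illustration of how the GKV machinery above cooperates with metadata overrides may help: GKV<T>::set() consults try_override() first and only falls back to the GGUF key when no matching override is supplied. The following is a hedged sketch; the key name and helper name are illustrative, and the llama_model_kv_override fields (tag, a fixed-size key buffer, val_i64) are assumed from llama.h, matching what validate_override()/try_override() above reference.

    // Sketch only: build an integer override that GKV<uint32_t>::set() would apply
    // instead of reading the GGUF value for the same key.
    #include <cstring>

    static llama_model_kv_override make_int_override(const char * key, int64_t value) {
        llama_model_kv_override ovrd{};                      // zero-initialize tag/key/value
        ovrd.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
        std::strncpy(ovrd.key, key, sizeof(ovrd.key) - 1);   // key is a fixed-size char array
        ovrd.val_i64 = value;
        return ovrd;
    }

    // usage sketch: kv_overrides.insert({"some.key", make_int_override("some.key", 8)});
    // a later get_key("some.key", target) then logs "Using metadata override ..." and sets target = 8.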
260
+
261
+ template<typename T>
262
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
263
+ llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
264
+ const int kid = lm_gguf_find_key(meta.get(), key.c_str());
265
+
266
+ if (kid < 0) {
267
+ if (required) {
268
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
269
+ }
270
+ return false;
271
+ }
272
+
273
+ struct GGUFMeta::ArrayInfo arr_info =
274
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
275
+
276
+
277
+ result = arr_info.length;
278
+ return true;
279
+ }
280
+
281
+ template<typename T>
282
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
283
+ llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) {
284
+ return get_arr_n(llm_kv(kid), result, required);
285
+ }
286
+
287
+ template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required);
288
+
289
+ template<typename T>
290
+ bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
291
+ const int kid = lm_gguf_find_key(meta.get(), key.c_str());
292
+
293
+ if (kid < 0 || lm_gguf_get_kv_type(meta.get(), kid) != LM_GGUF_TYPE_ARRAY) {
294
+ if (required) {
295
+ throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
296
+ }
297
+ return false;
298
+ }
299
+
300
+ struct GGUFMeta::ArrayInfo arr_info =
301
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
302
+
303
+ switch (arr_info.gt) {
304
+ case LM_GGUF_TYPE_FLOAT32: LM_GGML_ASSERT((std::is_same<T, float>::value)); break;
305
+ case LM_GGUF_TYPE_INT32: LM_GGML_ASSERT(
306
+ (std::is_same<T, int32_t>::value) ||
307
+ (std::is_same<T, uint32_t>::value)); break;
308
+ default:
309
+ throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
310
+ }
311
+
312
+ result.resize(arr_info.length);
313
+ result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
314
+
315
+ return true;
316
+ }
317
+
318
+ template<typename T, size_t N_MAX>
319
+ bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
320
+ const int kid = lm_gguf_find_key(meta.get(), key.c_str());
321
+
322
+ if (kid < 0 || lm_gguf_get_kv_type(meta.get(), kid) != LM_GGUF_TYPE_ARRAY) {
323
+ if (required) {
324
+ throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
325
+ }
326
+ return false;
327
+ }
328
+
329
+ struct GGUFMeta::ArrayInfo arr_info =
330
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
331
+
332
+ switch (arr_info.gt) {
333
+ case LM_GGUF_TYPE_FLOAT32: LM_GGML_ASSERT((std::is_same<T, float>::value)); break;
334
+ case LM_GGUF_TYPE_INT32: LM_GGML_ASSERT(
335
+ (std::is_same<T, int32_t>::value) ||
336
+ (std::is_same<T, uint32_t>::value)); break;
337
+ default:
338
+ throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
339
+ }
340
+
341
+ if (arr_info.length > N_MAX) {
342
+ throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
343
+ }
344
+
345
+ std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
346
+
347
+ return true;
348
+ }
349
+
350
+ template<typename T>
351
+ bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) {
352
+ return get_arr(llm_kv(kid), result, required);
353
+ }
354
+
355
+ template<typename T>
356
+ bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
357
+ auto it = kv_overrides.find(key);
358
+
359
+ const struct llama_model_kv_override * override =
360
+ it != kv_overrides.end() ? &it->second : nullptr;
361
+
362
+ const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
363
+
364
+ if (required && !found) {
365
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
366
+ }
367
+
368
+ return found;
369
+ }
370
+
371
+ template<typename T>
372
+ bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) {
373
+ return get_key(llm_kv(kid), result, required);
374
+ }
375
+
376
+ template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required);
377
+ template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required);
378
+ template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required);
379
+ template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
380
+
381
+ template<>
382
+ bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) {
383
+ uint32_t tmp;
384
+ const bool found = get_key(kid, tmp, required);
385
+ if (found) {
386
+ result = (enum llama_pooling_type) tmp;
387
+ } else {
388
+ result = LLAMA_POOLING_TYPE_UNSPECIFIED;
389
+ }
390
+ return found;
391
+ }
392
+
393
+ // get array of n <= N_MAX elements, or a single element repeated n times
394
+ template<typename T, size_t N_MAX>
395
+ bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
396
+ const int kid = lm_gguf_find_key(meta.get(), key.c_str());
397
+
398
+ if (kid < 0) {
399
+ if (required) {
400
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
401
+ }
402
+ return false;
403
+ }
404
+
405
+ if (n > N_MAX) {
406
+ throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
407
+ }
408
+
409
+ if (lm_gguf_get_kv_type(meta.get(), kid) == LM_GGUF_TYPE_ARRAY) {
410
+ struct GGUFMeta::ArrayInfo arr_info =
411
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
412
+
413
+ if (n != arr_info.length) {
414
+ throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
415
+ }
416
+
417
+ return get_arr(key, result, required);
418
+ }
419
+
420
+ T value;
421
+
422
+ bool ok = get_key(key, value, required);
423
+ if (!ok) {
424
+ return false;
425
+ }
426
+
427
+ for (uint32_t i = 0; i < n; i++) {
428
+ result[i] = value;
429
+ }
430
+
431
+ return true;
432
+ }
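To make the scalar-or-array behaviour above concrete, a schematic sketch; the key name and calling code are illustrative and assume the template definitions above are visible to the caller.

    // Sketch only: get_key_or_arr() accepts either shape for the same key.
    //   some.arch.dims = 7              -> dims == {7, 7, 7, 7}
    //   some.arch.dims = [1, 2, 3, 4]   -> dims == {1, 2, 3, 4}
    //   an array of any other length, or n > N_MAX -> std::runtime_error
    static void example_key_or_arr(llama_model_loader & ml) {
        std::array<uint32_t, 4> dims{};
        ml.get_key_or_arr("some.arch.dims", dims, /*n=*/4, /*required=*/false);
    }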
433
+
434
+ template<typename T>
435
+ bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) {
436
+ return get_key_or_arr(llm_kv(kid), result, n, required);
437
+ }
438
+
439
+ // TODO: this is not very clever - figure out something better
440
+ template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
441
+ template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
442
+
443
+ llama_model_loader::llama_model_loader(
444
+ const std::string & fname,
445
+ std::vector<std::string> & splits,
446
+ bool use_mmap,
447
+ bool check_tensors,
448
+ const llama_model_kv_override * param_overrides_p,
449
+ const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
450
+ int trace = 0;
451
+ if (getenv("LLAMA_TRACE")) {
452
+ trace = atoi(getenv("LLAMA_TRACE"));
453
+ }
454
+
455
+ if (param_overrides_p != nullptr) {
456
+ for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
457
+ kv_overrides.insert({std::string(p->key), *p});
458
+ }
459
+ }
460
+
461
+ tensor_buft_overrides = param_tensor_buft_overrides_p;
462
+
463
+ // Load the main GGUF
464
+ struct lm_ggml_context * ctx = NULL;
465
+ struct lm_gguf_init_params params = {
466
+ /*.no_alloc = */ true,
467
+ /*.ctx = */ &ctx,
468
+ };
469
+
470
+ meta.reset(lm_gguf_init_from_file(fname.c_str(), params));
471
+ if (!meta) {
472
+ throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
473
+ }
474
+
475
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
476
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
477
+
478
+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
479
+ contexts.emplace_back(ctx);
480
+
481
+ // Save the tensor data offsets of the main file.
482
+ // For subsidiary files, the tensor data offsets in `meta` must not be used,
483
+ // so we build a unified tensor index for the weights.
484
+ for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur; cur = lm_ggml_get_next_tensor(ctx, cur)) {
485
+ std::string tensor_name = std::string(cur->name);
486
+ // make sure there are no duplicated tensor names
487
+ if (weights_map.find(tensor_name) != weights_map.end()) {
488
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", lm_ggml_get_name(cur)));
489
+ }
490
+ n_elements += lm_ggml_nelements(cur);
491
+ n_bytes += lm_ggml_nbytes(cur);
492
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
493
+ }
494
+ uint16_t n_split = 0;
495
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
496
+
497
+ // Load additional GGML contexts
498
+ if (n_split > 1) {
499
+ // make sure the main file is loaded first
500
+ uint16_t idx = 0;
501
+ const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
502
+ get_key(kv_split_no, idx);
503
+ if (idx != 0) {
504
+ throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
505
+ }
506
+
507
+ // generate list of splits if needed
508
+ if (splits.empty()) {
509
+ splits = llama_get_list_splits(fname, idx, n_split);
510
+ }
511
+
512
+ // in case the user gives a custom list of splits, check that it matches the expected number
513
+ if (n_split != (uint16_t)splits.size()) {
514
+ throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
515
+ }
516
+
517
+ if (trace > 0) {
518
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
519
+ }
520
+
521
+ // load other splits
522
+ for (idx = 1; idx < n_split; idx++) {
523
+ const char * fname_split = splits[idx].c_str();
524
+
525
+ struct lm_gguf_init_params split_params = {
526
+ /*.no_alloc = */ true,
527
+ /*.ctx = */ &ctx,
528
+ };
529
+ lm_gguf_context_ptr ctx_gguf { lm_gguf_init_from_file(fname_split, split_params) };
530
+ if (!ctx_gguf) {
531
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
532
+ }
533
+
534
+ // check idx
535
+ {
536
+ const int kid = lm_gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
537
+ if (kid < 0) {
538
+ throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
539
+ }
540
+ int idx_gguf = lm_gguf_get_val_u16(ctx_gguf.get(), kid);
541
+ if (idx_gguf != idx) {
542
+ throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
543
+ }
544
+ }
545
+
546
+ files.emplace_back(new llama_file(fname_split, "rb"));
547
+ contexts.emplace_back(ctx);
548
+
549
+ // Save the tensor data offset info of this shard.
550
+ for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur; cur = lm_ggml_get_next_tensor(ctx, cur)) {
551
+ std::string tensor_name = std::string(cur->name);
552
+ // make sure there are no duplicated tensor names
553
+ if (weights_map.find(tensor_name) != weights_map.end()) {
554
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", lm_ggml_get_name(cur)));
555
+ }
556
+ n_elements += lm_ggml_nelements(cur);
557
+ n_bytes += lm_ggml_nbytes(cur);
558
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
559
+ }
560
+ }
561
+
562
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
563
+
564
+ // sanity check
565
+ {
566
+ const int n_tensors_loaded = (int) weights_map.size();
567
+ if (n_tensors != n_tensors_loaded) {
568
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
569
+ }
570
+ }
571
+
572
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
573
+ }
574
+
575
+ n_kv = lm_gguf_get_n_kv(meta.get());
576
+ n_tensors = weights_map.size();
577
+
578
+ fver = (enum llama_fver) lm_gguf_get_version(meta.get());
579
+
580
+ LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
581
+ __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
582
+
583
+ // determine the file type based on the number of tensors of each quantization type, and print the metadata
584
+ // TODO: make optional
585
+ {
586
+ std::map<enum lm_ggml_type, uint32_t> n_type;
587
+
588
+ uint32_t n_type_max = 0;
589
+ enum lm_ggml_type type_max = LM_GGML_TYPE_F32;
590
+
591
+ for (const auto & it : weights_map) {
592
+ const llama_tensor_weight & w = it.second;
593
+ const lm_ggml_tensor * tensor = w.tensor;
594
+
595
+ enum lm_ggml_type type = tensor->type;
596
+
597
+ n_type[type]++;
598
+
599
+ if (n_type_max < n_type[type]) {
600
+ n_type_max = n_type[type];
601
+ type_max = type;
602
+ }
603
+
604
+ if (trace > 0) {
605
+ const uint16_t sid = w.idx;
606
+ LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__,
607
+ sid, lm_ggml_get_name(tensor), lm_ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(),
608
+ lm_ggml_nbytes(tensor)/1024.0f/1024.0f);
609
+ }
610
+ }
611
+
612
+ switch (type_max) {
613
+ case LM_GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
614
+ case LM_GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
615
+ case LM_GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
616
+ case LM_GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
617
+ case LM_GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
618
+ case LM_GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
619
+ case LM_GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
620
+ case LM_GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
621
+ case LM_GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
622
+ case LM_GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
623
+ case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
624
+ case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
625
+ case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
626
+ case LM_GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
627
+ case LM_GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
628
+ case LM_GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
629
+ case LM_GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
630
+ case LM_GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
631
+ case LM_GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
632
+ case LM_GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
633
+ case LM_GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
634
+ case LM_GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
635
+ case LM_GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
636
+ case LM_GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
637
+ default:
638
+ {
639
+ LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max));
640
+ ftype = LLAMA_FTYPE_ALL_F32;
641
+ } break;
642
+ }
643
+
644
+ // this is a way to mark that we have "guessed" the file type
645
+ ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
646
+
647
+ {
648
+ uint32_t ftype_val = 0;
649
+ if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) {
650
+ ftype = (llama_ftype) ftype_val;
651
+ }
652
+ }
653
+
654
+ LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
655
+
656
+ for (int i = 0; i < n_kv; i++) {
657
+ const char * name = lm_gguf_get_key(meta.get(), i);
658
+ const enum lm_gguf_type type = lm_gguf_get_kv_type(meta.get(), i);
659
+ const std::string type_name =
660
+ type == LM_GGUF_TYPE_ARRAY
661
+ ? format("%s[%s,%zu]", lm_gguf_type_name(type), lm_gguf_type_name(lm_gguf_get_arr_type(meta.get(), i)), lm_gguf_get_arr_n(meta.get(), i))
662
+ : lm_gguf_type_name(type);
663
+
664
+ std::string value = lm_gguf_kv_to_str(meta.get(), i);
665
+ const size_t MAX_VALUE_LEN = 40;
666
+ if (value.size() > MAX_VALUE_LEN) {
667
+ value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
668
+ }
669
+ replace_all(value, "\n", "\\n");
670
+
671
+ LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
672
+ }
673
+
674
+ // print type counts
675
+ for (auto & kv : n_type) {
676
+ if (kv.second == 0) {
677
+ continue;
678
+ }
679
+
680
+ LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, lm_ggml_type_name(kv.first), kv.second);
681
+ }
682
+ }
683
+
684
+ if (!llama_mmap::SUPPORTED) {
685
+ LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
686
+ use_mmap = false;
687
+ }
688
+
689
+ this->use_mmap = use_mmap;
690
+ this->check_tensors = check_tensors;
691
+ }
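One detail of the constructor above that is easy to miss: when general.file_type is absent, the most frequent tensor type determines the ftype and the result is tagged with LLAMA_FTYPE_GUESSED, which llama_model_ftype_name() strips again when reporting. A minimal sketch of that round-trip; the function name is illustrative.

    // Sketch only: the "guessed" marker set in the constructor and how it prints.
    static std::string example_guessed_ftype() {
        const llama_ftype guessed = (llama_ftype) (LLAMA_FTYPE_MOSTLY_Q4_K_M | LLAMA_FTYPE_GUESSED);
        // llama_model_ftype_name() (defined earlier in this file) detects the flag,
        // strips it, and appends " (guessed)" -> "Q4_K - Medium (guessed)"
        return llama_model_ftype_name(guessed);
    }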
692
+
693
+ std::string llama_model_loader::get_arch_name() const {
694
+ return arch_name;
695
+ }
696
+
697
+ enum llm_arch llama_model_loader::get_arch() const {
698
+ return llm_kv.arch;
699
+ }
700
+
701
+ const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const {
702
+ auto pos = weights_map.find(name);
703
+ if (pos != weights_map.end()) {
704
+ return &pos->second;
705
+ }
706
+
707
+ return nullptr;
708
+ }
709
+
710
+ const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const {
711
+ const llama_tensor_weight * weight = get_weight(name);
712
+ if (!weight) {
713
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
714
+ }
715
+ return *weight;
716
+ }
717
+
718
+ struct lm_ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const {
719
+ const auto * weight = get_weight(name);
720
+ if (!weight) {
721
+ return nullptr;
722
+ }
723
+ return weight->tensor;
724
+ }
725
+
726
+ struct lm_ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const {
727
+ struct lm_ggml_tensor * tensor = get_tensor_meta(name.c_str());
728
+ if (!tensor) {
729
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
730
+ }
731
+ return tensor;
732
+ }
733
+
734
+ const struct lm_ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
735
+ const struct lm_ggml_tensor * cur = get_tensor_meta(name.c_str());
736
+
737
+ if (cur == NULL) {
738
+ if (!required) {
739
+ return NULL;
740
+ }
741
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
742
+ }
743
+
744
+ {
745
+ bool is_ok = true;
746
+ for (size_t i = 0; i < LM_GGML_MAX_DIMS; ++i) {
747
+ if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
748
+ is_ok = false;
749
+ break;
750
+ }
751
+ }
752
+ if (!is_ok) {
753
+ throw std::runtime_error(
754
+ format("%s: tensor '%s' has wrong shape; expected %s, got %s",
755
+ __func__, name.c_str(),
756
+ llama_format_tensor_shape(ne).c_str(),
757
+ llama_format_tensor_shape(cur).c_str()));
758
+ }
759
+ }
760
+
761
+ return cur;
762
+ }
763
+
764
+ struct lm_ggml_tensor * llama_model_loader::create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
765
+ const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
766
+
767
+ if (cur == NULL) {
768
+ return NULL;
769
+ }
770
+
771
+ bool duplicated = flags & TENSOR_DUPLICATED;
772
+
773
+ struct lm_ggml_tensor * tensor = lm_ggml_dup_tensor(ctx, cur);
774
+ lm_ggml_set_name(tensor, lm_ggml_get_name(cur));
775
+
776
+ if (duplicated) {
777
+ size_data += lm_ggml_nbytes(cur);
778
+ } else {
779
+ n_created++;
780
+ }
781
+
782
+ return tensor;
783
+
784
+ }
785
+
786
+ struct lm_ggml_tensor * llama_model_loader::create_tensor_as_view(struct lm_ggml_context * ctx, struct lm_ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
787
+ const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, required);
788
+
789
+ if (cur == NULL) {
790
+ return NULL;
791
+ }
792
+
793
+ if (cur->type != base->type) {
794
+ throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), lm_ggml_type_name(base->type), lm_ggml_type_name(cur->type)));
795
+ }
796
+
797
+ std::array<int64_t, LM_GGML_MAX_DIMS> dims;
798
+ for (size_t i = 0; i < LM_GGML_MAX_DIMS; ++i) {
799
+ dims[i] = i < ne.size() ? ne.begin()[i] : 1;
800
+ }
801
+
802
+ struct lm_ggml_tensor * tensor = lm_ggml_view_4d(ctx, base,
803
+ dims[0], dims[1], dims[2], dims[3],
804
+ cur->nb[1], cur->nb[2], cur->nb[3],
805
+ offset);
806
+
807
+ lm_ggml_set_name(tensor, name.c_str());
808
+
809
+ n_created++;
810
+
811
+ return tensor;
812
+ }
813
+
814
+ void llama_model_loader::done_getting_tensors() const {
815
+ if (n_created != n_tensors) {
816
+ throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
817
+ }
818
+ }
819
+
820
+ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
821
+ if (use_mmap) {
822
+ mappings.reserve(files.size());
823
+ mmaps_used.reserve(files.size());
824
+ for (const auto & file : files) {
825
+ auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU));
826
+ auto * is_numa_fn = (decltype(lm_ggml_is_numa) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_is_numa");
827
+ std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
828
+ mmaps_used.emplace_back(mapping->size(), 0);
829
+ if (mlock_mmaps) {
830
+ std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
831
+ mlock_mmap->init(mapping->addr());
832
+ mlock_mmaps->emplace_back(std::move(mlock_mmap));
833
+ }
834
+ mappings.emplace_back(std::move(mapping));
835
+ }
836
+ }
837
+
838
+ // compute the total size of all tensors for progress reporting
839
+ for (const auto & it : weights_map) {
840
+ size_data += lm_ggml_nbytes(it.second.tensor);
841
+ }
842
+ }
843
+
844
+ void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, lm_ggml_context * ctx) const {
845
+ LM_GGML_ASSERT(!mappings.empty());
846
+ const auto & mapping = mappings.at(idx);
847
+
848
+ *first = mapping->size();
849
+ *last = 0;
850
+ *addr = mapping->addr();
851
+ for (lm_ggml_tensor * tensor = lm_ggml_get_first_tensor(ctx); tensor; tensor = lm_ggml_get_next_tensor(ctx, tensor)) {
852
+ const auto * weight = get_weight(lm_ggml_get_name(tensor));
853
+ if (!weight || weight->idx != idx) {
854
+ continue;
855
+ }
856
+ *first = std::min(*first, weight->offs);
857
+ *last = std::max(*last, weight->offs + lm_ggml_nbytes(tensor));
858
+ }
859
+ }
860
+
861
+ void llama_model_loader::load_data_for(struct lm_ggml_tensor * cur) const {
862
+ const auto & w = require_weight(lm_ggml_get_name(cur));
863
+
864
+ if (use_mmap) {
865
+ const auto & mapping = mappings.at(w.idx);
866
+ if (cur->data == nullptr) {
867
+ cur->data = (uint8_t *)mapping->addr() + w.offs;
868
+ } else {
869
+ memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, lm_ggml_nbytes(cur));
870
+ }
871
+ } else {
872
+ LM_GGML_ASSERT(cur->data != nullptr);
873
+ LM_GGML_ASSERT(w.idx < files.size());
874
+ const auto & file = files.at(w.idx);
875
+ file->seek(w.offs, SEEK_SET);
876
+ file->read_raw(cur->data, lm_ggml_nbytes(cur));
877
+ }
878
+
879
+ if (check_tensors && !lm_ggml_validate_row_data(cur->type, cur->data, lm_ggml_nbytes(cur))) {
880
+ throw std::runtime_error(format("tensor '%s' has invalid data", lm_ggml_get_name(cur)));
881
+ }
882
+ }
883
+
884
+ bool llama_model_loader::load_all_data(
885
+ struct lm_ggml_context * ctx,
886
+ llama_buf_map & bufs,
887
+ llama_mlocks * lmlocks,
888
+ llama_progress_callback progress_callback,
889
+ void * progress_callback_user_data) {
890
+ LM_GGML_ASSERT(size_data != 0 && "call init_mappings() first");
891
+
892
+ std::vector<no_init<uint8_t>> read_buf;
893
+ std::vector<std::future<std::pair<lm_ggml_tensor *, bool>>> validation_result;
894
+
895
+ // 4 staging buffers for async uploads, each 1 MiB in size, seem to be a good default for single NVMe drives.
896
+ // NVMe raid configurations might require more / larger buffers.
897
+ constexpr size_t n_buffers = 4;
898
+ constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
899
+
900
+ std::vector<lm_ggml_backend_buffer_t> host_buffers;
901
+ std::vector<lm_ggml_backend_event_t> events;
902
+ std::vector<void *> host_ptrs;
903
+ size_t buffer_idx = 0; // buffer to use for async loads
904
+ lm_ggml_backend_t upload_backend = [&](const char * func) -> lm_ggml_backend_t {
905
+ if (use_mmap || check_tensors) {
906
+ return nullptr;
907
+ }
908
+ // When not using mmapped I/O, use async uploads from pinned memory to GPU memory.
909
+ // First determine if the backend supports the necessary features for async uploads.
910
+ auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
911
+ if (!buf) {
912
+ LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func);
913
+ return nullptr;
914
+ }
915
+
916
+ auto * buft = lm_ggml_backend_buffer_get_type(buf);
917
+ auto * dev = lm_ggml_backend_buft_get_device(buft);
918
+ if (!dev) {
919
+ LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func,
920
+ lm_ggml_backend_buft_name(buft));
921
+ return nullptr;
922
+ }
923
+
924
+ if (buft != lm_ggml_backend_dev_buffer_type(dev)) {
925
+ LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func,
926
+ lm_ggml_backend_buft_name(buft), lm_ggml_backend_dev_name(dev));
927
+ return nullptr;
928
+ }
929
+
930
+ lm_ggml_backend_dev_props props;
931
+ lm_ggml_backend_dev_get_props(dev, &props);
932
+ if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
933
+ LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func,
934
+ lm_ggml_backend_dev_name(dev));
935
+ return nullptr;
936
+ }
937
+
938
+ auto * host_buft = lm_ggml_backend_dev_host_buffer_type(dev);
939
+ if (!host_buft) {
940
+ LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func,
941
+ lm_ggml_backend_dev_name(dev));
942
+ return nullptr;
943
+ }
944
+
945
+ // If the backend is supported, create pinned memory buffers and events for synchronisation.
946
+ for (size_t idx = 0; idx < n_buffers; ++idx) {
947
+ auto * buf = lm_ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
948
+ if (!buf) {
949
+ LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
950
+ lm_ggml_backend_dev_name(dev));
951
+ return nullptr;
952
+ }
953
+
954
+ host_buffers.emplace_back(buf);
955
+ host_ptrs.emplace_back(lm_ggml_backend_buffer_get_base(buf));
956
+
957
+ auto * event = lm_ggml_backend_event_new(dev);
958
+ if (!event) {
959
+ LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func,
960
+ lm_ggml_backend_dev_name(dev));
961
+ return nullptr;
962
+ }
963
+
964
+ events.emplace_back(event);
965
+ }
966
+
967
+ lm_ggml_backend_t backend = lm_ggml_backend_dev_init(dev, nullptr);
968
+ if (!backend) {
969
+ LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func,
970
+ lm_ggml_backend_dev_name(dev));
971
+ return nullptr;
972
+ }
973
+
974
+ return backend;
975
+ }(__func__);
976
+
977
+ if (upload_backend) {
978
+ LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
979
+ lm_ggml_backend_dev_name(lm_ggml_backend_get_device(upload_backend)),
980
+ lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(bufs.at(0))),
981
+ lm_ggml_backend_name(upload_backend));
982
+ }
983
+
984
+ for (struct lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur != NULL; cur = lm_ggml_get_next_tensor(ctx, cur)) {
985
+ const auto * weight = get_weight(lm_ggml_get_name(cur));
986
+ if (weight == nullptr) {
987
+ // this can happen with split experts models
988
+ continue;
989
+ }
990
+
991
+ if (progress_callback) {
992
+ if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
993
+ return false;
994
+ }
995
+ }
996
+
997
+ size_t n_size = lm_ggml_nbytes(cur);
998
+
999
+ if (use_mmap) {
1000
+ const auto & mapping = mappings.at(weight->idx);
1001
+ lm_ggml_backend_buffer_t buf_mmap = nullptr;
1002
+ if (bufs.count(weight->idx)) {
1003
+ buf_mmap = bufs.at(weight->idx);
1004
+ }
1005
+ uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
1006
+
1007
+ if (check_tensors) {
1008
+ validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
1009
+ return std::make_pair(cur, lm_ggml_validate_row_data(cur->type, data, n_size));
1010
+ }));
1011
+ }
1012
+
1013
+ LM_GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
1014
+ if (buf_mmap && cur->data == nullptr) {
1015
+ lm_ggml_backend_tensor_alloc(buf_mmap, cur, data);
1016
+ if (lmlocks) {
1017
+ const auto & lmlock = lmlocks->at(weight->idx);
1018
+ lmlock->grow_to(weight->offs + n_size);
1019
+ }
1020
+
1021
+ auto & mmap_used = mmaps_used[weight->idx];
1022
+ mmap_used.first = std::min(mmap_used.first, weight->offs);
1023
+ mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
1024
+ } else {
1025
+ lm_ggml_backend_tensor_set(cur, data, 0, n_size);
1026
+ }
1027
+ } else {
1028
+ const auto & file = files.at(weight->idx);
1029
+ if (lm_ggml_backend_buffer_is_host(cur->buffer)) {
1030
+ file->seek(weight->offs, SEEK_SET);
1031
+ file->read_raw(cur->data, n_size);
1032
+ if (check_tensors) {
1033
+ validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
1034
+ return std::make_pair(cur, lm_ggml_validate_row_data(cur->type, cur->data, n_size));
1035
+ }));
1036
+ }
1037
+ } else {
1038
+ // If upload_backend is valid, load the tensor in chunks into pinned memory and upload the buffers asynchronously to the GPU.
1039
+ if (upload_backend) {
1040
+ file->seek(weight->offs, SEEK_SET);
1041
+
1042
+ size_t bytes_read = 0;
1043
+
1044
+ while (bytes_read < n_size) {
1045
+ size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
1046
+
1047
+ lm_ggml_backend_event_synchronize(events[buffer_idx]);
1048
+ file->read_raw(host_ptrs[buffer_idx], read_iteration);
1049
+ lm_ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
1050
+ lm_ggml_backend_event_record(events[buffer_idx], upload_backend);
1051
+
1052
+ bytes_read += read_iteration;
1053
+ ++buffer_idx;
1054
+ buffer_idx %= n_buffers;
1055
+ }
1056
+ } else {
1057
+ read_buf.resize(n_size);
1058
+ file->seek(weight->offs, SEEK_SET);
1059
+ file->read_raw(read_buf.data(), n_size);
1060
+ lm_ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
1061
+ if (check_tensors && !lm_ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
1062
+ throw std::runtime_error(format("tensor '%s' has invalid data", lm_ggml_get_name(cur)));
1063
+ }
1064
+ }
1065
+ }
1066
+ }
1067
+
1068
+ size_done += n_size;
1069
+ }
1070
+
1071
+ // free temporary resources used for async uploads
1072
+ for (auto * event : events) {
1073
+ lm_ggml_backend_event_synchronize(event);
1074
+ lm_ggml_backend_event_free(event);
1075
+ }
1076
+ for (auto * buf : host_buffers) {
1077
+ lm_ggml_backend_buffer_free(buf);
1078
+ }
1079
+ lm_ggml_backend_free(upload_backend);
1080
+
1081
+ // check validation results
1082
+ bool validation_failed = false;
1083
+ for (auto & future : validation_result) {
1084
+ auto result = future.get();
1085
+ if (!result.second) {
1086
+ LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, lm_ggml_get_name(result.first));
1087
+ validation_failed = true;
1088
+ }
1089
+ }
1090
+ if (validation_failed) {
1091
+ throw std::runtime_error("found tensors with invalid data");
1092
+ }
1093
+
1094
+ // check if this is the last call and do final cleanup
1095
+ if (size_done >= size_data) {
1096
+ // unmap offloaded tensors and metadata
1097
+ if (use_mmap) {
1098
+ for (uint32_t idx = 0; idx < mappings.size(); idx++) {
1099
+ const auto & mmap_used = mmaps_used.at(idx);
1100
+ auto & mapping = mappings.at(idx);
1101
+ mapping->unmap_fragment(0, mmap_used.first);
1102
+ if (mmap_used.second != 0) {
1103
+ mapping->unmap_fragment(mmap_used.second, mapping->size());
1104
+ }
1105
+ }
1106
+ }
1107
+ if (progress_callback) {
1108
+ // Even though the model is done loading, we still honor
1109
+ // cancellation since we need to free allocations.
1110
+ return progress_callback(1.0f, progress_callback_user_data);
1111
+ }
1112
+ }
1113
+
1114
+ return true;
1115
+ }
1116
+
1117
+ std::string llama_model_loader::ftype_name() const {
1118
+ return llama_model_ftype_name(ftype);
1119
+ }
1120
+
1121
+ void llama_model_loader::print_info() const {
1122
+ LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
1123
+ LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
1124
+ if (n_bytes < GiB) {
1125
+ LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
1126
+ } else {
1127
+ LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
1128
+ }
1129
+ }
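Finally, a hedged usage sketch of the progress/cancellation contract enforced by load_all_data() above: the callback receives the fraction of bytes loaded, and returning false aborts loading, including on the final call made once all tensors have been read. The callback type matches llama_progress_callback from llama.h; the struct and function names below are illustrative.

    // Sketch only: a progress callback that logs and can request cancellation.
    #include <cstdio>

    struct example_load_state {
        bool cancel = false;
    };

    static bool example_progress_cb(float progress, void * user_data) {
        auto * st = (example_load_state *) user_data;
        std::fprintf(stderr, "loading: %5.1f%%\r", progress * 100.0f);
        // returning false makes load_all_data() stop and return false,
        // even on the final call where progress == 1.0f
        return !st->cancel;
    }

    // usage sketch: ml.load_all_data(ctx, bufs, nullptr, example_progress_cb, &state);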