cui-llama.rn 1.4.2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +93 -114
  2. package/android/src/main/CMakeLists.txt +5 -0
  3. package/android/src/main/build-arm64/CMakeCache.txt +429 -0
  4. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +81 -0
  5. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +101 -0
  6. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  7. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  8. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +15 -0
  9. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +904 -0
  10. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  11. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +919 -0
  12. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  13. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +431 -0
  14. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  15. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +165 -0
  16. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +297 -0
  17. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +1 -0
  18. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +1 -0
  19. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +1 -0
  20. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +1 -0
  21. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +1 -0
  22. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +1 -0
  23. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +1 -0
  24. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +8 -0
  25. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +1 -0
  26. package/android/src/main/build-arm64/CMakeFiles/progress.marks +1 -0
  27. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  28. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +58 -0
  29. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  30. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +756 -0
  31. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  32. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +709 -0
  33. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  34. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +714 -0
  35. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  36. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +62 -0
  37. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  38. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +708 -0
  39. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  40. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +113 -0
  41. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  42. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +713 -0
  43. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  44. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +763 -0
  45. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  46. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +61 -0
  47. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  48. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +707 -0
  49. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  50. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +104 -0
  51. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  52. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +714 -0
  53. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  54. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +723 -0
  55. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +62 -0
  56. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +722 -0
  57. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +89 -0
  58. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +2 -0
  59. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +2 -0
  60. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +2 -0
  61. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +17 -0
  62. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +41 -0
  63. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +62 -0
  64. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +722 -0
  65. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +89 -0
  66. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +2 -0
  67. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +2 -0
  68. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +2 -0
  69. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +17 -0
  70. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +41 -0
  71. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +62 -0
  72. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +722 -0
  73. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +89 -0
  74. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +2 -0
  75. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +2 -0
  76. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +2 -0
  77. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +17 -0
  78. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +41 -0
  79. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +62 -0
  80. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +722 -0
  81. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +89 -0
  82. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +2 -0
  83. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +2 -0
  84. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +2 -0
  85. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +17 -0
  86. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +41 -0
  87. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +62 -0
  88. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +722 -0
  89. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +89 -0
  90. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +2 -0
  91. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +2 -0
  92. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +2 -0
  93. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +17 -0
  94. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +41 -0
  95. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +62 -0
  96. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +722 -0
  97. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +89 -0
  98. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +2 -0
  99. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +2 -0
  100. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +2 -0
  101. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +17 -0
  102. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +41 -0
  103. package/android/src/main/build-arm64/Makefile +1862 -0
  104. package/android/src/main/build-arm64/cmake_install.cmake +66 -0
  105. package/android/src/main/java/com/rnllama/LlamaContext.java +92 -18
  106. package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
  107. package/android/src/main/jni-utils.h +6 -0
  108. package/android/src/main/jni.cpp +287 -31
  109. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  110. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  111. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  112. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  113. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  114. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  115. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  116. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  117. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
  118. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
  119. package/cpp/chat-template.hpp +529 -0
  120. package/cpp/chat.cpp +1085 -0
  121. package/cpp/chat.hpp +55 -0
  122. package/cpp/common.cpp +159 -36
  123. package/cpp/common.h +64 -19
  124. package/cpp/ggml-alloc.c +1 -13
  125. package/cpp/ggml-common.h +0 -2
  126. package/cpp/ggml-cpu-impl.h +6 -12
  127. package/cpp/ggml-cpu-quants.c +937 -340
  128. package/cpp/ggml-cpu.c +207 -113
  129. package/cpp/ggml-cpu.cpp +4 -6
  130. package/cpp/ggml-cpu.h +1 -1
  131. package/cpp/ggml-metal.h +66 -66
  132. package/cpp/ggml-metal.m +141 -23
  133. package/cpp/ggml.c +24 -14
  134. package/cpp/ggml.h +2 -2
  135. package/cpp/json-schema-to-grammar.cpp +46 -66
  136. package/cpp/json-schema-to-grammar.h +15 -1
  137. package/cpp/llama-arch.cpp +7 -2
  138. package/cpp/llama-arch.h +3 -1
  139. package/cpp/llama-chat.cpp +10 -1
  140. package/cpp/llama-chat.h +1 -0
  141. package/cpp/llama-grammar.cpp +86 -6
  142. package/cpp/llama-grammar.h +22 -1
  143. package/cpp/llama-impl.h +6 -6
  144. package/cpp/llama-kv-cache.h +1 -1
  145. package/cpp/llama-mmap.h +1 -0
  146. package/cpp/llama-model-loader.cpp +1 -1
  147. package/cpp/llama-model.cpp +32 -6
  148. package/cpp/llama-sampling.cpp +178 -61
  149. package/cpp/llama-vocab.cpp +8 -3
  150. package/cpp/llama.cpp +188 -128
  151. package/cpp/llama.h +27 -10
  152. package/cpp/log.cpp +32 -10
  153. package/cpp/log.h +12 -1
  154. package/cpp/minja.hpp +2883 -0
  155. package/cpp/rn-llama.cpp +82 -5
  156. package/cpp/rn-llama.h +16 -1
  157. package/cpp/sampling.cpp +68 -41
  158. package/cpp/sampling.h +3 -0
  159. package/cpp/sgemm.cpp +9 -8
  160. package/cpp/unicode.cpp +9 -2
  161. package/ios/CMakeLists.txt +6 -0
  162. package/ios/RNLlama.h +0 -8
  163. package/ios/RNLlama.mm +27 -3
  164. package/ios/RNLlamaContext.h +10 -1
  165. package/ios/RNLlamaContext.mm +269 -57
  166. package/jest/mock.js +21 -2
  167. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  168. package/lib/commonjs/grammar.js +3 -0
  169. package/lib/commonjs/grammar.js.map +1 -1
  170. package/lib/commonjs/index.js +87 -13
  171. package/lib/commonjs/index.js.map +1 -1
  172. package/lib/module/NativeRNLlama.js.map +1 -1
  173. package/lib/module/grammar.js +3 -0
  174. package/lib/module/grammar.js.map +1 -1
  175. package/lib/module/index.js +86 -13
  176. package/lib/module/index.js.map +1 -1
  177. package/lib/typescript/NativeRNLlama.d.ts +107 -2
  178. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  179. package/lib/typescript/grammar.d.ts.map +1 -1
  180. package/lib/typescript/index.d.ts +32 -7
  181. package/lib/typescript/index.d.ts.map +1 -1
  182. package/llama-rn.podspec +1 -1
  183. package/package.json +2 -2
  184. package/src/NativeRNLlama.ts +115 -3
  185. package/src/grammar.ts +3 -0
  186. package/src/index.ts +138 -21
package/cpp/json-schema-to-grammar.cpp CHANGED
@@ -1,4 +1,6 @@
  #include "json-schema-to-grammar.h"
+ #include "common.h"
+
  #include <algorithm>
  #include <fstream>
  #include <map>
@@ -11,11 +13,6 @@

  using json = nlohmann::ordered_json;

- template <typename Iterator>
- static std::string join(Iterator begin, Iterator end, const std::string & separator);
-
- static std::string repeat(const std::string & str, size_t n);
-
  static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
  auto has_max = max_items != std::numeric_limits<int>::max();

@@ -128,8 +125,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
  if (sub_len > 0) {
  auto from_sub = from.substr(i + 1);
  auto to_sub = to.substr(i + 1);
- auto sub_zeros = repeat("0", sub_len);
- auto sub_nines = repeat("9", sub_len);
+ auto sub_zeros = string_repeat("0", sub_len);
+ auto sub_nines = string_repeat("9", sub_len);

  auto to_reached = false;
  out << "(";
@@ -188,8 +185,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
  auto max_digits = max_s.length();

  for (auto digits = min_digits; digits < max_digits; digits++) {
- uniform_range(min_s, repeat("9", digits));
- min_s = "1" + repeat("0", digits);
+ uniform_range(min_s, string_repeat("9", digits));
+ min_s = "1" + string_repeat("0", digits);
  out << " | ";
  }
  uniform_range(min_s, max_s);
@@ -318,49 +315,6 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
  std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
  std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

- template <typename Iterator>
- std::string join(Iterator begin, Iterator end, const std::string & separator) {
- std::ostringstream result;
- if (begin != end) {
- result << *begin;
- for (Iterator it = begin + 1; it != end; ++it) {
- result << separator << *it;
- }
- }
- return result.str();
- }
-
- static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
- std::vector<std::string> tokens;
- size_t start = 0;
- size_t end = str.find(delimiter);
-
- while (end != std::string::npos) {
- tokens.push_back(str.substr(start, end - start));
- start = end + delimiter.length();
- end = str.find(delimiter, start);
- }
-
- tokens.push_back(str.substr(start));
-
- return tokens;
- }
-
- static std::string repeat(const std::string & str, size_t n) {
- if (n == 0) {
- return "";
- }
-
- std::string result;
- result.reserve(str.length() * n);
-
- for (size_t i = 0; i < n; ++i) {
- result += str;
- }
-
- return result;
- }
-
  static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
  std::smatch match;
  std::string result;
@@ -389,6 +343,7 @@ static std::string format_literal(const std::string & literal) {

  class SchemaConverter {
  private:
+ friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
  std::function<json(const std::string &)> _fetch_json;
  bool _dotall;
  std::map<std::string, std::string> _rules;
@@ -418,7 +373,7 @@ private:
  for (size_t i = 0; i < alt_schemas.size(); i++) {
  rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
  }
- return join(rules.begin(), rules.end(), " | ");
+ return string_join(rules, " | ");
  }

  std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -481,7 +436,7 @@ private:
  for (const auto & item : ret) {
  results.push_back(to_rule(item));
  }
- return std::make_pair(join(results.begin(), results.end(), " "), false);
+ return std::make_pair(string_join(results, " "), false);
  };

  while (i < length) {
@@ -539,7 +494,7 @@ private:
  }
  curly_brackets += '}';
  i++;
- auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+ auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
  int min_times = 0;
  int max_times = std::numeric_limits<int>::max();
  try {
@@ -809,10 +764,11 @@ private:
  public:
  SchemaConverter(
  const std::function<json(const std::string &)> & fetch_json,
- bool dotall)
+ bool dotall,
+ bool compact_spaces)
  : _fetch_json(fetch_json), _dotall(dotall)
  {
- _rules["space"] = SPACE_RULE;
+ _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
  }

  void resolve_refs(json & schema, const std::string & url) {
@@ -854,7 +810,7 @@ public:
  return;
  }
  std::string pointer = ref.substr(ref.find('#') + 1);
- std::vector<std::string> tokens = split(pointer, "/");
+ std::vector<std::string> tokens = string_split(pointer, "/");
  for (size_t i = 1; i < tokens.size(); ++i) {
  std::string sel = tokens[i];
  if (target.is_null() || !target.contains(sel)) {
@@ -905,7 +861,7 @@ public:
  for (const auto & v : schema["enum"]) {
  enum_values.push_back(_generate_constant_rule(v));
  }
- return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
+ return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
  } else if ((schema_type.is_null() || schema_type == "object")
  && (schema.contains("properties") ||
  (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -1019,10 +975,10 @@ public:

  void check_errors() {
  if (!_errors.empty()) {
- throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
+ throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
  }
  if (!_warnings.empty()) {
- fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
+ fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
  }
  }

@@ -1035,11 +991,35 @@ public:
  }
  };

- std::string json_schema_to_grammar(const json & schema) {
- SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
- auto copy = schema;
- converter.resolve_refs(copy, "input");
- converter.visit(copy, "");
+ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
+ #ifdef LLAMA_USE_LLGUIDANCE
+ if (!force_gbnf) {
+ return "%llguidance {}\nstart: %json " + schema.dump();
+ }
+ #else
+ (void)force_gbnf;
+ #endif // LLAMA_USE_LLGUIDANCE
+ return build_grammar([&](const common_grammar_builder & callbacks) {
+ auto copy = schema;
+ callbacks.resolve_refs(copy);
+ callbacks.add_schema("", copy);
+ });
+ }
+
+ std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
+ SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+ common_grammar_builder builder {
+ /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
+ return converter._add_rule(name, rule);
+ },
+ /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
+ return converter.visit(schema, name == "root" ? "" : name);
+ },
+ /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
+ converter.resolve_refs(schema, "");
+ }
+ };
+ cb(builder);
  converter.check_errors();
  return converter.format_grammar();
  }
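Note on the new entry point: json_schema_to_grammar now takes a force_gbnf flag and routes through build_grammar (or, when built with LLAMA_USE_LLGUIDANCE, returns an llguidance stub). A minimal sketch of calling it; the schema literal is made up for illustration and is not part of this package.

// Sketch only: convert a JSON schema to a GBNF grammar string using the
// updated signature above. The schema content is illustrative.
#include "json-schema-to-grammar.h"
#include <cstdio>

int main() {
    auto schema = nlohmann::ordered_json::parse(R"({
        "type": "object",
        "properties": { "name": { "type": "string" } },
        "required": ["name"]
    })");
    // force_gbnf = true guarantees a GBNF grammar even when the build
    // enables LLAMA_USE_LLGUIDANCE.
    std::string grammar = json_schema_to_grammar(schema, /* force_gbnf = */ true);
    printf("%s\n", grammar.c_str());
    return 0;
}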
package/cpp/json-schema-to-grammar.h CHANGED
@@ -5,4 +5,18 @@
  #define JSON_ASSERT LM_GGML_ASSERT
  #include "json.hpp"

- std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
+ std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
+ bool force_gbnf = false);
+
+ struct common_grammar_builder {
+ std::function<std::string(const std::string &, const std::string &)> add_rule;
+ std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
+ std::function<void(nlohmann::ordered_json &)> resolve_refs;
+ };
+
+ struct common_grammar_options {
+ bool dotall = false;
+ bool compact_spaces = false;
+ };
+
+ std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
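The common_grammar_builder callbacks let callers mix hand-written GBNF rules with schema-derived rules. A rough usage sketch (not from the package; the helper name and rule strings are illustrative assumptions):

#include "json-schema-to-grammar.h"

// Hypothetical helper: wrap a schema-derived "args" rule inside a tool-call shape.
// The GBNF literals below are made up for illustration.
std::string make_tool_call_grammar(const nlohmann::ordered_json & args_schema) {
    common_grammar_options opts;
    opts.compact_spaces = true; // use the tighter "space" rule added in this release

    return build_grammar([&](const common_grammar_builder & b) {
        nlohmann::ordered_json schema = args_schema;
        b.resolve_refs(schema);
        std::string args_rule = b.add_schema("args", schema);
        b.add_rule("root", "\"<tool_call>\" space " + args_rule + " space \"</tool_call>\"");
    }, opts);
}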
package/cpp/llama-arch.cpp CHANGED
@@ -179,6 +179,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
+ { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
  { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
  { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
  { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@@ -1023,6 +1024,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_OUTPUT, "output" },
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
@@ -1443,10 +1447,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  };

- LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {}
+ LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

  std::string LLM_KV::operator()(llm_kv kv) const {
- return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+ return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
+ : ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
  }

  std::string LLM_TN_IMPL::str() const {
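With the suffix overload, LLM_KV_TOKENIZER_CHAT_TEMPLATE_N expands its %s placeholder into the key name. A small sketch of how a named-template key is expected to be formed; this mirrors the llama_model_chat_template change further down, and the "tool_use" suffix is an assumption:

#include "llama-arch.h" // internal header; sketch only
#include <string>

// Sketch: default vs. named chat-template GGUF key, based on the format
// strings registered above ("tokenizer.chat_template" / "tokenizer.chat_template.%s").
static std::string chat_template_key(llm_arch arch, const char * name) {
    return name ? LLM_KV(arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N) // e.g. "tokenizer.chat_template.tool_use"
                : LLM_KV(arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);        // "tokenizer.chat_template"
}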
package/cpp/llama-arch.h CHANGED
@@ -177,6 +177,7 @@ enum llm_kv {
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+ LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
  LLM_KV_TOKENIZER_FIM_PRE_ID,
  LLM_KV_TOKENIZER_FIM_SUF_ID,
  LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -335,9 +336,10 @@ enum llm_tensor_layer {
  };

  struct LLM_KV {
- LLM_KV(llm_arch arch);
+ LLM_KV(llm_arch arch, const char * suffix = nullptr);

  llm_arch arch;
+ const char * suffix;

  std::string operator()(llm_kv kv) const;
  };
package/cpp/llama-chat.cpp CHANGED
@@ -51,6 +51,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
  { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
  { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
  { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+ { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
  { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
  { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
  { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
@@ -115,7 +116,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
  } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
  return LLM_CHAT_TEMPLATE_PHI_3;
  } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
- return LLM_CHAT_TEMPLATE_FALCON_3;
+ return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
  } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
  return LLM_CHAT_TEMPLATE_ZEPHYR;
  } else if (tmpl_contains("bos_token + message['role']")) {
@@ -440,6 +441,14 @@ int32_t llm_chat_apply_template(
  if (add_ass) {
  ss << "<|assistant|>";
  }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>" << "\n" << message->content;
+ }
+ if (add_ass) {
+ ss << "<|assistant|>";
+ }
  } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
  // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
  for (auto message : chat) {
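For clarity, the new GLMEdge branch renders each message as <|role|> followed by a newline and the content, then appends <|assistant|> when add_ass is set. A standalone re-creation of that branch with made-up messages:

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Standalone re-creation of the GLMEdge branch above; the messages are illustrative.
int main() {
    struct msg { std::string role, content; };
    std::vector<msg> chat = { {"user", "Hi"}, {"assistant", "Hello"} };
    bool add_ass = true;

    std::ostringstream ss;
    for (const auto & message : chat) {
        ss << "<|" << message.role << "|>" << "\n" << message.content;
    }
    if (add_ass) {
        ss << "<|assistant|>";
    }
    // Produces: "<|user|>\nHi<|assistant|>\nHello<|assistant|>"
    printf("%s\n", ss.str().c_str());
    return 0;
}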
package/cpp/llama-chat.h CHANGED
@@ -31,6 +31,7 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_LLAMA_3,
  LLM_CHAT_TEMPLATE_CHATGML_3,
  LLM_CHAT_TEMPLATE_CHATGML_4,
+ LLM_CHAT_TEMPLATE_GLMEDGE,
  LLM_CHAT_TEMPLATE_MINICPM,
  LLM_CHAT_TEMPLATE_EXAONE_3,
  LLM_CHAT_TEMPLATE_RWKV_WORLD,
package/cpp/llama-grammar.cpp CHANGED
@@ -560,7 +560,7 @@ bool llama_grammar_parser::parse(const char * src) {
  }
  }
  } catch (const std::exception & err) {
- fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
+ fprintf(stderr, "%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
  rules.clear();
  return false;
  }
@@ -960,10 +960,28 @@ struct llama_grammar * llama_grammar_init_impl(
  // Important: vec_rules has to be moved here, not copied, because stacks contains
  // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
  // then the pointers would be invalidated when the local vec_rules goes out of scope.
- return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
+ return new llama_grammar {
+ vocab,
+ std::move(vec_rules),
+ std::move(stacks),
+ /* .partial_utf8 = */ {},
+ /* .lazy =*/ false,
+ /* .awaiting_trigger = */ false,
+ /* .trigger_buffer = */ "",
+ /* .trigger_tokens = */ {},
+ /* .trigger_words = */ {},
+ };
  }

- struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
+ struct llama_grammar * llama_grammar_init_impl(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ bool lazy,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens) {
  llama_grammar_parser parser;

  // if there is a grammar, parse it
@@ -1035,10 +1053,31 @@ struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab,
  }
  } while (true);

+ std::vector<llama_token> vec_trigger_tokens;
+ std::vector<std::string> vec_trigger_words;
+ for (size_t i = 0; i < num_trigger_tokens; i++) {
+ LM_GGML_ASSERT(trigger_tokens != nullptr);
+ vec_trigger_tokens.push_back(trigger_tokens[i]);
+ }
+ for (size_t i = 0; i < num_trigger_words; i++) {
+ LM_GGML_ASSERT(trigger_words != nullptr);
+ vec_trigger_words.push_back(trigger_words[i]);
+ }
+
  // Important: vec_rules has to be moved here, not copied, because stacks contains
  // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
  // then the pointers would be invalidated when the local vec_rules goes out of scope.
- return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
+ return new llama_grammar {
+ vocab,
+ std::move(vec_rules),
+ std::move(stacks),
+ /* .partial_utf8 = */ {},
+ /* .lazy = */ lazy,
+ /* .awaiting_trigger = */ lazy,
+ /* .trigger_buffer = */ "",
+ std::move(vec_trigger_tokens),
+ std::move(vec_trigger_words),
+ };
  }

  void llama_grammar_free_impl(struct llama_grammar * grammar) {
@@ -1055,6 +1094,11 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
  grammar.rules,
  grammar.stacks,
  grammar.partial_utf8,
+ grammar.lazy,
+ grammar.awaiting_trigger,
+ grammar.trigger_buffer,
+ grammar.trigger_tokens,
+ grammar.trigger_words,
  };

  // redirect elements in stacks to point to new rules
@@ -1076,6 +1120,10 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
  void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
  LM_GGML_ASSERT(grammar.vocab != nullptr);

+ if (grammar.awaiting_trigger) {
+ return;
+ }
+
  bool allow_eog = false;
  for (const auto & stack : grammar.stacks) {
  if (stack.empty()) {
@@ -1115,6 +1163,34 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
  void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
  LM_GGML_ASSERT(grammar.vocab != nullptr);

+ const auto & piece = grammar.vocab->token_to_piece(token);
+
+ if (grammar.awaiting_trigger) {
+ if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
+ grammar.awaiting_trigger = false;
+ grammar.trigger_buffer.clear();
+ llama_grammar_accept_str(grammar, piece);
+ LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
+ return;
+ } else {
+ // TODO: consider a smarter incremental substring search algorithm (store last position to search from).
+ grammar.trigger_buffer += piece;
+ for (const auto & word : grammar.trigger_words) {
+ auto pos = grammar.trigger_buffer.find(word);
+ if (pos != std::string::npos) {
+ grammar.awaiting_trigger = false;
+ auto constrained_str = grammar.trigger_buffer.substr(pos);
+ grammar.trigger_buffer.clear();
+ llama_grammar_accept_str(grammar, constrained_str);
+ LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str());
+ return;
+ }
+ }
+ LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`)\n", token, piece.c_str());
+ return;
+ }
+ }
+
  if (grammar.vocab->is_eog(token)) {
  for (const auto & stack : grammar.stacks) {
  if (stack.empty()) {
@@ -1124,8 +1200,10 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
  LM_GGML_ABORT("fatal error");
  }

- const std::string & piece = grammar.vocab->token_to_piece(token);
+ llama_grammar_accept_str(grammar, piece);
+ }

+ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar.partial_utf8);
  const auto & code_points = decoded.first;
@@ -1135,5 +1213,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
  }

  grammar.partial_utf8 = decoded.second;
- LM_GGML_ASSERT(!grammar.stacks.empty());
+ if (grammar.stacks.empty()) {
+ throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
+ }
  }
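The lazy-trigger flow added to llama_grammar_accept_impl is easier to follow outside the diff: while awaiting_trigger is set, decoded pieces are only buffered; once a trigger token or trigger word appears, the text from the trigger onward is fed to the grammar and constrained sampling begins. A toy re-creation of the word-trigger path, using only standard library types and made-up pieces:

#include <cstdio>
#include <string>
#include <vector>

// Toy model of the lazy word-trigger buffering above (no llama.cpp types).
int main() {
    const std::vector<std::string> trigger_words = { "<tool_call>" };
    std::vector<std::string> pieces = { "Sure", ", here: ", "<tool", "_call>", "{\"name\":" };

    std::string trigger_buffer;
    bool awaiting_trigger = true;

    for (const auto & piece : pieces) {
        if (!awaiting_trigger) {
            printf("constrained piece: %s\n", piece.c_str());
            continue;
        }
        trigger_buffer += piece;
        for (const auto & word : trigger_words) {
            auto pos = trigger_buffer.find(word);
            if (pos != std::string::npos) {
                awaiting_trigger = false;
                // Everything from the trigger onward would be accepted by the grammar.
                printf("triggered; constrained from: %s\n", trigger_buffer.substr(pos).c_str());
                trigger_buffer.clear();
                break;
            }
        }
    }
    return 0;
}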
package/cpp/llama-grammar.h CHANGED
@@ -114,6 +114,15 @@ struct llama_grammar {

  // buffer for partially generated UTF-8 sequence from accepted tokens
  llama_partial_utf8 partial_utf8;
+
+ // lazy grammars wait for trigger words or tokens before constraining the sampling.
+ // we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
+ // (useful e.g. for tool_choice=required)
+ bool lazy = false;
+ bool awaiting_trigger = false; // Initialized to true for lazy grammars only
+ std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
+ std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
+ std::vector<std::string> trigger_words;
  };

  //
@@ -127,7 +136,15 @@ struct llama_grammar * llama_grammar_init_impl(
  size_t n_rules,
  size_t start_rule_index);

- struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
+ struct llama_grammar * llama_grammar_init_impl(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ bool lazy,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens);

  void llama_grammar_free_impl(struct llama_grammar * grammar);

@@ -141,3 +158,7 @@ void llama_grammar_apply_impl(
  void llama_grammar_accept_impl(
  struct llama_grammar & grammar,
  llama_token token);
+
+ void llama_grammar_accept_str(
+ struct llama_grammar & grammar,
+ const std::string & piece);
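Given the extended llama_grammar_init_impl signature above, a lazy grammar would be constructed roughly as follows; the GBNF string and the trigger word are illustrative assumptions, not values shipped in the package:

#include "llama-grammar.h" // internal header; sketch only

// Sketch: a grammar that stays dormant until "<tool_call>" (as word or token) is produced.
static llama_grammar * make_lazy_tool_grammar(const llama_vocab * vocab, llama_token tool_call_token) {
    const char * grammar_str           = "root ::= \"<tool_call>\" [^<]* \"</tool_call>\"";
    const char * trigger_words[]       = { "<tool_call>" };
    const llama_token trigger_tokens[] = { tool_call_token };

    return llama_grammar_init_impl(
        vocab, grammar_str, "root",
        /* lazy = */ true,
        trigger_words,  /* num_trigger_words  = */ 1,
        trigger_tokens, /* num_trigger_tokens = */ 1);
}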
package/cpp/llama-impl.h CHANGED
@@ -6,13 +6,13 @@
  #include <vector>

  #ifdef __GNUC__
- #ifdef __MINGW32__
- #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ # if defined(__MINGW32__) && !defined(__clang__)
+ # define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ # else
+ # define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ # endif
  #else
- #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
- #endif
- #else
- #define LLAMA_ATTRIBUTE_FORMAT(...)
+ # define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif

  //
package/cpp/llama-kv-cache.h CHANGED
@@ -37,7 +37,7 @@ struct llama_kv_cache {
  bool can_shift = false;

  // Note: The value of head isn't only used to optimize searching
- // for a free KV slot. llama_decode_internal also uses it, so it
+ // for a free KV slot. llama_decode_impl also uses it, so it
  // cannot be freely changed after a slot has been allocated.
  uint32_t head = 0;
  uint32_t size = 0;
package/cpp/llama-mmap.h CHANGED
@@ -1,5 +1,6 @@
  #pragma once

+ #include <cstdint>
  #include <memory>
  #include <vector>

package/cpp/llama-model-loader.cpp CHANGED
@@ -819,7 +819,7 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
  for (const auto & file : files) {
  auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU));
  auto * is_numa_fn = (decltype(lm_ggml_is_numa) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_is_numa");
- std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
+ std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
  mmaps_used.emplace_back(mapping->size(), 0);
  if (mlock_mmaps) {
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
package/cpp/llama-model.cpp CHANGED
@@ -1093,8 +1093,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
- case 28: type = LLM_TYPE_6B; break;
- case 40: type = LLM_TYPE_9B; break;
+ case 28: {
+ if (hparams.n_head(0) == 16) {
+ type = LLM_TYPE_1_5B;
+ } else {
+ type = LLM_TYPE_6B;
+ }
+ } break;
+ case 40: {
+ if (hparams.n_head(0) == 24) {
+ type = LLM_TYPE_4B;
+ } else {
+ type = LLM_TYPE_9B;
+ }
+ } break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -1263,6 +1275,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  const bool use_mmap_buffer = true;

+ LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
+
  // build a list of buffer types for the CPU and GPU devices
  pimpl->cpu_buft_list = make_cpu_buft_list(devices);
  for (auto * dev : devices) {
@@ -1303,10 +1317,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
  auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+ LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, lm_ggml_backend_dev_name(cpu_dev));
  return {cpu_dev, &pimpl->cpu_buft_list};
  }
  const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  auto * dev = devices.at(layer_gpu);
+ LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, lm_ggml_backend_dev_name(dev));
  return {dev, &pimpl->gpu_buft_list.at(dev)};
  };

@@ -3066,9 +3082,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  auto & layer = layers[i];

  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }

  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

@@ -3955,8 +3979,10 @@ uint64_t llama_model_size(const struct llama_model * model) {
  return model->size();
  }

- const char * llama_model_chat_template(const struct llama_model * model) {
- const auto & it = model->lm_gguf_kv.find(LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE));
+ const char * llama_model_chat_template(const struct llama_model * model, const char * name) {
+ const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
+ : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
+ const auto & it = model->lm_gguf_kv.find(key);
  if (it == model->lm_gguf_kv.end()) {
  return nullptr;
  }
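Downstream, the extra name parameter lets callers ask for a named template variant stored under tokenizer.chat_template.<name>. A hedged usage sketch; whether a "tool_use" variant exists depends on the particular GGUF file:

#include "llama.h"
#include <cstdio>

// Sketch: read the default and a named chat template from an already-loaded model.
static void print_templates(const llama_model * model) {
    const char * default_tmpl = llama_model_chat_template(model, /* name = */ nullptr);
    const char * tool_tmpl    = llama_model_chat_template(model, "tool_use"); // assumption: variant present

    printf("default : %s\n", default_tmpl ? default_tmpl : "(none)");
    printf("tool_use: %s\n", tool_tmpl ? tool_tmpl : "(none)");
}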