cui-llama.rn 1.4.2 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +93 -114
- package/android/src/main/CMakeLists.txt +5 -0
- package/android/src/main/build-arm64/CMakeCache.txt +429 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +81 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +101 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +15 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +904 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +919 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +431 -0
- package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
- package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +165 -0
- package/android/src/main/build-arm64/CMakeFiles/Makefile2 +297 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/1 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/2 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/3 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/4 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/5 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/6 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +1 -0
- package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +8 -0
- package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +1 -0
- package/android/src/main/build-arm64/CMakeFiles/progress.marks +1 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +58 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +756 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +709 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +714 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +708 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +113 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +713 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +763 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +61 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +707 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +104 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +714 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +723 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +41 -0
- package/android/src/main/build-arm64/Makefile +1862 -0
- package/android/src/main/build-arm64/cmake_install.cmake +66 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +92 -18
- package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
- package/android/src/main/jni-utils.h +6 -0
- package/android/src/main/jni.cpp +287 -31
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
- package/cpp/chat-template.hpp +529 -0
- package/cpp/chat.cpp +1085 -0
- package/cpp/chat.hpp +55 -0
- package/cpp/common.cpp +159 -36
- package/cpp/common.h +64 -19
- package/cpp/ggml-alloc.c +1 -13
- package/cpp/ggml-common.h +0 -2
- package/cpp/ggml-cpu-impl.h +6 -12
- package/cpp/ggml-cpu-quants.c +937 -340
- package/cpp/ggml-cpu.c +207 -113
- package/cpp/ggml-cpu.cpp +4 -6
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-metal.h +66 -66
- package/cpp/ggml-metal.m +141 -23
- package/cpp/ggml.c +24 -14
- package/cpp/ggml.h +2 -2
- package/cpp/json-schema-to-grammar.cpp +46 -66
- package/cpp/json-schema-to-grammar.h +15 -1
- package/cpp/llama-arch.cpp +7 -2
- package/cpp/llama-arch.h +3 -1
- package/cpp/llama-chat.cpp +10 -1
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-grammar.cpp +86 -6
- package/cpp/llama-grammar.h +22 -1
- package/cpp/llama-impl.h +6 -6
- package/cpp/llama-kv-cache.h +1 -1
- package/cpp/llama-mmap.h +1 -0
- package/cpp/llama-model-loader.cpp +1 -1
- package/cpp/llama-model.cpp +32 -6
- package/cpp/llama-sampling.cpp +178 -61
- package/cpp/llama-vocab.cpp +8 -3
- package/cpp/llama.cpp +188 -128
- package/cpp/llama.h +27 -10
- package/cpp/log.cpp +32 -10
- package/cpp/log.h +12 -1
- package/cpp/minja.hpp +2883 -0
- package/cpp/rn-llama.cpp +82 -5
- package/cpp/rn-llama.h +16 -1
- package/cpp/sampling.cpp +68 -41
- package/cpp/sampling.h +3 -0
- package/cpp/sgemm.cpp +9 -8
- package/cpp/unicode.cpp +9 -2
- package/ios/CMakeLists.txt +6 -0
- package/ios/RNLlama.h +0 -8
- package/ios/RNLlama.mm +27 -3
- package/ios/RNLlamaContext.h +10 -1
- package/ios/RNLlamaContext.mm +269 -57
- package/jest/mock.js +21 -2
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +3 -0
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +87 -13
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +3 -0
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +86 -13
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +107 -2
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +32 -7
- package/lib/typescript/index.d.ts.map +1 -1
- package/llama-rn.podspec +1 -1
- package/package.json +2 -2
- package/src/NativeRNLlama.ts +115 -3
- package/src/grammar.ts +3 -0
- package/src/index.ts +138 -21
@@ -1,4 +1,6 @@
 #include "json-schema-to-grammar.h"
+#include "common.h"
+
 #include <algorithm>
 #include <fstream>
 #include <map>
@@ -11,11 +13,6 @@

 using json = nlohmann::ordered_json;

-template <typename Iterator>
-static std::string join(Iterator begin, Iterator end, const std::string & separator);
-
-static std::string repeat(const std::string & str, size_t n);
-
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();

@@ -128,8 +125,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         if (sub_len > 0) {
             auto from_sub = from.substr(i + 1);
             auto to_sub = to.substr(i + 1);
-            auto sub_zeros =
-            auto sub_nines =
+            auto sub_zeros = string_repeat("0", sub_len);
+            auto sub_nines = string_repeat("9", sub_len);

             auto to_reached = false;
             out << "(";
@@ -188,8 +185,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         auto max_digits = max_s.length();

         for (auto digits = min_digits; digits < max_digits; digits++) {
-            uniform_range(min_s,
-            min_s = "1" +
+            uniform_range(min_s, string_repeat("9", digits));
+            min_s = "1" + string_repeat("0", digits);
             out << " | ";
         }
         uniform_range(min_s, max_s);
@@ -318,49 +315,6 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
 std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

-template <typename Iterator>
-std::string join(Iterator begin, Iterator end, const std::string & separator) {
-    std::ostringstream result;
-    if (begin != end) {
-        result << *begin;
-        for (Iterator it = begin + 1; it != end; ++it) {
-            result << separator << *it;
-        }
-    }
-    return result.str();
-}
-
-static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
-    std::vector<std::string> tokens;
-    size_t start = 0;
-    size_t end = str.find(delimiter);
-
-    while (end != std::string::npos) {
-        tokens.push_back(str.substr(start, end - start));
-        start = end + delimiter.length();
-        end = str.find(delimiter, start);
-    }
-
-    tokens.push_back(str.substr(start));
-
-    return tokens;
-}
-
-static std::string repeat(const std::string & str, size_t n) {
-    if (n == 0) {
-        return "";
-    }
-
-    std::string result;
-    result.reserve(str.length() * n);
-
-    for (size_t i = 0; i < n; ++i) {
-        result += str;
-    }
-
-    return result;
-}
-
 static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
     std::smatch match;
     std::string result;
@@ -389,6 +343,7 @@ static std::string format_literal(const std::string & literal) {

 class SchemaConverter {
 private:
+    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
     std::function<json(const std::string &)> _fetch_json;
     bool _dotall;
     std::map<std::string, std::string> _rules;
@@ -418,7 +373,7 @@ private:
         for (size_t i = 0; i < alt_schemas.size(); i++) {
             rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
         }
-        return
+        return string_join(rules, " | ");
     }

     std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -481,7 +436,7 @@ private:
             for (const auto & item : ret) {
                 results.push_back(to_rule(item));
             }
-            return std::make_pair(
+            return std::make_pair(string_join(results, " "), false);
         };

         while (i < length) {
@@ -539,7 +494,7 @@ private:
             }
             curly_brackets += '}';
             i++;
-            auto nums =
+            auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
             int min_times = 0;
             int max_times = std::numeric_limits<int>::max();
             try {
@@ -809,10 +764,11 @@ private:
 public:
     SchemaConverter(
         const std::function<json(const std::string &)> & fetch_json,
-        bool dotall
+        bool dotall,
+        bool compact_spaces)
         : _fetch_json(fetch_json), _dotall(dotall)
     {
-        _rules["space"] = SPACE_RULE;
+        _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
     }

     void resolve_refs(json & schema, const std::string & url) {
@@ -854,7 +810,7 @@ public:
             return;
         }
         std::string pointer = ref.substr(ref.find('#') + 1);
-        std::vector<std::string> tokens =
+        std::vector<std::string> tokens = string_split(pointer, "/");
         for (size_t i = 1; i < tokens.size(); ++i) {
             std::string sel = tokens[i];
             if (target.is_null() || !target.contains(sel)) {
@@ -905,7 +861,7 @@ public:
             for (const auto & v : schema["enum"]) {
                 enum_values.push_back(_generate_constant_rule(v));
             }
-            return _add_rule(rule_name, "(" +
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
         } else if ((schema_type.is_null() || schema_type == "object")
                 && (schema.contains("properties") ||
                     (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -1019,10 +975,10 @@ public:

     void check_errors() {
         if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" +
+            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
         }
         if (!_warnings.empty()) {
-            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n",
+            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
         }
     }

@@ -1035,11 +991,35 @@ public:
     }
 };

-std::string json_schema_to_grammar(const json & schema) {
-
-
-
-
+std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
+#ifdef LLAMA_USE_LLGUIDANCE
+    if (!force_gbnf) {
+        return "%llguidance {}\nstart: %json " + schema.dump();
+    }
+#else
+    (void)force_gbnf;
+#endif // LLAMA_USE_LLGUIDANCE
+    return build_grammar([&](const common_grammar_builder & callbacks) {
+        auto copy = schema;
+        callbacks.resolve_refs(copy);
+        callbacks.add_schema("", copy);
+    });
+}
+
+std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
+    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+    common_grammar_builder builder {
+        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
+            return converter._add_rule(name, rule);
+        },
+        /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
+            return converter.visit(schema, name == "root" ? "" : name);
+        },
+        /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
+            converter.resolve_refs(schema, "");
+        }
+    };
+    cb(builder);
     converter.check_errors();
     return converter.format_grammar();
 }
package/cpp/json-schema-to-grammar.h
CHANGED
@@ -5,4 +5,18 @@
 #define JSON_ASSERT LM_GGML_ASSERT
 #include "json.hpp"

-std::string json_schema_to_grammar(const nlohmann::ordered_json& schema
+std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
+    bool force_gbnf = false);
+
+struct common_grammar_builder {
+    std::function<std::string(const std::string &, const std::string &)> add_rule;
+    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
+    std::function<void(nlohmann::ordered_json &)> resolve_refs;
+};
+
+struct common_grammar_options {
+    bool dotall = false;
+    bool compact_spaces = false;
+};
+
+std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
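For orientation, here is a minimal usage sketch of the grammar-builder API added in the two files above. It is not code shipped in the package; the schema literal, the function name, and the "yes-no" rule are illustrative only.

// Hypothetical usage sketch (not part of the package): convert a JSON schema to a
// GBNF grammar with the new two-argument overload, then use build_grammar to add a
// custom rule next to the schema-derived ones.
#include "json-schema-to-grammar.h"

static std::string example_grammars() {
    nlohmann::ordered_json schema = nlohmann::ordered_json::parse(
        R"({"type":"object","properties":{"name":{"type":"string"}},"required":["name"]})");

    // force_gbnf = true skips the llguidance shortcut even when it is compiled in
    std::string gbnf = json_schema_to_grammar(schema, /* force_gbnf = */ true);

    // build_grammar gives finer control via the common_grammar_builder callbacks
    common_grammar_options opts;
    opts.compact_spaces = true;   // emits "\" \"?" instead of the default SPACE_RULE
    std::string custom = build_grammar([&](const common_grammar_builder & builder) {
        builder.resolve_refs(schema);
        builder.add_schema("", schema);
        builder.add_rule("yes-no", "\"yes\" | \"no\"");
    }, opts);

    return gbnf + "\n" + custom;
}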
package/cpp/llama-arch.cpp
CHANGED
@@ -179,6 +179,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_HF_JSON,          "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV,             "tokenizer.rwkv.world" },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE,    "tokenizer.chat_template" },
+    { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,  "tokenizer.chat_template.%s" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID,       "tokenizer.ggml.fim_pre_token_id" },
     { LLM_KV_TOKENIZER_FIM_SUF_ID,       "tokenizer.ggml.fim_suf_token_id" },
     { LLM_KV_TOKENIZER_FIM_MID_ID,       "tokenizer.ggml.fim_mid_token_id" },
@@ -1023,6 +1024,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     { LLM_TENSOR_OUTPUT,     "output" },
     { LLM_TENSOR_ATTN_NORM,  "blk.%d.attn_norm" },
     { LLM_TENSOR_ATTN_QKV,   "blk.%d.attn_qkv" },
+    { LLM_TENSOR_ATTN_Q,     "blk.%d.attn_q" },
+    { LLM_TENSOR_ATTN_K,     "blk.%d.attn_k" },
+    { LLM_TENSOR_ATTN_V,     "blk.%d.attn_v" },
     { LLM_TENSOR_ATTN_OUT,   "blk.%d.attn_output" },
     { LLM_TENSOR_FFN_NORM,   "blk.%d.ffn_norm" },
     { LLM_TENSOR_FFN_UP,     "blk.%d.ffn_up" },
@@ -1443,10 +1447,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
 };

-LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {}
+LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

 std::string LLM_KV::operator()(llm_kv kv) const {
-    return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch))
+    return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
+                  : ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
 }

 std::string LLM_TN_IMPL::str() const {
package/cpp/llama-arch.h
CHANGED
@@ -177,6 +177,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -335,9 +336,10 @@ enum llm_tensor_layer {
 };

 struct LLM_KV {
-    LLM_KV(llm_arch arch);
+    LLM_KV(llm_arch arch, const char * suffix = nullptr);

     llm_arch arch;
+    const char * suffix;

     std::string operator()(llm_kv kv) const;
 };
package/cpp/llama-chat.cpp
CHANGED
@@ -51,6 +51,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama3",     LLM_CHAT_TEMPLATE_LLAMA_3 },
     { "chatglm3",   LLM_CHAT_TEMPLATE_CHATGML_3 },
     { "chatglm4",   LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "glmedge",    LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm",    LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3",    LLM_CHAT_TEMPLATE_EXAONE_3 },
     { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
@@ -115,7 +116,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
-        return LLM_CHAT_TEMPLATE_FALCON_3;
+        return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -440,6 +441,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
+        }
+        if (add_ass) {
+            ss << "<|assistant|>";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {
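As a quick illustration of the new GLMEDGE branch above, here is a standalone sketch (not part of the package) that reproduces the same formatting rule for a small chat; the struct and function names are made up for the example.

// Hypothetical sketch mirroring the LLM_CHAT_TEMPLATE_GLMEDGE branch shown above:
// each message is rendered as "<|role|>\ncontent", with a trailing "<|assistant|>"
// when the assistant turn should be opened.
#include <sstream>
#include <string>
#include <vector>

struct example_msg { std::string role; std::string content; };

static std::string format_glmedge(const std::vector<example_msg> & chat, bool add_ass) {
    std::ostringstream ss;
    for (const auto & m : chat) {
        ss << "<|" << m.role << "|>" << "\n" << m.content;
    }
    if (add_ass) {
        ss << "<|assistant|>";
    }
    return ss.str();
}

// format_glmedge({{"user", "Hi"}}, true) yields "<|user|>\nHi<|assistant|>"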
package/cpp/llama-chat.h
CHANGED
package/cpp/llama-grammar.cpp
CHANGED
@@ -560,7 +560,7 @@ bool llama_grammar_parser::parse(const char * src) {
             }
         }
     } catch (const std::exception & err) {
-        fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
+        fprintf(stderr, "%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
         rules.clear();
         return false;
     }
@@ -960,10 +960,28 @@ struct llama_grammar * llama_grammar_init_impl(
     // Important: vec_rules has to be moved here, not copied, because stacks contains
     // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
     // then the pointers would be invalidated when the local vec_rules goes out of scope.
-    return new llama_grammar {
+    return new llama_grammar {
+        vocab,
+        std::move(vec_rules),
+        std::move(stacks),
+        /* .partial_utf8 = */     {},
+        /* .lazy =*/              false,
+        /* .awaiting_trigger = */ false,
+        /* .trigger_buffer = */   "",
+        /* .trigger_tokens = */   {},
+        /* .trigger_words = */    {},
+    };
 }

-struct llama_grammar * llama_grammar_init_impl(
+struct llama_grammar * llama_grammar_init_impl(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        bool lazy,
+        const char ** trigger_words,
+        size_t num_trigger_words,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
     llama_grammar_parser parser;

     // if there is a grammar, parse it
@@ -1035,10 +1053,31 @@ struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab,
         }
     } while (true);

+    std::vector<llama_token> vec_trigger_tokens;
+    std::vector<std::string> vec_trigger_words;
+    for (size_t i = 0; i < num_trigger_tokens; i++) {
+        LM_GGML_ASSERT(trigger_tokens != nullptr);
+        vec_trigger_tokens.push_back(trigger_tokens[i]);
+    }
+    for (size_t i = 0; i < num_trigger_words; i++) {
+        LM_GGML_ASSERT(trigger_words != nullptr);
+        vec_trigger_words.push_back(trigger_words[i]);
+    }
+
     // Important: vec_rules has to be moved here, not copied, because stacks contains
     // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
     // then the pointers would be invalidated when the local vec_rules goes out of scope.
-    return new llama_grammar {
+    return new llama_grammar {
+        vocab,
+        std::move(vec_rules),
+        std::move(stacks),
+        /* .partial_utf8 = */     {},
+        /* .lazy = */             lazy,
+        /* .awaiting_trigger = */ lazy,
+        /* .trigger_buffer = */   "",
+        std::move(vec_trigger_tokens),
+        std::move(vec_trigger_words),
+    };
 }

 void llama_grammar_free_impl(struct llama_grammar * grammar) {
@@ -1055,6 +1094,11 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
         grammar.rules,
         grammar.stacks,
         grammar.partial_utf8,
+        grammar.lazy,
+        grammar.awaiting_trigger,
+        grammar.trigger_buffer,
+        grammar.trigger_tokens,
+        grammar.trigger_words,
     };

     // redirect elements in stacks to point to new rules
@@ -1076,6 +1120,10 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
 void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
     LM_GGML_ASSERT(grammar.vocab != nullptr);

+    if (grammar.awaiting_trigger) {
+        return;
+    }
+
     bool allow_eog = false;
     for (const auto & stack : grammar.stacks) {
         if (stack.empty()) {
@@ -1115,6 +1163,34 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
     LM_GGML_ASSERT(grammar.vocab != nullptr);

+    const auto & piece = grammar.vocab->token_to_piece(token);
+
+    if (grammar.awaiting_trigger) {
+        if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
+            grammar.awaiting_trigger = false;
+            grammar.trigger_buffer.clear();
+            llama_grammar_accept_str(grammar, piece);
+            LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
+            return;
+        } else {
+            // TODO: consider a smarter incremental substring search algorithm (store last position to search from).
+            grammar.trigger_buffer += piece;
+            for (const auto & word : grammar.trigger_words) {
+                auto pos = grammar.trigger_buffer.find(word);
+                if (pos != std::string::npos) {
+                    grammar.awaiting_trigger = false;
+                    auto constrained_str = grammar.trigger_buffer.substr(pos);
+                    grammar.trigger_buffer.clear();
+                    llama_grammar_accept_str(grammar, constrained_str);
+                    LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str());
+                    return;
+                }
+            }
+            LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`)\n", token, piece.c_str());
+            return;
+        }
+    }
+
     if (grammar.vocab->is_eog(token)) {
         for (const auto & stack : grammar.stacks) {
             if (stack.empty()) {
@@ -1124,8 +1200,10 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         LM_GGML_ABORT("fatal error");
     }

-
+    llama_grammar_accept_str(grammar, piece);
+}

+void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar.partial_utf8);
     const auto & code_points = decoded.first;
@@ -1135,5 +1213,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
     }

     grammar.partial_utf8 = decoded.second;
-
+    if (grammar.stacks.empty()) {
+        throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
+    }
 }
package/cpp/llama-grammar.h
CHANGED
@@ -114,6 +114,15 @@ struct llama_grammar {

     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
+
+    // lazy grammars wait for trigger words or tokens before constraining the sampling.
+    // we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
+    // (useful e.g. for tool_choice=required)
+    bool                     lazy             = false;
+    bool                     awaiting_trigger = false; // Initialized to true for lazy grammars only
+    std::string              trigger_buffer;  // Output buffered by lazy grammar. Will be cleared once trigger is found.
+    std::vector<llama_token> trigger_tokens;  // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
+    std::vector<std::string> trigger_words;
 };

 //
@@ -127,7 +136,15 @@ struct llama_grammar * llama_grammar_init_impl(
         size_t n_rules,
         size_t start_rule_index);

-struct llama_grammar * llama_grammar_init_impl(
+struct llama_grammar * llama_grammar_init_impl(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        bool lazy,
+        const char ** trigger_words,
+        size_t num_trigger_words,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens);

 void llama_grammar_free_impl(struct llama_grammar * grammar);

@@ -141,3 +158,7 @@ void llama_grammar_apply_impl(
 void llama_grammar_accept_impl(
         struct llama_grammar & grammar,
         llama_token token);
+
+void llama_grammar_accept_str(
+        struct llama_grammar & grammar,
+        const std::string & piece);
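To make the lazy-grammar additions above concrete, here is a minimal sketch of the new init path declared in this header. It is not code from the package: the `vocab` pointer and GBNF string are assumed to exist, and the "<tool_call>" trigger word is illustrative.

// Hypothetical sketch of the lazy-grammar init overload declared above.
// Sampling stays unconstrained until the trigger word appears in the output.
static struct llama_grammar * make_lazy_tool_grammar(const struct llama_vocab * vocab,
                                                     const char * gbnf) {
    const char * trigger_words[] = { "<tool_call>" };   // illustrative trigger word
    return llama_grammar_init_impl(
        vocab,
        gbnf,              // grammar_str
        "root",            // grammar_root
        /* lazy                = */ true,
        trigger_words,
        /* num_trigger_words   = */ 1,
        /* trigger_tokens      = */ nullptr,
        /* num_trigger_tokens  = */ 0);
}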
package/cpp/llama-impl.h
CHANGED
@@ -6,13 +6,13 @@
 #include <vector>

 #ifdef __GNUC__
-#
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#  if defined(__MINGW32__) && !defined(__clang__)
+#    define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#  else
+#    define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#  endif
 #else
-#define LLAMA_ATTRIBUTE_FORMAT(...)
-#endif
-#else
-#define LLAMA_ATTRIBUTE_FORMAT(...)
+#  define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif

 //
package/cpp/llama-kv-cache.h
CHANGED
@@ -37,7 +37,7 @@ struct llama_kv_cache {
     bool can_shift = false;

     // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot.
+    // for a free KV slot. llama_decode_impl also uses it, so it
     // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
package/cpp/llama-mmap.h
CHANGED
@@ -819,7 +819,7 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
     for (const auto & file : files) {
         auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU));
         auto * is_numa_fn = (decltype(lm_ggml_is_numa) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_is_numa");
-        std::unique_ptr<llama_mmap> mapping
+        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
         mmaps_used.emplace_back(mapping->size(), 0);
         if (mlock_mmaps) {
             std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
package/cpp/llama-model.cpp
CHANGED
@@ -1093,8 +1093,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
-                    case 28:
-
+                    case 28: {
+                        if (hparams.n_head(0) == 16) {
+                            type = LLM_TYPE_1_5B;
+                        } else {
+                            type = LLM_TYPE_6B;
+                        }
+                    } break;
+                    case 40: {
+                        if (hparams.n_head(0) == 24) {
+                            type = LLM_TYPE_4B;
+                        } else {
+                            type = LLM_TYPE_9B;
+                        }
+                    } break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1263,6 +1275,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

     const bool use_mmap_buffer = true;

+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
+
     // build a list of buffer types for the CPU and GPU devices
     pimpl->cpu_buft_list = make_cpu_buft_list(devices);
     for (auto * dev : devices) {
@@ -1303,10 +1317,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, lm_ggml_backend_dev_name(cpu_dev));
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
+        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, lm_ggml_backend_dev_name(dev));
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };

@@ -3066,9 +3082,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     auto & layer = layers[i];

                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

-                    layer.wqkv
-
+                    if (layer.wqkv == nullptr) {
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    }

                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

@@ -3955,8 +3979,10 @@ uint64_t llama_model_size(const struct llama_model * model) {
     return model->size();
 }

-const char * llama_model_chat_template(const struct llama_model * model) {
-    const auto
+const char * llama_model_chat_template(const struct llama_model * model, const char * name) {
+    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
+        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
+    const auto & it = model->lm_gguf_kv.find(key);
     if (it == model->lm_gguf_kv.end()) {
         return nullptr;
     }
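To illustrate the updated llama_model_chat_template signature above, here is a small sketch (assuming an already-loaded model pointer and the matching declaration in llama.h; not code from the package) that asks for a named template variant and falls back to the default one.

// Hypothetical usage of the two-argument llama_model_chat_template shown above.
static const char * pick_chat_template(const struct llama_model * model) {
    // look up a named template variant first ("tool_use" is illustrative)
    const char * tmpl = llama_model_chat_template(model, "tool_use");
    if (tmpl == nullptr) {
        // fall back to the default tokenizer.chat_template entry
        tmpl = llama_model_chat_template(model, nullptr);
    }
    return tmpl;  // may still be nullptr if the GGUF carries no template at all
}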