cactus-react-native 0.0.1 → 0.1.1
This diff compares the contents of two publicly released versions of the package, as published to their public registry. The information is provided for informational purposes only.
- package/LICENSE.txt +20 -0
- package/README.md +3 -1
- package/android/src/main/CMakeLists.txt +58 -23
- package/android/src/main/java/com/cactus/Cactus.java +484 -16
- package/android/src/main/java/com/cactus/LlamaContext.java +199 -0
- package/android/src/main/jni.cpp +325 -10
- package/android/src/main/jniLibs/arm64-v8a/libcactus.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/libcactus.so +0 -0
- package/android/src/main/jniLibs/x86_64/libcactus_x86_64.so +0 -0
- package/android/src/newarch/java/com/cactus/CactusModule.java +79 -7
- package/android/src/oldarch/java/com/cactus/CactusModule.java +70 -0
- package/cactus-react-native.podspec +0 -3
- package/ios/CMakeLists.txt +58 -36
- package/ios/Cactus.mm +243 -2
- package/ios/CactusContext.h +22 -0
- package/ios/CactusContext.mm +176 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus.h +92 -5
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +268 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/chat.h +2 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/common.h +42 -51
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-backend.h +4 -4
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-common.h +12 -6
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpp.h +1 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu.h +5 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-impl.h +52 -18
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-metal-impl.h +106 -14
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-opt.h +49 -28
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml.h +87 -106
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-arch.h +16 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-batch.h +2 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-chat.h +7 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-context.h +44 -33
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-cparams.h +1 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-graph.h +83 -17
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-hparams.h +44 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-kv-cache.h +407 -179
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-memory.h +13 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-model-loader.h +5 -3
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-model-saver.h +37 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-model.h +24 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-vocab.h +6 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama.h +102 -142
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/minja/chat-template.hpp +23 -11
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/minja/minja.hpp +186 -127
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/ggml-llama.metallib +0 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/cactus.h +92 -5
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/cactus_ffi.h +268 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/chat.h +2 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/common.h +42 -51
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-backend.h +4 -4
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-common.h +12 -6
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpp.h +1 -1
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu.h +5 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-impl.h +52 -18
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-metal-impl.h +106 -14
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-opt.h +49 -28
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml.h +87 -106
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-arch.h +16 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-batch.h +2 -1
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-chat.h +7 -2
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-context.h +44 -33
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-cparams.h +1 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-graph.h +83 -17
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-hparams.h +44 -2
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-kv-cache.h +407 -179
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-memory.h +13 -2
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-loader.h +5 -3
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-saver.h +37 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-model.h +24 -2
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-vocab.h +6 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama.h +102 -142
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/minja/chat-template.hpp +23 -11
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/minja/minja.hpp +186 -127
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/_CodeSignature/CodeResources +1 -1
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/ggml-llama-sim.metallib +0 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/cactus.h +92 -5
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/cactus_ffi.h +268 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/chat.h +2 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/common.h +42 -51
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-backend.h +4 -4
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-common.h +12 -6
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpp.h +1 -1
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu.h +5 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-impl.h +52 -18
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-metal-impl.h +106 -14
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-opt.h +49 -28
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml.h +87 -106
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-arch.h +16 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-batch.h +2 -1
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-chat.h +7 -2
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-context.h +44 -33
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-cparams.h +1 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-graph.h +83 -17
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-hparams.h +44 -2
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-kv-cache.h +407 -179
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-memory.h +13 -2
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-model-loader.h +5 -3
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-model-saver.h +37 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-model.h +24 -2
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-vocab.h +6 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama.h +102 -142
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/minja/chat-template.hpp +23 -11
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/minja/minja.hpp +186 -127
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/ggml-llama.metallib +0 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/cactus.h +92 -5
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/cactus_ffi.h +268 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/chat.h +2 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/common.h +42 -51
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-backend.h +4 -4
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-common.h +12 -6
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpp.h +1 -1
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu.h +5 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-impl.h +52 -18
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-metal-impl.h +106 -14
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-opt.h +49 -28
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml.h +87 -106
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-arch.h +16 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-batch.h +2 -1
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-chat.h +7 -2
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-context.h +44 -33
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-cparams.h +1 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-graph.h +83 -17
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-hparams.h +44 -2
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-kv-cache.h +407 -179
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-memory.h +13 -2
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-loader.h +5 -3
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-saver.h +37 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model.h +24 -2
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-vocab.h +6 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama.h +102 -142
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/minja/chat-template.hpp +23 -11
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/minja/minja.hpp +186 -127
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/_CodeSignature/CodeResources +1 -1
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/ggml-llama-sim.metallib +0 -0
- package/lib/commonjs/NativeCactus.js +1 -0
- package/lib/commonjs/NativeCactus.js.map +1 -1
- package/lib/commonjs/index.js +112 -0
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/tools.js +118 -0
- package/lib/commonjs/tools.js.map +1 -0
- package/lib/module/NativeCactus.js +3 -0
- package/lib/module/NativeCactus.js.map +1 -1
- package/lib/module/index.js +87 -1
- package/lib/module/index.js.map +1 -1
- package/lib/module/tools.js +110 -0
- package/lib/module/tools.js.map +1 -0
- package/lib/typescript/NativeCactus.d.ts +30 -1
- package/lib/typescript/NativeCactus.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +21 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/lib/typescript/tools.d.ts +38 -0
- package/lib/typescript/tools.d.ts.map +1 -0
- package/package.json +6 -3
- package/src/NativeCactus.ts +62 -1
- package/src/index.ts +113 -2
- package/src/tools.ts +127 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/sgemm.h +0 -14
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/sgemm.h +0 -14
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/sgemm.h +0 -14
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/sgemm.h +0 -14
package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-memory.h
CHANGED
@@ -2,12 +2,22 @@
 
 #include "llama.h"
 
+struct llama_memory_params {
+    // kv cache
+    lm_ggml_type type_k;
+    lm_ggml_type type_v;
+
+    // use full-size SWA cache
+    bool swa_full;
+};
+
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 class llama_memory_i {
 public:
+    virtual ~llama_memory_i() = default;
+
     virtual void clear() = 0;
-    virtual void defrag() = 0;
 
     virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
     virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
@@ -15,7 +25,8 @@ public:
     virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0;
     virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
 
-    virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
+    virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
+    virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
 
     virtual bool get_can_edit() const = 0;
 };
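The rewritten memory interface drops `defrag()` and adds per-sequence position bounds plus a creation-time parameter struct. A hedged sketch of how a context might populate the new params; the field values are illustrative, not taken from the package:

```cpp
// Sketch: how a llama_context could translate its settings into the new
// llama_memory_params before memory creation (cache types are illustrative).
llama_memory_params mem_params;
mem_params.type_k   = LM_GGML_TYPE_F16; // K-cache data type
mem_params.type_v   = LM_GGML_TYPE_F16; // V-cache data type
mem_params.swa_full = false;            // use the smaller sliding-window cache
```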
package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-loader.h
CHANGED
@@ -77,8 +77,9 @@ struct llama_model_loader {
 
     llama_mmaps mappings;
 
-    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
-    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
+    std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
+    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
+    const llama_model_tensor_buft_override * tensor_buft_overrides;
 
     lm_gguf_context_ptr meta;
     std::vector<lm_ggml_context_ptr> contexts;
@@ -95,7 +96,8 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p);
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-saver.h
ADDED
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-arch.h"
+
+#include <vector>
+
+struct llama_model_saver {
+    struct lm_gguf_context * lm_gguf_ctx = nullptr;
+    const struct llama_model & model;
+    const struct LLM_KV llm_kv;
+
+    llama_model_saver(const struct llama_model & model);
+    ~llama_model_saver();
+
+    void add_kv(enum llm_kv key, uint32_t value);
+    void add_kv(enum llm_kv key, int32_t value);
+    void add_kv(enum llm_kv key, float value);
+    void add_kv(enum llm_kv key, bool value);
+    void add_kv(enum llm_kv key, const char * value);
+
+    [[noreturn]]
+    void add_kv(enum llm_kv key, char value); // needed to make the template below compile
+
+    template <typename Container>
+    void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);
+
+    void add_kv(enum llm_kv key, const std::vector<std::string> & value);
+
+    void add_tensor(const struct lm_ggml_tensor * tensor);
+
+    void add_kv_from_model();
+
+    void add_tensors_from_model();
+
+    void save(const std::string & path_model);
+};
package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model.h
CHANGED
@@ -36,14 +36,18 @@ enum llm_type {
     LLM_TYPE_335M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
+    LLM_TYPE_475M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
+    LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
@@ -61,6 +65,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -69,7 +74,9 @@ enum llm_type {
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
+    LLM_TYPE_405B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
     LLM_TYPE_MEDIUM,
@@ -83,9 +90,14 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
+    LLM_TYPE_17B_16E, // llama4 Scout
+    LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };
 
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
+
 struct llama_layer_posnet {
     // resnet
     struct lm_ggml_tensor * norm1 = nullptr;
@@ -167,6 +179,8 @@ struct llama_layer {
     struct lm_ggml_tensor * wq_b = nullptr;
     struct lm_ggml_tensor * wkv_a_mqa = nullptr;
     struct lm_ggml_tensor * wkv_b = nullptr;
+    struct lm_ggml_tensor * wk_b = nullptr;
+    struct lm_ggml_tensor * wv_b = nullptr;
     struct lm_ggml_tensor * wq_cross = nullptr;
     struct lm_ggml_tensor * wk_cross = nullptr;
     struct lm_ggml_tensor * wv_cross = nullptr;
@@ -380,10 +394,18 @@ struct llama_model {
 
     lm_ggml_backend_buffer_type_t select_buft(int il) const;
 
+    bool has_tensor_overrides() const;
+
     const struct lm_ggml_tensor * get_tensor(const char * name) const;
 
+    float get_rope_freq_base (const llama_cparams & cparams, int il) const;
+    float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
+
+    lm_ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
+
+    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory() const; // TODO: params
+    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
     llm_graph_result_ptr build_graph(
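Downstream of this header change, memory creation now flows through the params struct from llama-memory.h and may adjust the context parameters. A hypothetical call site, not package code:

```cpp
// Sketch: the new create_memory() contract. `model` and `cparams` are assumed
// to exist; per the header's note, cparams can be mutated by the call
// (e.g. when sizing a sliding-window-attention cache).
llama_memory_params mem_params = {};
mem_params.swa_full = cparams.swa_full;
llama_memory_i * memory = model.create_memory(mem_params, cparams);
```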
package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-vocab.h
CHANGED
@@ -21,6 +21,9 @@ struct llama_vocab {
 
     void load(llama_model_loader & ml, const LLM_KV & kv);
 
+    std::string get_tokenizer_model() const;
+    std::string get_tokenizer_pre() const;
+
     enum llama_vocab_type get_type() const;
     enum llama_vocab_pre_type get_pre_type() const;
 
@@ -80,6 +83,9 @@ struct llama_vocab {
     int max_token_len() const;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+    std::vector<std::string> get_bpe_merges() const;
+
+    std::vector<char> get_precompiled_charsmap() const;
 
     int32_t tokenize(
             const char * text,
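The new getters expose the vocab state that a saver such as `llama_model_saver` would need to round-trip tokenizer metadata into a GGUF file. A hypothetical read-back, with illustrative local names:

```cpp
// Sketch: reading tokenizer metadata through the new accessors.
const std::string tok_model = vocab.get_tokenizer_model();       // e.g. "gpt2" (illustrative)
const std::string tok_pre   = vocab.get_tokenizer_pre();         // pre-tokenizer variant
const std::vector<std::string> merges = vocab.get_bpe_merges();  // BPE merge list
const std::vector<char> charsmap = vocab.get_precompiled_charsmap();
```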
package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama.h
CHANGED
@@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
+#include "ggml-opt.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -107,6 +108,12 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
         LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
+        LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
+        LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
+        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
+        LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+        LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
     };
 
     enum llama_rope_type {
@@ -277,10 +284,18 @@ extern "C" {
         };
     };
 
+    struct llama_model_tensor_buft_override {
+        const char * pattern;
+        lm_ggml_backend_buffer_type_t buft;
+    };
+
     struct llama_model_params {
         // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
         lm_ggml_backend_dev_t * devices;
 
+        // NULL-terminated list of buffer types to use for tensors that match a pattern
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
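The new `tensor_buft_overrides` field lets a caller pin tensors whose names match a pattern to a specific backend buffer type. A minimal sketch against the cactus-prefixed (`lm_ggml_*`) API; the pattern, model path, and the intent of keeping MoE expert tensors on the CPU are illustrative assumptions, not package code:

```cpp
#include "llama.h"

// Sketch: route matching tensors to host memory, everything else as usual.
// Assumes upstream llama.cpp semantics: patterns are matched against tensor
// names and the override list is NULL-terminated, per the header comment.
int main() {
    const struct llama_model_tensor_buft_override overrides[] = {
        { "ffn_(up|down|gate)_exps", lm_ggml_backend_cpu_buffer_type() }, // hypothetical pattern
        { nullptr, nullptr },                                             // terminator
    };

    struct llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides;

    struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model) {
        llama_model_free(model);
    }
    return 0;
}
```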
@@ -330,7 +345,7 @@ extern "C" {
         float yarn_beta_fast; // YaRN low correction dim
         float yarn_beta_slow; // YaRN high correction dim
         uint32_t yarn_orig_ctx; // YaRN original context size
-        float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
+        float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
         lm_ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
@@ -338,34 +353,35 @@ extern "C" {
         enum lm_ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum lm_ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings; // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf; // whether to measure performance timings
-
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         lm_ggml_abort_callback abort_callback;
         void * abort_callback_data;
+
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        bool embeddings; // if true, extract embeddings (together with logits)
+        bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn; // use flash attention [EXPERIMENTAL]
+        bool no_perf; // measure performance timings
+        bool op_offload; // offload host tensor operations to device
+        bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     };
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;
-        enum llama_ftype ftype;
-        enum lm_ggml_type output_tensor_type;
-        enum lm_ggml_type token_embedding_type;
-        bool allow_requantize;
-        bool quantize_output_tensor;
-        bool only_copy;
-        bool pure;
-        bool keep_split;
-        void * imatrix;
-        void * kv_overrides;
+        int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype; // quantize to this llama_ftype
+        enum lm_ggml_type output_tensor_type; // output tensor type
+        enum lm_ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize; // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+        bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure; // quantize all tensors to the default type
+        bool keep_split; // quantize to the same number of shards
+        void * imatrix; // pointer to importance matrix data
+        void * kv_overrides; // pointer to vector containing overrides
+        void * tensor_types; // pointer to vector containing tensor types
     } llama_model_quantize_params;
 
     typedef struct llama_logit_bias {
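A hedged sketch of opting into the two new context flags; `llama_init_from_model` and the default-params helper are upstream llama.cpp API assumed unchanged here, and the model handle is assumed to exist:

```cpp
#include "llama.h"

// Sketch: enable the context options added in this release.
// Assumes `model` was loaded earlier with llama_model_load_from_file().
struct llama_context * make_context(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.swa_full   = true;  // full-size sliding-window-attention cache
    cparams.op_offload = true;  // offload host tensor ops to the device
    cparams.no_perf    = false; // keep performance timings
    return llama_init_from_model(model, cparams);
}
```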
@@ -431,6 +447,10 @@ extern "C" {
             size_t n_paths,
             struct llama_model_params params);
 
+    LLAMA_API void llama_model_save_to_file(
+            const struct llama_model * model,
+            const char * path_model);
+
     DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
             "use llama_model_free instead");
 
@@ -588,71 +608,14 @@ extern "C" {
     // KV cache
     //
 
-    // TODO: start using struct llama_kv_cache
-
-    // Information associated with an individual cell in the KV cache view.
-    struct llama_kv_cache_view_cell {
-        // The position for this cell. Takes KV cache shifts into account.
-        // May be negative if the cell is not populated.
-        llama_pos pos;
-    };
-
-    // An updateable view of the KV cache.
-    struct llama_kv_cache_view {
-        // Number of KV cache cells. This will be the same as the context size.
-        int32_t n_cells;
-
-        // Maximum number of sequences that can exist in a cell. It's not an error
-        // if there are more sequences in a cell than this value, however they will
-        // not be visible in the view cells_sequences.
-        int32_t n_seq_max;
-
-        // Number of tokens in the cache. For example, if there are two populated
-        // cells, the first with 1 sequence id in it and the second with 2 sequence
-        // ids then you'll have 3 tokens.
-        int32_t token_count;
-
-        // Number of populated cache cells.
-        int32_t used_cells;
-
-        // Maximum contiguous empty slots in the cache.
-        int32_t max_contiguous;
-
-        // Index to the start of the max_contiguous slot range. Can be negative
-        // when cache is full.
-        int32_t max_contiguous_idx;
-
-        // Information for an individual cell.
-        struct llama_kv_cache_view_cell * cells;
-
-        // The sequences for each cell. There will be n_seq_max items per cell.
-        llama_seq_id * cells_sequences;
-    };
-
-    // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-
-    // Free a KV cache view. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
-    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
-    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
-    ///
-
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
-
-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "use llama_kv_self_n_tokens instead");
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
+            "Use llama_kv_self_seq_pos_max() instead");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
-
-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
-            "use llama_kv_self_used_cells instead");
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
+            "Use llama_kv_self_seq_pos_max() instead");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_self_clear(
@@ -711,10 +674,18 @@ extern "C" {
             llama_pos p1,
             int d);
 
+    // Returns the smallest position present in the KV cache for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+            struct llama_context * ctx,
+            llama_seq_id seq_id);
+
     // Returns the largest position present in the KV cache for the specified sequence
+    // Return -1 if the sequence is empty
     LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
-
+            llama_seq_id seq_id);
 
     // Defragment the KV cache
     // This will be applied:
@@ -728,61 +699,6 @@ extern "C" {
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
     LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
 
-    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx),
-        "use llama_kv_self_clear instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1),
-        "use llama_kv_self_seq_rm instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1),
-        "use llama_kv_self_seq_cp instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "use llama_kv_self_seq_keep instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta),
-        "use llama_kv_self_seq_add instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d),
-        "use llama_kv_self_seq_div instead");
-
-    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "use llama_kv_self_seq_pos_max instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
-        "use llama_kv_self_defrag instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
-        "use llama_kv_self_can_shift instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
-        "use llama_kv_self_update instead");
-
-
     //
     // State / sessions
     //
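For callers still on the 0.0.x-era `llama_kv_cache_*` names, a migration sketch using only functions declared above; sequence id 0 is just an example:

```cpp
#include "llama.h"

// Sketch: replace the removed llama_kv_cache_* wrappers with the
// llama_kv_self_* family and the new per-sequence position queries.
void reset_sequence(struct llama_context * ctx) {
    llama_kv_self_seq_rm(ctx, /*seq_id=*/0, /*p0=*/0, /*p1=*/-1); // was llama_kv_cache_seq_rm()
    llama_kv_self_clear(ctx);                                     // was llama_kv_cache_clear()

    // New in this release: both return -1 when the sequence is empty.
    llama_pos lo = llama_kv_self_seq_pos_min(ctx, 0);
    llama_pos hi = llama_kv_self_seq_pos_max(ctx, 0);
    (void) lo; (void) hi;
}
```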
@@ -910,18 +826,26 @@ extern "C" {
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
 
-    //
-    //
+    // Process a batch of tokens.
+    // In contrast to llama_decode() - this call does not use KV cache.
+    // For encode-decoder contexts, processes the batch using the encoder.
+    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     // 0 - success
     // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
             struct llama_batch batch);
 
+    // Process a batch of tokens.
+    // Requires KV cache.
+    // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
-    //
-    //
-    //
+    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    // 0 - success
+    // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    // 2 - aborted
+    // -1 - invalid input batch
+    // < -1 - error
     LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
             struct llama_batch batch);
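The expanded contract suggests a dispatch like the following sketch; the recovery actions are the header's own recommendations, not package code:

```cpp
#include "llama.h"

// Sketch: handle each documented llama_decode() return value.
int safe_decode(struct llama_context * ctx, struct llama_batch batch) {
    const int32_t ret = llama_decode(ctx, batch);
    if (ret == 0) {
        return 0;   // success
    }
    if (ret == 1) {
        return 1;   // no KV slot: shrink the batch or enlarge the context
    }
    if (ret == 2) {
        return 2;   // aborted via abort_callback; KV cache state was restored
    }
    return ret;     // -1: invalid input batch, < -1: fatal error
}
```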
@@ -1218,6 +1142,7 @@ extern "C" {
             "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@@ -1264,6 +1189,10 @@ extern "C" {
             float tau,
             float eta);
 
+    /// @details Intializes a GBNF grammar, see grammars/README.md for details.
+    /// @param vocab The vocabulary that this grammar will be used with.
+    /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
+    /// @param grammar_root The name of the start symbol for the grammar.
     LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
             const struct llama_vocab * vocab,
             const char * grammar_str,
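A hedged usage sketch combining the two documented samplers; the GBNF string is a toy grammar, and the sampler-chain helpers are upstream llama.cpp API assumed unchanged here:

```cpp
#include "llama.h"

// Sketch: constrain output with a GBNF grammar, then apply top-k.
struct llama_sampler * make_sampler(const struct llama_vocab * vocab) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    // Toy grammar: the model may only answer "yes" or "no".
    llama_sampler_chain_add(chain, llama_sampler_init_grammar(vocab, "root ::= \"yes\" | \"no\"", "root"));
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40)); // k <= 0 would make this a noop
    return chain;
}
```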
@@ -1409,6 +1338,37 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
 
+    //
+    // training
+    //
+
+    // function that returns whether or not a given tensor contains trainable parameters
+    typedef bool (*llama_opt_param_filter)(const struct lm_ggml_tensor * tensor, void * userdata);
+
+    // always returns true
+    LLAMA_API bool llama_opt_param_filter_all(const struct lm_ggml_tensor * tensor, void * userdata);
+
+    struct llama_opt_params {
+        uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+        void * param_filter_ud; // userdata for determining which tensors contain trainable parameters
+
+        lm_ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+    };
+
+    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
+
+    LLAMA_API void llama_opt_epoch(
+            struct llama_context * lctx,
+            lm_ggml_opt_dataset_t dataset,
+            lm_ggml_opt_result_t result_train,
+            lm_ggml_opt_result_t result_eval,
+            int64_t idata_split,
+            lm_ggml_opt_epoch_callback callback_train,
+            lm_ggml_opt_epoch_callback callback_eval);
+
 #ifdef __cplusplus
 }
 #endif
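A minimal training-loop sketch for the new entry points. The dataset and result handles come from the `lm_ggml_opt_*` API, whose constructors are not part of this diff, so their creation is assumed; `lm_ggml_opt_get_default_optimizer_params` is likewise assumed from the bundled ggml-opt.h:

```cpp
#include "llama.h"

// Sketch: wire up the new training entry points. `dataset`, `result_train`
// and `result_eval` are assumed to be built elsewhere with the lm_ggml_opt_* API.
void train_one_epoch(struct llama_context * lctx, struct llama_model * model,
                     lm_ggml_opt_dataset_t dataset,
                     lm_ggml_opt_result_t result_train,
                     lm_ggml_opt_result_t result_eval) {
    struct llama_opt_params oparams = {};
    oparams.n_ctx_train  = 0;                          // 0: use the context size of lctx
    oparams.param_filter = llama_opt_param_filter_all; // train every tensor
    oparams.get_opt_pars = lm_ggml_opt_get_default_optimizer_params; // assumed helper
    llama_opt_init(lctx, model, oparams);

    // idata_split = 0: treat the whole dataset as the training split (assumption).
    llama_opt_epoch(lctx, dataset, result_train, result_eval,
                    /*idata_split=*/0, /*callback_train=*/nullptr, /*callback_eval=*/nullptr);
}
```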
package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/minja/chat-template.hpp
CHANGED
@@ -9,10 +9,21 @@
 #pragma once
 
 #include "minja.hpp"
-
+
+#include <chrono>
+#include <cstddef>
+#include <cstdio>
+#include <ctime>
+#include <exception>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
+#include "../json.hpp"
+
 using json = nlohmann::ordered_json;
 
 namespace minja {
@@ -384,8 +395,8 @@ class chat_template {
 
         for (const auto & message_ : adjusted_messages) {
             auto message = message_;
-            if (!message.contains("role") || !message.contains("content")) {
-                throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
+            if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
+                throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
             }
             std::string role = message.at("role");
 
@@ -406,7 +417,6 @@ class chat_template {
                 }
             }
             if (polyfill_tool_calls) {
-                auto content = message.at("content");
                 auto tool_calls = json::array();
                 for (const auto & tool_call : message.at("tool_calls")) {
                     if (tool_call.at("type") != "function") {
@@ -425,8 +435,11 @@ class chat_template {
                 auto obj = json {
                     {"tool_calls", tool_calls},
                 };
-                if (!content.is_null() && !content.empty()) {
-                    obj["content"] = content;
+                if (message.contains("content")) {
+                    auto content = message.at("content");
+                    if (!content.is_null() && !content.empty()) {
+                        obj["content"] = content;
+                    }
                 }
                 message["content"] = obj.dump(2);
                 message.erase("tool_calls");
@@ -435,13 +448,12 @@ class chat_template {
             if (polyfill_tool_responses && role == "tool") {
                 message["role"] = "user";
                 auto obj = json {
-                    {"tool_response", {
-                        {"content", message.at("content")},
-                    }},
+                    {"tool_response", json::object()},
                 };
                 if (message.contains("name")) {
-                    obj["tool_response"]["name"] = message.at("name");
+                    obj["tool_response"]["tool"] = message.at("name");
                 }
+                obj["tool_response"]["content"] = message.at("content");
                 if (message.contains("tool_call_id")) {
                     obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
                 }
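Under the loosened validation above, an assistant message may now carry `tool_calls` without a `content` field. A sketch of such a message using the header's own `nlohmann::ordered_json`; the function name and arguments are made up:

```cpp
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // Hypothetical tool call; before this change, the message below would have
    // been rejected because it lacks a "content" field.
    json tool_call = {
        {"type", "function"},
        {"id", "call_1"},
        {"function", {{"name", "get_weather"}, {"arguments", "{\"city\":\"Paris\"}"}}},
    };
    json message = {
        {"role", "assistant"},
        {"tool_calls", json::array({tool_call})},
    };
    return message.contains("content") ? 1 : 0;
}
```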
@@ -510,7 +522,7 @@ class chat_template {
     static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
         json messages_with_system = messages;
 
-        if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
+        if (!messages_with_system.empty() && messages_with_system[0].at("role") == "system") {
             std::string existing_system = messages_with_system.at(0).at("content");
             messages_with_system[0] = json {
                 {"role", "system"},