cui-llama.rn 1.6.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -7
- package/android/src/main/CMakeLists.txt +16 -11
- package/android/src/main/java/com/rnllama/LlamaContext.java +4 -1
- package/android/src/main/jni.cpp +20 -4
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/cpp/LICENSE +21 -0
- package/cpp/chat.cpp +1 -1
- package/cpp/common.cpp +17 -2
- package/cpp/common.h +7 -3
- package/cpp/ggml-alloc.c +4 -1
- package/cpp/ggml-cpp.h +1 -1
- package/cpp/ggml-cpu/amx/amx.cpp +221 -0
- package/cpp/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
- package/cpp/ggml-cpu/amx/mmq.h +10 -0
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
- package/cpp/ggml-cpu/common.h +72 -0
- package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -101
- package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +109 -42
- package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +3 -0
- package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +246 -160
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
- package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
- package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
- package/cpp/ggml-cpu.h +5 -0
- package/cpp/ggml-impl.h +16 -9
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +492 -47
- package/cpp/ggml.c +134 -244
- package/cpp/ggml.h +61 -94
- package/cpp/json-schema-to-grammar.cpp +3 -0
- package/cpp/llama-arch.cpp +46 -17
- package/cpp/llama-arch.h +9 -0
- package/cpp/llama-batch.cpp +5 -1
- package/cpp/llama-batch.h +2 -1
- package/cpp/llama-chat.cpp +31 -10
- package/cpp/llama-chat.h +3 -2
- package/cpp/llama-context.cpp +104 -489
- package/cpp/llama-context.h +14 -30
- package/cpp/llama-graph.cpp +69 -62
- package/cpp/llama-graph.h +21 -18
- package/cpp/llama-hparams.h +5 -0
- package/cpp/llama-kv-cache.cpp +1497 -391
- package/cpp/llama-kv-cache.h +272 -80
- package/cpp/llama-memory.h +11 -1
- package/cpp/llama-model.cpp +502 -176
- package/cpp/llama-model.h +13 -3
- package/cpp/llama-sampling.cpp +2 -1
- package/cpp/llama-vocab.cpp +8 -1
- package/cpp/llama.h +14 -11
- package/cpp/rn-llama.cpp +20 -172
- package/cpp/rn-llama.h +1 -5
- package/ios/CMakeLists.txt +13 -10
- package/ios/RNLlama.h +6 -0
- package/ios/RNLlama.mm +5 -0
- package/ios/RNLlamaContext.mm +26 -28
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +7 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +61 -94
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +14 -30
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +13 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +14 -11
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +7 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +61 -94
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +14 -30
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +13 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +14 -11
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +4 -0
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +5 -0
- package/cpp/binary-ops.h +0 -16
- package/cpp/ops.h +0 -128
- package/cpp/simd-mappings.h +0 -888
- package/cpp/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
- /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
- /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
- /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
- /package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +0 -0
- /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
- /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
- /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
- /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
- /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
- /package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -0
- /package/cpp/{vec.h → ggml-cpu/vec.h} +0 -0
package/cpp/llama-model.h
CHANGED
@@ -36,14 +36,17 @@ enum llm_type {
     LLM_TYPE_335M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
+    LLM_TYPE_475M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
@@ -62,6 +65,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -70,6 +74,7 @@ enum llm_type {
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
@@ -84,10 +89,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
-    LLM_TYPE_290B,
     LLM_TYPE_17B_16E, // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };

 struct llama_layer_posnet {
@@ -171,6 +176,8 @@ struct llama_layer {
     struct lm_ggml_tensor * wq_b = nullptr;
     struct lm_ggml_tensor * wkv_a_mqa = nullptr;
     struct lm_ggml_tensor * wkv_b = nullptr;
+    struct lm_ggml_tensor * wk_b = nullptr;
+    struct lm_ggml_tensor * wv_b = nullptr;
     struct lm_ggml_tensor * wq_cross = nullptr;
     struct lm_ggml_tensor * wk_cross = nullptr;
     struct lm_ggml_tensor * wv_cross = nullptr;
@@ -388,8 +395,11 @@ struct llama_model {

     const struct lm_ggml_tensor * get_tensor(const char * name) const;

+    lm_ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+
+    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(
+    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;

     // TODO: move this to new llm_arch_model_i interface
     llm_graph_result_ptr build_graph(
package/cpp/llama-sampling.cpp
CHANGED
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }

     if (k <= 0) {
-
+        return;
     }

     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }

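For orientation, a minimal sketch (not part of this diff) of how the patched top-k path is normally reached through llama.cpp's public sampler-chain API; the chain boilerplate is assumed, and only the k <= 0 early return comes from the change above.

#include "llama.h"

// Build a sampler chain in which top-k can effectively be disabled by passing
// k <= 0: with the change above the sampler stays in the chain but leaves the
// candidate list untouched instead of clamping it.
static struct llama_sampler * make_sampler_chain(int32_t top_k, uint32_t seed) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(top_k)); // noop when top_k <= 0
    llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));   // final token selection
    return chain; // release with llama_sampler_free(chain)
}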
package/cpp/llama-vocab.cpp
CHANGED
@@ -1506,7 +1506,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama3"   ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
-                tokenizer_pre == "falcon3"
+                tokenizer_pre == "falcon3"  ||
+                tokenizer_pre == "pixtral") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             ignore_merges = true;
             add_bos = true;
@@ -1572,6 +1573,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
             clean_spaces = false;
         } else if (
+                tokenizer_pre == "glm4" ||
                 tokenizer_pre == "chatglm-bpe") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
             special_bos_id = LLAMA_TOKEN_NULL;
@@ -1840,6 +1842,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_prefix|>" // Qwen
                 || t.first == "<fim-prefix>"
+                || t.first == "<fim_prefix>" // Granite
                 || t.first == "<|fim▁begin|>" // DeepSeek
                 || t.first == "<PRE>"
                 || t.first == "▁<PRE>" // CodeLlama
@@ -1858,6 +1861,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_suffix|>" // Qwen
                 || t.first == "<fim-suffix>"
+                || t.first == "<fim_suffix>" // Granite
                 || t.first == "<|fim▁hole|>" // DeepSeek
                 || t.first == "<SUF>"
                 || t.first == "▁<SUF>" // CodeLlama
@@ -1876,6 +1880,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_middle|>" // Qwen
                 || t.first == "<fim-middle>"
+                || t.first == "<fim_middle>" // Granite
                 || t.first == "<|fim▁end|>" // DeepSeek
                 || t.first == "<MID>"
                 || t.first == "▁<MID>" // CodeLlama
@@ -1894,6 +1899,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_pad|>" // Qwen
                 || t.first == "<fim-pad>"
+                || t.first == "<fim_pad>" // Granite
                 || t.first == "<PAD>"
                 ) {
                 special_fim_pad_id = t.second;
@@ -1912,6 +1918,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 || t.first == "<|repo_name|>"
                 || t.first == "<fim-repo>"
                 || t.first == "<REPO>"
+                || t.first == "<reponame>" // Granite
                 ) {
                 special_fim_rep_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
package/cpp/llama.h
CHANGED
@@ -112,6 +112,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_TRILLION   = 31,
         LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
         LLAMA_VOCAB_PRE_TYPE_LLAMA4     = 33,
+        LLAMA_VOCAB_PRE_TYPE_PIXTRAL    = 34,
     };

     enum llama_rope_type {
@@ -368,17 +369,18 @@ extern "C" {

     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;
-        enum llama_ftype ftype;
-        enum lm_ggml_type output_tensor_type;
-        enum lm_ggml_type token_embedding_type;
-        bool allow_requantize;
-        bool quantize_output_tensor;
-        bool only_copy;
-        bool pure;
-        bool keep_split;
-        void * imatrix;
-        void * kv_overrides;
+        int32_t nthread;                        // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;                 // quantize to this llama_ftype
+        enum lm_ggml_type output_tensor_type;   // output tensor type
+        enum lm_ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;                  // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;            // quantize output.weight
+        bool only_copy;                         // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                              // quantize all tensors to the default type
+        bool keep_split;                        // quantize to the same number of shards
+        void * imatrix;                         // pointer to importance matrix data
+        void * kv_overrides;                    // pointer to vector containing overrides
+        void * tensor_types;                    // pointer to vector containing tensor types
     } llama_model_quantize_params;

     typedef struct llama_logit_bias {
@@ -1231,6 +1233,7 @@ extern "C" {
             "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");

     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);

     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
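As a hedged illustration of the newly documented quantization fields (usage assumed, file names are placeholders; only the struct layout and comments come from the diff above):

#include "llama.h"

// Fill llama_model_quantize_params starting from the library defaults.
static uint32_t quantize_sketch(void) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.nthread = 0;                         // <= 0: use std::thread::hardware_concurrency()
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // quantize to this llama_ftype
    qparams.quantize_output_tensor = true;       // also quantize output.weight
    // tensor_types (the new field) is left NULL: no per-tensor type overrides.
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
}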
package/cpp/rn-llama.cpp
CHANGED
@@ -165,6 +165,7 @@ void llama_rn_context::rewind() {
    generated_text.reserve(params.n_ctx);
    generated_token_probs.clear();
    truncated = false;
+   context_full = false;
    stopped_eos = false;
    stopped_word = false;
    stopped_limit = false;
@@ -197,6 +198,9 @@ bool llama_rn_context::loadModel(common_params &params_)
    templates = common_chat_templates_init(model, params.chat_template);
    n_ctx = llama_n_ctx(ctx);

+   // Initialize context shift flag
+   LOG_INFO("ctx_shift: %s", params.ctx_shift ? "enabled" : "disabled");
+
    // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
    // LOG_INFO("%s\n", common_params_get_system_info(params).c_str());

@@ -271,11 +275,11 @@ void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {

    new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());

-
+   LOG_INFO("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, old_size: %d, new_size: %d",
        n_ctx,
        params.n_keep,
        n_left,
-
+       prompt_tokens.size(),
        new_tokens.size()
    );

@@ -304,18 +308,14 @@ void llama_rn_context::loadPrompt() {
    // if input prompt is too big, truncate like normal
    if (num_prompt_tokens >= (size_t) n_ctx)
    {
+       if (!params.ctx_shift) {
+           context_full = true;
+           return;
+       }
        truncatePrompt(prompt_tokens);
        num_prompt_tokens = prompt_tokens.size();
-
        LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx);
    }
-
-   // do context shifitng
-   if(!params.embedding){
-       purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
-   }
-
-
    // push the prompt into the sampling context (do not apply grammar)
    for (auto & token : prompt_tokens)
    {
@@ -358,6 +358,14 @@ completion_token_output llama_rn_context::nextToken()

    if (embd.size() >= (size_t)params.n_ctx)
    {
+       if (!params.ctx_shift) {
+           // If context shifting is disabled, stop generation
+           LOG_WARNING("context full, n_ctx: %d, tokens: %d", params.n_ctx, embd.size());
+           has_next_token = false;
+           context_full = true;
+           return result;
+       }
+
        // Shift context

        const int n_left    = n_past - params.n_keep - 1;
@@ -373,12 +381,9 @@ completion_token_output llama_rn_context::nextToken()
        embd.resize(embd.size() - n_discard);

        n_past -= n_discard;
+       truncated = true;

-       LOG_VERBOSE("
-           params.n_ctx,
-           params.n_keep,
-           n_left
-       );
+       LOG_VERBOSE("context shifted, new n_past: %d, new size: %d", n_past, embd.size());
    }

    bool tg = true;
@@ -712,162 +717,5 @@ void llama_rn_context::removeLoraAdapters() {
 std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
    return this->lora;
 }
-std::vector<int> llama_rn_context::longest_common_subseq(const std::vector<int> x, const std::vector<int> y){
-    int m = x.size(), n = y.size();
-
-    //int LCSuff[m+1][n+1];
-    std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));
-
-    for (int j = 0; j <= n; j++)
-        LCSuff[0][j] = 0;
-    for (int i = 0; i <= m; i++)
-        LCSuff[i][0] = 0;
-
-    for (int i = 1; i <= m; i++)
-    {
-        for (int j = 1; j <= n; j++)
-        {
-            if (x[i - 1] == y[j - 1])
-                LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
-            else
-                LCSuff[i][j] = 0;
-        }
-    }
-
-    std::vector<int> longest;
-    for (int i = 1; i <= m; i++)
-    {
-        for (int j = 1; j <= n; j++)
-        {
-            if (LCSuff[i][j] > longest.size())
-            {
-                auto off1 = ((i - LCSuff[i][j] + 1) - 1);
-                auto off2 = off1 + LCSuff[i][j];
-                longest.clear();
-                // std::vector<int>().swap(longest);
-                longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
-                // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
-            }
-        }
-    }
-    return longest;
-}
-
-bool llama_rn_context::arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq)
-{
-    int ss = searchSeq.size();
-    if(targetArray.size()<ss)
-    {
-        return false;
-    }
-    for(int i=0;i<ss;++i)
-    {
-        if(targetArray[i]!=searchSeq[i])
-        {
-            return false;
-        }
-    }
-    return true;
-}
-
-int llama_rn_context::arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq)
-{
-    int ss = searchSeq.size();
-    int tas = targetArray.size();
-    if(tas<ss)
-    {
-        return -1;
-    }
-    for(int i=0;i<tas;++i)
-    {
-        int srch = 0;
-        bool fail = false;
-        for(int srch=0;srch<ss;++srch)
-        {
-            if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
-            {
-                fail = true;
-                break;
-            }
-        }
-        if(!fail)
-        {
-            return i;
-        }
-    }
-    return -1;
-}
-
-void llama_rn_context::purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
-{
-    //scan from start old and new ctx, until first mismatch found, save as p0
-    //check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
-    //test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
-    //if passed, save beginning of LCQ from old ctx as p1
-    //remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal
-
-    const int short_fall_threshold = 200 + (nctx/30); //dont trigger shifting if the distance between trimstart and currhead < this
-    const int stack_allowance = 60 + (nctx/50); //in case the end text is slightly modified, be forgiving
-
-    int trimstart = 0;
-    int new_tokens_len = new_context_tokens.size();
-    bool purge_needed = true;
-
-    for (int i = 0; i < current_context_tokens.size(); ++i)
-    {
-        if (current_context_tokens[i] == new_context_tokens[i])
-        {
-            trimstart += 1;
-        }
-        else
-        {
-            break;
-        }
-        if ((i + 2) >= new_tokens_len)
-        {
-            purge_needed = false;
-            break; //no surgery required
-        }
-    }
-
-
-
-    if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
-    {
-        LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
-        return; //no purge is needed
-    }
-
-    //at least this many tokens need to match, otherwise don't bother trimming
-    const int lc_tok_threshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+stack_allowance), (int)(nctx*0.45)), short_fall_threshold - stack_allowance);
-
-    auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
-    auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());
-
-    auto shared = longest_common_subseq(curr_ctx_without_memory, new_ctx_without_memory);
-
-    if (shared.size() > lc_tok_threshold && arr_start_with(new_ctx_without_memory, shared)) // enough tokens in common
-    {
-        int found = arr_find_index_of(current_context_tokens,shared);
-        if(found>=0 && found > trimstart)
-        {
-
-            //extract the unwanted tokens out from context and KV
-            int diff = found - trimstart;
-            llama_kv_self_seq_rm(ctx, 0, trimstart, trimstart + diff);
-            llama_kv_self_seq_add(ctx, 0, trimstart + diff, -1, -diff);
-
-            for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
-            {
-                current_context_tokens[i - diff] = current_context_tokens[i];
-            }
-
-            LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
-
-            current_context_tokens.resize(current_context_tokens.size() - diff);
-        }
-    }
-
-}

 }
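An illustrative sketch (not shipped in the package) of how a caller is expected to react to the new context_full flag when ctx_shift is disabled; llama_rn_context, params, beginCompletion(), loadPrompt() and context_full come from rn-llama.h/rn-llama.cpp, while the surrounding setup is assumed.

#include "rn-llama.h"
#include <stdexcept>

void run_completion(rnllama::llama_rn_context & llama) {
    llama.params.ctx_shift = false;  // opt out of automatic context shifting
    llama.beginCompletion();
    llama.loadPrompt();
    if (llama.context_full) {
        // The prompt no longer fits in n_ctx and shifting is off:
        // surface an error instead of silently truncating.
        throw std::runtime_error("Context is full");
    }
    // ... call nextToken() in a loop until no further tokens are produced ...
}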
package/cpp/rn-llama.h
CHANGED
@@ -16,7 +16,6 @@

 namespace rnllama {

-
 std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);

 std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end);
@@ -69,6 +68,7 @@ struct llama_rn_context {

    int n_ctx;

+   bool context_full = false;
    bool truncated = false;
    bool stopped_eos = false;
    bool stopped_word = false;
@@ -107,10 +107,6 @@ struct llama_rn_context {
    int applyLoraAdapters(std::vector<common_adapter_lora_info> lora);
    void removeLoraAdapters();
    std::vector<common_adapter_lora_info> getLoadedLoraAdapters();
-   std::vector<int> longest_common_subseq(const std::vector<int> x, const std::vector<int> y);
-   bool arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq);
-   int arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq);
-   void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx);
 };\

 // Logging macros
package/ios/CMakeLists.txt
CHANGED
@@ -40,15 +40,18 @@ add_library(rnllama SHARED
     ${SOURCE_DIR}/ggml-alloc.c
     ${SOURCE_DIR}/ggml-backend.cpp
     ${SOURCE_DIR}/ggml-backend-reg.cpp
-    ${SOURCE_DIR}/ggml-cpu.
-    ${SOURCE_DIR}/ggml-cpu.cpp
-    ${SOURCE_DIR}/
-    ${SOURCE_DIR}/
-    ${SOURCE_DIR}/
-    ${SOURCE_DIR}/
-    ${SOURCE_DIR}/ggml-cpu-
-    ${SOURCE_DIR}/ggml-cpu-
-    ${SOURCE_DIR}/ggml-cpu-
+    ${SOURCE_DIR}/ggml-cpu/amx/amx.cpp
+    ${SOURCE_DIR}/ggml-cpu/amx/mmq.cpp
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu.c
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu.cpp
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu-aarch64.cpp
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu-quants.c
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu-traits.cpp
+    ${SOURCE_DIR}/ggml-cpu/unary-ops.cpp
+    ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
+    ${SOURCE_DIR}/ggml-cpu/sgemm.cpp
+    ${SOURCE_DIR}/ggml-cpu/vec.cpp
+    ${SOURCE_DIR}/ggml-cpu/ops.cpp
     ${SOURCE_DIR}/ggml-metal.m
     ${SOURCE_DIR}/ggml-opt.cpp
     ${SOURCE_DIR}/ggml-threading.cpp
@@ -78,7 +81,6 @@ add_library(rnllama SHARED
     ${SOURCE_DIR}/sampling.cpp
     ${SOURCE_DIR}/unicode-data.cpp
     ${SOURCE_DIR}/unicode.cpp
-    ${SOURCE_DIR}/sgemm.cpp
     ${SOURCE_DIR}/common.cpp
     ${SOURCE_DIR}/chat.cpp
     ${SOURCE_DIR}/json-schema-to-grammar.cpp
@@ -92,6 +94,7 @@ add_library(rnllama SHARED
 target_include_directories(rnllama
     PUBLIC
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/ggml-cpu>
     $<INSTALL_INTERFACE:include>
 )

package/ios/RNLlama.h
CHANGED
@@ -1,6 +1,12 @@
 #import <React/RCTEventEmitter.h>
 #import <React/RCTBridgeModule.h>

+#if RNLLAMA_BUILD_FROM_SOURCE
+#import "json.hpp"
+#else
+#import <rnllama/json.hpp>
+#endif
+
 // TODO: Use RNLlamaSpec (Need to refactor NSDictionary usage)
 @interface RNLlama : RCTEventEmitter <RCTBridgeModule>

package/ios/RNLlama.mm
CHANGED
@@ -108,8 +108,13 @@ RCT_EXPORT_METHOD(getFormattedChat:(double)contextId
    } else {
      resolve([context getFormattedChat:messages withChatTemplate:chatTemplate]);
    }
+  } catch (const nlohmann::json_abi_v3_11_3::detail::parse_error& e) {
+    NSString *errorMessage = [NSString stringWithUTF8String:e.what()];
+    reject(@"llama_error", [NSString stringWithFormat:@"JSON parse error in getFormattedChat: %@", errorMessage], nil);
  } catch (const std::exception& e) { // catch cpp exceptions
    reject(@"llama_error", [NSString stringWithUTF8String:e.what()], nil);
+  } catch (...) {
+    reject(@"llama_error", @"Unknown error in getFormattedChat", nil);
  }
 }

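The same catch ordering can be written in plain C++; a small hedged sketch (header name follows the bundled json.hpp, everything else is illustrative) showing why the specific parse_error handler has to precede the generic std::exception one, as in the getFormattedChat change above.

#include "json.hpp"   // bundled nlohmann/json header (include path assumed)
#include <exception>
#include <string>

// Most specific handler first, then std::exception, then a catch-all.
std::string describe_parse(const std::string & raw) {
    try {
        return nlohmann::json::parse(raw).dump();
    } catch (const nlohmann::json::parse_error & e) {
        return std::string("JSON parse error: ") + e.what();
    } catch (const std::exception & e) {
        return std::string("error: ") + e.what();
    } catch (...) {
        return "unknown error";
    }
}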
package/ios/RNLlamaContext.mm
CHANGED
@@ -82,7 +82,7 @@
   BOOL isAsset = [params[@"is_model_asset"] boolValue];
   NSString *path = modelPath;
   if (isAsset) path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
-  defaultParams.model =
+  defaultParams.model.path = [path UTF8String];

   NSString *chatTemplate = params[@"chat_template"];
   if (chatTemplate) {
@@ -106,37 +106,27 @@
   NSString *reasonNoMetal = @"";
   defaultParams.n_gpu_layers = 0;
 #ifdef LM_GGML_USE_METAL
-  // Check ggml-metal availability
-  NSError * error = nil;
   id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-
-
-
-
-
-
-
-
-  if (error) {
-    reasonNoMetal = [error localizedDescription];
+
+  // Check ggml-metal availability
+  BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+  if (@available(iOS 16.0, tvOS 16.0, *)) {
+    supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
+  }
+  if (!supportsGgmlMetal) {
+    reasonNoMetal = @"Metal is not supported in this device";
     skipGpuDevices = true;
-  }
-
-  id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
-  if (pipeline == nil) {
-    reasonNoMetal = [error localizedDescription];
-    skipGpuDevices = true;
-  } else {
+  }
+
 #if TARGET_OS_SIMULATOR
-
-
-
+  // Use the backend, but no layers because not supported fully on simulator
+  defaultParams.n_gpu_layers = 0;
+  isMetalEnabled = true;
 #else
-
-
+  defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
+  isMetalEnabled = true;
 #endif
-
-  }
+
   device = nil;
 #else
   reasonNoMetal = @"Metal is not enabled in this build";
@@ -158,6 +148,8 @@
     }
     if (cpu_devs.size() > 0) {
       defaultParams.devices = cpu_devs;
+      defaultParams.n_gpu_layers = 0;
+      isMetalEnabled = false;
     }
   }

@@ -184,6 +176,8 @@

   if (params[@"flash_attn"] && [params[@"flash_attn"] boolValue]) defaultParams.flash_attn = true;

+  if (params[@"ctx_shift"]) defaultParams.ctx_shift = [params[@"ctx_shift"] boolValue];
+
   if (params[@"cache_type_k"]) defaultParams.cache_type_k = rnllama::kv_cache_type_from_str([params[@"cache_type_k"] UTF8String]);
   if (params[@"cache_type_v"]) defaultParams.cache_type_v = rnllama::kv_cache_type_from_str([params[@"cache_type_v"] UTF8String]);

@@ -568,6 +562,9 @@
     }
     llama->beginCompletion();
     llama->loadPrompt();
+    if (llama->context_full) {
+      @throw [NSException exceptionWithName:@"LlamaException" reason:@"Context is full" userInfo:nil];
+    }

     size_t sent_count = 0;
     size_t sent_token_probs_index = 0;
@@ -655,7 +652,7 @@
       }];
     }
   } catch (const std::exception &e) {
-
+  } catch (...) {
   }
 }

@@ -668,6 +665,7 @@
   result[@"tokens_predicted"] = @(llama->num_tokens_predicted);
   result[@"tokens_evaluated"] = @(llama->num_prompt_tokens);
   result[@"truncated"] = @(llama->truncated);
+  result[@"context_full"] = @(llama->context_full);
   result[@"stopped_eos"] = @(llama->stopped_eos);
   result[@"stopped_word"] = @(llama->stopped_word);
   result[@"stopped_limit"] = @(llama->stopped_limit);
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h
CHANGED
@@ -355,8 +355,10 @@ struct common_params {

     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

-    // multimodal models (see
+    // multimodal models (see tools/llava)
     struct common_params_model mmproj;
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)

     // embedding
@@ -427,8 +429,8 @@ struct common_params {
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "
-    std::string cvector_negative_file = "
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

     bool spm_infill = false; // suffix/prefix/middle pattern for infill

@@ -558,6 +560,8 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

+std::string get_model_endpoint();
+
 //
 // Batch utils
 //