cui-llama.rn 1.6.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +16 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -1
  4. package/android/src/main/jni.cpp +20 -4
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  13. package/cpp/LICENSE +21 -0
  14. package/cpp/chat.cpp +1 -1
  15. package/cpp/common.cpp +17 -2
  16. package/cpp/common.h +7 -3
  17. package/cpp/ggml-alloc.c +4 -1
  18. package/cpp/ggml-cpp.h +1 -1
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  25. package/cpp/ggml-cpu/common.h +72 -0
  26. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -101
  27. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +109 -42
  28. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +3 -0
  29. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +246 -160
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  31. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  32. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  33. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  34. package/cpp/ggml-cpu.h +5 -0
  35. package/cpp/ggml-impl.h +16 -9
  36. package/cpp/ggml-llama-sim.metallib +0 -0
  37. package/cpp/ggml-llama.metallib +0 -0
  38. package/cpp/ggml-metal.m +492 -47
  39. package/cpp/ggml.c +134 -244
  40. package/cpp/ggml.h +61 -94
  41. package/cpp/json-schema-to-grammar.cpp +3 -0
  42. package/cpp/llama-arch.cpp +46 -17
  43. package/cpp/llama-arch.h +9 -0
  44. package/cpp/llama-batch.cpp +5 -1
  45. package/cpp/llama-batch.h +2 -1
  46. package/cpp/llama-chat.cpp +31 -10
  47. package/cpp/llama-chat.h +3 -2
  48. package/cpp/llama-context.cpp +104 -489
  49. package/cpp/llama-context.h +14 -30
  50. package/cpp/llama-graph.cpp +69 -62
  51. package/cpp/llama-graph.h +21 -18
  52. package/cpp/llama-hparams.h +5 -0
  53. package/cpp/llama-kv-cache.cpp +1497 -391
  54. package/cpp/llama-kv-cache.h +272 -80
  55. package/cpp/llama-memory.h +11 -1
  56. package/cpp/llama-model.cpp +502 -176
  57. package/cpp/llama-model.h +13 -3
  58. package/cpp/llama-sampling.cpp +2 -1
  59. package/cpp/llama-vocab.cpp +8 -1
  60. package/cpp/llama.h +14 -11
  61. package/cpp/rn-llama.cpp +20 -172
  62. package/cpp/rn-llama.h +1 -5
  63. package/ios/CMakeLists.txt +13 -10
  64. package/ios/RNLlama.h +6 -0
  65. package/ios/RNLlama.mm +5 -0
  66. package/ios/RNLlamaContext.mm +26 -28
  67. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +7 -3
  68. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  69. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  70. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  71. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +61 -94
  72. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  73. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  74. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  75. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  76. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  77. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  78. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  79. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  80. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  81. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +14 -11
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  85. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  86. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  87. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  88. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  89. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  90. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  91. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  92. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  93. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  94. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  95. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  96. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  97. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  98. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  99. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  100. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  101. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  102. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  103. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +7 -3
  104. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  105. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  106. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  107. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +61 -94
  108. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  109. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  110. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  111. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  112. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  113. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  114. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  115. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  116. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  117. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +14 -11
  118. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  119. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  120. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  121. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  122. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  123. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  124. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  125. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  126. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  127. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  128. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  129. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  130. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  131. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  132. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  133. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  134. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  135. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  136. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  137. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  138. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  139. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  140. package/lib/module/NativeRNLlama.js.map +1 -1
  141. package/lib/typescript/NativeRNLlama.d.ts +4 -0
  142. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  143. package/package.json +1 -1
  144. package/src/NativeRNLlama.ts +5 -0
  145. package/cpp/binary-ops.h +0 -16
  146. package/cpp/ops.h +0 -128
  147. package/cpp/simd-mappings.h +0 -888
  148. package/cpp/unary-ops.h +0 -28
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  157. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +0 -802
  175. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  176. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  177. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  178. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  179. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  180. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  181. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  182. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  183. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  184. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  185. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  186. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  187. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  188. /package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +0 -0
  189. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  190. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  191. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  192. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  193. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
  194. /package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -0
  195. /package/cpp/{vec.h → ggml-cpu/vec.h} +0 -0
package/cpp/llama-model.h CHANGED
@@ -36,14 +36,17 @@ enum llm_type {
36
36
  LLM_TYPE_335M,
37
37
  LLM_TYPE_410M,
38
38
  LLM_TYPE_450M,
39
+ LLM_TYPE_475M,
39
40
  LLM_TYPE_770M,
40
41
  LLM_TYPE_780M,
41
42
  LLM_TYPE_0_5B,
43
+ LLM_TYPE_0_6B,
42
44
  LLM_TYPE_1B,
43
45
  LLM_TYPE_1_3B,
44
46
  LLM_TYPE_1_4B,
45
47
  LLM_TYPE_1_5B,
46
48
  LLM_TYPE_1_6B,
49
+ LLM_TYPE_1_7B,
47
50
  LLM_TYPE_1_8B,
48
51
  LLM_TYPE_2B,
49
52
  LLM_TYPE_2_8B,
@@ -62,6 +65,7 @@ enum llm_type {
62
65
  LLM_TYPE_15B,
63
66
  LLM_TYPE_16B,
64
67
  LLM_TYPE_20B,
68
+ LLM_TYPE_27B,
65
69
  LLM_TYPE_30B,
66
70
  LLM_TYPE_32B,
67
71
  LLM_TYPE_34B,
@@ -70,6 +74,7 @@ enum llm_type {
70
74
  LLM_TYPE_65B,
71
75
  LLM_TYPE_70B,
72
76
  LLM_TYPE_236B,
77
+ LLM_TYPE_290B,
73
78
  LLM_TYPE_314B,
74
79
  LLM_TYPE_671B,
75
80
  LLM_TYPE_SMALL,
@@ -84,10 +89,10 @@ enum llm_type {
84
89
  LLM_TYPE_16x3_8B,
85
90
  LLM_TYPE_10B_128x3_66B,
86
91
  LLM_TYPE_57B_A14B,
87
- LLM_TYPE_27B,
88
- LLM_TYPE_290B,
89
92
  LLM_TYPE_17B_16E, // llama4 Scout
90
93
  LLM_TYPE_17B_128E, // llama4 Maverick
94
+ LLM_TYPE_30B_A3B,
95
+ LLM_TYPE_235B_A22B,
91
96
  };
92
97
 
93
98
  struct llama_layer_posnet {
@@ -171,6 +176,8 @@ struct llama_layer {
171
176
  struct lm_ggml_tensor * wq_b = nullptr;
172
177
  struct lm_ggml_tensor * wkv_a_mqa = nullptr;
173
178
  struct lm_ggml_tensor * wkv_b = nullptr;
179
+ struct lm_ggml_tensor * wk_b = nullptr;
180
+ struct lm_ggml_tensor * wv_b = nullptr;
174
181
  struct lm_ggml_tensor * wq_cross = nullptr;
175
182
  struct lm_ggml_tensor * wk_cross = nullptr;
176
183
  struct lm_ggml_tensor * wv_cross = nullptr;
@@ -388,8 +395,11 @@ struct llama_model {
388
395
 
389
396
  const struct lm_ggml_tensor * get_tensor(const char * name) const;
390
397
 
398
+ lm_ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
399
+
400
+ // note: can mutate `cparams`
391
401
  // TODO: move this to new llm_arch_model_i interface
392
- llama_memory_i * create_memory() const; // TODO: params
402
+ llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
393
403
 
394
404
  // TODO: move this to new llm_arch_model_i interface
395
405
  llm_graph_result_ptr build_graph(
package/cpp/llama-sampling.cpp CHANGED
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
232
232
  // }
233
233
 
234
234
  if (k <= 0) {
235
- k = cur_p->size;
235
+ return;
236
236
  }
237
237
 
238
238
  k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
298
298
  }
299
299
  cur_p->sorted = true;
300
300
  }
301
+
301
302
  cur_p->size = k;
302
303
  }
303
304
 
package/cpp/llama-vocab.cpp CHANGED
@@ -1506,7 +1506,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1506
1506
  tokenizer_pre == "llama3" ||
1507
1507
  tokenizer_pre == "llama-v3" ||
1508
1508
  tokenizer_pre == "llama-bpe"||
1509
- tokenizer_pre == "falcon3") {
1509
+ tokenizer_pre == "falcon3" ||
1510
+ tokenizer_pre == "pixtral") {
1510
1511
  pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
1511
1512
  ignore_merges = true;
1512
1513
  add_bos = true;
@@ -1572,6 +1573,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1572
1573
  pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
1573
1574
  clean_spaces = false;
1574
1575
  } else if (
1576
+ tokenizer_pre == "glm4" ||
1575
1577
  tokenizer_pre == "chatglm-bpe") {
1576
1578
  pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
1577
1579
  special_bos_id = LLAMA_TOKEN_NULL;
@@ -1840,6 +1842,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1840
1842
  if (false
1841
1843
  || t.first == "<|fim_prefix|>" // Qwen
1842
1844
  || t.first == "<fim-prefix>"
1845
+ || t.first == "<fim_prefix>" // Granite
1843
1846
  || t.first == "<|fim▁begin|>" // DeepSeek
1844
1847
  || t.first == "<PRE>"
1845
1848
  || t.first == "▁<PRE>" // CodeLlama
@@ -1858,6 +1861,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1858
1861
  if (false
1859
1862
  || t.first == "<|fim_suffix|>" // Qwen
1860
1863
  || t.first == "<fim-suffix>"
1864
+ || t.first == "<fim_suffix>" // Granite
1861
1865
  || t.first == "<|fim▁hole|>" // DeepSeek
1862
1866
  || t.first == "<SUF>"
1863
1867
  || t.first == "▁<SUF>" // CodeLlama
@@ -1876,6 +1880,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1876
1880
  if (false
1877
1881
  || t.first == "<|fim_middle|>" // Qwen
1878
1882
  || t.first == "<fim-middle>"
1883
+ || t.first == "<fim_middle>" // Granite
1879
1884
  || t.first == "<|fim▁end|>" // DeepSeek
1880
1885
  || t.first == "<MID>"
1881
1886
  || t.first == "▁<MID>" // CodeLlama
@@ -1894,6 +1899,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1894
1899
  if (false
1895
1900
  || t.first == "<|fim_pad|>" // Qwen
1896
1901
  || t.first == "<fim-pad>"
1902
+ || t.first == "<fim_pad>" // Granite
1897
1903
  || t.first == "<PAD>"
1898
1904
  ) {
1899
1905
  special_fim_pad_id = t.second;
@@ -1912,6 +1918,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1912
1918
  || t.first == "<|repo_name|>"
1913
1919
  || t.first == "<fim-repo>"
1914
1920
  || t.first == "<REPO>"
1921
+ || t.first == "<reponame>" // Granite
1915
1922
  ) {
1916
1923
  special_fim_rep_id = t.second;
1917
1924
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
package/cpp/llama.h CHANGED
@@ -112,6 +112,7 @@ extern "C" {
112
112
  LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
113
113
  LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
114
114
  LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
115
+ LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
115
116
  };
116
117
 
117
118
  enum llama_rope_type {
@@ -368,17 +369,18 @@ extern "C" {
368
369
 
369
370
  // model quantization parameters
370
371
  typedef struct llama_model_quantize_params {
371
- int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
372
- enum llama_ftype ftype; // quantize to this llama_ftype
373
- enum lm_ggml_type output_tensor_type; // output tensor type
374
- enum lm_ggml_type token_embedding_type; // token embeddings tensor type
375
- bool allow_requantize; // allow quantizing non-f32/f16 tensors
376
- bool quantize_output_tensor; // quantize output.weight
377
- bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
378
- bool pure; // quantize all tensors to the default type
379
- bool keep_split; // quantize to the same number of shards
380
- void * imatrix; // pointer to importance matrix data
381
- void * kv_overrides; // pointer to vector containing overrides
372
+ int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
373
+ enum llama_ftype ftype; // quantize to this llama_ftype
374
+ enum lm_ggml_type output_tensor_type; // output tensor type
375
+ enum lm_ggml_type token_embedding_type; // token embeddings tensor type
376
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
377
+ bool quantize_output_tensor; // quantize output.weight
378
+ bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
379
+ bool pure; // quantize all tensors to the default type
380
+ bool keep_split; // quantize to the same number of shards
381
+ void * imatrix; // pointer to importance matrix data
382
+ void * kv_overrides; // pointer to vector containing overrides
383
+ void * tensor_types; // pointer to vector containing tensor types
382
384
  } llama_model_quantize_params;
383
385
 
384
386
  typedef struct llama_logit_bias {
@@ -1231,6 +1233,7 @@ extern "C" {
1231
1233
  "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
1232
1234
 
1233
1235
  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
1236
+ /// Setting k <= 0 makes this a noop
1234
1237
  LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
1235
1238
 
1236
1239
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
package/cpp/rn-llama.cpp CHANGED
@@ -165,6 +165,7 @@ void llama_rn_context::rewind() {
165
165
  generated_text.reserve(params.n_ctx);
166
166
  generated_token_probs.clear();
167
167
  truncated = false;
168
+ context_full = false;
168
169
  stopped_eos = false;
169
170
  stopped_word = false;
170
171
  stopped_limit = false;
@@ -197,6 +198,9 @@ bool llama_rn_context::loadModel(common_params &params_)
197
198
  templates = common_chat_templates_init(model, params.chat_template);
198
199
  n_ctx = llama_n_ctx(ctx);
199
200
 
201
+ // Initialize context shift flag
202
+ LOG_INFO("ctx_shift: %s", params.ctx_shift ? "enabled" : "disabled");
203
+
200
204
  // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
201
205
  // LOG_INFO("%s\n", common_params_get_system_info(params).c_str());
202
206
 
@@ -271,11 +275,11 @@ void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
271
275
 
272
276
  new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
273
277
 
274
- LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d",
278
+ LOG_INFO("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, old_size: %d, new_size: %d",
275
279
  n_ctx,
276
280
  params.n_keep,
277
281
  n_left,
278
- tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(),
282
+ prompt_tokens.size(),
279
283
  new_tokens.size()
280
284
  );
281
285
 
@@ -304,18 +308,14 @@ void llama_rn_context::loadPrompt() {
304
308
  // if input prompt is too big, truncate like normal
305
309
  if (num_prompt_tokens >= (size_t) n_ctx)
306
310
  {
311
+ if (!params.ctx_shift) {
312
+ context_full = true;
313
+ return;
314
+ }
307
315
  truncatePrompt(prompt_tokens);
308
316
  num_prompt_tokens = prompt_tokens.size();
309
-
310
317
  LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx);
311
318
  }
312
-
313
- // do context shifitng
314
- if(!params.embedding){
315
- purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
316
- }
317
-
318
-
319
319
  // push the prompt into the sampling context (do not apply grammar)
320
320
  for (auto & token : prompt_tokens)
321
321
  {
@@ -358,6 +358,14 @@ completion_token_output llama_rn_context::nextToken()
358
358
 
359
359
  if (embd.size() >= (size_t)params.n_ctx)
360
360
  {
361
+ if (!params.ctx_shift) {
362
+ // If context shifting is disabled, stop generation
363
+ LOG_WARNING("context full, n_ctx: %d, tokens: %d", params.n_ctx, embd.size());
364
+ has_next_token = false;
365
+ context_full = true;
366
+ return result;
367
+ }
368
+
361
369
  // Shift context
362
370
 
363
371
  const int n_left = n_past - params.n_keep - 1;
@@ -373,12 +381,9 @@ completion_token_output llama_rn_context::nextToken()
373
381
  embd.resize(embd.size() - n_discard);
374
382
 
375
383
  n_past -= n_discard;
384
+ truncated = true;
376
385
 
377
- LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s",
378
- params.n_ctx,
379
- params.n_keep,
380
- n_left
381
- );
386
+ LOG_VERBOSE("context shifted, new n_past: %d, new size: %d", n_past, embd.size());
382
387
  }
383
388
 
384
389
  bool tg = true;
@@ -712,162 +717,5 @@ void llama_rn_context::removeLoraAdapters() {
712
717
  std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
713
718
  return this->lora;
714
719
  }
715
- std::vector<int> llama_rn_context::longest_common_subseq(const std::vector<int> x, const std::vector<int> y){
716
- int m = x.size(), n = y.size();
717
-
718
- //int LCSuff[m+1][n+1];
719
- std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));
720
-
721
- for (int j = 0; j <= n; j++)
722
- LCSuff[0][j] = 0;
723
- for (int i = 0; i <= m; i++)
724
- LCSuff[i][0] = 0;
725
-
726
- for (int i = 1; i <= m; i++)
727
- {
728
- for (int j = 1; j <= n; j++)
729
- {
730
- if (x[i - 1] == y[j - 1])
731
- LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
732
- else
733
- LCSuff[i][j] = 0;
734
- }
735
- }
736
-
737
- std::vector<int> longest;
738
- for (int i = 1; i <= m; i++)
739
- {
740
- for (int j = 1; j <= n; j++)
741
- {
742
- if (LCSuff[i][j] > longest.size())
743
- {
744
- auto off1 = ((i - LCSuff[i][j] + 1) - 1);
745
- auto off2 = off1 + LCSuff[i][j];
746
- longest.clear();
747
- // std::vector<int>().swap(longest);
748
- longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
749
- // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
750
- }
751
- }
752
- }
753
- return longest;
754
- }
755
-
756
- bool llama_rn_context::arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq)
757
- {
758
- int ss = searchSeq.size();
759
- if(targetArray.size()<ss)
760
- {
761
- return false;
762
- }
763
- for(int i=0;i<ss;++i)
764
- {
765
- if(targetArray[i]!=searchSeq[i])
766
- {
767
- return false;
768
- }
769
- }
770
- return true;
771
- }
772
-
773
- int llama_rn_context::arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq)
774
- {
775
- int ss = searchSeq.size();
776
- int tas = targetArray.size();
777
- if(tas<ss)
778
- {
779
- return -1;
780
- }
781
- for(int i=0;i<tas;++i)
782
- {
783
- int srch = 0;
784
- bool fail = false;
785
- for(int srch=0;srch<ss;++srch)
786
- {
787
- if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
788
- {
789
- fail = true;
790
- break;
791
- }
792
- }
793
- if(!fail)
794
- {
795
- return i;
796
- }
797
- }
798
- return -1;
799
- }
800
-
801
- void llama_rn_context::purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
802
- {
803
- //scan from start old and new ctx, until first mismatch found, save as p0
804
- //check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
805
- //test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
806
- //if passed, save beginning of LCQ from old ctx as p1
807
- //remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal
808
-
809
- const int short_fall_threshold = 200 + (nctx/30); //dont trigger shifting if the distance between trimstart and currhead < this
810
- const int stack_allowance = 60 + (nctx/50); //in case the end text is slightly modified, be forgiving
811
-
812
- int trimstart = 0;
813
- int new_tokens_len = new_context_tokens.size();
814
- bool purge_needed = true;
815
-
816
- for (int i = 0; i < current_context_tokens.size(); ++i)
817
- {
818
- if (current_context_tokens[i] == new_context_tokens[i])
819
- {
820
- trimstart += 1;
821
- }
822
- else
823
- {
824
- break;
825
- }
826
- if ((i + 2) >= new_tokens_len)
827
- {
828
- purge_needed = false;
829
- break; //no surgery required
830
- }
831
- }
832
-
833
-
834
-
835
- if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
836
- {
837
- LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
838
- return; //no purge is needed
839
- }
840
-
841
- //at least this many tokens need to match, otherwise don't bother trimming
842
- const int lc_tok_threshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+stack_allowance), (int)(nctx*0.45)), short_fall_threshold - stack_allowance);
843
-
844
- auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
845
- auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());
846
-
847
- auto shared = longest_common_subseq(curr_ctx_without_memory, new_ctx_without_memory);
848
-
849
- if (shared.size() > lc_tok_threshold && arr_start_with(new_ctx_without_memory, shared)) // enough tokens in common
850
- {
851
- int found = arr_find_index_of(current_context_tokens,shared);
852
- if(found>=0 && found > trimstart)
853
- {
854
-
855
- //extract the unwanted tokens out from context and KV
856
- int diff = found - trimstart;
857
- llama_kv_self_seq_rm(ctx, 0, trimstart, trimstart + diff);
858
- llama_kv_self_seq_add(ctx, 0, trimstart + diff, -1, -diff);
859
-
860
- for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
861
- {
862
- current_context_tokens[i - diff] = current_context_tokens[i];
863
- }
864
-
865
- LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
866
-
867
- current_context_tokens.resize(current_context_tokens.size() - diff);
868
- }
869
- }
870
-
871
- }
872
720
 
873
721
  }
package/cpp/rn-llama.h CHANGED
@@ -16,7 +16,6 @@
16
16
 
17
17
  namespace rnllama {
18
18
 
19
-
20
19
  std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);
21
20
 
22
21
  std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end);
@@ -69,6 +68,7 @@ struct llama_rn_context {
69
68
 
70
69
  int n_ctx;
71
70
 
71
+ bool context_full = false;
72
72
  bool truncated = false;
73
73
  bool stopped_eos = false;
74
74
  bool stopped_word = false;
@@ -107,10 +107,6 @@ struct llama_rn_context {
107
107
  int applyLoraAdapters(std::vector<common_adapter_lora_info> lora);
108
108
  void removeLoraAdapters();
109
109
  std::vector<common_adapter_lora_info> getLoadedLoraAdapters();
110
- std::vector<int> longest_common_subseq(const std::vector<int> x, const std::vector<int> y);
111
- bool arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq);
112
- int arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq);
113
- void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx);
114
110
  };\
115
111
 
116
112
  // Logging macros
@@ -40,15 +40,18 @@ add_library(rnllama SHARED
40
40
  ${SOURCE_DIR}/ggml-alloc.c
41
41
  ${SOURCE_DIR}/ggml-backend.cpp
42
42
  ${SOURCE_DIR}/ggml-backend-reg.cpp
43
- ${SOURCE_DIR}/ggml-cpu.c
44
- ${SOURCE_DIR}/ggml-cpu.cpp
45
- ${SOURCE_DIR}/ops.cpp
46
- ${SOURCE_DIR}/unary-ops.cpp
47
- ${SOURCE_DIR}/binary-ops.cpp
48
- ${SOURCE_DIR}/vec.cpp
49
- ${SOURCE_DIR}/ggml-cpu-aarch64.cpp
50
- ${SOURCE_DIR}/ggml-cpu-quants.c
51
- ${SOURCE_DIR}/ggml-cpu-traits.cpp
43
+ ${SOURCE_DIR}/ggml-cpu/amx/amx.cpp
44
+ ${SOURCE_DIR}/ggml-cpu/amx/mmq.cpp
45
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu.c
46
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu.cpp
47
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu-aarch64.cpp
48
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu-quants.c
49
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu-traits.cpp
50
+ ${SOURCE_DIR}/ggml-cpu/unary-ops.cpp
51
+ ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
52
+ ${SOURCE_DIR}/ggml-cpu/sgemm.cpp
53
+ ${SOURCE_DIR}/ggml-cpu/vec.cpp
54
+ ${SOURCE_DIR}/ggml-cpu/ops.cpp
52
55
  ${SOURCE_DIR}/ggml-metal.m
53
56
  ${SOURCE_DIR}/ggml-opt.cpp
54
57
  ${SOURCE_DIR}/ggml-threading.cpp
@@ -78,7 +81,6 @@ add_library(rnllama SHARED
78
81
  ${SOURCE_DIR}/sampling.cpp
79
82
  ${SOURCE_DIR}/unicode-data.cpp
80
83
  ${SOURCE_DIR}/unicode.cpp
81
- ${SOURCE_DIR}/sgemm.cpp
82
84
  ${SOURCE_DIR}/common.cpp
83
85
  ${SOURCE_DIR}/chat.cpp
84
86
  ${SOURCE_DIR}/json-schema-to-grammar.cpp
@@ -92,6 +94,7 @@ add_library(rnllama SHARED
92
94
  target_include_directories(rnllama
93
95
  PUBLIC
94
96
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp>
97
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/ggml-cpu>
95
98
  $<INSTALL_INTERFACE:include>
96
99
  )
97
100
 
package/ios/RNLlama.h CHANGED
@@ -1,6 +1,12 @@
1
1
  #import <React/RCTEventEmitter.h>
2
2
  #import <React/RCTBridgeModule.h>
3
3
 
4
+ #if RNLLAMA_BUILD_FROM_SOURCE
5
+ #import "json.hpp"
6
+ #else
7
+ #import <rnllama/json.hpp>
8
+ #endif
9
+
4
10
  // TODO: Use RNLlamaSpec (Need to refactor NSDictionary usage)
5
11
  @interface RNLlama : RCTEventEmitter <RCTBridgeModule>
6
12
 
package/ios/RNLlama.mm CHANGED
@@ -108,8 +108,13 @@ RCT_EXPORT_METHOD(getFormattedChat:(double)contextId
108
108
  } else {
109
109
  resolve([context getFormattedChat:messages withChatTemplate:chatTemplate]);
110
110
  }
111
+ } catch (const nlohmann::json_abi_v3_11_3::detail::parse_error& e) {
112
+ NSString *errorMessage = [NSString stringWithUTF8String:e.what()];
113
+ reject(@"llama_error", [NSString stringWithFormat:@"JSON parse error in getFormattedChat: %@", errorMessage], nil);
111
114
  } catch (const std::exception& e) { // catch cpp exceptions
112
115
  reject(@"llama_error", [NSString stringWithUTF8String:e.what()], nil);
116
+ } catch (...) {
117
+ reject(@"llama_error", @"Unknown error in getFormattedChat", nil);
113
118
  }
114
119
  }
115
120
 
@@ -82,7 +82,7 @@
82
82
  BOOL isAsset = [params[@"is_model_asset"] boolValue];
83
83
  NSString *path = modelPath;
84
84
  if (isAsset) path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
85
- defaultParams.model = {[path UTF8String]};
85
+ defaultParams.model.path = [path UTF8String];
86
86
 
87
87
  NSString *chatTemplate = params[@"chat_template"];
88
88
  if (chatTemplate) {
@@ -106,37 +106,27 @@
106
106
  NSString *reasonNoMetal = @"";
107
107
  defaultParams.n_gpu_layers = 0;
108
108
  #ifdef LM_GGML_USE_METAL
109
- // Check ggml-metal availability
110
- NSError * error = nil;
111
109
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
112
- id<MTLLibrary> library = [device
113
- newLibraryWithSource:@"#include <metal_stdlib>\n"
114
- "using namespace metal;"
115
- "typedef matrix<bfloat, 4, 4> bfloat4x4;"
116
- "kernel void test() { simd_sum(0); }"
117
- options:nil
118
- error:&error
119
- ];
120
- if (error) {
121
- reasonNoMetal = [error localizedDescription];
110
+
111
+ // Check ggml-metal availability
112
+ BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
113
+ if (@available(iOS 16.0, tvOS 16.0, *)) {
114
+ supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
115
+ }
116
+ if (!supportsGgmlMetal) {
117
+ reasonNoMetal = @"Metal is not supported in this device";
122
118
  skipGpuDevices = true;
123
- } else {
124
- id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
125
- id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
126
- if (pipeline == nil) {
127
- reasonNoMetal = [error localizedDescription];
128
- skipGpuDevices = true;
129
- } else {
119
+ }
120
+
130
121
  #if TARGET_OS_SIMULATOR
131
- // Use the backend, but no layers because not supported fully on simulator
132
- defaultParams.n_gpu_layers = 0;
133
- isMetalEnabled = true;
122
+ // Use the backend, but no layers because not supported fully on simulator
123
+ defaultParams.n_gpu_layers = 0;
124
+ isMetalEnabled = true;
134
125
  #else
135
- defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
136
- isMetalEnabled = true;
126
+ defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
127
+ isMetalEnabled = true;
137
128
  #endif
138
- }
139
- }
129
+
140
130
  device = nil;
141
131
  #else
142
132
  reasonNoMetal = @"Metal is not enabled in this build";
@@ -158,6 +148,8 @@
158
148
  }
159
149
  if (cpu_devs.size() > 0) {
160
150
  defaultParams.devices = cpu_devs;
151
+ defaultParams.n_gpu_layers = 0;
152
+ isMetalEnabled = false;
161
153
  }
162
154
  }
163
155
 
@@ -184,6 +176,8 @@
184
176
 
185
177
  if (params[@"flash_attn"] && [params[@"flash_attn"] boolValue]) defaultParams.flash_attn = true;
186
178
 
179
+ if (params[@"ctx_shift"]) defaultParams.ctx_shift = [params[@"ctx_shift"] boolValue];
180
+
187
181
  if (params[@"cache_type_k"]) defaultParams.cache_type_k = rnllama::kv_cache_type_from_str([params[@"cache_type_k"] UTF8String]);
188
182
  if (params[@"cache_type_v"]) defaultParams.cache_type_v = rnllama::kv_cache_type_from_str([params[@"cache_type_v"] UTF8String]);
189
183
 
@@ -568,6 +562,9 @@
568
562
  }
569
563
  llama->beginCompletion();
570
564
  llama->loadPrompt();
565
+ if (llama->context_full) {
566
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Context is full" userInfo:nil];
567
+ }
571
568
 
572
569
  size_t sent_count = 0;
573
570
  size_t sent_token_probs_index = 0;
@@ -655,7 +652,7 @@
655
652
  }];
656
653
  }
657
654
  } catch (const std::exception &e) {
658
- // NSLog(@"Error parsing tool calls: %s", e.what());
655
+ } catch (...) {
659
656
  }
660
657
  }
661
658
 
@@ -668,6 +665,7 @@
668
665
  result[@"tokens_predicted"] = @(llama->num_tokens_predicted);
669
666
  result[@"tokens_evaluated"] = @(llama->num_prompt_tokens);
670
667
  result[@"truncated"] = @(llama->truncated);
668
+ result[@"context_full"] = @(llama->context_full);
671
669
  result[@"stopped_eos"] = @(llama->stopped_eos);
672
670
  result[@"stopped_word"] = @(llama->stopped_word);
673
671
  result[@"stopped_limit"] = @(llama->stopped_limit);
@@ -355,8 +355,10 @@ struct common_params {
355
355
 
356
356
  common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
357
357
 
358
- // multimodal models (see examples/llava)
358
+ // multimodal models (see tools/llava)
359
359
  struct common_params_model mmproj;
360
+ bool mmproj_use_gpu = true; // use GPU for multimodal model
361
+ bool no_mmproj = false; // explicitly disable multimodal model
360
362
  std::vector<std::string> image; // path to image file(s)
361
363
 
362
364
  // embedding
@@ -427,8 +429,8 @@ struct common_params {
427
429
  int n_pca_batch = 100;
428
430
  int n_pca_iterations = 1000;
429
431
  dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
430
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
431
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
432
+ std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
433
+ std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
432
434
 
433
435
  bool spm_infill = false; // suffix/prefix/middle pattern for infill
434
436
 
@@ -558,6 +560,8 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
558
560
  // clear LoRA adapters from context, then apply new list of adapters
559
561
  void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
560
562
 
563
+ std::string get_model_endpoint();
564
+
561
565
  //
562
566
  // Batch utils
563
567
  //