cui-llama.rn 1.4.2 → 1.4.4

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in the public registry.
Files changed (186)
  1. package/README.md +93 -114
  2. package/android/src/main/CMakeLists.txt +5 -0
  3. package/android/src/main/build-arm64/CMakeCache.txt +429 -0
  4. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +81 -0
  5. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +101 -0
  6. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  7. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  8. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +15 -0
  9. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +904 -0
  10. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  11. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +919 -0
  12. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  13. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +431 -0
  14. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  15. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +165 -0
  16. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +297 -0
  17. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +1 -0
  18. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +1 -0
  19. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +1 -0
  20. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +1 -0
  21. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +1 -0
  22. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +1 -0
  23. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +1 -0
  24. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +8 -0
  25. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +1 -0
  26. package/android/src/main/build-arm64/CMakeFiles/progress.marks +1 -0
  27. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  28. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +58 -0
  29. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  30. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +756 -0
  31. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  32. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +709 -0
  33. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  34. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +714 -0
  35. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  36. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +62 -0
  37. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  38. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +708 -0
  39. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  40. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +113 -0
  41. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  42. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +713 -0
  43. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  44. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +763 -0
  45. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  46. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +61 -0
  47. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  48. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +707 -0
  49. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  50. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +104 -0
  51. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  52. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +714 -0
  53. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  54. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +723 -0
  55. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +62 -0
  56. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +722 -0
  57. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +89 -0
  58. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +2 -0
  59. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +2 -0
  60. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +2 -0
  61. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +17 -0
  62. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +41 -0
  63. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +62 -0
  64. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +722 -0
  65. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +89 -0
  66. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +2 -0
  67. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +2 -0
  68. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +2 -0
  69. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +17 -0
  70. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +41 -0
  71. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +62 -0
  72. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +722 -0
  73. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +89 -0
  74. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +2 -0
  75. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +2 -0
  76. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +2 -0
  77. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +17 -0
  78. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +41 -0
  79. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +62 -0
  80. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +722 -0
  81. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +89 -0
  82. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +2 -0
  83. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +2 -0
  84. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +2 -0
  85. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +17 -0
  86. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +41 -0
  87. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +62 -0
  88. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +722 -0
  89. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +89 -0
  90. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +2 -0
  91. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +2 -0
  92. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +2 -0
  93. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +17 -0
  94. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +41 -0
  95. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +62 -0
  96. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +722 -0
  97. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +89 -0
  98. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +2 -0
  99. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +2 -0
  100. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +2 -0
  101. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +17 -0
  102. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +41 -0
  103. package/android/src/main/build-arm64/Makefile +1862 -0
  104. package/android/src/main/build-arm64/cmake_install.cmake +66 -0
  105. package/android/src/main/java/com/rnllama/LlamaContext.java +92 -18
  106. package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
  107. package/android/src/main/jni-utils.h +6 -0
  108. package/android/src/main/jni.cpp +287 -31
  109. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  110. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  111. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  112. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  113. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  114. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  115. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  116. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  117. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
  118. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
  119. package/cpp/chat-template.hpp +529 -0
  120. package/cpp/chat.cpp +1085 -0
  121. package/cpp/chat.hpp +55 -0
  122. package/cpp/common.cpp +159 -36
  123. package/cpp/common.h +64 -19
  124. package/cpp/ggml-alloc.c +1 -13
  125. package/cpp/ggml-common.h +0 -2
  126. package/cpp/ggml-cpu-impl.h +6 -12
  127. package/cpp/ggml-cpu-quants.c +937 -340
  128. package/cpp/ggml-cpu.c +207 -113
  129. package/cpp/ggml-cpu.cpp +4 -6
  130. package/cpp/ggml-cpu.h +1 -1
  131. package/cpp/ggml-metal.h +66 -66
  132. package/cpp/ggml-metal.m +141 -23
  133. package/cpp/ggml.c +24 -14
  134. package/cpp/ggml.h +2 -2
  135. package/cpp/json-schema-to-grammar.cpp +46 -66
  136. package/cpp/json-schema-to-grammar.h +15 -1
  137. package/cpp/llama-arch.cpp +7 -2
  138. package/cpp/llama-arch.h +3 -1
  139. package/cpp/llama-chat.cpp +10 -1
  140. package/cpp/llama-chat.h +1 -0
  141. package/cpp/llama-grammar.cpp +86 -6
  142. package/cpp/llama-grammar.h +22 -1
  143. package/cpp/llama-impl.h +6 -6
  144. package/cpp/llama-kv-cache.h +1 -1
  145. package/cpp/llama-mmap.h +1 -0
  146. package/cpp/llama-model-loader.cpp +1 -1
  147. package/cpp/llama-model.cpp +32 -6
  148. package/cpp/llama-sampling.cpp +178 -61
  149. package/cpp/llama-vocab.cpp +8 -3
  150. package/cpp/llama.cpp +188 -128
  151. package/cpp/llama.h +27 -10
  152. package/cpp/log.cpp +32 -10
  153. package/cpp/log.h +12 -1
  154. package/cpp/minja.hpp +2883 -0
  155. package/cpp/rn-llama.cpp +82 -5
  156. package/cpp/rn-llama.h +16 -1
  157. package/cpp/sampling.cpp +68 -41
  158. package/cpp/sampling.h +3 -0
  159. package/cpp/sgemm.cpp +9 -8
  160. package/cpp/unicode.cpp +9 -2
  161. package/ios/CMakeLists.txt +6 -0
  162. package/ios/RNLlama.h +0 -8
  163. package/ios/RNLlama.mm +27 -3
  164. package/ios/RNLlamaContext.h +10 -1
  165. package/ios/RNLlamaContext.mm +269 -57
  166. package/jest/mock.js +21 -2
  167. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  168. package/lib/commonjs/grammar.js +3 -0
  169. package/lib/commonjs/grammar.js.map +1 -1
  170. package/lib/commonjs/index.js +87 -13
  171. package/lib/commonjs/index.js.map +1 -1
  172. package/lib/module/NativeRNLlama.js.map +1 -1
  173. package/lib/module/grammar.js +3 -0
  174. package/lib/module/grammar.js.map +1 -1
  175. package/lib/module/index.js +86 -13
  176. package/lib/module/index.js.map +1 -1
  177. package/lib/typescript/NativeRNLlama.d.ts +107 -2
  178. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  179. package/lib/typescript/grammar.d.ts.map +1 -1
  180. package/lib/typescript/index.d.ts +32 -7
  181. package/lib/typescript/index.d.ts.map +1 -1
  182. package/llama-rn.podspec +1 -1
  183. package/package.json +2 -2
  184. package/src/NativeRNLlama.ts +115 -3
  185. package/src/grammar.ts +3 -0
  186. package/src/index.ts +138 -21
package/cpp/rn-llama.cpp CHANGED
@@ -194,6 +194,7 @@ bool llama_rn_context::loadModel(common_params &params_)
         LOG_ERROR("unable to load model: %s", params_.model.c_str());
         return false;
     }
+    templates = common_chat_templates_from_model(model, params.chat_template);
     n_ctx = llama_n_ctx(ctx);
 
     // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
@@ -202,11 +203,87 @@ bool llama_rn_context::loadModel(common_params &params_)
     return true;
 }
 
-bool llama_rn_context::validateModelChatTemplate() const {
-    const char * tmpl = llama_model_chat_template(model);
-    llama_chat_message chat[] = {{"user", "test"}};
-    int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
-    return chat_res > 0;
+bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
+    const char * tmpl = llama_model_chat_template(model, name);
+    if (tmpl == nullptr) {
+        return false;
+    }
+    return common_chat_verify_template(tmpl, use_jinja);
+}
+
+common_chat_params llama_rn_context::getFormattedChatWithJinja(
+    const std::string &messages,
+    const std::string &chat_template,
+    const std::string &json_schema,
+    const std::string &tools,
+    const bool &parallel_tool_calls,
+    const std::string &tool_choice
+) const {
+    common_chat_inputs inputs;
+    inputs.messages = json::parse(messages);
+    auto useTools = !tools.empty();
+    if (useTools) {
+        inputs.tools = json::parse(tools);
+    }
+    inputs.parallel_tool_calls = parallel_tool_calls;
+    if (!tool_choice.empty()) {
+        inputs.tool_choice = tool_choice;
+    }
+    if (!json_schema.empty()) {
+        inputs.json_schema = json::parse(json_schema);
+    }
+    inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    inputs.stream = true;
+
+    // If chat_template is provided, create new one and use it (probably slow)
+    if (!chat_template.empty()) {
+        auto tmp = common_chat_templates_from_model(model, chat_template);
+        const common_chat_template* template_ptr = useTools && tmp.template_tool_use ? tmp.template_tool_use.get() : tmp.template_default.get();
+        if (inputs.parallel_tool_calls && !template_ptr->original_caps().supports_parallel_tool_calls) {
+            inputs.parallel_tool_calls = false;
+        }
+        return common_chat_params_init(*template_ptr, inputs);
+    } else {
+        const common_chat_template* template_ptr = useTools && templates.template_tool_use ? templates.template_tool_use.get() : templates.template_default.get();
+        if (inputs.parallel_tool_calls && !template_ptr->original_caps().supports_parallel_tool_calls) {
+            inputs.parallel_tool_calls = false;
+        }
+        return common_chat_params_init(*template_ptr, inputs);
+    }
+}
+
+std::string llama_rn_context::getFormattedChat(
+    const std::string &messages,
+    const std::string &chat_template
+) const {
+    auto chat_json = json::parse(messages);
+
+    // Handle regular chat without tools
+    std::vector<common_chat_msg> chat_msgs;
+    for (const auto &msg : chat_json) {
+        chat_msgs.push_back({
+            msg["role"].get<std::string>(),
+            msg["content"].get<std::string>()
+        });
+    }
+
+    // If chat_template is provided, create new one and use it (probably slow)
+    if (!chat_template.empty()) {
+        auto tmp = common_chat_templates_from_model(model, chat_template);
+        return common_chat_apply_template(
+            *tmp.template_default,
+            chat_msgs,
+            true,
+            false
+        );
+    } else {
+        return common_chat_apply_template(
+            *templates.template_default,
+            chat_msgs,
+            true,
+            false
+        );
+    }
 }
 
 void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
package/cpp/rn-llama.h CHANGED
@@ -3,6 +3,8 @@
 
 #include <sstream>
 #include <iostream>
+#include "chat.hpp"
+#include "chat-template.hpp"
 #include "common.h"
 #include "ggml.h"
 #include "gguf.h"
@@ -63,6 +65,7 @@ struct llama_rn_context {
 
     llama_context *ctx = nullptr;
     common_sampler *ctx_sampling = nullptr;
+    common_chat_templates templates;
 
     int n_ctx;
 
@@ -80,7 +83,19 @@ struct llama_rn_context {
     void rewind();
     bool initSampling();
     bool loadModel(common_params &params_);
-    bool validateModelChatTemplate() const;
+    bool validateModelChatTemplate(bool use_jinja, const char *name) const;
+    common_chat_params getFormattedChatWithJinja(
+        const std::string &messages,
+        const std::string &chat_template,
+        const std::string &json_schema,
+        const std::string &tools,
+        const bool &parallel_tool_calls,
+        const std::string &tool_choice
+    ) const;
+    std::string getFormattedChat(
+        const std::string &messages,
+        const std::string &chat_template
+    ) const;
     void truncatePrompt(std::vector<llama_token> &prompt_tokens);
     void loadPrompt();
     void beginCompletion();
package/cpp/sampling.cpp CHANGED
@@ -134,11 +134,11 @@ std::string common_params_sampling::print() const {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);
@@ -151,9 +151,30 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
+    struct llama_sampler * grmr;
+    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+#ifdef LLAMA_USE_LLGUIDANCE
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+#else
+        LM_GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+#endif // LLAMA_USE_LLGUIDANCE
+    } else {
+        std::vector<const char *> trigger_words;
+        trigger_words.reserve(params.grammar_trigger_words.size());
+        for (const auto & str : params.grammar_trigger_words) {
+            trigger_words.push_back(str.word.c_str());
+        }
+
+        grmr = params.grammar_lazy
+            ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
+                  trigger_words.data(), trigger_words.size(),
+                  params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+            : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+    }
+
     auto * result = new common_sampler {
         /* .params = */ params,
-        /* .grmr   = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
+        /* .grmr   = */ grmr,
         /* .chain  = */ llama_sampler_chain_init(lparams),
         /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur    = */ {},
@@ -167,45 +188,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                            params.logit_bias.data()));
 
     if (params.mirostat == 0) {
-        for (const auto & cnstr : params.samplers) {
-            switch (cnstr) {
-                case COMMON_SAMPLER_TYPE_DRY:
-                    {
-                        std::vector<const char *> c_breakers;
-                        c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto & str : params.dry_sequence_breakers) {
-                            c_breakers.push_back(str.c_str());
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp        (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+        } else {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                        {
+                            std::vector<const char *> c_breakers;
+                            c_breakers.reserve(params.dry_sequence_breakers.size());
+                            for (const auto & str : params.dry_sequence_breakers) {
+                                c_breakers.push_back(str.c_str());
+                            }
+
+                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                         }
-
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                    }
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                    break;
-                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                    break;
-                case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
-                    break;
-                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                default:
-                    LM_GGML_ASSERT(false && "unknown sampler type");
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
+                        break;
+                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                        break;
+                    default:
+                        LM_GGML_ASSERT(false && "unknown sampler type");
+                }
             }
         }
     }
     llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
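
A short sketch of how a caller might drive the two behaviors added above, using only fields referenced in this diff (grammar, grammar_lazy, grammar_trigger_words, top_n_sigma); the element type name common_grammar_trigger and the "<tool_call>" marker are assumptions, not taken from the package.

    #include "sampling.h"

    struct common_sampler * make_lazy_grammar_sampler(const struct llama_model * model,
                                                      const std::string & gbnf) {
        common_params_sampling sparams;
        sparams.grammar      = gbnf;  // plain GBNF; a "%llguidance" prefix would take the llg branch
        sparams.grammar_lazy = true;  // constrain output only after a trigger word appears

        common_grammar_trigger trigger{};  // assumed element type of grammar_trigger_words
        trigger.word = "<tool_call>";      // hypothetical trigger emitted by the chat template
        sparams.grammar_trigger_words.push_back(trigger);

        // top_n_sigma >= 0 swaps the configured sampler list for
        // top_k -> temp -> top_n_sigma (see common_sampler_init above).
        sparams.top_n_sigma = 1.5f;

        return common_sampler_init(model, sparams);
    }
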
package/cpp/sampling.h CHANGED
@@ -102,3 +102,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
 
 std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+        const char * grammar_kind, const char * grammar_data);
package/cpp/sgemm.cpp CHANGED
@@ -280,14 +280,6 @@ template <> inline __m256bh load(const float *p) {
 }
 #endif
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// CONSTANTS
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
-#endif
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // FLOATING POINT MATRIX MULTIPLICATION
 
@@ -614,6 +606,14 @@ class tinyBLAS_Q0_AVX {
            TC *C, int64_t ldc,
            int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+        const int8_t kvalues_iq4nl[16] = {
+            -127, -104,  -83,  -65,
+             -49,  -35,  -22,  -10,
+               1,   13,   25,   38,
+              53,   69,   89,  113
+        };
+
+        iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
     }
 
     void matmul(int64_t m, int64_t n) {
@@ -1038,6 +1038,7 @@ class tinyBLAS_Q0_AVX {
     const int64_t ldc;
     const int ith;
     const int nth;
+    __m128i iq4nlt;
 };
 #endif // __AVX__
 
package/cpp/unicode.cpp CHANGED
@@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
     }
     return result;
 }
@@ -701,7 +708,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
     const auto cpts = unicode_cpts_from_utf8(text);
 
     // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
     std::string text_collapsed;
     if (need_collapse) {
         // collapse all unicode categories
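
The effect of the new fallback can be seen with a throwaway snippet (not part of the package), assuming unicode.h declares unicode_cpts_from_utf8 with the signature shown in the hunk header above:

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include "unicode.h"

    int main() {
        const std::string bad = "ok\x80!";  // "\x80" is a lone continuation byte, i.e. invalid UTF-8
        // Previously the invalid byte threw std::invalid_argument out of tokenization;
        // now it decodes to U+FFFD and processing continues.
        for (const uint32_t cpt : unicode_cpts_from_utf8(bad)) {
            std::printf("U+%04X\n", (unsigned) cpt);  // U+006F U+006B U+FFFD U+0021
        }
        return 0;
    }
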
package/ios/CMakeLists.txt CHANGED
@@ -15,6 +15,7 @@ add_definitions(
     -DLM_GGML_USE_CPU
     -DLM_GGML_USE_ACCELERATE
     -DLM_GGML_USE_METAL
+    -DLM_GGML_METAL_USE_BF16
 )
 
 set(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../cpp)
@@ -66,6 +67,11 @@ add_library(rnllama SHARED
     ${SOURCE_DIR}/unicode.cpp
     ${SOURCE_DIR}/sgemm.cpp
     ${SOURCE_DIR}/common.cpp
+    ${SOURCE_DIR}/chat.cpp
+    ${SOURCE_DIR}/chat-template.hpp
+    ${SOURCE_DIR}/json-schema-to-grammar.cpp
+    ${SOURCE_DIR}/minja.hpp
+    ${SOURCE_DIR}/json.hpp
     ${SOURCE_DIR}/amx/amx.cpp
     ${SOURCE_DIR}/amx/mmq.cpp
     ${SOURCE_DIR}/rn-llama.cpp
package/ios/RNLlama.h CHANGED
@@ -1,11 +1,3 @@
-#ifdef __cplusplus
-#if RNLLAMA_BUILD_FROM_SOURCE
-#import "rn-llama.h"
-#else
-#import <rnllama/rn-llama.h>
-#endif
-#endif
-
 #import <React/RCTEventEmitter.h>
 #import <React/RCTBridgeModule.h>
 
package/ios/RNLlama.mm CHANGED
@@ -13,6 +13,16 @@ dispatch_queue_t llamaDQueue;
 
 RCT_EXPORT_MODULE()
 
+RCT_EXPORT_METHOD(toggleNativeLog:(BOOL)enabled) {
+    void (^onEmitLog)(NSString *level, NSString *text) = nil;
+    if (enabled) {
+        onEmitLog = ^(NSString *level, NSString *text) {
+            [self sendEventWithName:@"@RNLlama_onNativeLog" body:@{ @"level": level, @"text": text }];
+        };
+    }
+    [RNLlamaContext toggleNativeLog:enabled onEmitLog:onEmitLog];
+}
+
 RCT_EXPORT_METHOD(setContextLimit:(double)limit
                   withResolver:(RCTPromiseResolveBlock)resolve
                   withRejecter:(RCTPromiseRejectBlock)reject)
@@ -41,7 +51,7 @@ RCT_EXPORT_METHOD(initContext:(double)contextId
     }
 
     if (llamaDQueue == nil) {
-        llamaDQueue = dispatch_queue_create("com.rnllama", DISPATCH_QUEUE_SERIAL);
+      llamaDQueue = dispatch_queue_create("com.rnllama", DISPATCH_QUEUE_SERIAL);
     }
 
     if (llamaContexts == nil) {
@@ -77,8 +87,9 @@ RCT_EXPORT_METHOD(initContext:(double)contextId
 }
 
 RCT_EXPORT_METHOD(getFormattedChat:(double)contextId
-                  withMessages:(NSArray *)messages
+                  withMessages:(NSString *)messages
                   withTemplate:(NSString *)chatTemplate
+                  withParams:(NSDictionary *)params
                   withResolver:(RCTPromiseResolveBlock)resolve
                   withRejecter:(RCTPromiseRejectBlock)reject)
 {
@@ -87,7 +98,19 @@ RCT_EXPORT_METHOD(getFormattedChat:(double)contextId
         reject(@"llama_error", @"Context not found", nil);
         return;
     }
-    resolve([context getFormattedChat:messages withTemplate:chatTemplate]);
+    try {
+        if ([params[@"jinja"] boolValue]) {
+            NSString *jsonSchema = params[@"json_schema"];
+            NSString *tools = params[@"tools"];
+            NSString *parallelToolCalls = params[@"parallel_tool_calls"];
+            NSString *toolChoice = params[@"tool_choice"];\
+            resolve([context getFormattedChatWithJinja:messages withChatTemplate:chatTemplate withJsonSchema:jsonSchema withTools:tools withParallelToolCalls:parallelToolCalls withToolChoice:toolChoice]);
+        } else {
+            resolve([context getFormattedChat:messages withChatTemplate:chatTemplate]);
+        }
+    } catch (const std::exception& e) { // catch cpp exceptions
+        reject(@"llama_error", [NSString stringWithUTF8String:e.what()], nil);
+    }
 }
 
 RCT_EXPORT_METHOD(loadSession:(double)contextId
@@ -146,6 +169,7 @@ RCT_EXPORT_METHOD(saveSession:(double)contextId
     return @[
         @"@RNLlama_onInitContextProgress",
         @"@RNLlama_onToken",
+        @"@RNLlama_onNativeLog",
     ];
 }
 
package/ios/RNLlamaContext.h CHANGED
@@ -4,11 +4,13 @@
 #import "llama-impl.h"
 #import "ggml.h"
 #import "rn-llama.h"
+#import "json-schema-to-grammar.h"
 #else
 #import <rnllama/llama.h>
 #import <rnllama/llama-impl.h>
 #import <rnllama/ggml.h>
 #import <rnllama/rn-llama.h>
+#import <rnllama/json-schema-to-grammar.h>
 #endif
 #endif
 
@@ -23,6 +25,7 @@
     rnllama::llama_rn_context * llama;
 }
 
++ (void)toggleNativeLog:(BOOL)enabled onEmitLog:(void (^)(NSString *level, NSString *text))onEmitLog;
 + (NSDictionary *)modelInfo:(NSString *)path skip:(NSArray *)skip;
 + (instancetype)initWithParams:(NSDictionary *)params onProgress:(void (^)(unsigned int progress))onProgress;
 - (void)interruptLoad;
@@ -36,7 +39,13 @@
 - (NSArray *)tokenize:(NSString *)text;
 - (NSString *)detokenize:(NSArray *)tokens;
 - (NSDictionary *)embedding:(NSString *)text params:(NSDictionary *)params;
-- (NSString *)getFormattedChat:(NSArray *)messages withTemplate:(NSString *)chatTemplate;
+- (NSDictionary *)getFormattedChatWithJinja:(NSString *)messages
+                           withChatTemplate:(NSString *)chatTemplate
+                             withJsonSchema:(NSString *)jsonSchema
+                                  withTools:(NSString *)tools
+                      withParallelToolCalls:(BOOL)parallelToolCalls
+                             withToolChoice:(NSString *)toolChoice;
+- (NSString *)getFormattedChat:(NSString *)messages withChatTemplate:(NSString *)chatTemplate;
 - (NSDictionary *)loadSession:(NSString *)path;
 - (int)saveSession:(NSString *)path size:(int)size;
 - (NSString *)bench:(int)pp tg:(int)tg pl:(int)pl nr:(int)nr;