cui-llama.rn 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (309) hide show
  1. package/LICENSE +20 -20
  2. package/README.md +317 -319
  3. package/android/build.gradle +116 -116
  4. package/android/gradle.properties +5 -5
  5. package/android/src/main/AndroidManifest.xml +4 -4
  6. package/android/src/main/CMakeLists.txt +124 -124
  7. package/android/src/main/java/com/rnllama/LlamaContext.java +645 -645
  8. package/android/src/main/java/com/rnllama/RNLlama.java +695 -695
  9. package/android/src/main/java/com/rnllama/RNLlamaPackage.java +48 -48
  10. package/android/src/main/jni-utils.h +100 -100
  11. package/android/src/main/jni.cpp +1263 -1263
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  14. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  15. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  16. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  17. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  20. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +135 -135
  21. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +136 -136
  22. package/cpp/README.md +4 -4
  23. package/cpp/ggml-llama-sim.metallib +0 -0
  24. package/cpp/ggml-llama.metallib +0 -0
  25. package/cpp/ggml-metal-impl.h +597 -597
  26. package/cpp/ggml-metal.m +4 -0
  27. package/cpp/ggml.h +1 -1
  28. package/cpp/rn-llama.cpp +873 -873
  29. package/cpp/rn-llama.h +138 -138
  30. package/cpp/sampling.h +107 -107
  31. package/cpp/unicode-data.cpp +7034 -7034
  32. package/cpp/unicode-data.h +20 -20
  33. package/cpp/unicode.cpp +849 -849
  34. package/cpp/unicode.h +66 -66
  35. package/ios/CMakeLists.txt +116 -108
  36. package/ios/RNLlama.h +7 -7
  37. package/ios/RNLlama.mm +418 -405
  38. package/ios/RNLlamaContext.h +57 -57
  39. package/ios/RNLlamaContext.mm +835 -835
  40. package/ios/rnllama.xcframework/Info.plist +74 -74
  41. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +16 -0
  42. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +143 -0
  43. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +677 -0
  44. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/cpu-common.h +72 -0
  45. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-alloc.h +76 -0
  46. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  47. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +354 -0
  48. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +1857 -0
  49. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +39 -0
  50. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  51. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  52. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  53. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  54. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +138 -0
  55. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +594 -0
  56. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  57. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal.h +66 -0
  58. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +216 -0
  59. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-quants.h +100 -0
  60. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-threading.h +14 -0
  61. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +2222 -0
  62. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/gguf.h +202 -0
  63. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  64. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json.hpp +24766 -0
  65. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-adapter.h +76 -0
  66. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +428 -0
  67. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +88 -0
  68. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +56 -0
  69. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +265 -0
  70. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +38 -0
  71. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cpp.h +30 -0
  72. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-grammar.h +173 -0
  73. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +592 -0
  74. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +156 -0
  75. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-impl.h +61 -0
  76. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-io.h +35 -0
  77. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  78. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +21 -0
  79. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-mmap.h +68 -0
  80. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-loader.h +169 -0
  81. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +409 -0
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-sampling.h +32 -0
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +125 -0
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +1434 -0
  85. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/log.h +132 -0
  86. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  87. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  88. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +128 -0
  89. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +138 -0
  90. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sampling.h +107 -0
  91. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +14 -0
  92. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +888 -0
  93. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/speculative.h +28 -0
  94. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unary-ops.h +28 -0
  95. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unicode-data.h +20 -0
  96. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unicode.h +66 -0
  97. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +802 -0
  98. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  99. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  100. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  101. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +16 -0
  102. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +143 -0
  103. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +677 -0
  104. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/cpu-common.h +72 -0
  105. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-alloc.h +76 -0
  106. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  107. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +354 -0
  108. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +1857 -0
  109. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +39 -0
  110. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  111. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  112. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  113. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  114. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +138 -0
  115. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +594 -0
  116. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  117. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal.h +66 -0
  118. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +216 -0
  119. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-quants.h +100 -0
  120. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-threading.h +14 -0
  121. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +2222 -0
  122. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/gguf.h +202 -0
  123. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  124. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +24766 -0
  125. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-adapter.h +76 -0
  126. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +428 -0
  127. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +88 -0
  128. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +56 -0
  129. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +265 -0
  130. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cpp.h +30 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-grammar.h +173 -0
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +592 -0
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +156 -0
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-impl.h +61 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-io.h +35 -0
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +21 -0
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-mmap.h +68 -0
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-loader.h +169 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +409 -0
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-sampling.h +32 -0
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +125 -0
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +1434 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/log.h +132 -0
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +128 -0
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +138 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sampling.h +107 -0
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +14 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +888 -0
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/speculative.h +28 -0
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +28 -0
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unicode-data.h +20 -0
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unicode.h +66 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +802 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +101 -0
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +16 -0
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +143 -0
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +677 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/cpu-common.h +72 -0
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-alloc.h +76 -0
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +354 -0
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +1857 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +39 -0
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +138 -0
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +594 -0
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal.h +66 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +216 -0
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-quants.h +100 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-threading.h +14 -0
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +2222 -0
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/gguf.h +202 -0
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json.hpp +24766 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-adapter.h +76 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +428 -0
  188. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +88 -0
  189. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +56 -0
  190. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +265 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +38 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cpp.h +30 -0
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-grammar.h +173 -0
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +592 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +156 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-impl.h +61 -0
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-io.h +35 -0
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +21 -0
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-mmap.h +68 -0
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-loader.h +169 -0
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +409 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-sampling.h +32 -0
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +125 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +1434 -0
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/log.h +132 -0
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +128 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sampling.h +107 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +14 -0
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +888 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/speculative.h +28 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +28 -0
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode-data.h +20 -0
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode.h +66 -0
  218. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +802 -0
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  222. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +16 -0
  223. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +143 -0
  224. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +677 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/cpu-common.h +72 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-alloc.h +76 -0
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +354 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +1857 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +39 -0
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +138 -0
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +594 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal.h +66 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +216 -0
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-quants.h +100 -0
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-threading.h +14 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +2222 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/gguf.h +202 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +24766 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-adapter.h +76 -0
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +428 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +88 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +56 -0
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +265 -0
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +38 -0
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cpp.h +30 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-grammar.h +173 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +592 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +156 -0
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-impl.h +61 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-io.h +35 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  259. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +21 -0
  260. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-mmap.h +68 -0
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-loader.h +169 -0
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +409 -0
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-sampling.h +32 -0
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +125 -0
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +1434 -0
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/log.h +132 -0
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +128 -0
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +138 -0
  271. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sampling.h +107 -0
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +14 -0
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +888 -0
  274. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/speculative.h +28 -0
  275. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +28 -0
  276. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unicode-data.h +20 -0
  277. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unicode.h +66 -0
  278. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +802 -0
  279. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  280. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +101 -0
  281. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  282. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  283. package/jest/mock.js +203 -203
  284. package/lib/commonjs/NativeRNLlama.js +1 -2
  285. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  286. package/lib/commonjs/chat.js.map +1 -1
  287. package/lib/commonjs/grammar.js +12 -31
  288. package/lib/commonjs/grammar.js.map +1 -1
  289. package/lib/commonjs/index.js +47 -47
  290. package/lib/commonjs/index.js.map +1 -1
  291. package/lib/commonjs/package.json +1 -0
  292. package/lib/module/NativeRNLlama.js +2 -0
  293. package/lib/module/NativeRNLlama.js.map +1 -1
  294. package/lib/module/chat.js +2 -0
  295. package/lib/module/chat.js.map +1 -1
  296. package/lib/module/grammar.js +14 -31
  297. package/lib/module/grammar.js.map +1 -1
  298. package/lib/module/index.js +47 -45
  299. package/lib/module/index.js.map +1 -1
  300. package/lib/module/package.json +1 -0
  301. package/lib/typescript/NativeRNLlama.d.ts +6 -4
  302. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  303. package/lib/typescript/index.d.ts.map +1 -1
  304. package/llama-rn.podspec +48 -48
  305. package/package.json +233 -233
  306. package/src/NativeRNLlama.ts +426 -426
  307. package/src/chat.ts +44 -44
  308. package/src/grammar.ts +854 -854
  309. package/src/index.ts +495 -487
package/cpp/rn-llama.cpp CHANGED
@@ -1,873 +1,873 @@
1
- #include "rn-llama.h"
2
-
3
- namespace rnllama {
4
-
5
- const std::vector<lm_ggml_type> kv_cache_types = {
6
- LM_GGML_TYPE_F32,
7
- LM_GGML_TYPE_F16,
8
- LM_GGML_TYPE_BF16,
9
- LM_GGML_TYPE_Q8_0,
10
- LM_GGML_TYPE_Q4_0,
11
- LM_GGML_TYPE_Q4_1,
12
- LM_GGML_TYPE_IQ4_NL,
13
- LM_GGML_TYPE_Q5_0,
14
- LM_GGML_TYPE_Q5_1,
15
- };
16
-
17
- lm_ggml_type kv_cache_type_from_str(const std::string & s) {
18
- for (const auto & type : kv_cache_types) {
19
- if (lm_ggml_type_name(type) == s) {
20
- return type;
21
- }
22
- }
23
- throw std::runtime_error("Unsupported cache type: " + s);
24
- }
25
-
26
- static void llama_batch_clear(llama_batch *batch) {
27
- batch->n_tokens = 0;
28
- }
29
-
30
- static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, std::vector<llama_seq_id> seq_ids, bool logits) {
31
- batch->token [batch->n_tokens] = id;
32
- batch->pos [batch->n_tokens] = pos;
33
- batch->n_seq_id[batch->n_tokens] = seq_ids.size();
34
- for (size_t i = 0; i < seq_ids.size(); i++) {
35
- batch->seq_id[batch->n_tokens][i] = seq_ids[i];
36
- }
37
- batch->logits [batch->n_tokens] = logits ? 1 : 0;
38
- batch->n_tokens += 1;
39
- }
40
-
41
- // NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp
42
-
43
- static void log(const char *level, const char *function, int line,
44
- const char *format, ...)
45
- {
46
- va_list args;
47
- #if defined(__ANDROID__)
48
- char prefix[256];
49
- snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format);
50
-
51
- va_start(args, format);
52
- android_LogPriority priority;
53
- if (strcmp(level, "ERROR") == 0) {
54
- priority = ANDROID_LOG_ERROR;
55
- } else if (strcmp(level, "WARNING") == 0) {
56
- priority = ANDROID_LOG_WARN;
57
- } else if (strcmp(level, "INFO") == 0) {
58
- priority = ANDROID_LOG_INFO;
59
- } else {
60
- priority = ANDROID_LOG_DEBUG;
61
- }
62
- __android_log_vprint(priority, "RNLlama", prefix, args);
63
- va_end(args);
64
- #else
65
- printf("[%s] %s:%d ", level, function, line);
66
- va_start(args, format);
67
- vprintf(format, args);
68
- va_end(args);
69
- printf("\n");
70
- #endif
71
- }
72
-
73
- #if RNLLAMA_VERBOSE != 1
74
- #define LOG_VERBOSE(MSG, ...)
75
- #else
76
- #define LOG_VERBOSE(MSG, ...) \
77
- do \
78
- { \
79
- if (rnllama_verbose) \
80
- { \
81
- log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \
82
- } \
83
- } while (0)
84
- #endif
85
-
86
- #define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
87
- #define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
88
- #define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
89
-
90
- static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
91
- {
92
- size_t i;
93
- for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
94
- {
95
- }
96
- return i;
97
- }
98
-
99
- static bool ends_with(const std::string &str, const std::string &suffix)
100
- {
101
- return str.size() >= suffix.size() &&
102
- 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
103
- }
104
-
105
- static size_t find_partial_stop_string(const std::string &stop,
106
- const std::string &text)
107
- {
108
- if (!text.empty() && !stop.empty())
109
- {
110
- const char text_last_char = text.back();
111
- for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
112
- {
113
- if (stop[char_index] == text_last_char)
114
- {
115
- const std::string current_partial = stop.substr(0, char_index + 1);
116
- if (ends_with(text, current_partial))
117
- {
118
- return text.size() - char_index - 1;
119
- }
120
- }
121
- }
122
- }
123
- return std::string::npos;
124
- }
125
-
126
- // format incomplete utf-8 multibyte character for output
127
- std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
128
- {
129
- std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
130
- // if the size is 1 and first bit is 1, meaning it's a partial character
131
- // (size > 1 meaning it's already a known token)
132
- if (out.size() == 1 && (out[0] & 0x80) == 0x80)
133
- {
134
- std::stringstream ss;
135
- ss << std::hex << (out[0] & 0xff);
136
- std::string res(ss.str());
137
- out = "byte: \\x" + res;
138
- }
139
- return out;
140
- }
141
-
142
- std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end)
143
- {
144
- std::string ret;
145
- for (auto it = begin; it != end; ++it)
146
- {
147
- ret += common_token_to_piece(ctx, *it);
148
- }
149
- return ret;
150
- }
151
-
152
- llama_rn_context::~llama_rn_context() {
153
- if (ctx_sampling != nullptr) {
154
- common_sampler_free(ctx_sampling);
155
- }
156
- }
157
-
158
- void llama_rn_context::rewind() {
159
- is_interrupted = false;
160
- params.antiprompt.clear();
161
- params.sampling.grammar.clear();
162
- num_prompt_tokens = 0;
163
- num_tokens_predicted = 0;
164
- generated_text = "";
165
- generated_text.reserve(params.n_ctx);
166
- generated_token_probs.clear();
167
- truncated = false;
168
- stopped_eos = false;
169
- stopped_word = false;
170
- stopped_limit = false;
171
- stopping_word = "";
172
- incomplete = false;
173
- n_remain = 0;
174
- n_past = 0;
175
- params.sampling.n_prev = n_ctx;
176
- }
177
-
178
- bool llama_rn_context::initSampling() {
179
- if (ctx_sampling != nullptr) {
180
- common_sampler_free(ctx_sampling);
181
- }
182
- ctx_sampling = common_sampler_init(model, params.sampling);
183
- return ctx_sampling != nullptr;
184
- }
185
-
186
- bool llama_rn_context::loadModel(common_params &params_)
187
- {
188
- params = params_;
189
- llama_init = common_init_from_params(params);
190
- model = llama_init.model.get();
191
- ctx = llama_init.context.get();
192
- if (model == nullptr)
193
- {
194
- LOG_ERROR("unable to load model: %s", params_.model.path.c_str());
195
- return false;
196
- }
197
- templates = common_chat_templates_init(model, params.chat_template);
198
- n_ctx = llama_n_ctx(ctx);
199
-
200
- // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
201
- // LOG_INFO("%s\n", common_params_get_system_info(params).c_str());
202
-
203
- return true;
204
- }
205
-
206
- bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
207
- const char * tmpl = llama_model_chat_template(model, name);
208
- if (tmpl == nullptr) {
209
- return false;
210
- }
211
- return common_chat_verify_template(tmpl, use_jinja);
212
- }
213
-
214
- common_chat_params llama_rn_context::getFormattedChatWithJinja(
215
- const std::string &messages,
216
- const std::string &chat_template,
217
- const std::string &json_schema,
218
- const std::string &tools,
219
- const bool &parallel_tool_calls,
220
- const std::string &tool_choice
221
- ) const {
222
- common_chat_templates_inputs inputs;
223
- inputs.use_jinja = true;
224
- inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
225
- auto useTools = !tools.empty();
226
- if (useTools) {
227
- inputs.tools = common_chat_tools_parse_oaicompat(json::parse(tools));
228
- }
229
- inputs.parallel_tool_calls = parallel_tool_calls;
230
- if (!tool_choice.empty()) {
231
- inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
232
- }
233
- if (!json_schema.empty()) {
234
- inputs.json_schema = json::parse(json_schema);
235
- }
236
- inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
237
-
238
- // If chat_template is provided, create new one and use it (probably slow)
239
- if (!chat_template.empty()) {
240
- auto tmps = common_chat_templates_init(model, chat_template);
241
- return common_chat_templates_apply(tmps.get(), inputs);
242
- } else {
243
- return common_chat_templates_apply(templates.get(), inputs);
244
- }
245
- }
246
-
247
- std::string llama_rn_context::getFormattedChat(
248
- const std::string &messages,
249
- const std::string &chat_template
250
- ) const {
251
- common_chat_templates_inputs inputs;
252
- inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
253
- inputs.use_jinja = false;
254
-
255
- // If chat_template is provided, create new one and use it (probably slow)
256
- if (!chat_template.empty()) {
257
- auto tmps = common_chat_templates_init(model, chat_template);
258
- return common_chat_templates_apply(tmps.get(), inputs).prompt;
259
- } else {
260
- return common_chat_templates_apply(templates.get(), inputs).prompt;
261
- }
262
- }
263
-
264
- void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
265
- const int n_left = n_ctx - params.n_keep;
266
- const int n_block_size = n_left / 2;
267
- const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size;
268
-
269
- // Keep n_keep tokens at start of prompt (at most n_ctx - 4)
270
- std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
271
-
272
- new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
273
-
274
- LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d",
275
- n_ctx,
276
- params.n_keep,
277
- n_left,
278
- tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(),
279
- new_tokens.size()
280
- );
281
-
282
- truncated = true;
283
- prompt_tokens = new_tokens;
284
- }
285
-
286
- void llama_rn_context::loadPrompt() {
287
- std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
288
- num_prompt_tokens = prompt_tokens.size();
289
-
290
- // LOG tokens
291
- std::stringstream ss;
292
- ss << "\n" << __func__ << ": prompt_tokens = ";
293
- for (auto& token : prompt_tokens) {
294
- ss << token << " ";
295
- }
296
- LOG_INFO("%s\n", ss.str().c_str());
297
-
298
- if (params.n_keep < 0)
299
- {
300
- params.n_keep = (int)num_prompt_tokens;
301
- }
302
- params.n_keep = std::min(n_ctx - 4, params.n_keep);
303
-
304
- // if input prompt is too big, truncate like normal
305
- if (num_prompt_tokens >= (size_t) n_ctx)
306
- {
307
- truncatePrompt(prompt_tokens);
308
- num_prompt_tokens = prompt_tokens.size();
309
-
310
- LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx);
311
- }
312
-
313
- // do context shifitng
314
- if(!params.embedding){
315
- purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
316
- }
317
-
318
-
319
- // push the prompt into the sampling context (do not apply grammar)
320
- for (auto & token : prompt_tokens)
321
- {
322
- common_sampler_accept(ctx_sampling, token, false);
323
- }
324
-
325
- // compare the evaluated prompt with the new prompt
326
- n_past = common_part(embd, prompt_tokens);
327
-
328
- embd = prompt_tokens;
329
- if (n_past == num_prompt_tokens)
330
- {
331
- // we have to evaluate at least 1 token to generate logits.
332
- n_past--;
333
- }
334
-
335
- // since #3228 we now have to manually manage the KV cache
336
- llama_kv_self_seq_rm(ctx, 0, n_past, -1);
337
-
338
- LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
339
- n_past,
340
- tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(),
341
- tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
342
- );
343
-
344
- has_next_token = true;
345
- }
346
-
347
- void llama_rn_context::beginCompletion() {
348
- // number of tokens to keep when resetting context
349
- n_remain = params.n_predict;
350
- llama_perf_context_reset(ctx);
351
- is_predicting = true;
352
- }
353
-
354
- completion_token_output llama_rn_context::nextToken()
355
- {
356
- completion_token_output result;
357
- result.tok = -1;
358
-
359
- if (embd.size() >= (size_t)params.n_ctx)
360
- {
361
- // Shift context
362
-
363
- const int n_left = n_past - params.n_keep - 1;
364
- const int n_discard = n_left/2;
365
-
366
- llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
367
- llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
368
-
369
- for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
370
- {
371
- embd[i - n_discard] = embd[i];
372
- }
373
- embd.resize(embd.size() - n_discard);
374
-
375
- n_past -= n_discard;
376
-
377
- LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s",
378
- params.n_ctx,
379
- params.n_keep,
380
- n_left
381
- );
382
- }
383
-
384
- bool tg = true;
385
- while (n_past < embd.size())
386
- {
387
- int n_eval = (int)embd.size() - n_past;
388
- tg = n_eval == 1;
389
- if (n_eval > params.n_batch)
390
- {
391
- n_eval = params.n_batch;
392
- }
393
- if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
394
- {
395
- LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
396
- n_eval,
397
- n_past,
398
- params.cpuparams.n_threads,
399
- tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
400
- );
401
- has_next_token = false;
402
- return result;
403
- }
404
- n_past += n_eval;
405
-
406
- if(is_interrupted) {
407
- LOG_INFO("Decoding Interrupted");
408
- embd.resize(n_past);
409
- has_next_token = false;
410
- return result;
411
- }
412
- }
413
-
414
- const llama_vocab* vocab = llama_model_get_vocab(model);
415
-
416
- if (params.n_predict == 0)
417
- {
418
- has_next_token = false;
419
- result.tok = llama_vocab_eos(vocab);
420
- return result;
421
- }
422
-
423
- {
424
- // out of user input, sample next token
425
- std::vector<llama_token_data> candidates;
426
- candidates.reserve(llama_vocab_n_tokens(vocab));
427
-
428
- result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
429
-
430
- llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);
431
-
432
- const int32_t n_probs = params.sampling.n_probs;
433
-
434
- // deprecated
435
- /*if (params.sampling.temp <= 0 && n_probs > 0)
436
- {
437
- // For llama_sample_token_greedy we need to sort candidates
438
- llama_sampler_init_softmax();
439
-
440
- }*/
441
-
442
-
443
- for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
444
- {
445
- result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
446
- }
447
-
448
- common_sampler_accept(ctx_sampling, result.tok, true);
449
- if (tg) {
450
- num_tokens_predicted++;
451
- }
452
- }
453
-
454
- // add it to the context
455
- embd.push_back(result.tok);
456
- // decrement remaining sampling budget
457
- --n_remain;
458
-
459
- if (!embd.empty() && embd.back() == llama_vocab_eos(vocab))
460
- {
461
- // stopping_word = llama_token_to_piece(ctx, embd.back());
462
- has_next_token = false;
463
- stopped_eos = true;
464
- LOG_VERBOSE("eos token found", "");
465
- return result;
466
- }
467
-
468
- has_next_token = params.n_predict == -1 || n_remain != 0;
469
- return result;
470
- }
471
-
472
- size_t llama_rn_context::findStoppingStrings(const std::string &text, const size_t last_token_size,
473
- const stop_type type)
474
- {
475
- size_t stop_pos = std::string::npos;
476
- for (const std::string &word : params.antiprompt)
477
- {
478
- size_t pos;
479
- if (type == STOP_FULL)
480
- {
481
- const size_t tmp = word.size() + last_token_size;
482
- const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
483
- pos = text.find(word, from_pos);
484
- }
485
- else
486
- {
487
- pos = find_partial_stop_string(word, text);
488
- }
489
- if (pos != std::string::npos &&
490
- (stop_pos == std::string::npos || pos < stop_pos))
491
- {
492
- if (type == STOP_FULL)
493
- {
494
- stopping_word = word;
495
- stopped_word = true;
496
- has_next_token = false;
497
- }
498
- stop_pos = pos;
499
- }
500
- }
501
- return stop_pos;
502
- }
503
-
504
- completion_token_output llama_rn_context::doCompletion()
505
- {
506
- const completion_token_output token_with_probs = nextToken();
507
-
508
- const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
509
- generated_text += token_text;
510
-
511
- if (params.sampling.n_probs > 0)
512
- {
513
- generated_token_probs.push_back(token_with_probs);
514
- }
515
-
516
- // check if there is incomplete UTF-8 character at the end
517
- for (unsigned i = 1; i < 5 && i <= generated_text.size(); ++i) {
518
- unsigned char c = generated_text[generated_text.size() - i];
519
- if ((c & 0xC0) == 0x80) {
520
- // continuation byte: 10xxxxxx
521
- continue;
522
- }
523
- if ((c & 0xE0) == 0xC0) {
524
- // 2-byte character: 110xxxxx ...
525
- incomplete = i < 2;
526
- } else if ((c & 0xF0) == 0xE0) {
527
- // 3-byte character: 1110xxxx ...
528
- incomplete = i < 3;
529
- } else if ((c & 0xF8) == 0xF0) {
530
- // 4-byte character: 11110xxx ...
531
- incomplete = i < 4;
532
- }
533
- // else 1-byte character or invalid byte
534
- break;
535
- }
536
-
537
- if (incomplete && !has_next_token)
538
- {
539
- has_next_token = true;
540
- n_remain++;
541
- }
542
-
543
- if (!has_next_token && n_remain == 0)
544
- {
545
- stopped_limit = true;
546
- }
547
-
548
- LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s",
549
- common_token_to_piece(ctx, token_with_probs.tok),
550
- tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(),
551
- has_next_token,
552
- n_remain,
553
- num_tokens_predicted,
554
- stopped_eos,
555
- stopped_word,
556
- stopped_limit,
557
- stopping_word.c_str()
558
- );
559
- return token_with_probs;
560
- }
561
-
562
- std::vector<float> llama_rn_context::getEmbedding(common_params &embd_params)
563
- {
564
- static const int n_embd = llama_model_n_embd(llama_get_model(ctx));
565
- if (!embd_params.embedding)
566
- {
567
- LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding);
568
- return std::vector<float>(n_embd, 0.0f);
569
- }
570
- float *data;
571
-
572
- const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
573
- if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
574
- data = llama_get_embeddings(ctx);
575
- } else {
576
- data = llama_get_embeddings_seq(ctx, 0);
577
- }
578
-
579
- if (!data) {
580
- return std::vector<float>(n_embd, 0.0f);
581
- }
582
- std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
583
- common_embd_normalize(embedding.data(), out.data(), n_embd, embd_params.embd_normalize);
584
- return out;
585
- }
586
-
587
- std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
588
- {
589
- if (is_predicting) {
590
- LOG_ERROR("cannot benchmark while predicting", "");
591
- return std::string("[]");
592
- }
593
-
594
- is_predicting = true;
595
-
596
- double pp_avg = 0;
597
- double tg_avg = 0;
598
-
599
- double pp_std = 0;
600
- double tg_std = 0;
601
-
602
- // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30)
603
- llama_batch batch = llama_batch_init(
604
- std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch
605
- 0, // No embeddings
606
- 1 // Single sequence
607
- );
608
-
609
- for (int i = 0; i < nr; i++)
610
- {
611
- llama_batch_clear(&batch);
612
-
613
- const int n_tokens = pp;
614
-
615
- for (int i = 0; i < n_tokens; i++)
616
- {
617
- llama_batch_add(&batch, 0, i, {0}, false);
618
- }
619
- batch.logits[batch.n_tokens - 1] = 1; // true
620
-
621
- llama_kv_self_clear(ctx);
622
-
623
- const int64_t t_pp_start = llama_time_us();
624
- if (llama_decode(ctx, batch) != 0)
625
- {
626
- LOG_ERROR("llama_decode() failed during prompt", "");
627
- }
628
- const int64_t t_pp_end = llama_time_us();
629
- llama_kv_self_clear(ctx);
630
-
631
- if (is_interrupted) break;
632
-
633
- const int64_t t_tg_start = llama_time_us();
634
-
635
- for (int i = 0; i < tg; i++)
636
- {
637
- llama_batch_clear(&batch);
638
-
639
- for (int j = 0; j < pl; j++)
640
- {
641
- llama_batch_add(&batch, 0, i, {j}, true);
642
- }
643
-
644
- if (llama_decode(ctx, batch) != 0)
645
- {
646
- LOG_ERROR("llama_decode() failed during text generation", "");
647
- }
648
- if (is_interrupted) break;
649
- }
650
-
651
- const int64_t t_tg_end = llama_time_us();
652
-
653
- llama_kv_self_clear(ctx);
654
-
655
- const double t_pp = (t_pp_end - t_pp_start) / 1000000.0;
656
- const double t_tg = (t_tg_end - t_tg_start) / 1000000.0;
657
-
658
- const double speed_pp = pp / t_pp;
659
- const double speed_tg = (pl * tg) / t_tg;
660
-
661
- pp_avg += speed_pp;
662
- tg_avg += speed_tg;
663
-
664
- pp_std += speed_pp * speed_pp;
665
- tg_std += speed_tg * speed_tg;
666
- }
667
-
668
- pp_avg /= nr;
669
- tg_avg /= nr;
670
-
671
- if (nr > 1) {
672
- pp_std = sqrt(pp_std / (nr - 1) - pp_avg * pp_avg * nr / (nr - 1));
673
- tg_std = sqrt(tg_std / (nr - 1) - tg_avg * tg_avg * nr / (nr - 1));
674
- } else {
675
- pp_std = 0;
676
- tg_std = 0;
677
- }
678
-
679
- if (is_interrupted) llama_kv_self_clear(ctx);
680
- is_predicting = false;
681
-
682
- char model_desc[128];
683
- llama_model_desc(model, model_desc, sizeof(model_desc));
684
- return std::string("[\"") + model_desc + std::string("\",") +
685
- std::to_string(llama_model_size(model)) + std::string(",") +
686
- std::to_string(llama_model_n_params(model)) + std::string(",") +
687
- std::to_string(pp_avg) + std::string(",") +
688
- std::to_string(pp_std) + std::string(",") +
689
- std::to_string(tg_avg) + std::string(",") +
690
- std::to_string(tg_std) +
691
- std::string("]");
692
- }
693
-
694
- int llama_rn_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
695
- for (auto &la : lora) {
696
- la.ptr = llama_adapter_lora_init(model, la.path.c_str());
697
- if (la.ptr == nullptr) {
698
- LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str());
699
- return -1;
700
- }
701
- }
702
- this->lora = lora;
703
- common_set_adapter_lora(ctx, lora);
704
- return 0;
705
- }
706
-
707
- void llama_rn_context::removeLoraAdapters() {
708
- this->lora.clear();
709
- common_set_adapter_lora(ctx, this->lora); // apply empty list
710
- }
711
-
712
- std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
713
- return this->lora;
714
- }
715
- std::vector<int> llama_rn_context::longest_common_subseq(const std::vector<int> x, const std::vector<int> y){
716
- int m = x.size(), n = y.size();
717
-
718
- //int LCSuff[m+1][n+1];
719
- std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));
720
-
721
- for (int j = 0; j <= n; j++)
722
- LCSuff[0][j] = 0;
723
- for (int i = 0; i <= m; i++)
724
- LCSuff[i][0] = 0;
725
-
726
- for (int i = 1; i <= m; i++)
727
- {
728
- for (int j = 1; j <= n; j++)
729
- {
730
- if (x[i - 1] == y[j - 1])
731
- LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
732
- else
733
- LCSuff[i][j] = 0;
734
- }
735
- }
736
-
737
- std::vector<int> longest;
738
- for (int i = 1; i <= m; i++)
739
- {
740
- for (int j = 1; j <= n; j++)
741
- {
742
- if (LCSuff[i][j] > longest.size())
743
- {
744
- auto off1 = ((i - LCSuff[i][j] + 1) - 1);
745
- auto off2 = off1 + LCSuff[i][j];
746
- longest.clear();
747
- // std::vector<int>().swap(longest);
748
- longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
749
- // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
750
- }
751
- }
752
- }
753
- return longest;
754
- }
755
-
756
- bool llama_rn_context::arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq)
757
- {
758
- int ss = searchSeq.size();
759
- if(targetArray.size()<ss)
760
- {
761
- return false;
762
- }
763
- for(int i=0;i<ss;++i)
764
- {
765
- if(targetArray[i]!=searchSeq[i])
766
- {
767
- return false;
768
- }
769
- }
770
- return true;
771
- }
772
-
773
- int llama_rn_context::arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq)
774
- {
775
- int ss = searchSeq.size();
776
- int tas = targetArray.size();
777
- if(tas<ss)
778
- {
779
- return -1;
780
- }
781
- for(int i=0;i<tas;++i)
782
- {
783
- int srch = 0;
784
- bool fail = false;
785
- for(int srch=0;srch<ss;++srch)
786
- {
787
- if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
788
- {
789
- fail = true;
790
- break;
791
- }
792
- }
793
- if(!fail)
794
- {
795
- return i;
796
- }
797
- }
798
- return -1;
799
- }
800
-
801
- void llama_rn_context::purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
802
- {
803
- //scan from start old and new ctx, until first mismatch found, save as p0
804
- //check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
805
- //test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
806
- //if passed, save beginning of LCQ from old ctx as p1
807
- //remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal
808
-
809
- const int short_fall_threshold = 200 + (nctx/30); //dont trigger shifting if the distance between trimstart and currhead < this
810
- const int stack_allowance = 60 + (nctx/50); //in case the end text is slightly modified, be forgiving
811
-
812
- int trimstart = 0;
813
- int new_tokens_len = new_context_tokens.size();
814
- bool purge_needed = true;
815
-
816
- for (int i = 0; i < current_context_tokens.size(); ++i)
817
- {
818
- if (current_context_tokens[i] == new_context_tokens[i])
819
- {
820
- trimstart += 1;
821
- }
822
- else
823
- {
824
- break;
825
- }
826
- if ((i + 2) >= new_tokens_len)
827
- {
828
- purge_needed = false;
829
- break; //no surgery required
830
- }
831
- }
832
-
833
-
834
-
835
- if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
836
- {
837
- LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
838
- return; //no purge is needed
839
- }
840
-
841
- //at least this many tokens need to match, otherwise don't bother trimming
842
- const int lc_tok_threshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+stack_allowance), (int)(nctx*0.45)), short_fall_threshold - stack_allowance);
843
-
844
- auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
845
- auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());
846
-
847
- auto shared = longest_common_subseq(curr_ctx_without_memory, new_ctx_without_memory);
848
-
849
- if (shared.size() > lc_tok_threshold && arr_start_with(new_ctx_without_memory, shared)) // enough tokens in common
850
- {
851
- int found = arr_find_index_of(current_context_tokens,shared);
852
- if(found>=0 && found > trimstart)
853
- {
854
-
855
- //extract the unwanted tokens out from context and KV
856
- int diff = found - trimstart;
857
- llama_kv_self_seq_rm(ctx, 0, trimstart, trimstart + diff);
858
- llama_kv_self_seq_add(ctx, 0, trimstart + diff, -1, -diff);
859
-
860
- for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
861
- {
862
- current_context_tokens[i - diff] = current_context_tokens[i];
863
- }
864
-
865
- LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
866
-
867
- current_context_tokens.resize(current_context_tokens.size() - diff);
868
- }
869
- }
870
-
871
- }
872
-
873
- }
1
+ #include "rn-llama.h"
2
+
3
+ namespace rnllama {
4
+
5
+ const std::vector<lm_ggml_type> kv_cache_types = {
6
+ LM_GGML_TYPE_F32,
7
+ LM_GGML_TYPE_F16,
8
+ LM_GGML_TYPE_BF16,
9
+ LM_GGML_TYPE_Q8_0,
10
+ LM_GGML_TYPE_Q4_0,
11
+ LM_GGML_TYPE_Q4_1,
12
+ LM_GGML_TYPE_IQ4_NL,
13
+ LM_GGML_TYPE_Q5_0,
14
+ LM_GGML_TYPE_Q5_1,
15
+ };
16
+
17
+ lm_ggml_type kv_cache_type_from_str(const std::string & s) {
18
+ for (const auto & type : kv_cache_types) {
19
+ if (lm_ggml_type_name(type) == s) {
20
+ return type;
21
+ }
22
+ }
23
+ throw std::runtime_error("Unsupported cache type: " + s);
24
+ }
25
+
26
+ static void llama_batch_clear(llama_batch *batch) {
27
+ batch->n_tokens = 0;
28
+ }
29
+
30
+ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, std::vector<llama_seq_id> seq_ids, bool logits) {
31
+ batch->token [batch->n_tokens] = id;
32
+ batch->pos [batch->n_tokens] = pos;
33
+ batch->n_seq_id[batch->n_tokens] = seq_ids.size();
34
+ for (size_t i = 0; i < seq_ids.size(); i++) {
35
+ batch->seq_id[batch->n_tokens][i] = seq_ids[i];
36
+ }
37
+ batch->logits [batch->n_tokens] = logits ? 1 : 0;
38
+ batch->n_tokens += 1;
39
+ }
40
+
41
+ // NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp
42
+
43
+ static void log(const char *level, const char *function, int line,
44
+ const char *format, ...)
45
+ {
46
+ va_list args;
47
+ #if defined(__ANDROID__)
48
+ char prefix[256];
49
+ snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format);
50
+
51
+ va_start(args, format);
52
+ android_LogPriority priority;
53
+ if (strcmp(level, "ERROR") == 0) {
54
+ priority = ANDROID_LOG_ERROR;
55
+ } else if (strcmp(level, "WARNING") == 0) {
56
+ priority = ANDROID_LOG_WARN;
57
+ } else if (strcmp(level, "INFO") == 0) {
58
+ priority = ANDROID_LOG_INFO;
59
+ } else {
60
+ priority = ANDROID_LOG_DEBUG;
61
+ }
62
+ __android_log_vprint(priority, "RNLlama", prefix, args);
63
+ va_end(args);
64
+ #else
65
+ printf("[%s] %s:%d ", level, function, line);
66
+ va_start(args, format);
67
+ vprintf(format, args);
68
+ va_end(args);
69
+ printf("\n");
70
+ #endif
71
+ }
72
+
73
+ #if RNLLAMA_VERBOSE != 1
74
+ #define LOG_VERBOSE(MSG, ...)
75
+ #else
76
+ #define LOG_VERBOSE(MSG, ...) \
77
+ do \
78
+ { \
79
+ if (rnllama_verbose) \
80
+ { \
81
+ log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \
82
+ } \
83
+ } while (0)
84
+ #endif
85
+
86
+ #define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
87
+ #define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
88
+ #define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
89
+
90
+ static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
91
+ {
92
+ size_t i;
93
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
94
+ {
95
+ }
96
+ return i;
97
+ }
98
+
99
+ static bool ends_with(const std::string &str, const std::string &suffix)
100
+ {
101
+ return str.size() >= suffix.size() &&
102
+ 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
103
+ }
104
+
105
+ static size_t find_partial_stop_string(const std::string &stop,
106
+ const std::string &text)
107
+ {
108
+ if (!text.empty() && !stop.empty())
109
+ {
110
+ const char text_last_char = text.back();
111
+ for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
112
+ {
113
+ if (stop[char_index] == text_last_char)
114
+ {
115
+ const std::string current_partial = stop.substr(0, char_index + 1);
116
+ if (ends_with(text, current_partial))
117
+ {
118
+ return text.size() - char_index - 1;
119
+ }
120
+ }
121
+ }
122
+ }
123
+ return std::string::npos;
124
+ }
125
+
126
+ // format incomplete utf-8 multibyte character for output
127
+ std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
128
+ {
129
+ std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
130
+ // if the size is 1 and first bit is 1, meaning it's a partial character
131
+ // (size > 1 meaning it's already a known token)
132
+ if (out.size() == 1 && (out[0] & 0x80) == 0x80)
133
+ {
134
+ std::stringstream ss;
135
+ ss << std::hex << (out[0] & 0xff);
136
+ std::string res(ss.str());
137
+ out = "byte: \\x" + res;
138
+ }
139
+ return out;
140
+ }
141
+
142
+ std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end)
143
+ {
144
+ std::string ret;
145
+ for (auto it = begin; it != end; ++it)
146
+ {
147
+ ret += common_token_to_piece(ctx, *it);
148
+ }
149
+ return ret;
150
+ }
151
+
152
+ llama_rn_context::~llama_rn_context() {
153
+ if (ctx_sampling != nullptr) {
154
+ common_sampler_free(ctx_sampling);
155
+ }
156
+ }
157
+
158
+ void llama_rn_context::rewind() {
159
+ is_interrupted = false;
160
+ params.antiprompt.clear();
161
+ params.sampling.grammar.clear();
162
+ num_prompt_tokens = 0;
163
+ num_tokens_predicted = 0;
164
+ generated_text = "";
165
+ generated_text.reserve(params.n_ctx);
166
+ generated_token_probs.clear();
167
+ truncated = false;
168
+ stopped_eos = false;
169
+ stopped_word = false;
170
+ stopped_limit = false;
171
+ stopping_word = "";
172
+ incomplete = false;
173
+ n_remain = 0;
174
+ n_past = 0;
175
+ params.sampling.n_prev = n_ctx;
176
+ }
177
+
178
+ bool llama_rn_context::initSampling() {
179
+ if (ctx_sampling != nullptr) {
180
+ common_sampler_free(ctx_sampling);
181
+ }
182
+ ctx_sampling = common_sampler_init(model, params.sampling);
183
+ return ctx_sampling != nullptr;
184
+ }
185
+
186
+ bool llama_rn_context::loadModel(common_params &params_)
187
+ {
188
+ params = params_;
189
+ llama_init = common_init_from_params(params);
190
+ model = llama_init.model.get();
191
+ ctx = llama_init.context.get();
192
+ if (model == nullptr)
193
+ {
194
+ LOG_ERROR("unable to load model: %s", params_.model.path.c_str());
195
+ return false;
196
+ }
197
+ templates = common_chat_templates_init(model, params.chat_template);
198
+ n_ctx = llama_n_ctx(ctx);
199
+
200
+ // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
201
+ // LOG_INFO("%s\n", common_params_get_system_info(params).c_str());
202
+
203
+ return true;
204
+ }
205
+
206
+ bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
207
+ const char * tmpl = llama_model_chat_template(model, name);
208
+ if (tmpl == nullptr) {
209
+ return false;
210
+ }
211
+ return common_chat_verify_template(tmpl, use_jinja);
212
+ }
213
+
214
+ common_chat_params llama_rn_context::getFormattedChatWithJinja(
215
+ const std::string &messages,
216
+ const std::string &chat_template,
217
+ const std::string &json_schema,
218
+ const std::string &tools,
219
+ const bool &parallel_tool_calls,
220
+ const std::string &tool_choice
221
+ ) const {
222
+ common_chat_templates_inputs inputs;
223
+ inputs.use_jinja = true;
224
+ inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
225
+ auto useTools = !tools.empty();
226
+ if (useTools) {
227
+ inputs.tools = common_chat_tools_parse_oaicompat(json::parse(tools));
228
+ }
229
+ inputs.parallel_tool_calls = parallel_tool_calls;
230
+ if (!tool_choice.empty()) {
231
+ inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
232
+ }
233
+ if (!json_schema.empty()) {
234
+ inputs.json_schema = json::parse(json_schema);
235
+ }
236
+ inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
237
+
238
+ // If chat_template is provided, create new one and use it (probably slow)
239
+ if (!chat_template.empty()) {
240
+ auto tmps = common_chat_templates_init(model, chat_template);
241
+ return common_chat_templates_apply(tmps.get(), inputs);
242
+ } else {
243
+ return common_chat_templates_apply(templates.get(), inputs);
244
+ }
245
+ }
246
+
247
+ std::string llama_rn_context::getFormattedChat(
248
+ const std::string &messages,
249
+ const std::string &chat_template
250
+ ) const {
251
+ common_chat_templates_inputs inputs;
252
+ inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
253
+ inputs.use_jinja = false;
254
+
255
+ // If chat_template is provided, create new one and use it (probably slow)
256
+ if (!chat_template.empty()) {
257
+ auto tmps = common_chat_templates_init(model, chat_template);
258
+ return common_chat_templates_apply(tmps.get(), inputs).prompt;
259
+ } else {
260
+ return common_chat_templates_apply(templates.get(), inputs).prompt;
261
+ }
262
+ }
263
+
264
+ void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
265
+ const int n_left = n_ctx - params.n_keep;
266
+ const int n_block_size = n_left / 2;
267
+ const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size;
268
+
269
+ // Keep n_keep tokens at start of prompt (at most n_ctx - 4)
270
+ std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
271
+
272
+ new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
273
+
274
+ LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d",
275
+ n_ctx,
276
+ params.n_keep,
277
+ n_left,
278
+ tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(),
279
+ new_tokens.size()
280
+ );
281
+
282
+ truncated = true;
283
+ prompt_tokens = new_tokens;
284
+ }
285
+
286
+ void llama_rn_context::loadPrompt() {
287
+ std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
288
+ num_prompt_tokens = prompt_tokens.size();
289
+
290
+ // LOG tokens
291
+ std::stringstream ss;
292
+ ss << "\n" << __func__ << ": prompt_tokens = ";
293
+ for (auto& token : prompt_tokens) {
294
+ ss << token << " ";
295
+ }
296
+ LOG_INFO("%s\n", ss.str().c_str());
297
+
298
+ if (params.n_keep < 0)
299
+ {
300
+ params.n_keep = (int)num_prompt_tokens;
301
+ }
302
+ params.n_keep = std::min(n_ctx - 4, params.n_keep);
303
+
304
+ // if input prompt is too big, truncate like normal
305
+ if (num_prompt_tokens >= (size_t) n_ctx)
306
+ {
307
+ truncatePrompt(prompt_tokens);
308
+ num_prompt_tokens = prompt_tokens.size();
309
+
310
+ LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx);
311
+ }
312
+
313
+ // do context shifitng
314
+ if(!params.embedding){
315
+ purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
316
+ }
317
+
318
+
319
+ // push the prompt into the sampling context (do not apply grammar)
320
+ for (auto & token : prompt_tokens)
321
+ {
322
+ common_sampler_accept(ctx_sampling, token, false);
323
+ }
324
+
325
+ // compare the evaluated prompt with the new prompt
326
+ n_past = common_part(embd, prompt_tokens);
327
+
328
+ embd = prompt_tokens;
329
+ if (n_past == num_prompt_tokens)
330
+ {
331
+ // we have to evaluate at least 1 token to generate logits.
332
+ n_past--;
333
+ }
334
+
335
+ // since #3228 we now have to manually manage the KV cache
336
+ llama_kv_self_seq_rm(ctx, 0, n_past, -1);
337
+
338
+ LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
339
+ n_past,
340
+ tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(),
341
+ tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
342
+ );
343
+
344
+ has_next_token = true;
345
+ }
346
+
347
+ void llama_rn_context::beginCompletion() {
348
+ // number of tokens to keep when resetting context
349
+ n_remain = params.n_predict;
350
+ llama_perf_context_reset(ctx);
351
+ is_predicting = true;
352
+ }
353
+
354
+ completion_token_output llama_rn_context::nextToken()
355
+ {
356
+ completion_token_output result;
357
+ result.tok = -1;
358
+
359
+ if (embd.size() >= (size_t)params.n_ctx)
360
+ {
361
+ // Shift context
362
+
363
+ const int n_left = n_past - params.n_keep - 1;
364
+ const int n_discard = n_left/2;
365
+
366
+ llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
367
+ llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
368
+
369
+ for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
370
+ {
371
+ embd[i - n_discard] = embd[i];
372
+ }
373
+ embd.resize(embd.size() - n_discard);
374
+
375
+ n_past -= n_discard;
376
+
377
+ LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s",
378
+ params.n_ctx,
379
+ params.n_keep,
380
+ n_left
381
+ );
382
+ }
383
+
384
+ bool tg = true;
385
+ while (n_past < embd.size())
386
+ {
387
+ int n_eval = (int)embd.size() - n_past;
388
+ tg = n_eval == 1;
389
+ if (n_eval > params.n_batch)
390
+ {
391
+ n_eval = params.n_batch;
392
+ }
393
+ if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
394
+ {
395
+ LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
396
+ n_eval,
397
+ n_past,
398
+ params.cpuparams.n_threads,
399
+ tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
400
+ );
401
+ has_next_token = false;
402
+ return result;
403
+ }
404
+ n_past += n_eval;
405
+
406
+ if(is_interrupted) {
407
+ LOG_INFO("Decoding Interrupted");
408
+ embd.resize(n_past);
409
+ has_next_token = false;
410
+ return result;
411
+ }
412
+ }
413
+
414
+ const llama_vocab* vocab = llama_model_get_vocab(model);
415
+
416
+ if (params.n_predict == 0)
417
+ {
418
+ has_next_token = false;
419
+ result.tok = llama_vocab_eos(vocab);
420
+ return result;
421
+ }
422
+
423
+ {
424
+ // out of user input, sample next token
425
+ std::vector<llama_token_data> candidates;
426
+ candidates.reserve(llama_vocab_n_tokens(vocab));
427
+
428
+ result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
429
+
430
+ llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);
431
+
432
+ const int32_t n_probs = params.sampling.n_probs;
433
+
434
+ // deprecated
435
+ /*if (params.sampling.temp <= 0 && n_probs > 0)
436
+ {
437
+ // For llama_sample_token_greedy we need to sort candidates
438
+ llama_sampler_init_softmax();
439
+
440
+ }*/
441
+
442
+
443
+ for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
444
+ {
445
+ result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
446
+ }
447
+
448
+ common_sampler_accept(ctx_sampling, result.tok, true);
449
+ if (tg) {
450
+ num_tokens_predicted++;
451
+ }
452
+ }
453
+
454
+ // add it to the context
455
+ embd.push_back(result.tok);
456
+ // decrement remaining sampling budget
457
+ --n_remain;
458
+
459
+ if (!embd.empty() && embd.back() == llama_vocab_eos(vocab))
460
+ {
461
+ // stopping_word = llama_token_to_piece(ctx, embd.back());
462
+ has_next_token = false;
463
+ stopped_eos = true;
464
+ LOG_VERBOSE("eos token found", "");
465
+ return result;
466
+ }
467
+
468
+ has_next_token = params.n_predict == -1 || n_remain != 0;
469
+ return result;
470
+ }
471
+
472
+ size_t llama_rn_context::findStoppingStrings(const std::string &text, const size_t last_token_size,
473
+ const stop_type type)
474
+ {
475
+ size_t stop_pos = std::string::npos;
476
+ for (const std::string &word : params.antiprompt)
477
+ {
478
+ size_t pos;
479
+ if (type == STOP_FULL)
480
+ {
481
+ const size_t tmp = word.size() + last_token_size;
482
+ const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
483
+ pos = text.find(word, from_pos);
484
+ }
485
+ else
486
+ {
487
+ pos = find_partial_stop_string(word, text);
488
+ }
489
+ if (pos != std::string::npos &&
490
+ (stop_pos == std::string::npos || pos < stop_pos))
491
+ {
492
+ if (type == STOP_FULL)
493
+ {
494
+ stopping_word = word;
495
+ stopped_word = true;
496
+ has_next_token = false;
497
+ }
498
+ stop_pos = pos;
499
+ }
500
+ }
501
+ return stop_pos;
502
+ }
503
+
504
+ completion_token_output llama_rn_context::doCompletion()
505
+ {
506
+ const completion_token_output token_with_probs = nextToken();
507
+
508
+ const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
509
+ generated_text += token_text;
510
+
511
+ if (params.sampling.n_probs > 0)
512
+ {
513
+ generated_token_probs.push_back(token_with_probs);
514
+ }
515
+
516
+ // check if there is incomplete UTF-8 character at the end
517
+ for (unsigned i = 1; i < 5 && i <= generated_text.size(); ++i) {
518
+ unsigned char c = generated_text[generated_text.size() - i];
519
+ if ((c & 0xC0) == 0x80) {
520
+ // continuation byte: 10xxxxxx
521
+ continue;
522
+ }
523
+ if ((c & 0xE0) == 0xC0) {
524
+ // 2-byte character: 110xxxxx ...
525
+ incomplete = i < 2;
526
+ } else if ((c & 0xF0) == 0xE0) {
527
+ // 3-byte character: 1110xxxx ...
528
+ incomplete = i < 3;
529
+ } else if ((c & 0xF8) == 0xF0) {
530
+ // 4-byte character: 11110xxx ...
531
+ incomplete = i < 4;
532
+ }
533
+ // else 1-byte character or invalid byte
534
+ break;
535
+ }
536
+
537
+ if (incomplete && !has_next_token)
538
+ {
539
+ has_next_token = true;
540
+ n_remain++;
541
+ }
542
+
543
+ if (!has_next_token && n_remain == 0)
544
+ {
545
+ stopped_limit = true;
546
+ }
547
+
548
+ LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s",
549
+ common_token_to_piece(ctx, token_with_probs.tok),
550
+ tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(),
551
+ has_next_token,
552
+ n_remain,
553
+ num_tokens_predicted,
554
+ stopped_eos,
555
+ stopped_word,
556
+ stopped_limit,
557
+ stopping_word.c_str()
558
+ );
559
+ return token_with_probs;
560
+ }
561
+
562
+ std::vector<float> llama_rn_context::getEmbedding(common_params &embd_params)
563
+ {
564
+ static const int n_embd = llama_model_n_embd(llama_get_model(ctx));
565
+ if (!embd_params.embedding)
566
+ {
567
+ LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding);
568
+ return std::vector<float>(n_embd, 0.0f);
569
+ }
570
+ float *data;
571
+
572
+ const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
573
+ if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
574
+ data = llama_get_embeddings(ctx);
575
+ } else {
576
+ data = llama_get_embeddings_seq(ctx, 0);
577
+ }
578
+
579
+ if (!data) {
580
+ return std::vector<float>(n_embd, 0.0f);
581
+ }
582
+ std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
583
+ common_embd_normalize(embedding.data(), out.data(), n_embd, embd_params.embd_normalize);
584
+ return out;
585
+ }
586
+
587
+ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
588
+ {
589
+ if (is_predicting) {
590
+ LOG_ERROR("cannot benchmark while predicting", "");
591
+ return std::string("[]");
592
+ }
593
+
594
+ is_predicting = true;
595
+
596
+ double pp_avg = 0;
597
+ double tg_avg = 0;
598
+
599
+ double pp_std = 0;
600
+ double tg_std = 0;
601
+
602
+ // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30)
603
+ llama_batch batch = llama_batch_init(
604
+ std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch
605
+ 0, // No embeddings
606
+ 1 // Single sequence
607
+ );
608
+
609
+ for (int i = 0; i < nr; i++)
610
+ {
611
+ llama_batch_clear(&batch);
612
+
613
+ const int n_tokens = pp;
614
+
615
+ for (int i = 0; i < n_tokens; i++)
616
+ {
617
+ llama_batch_add(&batch, 0, i, {0}, false);
618
+ }
619
+ batch.logits[batch.n_tokens - 1] = 1; // true
620
+
621
+ llama_kv_self_clear(ctx);
622
+
623
+ const int64_t t_pp_start = llama_time_us();
624
+ if (llama_decode(ctx, batch) != 0)
625
+ {
626
+ LOG_ERROR("llama_decode() failed during prompt", "");
627
+ }
628
+ const int64_t t_pp_end = llama_time_us();
629
+ llama_kv_self_clear(ctx);
630
+
631
+ if (is_interrupted) break;
632
+
633
+ const int64_t t_tg_start = llama_time_us();
634
+
635
+ for (int i = 0; i < tg; i++)
636
+ {
637
+ llama_batch_clear(&batch);
638
+
639
+ for (int j = 0; j < pl; j++)
640
+ {
641
+ llama_batch_add(&batch, 0, i, {j}, true);
642
+ }
643
+
644
+ if (llama_decode(ctx, batch) != 0)
645
+ {
646
+ LOG_ERROR("llama_decode() failed during text generation", "");
647
+ }
648
+ if (is_interrupted) break;
649
+ }
650
+
651
+ const int64_t t_tg_end = llama_time_us();
652
+
653
+ llama_kv_self_clear(ctx);
654
+
655
+ const double t_pp = (t_pp_end - t_pp_start) / 1000000.0;
656
+ const double t_tg = (t_tg_end - t_tg_start) / 1000000.0;
657
+
658
+ const double speed_pp = pp / t_pp;
659
+ const double speed_tg = (pl * tg) / t_tg;
660
+
661
+ pp_avg += speed_pp;
662
+ tg_avg += speed_tg;
663
+
664
+ pp_std += speed_pp * speed_pp;
665
+ tg_std += speed_tg * speed_tg;
666
+ }
667
+
668
+ pp_avg /= nr;
669
+ tg_avg /= nr;
670
+
671
+ if (nr > 1) {
672
+ pp_std = sqrt(pp_std / (nr - 1) - pp_avg * pp_avg * nr / (nr - 1));
673
+ tg_std = sqrt(tg_std / (nr - 1) - tg_avg * tg_avg * nr / (nr - 1));
674
+ } else {
675
+ pp_std = 0;
676
+ tg_std = 0;
677
+ }
678
+
679
+ if (is_interrupted) llama_kv_self_clear(ctx);
680
+ is_predicting = false;
681
+
682
+ char model_desc[128];
683
+ llama_model_desc(model, model_desc, sizeof(model_desc));
684
+ return std::string("[\"") + model_desc + std::string("\",") +
685
+ std::to_string(llama_model_size(model)) + std::string(",") +
686
+ std::to_string(llama_model_n_params(model)) + std::string(",") +
687
+ std::to_string(pp_avg) + std::string(",") +
688
+ std::to_string(pp_std) + std::string(",") +
689
+ std::to_string(tg_avg) + std::string(",") +
690
+ std::to_string(tg_std) +
691
+ std::string("]");
692
+ }
693
+
694
+ int llama_rn_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
695
+ for (auto &la : lora) {
696
+ la.ptr = llama_adapter_lora_init(model, la.path.c_str());
697
+ if (la.ptr == nullptr) {
698
+ LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str());
699
+ return -1;
700
+ }
701
+ }
702
+ this->lora = lora;
703
+ common_set_adapter_lora(ctx, lora);
704
+ return 0;
705
+ }
706
+
707
+ void llama_rn_context::removeLoraAdapters() {
708
+ this->lora.clear();
709
+ common_set_adapter_lora(ctx, this->lora); // apply empty list
710
+ }
711
+
712
+ std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
713
+ return this->lora;
714
+ }
715
+ std::vector<int> llama_rn_context::longest_common_subseq(const std::vector<int> x, const std::vector<int> y){
716
+ int m = x.size(), n = y.size();
717
+
718
+ //int LCSuff[m+1][n+1];
719
+ std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));
720
+
721
+ for (int j = 0; j <= n; j++)
722
+ LCSuff[0][j] = 0;
723
+ for (int i = 0; i <= m; i++)
724
+ LCSuff[i][0] = 0;
725
+
726
+ for (int i = 1; i <= m; i++)
727
+ {
728
+ for (int j = 1; j <= n; j++)
729
+ {
730
+ if (x[i - 1] == y[j - 1])
731
+ LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
732
+ else
733
+ LCSuff[i][j] = 0;
734
+ }
735
+ }
736
+
737
+ std::vector<int> longest;
738
+ for (int i = 1; i <= m; i++)
739
+ {
740
+ for (int j = 1; j <= n; j++)
741
+ {
742
+ if (LCSuff[i][j] > longest.size())
743
+ {
744
+ auto off1 = ((i - LCSuff[i][j] + 1) - 1);
745
+ auto off2 = off1 + LCSuff[i][j];
746
+ longest.clear();
747
+ // std::vector<int>().swap(longest);
748
+ longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
749
+ // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
750
+ }
751
+ }
752
+ }
753
+ return longest;
754
+ }
755
+
756
+ bool llama_rn_context::arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq)
757
+ {
758
+ int ss = searchSeq.size();
759
+ if(targetArray.size()<ss)
760
+ {
761
+ return false;
762
+ }
763
+ for(int i=0;i<ss;++i)
764
+ {
765
+ if(targetArray[i]!=searchSeq[i])
766
+ {
767
+ return false;
768
+ }
769
+ }
770
+ return true;
771
+ }
772
+
773
+ int llama_rn_context::arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq)
774
+ {
775
+ int ss = searchSeq.size();
776
+ int tas = targetArray.size();
777
+ if(tas<ss)
778
+ {
779
+ return -1;
780
+ }
781
+ for(int i=0;i<tas;++i)
782
+ {
783
+ int srch = 0;
784
+ bool fail = false;
785
+ for(int srch=0;srch<ss;++srch)
786
+ {
787
+ if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
788
+ {
789
+ fail = true;
790
+ break;
791
+ }
792
+ }
793
+ if(!fail)
794
+ {
795
+ return i;
796
+ }
797
+ }
798
+ return -1;
799
+ }
800
+
801
+ void llama_rn_context::purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
802
+ {
803
+ //scan from start old and new ctx, until first mismatch found, save as p0
804
+ //check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
805
+ //test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
806
+ //if passed, save beginning of LCQ from old ctx as p1
807
+ //remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal
808
+
809
+ const int short_fall_threshold = 200 + (nctx/30); //dont trigger shifting if the distance between trimstart and currhead < this
810
+ const int stack_allowance = 60 + (nctx/50); //in case the end text is slightly modified, be forgiving
811
+
812
+ int trimstart = 0;
813
+ int new_tokens_len = new_context_tokens.size();
814
+ bool purge_needed = true;
815
+
816
+ for (int i = 0; i < current_context_tokens.size(); ++i)
817
+ {
818
+ if (current_context_tokens[i] == new_context_tokens[i])
819
+ {
820
+ trimstart += 1;
821
+ }
822
+ else
823
+ {
824
+ break;
825
+ }
826
+ if ((i + 2) >= new_tokens_len)
827
+ {
828
+ purge_needed = false;
829
+ break; //no surgery required
830
+ }
831
+ }
832
+
833
+
834
+
835
+ if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
836
+ {
837
+ LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
838
+ return; //no purge is needed
839
+ }
840
+
841
+ //at least this many tokens need to match, otherwise don't bother trimming
842
+ const int lc_tok_threshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+stack_allowance), (int)(nctx*0.45)), short_fall_threshold - stack_allowance);
843
+
844
+ auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
845
+ auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());
846
+
847
+ auto shared = longest_common_subseq(curr_ctx_without_memory, new_ctx_without_memory);
848
+
849
+ if (shared.size() > lc_tok_threshold && arr_start_with(new_ctx_without_memory, shared)) // enough tokens in common
850
+ {
851
+ int found = arr_find_index_of(current_context_tokens,shared);
852
+ if(found>=0 && found > trimstart)
853
+ {
854
+
855
+ //extract the unwanted tokens out from context and KV
856
+ int diff = found - trimstart;
857
+ llama_kv_self_seq_rm(ctx, 0, trimstart, trimstart + diff);
858
+ llama_kv_self_seq_add(ctx, 0, trimstart + diff, -1, -diff);
859
+
860
+ for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
861
+ {
862
+ current_context_tokens[i - diff] = current_context_tokens[i];
863
+ }
864
+
865
+ LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
866
+
867
+ current_context_tokens.resize(current_context_tokens.size() - diff);
868
+ }
869
+ }
870
+
871
+ }
872
+
873
+ }