cui-llama.rn 1.7.4 → 1.7.6

This diff compares the contents of two publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
  4. package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/ggml-cpu/quants.c
@@ -0,0 +1,1158 @@
+#define LM_GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "ggml-quants.h"
+#include "quants.h"
+
+#include "arch-fallback.h"
+
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for LM_GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED LM_GGML_UNUSED
+
+void quantize_row_q4_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k) {
+    quantize_row_q4_0_ref(x, y, k);
+}
+
+void quantize_row_q4_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k) {
+    quantize_row_q4_1_ref(x, y, k);
+}
+
+void quantize_row_q5_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k) {
+    quantize_row_q5_0_ref(x, y, k);
+}
+
+void quantize_row_q5_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k) {
+    quantize_row_q5_1_ref(x, y, k);
+}
+
+void quantize_row_q8_0_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k) {
+    quantize_row_q8_0_ref(x, y, k);
+}
+
+void quantize_row_q8_1_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k) {
+    quantize_row_q8_1_ref(x, y, k);
+}
+
+//
+// 2-6 bit quantization in super-blocks
+//
+
+//========================- 2-bit (de)-quantization
+
+void quantize_row_q2_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
+    quantize_row_q2_K_ref(x, vy, k);
+}
+
+//========================= 3-bit (de)-quantization
+
+void quantize_row_q3_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
+    quantize_row_q3_K_ref(x, vy, k);
+}
+
+// ====================== 4-bit (de)-quantization
+
+void quantize_row_q4_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_q4_K * LM_GGML_RESTRICT y = vy;
+    quantize_row_q4_K_ref(x, y, k);
+}
+
+// ====================== 5-bit (de)-quantization
+
+void quantize_row_q5_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_q5_K * LM_GGML_RESTRICT y = vy;
+    quantize_row_q5_K_ref(x, y, k);
+}
+
+// ====================== 6-bit (de)-quantization
+
+void quantize_row_q6_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_q6_K * LM_GGML_RESTRICT y = vy;
+    quantize_row_q6_K_ref(x, y, k);
+}
+
+// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
+
+void quantize_row_tq1_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_tq1_0 * LM_GGML_RESTRICT y = vy;
+    quantize_row_tq1_0_ref(x, y, k);
+}
+
+void quantize_row_tq2_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_tq2_0 * LM_GGML_RESTRICT y = vy;
+    quantize_row_tq2_0_ref(x, y, k);
+}
+
+//===================================== Q8_K ==============================================
+
+void quantize_row_q8_K_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k) {
+    quantize_row_q8_K_ref(x, y, k);
+}
+
+//===================================== Dot products =================================
+
+void lm_ggml_vec_dot_q4_0_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_0 * LM_GGML_RESTRICT x = vx;
+    const block_q8_0 * LM_GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0;
+
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
+            const int v1 = (x[ib].qs[j] >> 4) - 8;
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += sumi*LM_GGML_CPU_FP16_TO_FP32(x[ib].d)*LM_GGML_CPU_FP16_TO_FP32(y[ib].d);
+    }
+
+    *s = sumf;
+}
+
+// TODO: add WASM SIMD
+void lm_ggml_vec_dot_q4_1_q8_1_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_1 * LM_GGML_RESTRICT x = vx;
+    const block_q8_1 * LM_GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0;
+
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F);
+            const int v1 = (x[ib].qs[j] >> 4);
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (LM_GGML_CPU_FP16_TO_FP32(x[ib].d)*LM_GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_CPU_FP16_TO_FP32(x[ib].m)*LM_GGML_CPU_FP16_TO_FP32(y[ib].s);
+    }
+
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_q5_0_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    int ib = 0;
+    float sumf = 0;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_0 * LM_GGML_RESTRICT x = vx;
+    const block_q8_0 * LM_GGML_RESTRICT y = vy;
+
+    for (; ib < nb; ++ib) {
+        uint32_t qh;
+        memcpy(&qh, x[ib].qh, sizeof(qh));
+
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
+
+            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
+            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
+
+            sumi0 += (x0 * y[ib].qs[j]);
+            sumi1 += (x1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (LM_GGML_CPU_FP16_TO_FP32(x[ib].d)*LM_GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
+    }
+
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_q5_1_q8_1_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    int ib = 0;
+    float sumf = 0;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_1);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_1 * LM_GGML_RESTRICT x = vx;
+    const block_q8_1 * LM_GGML_RESTRICT y = vy;
+
+    for (; ib < nb; ++ib) {
+        uint32_t qh;
+        memcpy(&qh, x[ib].qh, sizeof(qh));
+
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
+            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
+
+            sumi0 += (x0 * y[ib].qs[j]);
+            sumi1 += (x1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (LM_GGML_CPU_FP16_TO_FP32(x[ib].d)*LM_GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_CPU_FP16_TO_FP32(x[ib].m)*LM_GGML_CPU_FP16_TO_FP32(y[ib].s);
+    }
+
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_q8_0_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q8_0 * LM_GGML_RESTRICT x = vx;
+    const block_q8_0 * LM_GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0;
+
+    for (; ib < nb; ++ib) {
+        int sumi = 0;
+
+        for (int j = 0; j < qk; j++) {
+            sumi += x[ib].qs[j]*y[ib].qs[j];
+        }
+
+        sumf += sumi*(LM_GGML_CPU_FP16_TO_FP32(x[ib].d)*LM_GGML_CPU_FP16_TO_FP32(y[ib].d));
+    }
+
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_tq1_0_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_tq1_0 * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
+
+    float sumf = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        int sum = 0;
+
+        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
+            for (size_t l = 0; l < 5; ++l) {
+                for (size_t m = 0; m < 32; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[l];
+                    uint16_t xi = ((uint16_t) q * 3) >> 8;
+                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
+                }
+            }
+        }
+        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
+            for (size_t l = 0; l < 5; ++l) {
+                for (size_t m = 0; m < 16; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[l];
+                    uint16_t xi = ((uint16_t) q * 3) >> 8;
+                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
+                }
+            }
+        }
+
+        for (size_t l = 0; l < 4; ++l) {
+            for (size_t j = 0; j < sizeof(x->qh); ++j) {
+                uint8_t q = x[i].qh[j] * pow3[l];
+                uint16_t xi = ((uint16_t) q * 3) >> 8;
+                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
+            }
+        }
+
+        sumf += (float) sum * (LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
+    }
+
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_tq2_0_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_tq2_0 * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+    float sumf = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        int32_t sumi = 0;
+
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            for (size_t l = 0; l < 4; ++l) {
+                for (size_t k = 0; k < 32; ++k) {
+                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
+                }
+            }
+        }
+
+        const float d = y[i].d * LM_GGML_CPU_FP16_TO_FP32(x[i].d);
+
+        sumf += (float) sumi * d;
+    }
+
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_q2_K_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q2_K * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * q2 = x[i].qs;
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        int summs = 0;
+        for (int j = 0; j < 16; ++j) {
+            summs += y[i].bsums[j] * (sc[j] >> 4);
+        }
+
+        const float dall = y[i].d * LM_GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * LM_GGML_CPU_FP16_TO_FP32(x[i].dmin);
+
+        int isum = 0;
+        int is = 0;
+        int d;
+        for (int k = 0; k < QK_K/128; ++k) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                d = sc[is++] & 0xF;
+                int isuml = 0;
+                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
+                isum += d * isuml;
+                d = sc[is++] & 0xF;
+                isuml = 0;
+                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
+                isum += d * isuml;
+                shift += 2;
+                q8 += 32;
+            }
+            q2 += 32;
+        }
+        sumf += dall * isum - dmin * summs;
+    }
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_q3_K_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+
+    const block_q3_K * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    // scalar version
+    // This function is written like this so the compiler can manage to vectorize most of it
+    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
+    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
+    // The ideal situation would be if we could just write the code once, and the compiler would
+    // automatically produce the best possible set of machine instructions, instead of us having to manually
+    // write vectorized versions for AVX, ARM_NEON, etc.
+
+    int8_t aux8[QK_K];
+    int16_t aux16[8];
+    float sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    uint32_t auxs[4];
+    const int8_t * scales = (const int8_t*)auxs;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * LM_GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * LM_GGML_RESTRICT hm = x[i].hmask;
+        const int8_t * LM_GGML_RESTRICT q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * LM_GGML_RESTRICT a = aux8;
+        uint8_t m = 1;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            q3 += 32;
+        }
+        a = aux8;
+
+        memcpy(auxs, x[i].scales, 12);
+        uint32_t tmp = auxs[2];
+        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+        for (int j = 0; j < QK_K/16; ++j) {
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_q4_K_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_K * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    uint32_t utmp[4];
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins = (const uint8_t*)&utmp[2];
+
+    int8_t aux8[QK_K];
+    int16_t aux16[8];
+    float sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * LM_GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * LM_GGML_RESTRICT q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * LM_GGML_RESTRICT a = aux8;
+        for (int j = 0; j < QK_K/64; ++j) {
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
+            a += 32;
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
+            a += 32; q4 += 32;
+        }
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        int sumi = 0;
+        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            int32_t scale = scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+        const float dmin = LM_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        sumf -= dmin * sumi;
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_q5_K_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_K * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    uint32_t utmp[4];
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins = (const uint8_t*)&utmp[2];
+
+    int8_t aux8[QK_K];
+    int16_t aux16[8];
+    float sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * LM_GGML_RESTRICT q4 = x[i].qs;
+        const uint8_t * LM_GGML_RESTRICT hm = x[i].qh;
+        const int8_t * LM_GGML_RESTRICT q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * LM_GGML_RESTRICT a = aux8;
+        uint8_t m = 1;
+        for (int j = 0; j < QK_K/64; ++j) {
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
+            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
+            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
+            a += 32; m <<= 1;
+            q4 += 32;
+        }
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        int sumi = 0;
+        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            int32_t scale = scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+        const float dmin = LM_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        sumf -= dmin * sumi;
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_q6_K_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q6_K * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    int8_t aux8[QK_K];
+    int16_t aux16[8];
+    float sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * LM_GGML_RESTRICT q4 = x[i].ql;
+        const uint8_t * LM_GGML_RESTRICT qh = x[i].qh;
+        const int8_t * LM_GGML_RESTRICT q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * LM_GGML_RESTRICT a = aux8;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                a[l + 64] = (int8_t)((q4[l +  0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+            }
+            a += 128;
+            q4 += 64;
+            qh += 32;
+        }
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            int scale = x[i].scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq2_xxs * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    uint32_t aux32[2];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * LM_GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * LM_GGML_RESTRICT q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(aux32, q2, 2*sizeof(uint32_t));
+            q2 += 4;
+            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
+                const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
+                for (int j = 0; j < 8; ++j) {
+                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += sumi * ls;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.125f * sumf;
+}
+
+void lm_ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq2_xs * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * LM_GGML_RESTRICT q2 = x[i].qs;
+        const uint8_t * LM_GGML_RESTRICT sc = x[i].scales;
+        const int8_t * LM_GGML_RESTRICT q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
+            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 2; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
+                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
+                for (int j = 0; j < 8; ++j) {
+                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += sumi * ls1;
+            sumi = 0;
+            for (int l = 2; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
+                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
+                for (int j = 0; j < 8; ++j) {
+                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += sumi * ls2;
+            q2 += 4;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.125f * sumf;
+}
+
+void lm_ggml_vec_dot_iq2_s_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq2_s * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const float d = LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint8_t * signs = qs + QK_K/8;
+
+        int bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
+            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
+            int sumi1 = 0, sumi2 = 0;
+            for (int l = 0; l < 2; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            for (int l = 2; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += ls1 * sumi1 + ls2 * sumi2;
+            qs += 4;
+            signs += 4;
+        }
+
+        sumf += d * bsum;
+    }
+
+    *s = 0.125f * sumf;
+}
+
+void lm_ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq3_xxs * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    uint32_t aux32;
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * LM_GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * LM_GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t * LM_GGML_RESTRICT q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
+            const uint32_t ls = 2*(aux32 >> 28) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
+                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
+                const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            q3 += 8;
+            bsum += sumi * ls;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.25f * sumf;
+}
+
+void lm_ggml_vec_dot_iq3_s_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq3_s * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * LM_GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * LM_GGML_RESTRICT qh = x[i].qh;
+        const uint8_t * LM_GGML_RESTRICT signs = x[i].signs;
+        const int8_t * LM_GGML_RESTRICT q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
+            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            qs += 8;
+            signs += 4;
+            bsum += sumi * ls1;
+            sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            qs += 8;
+            signs += 4;
+            bsum += sumi * ls2;
+        }
+        sumf += d * bsum;
+    }
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_iq1_s_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_s * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        int sumi = 0, sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
+            const int delta = qh[ib] & 0x8000 ? -1 : 1;
+            int lsum = 0;
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
+                for (int j = 0; j < 8; ++j) {
+                    lsum += q8[j] * grid[j];
+                }
+                q8 += 8;
+            }
+            sumi += ls * lsum;
+            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
+            qs += 4;
+        }
+
+        sumf += LM_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+    }
+
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_iq1_m_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_m * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    iq1m_scale_t scale;
+
+    int sum1[2], sum2[2], delta[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            delta[0] = qh[0] & 0x08 ? -1 : 1;
+            delta[1] = qh[0] & 0x80 ? -1 : 1;
+            delta[2] = qh[1] & 0x08 ? -1 : 1;
+            delta[3] = qh[1] & 0x80 ? -1 : 1;
+            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
+                int lsum1 = 0, lsum2 = 0;
+                for (int j = 0; j < 8; ++j) {
+                    lsum1 += q8[j] * grid[j];
+                    lsum2 += q8[j];
+                }
+                q8 += 8;
+                sum1[l/2] += lsum1;
+                sum2[l/2] += lsum2*delta[l];
+            }
+
+            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
+            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
+
+            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
+            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
+            qs += 4;
+            qh += 2;
+        }
+
+        sumf += LM_GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+    }
+
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK4_NL == 0);
+    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
+
+    const block_iq4_nl * LM_GGML_RESTRICT x = vx;
+    const block_q8_0 * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK4_NL;
+
+    int ib = 0;
+    float sumf = 0;
+
+    for (; ib < nb; ++ib) {
+        const float d = LM_GGML_CPU_FP16_TO_FP32(y[ib].d)*LM_GGML_CPU_FP16_TO_FP32(x[ib].d);
+        int sumi1 = 0, sumi2 = 0;
+        for (int j = 0; j < QK4_NL/2; ++j) {
+            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
+void lm_ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_K == 0);
+
+    const block_iq4_xs * LM_GGML_RESTRICT x = vx;
+    const block_q8_K * LM_GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    float sumf = 0;
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const float d4d8 = LM_GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        uint16_t h = x[ibl].scales_h;
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t * q8 = y[ibl].qs;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
+            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
+            h >>= 4;
+            const float d1 = d4d8*(ls1 - 32);
+            const float d2 = d4d8*(ls2 - 32);
+            int sumi1 = 0, sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d1 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+            sumi1 = sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d2 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+        }
+    }
+    *s = sumf;
+}
+
+// ============================ 4-bit non-linear quants
+
+void quantize_row_iq4_nl(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k) {
+    assert(k % QK4_NL == 0);
+    quantize_row_iq4_nl_ref(x, y, k);
+}
+
+void quantize_row_iq4_xs(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq4_xs(x, y, 1, k, NULL);
+}