cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
  4. package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
@@ -0,0 +1,1571 @@
1
+ #define LM_GGML_COMMON_IMPL_CPP
2
+ #define LM_GGML_COMMON_DECL_CPP
3
+ #include "ggml-common.h"
4
+ #include "ggml-backend-impl.h"
5
+
6
+ #include "ggml-impl.h"
7
+ #include "ggml-cpu.h"
8
+ #include "ggml-cpu-impl.h"
9
+ #include "simd-mappings.h"
10
+ #include "traits.h"
11
+
12
+ #include "arch-fallback.h"
13
+
14
+ #include <cmath>
15
+ #include <cstring>
16
+ #include <cassert>
17
+ #include <cstdlib> // for qsort
18
+ #include <cstdio> // for LM_GGML_ASSERT
19
+
20
+ #include "repack.h"
21
+
22
+ #if defined(__GNUC__)
23
+ #pragma GCC diagnostic ignored "-Woverlength-strings"
24
+ #endif
25
+
26
+ #define UNUSED LM_GGML_UNUSED
27
+
28
// Rounds `fval` to an int using the floating-point unit's current rounding mode.
//
// Adding 1.5 * 2^23 (12582912) moves the value into a range where consecutive
// floats are exactly 1 apart, so the addition itself performs the rounding and
// the result can be read back from the low 23 mantissa bits.
// Precondition (checked by assert): |fval| <= 4194303.
static inline int nearest_int(float fval) {
    assert(fabsf(fval) <= 4194303.f);

    const float shifted = fval + 12582912.f;

    int raw_bits;
    memcpy(&raw_bits, &shifted, sizeof(int));

    const int mantissa = raw_bits & 0x007fffff;  // keep the 23 mantissa bits
    return mantissa - 0x00400000;                // remove the 2^22 offset left by the 1.5 * 2^23 bias
}
34
+
35
+ // Functions to create the interleaved data layout formats
36
+
37
+ // interleave 4 block_q4_0s in blocks of blck_size_interleave
38
+ // returns an interleaved block_q4_0x4
39
+ // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
40
+ // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
41
+ //
42
+ // - in : an array of block_q4_0 pointers
43
+ // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
44
+ // blck_size_interleave bytes
45
+ // - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
46
+ // from bias offset form to pure sign form (this saves subtract
47
+ // operations during unpacking)
48
+ //
49
+
50
+ extern "C" {
51
+
52
+ void lm_ggml_quantize_mat_q8_0_4x4_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
53
+ assert(QK8_0 == 32);
54
+ assert(k % QK8_0 == 0);
55
+ const int nb = k / QK8_0;
56
+
57
+ block_q8_0x4 * LM_GGML_RESTRICT y = (block_q8_0x4 *) vy;
58
+
59
+ // scalar
60
+ const int blck_size_interleave = 4;
61
+ float srcv[4][QK8_0];
62
+ float id[4];
63
+
64
+ for (int i = 0; i < nb; i++) {
65
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
66
+ float amax = 0.0f; // absolute max
67
+
68
+ for (int j = 0; j < QK8_0; j++) {
69
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
70
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
71
+ }
72
+
73
+ const float d = amax / ((1 << 7) - 1);
74
+ id[row_iter] = d ? 1.0f / d : 0.0f;
75
+
76
+ y[i].d[row_iter] = LM_GGML_CPU_FP32_TO_FP16(d);
77
+ }
78
+
79
+ for (int j = 0; j < QK8_0 * 4; j++) {
80
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
81
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
82
+ src_offset += (j % blck_size_interleave);
83
+
84
+ float x0 = srcv[src_id][src_offset] * id[src_id];
85
+ y[i].qs[j] = roundf(x0);
86
+ }
87
+ }
88
+ }
89
+
90
+ void lm_ggml_quantize_mat_q8_0_4x8_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
91
+ assert(QK8_0 == 32);
92
+ assert(k % QK8_0 == 0);
93
+ const int nb = k / QK8_0;
94
+
95
+ block_q8_0x4 * LM_GGML_RESTRICT y = (block_q8_0x4 *) vy;
96
+
97
+ // scalar
98
+ const int blck_size_interleave = 8;
99
+ float srcv[4][QK8_0];
100
+ float id[4];
101
+
102
+ for (int i = 0; i < nb; i++) {
103
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
104
+ float amax = 0.0f; // absolute max
105
+
106
+ for (int j = 0; j < QK8_0; j++) {
107
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
108
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
109
+ }
110
+
111
+ const float d = amax / ((1 << 7) - 1);
112
+ id[row_iter] = d ? 1.0f / d : 0.0f;
113
+
114
+ y[i].d[row_iter] = LM_GGML_CPU_FP32_TO_FP16(d);
115
+ }
116
+
117
+ for (int j = 0; j < QK8_0 * 4; j++) {
118
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
119
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
120
+ src_offset += (j % blck_size_interleave);
121
+
122
+ float x0 = srcv[src_id][src_offset] * id[src_id];
123
+ y[i].qs[j] = roundf(x0);
124
+ }
125
+ }
126
+ }
127
+
128
+ void lm_ggml_quantize_mat_q8_K_4x8_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
129
+ assert(QK_K == 256);
130
+ assert(k % QK_K == 0);
131
+ const int nb = k / QK_K;
132
+
133
+ block_q8_Kx4 * LM_GGML_RESTRICT y = (block_q8_Kx4 *) vy;
134
+
135
+ // scalar
136
+ const int blck_size_interleave = 8;
137
+ float srcv[4][QK_K];
138
+ float iscale[4];
139
+
140
+ for (int i = 0; i < nb; i++) {
141
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
142
+ float amax = 0.0f; // absolute max
143
+ float max = 0;
144
+
145
+ for (int j = 0; j < QK_K; j++) {
146
+ srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
147
+ // Update the maximum value of the corresponding super block
148
+ if(amax < fabsf(srcv[row_iter][j])) {
149
+ amax = fabsf(srcv[row_iter][j]);
150
+ max = srcv[row_iter][j];
151
+ }
152
+ }
153
+
154
+ iscale[row_iter] = amax ? -127.f/max : 0;
155
+
156
+ y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
157
+ }
158
+
159
+ for (int j = 0; j < QK_K / 4; j++) {
160
+ y[i].bsums[j] = 0;
161
+ }
162
+
163
+ // Quants values are interleaved in sequence of eight bytes from corresponding super blocks
164
+ // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
165
+ // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
166
+ for (int j = 0; j < QK_K * 4; j++) {
167
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
168
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
169
+ src_offset += (j % blck_size_interleave);
170
+ int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
171
+
172
+ float x0 = srcv[src_id][src_offset] * iscale[src_id];
173
+ y[i].qs[j] = nearest_int(x0);
174
+ y[i].bsums[index] += y[i].qs[j];
175
+ }
176
+ }
177
+ }
178
+
179
+ } // extern "C"
180
+
181
+ template <int64_t INTER_SIZE, lm_ggml_type PARAM_TYPE>
182
+ void lm_ggml_quantize_mat_t(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);
183
+
184
+ template <> void lm_ggml_quantize_mat_t<4, LM_GGML_TYPE_Q8_0>(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
185
+ assert(nrow == 4);
186
+ UNUSED(nrow);
187
+ lm_ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
188
+ }
189
+
190
+ template <> void lm_ggml_quantize_mat_t<8, LM_GGML_TYPE_Q8_0>(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
191
+ assert(nrow == 4);
192
+ UNUSED(nrow);
193
+ lm_ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
194
+ }
195
+
196
+ template <> void lm_ggml_quantize_mat_t<8, LM_GGML_TYPE_Q8_K>(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
197
+ assert(nrow == 4);
198
+ UNUSED(nrow);
199
+ lm_ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
200
+ }
201
+
202
+ extern "C" {
203
+
204
+ void lm_ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
205
+ const int qk = QK8_0;
206
+ const int nb = n / qk;
207
+ const int ncols_interleaved = 4;
208
+ const int blocklen = 4;
209
+
210
+ assert (n % qk == 0);
211
+ assert (nc % ncols_interleaved == 0);
212
+
213
+ UNUSED(s);
214
+ UNUSED(bs);
215
+ UNUSED(vx);
216
+ UNUSED(vy);
217
+ UNUSED(nr);
218
+ UNUSED(nc);
219
+ UNUSED(nb);
220
+ UNUSED(ncols_interleaved);
221
+ UNUSED(blocklen);
222
+
223
+ float sumf[4];
224
+ int sumi;
225
+
226
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
227
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
228
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
229
+
230
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
231
+ for (int l = 0; l < nb; l++) {
232
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
233
+ for (int j = 0; j < ncols_interleaved; j++) {
234
+ sumi = 0;
235
+ for (int i = 0; i < blocklen; ++i) {
236
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
237
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
238
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
239
+ }
240
+ sumf[j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
241
+ }
242
+ }
243
+ }
244
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
245
+ }
246
+ }
247
+
248
+ void lm_ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
249
+ const int qk = QK8_0;
250
+ const int nb = n / qk;
251
+ const int ncols_interleaved = 4;
252
+ const int blocklen = 8;
253
+
254
+ assert (n % qk == 0);
255
+ assert (nc % ncols_interleaved == 0);
256
+
257
+ UNUSED(s);
258
+ UNUSED(bs);
259
+ UNUSED(vx);
260
+ UNUSED(vy);
261
+ UNUSED(nr);
262
+ UNUSED(nc);
263
+ UNUSED(nb);
264
+ UNUSED(ncols_interleaved);
265
+ UNUSED(blocklen);
266
+
267
+ float sumf[4];
268
+ int sumi;
269
+
270
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
271
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
272
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
273
+
274
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
275
+ for (int l = 0; l < nb; l++) {
276
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
277
+ for (int j = 0; j < ncols_interleaved; j++) {
278
+ sumi = 0;
279
+ for (int i = 0; i < blocklen; ++i) {
280
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
281
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
282
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
283
+ }
284
+ sumf[j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
285
+ }
286
+ }
287
+ }
288
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
289
+ }
290
+ }
291
+
292
+ void lm_ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
293
+ const int qk = QK8_0;
294
+ const int nb = n / qk;
295
+ const int ncols_interleaved = 8;
296
+ const int blocklen = 8;
297
+
298
+ assert (n % qk == 0);
299
+ assert (nc % ncols_interleaved == 0);
300
+
301
+ UNUSED(s);
302
+ UNUSED(bs);
303
+ UNUSED(vx);
304
+ UNUSED(vy);
305
+ UNUSED(nr);
306
+ UNUSED(nc);
307
+ UNUSED(nb);
308
+ UNUSED(ncols_interleaved);
309
+ UNUSED(blocklen);
310
+
311
+ {
312
+ float sumf[8];
313
+ int sumi;
314
+
315
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
316
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
317
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
318
+
319
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
320
+ for (int l = 0; l < nb; l++) {
321
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
322
+ for (int j = 0; j < ncols_interleaved; j++) {
323
+ sumi = 0;
324
+ for (int i = 0; i < blocklen; ++i) {
325
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
326
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
327
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
328
+ }
329
+ sumf[j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
330
+ }
331
+ }
332
+ }
333
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
334
+ }
335
+ }
336
+ }
337
+
338
+ void lm_ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
339
+ const int qk = QK_K;
340
+ const int nb = n / qk;
341
+ const int ncols_interleaved = 8;
342
+ const int blocklen = 8;
343
+ static const uint32_t kmask1 = 0x3f3f3f3f;
344
+ static const uint32_t kmask2 = 0x0f0f0f0f;
345
+ static const uint32_t kmask3 = 0x03030303;
346
+
347
+ assert (n % qk == 0);
348
+ assert (nc % ncols_interleaved == 0);
349
+
350
+ UNUSED(s);
351
+ UNUSED(bs);
352
+ UNUSED(vx);
353
+ UNUSED(vy);
354
+ UNUSED(nr);
355
+ UNUSED(nc);
356
+ UNUSED(nb);
357
+ UNUSED(ncols_interleaved);
358
+ UNUSED(blocklen);
359
+
360
+ float sumf[8];
361
+ float sum_minf[8];
362
+ uint32_t utmp[32];
363
+ int sumi1;
364
+ int sumi2;
365
+ int sumi;
366
+
367
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
368
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
369
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
370
+
371
+ for (int j = 0; j < ncols_interleaved; j++) {
372
+ sumf[j] = 0.0;
373
+ sum_minf[j] = 0.0;
374
+ }
375
+ for (int l = 0; l < nb; l++) {
376
+ for (int sb = 0; sb < 8; sb++) {
377
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
378
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
379
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
380
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
381
+ utmp[sb * 4 + 2] = uaux_0;
382
+ utmp[sb * 4 + 0] &= kmask1;
383
+ }
384
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
385
+ uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
386
+ uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
387
+ for (int j = 0; j < ncols_interleaved; j++) {
388
+ sumi1 = 0;
389
+ sumi2 = 0;
390
+ sumi = 0;
391
+ for (int i = 0; i < blocklen; ++i) {
392
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
393
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
394
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
395
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
396
+ sumi1 = sumi1 * scales_0[j];
397
+ sumi2 = sumi2 * scales_1[j];
398
+ sumi += sumi1 + sumi2;
399
+ }
400
+ sumf[j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
401
+ }
402
+ }
403
+ for (int sb = 0; sb < 8; sb++) {
404
+ uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
405
+ for (int j = 0; j < ncols_interleaved; j++) {
406
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
407
+ }
408
+ }
409
+ }
410
+ for (int j = 0; j < ncols_interleaved; j++) {
411
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
412
+ }
413
+ }
414
+ }
415
+
416
+ void lm_ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
417
+ const int qk = QK8_0;
418
+ const int nb = n / qk;
419
+ const int ncols_interleaved = 4;
420
+ const int blocklen = 4;
421
+
422
+ assert (n % qk == 0);
423
+ assert (nc % ncols_interleaved == 0);
424
+
425
+ UNUSED(s);
426
+ UNUSED(bs);
427
+ UNUSED(vx);
428
+ UNUSED(vy);
429
+ UNUSED(nr);
430
+ UNUSED(nc);
431
+ UNUSED(nb);
432
+ UNUSED(ncols_interleaved);
433
+ UNUSED(blocklen);
434
+
435
+ {
436
+ float sumf[4];
437
+ int sumi;
438
+
439
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
440
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
441
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
442
+
443
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
444
+ for (int l = 0; l < nb; l++) {
445
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
446
+ for (int j = 0; j < ncols_interleaved; j++) {
447
+ sumi = 0;
448
+ for (int i = 0; i < blocklen; ++i) {
449
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
450
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
451
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
452
+ }
453
+ sumf[j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
454
+ }
455
+ }
456
+ }
457
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
458
+ }
459
+ }
460
+ }
461
+
462
+ void lm_ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
463
+ const int qk = QK8_0;
464
+ const int nb = n / qk;
465
+ const int ncols_interleaved = 4;
466
+ const int blocklen = 4;
467
+
468
+ assert (n % qk == 0);
469
+ assert (nr % 4 == 0);
470
+ assert (nc % ncols_interleaved == 0);
471
+
472
+ UNUSED(s);
473
+ UNUSED(bs);
474
+ UNUSED(vx);
475
+ UNUSED(vy);
476
+ UNUSED(nr);
477
+ UNUSED(nc);
478
+ UNUSED(nb);
479
+ UNUSED(ncols_interleaved);
480
+ UNUSED(blocklen);
481
+
482
+ {
483
+ float sumf[4][4];
484
+ int sumi;
485
+
486
+ for (int y = 0; y < nr / 4; y++) {
487
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
488
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
489
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
490
+ for (int m = 0; m < 4; m++) {
491
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
492
+ }
493
+ for (int l = 0; l < nb; l++) {
494
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
495
+ for (int m = 0; m < 4; m++) {
496
+ for (int j = 0; j < ncols_interleaved; j++) {
497
+ sumi = 0;
498
+ for (int i = 0; i < blocklen; ++i) {
499
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
500
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
501
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
502
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
503
+ }
504
+ sumf[m][j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
505
+ }
506
+ }
507
+ }
508
+ }
509
+ for (int m = 0; m < 4; m++) {
510
+ for (int j = 0; j < ncols_interleaved; j++)
511
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
512
+ }
513
+ }
514
+ }
515
+ }
516
+ }
517
+
518
+ void lm_ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
519
+ const int qk = QK8_0;
520
+ const int nb = n / qk;
521
+ const int ncols_interleaved = 4;
522
+ const int blocklen = 8;
523
+
524
+ assert (n % qk == 0);
525
+ assert (nr % 4 == 0);
526
+ assert (nc % ncols_interleaved == 0);
527
+
528
+ UNUSED(s);
529
+ UNUSED(bs);
530
+ UNUSED(vx);
531
+ UNUSED(vy);
532
+ UNUSED(nr);
533
+ UNUSED(nc);
534
+ UNUSED(nb);
535
+ UNUSED(ncols_interleaved);
536
+ UNUSED(blocklen);
537
+
538
+ float sumf[4][4];
539
+ int sumi;
540
+
541
+ for (int y = 0; y < nr / 4; y++) {
542
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
543
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
544
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
545
+ for (int m = 0; m < 4; m++) {
546
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
547
+ }
548
+ for (int l = 0; l < nb; l++) {
549
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
550
+ for (int m = 0; m < 4; m++) {
551
+ for (int j = 0; j < ncols_interleaved; j++) {
552
+ sumi = 0;
553
+ for (int i = 0; i < blocklen; ++i) {
554
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
555
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
556
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
557
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
558
+ }
559
+ sumf[m][j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
560
+ }
561
+ }
562
+ }
563
+ }
564
+ for (int m = 0; m < 4; m++) {
565
+ for (int j = 0; j < ncols_interleaved; j++)
566
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
567
+ }
568
+ }
569
+ }
570
+ }
571
+
572
+ void lm_ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
573
+ const int qk = QK8_0;
574
+ const int nb = n / qk;
575
+ const int ncols_interleaved = 8;
576
+ const int blocklen = 8;
577
+
578
+ assert (n % qk == 0);
579
+ assert (nr % 4 == 0);
580
+ assert (nc % ncols_interleaved == 0);
581
+
582
+ UNUSED(s);
583
+ UNUSED(bs);
584
+ UNUSED(vx);
585
+ UNUSED(vy);
586
+ UNUSED(nr);
587
+ UNUSED(nc);
588
+ UNUSED(nb);
589
+ UNUSED(ncols_interleaved);
590
+ UNUSED(blocklen);
591
+
592
+ float sumf[4][8];
593
+ int sumi;
594
+
595
+ for (int y = 0; y < nr / 4; y++) {
596
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
597
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
598
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
599
+ for (int m = 0; m < 4; m++) {
600
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
601
+ }
602
+ for (int l = 0; l < nb; l++) {
603
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
604
+ for (int m = 0; m < 4; m++) {
605
+ for (int j = 0; j < ncols_interleaved; j++) {
606
+ sumi = 0;
607
+ for (int i = 0; i < blocklen; ++i) {
608
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
609
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
610
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
611
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
612
+ }
613
+ sumf[m][j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
614
+ }
615
+ }
616
+ }
617
+ }
618
+ for (int m = 0; m < 4; m++) {
619
+ for (int j = 0; j < ncols_interleaved; j++)
620
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
621
+ }
622
+ }
623
+ }
624
+ }
625
+
626
+ void lm_ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
627
+ const int qk = QK_K;
628
+ const int nb = n / qk;
629
+ const int ncols_interleaved = 8;
630
+ const int blocklen = 8;
631
+ static const uint32_t kmask1 = 0x3f3f3f3f;
632
+ static const uint32_t kmask2 = 0x0f0f0f0f;
633
+ static const uint32_t kmask3 = 0x03030303;
634
+
635
+ assert (n % qk == 0);
636
+ assert (nr % 4 == 0);
637
+ assert (nc % ncols_interleaved == 0);
638
+
639
+ UNUSED(s);
640
+ UNUSED(bs);
641
+ UNUSED(vx);
642
+ UNUSED(vy);
643
+ UNUSED(nr);
644
+ UNUSED(nc);
645
+ UNUSED(nb);
646
+ UNUSED(ncols_interleaved);
647
+ UNUSED(blocklen);
648
+
649
+ float sumf[4][8];
650
+ float sum_minf[4][8];
651
+ uint32_t utmp[32];
652
+ int sumi1;
653
+ int sumi2;
654
+ int sumi;
655
+
656
+ for (int y = 0; y < nr / 4; y++) {
657
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
658
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
659
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
660
+ for (int m = 0; m < 4; m++) {
661
+ for (int j = 0; j < ncols_interleaved; j++) {
662
+ sumf[m][j] = 0.0;
663
+ sum_minf[m][j] = 0.0;
664
+ }
665
+ }
666
+ for (int l = 0; l < nb; l++) {
667
+ for (int sb = 0; sb < 8; sb++) {
668
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
669
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
670
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
671
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
672
+ utmp[sb * 4 + 2] = uaux_0;
673
+ utmp[sb * 4 + 0] &= kmask1;
674
+ }
675
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
676
+ uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
677
+ uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
678
+ for (int m = 0; m < 4; m++) {
679
+ for (int j = 0; j < ncols_interleaved; j++) {
680
+ sumi1 = 0;
681
+ sumi2 = 0;
682
+ sumi = 0;
683
+ for (int i = 0; i < blocklen; ++i) {
684
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
685
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
686
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
687
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
688
+ sumi1 = sumi1 * scales_0[j];
689
+ sumi2 = sumi2 * scales_1[j];
690
+ sumi += sumi1 + sumi2;
691
+ }
692
+ sumf[m][j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
693
+ }
694
+ }
695
+ }
696
+ for (int sb = 0; sb < 8; sb++) {
697
+ uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
698
+ for(int m = 0; m < 4; m++) {
699
+ const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
700
+ for(int j = 0; j < ncols_interleaved; j++) {
701
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
702
+ }
703
+ }
704
+ }
705
+ }
706
+ for (int m = 0; m < 4; m++) {
707
+ for (int j = 0; j < ncols_interleaved; j++) {
708
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
709
+ }
710
+ }
711
+ }
712
+ }
713
+ }
714
+
715
+ void lm_ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
716
+ const int qk = QK8_0;
717
+ const int nb = n / qk;
718
+ const int ncols_interleaved = 4;
719
+ const int blocklen = 4;
720
+
721
+ assert (n % qk == 0);
722
+ assert (nr % 4 == 0);
723
+ assert (nc % ncols_interleaved == 0);
724
+
725
+ UNUSED(s);
726
+ UNUSED(bs);
727
+ UNUSED(vx);
728
+ UNUSED(vy);
729
+ UNUSED(nr);
730
+ UNUSED(nc);
731
+ UNUSED(nb);
732
+ UNUSED(ncols_interleaved);
733
+ UNUSED(blocklen);
734
+
735
+ {
736
+ float sumf[4][4];
737
+ int sumi;
738
+
739
+ for (int y = 0; y < nr / 4; y++) {
740
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
741
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
742
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
743
+ for (int m = 0; m < 4; m++) {
744
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
745
+ }
746
+ for (int l = 0; l < nb; l++) {
747
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
748
+ for (int m = 0; m < 4; m++) {
749
+ for (int j = 0; j < ncols_interleaved; j++) {
750
+ sumi = 0;
751
+ for (int i = 0; i < blocklen; ++i) {
752
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
753
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
754
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
755
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
756
+ }
757
+ sumf[m][j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
758
+ }
759
+ }
760
+ }
761
+ }
762
+ for (int m = 0; m < 4; m++) {
763
+ for (int j = 0; j < ncols_interleaved; j++)
764
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
765
+ }
766
+ }
767
+ }
768
+ }
769
+ }
770
+
771
+ } // extern "C"
772
+
773
+ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
774
+ block_q4_0x4 out;
775
+
776
+ for (int i = 0; i < 4; i++) {
777
+ out.d[i] = in[i].d;
778
+ }
779
+
780
+ const int end = QK4_0 * 2 / blck_size_interleave;
781
+
782
+ if (blck_size_interleave == 8) {
783
+ const uint64_t xor_mask = 0x8888888888888888ULL;
784
+ for (int i = 0; i < end; ++i) {
785
+ int src_id = i % 4;
786
+ int src_offset = (i / 4) * blck_size_interleave;
787
+ int dst_offset = i * blck_size_interleave;
788
+
789
+ uint64_t elems;
790
+ // Using memcpy to avoid unaligned memory accesses
791
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
792
+ elems ^= xor_mask;
793
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
794
+ }
795
+ } else if (blck_size_interleave == 4) {
796
+ const uint32_t xor_mask = 0x88888888;
797
+ for (int i = 0; i < end; ++i) {
798
+ int src_id = i % 4;
799
+ int src_offset = (i / 4) * blck_size_interleave;
800
+ int dst_offset = i * blck_size_interleave;
801
+
802
+ uint32_t elems;
803
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
804
+ elems ^= xor_mask;
805
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
806
+ }
807
+ } else {
808
+ LM_GGML_ASSERT(false);
809
+ }
810
+
811
+ return out;
812
+ }
813
+
814
+ // interleave 8 block_q4_0s in blocks of blck_size_interleave
815
+ // returns an interleaved block_q4_0x8
816
+ // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
817
+ // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
818
+ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
819
+ block_q4_0x8 out;
820
+
821
+ for (int i = 0; i < 8; i++) {
822
+ out.d[i] = in[i].d;
823
+ }
824
+
825
+ const int end = QK4_0 * 4 / blck_size_interleave;
826
+ const uint64_t xor_mask = 0x8888888888888888ULL;
827
+
828
+ for (int i = 0; i < end; ++i) {
829
+ int src_id = i % 8;
830
+ int src_offset = (i / 8) * blck_size_interleave;
831
+ int dst_offset = i * blck_size_interleave;
832
+
833
+ uint64_t elems;
834
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
835
+ elems ^= xor_mask;
836
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
837
+ }
838
+
839
+ return out;
840
+ }
841
+
842
+ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
843
+ block_q4_Kx8 out;
844
+ //Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
845
+ for (int i = 0; i < 8; i++) {
846
+ out.d[i] = in[i].LM_GGML_COMMON_AGGR_U.LM_GGML_COMMON_AGGR_S.d;
847
+ }
848
+
849
+ for (int i = 0; i < 8; i++) {
850
+ out.dmin[i] = in[i].LM_GGML_COMMON_AGGR_U.LM_GGML_COMMON_AGGR_S.dmin;
851
+ }
852
+
853
+ const int end = QK_K * 4 / blck_size_interleave;
854
+
855
+ // Interleave Q4_K quants by taking 8 bytes at a time
856
+ for (int i = 0; i < end; ++i) {
857
+ int src_id = i % 8;
858
+ int src_offset = (i / 8) * blck_size_interleave;
859
+ int dst_offset = i * blck_size_interleave;
860
+
861
+ uint64_t elems;
862
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
863
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
864
+ }
865
+
866
+ // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
867
+ // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
868
+ // The output Q4_Kx8 structure has 96 bytes
869
+ // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
870
+ // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
871
+ uint8_t s[8], m[8];
872
+
873
+ for (int i = 0; i < 4; i++) {
874
+ for (int j = 0; j < 8; j++) {
875
+ s[j] = in[j].scales[i] & 63;
876
+ m[j] = in[j].scales[i + 4] & 63;
877
+ }
878
+
879
+ out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
880
+ out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
881
+ out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
882
+ out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
883
+ out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
884
+ out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
885
+ out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
886
+ out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
887
+ out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
888
+ out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
889
+ out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
890
+ out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
891
+
892
+ }
893
+
894
+ for (int i = 0; i < 4; i++) {
895
+ for (int j = 0; j < 8; j++) {
896
+ s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
897
+ m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
898
+ }
899
+
900
+ out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
901
+ out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
902
+ out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
903
+ out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
904
+ out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
905
+ out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
906
+ out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
907
+ out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
908
+ out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
909
+ out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
910
+ out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
911
+ out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
912
+
913
+ }
914
+
915
+ return out;
916
+ }
917
+
918
+ static int repack_q4_0_to_q4_0_4_bl(struct lm_ggml_tensor * t, int interleave_block, const void * LM_GGML_RESTRICT data, size_t data_size) {
919
+ LM_GGML_ASSERT(t->type == LM_GGML_TYPE_Q4_0);
920
+ LM_GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
921
+ constexpr int nrows_interleaved = 4;
922
+
923
+ block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
924
+ const block_q4_0 * src = (const block_q4_0 *)data;
925
+ block_q4_0 dst_tmp[4];
926
+ int nrow = lm_ggml_nrows(t);
927
+ int nblocks = t->ne[0] / QK4_0;
928
+
929
+ LM_GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
930
+
931
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
932
+ return -1;
933
+ }
934
+
935
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
936
+ for (int64_t x = 0; x < nblocks; x++) {
937
+ for (int i = 0; i < nrows_interleaved; i++) {
938
+ dst_tmp[i] = src[x + i * nblocks];
939
+ }
940
+ *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
941
+ }
942
+ src += nrows_interleaved * nblocks;
943
+ }
944
+ return 0;
945
+
946
+ LM_GGML_UNUSED(data_size);
947
+ }
948
+ static int repack_q4_K_to_q4_K_8_bl(struct lm_ggml_tensor * t, int interleave_block, const void * LM_GGML_RESTRICT data, size_t data_size) {
949
+ LM_GGML_ASSERT(t->type == LM_GGML_TYPE_Q4_K);
950
+ LM_GGML_ASSERT(interleave_block == 8);
951
+ constexpr int nrows_interleaved = 8;
952
+
953
+ block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
954
+ const block_q4_K * src = (const block_q4_K*) data;
955
+ block_q4_K dst_tmp[8];
956
+ int nrow = lm_ggml_nrows(t);
957
+ int nblocks = t->ne[0] / QK_K;
958
+
959
+ LM_GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
960
+
961
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
962
+ return -1;
963
+ }
964
+
965
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
966
+ for (int64_t x = 0; x < nblocks; x++) {
967
+ for (int i = 0; i < nrows_interleaved; i++ ) {
968
+ dst_tmp[i] = src[x + i * nblocks];
969
+ }
970
+ *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
971
+ }
972
+ src += nrows_interleaved * nblocks;
973
+ }
974
+ return 0;
975
+
976
+ LM_GGML_UNUSED(data_size);
977
+ }
978
+
979
+ static int repack_q4_0_to_q4_0_8_bl(struct lm_ggml_tensor * t, int interleave_block, const void * LM_GGML_RESTRICT data, size_t data_size) {
980
+ LM_GGML_ASSERT(t->type == LM_GGML_TYPE_Q4_0);
981
+ LM_GGML_ASSERT(interleave_block == 8);
982
+ constexpr int nrows_interleaved = 8;
983
+
984
+ block_q4_0x8 * dst = (block_q4_0x8*)t->data;
985
+ const block_q4_0 * src = (const block_q4_0*) data;
986
+ block_q4_0 dst_tmp[8];
987
+ int nrow = lm_ggml_nrows(t);
988
+ int nblocks = t->ne[0] / QK4_0;
989
+
990
+ LM_GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
991
+
992
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
993
+ return -1;
994
+ }
995
+
996
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
997
+ for (int64_t x = 0; x < nblocks; x++) {
998
+ for (int i = 0; i < nrows_interleaved; i++ ) {
999
+ dst_tmp[i] = src[x + i * nblocks];
1000
+ }
1001
+ *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
1002
+ }
1003
+ src += nrows_interleaved * nblocks;
1004
+ }
1005
+ return 0;
1006
+
1007
+ LM_GGML_UNUSED(data_size);
1008
+ }
1009
+
1010
+ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
1011
+ block_iq4_nlx4 out;
1012
+
1013
+ for (int i = 0; i < 4; i++) {
1014
+ out.d[i] = in[i].d;
1015
+ }
1016
+
1017
+ const int end = QK4_NL * 2 / blck_size_interleave;
1018
+
1019
+ // TODO: this branch seems wrong
1020
+ //if (blck_size_interleave == 8) {
1021
+ // for (int i = 0; i < end; ++i) {
1022
+ // int src_id = i % 4;
1023
+ // int src_offset = (i / 4) * blck_size_interleave;
1024
+ // int dst_offset = i * blck_size_interleave;
1025
+
1026
+ // // Using memcpy to avoid unaligned memory accesses
1027
+ // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
1028
+ // }
1029
+ //} else
1030
+ if (blck_size_interleave == 4) {
1031
+ for (int i = 0; i < end; ++i) {
1032
+ int src_id = i % 4;
1033
+ int src_offset = (i / 4) * blck_size_interleave;
1034
+ int dst_offset = i * blck_size_interleave;
1035
+
1036
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
1037
+ }
1038
+ } else {
1039
+ LM_GGML_ASSERT(false);
1040
+ }
1041
+
1042
+ return out;
1043
+ }
1044
+
1045
+ static int repack_iq4_nl_to_iq4_nl_4_bl(struct lm_ggml_tensor * t, int interleave_block, const void * LM_GGML_RESTRICT data, size_t data_size) {
1046
+ LM_GGML_ASSERT(t->type == LM_GGML_TYPE_IQ4_NL);
1047
+ //LM_GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
1048
+ LM_GGML_ASSERT(interleave_block == 4);
1049
+
1050
+ block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
1051
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
1052
+ block_iq4_nl dst_tmp[4];
1053
+ int nrow = lm_ggml_nrows(t);
1054
+ int nrows_interleaved = 4;
1055
+ int nblocks = t->ne[0] / QK4_0;
1056
+
1057
+ LM_GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
1058
+
1059
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1060
+ return -1;
1061
+ }
1062
+
1063
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
1064
+ for (int64_t x = 0; x < nblocks; x++) {
1065
+ for (int i = 0; i < nrows_interleaved; i++) {
1066
+ dst_tmp[i] = src[x + i * nblocks];
1067
+ }
1068
+ *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
1069
+ }
1070
+ src += nrows_interleaved * nblocks;
1071
+ }
1072
+ return 0;
1073
+
1074
+ LM_GGML_UNUSED(data_size);
1075
+ }
1076
+
1077
+ namespace ggml::cpu::repack {
1078
+ // repack: compile-time dispatch from (block type, interleave size, column count) to the matching repack_* helper above.
1079
+ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
1080
+ int repack(struct lm_ggml_tensor *, const void *, size_t); // primary template: declaration only, bodies come from the explicit specialisations below
1081
+
1082
+ // TODO: generalise. For now every supported combination is spelled out by hand; each forwards (t, INTER_SIZE, data, data_size) and returns the helper's result (0 on success, -1 on unsupported shape).
1083
+ template <> int repack<block_q4_0, 4, 4>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
1084
+ return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size); // Q4_0, 4 rows interleaved in 4-byte chunks
1085
+ }
1086
+
1087
+ template <> int repack<block_q4_0, 8, 4>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
1088
+ return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size); // Q4_0, 4 rows interleaved in 8-byte chunks
1089
+ }
1090
+
1091
+ template <> int repack<block_q4_0, 8, 8>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
1092
+ return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size); // Q4_0, 8 rows interleaved in 8-byte chunks
1093
+ }
1094
+
1095
+ template <> int repack<block_q4_K, 8, 8>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
1096
+ return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size); // Q4_K, 8 rows interleaved in 8-byte chunks
1097
+ }
1098
+
1099
+ template <> int repack<block_iq4_nl, 4, 4>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
1100
+ return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); // IQ4_NL, 4 rows interleaved in 4-byte chunks
1101
+ }
1102
+
1103
+ // TODO: needs to be revisited
1104
+ //template <> int repack<block_iq4_nl, 8, 4>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
1105
+ // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
1106
+ //}
1107
+
1108
+ // gemv: compile-time dispatch from (block type, interleave size, column count, activation type) to a C kernel.
1109
+ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, lm_ggml_type PARAM_TYPE>
1110
+ void gemv(int, float *, size_t, const void *, const void *, int, int); // primary template: declaration only; arguments are (n, s, bs, vx, vy, nr, nc)
1111
+
1112
+ template <> void gemv<block_q4_0, 4, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1113
+ lm_ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); // NOTE(review): non-"_generic" kernel, defined outside this chunk - presumably the arch-specific entry point
1114
+ }
1115
+
1116
+ template <> void gemv<block_q4_0, 8, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1117
+ lm_ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); // Q4_0 weights, 4 columns, 8-byte chunks
1118
+ }
1119
+
1120
+ template <> void gemv<block_q4_0, 8, 8, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1121
+ lm_ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); // Q4_0 weights, 8 columns, 8-byte chunks
1122
+ }
1123
+
1124
+ template <> void gemv<block_q4_K, 8, 8, LM_GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1125
+ lm_ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); // Q4_K weights, 8 columns, 8-byte chunks, Q8_K activations
1126
+ }
1127
+
1128
+ template <> void gemv<block_iq4_nl, 4, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1129
+ lm_ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); // IQ4_NL weights, 4 columns, 4-byte chunks
1130
+ }
1131
+
1132
+ // gemm: compile-time dispatch from (block type, interleave size, column count, activation type) to a C kernel.
1133
+ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, lm_ggml_type PARAM_TYPE>
1134
+ void gemm(int, float *, size_t, const void *, const void *, int, int); // primary template: declaration only; arguments are (n, s, bs, vx, vy, nr, nc)
1135
+
1136
+ template <> void gemm<block_q4_0, 4, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1137
+ lm_ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); // NOTE(review): non-"_generic" kernel, defined outside this chunk - presumably the arch-specific entry point
1138
+ }
1139
+
1140
+ template <> void gemm<block_q4_0, 8, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1141
+ lm_ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); // Q4_0 weights, 4 columns, 8-byte chunks
1142
+ }
1143
+
1144
+ template <> void gemm<block_q4_0, 8, 8, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1145
+ lm_ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); // Q4_0 weights, 8 columns, 8-byte chunks
1146
+ }
1147
+
1148
+ template <> void gemm<block_q4_K, 8, 8, LM_GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1149
+ lm_ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); // Q4_K weights, 8 columns, 8-byte chunks, Q8_K activations
1150
+ }
1151
+
1152
+ template <> void gemm<block_iq4_nl, 4, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1153
+ lm_ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); // IQ4_NL weights, 4 columns, 4-byte chunks
1154
+ }
1155
+
1156
+ class tensor_traits_base : public ggml::cpu::tensor_traits {
1157
+ public:
1158
+ virtual int repack(struct lm_ggml_tensor * t, const void * data, size_t data_size) = 0;
1159
+ };
1160
+
1161
+ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, lm_ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {
1162
+
1163
+ bool work_size(int /* n_threads */, const struct lm_ggml_tensor * op, size_t & size) override {
1164
+ // not realy a LM_GGML_TYPE_Q8_0 but same size.
1165
+ switch (op->op) {
1166
+ case LM_GGML_OP_MUL_MAT:
1167
+ {
1168
+ size = lm_ggml_row_size(PARAM_TYPE, lm_ggml_nelements(op->src[1]));
1169
+ return true;
1170
+ }
1171
+ case LM_GGML_OP_MUL_MAT_ID:
1172
+ {
1173
+ size = lm_ggml_row_size(PARAM_TYPE, lm_ggml_nelements(op->src[1]));
1174
+ size = LM_GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
1175
+
1176
+ const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
1177
+ const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
1178
+
1179
+ const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
1180
+
1181
+ size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
1182
+
1183
+ return true;
1184
+ }
1185
+ default:
1186
+ // LM_GGML_ABORT("fatal error");
1187
+ break;
1188
+ }
1189
+ return false;
1190
+ }
1191
+
1192
+ bool compute_forward(struct lm_ggml_compute_params * params, struct lm_ggml_tensor * op) override {
1193
+ switch (op->op) {
1194
+ case LM_GGML_OP_MUL_MAT:
1195
+ forward_mul_mat(params, op);
1196
+ return true;
1197
+ case LM_GGML_OP_MUL_MAT_ID:
1198
+ forward_mul_mat_id(params, op);
1199
+ return true;
1200
+ default:
1201
+ // LM_GGML_ABORT("fatal error");
1202
+ break;
1203
+ }
1204
+ return false;
1205
+ }
1206
+
1207
+ void forward_mul_mat(lm_ggml_compute_params * params, lm_ggml_tensor * op) {
1208
+ const lm_ggml_tensor * src0 = op->src[0];
1209
+ const lm_ggml_tensor * src1 = op->src[1];
1210
+ lm_ggml_tensor * dst = op;
1211
+
1212
+ LM_GGML_TENSOR_BINARY_OP_LOCALS
1213
+
1214
+ const int ith = params->ith;
1215
+ const int nth = params->nth;
1216
+
1217
+ LM_GGML_ASSERT(ne0 == ne01);
1218
+ LM_GGML_ASSERT(ne1 == ne11);
1219
+ LM_GGML_ASSERT(ne2 == ne12);
1220
+ LM_GGML_ASSERT(ne3 == ne13);
1221
+
1222
+ // dst cannot be transposed or permuted
1223
+ LM_GGML_ASSERT(nb0 == sizeof(float));
1224
+ LM_GGML_ASSERT(nb0 <= nb1);
1225
+ LM_GGML_ASSERT(nb1 <= nb2);
1226
+ LM_GGML_ASSERT(nb2 <= nb3);
1227
+
1228
+ LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32);
1229
+
1230
+ LM_GGML_ASSERT(lm_ggml_n_dims(op->src[0]) == 2);
1231
+ // LM_GGML_ASSERT(lm_ggml_n_dims(op->src[1]) == 2);
1232
+
1233
+ char * wdata = static_cast<char *>(params->wdata);
1234
+ const size_t nbw1 = lm_ggml_row_size(PARAM_TYPE, ne10);
1235
+
1236
+ assert(params->wsize >= nbw1 * ne11);
1237
+
1238
+ const lm_ggml_from_float_t from_float = lm_ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
1239
+
1240
+ int64_t i11_processed = 0;
1241
+ for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
1242
+ lm_ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
1243
+ }
1244
+
1245
+ i11_processed = ne11 - ne11 % 4;
1246
+ for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
1247
+ from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
1248
+ }
1249
+
1250
+ lm_ggml_barrier(params->threadpool);
1251
+
1252
+ const void * src1_wdata = params->wdata;
1253
+ const size_t src1_col_stride = lm_ggml_row_size(PARAM_TYPE, ne10);
1254
+ int64_t src0_start = (ith * ne01) / nth;
1255
+ int64_t src0_end = ((ith + 1) * ne01) / nth;
1256
+ src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
1257
+ src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
1258
+ if (src0_start >= src0_end) {
1259
+ return;
1260
+ }
1261
+
1262
+ // If there are more than three rows in src1, use gemm; otherwise, use gemv.
1263
+ if (ne11 > 3) {
1264
+ gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
1265
+ (float *) ((char *) dst->data) + src0_start, ne01,
1266
+ (const char *) src0->data + src0_start * nb01,
1267
+ (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
1268
+ }
1269
+ for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
1270
+ gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
1271
+ (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
1272
+ (const char *) src0->data + src0_start * nb01,
1273
+ (const char *) src1_wdata + (src1_col_stride * iter), 1,
1274
+ src0_end - src0_start);
1275
+ }
1276
+ }
1277
+
1278
+ void forward_mul_mat_id(lm_ggml_compute_params * params, lm_ggml_tensor * op) {
1279
+ const lm_ggml_tensor * src0 = op->src[0];
1280
+ const lm_ggml_tensor * src1 = op->src[1];
1281
+ const lm_ggml_tensor * ids = op->src[2];
1282
+ lm_ggml_tensor * dst = op;
1283
+
1284
+ LM_GGML_TENSOR_BINARY_OP_LOCALS
1285
+
1286
+ const int ith = params->ith;
1287
+ const int nth = params->nth;
1288
+
1289
+ const lm_ggml_from_float_t from_float = lm_ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
1290
+
1291
+ // we don't support permuted src0 or src1
1292
+ LM_GGML_ASSERT(nb00 == lm_ggml_type_size(src0->type));
1293
+ LM_GGML_ASSERT(nb10 == lm_ggml_type_size(src1->type));
1294
+
1295
+ // dst cannot be transposed or permuted
1296
+ LM_GGML_ASSERT(nb0 == sizeof(float));
1297
+ LM_GGML_ASSERT(nb0 <= nb1);
1298
+ LM_GGML_ASSERT(nb1 <= nb2);
1299
+ LM_GGML_ASSERT(nb2 <= nb3);
1300
+
1301
+ LM_GGML_ASSERT(ne03 == 1);
1302
+ LM_GGML_ASSERT(ne13 == 1);
1303
+ LM_GGML_ASSERT(ne3 == 1);
1304
+
1305
+ LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32);
1306
+
1307
+ // row groups
1308
+ const int n_ids = ids->ne[0]; // n_expert_used
1309
+ const int n_as = ne02; // n_expert
1310
+
1311
+ const size_t nbw1 = lm_ggml_row_size(PARAM_TYPE, ne10);
1312
+ const size_t nbw2 = nbw1*ne11;
1313
+ const size_t nbw3 = nbw2*ne12;
1314
+
1315
+ struct mmid_row_mapping {
1316
+ int32_t i1;
1317
+ int32_t i2;
1318
+ };
1319
+
1320
+ LM_GGML_ASSERT(params->wsize >=
1321
+ (LM_GGML_PAD(nbw3, sizeof(int64_t)) +
1322
+ n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
1323
+ );
1324
+
1325
+ auto * wdata = (char *)params->wdata;
1326
+ auto * wdata_src1_end = (char *)wdata + LM_GGML_PAD(nbw3, sizeof(int64_t));
1327
+
1328
+ // total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t)
1329
+ auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
1330
+ struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
1331
+
1332
+ // src1: float32 => param type
1333
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
1334
+ for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
1335
+ from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
1336
+ (void *) (wdata + i12 * nbw2 + i11 * nbw1),
1337
+ ne10);
1338
+ }
1339
+ }
1340
+
1341
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]
1342
+
1343
+ if (ith == 0) {
1344
+ // initialize matrix_row_counts
1345
+ memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
1346
+
1347
+ // group rows by src0 matrix
1348
+ for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
1349
+ for (int32_t id = 0; id < n_ids; ++id) {
1350
+ const int32_t i02 =
1351
+ *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
1352
+
1353
+ LM_GGML_ASSERT(i02 >= 0 && i02 < n_as);
1354
+
1355
+ MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
1356
+ matrix_row_counts[i02] += 1;
1357
+ }
1358
+ }
1359
+ }
1360
+
1361
+ lm_ggml_barrier(params->threadpool);
1362
+
1363
+ // compute each matrix multiplication in sequence
1364
+ for (int cur_a = 0; cur_a < n_as; ++cur_a) {
1365
+ const int64_t cne1 = matrix_row_counts[cur_a];
1366
+
1367
+ if (cne1 == 0) {
1368
+ continue;
1369
+ }
1370
+
1371
+ const auto * src0_cur = (const char *) src0->data + cur_a*nb02;
1372
+
1373
+ //const int64_t nr0 = ne01; // src0 rows
1374
+ const int64_t nr1 = cne1; // src1 rows
1375
+
1376
+ int64_t src0_cur_start = (ith * ne01) / nth;
1377
+ int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
1378
+
1379
+ src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
1380
+ src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
1381
+
1382
+ if (src0_cur_start >= src0_cur_end) {
1383
+ return;
1384
+ }
1385
+
1386
+ for (int ir1 = 0; ir1 < nr1; ir1++) {
1387
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
1388
+
1389
+ const int id = row_mapping.i1; // selected expert index
1390
+
1391
+ const int64_t i11 = id % ne11;
1392
+ const int64_t i12 = row_mapping.i2; // row index in src1
1393
+
1394
+ const int64_t i1 = id; // selected expert index
1395
+ const int64_t i2 = i12; // row
1396
+
1397
+ const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
1398
+
1399
+ gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
1400
+ (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
1401
+ src0_cur + src0_cur_start * nb01,
1402
+ src1_col, 1, src0_cur_end - src0_cur_start);
1403
+ }
1404
+ }
1405
+ #undef MMID_MATRIX_ROW
1406
+ }
1407
+
1408
+ int repack(struct lm_ggml_tensor * t, const void * data, size_t data_size) override {
1409
+ LM_GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, lm_ggml_type_name(t->type),
1410
+ (int) NB_COLS, (int) INTER_SIZE);
1411
+ return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
1412
+ }
1413
+ };
1414
+
1415
+ } // namespace ggml::cpu::repack
1416
+
1417
+ static const ggml::cpu::tensor_traits * lm_ggml_repack_get_optimal_repack_type(const struct lm_ggml_tensor * cur) {
1418
+
1419
+ // instance for Q4
1420
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, LM_GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
1421
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, LM_GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
1422
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, LM_GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
1423
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, LM_GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
1424
+
1425
+ // instance for IQ4
1426
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, LM_GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
1427
+
1428
+ if (cur->type == LM_GGML_TYPE_Q4_0) {
1429
+ if (lm_ggml_cpu_has_avx2() || (lm_ggml_cpu_has_sve() && lm_ggml_cpu_has_matmul_int8() && lm_ggml_cpu_get_sve_cnt() == QK8_0)) {
1430
+ if (cur->ne[1] % 8 == 0) {
1431
+ return &q4_0_8x8_q8_0;
1432
+ }
1433
+ }
1434
+ if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
1435
+ if (cur->ne[1] % 4 == 0) {
1436
+ return &q4_0_4x8_q8_0;
1437
+ }
1438
+ }
1439
+ if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
1440
+ if (cur->ne[1] % 4 == 0) {
1441
+ return &q4_0_4x4_q8_0;
1442
+ }
1443
+ }
1444
+ } else if (cur->type == LM_GGML_TYPE_Q4_K) {
1445
+ if (lm_ggml_cpu_has_avx2()) {
1446
+ if (cur->ne[1] % 8 == 0) {
1447
+ return &q4_K_8x8_q8_K;
1448
+ }
1449
+ }
1450
+ } else if (cur->type == LM_GGML_TYPE_IQ4_NL) {
1451
+ if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
1452
+ if (cur->ne[1] % 4 == 0) {
1453
+ return &iq4_nl_4x4_q8_0;
1454
+ }
1455
+ }
1456
+ }
1457
+
1458
+ return nullptr;
1459
+ }
1460
+
1461
+ static enum lm_ggml_status lm_ggml_backend_cpu_repack_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
1462
+ tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(lm_ggml_repack_get_optimal_repack_type(tensor));
1463
+
1464
+ LM_GGML_UNUSED(buffer);
1465
+ return LM_GGML_STATUS_SUCCESS;
1466
+ }
1467
+
1468
+ static void lm_ggml_backend_cpu_repack_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor,
1469
+ const void * data, size_t offset, size_t size) {
1470
+ LM_GGML_ASSERT(offset == 0);
1471
+ LM_GGML_ASSERT(size == lm_ggml_nbytes(tensor));
1472
+
1473
+ auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
1474
+ auto OK = tensor_traits->repack(tensor, data, size);
1475
+
1476
+ LM_GGML_ASSERT(OK == 0);
1477
+ LM_GGML_UNUSED(buffer);
1478
+ }
1479
+
1480
+ static const char * lm_ggml_backend_cpu_repack_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
1481
+ return "CPU_REPACK";
1482
+
1483
+ LM_GGML_UNUSED(buft);
1484
+ }
1485
+
1486
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_repack_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
1487
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_cpu_buffer_type(), size);
1488
+
1489
+ if (buffer == nullptr) {
1490
+ return nullptr;
1491
+ }
1492
+
1493
+ buffer->buft = buft;
1494
+ buffer->iface.init_tensor = lm_ggml_backend_cpu_repack_buffer_init_tensor;
1495
+ buffer->iface.set_tensor = lm_ggml_backend_cpu_repack_buffer_set_tensor;
1496
+ buffer->iface.get_tensor = nullptr;
1497
+ buffer->iface.cpy_tensor = nullptr;
1498
+ return buffer;
1499
+ }
1500
+
1501
+ static size_t lm_ggml_backend_cpu_repack_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
1502
+ return TENSOR_ALIGNMENT;
1503
+
1504
+ LM_GGML_UNUSED(buft);
1505
+ }
1506
+
1507
+ namespace ggml::cpu::repack {
1508
+ class extra_buffer_type : ggml::cpu::extra_buffer_type {
1509
+ bool supports_op(lm_ggml_backend_dev_t, const struct lm_ggml_tensor * op) override {
1510
+ if ( op->op == LM_GGML_OP_MUL_MAT &&
1511
+ op->src[0]->buffer &&
1512
+ (lm_ggml_n_dims(op->src[0]) == 2) &&
1513
+ op->src[0]->buffer->buft == lm_ggml_backend_cpu_repack_buffer_type() &&
1514
+ lm_ggml_repack_get_optimal_repack_type(op->src[0])
1515
+ ) {
1516
+ if (op->src[1]->buffer && !lm_ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
1517
+ return false;
1518
+ }
1519
+ if (op->src[1]->type == LM_GGML_TYPE_F32) {
1520
+ return true;
1521
+ }
1522
+ //if (op->src[1]->type == LM_GGML_TYPE_Q8_0) {
1523
+ // return true;
1524
+ //}
1525
+ // may be possible if Q8_0 packed...
1526
+ } else if (op->op == LM_GGML_OP_MUL_MAT_ID
1527
+ && op->src[0]->buffer
1528
+ && (lm_ggml_n_dims(op->src[0]) == 3)
1529
+ && op->src[0]->buffer->buft == lm_ggml_backend_cpu_repack_buffer_type()
1530
+ && lm_ggml_repack_get_optimal_repack_type(op->src[0])
1531
+ ) {
1532
+ if (op->src[1]->buffer && !lm_ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
1533
+ return false;
1534
+ }
1535
+ if (op->src[1]->type == LM_GGML_TYPE_F32) {
1536
+ return true;
1537
+ }
1538
+ //if (op->src[1]->type == LM_GGML_TYPE_Q8_0) {
1539
+ // return true;
1540
+ //}
1541
+ }
1542
+ return false;
1543
+ }
1544
+
1545
+ ggml::cpu::tensor_traits * get_tensor_traits(const struct lm_ggml_tensor * op) override {
1546
+ if (op->op == LM_GGML_OP_MUL_MAT || op->op == LM_GGML_OP_MUL_MAT_ID) {
1547
+ if (op->src[0]->buffer && op->src[0]->buffer->buft == lm_ggml_backend_cpu_repack_buffer_type()) {
1548
+ return (ggml::cpu::tensor_traits *) op->src[0]->extra;
1549
+ }
1550
+ }
1551
+ return nullptr;
1552
+ }
1553
+ };
1554
+ } // namespace ggml::cpu::repack
1555
+
1556
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_repack_buffer_type(void) {
1557
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_repack = {
1558
+ /* .iface = */ {
1559
+ /* .get_name = */ lm_ggml_backend_cpu_repack_buffer_type_get_name,
1560
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_repack_buffer_type_alloc_buffer,
1561
+ /* .get_alignment = */ lm_ggml_backend_cpu_repack_buffer_type_get_alignment,
1562
+ /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
1563
+ /* .get_alloc_size = */ nullptr, // defaults to lm_ggml_nbytes
1564
+ /* .is_host = */ nullptr,
1565
+ },
1566
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
1567
+ /* .context = */ new ggml::cpu::repack::extra_buffer_type(),
1568
+ };
1569
+
1570
+ return &lm_ggml_backend_cpu_buffer_type_repack;
1571
+ }