cactus-react-native 0.0.1 → 0.1.1

This diff compares the contents of publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (189)
  1. package/LICENSE.txt +20 -0
  2. package/README.md +3 -1
  3. package/android/src/main/CMakeLists.txt +58 -23
  4. package/android/src/main/java/com/cactus/Cactus.java +484 -16
  5. package/android/src/main/java/com/cactus/LlamaContext.java +199 -0
  6. package/android/src/main/jni.cpp +325 -10
  7. package/android/src/main/jniLibs/arm64-v8a/libcactus.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libcactus_v8.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2_dotprod.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2_dotprod_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2_i8mm.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/libcactus.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libcactus_x86_64.so +0 -0
  15. package/android/src/newarch/java/com/cactus/CactusModule.java +79 -7
  16. package/android/src/oldarch/java/com/cactus/CactusModule.java +70 -0
  17. package/cactus-react-native.podspec +0 -3
  18. package/ios/CMakeLists.txt +58 -36
  19. package/ios/Cactus.mm +243 -2
  20. package/ios/CactusContext.h +22 -0
  21. package/ios/CactusContext.mm +176 -1
  22. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus.h +92 -5
  23. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +268 -0
  24. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/chat.h +2 -0
  25. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/common.h +42 -51
  26. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-backend.h +4 -4
  27. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-common.h +12 -6
  28. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpp.h +1 -1
  29. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu.h +5 -0
  30. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-impl.h +52 -18
  31. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-metal-impl.h +106 -14
  32. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-opt.h +49 -28
  33. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml.h +87 -106
  34. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-arch.h +16 -0
  35. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-batch.h +2 -1
  36. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-chat.h +7 -2
  37. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-context.h +44 -33
  38. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-cparams.h +1 -0
  39. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-graph.h +83 -17
  40. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-hparams.h +44 -2
  41. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-kv-cache.h +407 -179
  42. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-memory.h +13 -2
  43. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-model-loader.h +5 -3
  44. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-model-saver.h +37 -0
  45. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-model.h +24 -2
  46. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-vocab.h +6 -0
  47. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama.h +102 -142
  48. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/minja/chat-template.hpp +23 -11
  49. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/minja/minja.hpp +186 -127
  50. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist +0 -0
  51. package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
  52. package/ios/cactus.xcframework/ios-arm64/cactus.framework/ggml-llama.metallib +0 -0
  53. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/cactus.h +92 -5
  54. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/cactus_ffi.h +268 -0
  55. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/chat.h +2 -0
  56. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/common.h +42 -51
  57. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-backend.h +4 -4
  58. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-common.h +12 -6
  59. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpp.h +1 -1
  60. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu.h +5 -0
  61. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-impl.h +52 -18
  62. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-metal-impl.h +106 -14
  63. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-opt.h +49 -28
  64. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml.h +87 -106
  65. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-arch.h +16 -0
  66. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-batch.h +2 -1
  67. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-chat.h +7 -2
  68. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-context.h +44 -33
  69. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-cparams.h +1 -0
  70. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-graph.h +83 -17
  71. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-hparams.h +44 -2
  72. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-kv-cache.h +407 -179
  73. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-memory.h +13 -2
  74. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-loader.h +5 -3
  75. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-saver.h +37 -0
  76. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-model.h +24 -2
  77. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-vocab.h +6 -0
  78. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama.h +102 -142
  79. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/minja/chat-template.hpp +23 -11
  80. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/minja/minja.hpp +186 -127
  81. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Info.plist +0 -0
  82. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/_CodeSignature/CodeResources +1 -1
  83. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/cactus +0 -0
  84. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/ggml-llama-sim.metallib +0 -0
  85. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/cactus.h +92 -5
  86. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/cactus_ffi.h +268 -0
  87. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/chat.h +2 -0
  88. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/common.h +42 -51
  89. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-backend.h +4 -4
  90. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-common.h +12 -6
  91. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpp.h +1 -1
  92. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu.h +5 -0
  93. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-impl.h +52 -18
  94. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-metal-impl.h +106 -14
  95. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-opt.h +49 -28
  96. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml.h +87 -106
  97. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-arch.h +16 -0
  98. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-batch.h +2 -1
  99. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-chat.h +7 -2
  100. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-context.h +44 -33
  101. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-cparams.h +1 -0
  102. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-graph.h +83 -17
  103. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-hparams.h +44 -2
  104. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-kv-cache.h +407 -179
  105. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-memory.h +13 -2
  106. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-model-loader.h +5 -3
  107. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-model-saver.h +37 -0
  108. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-model.h +24 -2
  109. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-vocab.h +6 -0
  110. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama.h +102 -142
  111. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/minja/chat-template.hpp +23 -11
  112. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/minja/minja.hpp +186 -127
  113. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Info.plist +0 -0
  114. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/cactus +0 -0
  115. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/ggml-llama.metallib +0 -0
  116. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/cactus.h +92 -5
  117. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/cactus_ffi.h +268 -0
  118. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/chat.h +2 -0
  119. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/common.h +42 -51
  120. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-backend.h +4 -4
  121. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-common.h +12 -6
  122. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpp.h +1 -1
  123. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu.h +5 -0
  124. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-impl.h +52 -18
  125. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-metal-impl.h +106 -14
  126. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-opt.h +49 -28
  127. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml.h +87 -106
  128. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-arch.h +16 -0
  129. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-batch.h +2 -1
  130. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-chat.h +7 -2
  131. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-context.h +44 -33
  132. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-cparams.h +1 -0
  133. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-graph.h +83 -17
  134. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-hparams.h +44 -2
  135. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-kv-cache.h +407 -179
  136. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-memory.h +13 -2
  137. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-loader.h +5 -3
  138. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-saver.h +37 -0
  139. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model.h +24 -2
  140. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-vocab.h +6 -0
  141. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama.h +102 -142
  142. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/minja/chat-template.hpp +23 -11
  143. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/minja/minja.hpp +186 -127
  144. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Info.plist +0 -0
  145. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/_CodeSignature/CodeResources +1 -1
  146. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/cactus +0 -0
  147. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/ggml-llama-sim.metallib +0 -0
  148. package/lib/commonjs/NativeCactus.js +1 -0
  149. package/lib/commonjs/NativeCactus.js.map +1 -1
  150. package/lib/commonjs/index.js +112 -0
  151. package/lib/commonjs/index.js.map +1 -1
  152. package/lib/commonjs/tools.js +118 -0
  153. package/lib/commonjs/tools.js.map +1 -0
  154. package/lib/module/NativeCactus.js +3 -0
  155. package/lib/module/NativeCactus.js.map +1 -1
  156. package/lib/module/index.js +87 -1
  157. package/lib/module/index.js.map +1 -1
  158. package/lib/module/tools.js +110 -0
  159. package/lib/module/tools.js.map +1 -0
  160. package/lib/typescript/NativeCactus.d.ts +30 -1
  161. package/lib/typescript/NativeCactus.d.ts.map +1 -1
  162. package/lib/typescript/index.d.ts +21 -2
  163. package/lib/typescript/index.d.ts.map +1 -1
  164. package/lib/typescript/tools.d.ts +38 -0
  165. package/lib/typescript/tools.d.ts.map +1 -0
  166. package/package.json +6 -3
  167. package/src/NativeCactus.ts +62 -1
  168. package/src/index.ts +113 -2
  169. package/src/tools.ts +127 -0
  170. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
  171. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
  172. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
  173. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
  174. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/sgemm.h +0 -14
  175. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
  176. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
  177. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
  178. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
  179. package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/sgemm.h +0 -14
  180. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
  181. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
  182. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
  183. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
  184. package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/sgemm.h +0 -14
  185. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
  186. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
  187. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
  188. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
  189. package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/sgemm.h +0 -14
package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/common.h

@@ -6,6 +6,7 @@
 
 #include <set>
 #include <string>
+#include <string_view>
 #include <vector>
 #include <sstream>
 
@@ -42,17 +43,6 @@ extern const char * LLAMA_BUILD_TARGET;
 
 struct common_control_vector_load_info;
 
-#define print_build_info() do { \
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
-} while(0)
-
-// build info
-extern int LLAMA_BUILD_NUMBER;
-extern char const *LLAMA_COMMIT;
-extern char const *LLAMA_COMPILER;
-extern char const *LLAMA_BUILD_TARGET;
-
 //
 // CPU utils
 //
@@ -77,7 +67,6 @@ enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -87,7 +76,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
@@ -107,6 +96,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC = 8,
     COMMON_SAMPLER_TYPE_INFILL = 9,
     COMMON_SAMPLER_TYPE_PENALTIES = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -132,10 +122,6 @@ struct common_grammar_trigger {
     common_grammar_trigger_type type;
     std::string value;
     llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };
 
 // sampling parameters
@@ -176,6 +162,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
@@ -195,6 +182,13 @@
     std::string print() const;
 };
 
+struct common_params_model {
+    std::string path = ""; // model local path // NOLINT
+    std::string url = ""; // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
 
@@ -208,19 +202,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = ""; // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };
 
 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = ""; // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 
     std::string speaker_file = ""; // speaker file path // NOLINT
 
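The new common_params_model struct consolidates the model, model_url, hf_repo, and hf_file strings that were previously duplicated across common_params, common_params_speculative, and common_params_vocoder. A minimal sketch of a call site after the change (the paths and repo names below are illustrative, not taken from the package):

#include "common.h" // declares common_params and common_params_model

int main() {
    common_params params;

    // every model source now lives on one struct:
    params.model.path    = "/models/llama.gguf";  // local file, or ...
    params.model.hf_repo = "org/repo";            // ... a Hugging Face repo
    params.model.hf_file = "model-Q4_K_M.gguf";   //     plus a file inside it

    // the speculative draft model and the vocoder reuse the same struct:
    params.speculative.model.path = "/models/draft.gguf";
    params.vocoder.model.url      = "https://example.com/vocoder.gguf";
    return 0;
}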
@@ -279,12 +265,10 @@
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
 
-    std::string model = ""; // model path // NOLINT
+    struct common_params_model model;
+
     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
     std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -298,6 +282,7 @@
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -339,17 +324,17 @@
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_op_offload = false; // globally disable offload host tensor operations to device
 
     bool single_turn = false; // single turn chat conversation
 
@@ -361,8 +346,10 @@
 
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
-    // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    // multimodal models (see tools/mtmd)
+    struct common_params_model mmproj;
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -385,6 +372,7 @@
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;
 
@@ -428,13 +416,14 @@
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
+    bool parse_special = false; // whether to parse special tokens during imatrix tokenization
 
     // cvector-generator params
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
@@ -443,6 +432,11 @@
 
     // common params
     std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void * load_progress_callback_user_data = NULL;
 };
 
 // call once at the start of a program if it uses libcommon
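The new load_progress_callback hook documents its own protocol in the comments above: it receives a progress value in [0.0, 1.0] and returns false to abort the load. A minimal sketch of wiring it up, assuming llama.cpp's usual bool (*llama_progress_callback)(float progress, void * user_data) signature:

#include <cstdio>
#include "common.h"

// print a percentage and keep loading; returning false would abort the load
static bool on_load_progress(float progress, void * /*user_data*/) {
    std::fprintf(stderr, "\rloading model: %3.0f%%", progress * 100.0f);
    return true;
}

static void enable_progress_reporting(common_params & params) {
    params.load_progress_callback           = on_load_progress;
    params.load_progress_callback_user_data = nullptr;
}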
@@ -520,10 +514,9 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }
 
-static bool string_ends_with(const std::string & str,
-                             const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-}
+// While we wait for C++20's std::string::ends_with...
+bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
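string_ends_with moves from an inline std::string helper to an out-of-line std::string_view declaration, alongside the new string_find_partial_stop (useful when streaming, to hold back text that might be the start of a stop sequence). A sketch of definitions consistent with these declarations; the partial-stop semantics assumed here (return the index where a prefix of stop begins at the end of str, or npos) are an inference, not quoted from the package:

#include <string_view>

bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
    // same logic as the removed inline version, now on string_views
    return str.size() >= suffix.size() &&
           str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}

size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
    // longest prefix of `stop` that is a suffix of `str`
    for (size_t len = stop.size(); len > 0; len--) {
        if (string_ends_with(str, stop.substr(0, len))) {
            return str.size() - len;
        }
    }
    return std::string_view::npos;
}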
@@ -564,6 +557,8 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
+std::string get_model_endpoint();
+
 //
 // Batch utils
 //
@@ -630,16 +625,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
@@ -681,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
+
+//
+// training utils
+//
+
+lm_ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
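common_opt_dataset_init is the first of the new "training utils": it packs a token sequence into an lm_ggml_opt dataset, with stride controlling how far the sampling window advances between examples. A hedged usage sketch, assuming common_tokenize's existing overload that takes a context and text:

#include "common.h"

// build an optimizer dataset from a raw text corpus
static lm_ggml_opt_dataset_t dataset_from_text(llama_context * ctx, const std::string & text) {
    std::vector<llama_token> tokens = common_tokenize(ctx, text, /*add_special=*/true);
    return common_opt_dataset_init(ctx, tokens, /*stride=*/1);
}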
package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-backend.h

@@ -38,7 +38,7 @@ extern "C" {
     LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer (lm_ggml_backend_buffer_type_t buft, size_t size);
     LM_GGML_API size_t lm_ggml_backend_buft_get_alignment (lm_ggml_backend_buffer_type_t buft);
     LM_GGML_API size_t lm_ggml_backend_buft_get_max_size (lm_ggml_backend_buffer_type_t buft);
-    LM_GGML_API size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor);
+    LM_GGML_API size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, const struct lm_ggml_tensor * tensor);
     LM_GGML_API bool lm_ggml_backend_buft_is_host (lm_ggml_backend_buffer_type_t buft);
     LM_GGML_API lm_ggml_backend_dev_t lm_ggml_backend_buft_get_device (lm_ggml_backend_buffer_type_t buft);
 
@@ -59,7 +59,7 @@ extern "C" {
     LM_GGML_API enum lm_ggml_status lm_ggml_backend_buffer_init_tensor (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor);
     LM_GGML_API size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer);
     LM_GGML_API size_t lm_ggml_backend_buffer_get_max_size (lm_ggml_backend_buffer_t buffer);
-    LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor);
+    LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor);
     LM_GGML_API void lm_ggml_backend_buffer_clear (lm_ggml_backend_buffer_t buffer, uint8_t value);
     LM_GGML_API bool lm_ggml_backend_buffer_is_host (lm_ggml_backend_buffer_t buffer);
     LM_GGML_API void lm_ggml_backend_buffer_set_usage (lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage);
@@ -248,7 +248,7 @@ extern "C" {
     // preferrably to run on the same backend as the buffer
     lm_ggml_backend_buffer_set_usage(buf_weights, LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
-    sched = lm_ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, LM_GGML_DEFAULT_GRAPH_SIZE, false);
+    sched = lm_ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, LM_GGML_DEFAULT_GRAPH_SIZE, false, true);
 
     // initialize buffers from a max size graph (optional)
     reserve_graph = build_graph(sched, max_batch_size);
@@ -289,7 +289,7 @@ extern "C" {
     typedef bool (*lm_ggml_backend_sched_eval_callback)(struct lm_ggml_tensor * t, bool ask, void * user_data);
 
     // Initialize a backend scheduler, backends with low index are given priority over backends with high index
-    LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
     LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
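lm_ggml_backend_sched_new grows a trailing op_offload flag, which pairs with the new common_params.no_op_offload above; passing true keeps the previous behavior of offloading host tensor operations to the device. An illustrative call under that assumption:

#include "ggml-backend.h"

static lm_ggml_backend_sched_t make_sched(lm_ggml_backend_t * backends, int n_backends,
                                          bool no_op_offload) {
    return lm_ggml_backend_sched_new(backends, /*bufts=*/NULL, n_backends,
                                     LM_GGML_DEFAULT_GRAPH_SIZE,
                                     /*parallel=*/false,
                                     /*op_offload=*/!no_op_offload);
}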
package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-common.h

@@ -158,6 +158,12 @@ typedef sycl::half2 lm_ggml_half2;
 
 #endif // LM_GGML_COMMON_DECL_CUDA || LM_GGML_COMMON_DECL_HIP
 
+#ifdef _MSC_VER
+#define LM_GGML_EXTENSION
+#else // _MSC_VER
+#define LM_GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
     lm_ggml_half d; // delta
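LM_GGML_EXTENSION exists because the anonymous structs nested inside the quantization-block unions below are a GNU extension in C++; prefixing the declaration with __extension__ silences -Wpedantic on GCC/Clang, while MSVC accepts anonymous members without it and has no __extension__ keyword. A minimal illustration of the diagnostic the macro suppresses (the type names here are hypothetical):

// with g++ -Wpedantic the first member warns ("anonymous structs are a GNU
// extension" in clang's wording); the second is quiet
typedef struct {
    union {                       // warns under -Wpedantic
        struct { float d, m; };
        float dm[2];
    };
} plain_block;

typedef struct {
    __extension__ union {         // what LM_GGML_EXTENSION expands to off MSVC
        struct { float d, m; };
        float dm[2];
    };
} marked_block;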
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(lm_ggml_half) + QK4_0 / 2, "wrong q4_
 
 #define QK4_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(lm_ggml_half) + sizeof(uint32_t) + QK
 
 #define QK5_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(lm_ggml_half) + QK8_0, "wrong q8_0 bl
 
 #define QK8_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(lm_ggml_half) + QK_K / 4, "wrong tq2
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4]; // quants
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 +
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins

package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpp.h

@@ -24,7 +24,7 @@ typedef std::unique_ptr<lm_gguf_context, lm_gguf_context_deleter> lm_gguf_contex
 
 struct lm_ggml_gallocr_deleter { void operator()(lm_ggml_gallocr_t galloc) { lm_ggml_gallocr_free(galloc); } };
 
-typedef std::unique_ptr<lm_ggml_gallocr_t, lm_ggml_gallocr_deleter> lm_ggml_gallocr_ptr;
+typedef std::unique_ptr<lm_ggml_gallocr, lm_ggml_gallocr_deleter> lm_ggml_gallocr_ptr;
 
 // ggml-backend
 

package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu.h

@@ -133,6 +133,11 @@ extern "C" {
 
     LM_GGML_BACKEND_API lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void);
 
+    LM_GGML_BACKEND_API void lm_ggml_cpu_fp32_to_fp16(const float *, lm_ggml_fp16_t *, int64_t);
+    LM_GGML_BACKEND_API void lm_ggml_cpu_fp16_to_fp32(const lm_ggml_fp16_t *, float *, int64_t);
+    LM_GGML_BACKEND_API void lm_ggml_cpu_fp32_to_bf16(const float *, lm_ggml_bf16_t *, int64_t);
+    LM_GGML_BACKEND_API void lm_ggml_cpu_bf16_to_fp32(const lm_ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif
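The four newly exported CPU converters operate on contiguous buffers with an element count, per the signatures above. A small usage sketch:

#include <vector>
#include "ggml-cpu.h"

// round-trip a float buffer through fp16 using the exported helpers
static std::vector<float> fp16_roundtrip(const std::vector<float> & src) {
    std::vector<lm_ggml_fp16_t> half(src.size());
    std::vector<float>          out(src.size());
    lm_ggml_cpu_fp32_to_fp16(src.data(), half.data(), (int64_t) src.size());
    lm_ggml_cpu_fp16_to_fp32(half.data(), out.data(), (int64_t) out.size());
    return out; // values now carry fp16 precision (~3 significant digits)
}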
package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-impl.h

@@ -148,8 +148,14 @@ struct lm_ggml_map_custom2_op_params {
 
 struct lm_ggml_map_custom3_op_params {
     lm_ggml_custom3_op_t fun;
-    int n_tasks;
-    void * userdata;
+    int                  n_tasks;
+    void               * userdata;
+};
+
+struct lm_ggml_custom_op_params {
+    lm_ggml_custom_op_t fun;
+    int                 n_tasks;
+    void              * userdata;
 };
 
 // bitset
@@ -311,29 +317,28 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
 
 // FP16 to FP32 conversion
 
-#if defined(__ARM_NEON)
-    #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
-        typedef uint16_t lm_ggml_fp16_internal_t;
-    #else
-        typedef __fp16 lm_ggml_fp16_internal_t;
-    #endif
-#endif
-
-#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+//
+// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
+// for     MUSA compilers        , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
+//
+#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
     #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
     #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
 
     #define LM_GGML_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
 
     static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
-        lm_ggml_fp16_internal_t tmp;
+        __fp16 tmp;
         memcpy(&tmp, &h, sizeof(lm_ggml_fp16_t));
         return (float)tmp;
     }
 
     static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
         lm_ggml_fp16_t res;
-        lm_ggml_fp16_internal_t tmp = f;
+        __fp16 tmp = f;
         memcpy(&res, &tmp, sizeof(lm_ggml_fp16_t));
         return res;
     }
@@ -357,8 +362,8 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
     #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
 
     static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
-        register float f;
-        register double d;
+        float f;
+        double d;
         __asm__(
             "mtfprd %0,%2\n"
             "xscvhpdp %0,%0\n"
@@ -370,8 +375,8 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
     }
 
     static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
-        register double d;
-        register lm_ggml_fp16_t r;
+        double d;
+        lm_ggml_fp16_t r;
         __asm__( /* xscvdphp can work on double or single precision */
             "xscvdphp %0,%2\n"
             "mffprd %1,%0\n" :
@@ -381,6 +386,35 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
         return r;
     }
 
+#elif defined(__riscv) && defined(LM_GGML_RV_ZFH)
+
+    static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+        float f;
+        __asm__(
+            "fmv.h.x %[f], %[h]\n\t"
+            "fcvt.s.h %[f], %[f]"
+            : [f] "=&f" (f)
+            : [h] "r" (h)
+        );
+        return f;
+    }
+
+    static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
+        lm_ggml_fp16_t res;
+        __asm__(
+            "fcvt.h.s %[f], %[f]\n\t"
+            "fmv.x.h %[h], %[f]"
+            : [h] "=&r" (res)
+            : [f] "f" (f)
+        );
+        return res;
+    }
+
+    #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
+    #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
+    #define LM_GGML_FP16_TO_FP32(x) LM_GGML_COMPUTE_FP16_TO_FP32(x)
+    #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
+
 #else
 
     // FP16 <-> FP32
  // FP16 <-> FP32
@@ -456,7 +490,7 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
456
490
  #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
457
491
  #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
458
492
 
459
- #endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
493
+ #endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
460
494
 
461
495
  // precomputed f32 table for f16 (256 KB)
462
496
  // defined in ggml.c, initialized in lm_ggml_init()
@@ -1,6 +1,70 @@
1
1
  #ifndef LM_GGML_METAL_IMPL
2
2
  #define LM_GGML_METAL_IMPL
3
3
 
4
+ // kernel parameters for mat-vec threadgroups
5
+ //
6
+ // N_R0: number of src0 rows to process per simdgroup
7
+ // N_SG: number of simdgroups per threadgroup
8
+ //
9
+ // TODO: for optimal performance, become function of the device and work size
10
+
11
+ #define N_R0_Q4_0 4
12
+ #define N_SG_Q4_0 2
13
+
14
+ #define N_R0_Q4_1 4
15
+ #define N_SG_Q4_1 2
16
+
17
+ #define N_R0_Q5_0 4
18
+ #define N_SG_Q5_0 2
19
+
20
+ #define N_R0_Q5_1 4
21
+ #define N_SG_Q5_1 2
22
+
23
+ #define N_R0_Q8_0 4
24
+ #define N_SG_Q8_0 2
25
+
26
+ #define N_R0_Q2_K 4
27
+ #define N_SG_Q2_K 2
28
+
29
+ #define N_R0_Q3_K 2
30
+ #define N_SG_Q3_K 2
31
+
32
+ #define N_R0_Q4_K 4
33
+ #define N_SG_Q4_K 2
34
+
35
+ #define N_R0_Q5_K 2
36
+ #define N_SG_Q5_K 2
37
+
38
+ #define N_R0_Q6_K 1
39
+ #define N_SG_Q6_K 2
40
+
41
+ #define N_R0_IQ1_S 4
42
+ #define N_SG_IQ1_S 2
43
+
44
+ #define N_R0_IQ1_M 4
45
+ #define N_SG_IQ1_M 2
46
+
47
+ #define N_R0_IQ2_XXS 4
48
+ #define N_SG_IQ2_XXS 2
49
+
50
+ #define N_R0_IQ2_XS 4
51
+ #define N_SG_IQ2_XS 2
52
+
53
+ #define N_R0_IQ2_S 4
54
+ #define N_SG_IQ2_S 2
55
+
56
+ #define N_R0_IQ3_XXS 4
57
+ #define N_SG_IQ3_XXS 2
58
+
59
+ #define N_R0_IQ3_S 4
60
+ #define N_SG_IQ3_S 2
61
+
62
+ #define N_R0_IQ4_NL 2
63
+ #define N_SG_IQ4_NL 2
64
+
65
+ #define N_R0_IQ4_XS 2
66
+ #define N_SG_IQ4_XS 2
67
+
4
68
  // kernel argument structs
5
69
  //
6
70
  // - element counters (e.g. ne00) typically use int32_t to reduce register usage
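Per the comment above, each simdgroup handles N_R0 rows of src0 and each threadgroup contains N_SG simdgroups, so one threadgroup covers N_R0 * N_SG rows. A sketch of the resulting dispatch arithmetic, assuming ne01 is the src0 row count as in the kargs structs that follow:

// threadgroups needed along the row dimension for the Q4_0 mat-vec kernel
static int n_threadgroups_q4_0(int ne01) {
    const int rows_per_tg = N_R0_Q4_0 * N_SG_Q4_0; // 4 rows/simdgroup * 2 simdgroups = 8
    return (ne01 + rows_per_tg - 1) / rows_per_tg; // ceil division
}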
@@ -143,6 +207,10 @@ typedef struct {
     float attn_factor;
     float beta_fast;
     float beta_slow;
+    int32_t sect_0;
+    int32_t sect_1;
+    int32_t sect_2;
+    int32_t sect_3;
 } lm_ggml_metal_kargs_rope;
 
 typedef struct {
@@ -155,9 +223,12 @@
     int32_t ne11;
     int32_t ne_12_2; // assume K and V are same shape
     int32_t ne_12_3;
-    uint64_t nb_12_1;
-    uint64_t nb_12_2;
-    uint64_t nb_12_3;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    uint64_t nb21;
+    uint64_t nb22;
+    uint64_t nb23;
     uint64_t nb31;
     int32_t ne1;
    int32_t ne2;
@@ -232,21 +303,42 @@
 } lm_ggml_metal_kargs_mul_mv_ext;
 
 typedef struct {
-    int32_t nei0;
-    int32_t nei1;
-    uint64_t nbi1;
+    int32_t ne10;
+    int32_t ne11; // n_expert_used (bcast)
+    uint64_t nb11;
+    uint64_t nb12;
+    int32_t neh11; // n_tokens
+    uint64_t nbh11;
+    int32_t ne20; // n_expert_used
+    uint64_t nb21;
+} lm_ggml_metal_kargs_mul_mm_id_map0;
+
+typedef struct {
+    int32_t ne20; // n_expert_used
+    int32_t neh0;
+    int32_t neh1;
+    uint64_t nbh1;
+    uint64_t nbh2;
+    int32_t ne0;
+    uint64_t nb1;
+    uint64_t nb2;
+} lm_ggml_metal_kargs_mul_mm_id_map1;
+
+typedef struct {
     int32_t ne00;
     int32_t ne02;
     uint64_t nb01;
     uint64_t nb02;
-    int32_t ne11;
-    int32_t ne12;
-    int32_t ne13;
-    uint64_t nb10;
-    uint64_t nb11;
-    uint64_t nb12;
-    int32_t ne0;
-    int32_t ne1;
+    uint64_t nb03;
+    int32_t neh12;
+    uint64_t nbh10;
+    uint64_t nbh11;
+    uint64_t nbh12;
+    uint64_t nbh13;
+    int32_t neh0;
+    int32_t neh1;
+    int16_t r2;
+    int16_t r3;
 } lm_ggml_metal_kargs_mul_mm_id;
 
 typedef struct {