whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -61,7 +61,10 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
-    struct llama_kv_cache;
+
+    typedef struct llama_memory_i * llama_memory_t;
+
+    struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -240,18 +243,21 @@ extern "C" {
 
     typedef bool (*llama_progress_callback)(float progress, void * user_data);
 
-    // Input data for llama_decode
+    // Input data for llama_encode/llama_decode
     // A llama_batch object can contain input about one or many sequences
     // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
     //
     // - token  : the token ids of the input (used when embd is NULL)
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
-    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
+    //            (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
     // - seq_id : the sequence to which the respective token belongs
     //            (if set to NULL, the sequence ID will be assumed to be 0)
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
-    //            (if set to NULL, only the logits for last token will be returned)
+    //            (if set to NULL:
+    //            - if embeddings: all tokens are output
+    //            - if not:        only the last token is output
+    //            )
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -261,7 +267,7 @@ extern "C" {
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t       *  logits; // TODO: rename this to "output"
+        int8_t       *  logits;   // TODO: rename this to "output"
     } llama_batch;
 
     enum llama_model_kv_override_type {
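For reference, a minimal sketch of how a caller might satisfy the output convention documented above (assumes a populated std::vector<llama_token> tokens and an initialized llama_context * ctx; the snippet is illustrative, not part of the packaged diff):

    // Mark only the last prompt token for output; the per-token `logits`
    // flags (to be renamed "output") implement the rules documented above.
    llama_batch batch = llama_batch_init((int32_t) tokens.size(), /*embd*/ 0, /*n_seq_max*/ 1);
    batch.n_tokens = (int32_t) tokens.size();
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i]    = i == batch.n_tokens - 1; // only the last token is output
    }
    // ... llama_decode(ctx, batch); ... llama_batch_free(batch);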
@@ -366,6 +372,8 @@ extern "C" {
         bool no_perf;    // measure performance timings
         bool op_offload; // offload host tensor operations to device
         bool swa_full;   // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                         // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+                         //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
     };
 
     // model quantization parameters
@@ -382,6 +390,7 @@ extern "C" {
         void * imatrix;      // pointer to importance matrix data
         void * kv_overrides; // pointer to vector containing overrides
         void * tensor_types; // pointer to vector containing tensor types
+        void * prune_layers; // pointer to vector containing layer indices to prune
     } llama_model_quantize_params;
 
     typedef struct llama_logit_bias {
@@ -491,9 +500,11 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 
     LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API    struct llama_kv_cache * llama_get_kv_self (      struct llama_context * ctx);
+    LLAMA_API            llama_memory_t  llama_get_memory  (const struct llama_context * ctx);
     LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
+    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
+
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
 
@@ -502,10 +513,18 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_layer  (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head   (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv(const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_swa    (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 
+    // Returns the number of classifier outputs (only valid for classifier models)
+    // Undefined behavior for non-classifier models
+    LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
+
+    // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
+    LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
+
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
 
     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
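A short usage sketch for the two new classifier accessors (assumes a loaded llama_model * model; labels may be absent, hence the null check):

    const uint32_t n_cls = llama_model_n_cls_out(model); // classifier models only
    for (uint32_t i = 0; i < n_cls; ++i) {
        const char * label = llama_model_cls_label(model, i);
        printf("class %u: %s\n", i, label ? label : "(no label)");
    }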
@@ -606,7 +625,81 @@ extern "C" {
             int32_t   il_end);
 
     //
-    // KV cache
+    // Memory
+    //
+
+    // Clear the memory contents
+    // If data == true, the data buffers will also be cleared together with the metadata
+    LLAMA_API void llama_memory_clear(
+            llama_memory_t mem,
+                      bool data);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+    //   seq_id < 0 : match any sequence
+    //   p0 < 0     : [0,  p1]
+    //   p1 < 0     : [p0, inf)
+    LLAMA_API bool llama_memory_seq_rm(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    //   p0 < 0 : [0,  p1]
+    //   p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_cp(
+            llama_memory_t mem,
+              llama_seq_id seq_id_src,
+              llama_seq_id seq_id_dst,
+                 llama_pos p0,
+                 llama_pos p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_memory_seq_keep(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    //   p0 < 0 : [0,  p1]
+    //   p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_add(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                 llama_pos delta);
+
+    // Integer division of the positions by factor of `d > 1`
+    //   p0 < 0 : [0,  p1]
+    //   p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_div(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                       int d);
+
+    // Returns the smallest position present in the memory for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_min(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Returns the largest position present in the memory for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_max(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Check if the memory supports shifting
+    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
+
+    //
+    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
     //
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
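A sketch of the new memory API from the caller's side (assumes an initialized llama_context * ctx; the position values are arbitrary examples, not part of the diff):

    llama_memory_t mem = llama_get_memory(ctx);

    llama_memory_seq_rm(mem, /*seq_id*/ 0, /*p0*/ 128, /*p1*/ -1); // drop positions >= 128 (p1 < 0 means [p0, inf))
    llama_memory_seq_cp(mem, /*src*/ 0, /*dst*/ 1, -1, -1);        // mirror sequence 0 into sequence 1

    const llama_pos p_max = llama_memory_seq_pos_max(mem, 0);      // -1 if the sequence is empty
    if (p_max >= 0 && llama_memory_can_shift(mem)) {
        llama_memory_seq_add(mem, 0, 0, p_max + 1, /*delta*/ -16); // shift remaining positions left by 16
    }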
@@ -619,93 +712,103 @@ extern "C" {
             "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx);
+    DEPRECATED(LLAMA_API void llama_kv_self_clear(
+            struct llama_context * ctx),
+        "Use llama_memory_clear() instead");
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     //   seq_id < 0 : match any sequence
     //   p0 < 0     : [0,  p1]
     //   p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_self_seq_rm(
+    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
-                       llama_pos   p1);
+                       llama_pos   p1),
+        "Use llama_memory_seq_rm() instead");
 
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     //   p0 < 0 : [0,  p1]
     //   p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_cp(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
             struct llama_context * ctx,
                     llama_seq_id   seq_id_src,
                     llama_seq_id   seq_id_dst,
                        llama_pos   p0,
-                       llama_pos   p1);
+                       llama_pos   p1),
+        "Use llama_memory_seq_cp() instead");
 
     // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_self_seq_keep(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
             struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+                    llama_seq_id   seq_id),
+        "Use llama_memory_seq_keep() instead");
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     //   p0 < 0 : [0,  p1]
     //   p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_add(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
                        llama_pos   p1,
-                       llama_pos   delta);
+                       llama_pos   delta),
+        "Use llama_memory_seq_add() instead");
 
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     //   p0 < 0 : [0,  p1]
     //   p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_div(
+    DEPRECATED(void llama_kv_self_seq_div(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
                        llama_pos   p1,
-                             int   d);
+                             int   d),
+        "Use llama_memory_seq_div() instead");
 
     // Returns the smallest position present in the KV cache for the specified sequence
     // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
             struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+                    llama_seq_id   seq_id),
+        "Use llama_memory_seq_pos_min() instead");
 
     // Returns the largest position present in the KV cache for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
+    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+                    llama_seq_id   seq_id),
+        "Use llama_memory_seq_pos_max() instead");
 
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
-    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
+        "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
 
     // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
+    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
+        "use llama_memory_can_shift() instead");
 
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
+        "simply remove this call, updates are applied lazily on the next llama_decode()");
 
     //
     // State / sessions
     //
 
     // Returns the *actual* size in bytes of the state
-    // (logits, embedding and kv_cache)
+    // (logits, embedding and memory)
     // Only use when saving the state, not when restoring it, otherwise the size may be too small.
     LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
     LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
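In practice the migration from the deprecated KV-cache calls is mechanical; a sketch of the mapping (the `true` argument to llama_memory_clear mirrors the old clear-data behavior, which is an assumption worth verifying against the implementation):

    // old (deprecated)                       // new
    llama_kv_self_clear(ctx);                 // -> llama_memory_clear(llama_get_memory(ctx), /*data*/ true);
    llama_kv_self_seq_rm(ctx, id, p0, p1);    // -> llama_memory_seq_rm(llama_get_memory(ctx), id, p0, p1);
    llama_kv_self_defrag(ctx);                // -> remove: defragmentation is now driven by 'defrag_thold'
    llama_kv_self_update(ctx);                // -> remove: updates are applied lazily on the next llama_decode()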
@@ -761,12 +864,12 @@ extern "C" {
                           size_t   n_token_count),
         "use llama_state_save_file instead");
 
-    // Get the exact size needed to copy the KV cache of a single sequence
+    // Get the exact size needed to copy the state of a single sequence
     LLAMA_API size_t llama_state_seq_get_size(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
 
-    // Copy the KV cache of a single sequence into the specified buffer
+    // Copy the state of a single sequence into the specified buffer
     LLAMA_API size_t llama_state_seq_get_data(
             struct llama_context * ctx,
                          uint8_t * dst,
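A sketch of the single-sequence save/restore round trip these helpers support (llama_state_seq_set_data is the existing restore counterpart, not shown in this hunk; ctx is an initialized llama_context *):

    const size_t size = llama_state_seq_get_size(ctx, /*seq_id*/ 0);
    std::vector<uint8_t> buf(size);
    llama_state_seq_get_data(ctx, buf.data(), buf.size(), /*seq_id*/ 0);
    // ... later, possibly in another context using the same model:
    llama_state_seq_set_data(ctx, buf.data(), buf.size(), /*dest_seq_id*/ 0);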
@@ -832,21 +935,23 @@ extern "C" {
     // For encode-decoder contexts, processes the batch using the encoder.
     // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     //   0 - success
-    // < 0 - error. the KV cache state is restored to the state before this call
+    // < 0 - error. the memory state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
               struct llama_batch   batch);
 
     // Process a batch of tokens.
-    // Requires KV cache.
+    // Requires the context to have a memory.
     // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
-    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    // Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context
+    //   To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
+    // Upon other return values, the memory state is restored to the state before this call
     //    0 - success
     //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    //    2 - aborted
+    //    2 - aborted (processed ubatches will remain in the context's memory)
     //   -1 - invalid input batch
-    // < -1 - error
+    // < -1 - fatal error (processed ubatches will remain in the context's memory)
     LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
               struct llama_batch   batch);
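A sketch of handling the revised llama_decode() contract described above (sequence id 0 is an arbitrary example; a multi-sequence caller would query each sequence it submitted):

    const int32_t ret = llama_decode(ctx, batch);
    if (ret == 1) {
        // warning: no KV slot found - shrink the batch or enlarge the context, then retry
    } else if (ret == 2 || ret < -1) {
        // aborted or fatal: already-processed ubatches remain in the context's memory
        llama_memory_t mem = llama_get_memory(ctx);
        const llama_pos p_min = llama_memory_seq_pos_min(mem, 0);
        const llama_pos p_max = llama_memory_seq_pos_max(mem, 0);
        // decide whether to roll back (llama_memory_seq_rm) or resume from [p_min, p_max]
    }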
@@ -862,8 +967,8 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
 
-    // Set whether the model is in embeddings mode or not
-    // If true, embeddings will be returned but logits will not
+    // Set whether the context outputs embeddings or not
+    // TODO: rename to avoid confusion with llama_get_embeddings()
     LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
 
     // Set whether to use causal attention or not
@@ -912,7 +1017,7 @@ extern "C" {
 
     // Get the embeddings for a sequence id
     // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
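With the widened semantics, a classifier/reranker caller might read the scores like this (a sketch; assumes pooling_type == LLAMA_POOLING_TYPE_RANK, and pairing the result with the classifier labels introduced earlier in this diff is an inference, not documented here):

    const float * scores = llama_get_embeddings_seq(ctx, /*seq_id*/ 0);
    const struct llama_model * model = llama_get_model(ctx);
    const uint32_t n_cls = llama_model_n_cls_out(model);
    for (uint32_t i = 0; scores && i < n_cls; ++i) {
        const char * label = llama_model_cls_label(model, i);
        printf("%s: %.3f\n", label ? label : "(no label)", scores[i]);
    }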
@@ -942,6 +1047,7 @@ extern "C" {
 
     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
 
     LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
     LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -985,6 +1091,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
     /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
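A sketch of the resulting caller-side convention (the resize-and-retry pattern is common llama.cpp usage, not prescribed by this diff; INT32_MIN requires <climits> or <cstdint>, and vocab/text/text_len/tokens are assumed to exist):

    int32_t n = llama_tokenize(vocab, text, text_len,
                               tokens.data(), (int32_t) tokens.size(),
                               /*add_special*/ true, /*parse_special*/ false);
    if (n == INT32_MIN) {
        // overflow: the tokenization result does not fit in int32_t
    } else if (n < 0) {
        tokens.resize(-n); // -n is the required capacity
        n = llama_tokenize(vocab, text, text_len,
                           tokens.data(), (int32_t) tokens.size(),
                           true, false);
    }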
@@ -291,6 +291,8 @@ The transcript only includes text, it does not include markup like HTML and Markdown.
 {0}{4})";
 
 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
     whisper_params params;
 
     if (whisper_params_parse(argc, argv, params) == false) {
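Several examples in this release gain the same two lines, so that dynamically loadable ggml backends are registered before any whisper/ggml work begins. A minimal standalone sketch of the pattern (the model path is a placeholder, not from this diff):

    #include "ggml-backend.h"
    #include "whisper.h"

    int main(void) {
        ggml_backend_load_all(); // register built-in and dynamically loaded backends

        struct whisper_context_params cparams = whisper_context_default_params();
        struct whisper_context * wctx = whisper_init_from_file_with_params("ggml-base.en.bin", cparams);
        if (wctx == NULL) {
            return 1;
        }
        // ... run transcription ...
        whisper_free(wctx);
        return 0;
    }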
@@ -204,12 +204,17 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
     return conv.from_bytes(s);
@@ -83,6 +83,8 @@ static bool vad_params_parse(int argc, char ** argv, cli_params & params) {
 static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
 
 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
     cli_params cli_params;
 
     if (!vad_params_parse(argc, argv, cli_params)) {
@@ -109,6 +111,10 @@ int main(int argc, char ** argv) {
     struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
             cli_params.vad_model.c_str(),
             ctx_params);
+    if (vctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        return 2;
+    }
 
     // Detect speech in the input audio file.
     if (!whisper_vad_detect_speech(vctx, pcmf32.data(), pcmf32.size())) {
@@ -168,6 +168,8 @@ bool get_audio(std::vector<float> & pcmf32_cur) {
 }
 
 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
     whisper_params params;
 
     if (whisper_params_parse(argc, argv, params) == false) {
@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
 message(DEBUG "INS_ENB              : ${INS_ENB}")
 
 option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
-option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
+option(GGML_CPU_REPACK       "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI     "ggml: use KleidiAI optimized kernels if applicable" OFF)
 option(GGML_SSE42            "ggml: enable SSE 4.2" ${INS_ENB})
 option(GGML_AVX              "ggml: enable AVX" ${INS_ENB})
@@ -131,13 +131,14 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH           "ggml: enable riscv zfh" OFF)
 option(GGML_XTHEADVECTOR     "ggml: enable xtheadvector" OFF)
 option(GGML_VXE              "ggml: enable vxe" ON)
+option(GGML_NNPA             "ggml: enable nnpa" ON)
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
 set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
 
 
-if (WIN32)
+if (MINGW)
     set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
 endif()
 
@@ -172,12 +173,12 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS                    "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM                    "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN             "ggml: enable rocWMMA for FlashAttention" OFF)
+option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_VULKAN                        "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS          "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG                  "ggml: enable Vulkan debug output" OFF)
 option(GGML_VULKAN_MEMORY_DEBUG           "ggml: enable Vulkan memory debug output" OFF)
 option(GGML_VULKAN_SHADER_DEBUG_INFO      "ggml: enable Vulkan shader debug info" OFF)
-option(GGML_VULKAN_PERF                   "ggml: enable Vulkan perf output" OFF)
 option(GGML_VULKAN_VALIDATE               "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS              "ggml: run Vulkan tests" OFF)
 option(GGML_KOMPUTE                       "ggml: use Kompute" OFF)
@@ -368,6 +369,8 @@ if (MSVC)
         /wd4005 # Macro redefinition
         /wd4244 # Conversion from one type to another type, possible loss of data
         /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
+        /wd4305 # Conversion from 'type1' to 'type2', possible loss of data
+        /wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
         /wd4996 # Disable POSIX deprecation warnings
         /wd4702 # Unreachable code warnings
     )
@@ -387,4 +390,46 @@ if (MSVC)
     disable_msvc_warnings(ggml-cpu-skylakex)
     disable_msvc_warnings(ggml-cpu-icelake)
     disable_msvc_warnings(ggml-cpu-alderlake)
+
+    if (GGML_BUILD_EXAMPLES)
+        disable_msvc_warnings(common-ggml)
+        disable_msvc_warnings(common)
+
+        disable_msvc_warnings(mnist-common)
+        disable_msvc_warnings(mnist-eval)
+        disable_msvc_warnings(mnist-train)
+
+        disable_msvc_warnings(gpt-2-ctx)
+        disable_msvc_warnings(gpt-2-alloc)
+        disable_msvc_warnings(gpt-2-backend)
+        disable_msvc_warnings(gpt-2-sched)
+        disable_msvc_warnings(gpt-2-quantize)
+        disable_msvc_warnings(gpt-2-batched)
+
+        disable_msvc_warnings(gpt-j)
+        disable_msvc_warnings(gpt-j-quantize)
+
+        disable_msvc_warnings(magika)
+        disable_msvc_warnings(yolov3-tiny)
+        disable_msvc_warnings(sam)
+
+        disable_msvc_warnings(simple-ctx)
+        disable_msvc_warnings(simple-backend)
+    endif()
+
+    if (GGML_BUILD_TESTS)
+        disable_msvc_warnings(test-mul-mat)
+        disable_msvc_warnings(test-arange)
+        disable_msvc_warnings(test-backend-ops)
+        disable_msvc_warnings(test-cont)
+        disable_msvc_warnings(test-conv-transpose)
+        disable_msvc_warnings(test-conv-transpose-1d)
+        disable_msvc_warnings(test-conv1d)
+        disable_msvc_warnings(test-conv2d)
+        disable_msvc_warnings(test-conv2d-dw)
+        disable_msvc_warnings(test-customop)
+        disable_msvc_warnings(test-dup)
+        disable_msvc_warnings(test-opt)
+        disable_msvc_warnings(test-pool)
+    endif ()
 endif()
@@ -24,3 +24,27 @@ function(ggml_get_flags CCID CCVER)
     set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
     set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
 endfunction()
+
+function(ggml_get_system_arch)
+    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
+        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+        set(GGML_SYSTEM_ARCH "ARM" PARENT_SCOPE)
+    elseif (CMAKE_OSX_ARCHITECTURES      STREQUAL "x86_64" OR
+            CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+             CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
+        set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
+        set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+        set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
+        set(GGML_SYSTEM_ARCH "riscv64" PARENT_SCOPE)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+        set(GGML_SYSTEM_ARCH "s390x" PARENT_SCOPE)
+    else()
+        set(GGML_SYSTEM_ARCH "UNKNOWN" PARENT_SCOPE)
+    endif()
+endfunction()
@@ -339,7 +339,7 @@ extern "C" {
     typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 
     // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
 
     // Tensor initialization
     GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v   (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx       (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe       (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa      (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
 
@@ -133,6 +134,7 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
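A sketch exercising the two additions above (that GGML_NNPA reports IBM z Neural-Network-Processing-Assist support alongside the s390x VXE flag, and that the fp32-to-fp32 helper completes the conversion family with an identity copy, are both inferences from the declarations, not documented in this diff):

    #include <cstdio>
    #include "ggml-cpu.h"

    int main(void) {
        std::printf("vxe:  %d\n", ggml_cpu_has_vxe());
        std::printf("nnpa: %d\n", ggml_cpu_has_nnpa()); // new in this release

        float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float dst[4];
        ggml_cpu_fp32_to_fp32(src, dst, 4); // new fp32 -> fp32 conversion entry point
        return 0;
    }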