whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
data/ext/sources/examples/server/server.cpp

@@ -5,6 +5,7 @@
 #include "httplib.h"
 #include "json.hpp"
 
+#include <cfloat>
 #include <chrono>
 #include <cmath>
 #include <cstdio>
@@ -13,10 +14,23 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <memory>
+#include <csignal>
+#include <atomic>
+#include <functional>
+#include <cstdlib>
+#if defined (_WIN32)
+#include <windows.h>
+#endif
 
 using namespace httplib;
 using json = nlohmann::ordered_json;
 
+enum server_state {
+    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,         // Server is ready and model is loaded
+};
+
 namespace {
 
 // output formats
@@ -26,6 +40,20 @@ const std::string srt_format = "srt";
 const std::string vjson_format = "verbose_json";
 const std::string vtt_format = "vtt";
 
+std::function<void(int)> shutdown_handler;
+std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
+
+inline void signal_handler(int signal) {
+    if (is_terminating.test_and_set()) {
+        // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
+        // this is for better developer experience, we can remove when the server is stable enough
+        fprintf(stderr, "Received second interrupt, terminating immediately.\n");
+        exit(1);
+    }
+
+    shutdown_handler(signal);
+}
+
 struct server_params
 {
     std::string hostname = "127.0.0.1";
@@ -90,6 +118,16 @@ struct whisper_params {
     std::string openvino_encode_device = "CPU";
 
     std::string dtw = "";
+
+    // Voice Activity Detection (VAD) parameters
+    bool        vad                        = false;
+    std::string vad_model                  = "";
+    float       vad_threshold              = 0.5f;
+    int         vad_min_speech_duration_ms = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s  = FLT_MAX;
+    int         vad_speech_pad_ms          = 30;
+    float       vad_samples_overlap        = 0.1f;
 };
 
 void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
@@ -140,6 +178,18 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -nc,       --no-context      [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
     fprintf(stderr, "  -ng,       --no-gpu          [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
     fprintf(stderr, "  -fa,       --flash-attn      [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
+    // Voice Activity Detection (VAD) parameters
+    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+    fprintf(stderr, "             --vad                           [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME               [%-7s] VAD model path\n", params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N               [%-7.2f] VAD threshold for speech recognition (0.0-1.0)\n", params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms N  [%-7d] VAD min speech duration in ms\n", params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N [%-7d] VAD min silence duration in ms (to split segments)\n", params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s N   [%-7s] VAD max speech duration in s (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
+                                                                       std::string("FLT_MAX").c_str() :
+                                                                       std::to_string(params.vad_max_speech_duration_s).c_str());
+    fprintf(stderr, "  -vp N,     --vad-speech-pad-ms N           [%-7d] VAD speech padding in ms (extend segments)\n", params.vad_speech_pad_ms);
+    fprintf(stderr, "  -vo N,     --vad-samples-overlap N         [%-7.2f] VAD samples overlap in s (between segments)\n", params.vad_samples_overlap);
     fprintf(stderr, "\n");
 }
 
@@ -195,6 +245,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
     else if (                 arg == "--request-path")   { sparams.request_path   = argv[++i]; }
     else if (                 arg == "--inference-path") { sparams.inference_path = argv[++i]; }
     else if (                 arg == "--convert")        { sparams.ffmpeg_converter = true; }
+
+    // Voice Activity Detection (VAD)
+    else if (                 arg == "--vad")                          { params.vad = true; }
+    else if (arg == "-vm"   || arg == "--vad-model")                   { params.vad_model = argv[++i]; }
+    else if (arg == "-vt"   || arg == "--vad-threshold")               { params.vad_threshold = std::stof(argv[++i]); }
+    else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")  { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms") { params.vad_min_silence_duration_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")   { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
+    else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")           { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vo"   || arg == "--vad-samples-overlap")         { params.vad_samples_overlap = std::stof(argv[++i]); }
     else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
         whisper_print_usage(argc, argv, params, sparams);
@@ -511,11 +571,41 @@ void get_req_parameters(const Request & req, whisper_params & params)
     {
         params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
     }
+    if (req.has_file("vad"))
+    {
+        params.vad = parse_str_to_bool(req.get_file_value("vad").content);
+    }
+    if (req.has_file("vad_threshold"))
+    {
+        params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
+    }
+    if (req.has_file("vad_min_speech_duration_ms"))
+    {
+        params.vad_min_speech_duration_ms = std::stof(req.get_file_value("vad_min_speech_duration_ms").content);
+    }
+    if (req.has_file("vad_min_silence_duration_ms"))
+    {
+        params.vad_min_silence_duration_ms = std::stof(req.get_file_value("vad_min_silence_duration_ms").content);
+    }
+    if (req.has_file("vad_max_speech_duration_s"))
+    {
+        params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
+    }
+    if (req.has_file("vad_speech_pad_ms"))
+    {
+        params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
+    }
+    if (req.has_file("vad_samples_overlap"))
+    {
+        params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
+    }
 }
 
 } // namespace
 
 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
     whisper_params params;
     server_params sparams;
 
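For context, the VAD form fields parsed above can be exercised with the same cpp-httplib library the server is built on. A minimal client-side sketch, assuming the default 127.0.0.1:8080 host/port and the default /inference path (both configurable via server_params), and a hypothetical local jfk.wav file:

    // hedged client-side sketch, not part of the package
    #include "httplib.h"
    #include <cstdio>
    #include <fstream>
    #include <sstream>

    int main() {
        std::ifstream f("jfk.wav", std::ios::binary); // hypothetical input file
        std::stringstream audio;
        audio << f.rdbuf();

        httplib::Client cli("127.0.0.1", 8080); // assumption: default host/port

        // field names match get_req_parameters() above
        httplib::MultipartFormDataItems items = {
            { "file",          audio.str(), "jfk.wav", "audio/wav" },
            { "vad",           "true",      "",        "" },
            { "vad_threshold", "0.5",       "",        "" },
        };

        auto res = cli.Post("/inference", items); // assumption: default inference path
        if (res) {
            printf("%d: %s\n", res->status, res->body.c_str());
        }
        return 0;
    }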
@@ -584,13 +674,19 @@ int main(int argc, char ** argv) {
         if (params.dtw == "large.v3") {
             cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
         }
-
+        if (params.dtw == "large.v3.turbo") {
+            cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3_TURBO;
+        }
+
         if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
             fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
             return 3;
         }
     }
 
+    std::unique_ptr<httplib::Server> svr = std::make_unique<httplib::Server>();
+    std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
+
     struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
 
     if (ctx == nullptr) {
@@ -600,9 +696,10 @@ int main(int argc, char ** argv) {
 
     // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
     whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
+    state.store(SERVER_STATE_READY);
+
 
-    Server svr;
-    svr.set_default_headers({{"Server", "whisper.cpp"},
+    svr->set_default_headers({{"Server", "whisper.cpp"},
                              {"Access-Control-Allow-Origin", "*"},
                              {"Access-Control-Allow-Headers", "content-type, authorization"}});
 
@@ -681,15 +778,15 @@ int main(int argc, char ** argv) {
     whisper_params default_params = params;
 
     // this is only called if no index.html is found in the public --path
-    svr.Get(sparams.request_path + "/", [&default_content](const Request &, Response &res){
+    svr->Get(sparams.request_path + "/", [&](const Request &, Response &res){
         res.set_content(default_content, "text/html");
         return false;
     });
 
-    svr.Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
+    svr->Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
     });
 
-    svr.Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
+    svr->Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
         // acquire whisper model mutex lock
         std::lock_guard<std::mutex> lock(whisper_mutex);
 
@@ -827,6 +924,16 @@ int main(int argc, char ** argv) {
 
         wparams.suppress_nst     = params.suppress_nst;
 
+        wparams.vad            = params.vad;
+        wparams.vad_model_path = params.vad_model.c_str();
+
+        wparams.vad_params.threshold               = params.vad_threshold;
+        wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+        wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+        wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
+        wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
+        wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
+
         whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
 
         // this callback is called on each new segment
@@ -995,8 +1102,9 @@ int main(int argc, char ** argv) {
         // reset params to their defaults
        params = default_params;
     });
-    svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
+    svr->Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
         std::lock_guard<std::mutex> lock(whisper_mutex);
+        state.store(SERVER_STATE_LOADING_MODEL);
         if (!req.has_file("model"))
         {
             fprintf(stderr, "error: no 'model' field in the request\n");
@@ -1028,18 +1136,25 @@ int main(int argc, char ** argv) {
         // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
         whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
 
+        state.store(SERVER_STATE_READY);
         const std::string success = "Load was successful!";
         res.set_content(success, "application/text");
 
         // check if the model is in the file system
     });
 
-    svr.Get(sparams.request_path + "/health", [&](const Request &, Response &res){
-        const std::string health_response = "{\"status\":\"ok\"}";
-        res.set_content(health_response, "application/json");
+    svr->Get(sparams.request_path + "/health", [&](const Request &, Response &res){
+        server_state current_state = state.load();
+        if (current_state == SERVER_STATE_READY) {
+            const std::string health_response = "{\"status\":\"ok\"}";
+            res.set_content(health_response, "application/json");
+        } else {
+            res.set_content("{\"status\":\"loading model\"}", "application/json");
+            res.status = 503;
+        }
     });
 
-    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
+    svr->set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
         const char fmt[] = "500 Internal Server Error\n%s";
         char buf[BUFSIZ];
         try {
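The reworked /health handler above distinguishes a server that is merely up from one whose model is loaded: 200 with {"status":"ok"} once SERVER_STATE_READY is stored, and 503 with {"status":"loading model"} before that (and again while /load swaps models). A minimal readiness poll, sketched with cpp-httplib under the same default host/port assumption:

    // hedged sketch: poll /health until the model is loaded
    #include "httplib.h"
    #include <chrono>
    #include <thread>

    bool wait_until_model_ready(int max_attempts = 30) {
        httplib::Client cli("127.0.0.1", 8080); // assumption: default host/port
        for (int i = 0; i < max_attempts; ++i) {
            auto res = cli.Get("/health");
            if (res && res->status == 200) {
                return true; // {"status":"ok"}
            }
            // no response yet, or 503 {"status":"loading model"} -> retry
            std::this_thread::sleep_for(std::chrono::seconds(1));
        }
        return false;
    }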
@@ -1053,7 +1168,7 @@ int main(int argc, char ** argv) {
         res.status = 500;
     });
 
-    svr.set_error_handler([](const Request &req, Response &res) {
+    svr->set_error_handler([](const Request &req, Response &res) {
         if (res.status == 400) {
             res.set_content("Invalid request", "text/plain");
         } else if (res.status != 500) {
@@ -1063,10 +1178,10 @@ int main(int argc, char ** argv) {
     });
 
     // set timeouts and change hostname and port
-    svr.set_read_timeout(sparams.read_timeout);
-    svr.set_write_timeout(sparams.write_timeout);
+    svr->set_read_timeout(sparams.read_timeout);
+    svr->set_write_timeout(sparams.write_timeout);
 
-    if (!svr.bind_to_port(sparams.hostname, sparams.port))
+    if (!svr->bind_to_port(sparams.hostname, sparams.port))
     {
         fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
                 sparams.hostname.c_str(), sparams.port);
@@ -1074,18 +1189,50 @@ int main(int argc, char ** argv) {
     }
 
     // Set the base directory for serving static files
-    svr.set_base_dir(sparams.public_path);
+    svr->set_base_dir(sparams.public_path);
 
     // to make it ctrl+clickable:
     printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
 
-    if (!svr.listen_after_bind())
-    {
-        return 1;
-    }
+    shutdown_handler = [&](int signal) {
+        printf("\nCaught signal %d, shutting down gracefully...\n", signal);
+        if (svr) {
+            svr->stop();
+        }
+    };
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = signal_handler;
+    sigemptyset (&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+    sigaction(SIGTERM, &sigint_action, NULL);
+#elif defined (_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+
+    // clean up function, to be called before exit
+    auto clean_up = [&]() {
+        whisper_print_timings(ctx);
+        whisper_free(ctx);
+    };
+
+    std::thread t([&] {
+        if (!svr->listen_after_bind()) {
+            fprintf(stderr, "error: server listen failed\n");
+        }
+    });
+
+    svr->wait_until_ready();
+
+    t.join();
 
-    whisper_print_timings(ctx);
-    whisper_free(ctx);
+    clean_up();
 
     return 0;
 }
data/ext/sources/examples/stream/stream.cpp

@@ -116,6 +116,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
 }
 
 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
     whisper_params params;
 
     if (whisper_params_parse(argc, argv, params) == false) {
@@ -161,6 +163,10 @@ int main(int argc, char ** argv) {
     cparams.flash_attn = params.flash_attn;
 
     struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+    if (ctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        return 2;
+    }
 
     std::vector<float> pcmf32    (n_samples_30s, 0.0f);
     std::vector<float> pcmf32_old;
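Both examples now call ggml_backend_load_all() before any whisper API call; when ggml is built with dynamically loadable backends, GPU and other accelerator backends are only registered once that call runs (with a static build it is harmless). A minimal sketch of the required ordering, with a placeholder model path:

    // hedged sketch: backend loading must precede context creation
    #include "whisper.h"
    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        ggml_backend_load_all(); // register all available backends first

        whisper_context_params cparams = whisper_context_default_params();
        struct whisper_context * ctx =
            whisper_init_from_file_with_params("models/ggml-base.en.bin", cparams); // placeholder path
        if (ctx == nullptr) {
            fprintf(stderr, "error: failed to initialize whisper context\n");
            return 2;
        }

        whisper_free(ctx);
        return 0;
    }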
data/ext/sources/examples/talk-llama/CMakeLists.txt

@@ -16,7 +16,10 @@ if (WHISPER_SDL2)
         llama-hparams.cpp
         llama-impl.cpp
         llama-io.cpp
-        llama-kv-cache.cpp
+        llama-kv-cache-unified.cpp
+        llama-kv-cache-unified-iswa.cpp
+        llama-memory-recurrent.cpp
+        llama-memory-hybrid.cpp
         llama-memory.cpp
         llama-mmap.cpp
         llama-model-loader.cpp
data/ext/sources/examples/talk-llama/llama-arch.cpp

@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_BERT,             "bert"             },
    { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe"   },
+   { LLM_ARCH_NEO_BERT,         "neo-bert"         },
    { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
    { LLM_ARCH_BLOOM,            "bloom"            },
    { LLM_ARCH_STABLELM,         "stablelm"         },
@@ -41,6 +42,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_GEMMA,            "gemma"            },
    { LLM_ARCH_GEMMA2,           "gemma2"           },
    { LLM_ARCH_GEMMA3,           "gemma3"           },
+   { LLM_ARCH_GEMMA3N,          "gemma3n"          },
    { LLM_ARCH_STARCODER2,       "starcoder2"       },
    { LLM_ARCH_MAMBA,            "mamba"            },
    { LLM_ARCH_XVERSE,           "xverse"           },
@@ -72,6 +74,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
    { LLM_ARCH_PLM,              "plm"              },
    { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+   { LLM_ARCH_DOTS1,            "dots1"            },
+   { LLM_ARCH_ARCEE,            "arcee"            },
+   { LLM_ARCH_ERNIE4_5,         "ernie4_5"         },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
@@ -144,6 +149,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SCALE,            "%s.attention.scale"            },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,   "%s.attention.key_length_mla"   },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
+   { LLM_KV_ATTENTION_LAYER_INDICES,    "%s.attention.layer_indices"    },
 
    { LLM_KV_ROPE_DIMENSION_COUNT,    "%s.rope.dimension_count"    },
    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -174,6 +180,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
    { LLM_KV_CONVNEXT_BLOCK_COUNT,      "%s.convnext.block_count"      },
 
+   { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+
    { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model"  },
    { LLM_KV_TOKENIZER_PRE,   "tokenizer.ggml.pre"    },
    { LLM_KV_TOKENIZER_LIST,  "tokenizer.ggml.tokens" },
@@ -192,13 +200,13 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_TOKENIZER_MASK_ID,              "tokenizer.ggml.mask_token_id"            },
    { LLM_KV_TOKENIZER_ADD_BOS,              "tokenizer.ggml.add_bos_token"            },
    { LLM_KV_TOKENIZER_ADD_EOS,              "tokenizer.ggml.add_eos_token"            },
+   { LLM_KV_TOKENIZER_ADD_SEP,              "tokenizer.ggml.add_sep_token"            },
    { LLM_KV_TOKENIZER_ADD_PREFIX,           "tokenizer.ggml.add_space_prefix"         },
    { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      "tokenizer.ggml.remove_extra_whitespaces" },
    { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap"     },
    { LLM_KV_TOKENIZER_HF_JSON,              "tokenizer.huggingface.json"              },
    { LLM_KV_TOKENIZER_RWKV,                 "tokenizer.rwkv.world"                    },
    { LLM_KV_TOKENIZER_CHAT_TEMPLATE,        "tokenizer.chat_template"                 },
-   { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,      "tokenizer.chat_template.%s"              },
    { LLM_KV_TOKENIZER_FIM_PRE_ID,           "tokenizer.ggml.fim_pre_token_id"         },
    { LLM_KV_TOKENIZER_FIM_SUF_ID,           "tokenizer.ggml.fim_suf_token_id"         },
    { LLM_KV_TOKENIZER_FIM_MID_ID,           "tokenizer.ggml.fim_mid_token_id"         },
@@ -242,6 +250,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_EXPS,  "blk.%d.ffn_up_exps" },
        },
    },
+   {
+       LLM_ARCH_ARCEE,
+       {
+           { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+           { LLM_TENSOR_OUTPUT,        "output" },
+           { LLM_TENSOR_ROPE_FREQS,    "rope_freqs" },
+           { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+           { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+           { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+       },
+   },
    {
        LLM_ARCH_LLAMA4,
        {
@@ -448,6 +474,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_TYPES,   "token_types" },
            { LLM_TENSOR_POS_EMBD,      "position_embd" },
            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+           { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv" },
            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
@@ -492,6 +519,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_EXPS,  "blk.%d.ffn_up_exps" },
        },
    },
+   {
+       LLM_ARCH_NEO_BERT,
+       {
+           { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+           { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+           { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+           { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+           { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+           { LLM_TENSOR_CLS,             "cls" },
+           { LLM_TENSOR_CLS_OUT,         "cls.output" },
+       },
+   },
    {
        LLM_ARCH_JINA_BERT_V2,
        {
@@ -892,6 +934,42 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
        },
    },
+   {
+       LLM_ARCH_GEMMA3N,
+       {
+           { LLM_TENSOR_TOKEN_EMBD,           "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM,          "output_norm" },
+           { LLM_TENSOR_ATTN_NORM,            "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_Q,               "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_Q_NORM,          "blk.%d.attn_q_norm" },
+           { LLM_TENSOR_ATTN_K,               "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_K_NORM,          "blk.%d.attn_k_norm" },
+           { LLM_TENSOR_ATTN_V,               "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT,             "blk.%d.attn_output" },
+           { LLM_TENSOR_ATTN_POST_NORM,       "blk.%d.post_attention_norm" },
+           { LLM_TENSOR_FFN_NORM,             "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_GATE,             "blk.%d.ffn_gate" },
+           { LLM_TENSOR_FFN_DOWN,             "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP,               "blk.%d.ffn_up" },
+           { LLM_TENSOR_FFN_POST_NORM,        "blk.%d.post_ffw_norm" },
+           { LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" },
+           { LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" },
+           { LLM_TENSOR_PER_LAYER_PROJ_NORM,  "per_layer_proj_norm" },
+           { LLM_TENSOR_ALTUP_UNEMBD_PROJ,    "altup_unembd_proj" },
+           { LLM_TENSOR_ALTUP_PROJ,           "altup_proj" },
+           { LLM_TENSOR_PER_LAYER_INP_GATE,   "blk.%d.inp_gate" },
+           { LLM_TENSOR_PER_LAYER_PROJ,       "blk.%d.proj" },
+           { LLM_TENSOR_PER_LAYER_POST_NORM,  "blk.%d.post_norm" },
+           { LLM_TENSOR_ALTUP_CORRECT_COEF,   "blk.%d.altup_correct_coef" },
+           { LLM_TENSOR_ALTUP_CORRECT_SCALE,  "blk.%d.altup_correct_scale" },
+           { LLM_TENSOR_ALTUP_PREDICT_COEF,   "blk.%d.altup_predict_coef" },
+           { LLM_TENSOR_ALTUP_ROUTER,         "blk.%d.altup_router" },
+           { LLM_TENSOR_ALTUP_ROUTER_NORM,    "blk.%d.altup_router_norm" },
+           { LLM_TENSOR_LAUREL_L,             "blk.%d.laurel_l" },
+           { LLM_TENSOR_LAUREL_R,             "blk.%d.laurel_r" },
+           { LLM_TENSOR_LAUREL_POST_NORM,     "blk.%d.laurel_post_norm" },
+       },
+   },
    {
        LLM_ARCH_STARCODER2,
        {
@@ -1553,6 +1631,51 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
        },
    },
+   {
+       LLM_ARCH_DOTS1,
+       {
+           { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+           { LLM_TENSOR_OUTPUT,             "output" },
+           { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+           { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+           { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+           { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+           { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+           { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+           { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+           { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+           { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+           { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+           { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+           { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+           { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+           { LLM_TENSOR_FFN_EXP_PROBS_B,    "blk.%d.exp_probs_b" },
+       }
+   },
+   {
+       LLM_ARCH_ERNIE4_5,
+       {
+           { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+           { LLM_TENSOR_OUTPUT,      "output" },
+           { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+           { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+           { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+       },
+   },
    {
        LLM_ARCH_UNKNOWN,
        {
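Each per-layer entry above is a printf-style pattern: the loader substitutes the block index for %d and appends a suffix such as "weight" or "bias" (the real expansion goes through LLM_TN / LLM_TN_IMPL::str()). An illustrative sketch of the naming scheme, not the actual implementation:

    // hedged sketch: how "blk.%d.attn_q" becomes a concrete GGUF tensor name
    #include <cstdio>
    #include <string>

    std::string tensor_name(const char * pattern, int bid, const char * suffix) {
        char buf[256];
        snprintf(buf, sizeof(buf), pattern, bid); // patterns without %d ignore bid
        return std::string(buf) + "." + suffix;
    }

    // tensor_name("blk.%d.attn_q", 0, "weight") -> "blk.0.attn_q.weight"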
@@ -1681,6 +1804,23 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_FFN_GATE_EXPS,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_UP_EXPS,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+   // altup / laurel (gemma 3n)
+   {LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},
+   {LLM_TENSOR_PER_LAYER_MODEL_PROJ, {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_PER_LAYER_PROJ_NORM,  {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
+   {LLM_TENSOR_ALTUP_PROJ,           {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_ALTUP_UNEMBD_PROJ,    {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_PER_LAYER_INP_GATE,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_PER_LAYER_PROJ,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_PER_LAYER_POST_NORM,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+   {LLM_TENSOR_ALTUP_CORRECT_COEF,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_ALTUP_CORRECT_SCALE,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+   {LLM_TENSOR_ALTUP_PREDICT_COEF,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_ALTUP_ROUTER,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_ALTUP_ROUTER_NORM,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+   {LLM_TENSOR_LAUREL_L,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_LAUREL_R,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+   {LLM_TENSOR_LAUREL_POST_NORM,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    // this tensor is loaded for T5, but never used
    {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
    {LLM_TENSOR_CONV1D,               {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
@@ -1704,8 +1844,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
 
 std::string LLM_KV::operator()(llm_kv kv) const {
-    return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
-                  : ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+    std::string name = ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+
+    if (suffix != nullptr) {
+        name += ".";
+        name += suffix;
+    }
+
+    return name;
 }
 
 std::string LLM_TN_IMPL::str() const {
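The rewritten LLM_KV::operator() above drops the dedicated "tokenizer.chat_template.%s" pattern (note the removed LLM_KV_TOKENIZER_CHAT_TEMPLATE_N entry): a suffix is now appended to any formatted key rather than spliced in through a second format argument. An illustration of the new behavior (the "rag" suffix is a hypothetical example):

    //   LLM_KV(LLM_ARCH_LLAMA, "rag")(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
    //     -> "tokenizer.chat_template" + "." + "rag"
    //     -> "tokenizer.chat_template.rag"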
@@ -1744,3 +1890,25 @@ llm_arch llm_arch_from_string(const std::string & name) {
 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
     return LLM_TENSOR_INFOS.at(tensor);
 }
+
+bool llm_arch_is_recurrent(const llm_arch & arch) {
+    switch (arch) {
+        case LLM_ARCH_MAMBA:
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_RWKV6QWEN2:
+        case LLM_ARCH_RWKV7:
+        case LLM_ARCH_ARWKV7:
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool llm_arch_is_hybrid(const llm_arch & arch) {
+    // TODO: There are currently no hybrid models! Once there are, this will be
+    //       the place to identify them
+    switch (arch) {
+        default:
+            return false;
+    }
+}
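These predicates pair with the memory implementations introduced in this release (llama-kv-cache-unified*.cpp, llama-memory-recurrent.cpp, llama-memory-hybrid.cpp in the file list above). A hedged sketch of how a caller might dispatch on them; the helper and enum are hypothetical, not part of the diff:

    // hypothetical dispatch on the new architecture predicates
    enum class memory_kind { unified_kv, recurrent, hybrid };

    memory_kind pick_memory(const llm_arch & arch) {
        if (llm_arch_is_hybrid(arch))    return memory_kind::hybrid;    // none yet, per the TODO above
        if (llm_arch_is_recurrent(arch)) return memory_kind::recurrent; // e.g. LLM_ARCH_MAMBA, RWKV
        return memory_kind::unified_kv;                                 // default attention KV cache
    }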