whispercpp 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +4 -2
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +150 -31
- data/ext/sources/examples/addon.node/index.js +3 -0
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +3 -2
- data/ext/sources/examples/cli/cli.cpp +3 -2
- data/ext/sources/examples/command/command.cpp +32 -8
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +2 -0
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +169 -22
- data/ext/sources/examples/stream/stream.cpp +6 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
- data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
- data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
- data/ext/sources/examples/talk-llama/llama-context.h +38 -17
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
- data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
- data/ext/sources/examples/talk-llama/llama-model.h +27 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
- data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
- data/ext/sources/examples/talk-llama/llama.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama.h +147 -40
- data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
- data/ext/sources/ggml/CMakeLists.txt +48 -3
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml.h +144 -5
- data/ext/sources/ggml/src/CMakeLists.txt +82 -24
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- data/ext/sources/ggml/src/ggml-common.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
- data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
- data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-impl.h +127 -183
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- data/ext/sources/ggml/src/ggml-quants.c +6 -8
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
- data/ext/sources/ggml/src/ggml.c +328 -48
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +24 -3
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +218 -169
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +56 -1
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +68 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +91 -43
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_params.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
@@ -5,6 +5,7 @@
|
|
5
5
|
#include "httplib.h"
|
6
6
|
#include "json.hpp"
|
7
7
|
|
8
|
+
#include <cfloat>
|
8
9
|
#include <chrono>
|
9
10
|
#include <cmath>
|
10
11
|
#include <cstdio>
|
@@ -13,10 +14,23 @@
|
|
13
14
|
#include <string>
|
14
15
|
#include <thread>
|
15
16
|
#include <vector>
|
17
|
+
#include <memory>
|
18
|
+
#include <csignal>
|
19
|
+
#include <atomic>
|
20
|
+
#include <functional>
|
21
|
+
#include <cstdlib>
|
22
|
+
#if defined (_WIN32)
|
23
|
+
#include <windows.h>
|
24
|
+
#endif
|
16
25
|
|
17
26
|
using namespace httplib;
|
18
27
|
using json = nlohmann::ordered_json;
|
19
28
|
|
29
|
+
enum server_state {
|
30
|
+
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
31
|
+
SERVER_STATE_READY, // Server is ready and model is loaded
|
32
|
+
};
|
33
|
+
|
20
34
|
namespace {
|
21
35
|
|
22
36
|
// output formats
|
@@ -26,6 +40,20 @@ const std::string srt_format = "srt";
|
|
26
40
|
const std::string vjson_format = "verbose_json";
|
27
41
|
const std::string vtt_format = "vtt";
|
28
42
|
|
43
|
+
std::function<void(int)> shutdown_handler;
|
44
|
+
std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
|
45
|
+
|
46
|
+
inline void signal_handler(int signal) {
|
47
|
+
if (is_terminating.test_and_set()) {
|
48
|
+
// in case it hangs, we can force terminate the server by hitting Ctrl+C twice
|
49
|
+
// this is for better developer experience, we can remove when the server is stable enough
|
50
|
+
fprintf(stderr, "Received second interrupt, terminating immediately.\n");
|
51
|
+
exit(1);
|
52
|
+
}
|
53
|
+
|
54
|
+
shutdown_handler(signal);
|
55
|
+
}
|
56
|
+
|
29
57
|
struct server_params
|
30
58
|
{
|
31
59
|
std::string hostname = "127.0.0.1";
|
@@ -90,6 +118,16 @@ struct whisper_params {
|
|
90
118
|
std::string openvino_encode_device = "CPU";
|
91
119
|
|
92
120
|
std::string dtw = "";
|
121
|
+
|
122
|
+
// Voice Activity Detection (VAD) parameters
|
123
|
+
bool vad = false;
|
124
|
+
std::string vad_model = "";
|
125
|
+
float vad_threshold = 0.5f;
|
126
|
+
int vad_min_speech_duration_ms = 250;
|
127
|
+
int vad_min_silence_duration_ms = 100;
|
128
|
+
float vad_max_speech_duration_s = FLT_MAX;
|
129
|
+
int vad_speech_pad_ms = 30;
|
130
|
+
float vad_samples_overlap = 0.1f;
|
93
131
|
};
|
94
132
|
|
95
133
|
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
|
@@ -140,6 +178,18 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
140
178
|
fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
|
141
179
|
fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
|
142
180
|
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
|
181
|
+
// Voice Activity Detection (VAD) parameters
|
182
|
+
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
|
183
|
+
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
|
184
|
+
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
|
185
|
+
fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
|
186
|
+
fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms);
|
187
|
+
fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
|
188
|
+
fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
|
189
|
+
std::string("FLT_MAX").c_str() :
|
190
|
+
std::to_string(params.vad_max_speech_duration_s).c_str());
|
191
|
+
fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
|
192
|
+
fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
|
143
193
|
fprintf(stderr, "\n");
|
144
194
|
}
|
145
195
|
|
@@ -195,6 +245,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
|
|
195
245
|
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
|
196
246
|
else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
|
197
247
|
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
|
248
|
+
|
249
|
+
// Voice Activity Detection (VAD)
|
250
|
+
else if ( arg == "--vad") { params.vad = true; }
|
251
|
+
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
|
252
|
+
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
|
253
|
+
else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
|
254
|
+
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
|
255
|
+
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
|
256
|
+
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
|
257
|
+
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
|
198
258
|
else {
|
199
259
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
200
260
|
whisper_print_usage(argc, argv, params, sparams);
|
@@ -511,11 +571,41 @@ void get_req_parameters(const Request & req, whisper_params & params)
|
|
511
571
|
{
|
512
572
|
params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
|
513
573
|
}
|
574
|
+
if (req.has_file("vad"))
|
575
|
+
{
|
576
|
+
params.vad = parse_str_to_bool(req.get_file_value("vad").content);
|
577
|
+
}
|
578
|
+
if (req.has_file("vad_threshold"))
|
579
|
+
{
|
580
|
+
params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
|
581
|
+
}
|
582
|
+
if (req.has_file("vad_min_speech_duration_ms"))
|
583
|
+
{
|
584
|
+
params.vad_min_speech_duration_ms = std::stof(req.get_file_value("vad_min_speech_duration_ms").content);
|
585
|
+
}
|
586
|
+
if (req.has_file("vad_min_silence_duration_ms"))
|
587
|
+
{
|
588
|
+
params.vad_min_silence_duration_ms = std::stof(req.get_file_value("vad_min_silence_duration_ms").content);
|
589
|
+
}
|
590
|
+
if (req.has_file("vad_max_speech_duration_s"))
|
591
|
+
{
|
592
|
+
params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
|
593
|
+
}
|
594
|
+
if (req.has_file("vad_speech_pad_ms"))
|
595
|
+
{
|
596
|
+
params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
|
597
|
+
}
|
598
|
+
if (req.has_file("vad_samples_overlap"))
|
599
|
+
{
|
600
|
+
params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
|
601
|
+
}
|
514
602
|
}
|
515
603
|
|
516
604
|
} // namespace
|
517
605
|
|
518
606
|
int main(int argc, char ** argv) {
|
607
|
+
ggml_backend_load_all();
|
608
|
+
|
519
609
|
whisper_params params;
|
520
610
|
server_params sparams;
|
521
611
|
|
@@ -584,13 +674,19 @@ int main(int argc, char ** argv) {
|
|
584
674
|
if (params.dtw == "large.v3") {
|
585
675
|
cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
|
586
676
|
}
|
587
|
-
|
677
|
+
if (params.dtw == "large.v3.turbo") {
|
678
|
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3_TURBO;
|
679
|
+
}
|
680
|
+
|
588
681
|
if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
|
589
682
|
fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
|
590
683
|
return 3;
|
591
684
|
}
|
592
685
|
}
|
593
686
|
|
687
|
+
std::unique_ptr<httplib::Server> svr = std::make_unique<httplib::Server>();
|
688
|
+
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
|
689
|
+
|
594
690
|
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
595
691
|
|
596
692
|
if (ctx == nullptr) {
|
@@ -600,9 +696,10 @@ int main(int argc, char ** argv) {
|
|
600
696
|
|
601
697
|
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
|
602
698
|
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
|
699
|
+
state.store(SERVER_STATE_READY);
|
700
|
+
|
603
701
|
|
604
|
-
Server
|
605
|
-
svr.set_default_headers({{"Server", "whisper.cpp"},
|
702
|
+
svr->set_default_headers({{"Server", "whisper.cpp"},
|
606
703
|
{"Access-Control-Allow-Origin", "*"},
|
607
704
|
{"Access-Control-Allow-Headers", "content-type, authorization"}});
|
608
705
|
|
@@ -681,15 +778,15 @@ int main(int argc, char ** argv) {
|
|
681
778
|
whisper_params default_params = params;
|
682
779
|
|
683
780
|
// this is only called if no index.html is found in the public --path
|
684
|
-
svr
|
781
|
+
svr->Get(sparams.request_path + "/", [&](const Request &, Response &res){
|
685
782
|
res.set_content(default_content, "text/html");
|
686
783
|
return false;
|
687
784
|
});
|
688
785
|
|
689
|
-
svr
|
786
|
+
svr->Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
|
690
787
|
});
|
691
788
|
|
692
|
-
svr
|
789
|
+
svr->Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
|
693
790
|
// acquire whisper model mutex lock
|
694
791
|
std::lock_guard<std::mutex> lock(whisper_mutex);
|
695
792
|
|
@@ -827,6 +924,16 @@ int main(int argc, char ** argv) {
|
|
827
924
|
|
828
925
|
wparams.suppress_nst = params.suppress_nst;
|
829
926
|
|
927
|
+
wparams.vad = params.vad;
|
928
|
+
wparams.vad_model_path = params.vad_model.c_str();
|
929
|
+
|
930
|
+
wparams.vad_params.threshold = params.vad_threshold;
|
931
|
+
wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
|
932
|
+
wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
|
933
|
+
wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
|
934
|
+
wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
|
935
|
+
wparams.vad_params.samples_overlap = params.vad_samples_overlap;
|
936
|
+
|
830
937
|
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
|
831
938
|
|
832
939
|
// this callback is called on each new segment
|
@@ -995,8 +1102,9 @@ int main(int argc, char ** argv) {
|
|
995
1102
|
// reset params to their defaults
|
996
1103
|
params = default_params;
|
997
1104
|
});
|
998
|
-
svr
|
1105
|
+
svr->Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
|
999
1106
|
std::lock_guard<std::mutex> lock(whisper_mutex);
|
1107
|
+
state.store(SERVER_STATE_LOADING_MODEL);
|
1000
1108
|
if (!req.has_file("model"))
|
1001
1109
|
{
|
1002
1110
|
fprintf(stderr, "error: no 'model' field in the request\n");
|
@@ -1028,18 +1136,25 @@ int main(int argc, char ** argv) {
|
|
1028
1136
|
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
|
1029
1137
|
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
|
1030
1138
|
|
1139
|
+
state.store(SERVER_STATE_READY);
|
1031
1140
|
const std::string success = "Load was successful!";
|
1032
1141
|
res.set_content(success, "application/text");
|
1033
1142
|
|
1034
1143
|
// check if the model is in the file system
|
1035
1144
|
});
|
1036
1145
|
|
1037
|
-
svr
|
1038
|
-
|
1039
|
-
|
1146
|
+
svr->Get(sparams.request_path + "/health", [&](const Request &, Response &res){
|
1147
|
+
server_state current_state = state.load();
|
1148
|
+
if (current_state == SERVER_STATE_READY) {
|
1149
|
+
const std::string health_response = "{\"status\":\"ok\"}";
|
1150
|
+
res.set_content(health_response, "application/json");
|
1151
|
+
} else {
|
1152
|
+
res.set_content("{\"status\":\"loading model\"}", "application/json");
|
1153
|
+
res.status = 503;
|
1154
|
+
}
|
1040
1155
|
});
|
1041
1156
|
|
1042
|
-
svr
|
1157
|
+
svr->set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
|
1043
1158
|
const char fmt[] = "500 Internal Server Error\n%s";
|
1044
1159
|
char buf[BUFSIZ];
|
1045
1160
|
try {
|
@@ -1053,7 +1168,7 @@ int main(int argc, char ** argv) {
|
|
1053
1168
|
res.status = 500;
|
1054
1169
|
});
|
1055
1170
|
|
1056
|
-
svr
|
1171
|
+
svr->set_error_handler([](const Request &req, Response &res) {
|
1057
1172
|
if (res.status == 400) {
|
1058
1173
|
res.set_content("Invalid request", "text/plain");
|
1059
1174
|
} else if (res.status != 500) {
|
@@ -1063,10 +1178,10 @@ int main(int argc, char ** argv) {
|
|
1063
1178
|
});
|
1064
1179
|
|
1065
1180
|
// set timeouts and change hostname and port
|
1066
|
-
svr
|
1067
|
-
svr
|
1181
|
+
svr->set_read_timeout(sparams.read_timeout);
|
1182
|
+
svr->set_write_timeout(sparams.write_timeout);
|
1068
1183
|
|
1069
|
-
if (!svr
|
1184
|
+
if (!svr->bind_to_port(sparams.hostname, sparams.port))
|
1070
1185
|
{
|
1071
1186
|
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
|
1072
1187
|
sparams.hostname.c_str(), sparams.port);
|
@@ -1074,18 +1189,50 @@ int main(int argc, char ** argv) {
|
|
1074
1189
|
}
|
1075
1190
|
|
1076
1191
|
// Set the base directory for serving static files
|
1077
|
-
svr
|
1192
|
+
svr->set_base_dir(sparams.public_path);
|
1078
1193
|
|
1079
1194
|
// to make it ctrl+clickable:
|
1080
1195
|
printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
|
1081
1196
|
|
1082
|
-
|
1083
|
-
|
1084
|
-
|
1085
|
-
|
1197
|
+
shutdown_handler = [&](int signal) {
|
1198
|
+
printf("\nCaught signal %d, shutting down gracefully...\n", signal);
|
1199
|
+
if (svr) {
|
1200
|
+
svr->stop();
|
1201
|
+
}
|
1202
|
+
};
|
1203
|
+
|
1204
|
+
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
1205
|
+
struct sigaction sigint_action;
|
1206
|
+
sigint_action.sa_handler = signal_handler;
|
1207
|
+
sigemptyset (&sigint_action.sa_mask);
|
1208
|
+
sigint_action.sa_flags = 0;
|
1209
|
+
sigaction(SIGINT, &sigint_action, NULL);
|
1210
|
+
sigaction(SIGTERM, &sigint_action, NULL);
|
1211
|
+
#elif defined (_WIN32)
|
1212
|
+
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
1213
|
+
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
1214
|
+
};
|
1215
|
+
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
1216
|
+
#endif
|
1217
|
+
|
1218
|
+
// clean up function, to be called before exit
|
1219
|
+
auto clean_up = [&]() {
|
1220
|
+
whisper_print_timings(ctx);
|
1221
|
+
whisper_free(ctx);
|
1222
|
+
};
|
1223
|
+
|
1224
|
+
std::thread t([&] {
|
1225
|
+
if (!svr->listen_after_bind()) {
|
1226
|
+
fprintf(stderr, "error: server listen failed\n");
|
1227
|
+
}
|
1228
|
+
});
|
1229
|
+
|
1230
|
+
svr->wait_until_ready();
|
1231
|
+
|
1232
|
+
t.join();
|
1233
|
+
|
1086
1234
|
|
1087
|
-
|
1088
|
-
whisper_free(ctx);
|
1235
|
+
clean_up();
|
1089
1236
|
|
1090
1237
|
return 0;
|
1091
1238
|
}
|
@@ -116,6 +116,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
116
116
|
}
|
117
117
|
|
118
118
|
int main(int argc, char ** argv) {
|
119
|
+
ggml_backend_load_all();
|
120
|
+
|
119
121
|
whisper_params params;
|
120
122
|
|
121
123
|
if (whisper_params_parse(argc, argv, params) == false) {
|
@@ -161,6 +163,10 @@ int main(int argc, char ** argv) {
|
|
161
163
|
cparams.flash_attn = params.flash_attn;
|
162
164
|
|
163
165
|
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
166
|
+
if (ctx == nullptr) {
|
167
|
+
fprintf(stderr, "error: failed to initialize whisper context\n");
|
168
|
+
return 2;
|
169
|
+
}
|
164
170
|
|
165
171
|
std::vector<float> pcmf32 (n_samples_30s, 0.0f);
|
166
172
|
std::vector<float> pcmf32_old;
|
@@ -16,7 +16,10 @@ if (WHISPER_SDL2)
|
|
16
16
|
llama-hparams.cpp
|
17
17
|
llama-impl.cpp
|
18
18
|
llama-io.cpp
|
19
|
-
llama-kv-cache.cpp
|
19
|
+
llama-kv-cache-unified.cpp
|
20
|
+
llama-kv-cache-unified-iswa.cpp
|
21
|
+
llama-memory-recurrent.cpp
|
22
|
+
llama-memory-hybrid.cpp
|
20
23
|
llama-memory.cpp
|
21
24
|
llama-mmap.cpp
|
22
25
|
llama-model-loader.cpp
|
@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
20
20
|
{ LLM_ARCH_BERT, "bert" },
|
21
21
|
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
22
22
|
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
|
23
|
+
{ LLM_ARCH_NEO_BERT, "neo-bert" },
|
23
24
|
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
24
25
|
{ LLM_ARCH_BLOOM, "bloom" },
|
25
26
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
@@ -41,6 +42,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
41
42
|
{ LLM_ARCH_GEMMA, "gemma" },
|
42
43
|
{ LLM_ARCH_GEMMA2, "gemma2" },
|
43
44
|
{ LLM_ARCH_GEMMA3, "gemma3" },
|
45
|
+
{ LLM_ARCH_GEMMA3N, "gemma3n" },
|
44
46
|
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
45
47
|
{ LLM_ARCH_MAMBA, "mamba" },
|
46
48
|
{ LLM_ARCH_XVERSE, "xverse" },
|
@@ -72,6 +74,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
72
74
|
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
73
75
|
{ LLM_ARCH_PLM, "plm" },
|
74
76
|
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
77
|
+
{ LLM_ARCH_DOTS1, "dots1" },
|
78
|
+
{ LLM_ARCH_ARCEE, "arcee" },
|
79
|
+
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
|
75
80
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
76
81
|
};
|
77
82
|
|
@@ -144,6 +149,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
144
149
|
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
145
150
|
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
146
151
|
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
152
|
+
{ LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },
|
147
153
|
|
148
154
|
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
149
155
|
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
|
@@ -174,6 +180,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
174
180
|
{ LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
|
175
181
|
{ LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
|
176
182
|
|
183
|
+
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
|
184
|
+
|
177
185
|
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
178
186
|
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
179
187
|
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
@@ -192,13 +200,13 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
192
200
|
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
|
193
201
|
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
194
202
|
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
203
|
+
{ LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
|
195
204
|
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
196
205
|
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
|
197
206
|
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
|
198
207
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
199
208
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
200
209
|
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
|
201
|
-
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
|
202
210
|
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
|
203
211
|
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
|
204
212
|
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
|
@@ -242,6 +250,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
242
250
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
243
251
|
},
|
244
252
|
},
|
253
|
+
{
|
254
|
+
LLM_ARCH_ARCEE,
|
255
|
+
{
|
256
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
257
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
258
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
259
|
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
260
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
261
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
262
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
263
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
264
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
265
|
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
266
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
267
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
268
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
269
|
+
},
|
270
|
+
},
|
245
271
|
{
|
246
272
|
LLM_ARCH_LLAMA4,
|
247
273
|
{
|
@@ -448,6 +474,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
448
474
|
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
449
475
|
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
450
476
|
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
477
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
451
478
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
452
479
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
453
480
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
@@ -492,6 +519,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
492
519
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
493
520
|
},
|
494
521
|
},
|
522
|
+
{
|
523
|
+
LLM_ARCH_NEO_BERT,
|
524
|
+
{
|
525
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
526
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
527
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
528
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
529
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
530
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
531
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
532
|
+
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
|
533
|
+
{ LLM_TENSOR_CLS, "cls" },
|
534
|
+
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
535
|
+
},
|
536
|
+
},
|
495
537
|
{
|
496
538
|
LLM_ARCH_JINA_BERT_V2,
|
497
539
|
{
|
@@ -892,6 +934,42 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
892
934
|
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
893
935
|
},
|
894
936
|
},
|
937
|
+
{
|
938
|
+
LLM_ARCH_GEMMA3N,
|
939
|
+
{
|
940
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
941
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
942
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
943
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
944
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
945
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
946
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
947
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
948
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
949
|
+
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
950
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
951
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
952
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
953
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
954
|
+
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
955
|
+
{ LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" },
|
956
|
+
{ LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" },
|
957
|
+
{ LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" },
|
958
|
+
{ LLM_TENSOR_ALTUP_UNEMBD_PROJ, "altup_unembd_proj" },
|
959
|
+
{ LLM_TENSOR_ALTUP_PROJ, "altup_proj" },
|
960
|
+
{ LLM_TENSOR_PER_LAYER_INP_GATE, "blk.%d.inp_gate" },
|
961
|
+
{ LLM_TENSOR_PER_LAYER_PROJ, "blk.%d.proj" },
|
962
|
+
{ LLM_TENSOR_PER_LAYER_POST_NORM, "blk.%d.post_norm" },
|
963
|
+
{ LLM_TENSOR_ALTUP_CORRECT_COEF, "blk.%d.altup_correct_coef" },
|
964
|
+
{ LLM_TENSOR_ALTUP_CORRECT_SCALE, "blk.%d.altup_correct_scale" },
|
965
|
+
{ LLM_TENSOR_ALTUP_PREDICT_COEF, "blk.%d.altup_predict_coef" },
|
966
|
+
{ LLM_TENSOR_ALTUP_ROUTER, "blk.%d.altup_router" },
|
967
|
+
{ LLM_TENSOR_ALTUP_ROUTER_NORM, "blk.%d.altup_router_norm" },
|
968
|
+
{ LLM_TENSOR_LAUREL_L, "blk.%d.laurel_l" },
|
969
|
+
{ LLM_TENSOR_LAUREL_R, "blk.%d.laurel_r" },
|
970
|
+
{ LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
|
971
|
+
},
|
972
|
+
},
|
895
973
|
{
|
896
974
|
LLM_ARCH_STARCODER2,
|
897
975
|
{
|
@@ -1553,6 +1631,51 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
1553
1631
|
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
1554
1632
|
},
|
1555
1633
|
},
|
1634
|
+
{
|
1635
|
+
LLM_ARCH_DOTS1,
|
1636
|
+
{
|
1637
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
1638
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
1639
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
1640
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
1641
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
1642
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
1643
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
1644
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
1645
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
1646
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
1647
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
1648
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
1649
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
1650
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
1651
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
1652
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
1653
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
1654
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
1655
|
+
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
1656
|
+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
1657
|
+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
1658
|
+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
1659
|
+
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
1660
|
+
}
|
1661
|
+
},
|
1662
|
+
{
|
1663
|
+
LLM_ARCH_ERNIE4_5,
|
1664
|
+
{
|
1665
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
1666
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
1667
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
1668
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
1669
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
1670
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
1671
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
1672
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
1673
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
1674
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
1675
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
1676
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
1677
|
+
},
|
1678
|
+
},
|
1556
1679
|
{
|
1557
1680
|
LLM_ARCH_UNKNOWN,
|
1558
1681
|
{
|
@@ -1681,6 +1804,23 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|
1681
1804
|
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
1682
1805
|
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
1683
1806
|
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
1807
|
+
// altup / laurel (gemma 3n)
|
1808
|
+
{LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
|
1809
|
+
{LLM_TENSOR_PER_LAYER_MODEL_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
1810
|
+
{LLM_TENSOR_PER_LAYER_PROJ_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
1811
|
+
{LLM_TENSOR_ALTUP_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
1812
|
+
{LLM_TENSOR_ALTUP_UNEMBD_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
1813
|
+
{LLM_TENSOR_PER_LAYER_INP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
1814
|
+
{LLM_TENSOR_PER_LAYER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
1815
|
+
{LLM_TENSOR_PER_LAYER_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
1816
|
+
{LLM_TENSOR_ALTUP_CORRECT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
1817
|
+
{LLM_TENSOR_ALTUP_CORRECT_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
1818
|
+
{LLM_TENSOR_ALTUP_PREDICT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
1819
|
+
{LLM_TENSOR_ALTUP_ROUTER, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
1820
|
+
{LLM_TENSOR_ALTUP_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
1821
|
+
{LLM_TENSOR_LAUREL_L, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
1822
|
+
{LLM_TENSOR_LAUREL_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
1823
|
+
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
1684
1824
|
// this tensor is loaded for T5, but never used
|
1685
1825
|
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
1686
1826
|
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
|
@@ -1704,8 +1844,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|
1704
1844
|
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
1705
1845
|
|
1706
1846
|
std::string LLM_KV::operator()(llm_kv kv) const {
|
1707
|
-
|
1708
|
-
|
1847
|
+
std::string name = ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
|
1848
|
+
|
1849
|
+
if (suffix != nullptr) {
|
1850
|
+
name += ".";
|
1851
|
+
name += suffix;
|
1852
|
+
}
|
1853
|
+
|
1854
|
+
return name;
|
1709
1855
|
}
|
1710
1856
|
|
1711
1857
|
std::string LLM_TN_IMPL::str() const {
|
@@ -1744,3 +1890,25 @@ llm_arch llm_arch_from_string(const std::string & name) {
|
|
1744
1890
|
const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
|
1745
1891
|
return LLM_TENSOR_INFOS.at(tensor);
|
1746
1892
|
}
|
1893
|
+
|
1894
|
+
bool llm_arch_is_recurrent(const llm_arch & arch) {
|
1895
|
+
switch (arch) {
|
1896
|
+
case LLM_ARCH_MAMBA:
|
1897
|
+
case LLM_ARCH_RWKV6:
|
1898
|
+
case LLM_ARCH_RWKV6QWEN2:
|
1899
|
+
case LLM_ARCH_RWKV7:
|
1900
|
+
case LLM_ARCH_ARWKV7:
|
1901
|
+
return true;
|
1902
|
+
default:
|
1903
|
+
return false;
|
1904
|
+
}
|
1905
|
+
}
|
1906
|
+
|
1907
|
+
bool llm_arch_is_hybrid(const llm_arch & arch) {
|
1908
|
+
// TODO: There are currently no hybrid models! Once there are, this will be
|
1909
|
+
// the place to identify them
|
1910
|
+
switch (arch) {
|
1911
|
+
default:
|
1912
|
+
return false;
|
1913
|
+
}
|
1914
|
+
}
|