whisper.rn 0.4.0-rc.10 → 0.4.0-rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/whisper.cpp CHANGED
@@ -1,8 +1,7 @@
1
1
  #include "whisper.h"
2
2
 
3
- #include "ggml-cpu.h"
4
-
5
3
  #include "ggml.h"
4
+ #include "ggml-cpp.h"
6
5
  #include "ggml-alloc.h"
7
6
  #include "ggml-backend.h"
8
7
 
@@ -19,35 +18,38 @@
19
18
  #include <cassert>
20
19
  #define _USE_MATH_DEFINES
21
20
  #include <cmath>
22
- #include <cstdio>
21
+ #include <codecvt>
23
22
  #include <cstdarg>
23
+ #include <cstdio>
24
24
  #include <cstring>
25
25
  #include <fstream>
26
+ #include <functional>
26
27
  #include <map>
28
+ #include <mutex>
29
+ #include <random>
30
+ #include <regex>
27
31
  #include <set>
28
32
  #include <string>
29
33
  #include <thread>
30
34
  #include <vector>
31
- #include <regex>
32
- #include <random>
33
- #include <functional>
34
- #include <codecvt>
35
+
36
+ // dummy
35
37
 
36
38
  #if defined(_MSC_VER)
37
39
  #pragma warning(disable: 4244 4267) // possible loss of data
38
40
  #endif
39
41
 
40
- #if defined(WSP_GGML_BIG_ENDIAN)
41
- #include <bit>
42
-
42
+ #if defined(WHISPER_BIG_ENDIAN)
43
43
  template<typename T>
44
44
  static T byteswap(T value) {
45
- return std::byteswap(value);
46
- }
47
-
48
- template<>
49
- float byteswap(float value) {
50
- return std::bit_cast<float>(byteswap(std::bit_cast<std::uint32_t>(value)));
45
+ T value_swapped;
46
+ char * source = reinterpret_cast<char *>(&value);
47
+ char * target = reinterpret_cast<char *>(&value_swapped);
48
+ int size = sizeof(T);
49
+ for (int i = 0; i < size; i++) {
50
+ target[size - 1 - i] = source[i];
51
+ }
52
+ return value_swapped;
51
53
  }
52
54
 
53
55
  template<typename T>
@@ -83,14 +85,14 @@ static void byteswap_tensor(wsp_ggml_tensor * tensor) {
83
85
  }
84
86
 
85
87
  #define BYTESWAP_VALUE(d) d = byteswap(d)
86
- #define BYTESWAP_FILTERS(f) \
88
+ #define BYTESWAP_FILTERS(f) \
87
89
  do { \
88
90
  for (auto & datum : f.data) { \
89
91
  datum = byteswap(datum); \
90
92
  } \
91
93
  } while (0)
92
- #define BYTESWAP_TENSOR(t) \
93
- do { \
94
+ #define BYTESWAP_TENSOR(t) \
95
+ do { \
94
96
  byteswap_tensor(t); \
95
97
  } while (0)
96
98
  #else
@@ -147,21 +149,25 @@ static void whisper_log_callback_default(wsp_ggml_log_level level, const char *
147
149
 
148
150
  static bool wsp_ggml_graph_compute_helper(
149
151
  struct wsp_ggml_cgraph * graph,
150
- std::vector<uint8_t> & buf,
151
152
  int n_threads,
152
153
  wsp_ggml_abort_callback abort_callback,
153
154
  void * abort_callback_data) {
154
- struct wsp_ggml_cplan plan = wsp_ggml_graph_plan(graph, n_threads, nullptr);
155
155
 
156
- plan.abort_callback = abort_callback;
157
- plan.abort_callback_data = abort_callback_data;
156
+ wsp_ggml_backend_ptr backend { wsp_ggml_backend_init_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
158
157
 
159
- if (plan.work_size > 0) {
160
- buf.resize(plan.work_size);
161
- plan.work_data = buf.data();
158
+ auto * reg = wsp_ggml_backend_dev_backend_reg(wsp_ggml_backend_get_device(backend.get()));
159
+
160
+ auto * set_abort_callback_fn = (wsp_ggml_backend_set_abort_callback_t) wsp_ggml_backend_reg_get_proc_address(reg, "wsp_ggml_backend_set_abort_callback");
161
+ if (set_abort_callback_fn) {
162
+ set_abort_callback_fn(backend.get(), abort_callback, abort_callback_data);
162
163
  }
163
164
 
164
- return wsp_ggml_graph_compute(graph, &plan);
165
+ auto wsp_ggml_backend_set_n_threads_fn = (wsp_ggml_backend_set_n_threads_t) wsp_ggml_backend_reg_get_proc_address(reg, "wsp_ggml_backend_set_n_threads");
166
+ if (wsp_ggml_backend_set_n_threads_fn) {
167
+ wsp_ggml_backend_set_n_threads_fn(backend.get(), n_threads);
168
+ }
169
+
170
+ return wsp_ggml_backend_graph_compute(backend.get(), graph) == WSP_GGML_STATUS_SUCCESS;
165
171
  }
166
172
 
167
173
  static bool wsp_ggml_graph_compute_helper(
@@ -185,6 +191,61 @@ static bool wsp_ggml_graph_compute_helper(
185
191
  return t;
186
192
  }
187
193
 
194
+ static void whisper_load_backends() {
195
+ #ifdef WSP_GGML_BACKEND_DL
196
+ static std::once_flag flag;
197
+ std::call_once(flag, []() {
198
+ wsp_ggml_backend_load_all();
199
+ });
200
+ #endif
201
+ }
202
+
203
+ // TODO: move these functions to ggml-base with support for ggml-backend?
204
+
205
+ static wsp_ggml_tensor * whisper_set_f32(struct wsp_ggml_tensor * t, float v) {
206
+ WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_F32);
207
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous(t));
208
+ size_t nels = wsp_ggml_nelements(t);
209
+ for (int64_t i = 0; i < nels; ++i) {
210
+ ((float *) t->data)[i] = v;
211
+ }
212
+ return t;
213
+ }
214
+
215
+ static wsp_ggml_tensor * whisper_set_i32(struct wsp_ggml_tensor * t, int32_t v) {
216
+ WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_I32);
217
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous(t));
218
+ size_t nels = wsp_ggml_nelements(t);
219
+ for (int64_t i = 0; i < nels; ++i) {
220
+ ((int32_t *) t->data)[i] = v;
221
+ }
222
+ return t;
223
+ }
224
+
225
+ static float whisper_get_f32_nd(const struct wsp_ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
226
+ WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_F32);
227
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
228
+ return *(float *) data;
229
+ }
230
+
231
+ static void whisper_set_f32_nd(struct wsp_ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
232
+ WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_F32);
233
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
234
+ *(float *) data = v;
235
+ }
236
+
237
+ static int32_t whisper_get_i32_nd(const struct wsp_ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
238
+ WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_I32);
239
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
240
+ return *(int32_t *) data;
241
+ }
242
+
243
+ static void whisper_set_i32_nd(struct wsp_ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
244
+ WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_I32);
245
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
246
+ *(int32_t *) data = v;
247
+ }
248
+
188
249
  // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
189
250
  // the idea is to represent the original matrix multiplication:
190
251
  //
@@ -428,6 +489,7 @@ struct whisper_segment {
428
489
  int64_t t1;
429
490
 
430
491
  std::string text;
492
+ float no_speech_prob;
431
493
 
432
494
  std::vector<whisper_token_data> tokens;
433
495
 
@@ -867,6 +929,7 @@ struct whisper_state {
867
929
  whisper_token tid_last;
868
930
 
869
931
  std::vector<float> energy; // PCM signal energy
932
+ float no_speech_prob = 0.0f;
870
933
 
871
934
  // [EXPERIMENTAL] Token-level timestamps with DTW
872
935
  whisper_aheads_masks aheads_masks;
@@ -1233,21 +1296,38 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
1233
1296
  static wsp_ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
1234
1297
  wsp_ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
1235
1298
 
1299
+ whisper_load_backends();
1300
+
1301
+ wsp_ggml_backend_dev_t dev = nullptr;
1302
+
1303
+ int cnt = 0;
1236
1304
  if (params.use_gpu) {
1237
1305
  for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
1238
- wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
1239
- if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
1240
- WHISPER_LOG_INFO("%s: using %s backend\n", __func__, wsp_ggml_backend_dev_name(dev));
1241
- wsp_ggml_backend_t result = wsp_ggml_backend_dev_init(dev, nullptr);
1242
- if (!result) {
1243
- WHISPER_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, wsp_ggml_backend_dev_name(dev));
1306
+ wsp_ggml_backend_dev_t dev_cur = wsp_ggml_backend_dev_get(i);
1307
+ if (wsp_ggml_backend_dev_type(dev_cur) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
1308
+ if (cnt == 0 || cnt == params.gpu_device) {
1309
+ dev = dev_cur;
1310
+ }
1311
+
1312
+ if (++cnt > params.gpu_device) {
1313
+ break;
1244
1314
  }
1245
- return result;
1246
1315
  }
1247
1316
  }
1248
1317
  }
1249
1318
 
1250
- return nullptr;
1319
+ if (dev == nullptr) {
1320
+ WHISPER_LOG_INFO("%s: no GPU found\n", __func__);
1321
+ return nullptr;
1322
+ }
1323
+
1324
+ WHISPER_LOG_INFO("%s: using %s backend\n", __func__, wsp_ggml_backend_dev_name(dev));
1325
+ wsp_ggml_backend_t result = wsp_ggml_backend_dev_init(dev, nullptr);
1326
+ if (!result) {
1327
+ WHISPER_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, wsp_ggml_backend_dev_name(dev));
1328
+ }
1329
+
1330
+ return result;
1251
1331
  }
1252
1332
 
1253
1333
  static std::vector<wsp_ggml_backend_t> whisper_backend_init(const whisper_context_params & params) {
@@ -1275,26 +1355,33 @@ static std::vector<wsp_ggml_backend_t> whisper_backend_init(const whisper_contex
1275
1355
 
1276
1356
  WSP_GGML_UNUSED(params);
1277
1357
 
1278
- result.push_back(wsp_ggml_backend_cpu_init());
1358
+ result.push_back(wsp_ggml_backend_init_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
1279
1359
 
1280
1360
  return result;
1281
1361
  }
1282
1362
 
1283
1363
  static wsp_ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_context_params & params) {
1364
+ wsp_ggml_backend_buffer_type_t result = wsp_ggml_backend_cpu_buffer_type();
1365
+
1284
1366
  if (!params.use_gpu) {
1285
- return wsp_ggml_backend_cpu_buffer_type();
1367
+ return result;
1286
1368
  }
1287
1369
 
1288
- // if we have a GPU device - use it
1370
+ int cnt = 0;
1289
1371
  for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
1290
1372
  wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
1291
1373
  if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
1292
- WHISPER_LOG_INFO("%s: using device %s (%s)\n", __func__, wsp_ggml_backend_dev_name(dev), wsp_ggml_backend_dev_description(dev));
1293
- return wsp_ggml_backend_dev_buffer_type(dev);
1374
+ if (cnt == 0 || cnt == params.gpu_device) {
1375
+ result = wsp_ggml_backend_dev_buffer_type(dev);
1376
+ }
1377
+
1378
+ if (++cnt > params.gpu_device) {
1379
+ break;
1380
+ }
1294
1381
  }
1295
1382
  }
1296
1383
 
1297
- return wsp_ggml_backend_cpu_buffer_type();
1384
+ return result;
1298
1385
  }
1299
1386
 
1300
1387
  // load the model from a ggml file
@@ -4184,22 +4271,28 @@ static int whisper_has_openvino(void) {
4184
4271
  const char * whisper_print_system_info(void) {
4185
4272
  static std::string s;
4186
4273
 
4274
+ whisper_load_backends();
4275
+
4187
4276
  s = "";
4188
- s += "AVX = " + std::to_string(wsp_ggml_cpu_has_avx()) + " | ";
4189
- s += "AVX2 = " + std::to_string(wsp_ggml_cpu_has_avx2()) + " | ";
4190
- s += "AVX512 = " + std::to_string(wsp_ggml_cpu_has_avx512()) + " | ";
4191
- s += "FMA = " + std::to_string(wsp_ggml_cpu_has_fma()) + " | ";
4192
- s += "NEON = " + std::to_string(wsp_ggml_cpu_has_neon()) + " | ";
4193
- s += "ARM_FMA = " + std::to_string(wsp_ggml_cpu_has_arm_fma()) + " | ";
4194
- s += "F16C = " + std::to_string(wsp_ggml_cpu_has_f16c()) + " | ";
4195
- s += "FP16_VA = " + std::to_string(wsp_ggml_cpu_has_fp16_va()) + " | ";
4196
- s += "WASM_SIMD = " + std::to_string(wsp_ggml_cpu_has_wasm_simd()) + " | ";
4197
- s += "SSE3 = " + std::to_string(wsp_ggml_cpu_has_sse3()) + " | ";
4198
- s += "SSSE3 = " + std::to_string(wsp_ggml_cpu_has_ssse3()) + " | ";
4199
- s += "VSX = " + std::to_string(wsp_ggml_cpu_has_vsx()) + " | ";
4277
+ s += "WHISPER : ";
4200
4278
  s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
4201
4279
  s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
4202
4280
 
4281
+ for (size_t i = 0; i < wsp_ggml_backend_reg_count(); i++) {
4282
+ auto * reg = wsp_ggml_backend_reg_get(i);
4283
+ auto * get_features_fn = (wsp_ggml_backend_get_features_t) wsp_ggml_backend_reg_get_proc_address(reg, "wsp_ggml_backend_get_features");
4284
+ if (get_features_fn) {
4285
+ wsp_ggml_backend_feature * features = get_features_fn(reg);
4286
+ s += wsp_ggml_backend_reg_name(reg);
4287
+ s += " : ";
4288
+ for (; features->name; features++) {
4289
+ s += features->name;
4290
+ s += " = ";
4291
+ s += features->value;
4292
+ s += " | ";
4293
+ }
4294
+ }
4295
+ }
4203
4296
  return s.c_str();
4204
4297
  }
4205
4298
 
@@ -4679,7 +4772,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
4679
4772
  /*.detect_language =*/ false,
4680
4773
 
4681
4774
  /*.suppress_blank =*/ true,
4682
- /*.suppress_non_speech_tokens =*/ false,
4775
+ /*.suppress_nst =*/ false,
4683
4776
 
4684
4777
  /*.temperature =*/ 0.0f,
4685
4778
  /*.max_initial_ts =*/ 1.0f,
@@ -4829,6 +4922,42 @@ static const std::vector<std::string> non_speech_tokens = {
4829
4922
  "♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯"
4830
4923
  };
4831
4924
 
4925
+ static void whisper_compute_logprobs(
4926
+ const std::vector<float> & logits,
4927
+ const int n_logits,
4928
+ std::vector<float> & logprobs) {
4929
+ const float logit_max = *std::max_element(logits.begin(), logits.end());
4930
+ float logsumexp = 0.0f;
4931
+ for (int i = 0; i < n_logits; ++i) {
4932
+ if (logits[i] > -INFINITY) {
4933
+ logsumexp += expf(logits[i] - logit_max);
4934
+ }
4935
+ }
4936
+ logsumexp = logf(logsumexp) + logit_max;
4937
+
4938
+ for (int i = 0; i < n_logits; ++i) {
4939
+ if (logits[i] > -INFINITY) {
4940
+ logprobs[i] = logits[i] - logsumexp;
4941
+ } else {
4942
+ logprobs[i] = -INFINITY;
4943
+ }
4944
+ }
4945
+ }
4946
+
4947
+ static void whisper_compute_probs(
4948
+ const std::vector<float> & logits,
4949
+ const int n_logits,
4950
+ const std::vector<float> & logprobs,
4951
+ std::vector<float> & probs) {
4952
+ for (int i = 0; i < n_logits; ++i) {
4953
+ if (logits[i] == -INFINITY) {
4954
+ probs[i] = 0.0f;
4955
+ } else {
4956
+ probs[i] = expf(logprobs[i]);
4957
+ }
4958
+ }
4959
+ }
4960
+
4832
4961
  // process the logits for the selected decoder
4833
4962
  // - applies logit filters
4834
4963
  // - computes logprobs and probs
@@ -4890,7 +5019,7 @@ static void whisper_process_logits(
4890
5019
 
4891
5020
  // suppress sot and nosp tokens
4892
5021
  logits[vocab.token_sot] = -INFINITY;
4893
- logits[vocab.token_nosp] = -INFINITY; // TODO: ignore this token for now
5022
+ logits[vocab.token_nosp] = -INFINITY;
4894
5023
 
4895
5024
  // [TDRZ] when tinydiarize is disabled, suppress solm token
4896
5025
  if (params.tdrz_enable == false) {
@@ -4927,7 +5056,7 @@ static void whisper_process_logits(
4927
5056
 
4928
5057
  // suppress non-speech tokens
4929
5058
  // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
4930
- if (params.suppress_non_speech_tokens) {
5059
+ if (params.suppress_nst) {
4931
5060
  for (const std::string & token : non_speech_tokens) {
4932
5061
  const std::string suppress_tokens[] = {token, " " + token};
4933
5062
  for (const std::string & suppress_token : suppress_tokens) {
@@ -4989,24 +5118,7 @@ static void whisper_process_logits(
4989
5118
  }
4990
5119
 
4991
5120
  // populate the logprobs array (log_softmax)
4992
- {
4993
- const float logit_max = *std::max_element(logits.begin(), logits.end());
4994
- float logsumexp = 0.0f;
4995
- for (int i = 0; i < n_logits; ++i) {
4996
- if (logits[i] > -INFINITY) {
4997
- logsumexp += expf(logits[i] - logit_max);
4998
- }
4999
- }
5000
- logsumexp = logf(logsumexp) + logit_max;
5001
-
5002
- for (int i = 0; i < n_logits; ++i) {
5003
- if (logits[i] > -INFINITY) {
5004
- logprobs[i] = logits[i] - logsumexp;
5005
- } else {
5006
- logprobs[i] = -INFINITY;
5007
- }
5008
- }
5009
- }
5121
+ whisper_compute_logprobs(logits, n_logits, logprobs);
5010
5122
 
5011
5123
  // if sum of probability over timestamps is above any other token, sample timestamp
5012
5124
  // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L431-L437
@@ -5064,15 +5176,7 @@ static void whisper_process_logits(
5064
5176
  }
5065
5177
 
5066
5178
  // compute probs
5067
- {
5068
- for (int i = 0; i < n_logits; ++i) {
5069
- if (logits[i] == -INFINITY) {
5070
- probs[i] = 0.0f;
5071
- } else {
5072
- probs[i] = expf(logprobs[i]);
5073
- }
5074
- }
5075
- }
5179
+ whisper_compute_probs(logits, n_logits, logprobs, probs);
5076
5180
 
5077
5181
  #if 0
5078
5182
  // print first 100 logits - token string : logit
@@ -5651,6 +5755,18 @@ int whisper_full_with_state(
5651
5755
  return -8;
5652
5756
  }
5653
5757
 
5758
+ // Calculate no_speech probability after first decode.
5759
+ // This has to be done before any logit filtering. Hence we cannot use the probs from the whisper_process_logits.
5760
+ {
5761
+ const int n_logits = ctx->vocab.id_to_token.size();
5762
+ std::vector<float> logprobs(n_logits);
5763
+ std::vector<float> probs(n_logits);
5764
+
5765
+ whisper_compute_logprobs(state->logits, n_logits, logprobs);
5766
+ whisper_compute_probs(state->logits, n_logits, logprobs, probs);
5767
+ state->no_speech_prob = probs[whisper_token_nosp(ctx)];
5768
+ }
5769
+
5654
5770
  {
5655
5771
  const int64_t t_start_sample_us = wsp_ggml_time_us();
5656
5772
 
@@ -6042,8 +6158,9 @@ int whisper_full_with_state(
6042
6158
  if (it != (int) temperatures.size() - 1) {
6043
6159
  const auto & decoder = state->decoders[best_decoder_id];
6044
6160
 
6045
- if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
6046
- WHISPER_LOG_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold);
6161
+ if (decoder.failed ||
6162
+ (decoder.sequence.avg_logprobs < params.logprob_thold && state->no_speech_prob < params.no_speech_thold)) {
6163
+ WHISPER_LOG_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f and no_speech_prob %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold, state->no_speech_prob, params.no_speech_thold);
6047
6164
  success = false;
6048
6165
  state->n_fail_p++;
6049
6166
  }
@@ -6064,7 +6181,7 @@ int whisper_full_with_state(
6064
6181
  {
6065
6182
  const auto & best_decoder = state->decoders[best_decoder_id];
6066
6183
 
6067
- const auto seek_delta = best_decoder.seek_delta;
6184
+ auto seek_delta = best_decoder.seek_delta;
6068
6185
  const auto result_len = best_decoder.sequence.result_len;
6069
6186
 
6070
6187
  const auto & tokens_cur = best_decoder.sequence.tokens;
@@ -6072,6 +6189,9 @@ int whisper_full_with_state(
6072
6189
  // [EXPERIMENTAL] Token-level timestamps with DTW
6073
6190
  const auto n_segments_before = state->result_all.size();
6074
6191
 
6192
+ const bool is_no_speech = (state->no_speech_prob > params.no_speech_thold &&
6193
+ best_decoder.sequence.avg_logprobs < params.logprob_thold);
6194
+
6075
6195
  //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
6076
6196
 
6077
6197
  // update prompt_past
@@ -6080,11 +6200,11 @@ int whisper_full_with_state(
6080
6200
  prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
6081
6201
  }
6082
6202
 
6083
- for (int i = 0; i < result_len; ++i) {
6203
+ for (int i = 0; i < result_len && !is_no_speech; ++i) {
6084
6204
  prompt_past.push_back(tokens_cur[i].id);
6085
6205
  }
6086
6206
 
6087
- if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
6207
+ if (!tokens_cur.empty() && ctx->model.n_loaded > 0 && !is_no_speech) {
6088
6208
  int i0 = 0;
6089
6209
  auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
6090
6210
 
@@ -6123,7 +6243,7 @@ int whisper_full_with_state(
6123
6243
 
6124
6244
  //printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
6125
6245
 
6126
- result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
6246
+ result_all.push_back({ tt0, tt1, text, state->no_speech_prob, {}, speaker_turn_next });
6127
6247
  for (int j = i0; j <= i; j++) {
6128
6248
  result_all.back().tokens.push_back(tokens_cur[j]);
6129
6249
  }
@@ -6168,7 +6288,7 @@ int whisper_full_with_state(
6168
6288
  }
6169
6289
  }
6170
6290
 
6171
- result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
6291
+ result_all.push_back({ tt0, tt1, text, state->no_speech_prob, {}, speaker_turn_next });
6172
6292
  for (int j = i0; j < (int) tokens_cur.size(); j++) {
6173
6293
  result_all.back().tokens.push_back(tokens_cur[j]);
6174
6294
  }
@@ -6205,6 +6325,15 @@ int whisper_full_with_state(
6205
6325
  }
6206
6326
  }
6207
6327
 
6328
+ // ref: https://github.com/ggerganov/whisper.cpp/pull/2629
6329
+ const bool single_timestamp_ending = tokens_cur.size() > 1 &&
6330
+ tokens_cur[tokens_cur.size() - 2].id < whisper_token_beg(ctx) &&
6331
+ tokens_cur[tokens_cur.size() - 1].id > whisper_token_beg(ctx);
6332
+ if (single_timestamp_ending) {
6333
+ WHISPER_LOG_DEBUG("single timestamp ending - skip entire chunk\n");
6334
+ seek_delta = std::min(seek_end - seek, WHISPER_CHUNK_SIZE * 100);
6335
+ }
6336
+
6208
6337
  // update audio window
6209
6338
  seek += seek_delta;
6210
6339
 
@@ -6426,6 +6555,14 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
6426
6555
  return ctx->state->result_all[i_segment].tokens[i_token].p;
6427
6556
  }
6428
6557
 
6558
+ float whisper_full_get_segment_no_speech_prob(struct whisper_context * ctx, int i_segment) {
6559
+ return ctx->state->result_all[i_segment].no_speech_prob;
6560
+ }
6561
+
6562
+ float whisper_full_get_segment_no_speech_prob_from_state(struct whisper_state * state, int i_segment) {
6563
+ return state->result_all[i_segment].no_speech_prob;
6564
+ }
6565
+
6429
6566
  // =================================================================================================
6430
6567
 
6431
6568
  //
@@ -6587,6 +6724,8 @@ WHISPER_API int whisper_bench_wsp_ggml_mul_mat(int n_threads) {
6587
6724
  }
6588
6725
 
6589
6726
  WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
6727
+ whisper_load_backends();
6728
+
6590
6729
  static std::string s;
6591
6730
  s = "";
6592
6731
  char strbuf[256];
@@ -6606,7 +6745,6 @@ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
6606
6745
  // c: N*N*sizeof(float)
6607
6746
  // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
6608
6747
  std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*wsp_ggml_tensor_overhead() + wsp_ggml_graph_overhead());
6609
- std::vector<uint8_t> work;
6610
6748
 
6611
6749
  // put a bunch of random data in the buffer
6612
6750
  for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -6663,12 +6801,12 @@ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
6663
6801
  double tsum = 0.0;
6664
6802
 
6665
6803
  // heat-up
6666
- wsp_ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
6804
+ wsp_ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
6667
6805
 
6668
6806
  for (int i = 0; i < n_max; ++i) {
6669
6807
  const int64_t t0 = wsp_ggml_time_us();
6670
6808
 
6671
- wsp_ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
6809
+ wsp_ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
6672
6810
 
6673
6811
  const int64_t t1 = wsp_ggml_time_us();
6674
6812
 
@@ -7045,18 +7183,18 @@ static wsp_ggml_tensor * dtw_and_backtrace(wsp_ggml_context * ctx, wsp_ggml_tens
7045
7183
  struct wsp_ggml_tensor * cost = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, N + 1, M + 1);
7046
7184
  struct wsp_ggml_tensor * trace = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_I32, N + 1, M + 1);
7047
7185
 
7048
- cost = wsp_ggml_set_f32(cost, INFINITY);
7049
- trace = wsp_ggml_set_f32(trace, -1);
7050
- wsp_ggml_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
7186
+ cost = whisper_set_f32(cost, INFINITY);
7187
+ trace = whisper_set_i32(trace, -1);
7188
+ whisper_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
7051
7189
 
7052
7190
  // dtw
7053
7191
  // supposedly can be optmized by computing diagonals in parallel ?
7054
7192
  // Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
7055
7193
  for (int64_t j = 1; j < M + 1; ++j) {
7056
7194
  for (int64_t i = 1; i < N + 1; ++i) {
7057
- float c0 = wsp_ggml_get_f32_nd(cost, i - 1, j - 1, 0, 0);
7058
- float c1 = wsp_ggml_get_f32_nd(cost, i - 1, j, 0, 0);
7059
- float c2 = wsp_ggml_get_f32_nd(cost, i, j - 1, 0, 0);
7195
+ float c0 = whisper_get_f32_nd(cost, i - 1, j - 1, 0, 0);
7196
+ float c1 = whisper_get_f32_nd(cost, i - 1, j, 0, 0);
7197
+ float c2 = whisper_get_f32_nd(cost, i, j - 1, 0, 0);
7060
7198
 
7061
7199
  float c;
7062
7200
  int32_t t;
@@ -7071,9 +7209,9 @@ static wsp_ggml_tensor * dtw_and_backtrace(wsp_ggml_context * ctx, wsp_ggml_tens
7071
7209
  t = 2;
7072
7210
  }
7073
7211
 
7074
- c = wsp_ggml_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
7075
- wsp_ggml_set_f32_nd(cost, i, j, 0, 0, c);
7076
- wsp_ggml_set_i32_nd(trace, i, j, 0, 0, t);
7212
+ c = whisper_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
7213
+ whisper_set_f32_nd(cost, i, j, 0, 0, c);
7214
+ whisper_set_i32_nd(trace, i, j, 0, 0, t);
7077
7215
  }
7078
7216
  }
7079
7217
 
@@ -7082,19 +7220,19 @@ static wsp_ggml_tensor * dtw_and_backtrace(wsp_ggml_context * ctx, wsp_ggml_tens
7082
7220
  struct wsp_ggml_tensor * bt = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_I32, BT_MAX_ROWS, 2);
7083
7221
  // trace[0, :] = 2;
7084
7222
  for (int64_t i = 0; i < M + 1; ++i)
7085
- wsp_ggml_set_i32_nd(trace, 0, i, 0, 0, 2);
7223
+ whisper_set_i32_nd(trace, 0, i, 0, 0, 2);
7086
7224
  //trace[:, 0] = 1;
7087
7225
  for (int64_t i = 0; i < N + 1; ++i)
7088
- wsp_ggml_set_i32_nd(trace, i, 0, 0, 0, 1);
7226
+ whisper_set_i32_nd(trace, i, 0, 0, 0, 1);
7089
7227
  int bt_row_idx = BT_MAX_ROWS - 1;
7090
7228
  int64_t i = N;
7091
7229
  int64_t j = M;
7092
7230
  while (i > 0 || j > 0) {
7093
- wsp_ggml_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
7094
- wsp_ggml_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
7231
+ whisper_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
7232
+ whisper_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
7095
7233
  --bt_row_idx;
7096
7234
 
7097
- int32_t t = wsp_ggml_get_i32_nd(trace, i, j, 0, 0);
7235
+ int32_t t = whisper_get_i32_nd(trace, i, j, 0, 0);
7098
7236
  if (t == 0) {
7099
7237
  --i;
7100
7238
  --j;
@@ -7115,8 +7253,8 @@ static wsp_ggml_tensor * dtw_and_backtrace(wsp_ggml_context * ctx, wsp_ggml_tens
7115
7253
  wsp_ggml_tensor * r = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_I32, 2, result_n_cols);
7116
7254
  for (int64_t i = 0; i < 2; ++i) {
7117
7255
  for (int64_t j = 0; j < result_n_cols; ++j) {
7118
- int32_t v = wsp_ggml_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
7119
- wsp_ggml_set_i32_nd(r, i, j, 0, 0, v);
7256
+ int32_t v = whisper_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
7257
+ whisper_set_i32_nd(r, i, j, 0, 0, v);
7120
7258
  }
7121
7259
  }
7122
7260
 
@@ -7151,11 +7289,11 @@ static void median_filter(struct wsp_ggml_tensor * dst , const struct wsp_ggml_t
7151
7289
  idx = 2*(a->ne[2] - 1) - idx;
7152
7290
  }
7153
7291
 
7154
- filter.push_back(wsp_ggml_get_f32_nd(a, i, j, idx, 0));
7292
+ filter.push_back(whisper_get_f32_nd(a, i, j, idx, 0));
7155
7293
  }
7156
7294
  std::sort(filter.begin(), filter.end());
7157
7295
  const float v = filter[filter.size()/2];
7158
- wsp_ggml_set_f32_nd(dst, i, j, k, 0, v);
7296
+ whisper_set_f32_nd(dst, i, j, k, 0, v);
7159
7297
  filter.clear();
7160
7298
  }
7161
7299
  }
@@ -7277,7 +7415,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
7277
7415
  // Compute
7278
7416
  struct wsp_ggml_cgraph * gf = wsp_ggml_new_graph(gctx);
7279
7417
  wsp_ggml_build_forward_expand(gf, w);
7280
- wsp_ggml_graph_compute_with_ctx(gctx, gf, n_threads);
7418
+
7419
+ wsp_ggml_backend_ptr backend { wsp_ggml_backend_init_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
7420
+ wsp_ggml_backend_graph_compute(backend.get(), gf);
7281
7421
 
7282
7422
  wsp_ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
7283
7423
 
@@ -7286,9 +7426,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
7286
7426
  auto seg_i = state->result_all.begin() + i_segment;
7287
7427
  auto tok_i = seg_i->tokens.begin();
7288
7428
  for (int i = 0; i < alignment->ne[1]; ++i) {
7289
- int32_t v = wsp_ggml_get_i32_nd(alignment, 0, i, 0, 0);
7429
+ int32_t v = whisper_get_i32_nd(alignment, 0, i, 0, 0);
7290
7430
  if (v != last_v) {
7291
- int32_t time_index = wsp_ggml_get_i32_nd(alignment, 1, i, 0, 0);
7431
+ int32_t time_index = whisper_get_i32_nd(alignment, 1, i, 0, 0);
7292
7432
  int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
7293
7433
  last_v = v;
7294
7434