whisper.rn 0.4.0-rc.6 → 0.4.0-rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +5 -5
- package/cpp/coreml/whisper-encoder.mm +1 -1
- package/cpp/ggml-alloc.c +41 -11
- package/cpp/ggml-alloc.h +3 -1
- package/cpp/ggml-backend-impl.h +38 -34
- package/cpp/ggml-backend.c +630 -269
- package/cpp/ggml-backend.h +58 -30
- package/cpp/ggml-impl.h +3 -0
- package/cpp/ggml-metal-whisper.metal +1253 -341
- package/cpp/ggml-metal.h +6 -54
- package/cpp/ggml-metal.m +2004 -1987
- package/cpp/ggml-quants.c +2230 -421
- package/cpp/ggml-quants.h +39 -1
- package/cpp/ggml.c +735 -265
- package/cpp/ggml.h +94 -43
- package/cpp/rn-whisper.cpp +1 -0
- package/cpp/whisper.cpp +118 -86
- package/ios/RNWhisperContext.mm +4 -2
- package/lib/commonjs/version.json +1 -1
- package/lib/module/version.json +1 -1
- package/package.json +1 -1
- package/src/version.json +1 -1
package/cpp/whisper.cpp
CHANGED
@@ -122,9 +122,18 @@ WHISPER_ATTRIBUTE_FORMAT(2, 3)
 static void whisper_log_internal(wsp_ggml_log_level level, const char * format, ...);
 static void whisper_log_callback_default(wsp_ggml_log_level level, const char * text, void * user_data);
 
-#define WHISPER_LOG_INFO(...) whisper_log_internal(WSP_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
-#define WHISPER_LOG_WARN(...) whisper_log_internal(WSP_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define WHISPER_LOG_ERROR(...) whisper_log_internal(WSP_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define WHISPER_LOG_WARN(...) whisper_log_internal(WSP_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define WHISPER_LOG_INFO(...) whisper_log_internal(WSP_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+
+// define this to enable verbose trace logging - useful for debugging purposes
+//#define WHISPER_DEBUG
+
+#if defined(WHISPER_DEBUG)
+#define WHISPER_LOG_DEBUG(...) whisper_log_internal(WSP_GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#else
+#define WHISPER_LOG_DEBUG(...)
+#endif
 
 #define WHISPER_ASSERT(x) \
     do { \
@@ -134,18 +143,6 @@ static void whisper_log_callback_default(wsp_ggml_log_level level, const char * text, void * user_data);
         } \
     } while (0)
 
-// define this to enable verbose trace logging - useful for debugging purposes
-//#define WHISPER_DEBUG
-
-#if defined(WHISPER_DEBUG)
-#define WHISPER_PRINT_DEBUG(...) \
-    do { \
-        fprintf(stderr, __VA_ARGS__); \
-    } while (0)
-#else
-#define WHISPER_PRINT_DEBUG(...)
-#endif
-
 //#define WHISPER_USE_FLASH_ATTN
 //#define WHISPER_USE_FLASH_FF
 #define WHISPER_MAX_DECODERS 8
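Taken together, these two hunks retire the old WHISPER_PRINT_DEBUG macro (which wrote straight to stderr) in favor of a WHISPER_LOG_DEBUG macro that routes through the same whisper_log_internal path as the other log levels and compiles to nothing unless WHISPER_DEBUG is defined. A minimal standalone sketch of that pattern, with illustrative names that are not from the package:

#include <cstdarg>
#include <cstdio>

// toggle at build time, e.g. with -DMYLIB_DEBUG
//#define MYLIB_DEBUG

static void mylib_log_internal(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args); // a real library would forward to a user-set callback
    va_end(args);
}

#if defined(MYLIB_DEBUG)
#define MYLIB_LOG_DEBUG(...) mylib_log_internal(__VA_ARGS__)
#else
#define MYLIB_LOG_DEBUG(...) // expands to nothing: zero cost in release builds
#endif

int main() {
    MYLIB_LOG_DEBUG("decoder %d failed\n", 3); // no-op unless MYLIB_DEBUG is defined
    return 0;
}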
@@ -155,7 +152,7 @@ static void whisper_log_callback_default(wsp_ggml_log_level level, const char * text, void * user_data);
 // ggml helpers
 //
 
-static void wsp_ggml_graph_compute_helper(
+static bool wsp_ggml_graph_compute_helper(
     struct wsp_ggml_cgraph * graph,
     std::vector<uint8_t> & buf,
     int n_threads,
@@ -171,10 +168,10 @@ static void wsp_ggml_graph_compute_helper(
         plan.work_data = buf.data();
     }
 
-    wsp_ggml_graph_compute(graph, &plan);
+    return wsp_ggml_graph_compute(graph, &plan);
 }
 
-static void wsp_ggml_graph_compute_helper(
+static bool wsp_ggml_graph_compute_helper(
     struct wsp_ggml_backend * backend,
     struct wsp_ggml_cgraph * graph,
     int n_threads) {
@@ -186,7 +183,7 @@ static void wsp_ggml_graph_compute_helper(
         wsp_ggml_backend_metal_set_n_cb(backend, n_threads);
     }
 #endif
-    wsp_ggml_backend_graph_compute(backend, graph);
+    return wsp_ggml_backend_graph_compute(backend, graph);
 }
 
 // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
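Both compute-helper overloads now return the status of the underlying graph computation instead of discarding it, which is what lets the encode/decode paths later in this diff bail out on failure rather than continue with undefined results. A hedged sketch of the calling pattern, with simplified stand-in names:

#include <cstdio>

// stand-in for wsp_ggml_backend_graph_compute(); reports success or failure
static bool backend_graph_compute() { return true; }

// mirrors the new helper: forward the status instead of swallowing it
static bool graph_compute_helper() {
    return backend_graph_compute();
}

static bool encode_internal() {
    if (!graph_compute_helper()) {
        return false; // propagate the failure to the caller
    }
    return true;
}

int main() {
    if (!encode_internal()) {
        fprintf(stderr, "encode failed\n");
        return 1;
    }
    return 0;
}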
@@ -487,8 +484,8 @@ static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
 
 // measure the memory usage of a graph and prepare the allocr's internal data buffer
 static void whisper_allocr_graph_init(struct whisper_allocr & allocr, wsp_ggml_backend_t backend, std::function<struct wsp_ggml_cgraph *()> && get_graph) {
-    auto & alloc
-    auto & meta
+    auto & alloc = allocr.alloc;
+    auto & meta  = allocr.meta;
 
     alloc = wsp_ggml_allocr_new_measure_from_backend(backend);
 
@@ -704,7 +701,7 @@ struct whisper_model {
     struct wsp_ggml_context * ctx;
 
     // the model backend data is read-only and can be shared between processors
-    struct wsp_ggml_backend_buffer
+    std::vector<struct wsp_ggml_backend_buffer *> buffers;
 
     // tensors
     int n_loaded;
@@ -1073,7 +1070,7 @@ static wsp_ggml_backend_t whisper_backend_init(const whisper_context_params & params) {
 #ifdef WSP_GGML_USE_METAL
     if (params.use_gpu) {
         WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
-
+        wsp_ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
         backend_gpu = wsp_ggml_backend_metal_init();
         if (!backend_gpu) {
             WHISPER_LOG_ERROR("%s: wsp_ggml_backend_metal_init() failed\n", __func__);
@@ -1517,24 +1514,64 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
 
     wctx.backend = whisper_backend_init(wctx.params);
 
+    // some devices have a limit on the maximum size of single memory buffer
+    // for example, iPhones are limited to 1GB per buffer
+    // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
+    // model weights between them
+    //
+    // the map_t2b maps tensor names to buffer indices
+    // as we iterate over the tensors, we will allocate new buffers when the current one is full
+    //
+    // finally, we create a separate allocator for each buffer and use it to allocate the tensors
+    // we keep the allocators alive until all the tensors are loaded
+
+    WSP_GGML_ASSERT(model.buffers.empty());
+
+    std::map<std::string, int> map_t2b;
+
     {
         size_t size_main = 0;
+        size_t size_cur  = 0;
+
+        static const size_t GB = 1024ull*1024ull*1024ull;
 
         for (const auto & t : model.tensors) {
-
+            const size_t cur = wsp_ggml_nbytes(t.second) + wsp_ggml_tensor_overhead();
+
+            // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
+            if (size_cur + cur > GB) {
+                WSP_GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
+
+                model.buffers.emplace_back(wsp_ggml_backend_alloc_buffer(wctx.backend, size_cur));
+
+                size_cur = cur;
+            }
+
+            map_t2b[t.first] = model.buffers.size();
+
+            size_cur  += cur;
+            size_main += cur;
         }
 
-
+        // allocate the last buffer if needed
+        if (size_cur > 0) {
+            model.buffers.emplace_back(wsp_ggml_backend_alloc_buffer(wctx.backend, size_cur));
+        }
 
-
+        WSP_GGML_ASSERT(model.buffers.size() > 0);
+
+        WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, wsp_ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
     }
 
-    wsp_ggml_allocr
+    std::vector<wsp_ggml_allocr *> allocs(model.buffers.size());
+    for (size_t i = 0; i < allocs.size(); ++i) {
+        allocs[i] = wsp_ggml_allocr_new_from_buffer(model.buffers[i]);
+    }
 
     // allocate tensors in the backend buffers
     {
         for (const auto & t : model.tensors) {
-            wsp_ggml_allocr_alloc(
+            wsp_ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second);
        }
     }
 
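The comments in this hunk describe the whole scheme: model tensors are packed greedily into backend buffers of at most 1 GB each (working around the iOS per-buffer limit), with map_t2b recording which buffer each tensor landed in. A self-contained sketch of the same greedy split, assuming only tensor sizes are known and using hypothetical tensor names:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // hypothetical tensor sizes in bytes, stand-ins for wsp_ggml_nbytes() + overhead
    const std::map<std::string, size_t> tensors = {
        {"enc.w", 700u << 20}, {"dec.w", 500u << 20}, {"head.w", 300u << 20},
    };

    const size_t cap = 1024ull*1024ull*1024ull; // 1 GB per-buffer limit

    std::vector<size_t> buffer_sizes;   // one entry per allocated buffer
    std::map<std::string, int> map_t2b; // tensor name -> buffer index
    size_t size_cur = 0;

    for (const auto & t : tensors) {
        // close the current buffer once the next tensor would overflow it
        if (size_cur + t.second > cap) {
            buffer_sizes.push_back(size_cur);
            size_cur = 0;
        }
        map_t2b[t.first] = (int) buffer_sizes.size();
        size_cur += t.second;
    }
    if (size_cur > 0) buffer_sizes.push_back(size_cur); // flush the last buffer

    printf("%zu buffers allocated\n", buffer_sizes.size());
    for (const auto & kv : map_t2b) {
        printf("%s -> buffer %d\n", kv.first.c_str(), kv.second);
    }
    return 0;
}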
@@ -1635,7 +1672,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
         }
     }
 
-
+    for (auto & alloc : allocs) {
+        wsp_ggml_allocr_free(alloc);
+    }
 
     wctx.t_load_us = wsp_ggml_time_us() - t_start_us;
 
@@ -1777,7 +1816,7 @@ static struct wsp_ggml_cgraph * whisper_build_graph_encoder(
 
     wsp_ggml_cgraph * gf = wsp_ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
 
-    wsp_ggml_allocr * alloc = wstate.alloc_encode.alloc;
+    //wsp_ggml_allocr * alloc = wstate.alloc_encode.alloc;
 
     //struct wsp_ggml_tensor * cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_ctx, n_state);
     //wsp_ggml_allocr_alloc(alloc, cur);
@@ -1787,13 +1826,7 @@ static struct wsp_ggml_cgraph * whisper_build_graph_encoder(
     //}
     struct wsp_ggml_tensor * cur = wsp_ggml_view_tensor(ctx0, wstate.embd_conv);
 
-
-    wsp_ggml_allocr_alloc(alloc, KQscale);
-
-    if (!wsp_ggml_allocr_is_measure(alloc)) {
-        const float val = 1.0f/sqrtf(float(n_state)/n_head);
-        wsp_ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);
 
     // ===================================================================
     // NOTE: experimenting with partial evaluation of the encoder (ignore)
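Across the encoder, cross, and decoder graphs, the one-element KQscale/Kscale tensors (which had to be allocated and then filled via wsp_ggml_backend_tensor_set outside of measure mode) are replaced by plain float constants, matching the newer ggml API where scale factors are passed as immediate values. The value itself is the standard attention scaling 1/sqrt(d_k) with d_k = n_state/n_head; the commented pow(..., -0.25) lines are the same factor split evenly between Q and K. A tiny sketch of the arithmetic, with n_state/n_head values chosen only for illustration:

#include <cmath>
#include <cstdio>

int main() {
    const int n_state = 384; // e.g. a small model's embedding size
    const int n_head  = 6;

    // per-head dimension d_k = n_state/n_head, attention scale 1/sqrt(d_k)
    const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);

    // the same factor can be split as d_k^-0.25 applied to both Q and K
    const float qk_scale = powf(float(n_state)/n_head, -0.25f);

    printf("KQscale = %f, qk_scale^2 = %f\n", KQscale, qk_scale*qk_scale);
    return 0;
}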
@@ -1843,14 +1876,14 @@ static struct wsp_ggml_cgraph * whisper_build_graph_encoder(
 
             Qcur = wsp_ggml_add(ctx0, Qcur, layer.attn_q_b);
 
-            //Qcur = wsp_ggml_scale(ctx0, Qcur,
+            //Qcur = wsp_ggml_scale(ctx0, Qcur, pow(float(n_state)/n_head, -0.25));
 
             // note: no bias for Key
             struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            //Kcur = wsp_ggml_scale(ctx0, Kcur,
+            //Kcur = wsp_ggml_scale(ctx0, Kcur, pow(float(n_state)/n_head, -0.25));
 
             struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_v_w,
@@ -2032,7 +2065,7 @@ static struct wsp_ggml_cgraph * whisper_build_graph_cross(
 
     wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx0);
 
-    wsp_ggml_allocr * alloc = wstate.alloc_cross.alloc;
+    //wsp_ggml_allocr * alloc = wstate.alloc_cross.alloc;
 
     //struct wsp_ggml_tensor * cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
     //wsp_ggml_allocr_alloc(alloc, cur);
@@ -2042,13 +2075,7 @@ static struct wsp_ggml_cgraph * whisper_build_graph_cross(
     //}
     struct wsp_ggml_tensor * cur = wsp_ggml_view_tensor(ctx0, wstate.embd_enc);
 
-
-    wsp_ggml_allocr_alloc(alloc, Kscale);
-
-    if (!wsp_ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state) / n_head, -0.25);
-        wsp_ggml_backend_tensor_set(Kscale, &val, 0, sizeof(float));
-    }
+    const float Kscale = pow(float(n_state) / n_head, -0.25);
 
     for (int il = 0; il < model.hparams.n_text_layer; ++il) {
         auto & layer = model.layers_decoder[il];
@@ -2118,7 +2145,9 @@ static bool whisper_encode_internal(
         wsp_ggml_allocr_alloc_graph(alloc, gf);
 
         if (!whisper_encode_external(wstate)) {
-            wsp_ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+            if (!wsp_ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+                return false;
+            }
         }
     }
 
@@ -2132,7 +2161,9 @@ static bool whisper_encode_internal(
 
         wsp_ggml_allocr_alloc_graph(alloc, gf);
 
-        wsp_ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+        if (!wsp_ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+            return false;
+        }
     }
 
     // cross
@@ -2145,7 +2176,9 @@ static bool whisper_encode_internal(
 
         wsp_ggml_allocr_alloc_graph(alloc, gf);
 
-        wsp_ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+        if (!wsp_ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+            return false;
+        }
     }
 
     wstate.t_encode_us += wsp_ggml_time_us() - t_start_us;
@@ -2178,7 +2211,7 @@ static struct wsp_ggml_cgraph * whisper_build_graph_decoder(
     const int32_t n_kv = wsp_ggml_allocr_is_measure(alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head = wsp_ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head;
 
-    //
+    //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
 
     struct wsp_ggml_init_params params = {
         /*.mem_size =*/ wstate.alloc_decode.meta.size(),
@@ -2207,13 +2240,7 @@ static struct wsp_ggml_cgraph * whisper_build_graph_decoder(
         }
     }
 
-
-    wsp_ggml_allocr_alloc(alloc, KQscale);
-
-    if (!wsp_ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state)/n_head, -0.25);
-        wsp_ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = pow(float(n_state)/n_head, -0.25);
 
     struct wsp_ggml_tensor * KQ_mask = wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_kv, n_tokens, 1);
     wsp_ggml_allocr_alloc(alloc, KQ_mask);
@@ -2573,7 +2600,9 @@ static bool whisper_decode_internal(
 
         logits = gf->nodes[gf->n_nodes - 1];
 
-        wsp_ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+        if (!wsp_ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+            return false;
+        }
     }
 
     logits_out.resize(n_tokens*n_vocab);
@@ -3393,8 +3422,10 @@ void whisper_free(struct whisper_context * ctx) {
             wsp_ggml_free(ctx->model.ctx);
         }
 
-
-
+        for (auto & buffer : ctx->model.buffers) {
+            if (buffer) {
+                wsp_ggml_backend_buffer_free(buffer);
+            }
         }
 
         whisper_free_state(ctx->state);
@@ -3838,6 +3869,7 @@ void whisper_reset_timings(struct whisper_context * ctx) {
         ctx->state->t_sample_us = 0;
         ctx->state->t_encode_us = 0;
         ctx->state->t_decode_us = 0;
+        ctx->state->t_batchd_us = 0;
         ctx->state->t_prompt_us = 0;
         ctx->state->n_sample = 0;
         ctx->state->n_encode = 0;
@@ -4966,7 +4998,7 @@ static void whisper_sequence_score(
             const auto p = kv.second/(double)cnt;
             entropy -= p*log(p);
 
-            //
+            //WHISPER_LOG_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
         }
 
         sequence.entropy = entropy;
@@ -5032,7 +5064,7 @@ int whisper_full_with_state(
     // basically don't process anything that is less than 1.0s
     // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
     if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
-
+        WHISPER_LOG_DEBUG("%s: input is too short - %d ms < 1000 ms\n", __func__, (seek_end - seek_start)*10);
         return 0;
     }
 
@@ -5221,7 +5253,7 @@ int whisper_full_with_state(
 
         n_decoders_cur = std::max(1, n_decoders_cur);
 
-
+        WHISPER_LOG_DEBUG("\n%s: strategy = %d, decoding with %d decoders, temperature = %.2f\n", __func__, params.strategy, n_decoders_cur, t_cur);
 
         // TAGS: WHISPER_DECODER_INIT
         for (int j = 0; j < n_decoders_cur; ++j) {
@@ -5265,11 +5297,11 @@ int whisper_full_with_state(
         prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
 
         // print the prompt
-
+        WHISPER_LOG_DEBUG("\n\n");
         for (int i = 0; i < (int) prompt.size(); i++) {
-
+            WHISPER_LOG_DEBUG("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token.at(prompt[i]).c_str());
         }
-
+        WHISPER_LOG_DEBUG("\n\n");
 
         whisper_kv_cache_clear(state->kv_self);
 
@@ -5417,7 +5449,7 @@ int whisper_full_with_state(
 
                     whisper_kv_cache_seq_cp(state->kv_self, cur.decoder_idx, WHISPER_MAX_DECODERS + j, -1, -1);
 
-
+                    WHISPER_LOG_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
                             __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
                 }
 
@@ -5460,7 +5492,7 @@ int whisper_full_with_state(
 
                     // do not allow to go back in time
                     if (has_ts && seek_delta > seek_delta_new && result_len < i) {
-
+                        WHISPER_LOG_DEBUG("%s: decoder %d: failed due to seek_delta (%d > %d)\n", __func__, j, seek_delta, seek_delta_new);
                         failed = true; // TODO: maybe this is not a failure ?
                         continue;
                     }
@@ -5475,7 +5507,7 @@ int whisper_full_with_state(
 #ifdef WHISPER_DEBUG
                     {
                         const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token.at(token.tid) : "[?]";
-
+                        WHISPER_LOG_DEBUG("%s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s'\n",
                                 __func__, i, j, token.id, token.p, tt.c_str(), token.pt, result_len, ctx->vocab.id_to_token.at(token.id).c_str());
                     }
 #endif
@@ -5485,22 +5517,22 @@ int whisper_full_with_state(
                         (params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
                         (has_ts && seek + seek_delta + 100 >= seek_end) // end of audio reached
                        ) {
-                        if (result_len == 0) {
+                        if (result_len == 0 && !params.no_timestamps) {
                             if (seek + seek_delta + 100 >= seek_end) {
                                 result_len = i + 1;
                             } else {
-
+                                WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
                                 failed = true;
                                 continue;
                             }
                         }
 
-                        if (params.single_segment) {
+                        if (params.single_segment || params.no_timestamps) {
                             result_len = i + 1;
                             seek_delta = 100*WHISPER_CHUNK_SIZE;
                         }
 
-
+                        WHISPER_LOG_DEBUG("%s: decoder %d completed\n", __func__, j);
                         completed = true;
                         continue;
                     }
@@ -5516,7 +5548,7 @@ int whisper_full_with_state(
                     // sometimes, the decoding can get stuck in a repetition loop
                     // this is an attempt to mitigate such cases - we flag the decoding as failed and use a fallback strategy
                     if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
-
+                        WHISPER_LOG_DEBUG("%s: decoder %d: failed due to repetition loop\n", __func__, j);
                         failed = true;
                         continue;
                     }
@@ -5558,7 +5590,7 @@ int whisper_full_with_state(
                         continue;
                     }
 
-                    //
+                    //WHISPER_LOG_DEBUG("%s: decoder %d: token %d, seek_delta %d\n", __func__, j, decoder.sequence.tokens.back().id, decoder.seek_delta);
 
                     decoder.i_batch = batch.n_tokens;
 
@@ -5638,11 +5670,11 @@ int whisper_full_with_state(
                     decoder.sequence.tokens.resize(decoder.sequence.result_len);
                     whisper_sequence_score(params, decoder.sequence);
 
-
+                    WHISPER_LOG_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
                             __func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy);
 
                     if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) {
-
+                        WHISPER_LOG_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
                                 __func__, j, decoder.sequence.entropy, params.entropy_thold);
 
                         decoder.failed = true;
@@ -5657,7 +5689,7 @@ int whisper_full_with_state(
                     }
                 }
 
-
+                WHISPER_LOG_DEBUG("%s: best decoder = %d\n", __func__, best_decoder_id);
             }
 
             bool success = true;
@@ -5669,7 +5701,7 @@ int whisper_full_with_state(
                 const auto & decoder = state->decoders[best_decoder_id];
 
                 if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
-
+                    WHISPER_LOG_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold);
                     success = false;
                     state->n_fail_p++;
                 }
@@ -5677,13 +5709,13 @@ int whisper_full_with_state(
 
             if (success) {
                 //for (auto & token : ctx->decoders[best_decoder_id].sequence.tokens) {
-                //
+                //    WHISPER_LOG_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str());
                 //}
 
                 break;
             }
 
-
+            WHISPER_LOG_DEBUG("\n%s: failed to decode with temperature = %.2f\n", __func__, t_cur);
         }
 
         // output results through a user-provided callback
@@ -5695,7 +5727,7 @@ int whisper_full_with_state(
 
             const auto & tokens_cur = best_decoder.sequence.tokens;
 
-            //
+            //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
 
             // update prompt_past
             prompt_past.clear();
@@ -5815,7 +5847,7 @@ int whisper_full_with_state(
             // update audio window
             seek += seek_delta;
 
-
+            WHISPER_LOG_DEBUG("seek = %d, seek_delta = %d\n", seek, seek_delta);
         }
     }
 
@@ -6132,7 +6164,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
 
     // multi-thread
 
-    for (
+    for (int32_t k = 1; k <= n_threads; k++) {
         char * src = (char *) malloc(size);
         char * dst = (char *) malloc(size);
 
@@ -6156,13 +6188,13 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
         const int64_t t0 = wsp_ggml_time_us();
 
         std::vector<std::thread> threads(k - 1);
-        for (
+        for (int32_t th = 0; th < k - 1; ++th) {
            threads[th] = std::thread(helper, th);
        }
 
        helper(k - 1);
 
-        for (
+        for (int32_t th = 0; th < k - 1; ++th) {
            threads[th].join();
        }
 
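The memcpy benchmark now runs the workload at every thread count from 1 to n_threads, spawning k - 1 worker threads and letting the calling thread act as the k-th. A self-contained sketch of that fork-join pattern, where work() is only a stand-in for the benchmark's per-thread helper:

#include <cstdio>
#include <thread>
#include <vector>

static void work(int th) {
    // stand-in for the per-thread memcpy benchmark body
    printf("thread %d running\n", th);
}

int main() {
    const int n_threads = 4;

    for (int k = 1; k <= n_threads; k++) {
        std::vector<std::thread> threads(k - 1);
        for (int th = 0; th < k - 1; ++th) {
            threads[th] = std::thread(work, th); // spawn k-1 workers
        }

        work(k - 1); // the calling thread does its share too

        for (int th = 0; th < k - 1; ++th) {
            threads[th].join();
        }
    }
    return 0;
}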
package/ios/RNWhisperContext.mm
CHANGED
@@ -116,6 +116,7 @@
     self->recordState.transcribeSliceIndex = 0;
     self->recordState.nSamplesTranscribing = 0;
 
+    self->recordState.sliceNSamples.clear();
     self->recordState.sliceNSamples.push_back(0);
 
     self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
@@ -202,7 +203,7 @@ void AudioInputCallback(void * inUserData,
         state->sliceNSamples.push_back(0);
     }
 
-    NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
+    NSLog(@"[RNWhisper] Slice %d has %d samples, put %d samples", state->sliceIndex, nSamples, n);
 
    state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);
 
@@ -413,7 +414,8 @@ struct rnwhisper_segments_callback_data {
         params.new_segment_callback_user_data = &user_data;
     }
 
-    rnwhisper::job* job = rnwhisper::job_new(jobId, params)
+    rnwhisper::job* job = rnwhisper::job_new(jobId, params);
+    self->recordState.job = job;
     int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
     rnwhisper::job_remove(jobId);
     self->recordState.isTranscribing = false;
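Two of these changes look like session-hygiene fixes: sliceNSamples is now cleared before the initial push_back(0), so a second realtime session does not inherit slice counts from the first, and the transcribe path now stores the newly created job on recordState, presumably so other methods can reach it. A minimal C++ sketch of the reset-before-reuse pattern, with hypothetical names:

#include <cstdio>
#include <vector>

struct RecordState {
    std::vector<int> sliceNSamples; // samples accumulated per audio slice
};

static void start_session(RecordState & state) {
    // clear leftovers from any previous session before seeding slice 0;
    // without the clear, a second session would append to stale counts
    state.sliceNSamples.clear();
    state.sliceNSamples.push_back(0);
}

int main() {
    RecordState state;
    start_session(state);
    state.sliceNSamples[0] += 1600; // pretend a buffer of 1600 samples arrived
    start_session(state);           // restart: the state is fresh again
    printf("slices = %zu, first = %d\n", state.sliceNSamples.size(), state.sliceNSamples[0]);
    return 0;
}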
package/lib/commonjs/version.json
CHANGED
@@ -1 +1 @@
-{"version":"1.5.
+{"version":"1.5.4"}
package/lib/module/version.json
CHANGED
@@ -1 +1 @@
-{"version":"1.5.
+{"version":"1.5.4"}
package/package.json
CHANGED
package/src/version.json
CHANGED
@@ -1 +1 @@
-{"version":"1.5.
+{"version":"1.5.4"}