whisper.rn 0.4.0-rc.1 → 0.4.0-rc.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/android/build.gradle +4 -0
- package/android/src/main/CMakeLists.txt +21 -1
- package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -92
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +86 -40
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +85 -131
- package/android/src/main/jni-utils.h +76 -0
- package/android/src/main/jni.cpp +226 -109
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/cpp/coreml/whisper-encoder-impl.h +1 -1
- package/cpp/coreml/whisper-encoder.h +4 -0
- package/cpp/coreml/whisper-encoder.mm +5 -3
- package/cpp/ggml-alloc.c +797 -400
- package/cpp/ggml-alloc.h +60 -10
- package/cpp/ggml-backend-impl.h +255 -0
- package/cpp/ggml-backend-reg.cpp +582 -0
- package/cpp/ggml-backend.cpp +2002 -0
- package/cpp/ggml-backend.h +354 -0
- package/cpp/ggml-common.h +1851 -0
- package/cpp/ggml-cpp.h +39 -0
- package/cpp/ggml-cpu-aarch64.cpp +4247 -0
- package/cpp/ggml-cpu-aarch64.h +8 -0
- package/cpp/ggml-cpu-impl.h +531 -0
- package/cpp/ggml-cpu-quants.c +12245 -0
- package/cpp/ggml-cpu-quants.h +63 -0
- package/cpp/ggml-cpu-traits.cpp +36 -0
- package/cpp/ggml-cpu-traits.h +38 -0
- package/cpp/ggml-cpu.c +14792 -0
- package/cpp/ggml-cpu.cpp +653 -0
- package/cpp/ggml-cpu.h +137 -0
- package/cpp/ggml-impl.h +567 -0
- package/cpp/ggml-metal-impl.h +288 -0
- package/cpp/ggml-metal.h +24 -43
- package/cpp/ggml-metal.m +4867 -1080
- package/cpp/ggml-opt.cpp +854 -0
- package/cpp/ggml-opt.h +216 -0
- package/cpp/ggml-quants.c +5238 -0
- package/cpp/ggml-quants.h +100 -0
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +14 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +5106 -19431
- package/cpp/ggml.h +847 -669
- package/cpp/gguf.cpp +1329 -0
- package/cpp/gguf.h +202 -0
- package/cpp/rn-audioutils.cpp +68 -0
- package/cpp/rn-audioutils.h +14 -0
- package/cpp/rn-whisper-log.h +11 -0
- package/cpp/rn-whisper.cpp +221 -52
- package/cpp/rn-whisper.h +50 -15
- package/cpp/whisper.cpp +3174 -1533
- package/cpp/whisper.h +176 -44
- package/ios/RNWhisper.mm +139 -46
- package/ios/RNWhisperAudioUtils.h +1 -2
- package/ios/RNWhisperAudioUtils.m +18 -67
- package/ios/RNWhisperContext.h +11 -8
- package/ios/RNWhisperContext.mm +195 -150
- package/jest/mock.js +15 -2
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +76 -28
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +76 -28
- package/lib/module/index.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +13 -4
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +37 -5
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +9 -7
- package/src/NativeRNWhisper.ts +20 -4
- package/src/index.ts +98 -42
- package/src/version.json +1 -1
- package/whisper-rn.podspec +13 -20
- package/cpp/README.md +0 -4
- package/cpp/ggml-metal.metal +0 -2353
package/cpp/gguf.h
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
// This file contains functionality related to "GGUF" files, the binary file format used by ggml.
|
|
2
|
+
// GGUF files have the following structure:
|
|
3
|
+
//
|
|
4
|
+
// 1. File magic "GGUF" (4 bytes).
|
|
5
|
+
// 2. File version (uint32_t).
|
|
6
|
+
// 3. Number of ggml tensors in file (int64_t).
|
|
7
|
+
// 4. Number of key-value-pairs in file (int64_t).
|
|
8
|
+
// 5. For each KV pair:
|
|
9
|
+
// 1. The key (string).
|
|
10
|
+
// 2. The value type (wsp_gguf_type).
|
|
11
|
+
// 3a. If the value type is WSP_GGUF_TYPE_ARRAY:
|
|
12
|
+
// 1. The type of the array (wsp_gguf_type).
|
|
13
|
+
// 2. The number of elements in the array (uint64_t).
|
|
14
|
+
// 3. The binary representation of each element in the array.
|
|
15
|
+
// 3b. Otherwise:
|
|
16
|
+
// 1. The binary representation of the value.
|
|
17
|
+
// 6. For each ggml tensor:
|
|
18
|
+
// 1. The tensor name (string).
|
|
19
|
+
// 2. The number of dimensions of the tensor (uint32_t).
|
|
20
|
+
// 3. For each dimension:
|
|
21
|
+
// 1. The size of the tensor in the dimension (int64_t).
|
|
22
|
+
// 4. The tensor data type (wsp_ggml_type).
|
|
23
|
+
// 5. The tensor data offset in the tensor data binary blob (uint64_t).
|
|
24
|
+
// 7. The tensor data binary blob (optional, aligned).
|
|
25
|
+
//
|
|
26
|
+
// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
|
|
27
|
+
// All enums are stored as int32_t.
|
|
28
|
+
// All bool values are stored as int8_t.
|
|
29
|
+
// If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
|
|
30
|
+
// otherwise WSP_GGUF_DEFAULT_ALIGNMENT is used.
|
|
31
|
+
//
|
|
32
|
+
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
|
|
33
|
+
|
|
34
|
+
#pragma once
|
|
35
|
+
|
|
36
|
+
#include "ggml.h"
|
|
37
|
+
|
|
38
|
+
#include <stdbool.h>
|
|
39
|
+
#include <stdint.h>
|
|
40
|
+
|
|
41
|
+
#define WSP_GGUF_MAGIC "GGUF"
|
|
42
|
+
#define WSP_GGUF_VERSION 3
|
|
43
|
+
|
|
44
|
+
#define WSP_GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
|
|
45
|
+
|
|
46
|
+
#define WSP_GGUF_DEFAULT_ALIGNMENT 32
|
|
47
|
+
|
|
48
|
+
#ifdef __cplusplus
|
|
49
|
+
extern "C" {
|
|
50
|
+
#endif
|
|
51
|
+
|
|
52
|
+
// types that can be stored as GGUF KV data
|
|
53
|
+
enum wsp_gguf_type {
|
|
54
|
+
WSP_GGUF_TYPE_UINT8 = 0,
|
|
55
|
+
WSP_GGUF_TYPE_INT8 = 1,
|
|
56
|
+
WSP_GGUF_TYPE_UINT16 = 2,
|
|
57
|
+
WSP_GGUF_TYPE_INT16 = 3,
|
|
58
|
+
WSP_GGUF_TYPE_UINT32 = 4,
|
|
59
|
+
WSP_GGUF_TYPE_INT32 = 5,
|
|
60
|
+
WSP_GGUF_TYPE_FLOAT32 = 6,
|
|
61
|
+
WSP_GGUF_TYPE_BOOL = 7,
|
|
62
|
+
WSP_GGUF_TYPE_STRING = 8,
|
|
63
|
+
WSP_GGUF_TYPE_ARRAY = 9,
|
|
64
|
+
WSP_GGUF_TYPE_UINT64 = 10,
|
|
65
|
+
WSP_GGUF_TYPE_INT64 = 11,
|
|
66
|
+
WSP_GGUF_TYPE_FLOAT64 = 12,
|
|
67
|
+
WSP_GGUF_TYPE_COUNT, // marks the end of the enum
|
|
68
|
+
};
|
|
69
|
+
|
|
70
|
+
struct wsp_gguf_context;
|
|
71
|
+
|
|
72
|
+
struct wsp_gguf_init_params {
|
|
73
|
+
bool no_alloc;
|
|
74
|
+
|
|
75
|
+
// if not NULL, create a wsp_ggml_context and allocate the tensor data in it
|
|
76
|
+
struct wsp_ggml_context ** ctx;
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_empty(void);
|
|
80
|
+
WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp_gguf_init_params params);
|
|
81
|
+
//WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_buffer(..);
|
|
82
|
+
|
|
83
|
+
WSP_GGML_API void wsp_gguf_free(struct wsp_gguf_context * ctx);
|
|
84
|
+
|
|
85
|
+
WSP_GGML_API const char * wsp_gguf_type_name(enum wsp_gguf_type type);
|
|
86
|
+
|
|
87
|
+
WSP_GGML_API uint32_t wsp_gguf_get_version (const struct wsp_gguf_context * ctx);
|
|
88
|
+
WSP_GGML_API size_t wsp_gguf_get_alignment (const struct wsp_gguf_context * ctx);
|
|
89
|
+
WSP_GGML_API size_t wsp_gguf_get_data_offset(const struct wsp_gguf_context * ctx);
|
|
90
|
+
|
|
91
|
+
WSP_GGML_API int64_t wsp_gguf_get_n_kv(const struct wsp_gguf_context * ctx);
|
|
92
|
+
WSP_GGML_API int64_t wsp_gguf_find_key(const struct wsp_gguf_context * ctx, const char * key); // returns -1 if key is not found
|
|
93
|
+
WSP_GGML_API const char * wsp_gguf_get_key (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
94
|
+
|
|
95
|
+
WSP_GGML_API enum wsp_gguf_type wsp_gguf_get_kv_type (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
96
|
+
WSP_GGML_API enum wsp_gguf_type wsp_gguf_get_arr_type(const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
97
|
+
|
|
98
|
+
// will abort if the wrong type is used for the key
|
|
99
|
+
WSP_GGML_API uint8_t wsp_gguf_get_val_u8 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
100
|
+
WSP_GGML_API int8_t wsp_gguf_get_val_i8 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
101
|
+
WSP_GGML_API uint16_t wsp_gguf_get_val_u16 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
102
|
+
WSP_GGML_API int16_t wsp_gguf_get_val_i16 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
103
|
+
WSP_GGML_API uint32_t wsp_gguf_get_val_u32 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
104
|
+
WSP_GGML_API int32_t wsp_gguf_get_val_i32 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
105
|
+
WSP_GGML_API float wsp_gguf_get_val_f32 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
106
|
+
WSP_GGML_API uint64_t wsp_gguf_get_val_u64 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
107
|
+
WSP_GGML_API int64_t wsp_gguf_get_val_i64 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
108
|
+
WSP_GGML_API double wsp_gguf_get_val_f64 (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
109
|
+
WSP_GGML_API bool wsp_gguf_get_val_bool(const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
110
|
+
WSP_GGML_API const char * wsp_gguf_get_val_str (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
111
|
+
WSP_GGML_API const void * wsp_gguf_get_val_data(const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
112
|
+
WSP_GGML_API size_t wsp_gguf_get_arr_n (const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
113
|
+
|
|
114
|
+
// get raw pointer to the first element of the array with the given key_id
|
|
115
|
+
// for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
|
|
116
|
+
WSP_GGML_API const void * wsp_gguf_get_arr_data(const struct wsp_gguf_context * ctx, int64_t key_id);
|
|
117
|
+
|
|
118
|
+
// get ith C string from array with given key_id
|
|
119
|
+
WSP_GGML_API const char * wsp_gguf_get_arr_str (const struct wsp_gguf_context * ctx, int64_t key_id, size_t i);
|
|
120
|
+
|
|
121
|
+
WSP_GGML_API int64_t wsp_gguf_get_n_tensors (const struct wsp_gguf_context * ctx);
|
|
122
|
+
WSP_GGML_API int64_t wsp_gguf_find_tensor (const struct wsp_gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
|
|
123
|
+
WSP_GGML_API size_t wsp_gguf_get_tensor_offset(const struct wsp_gguf_context * ctx, int64_t tensor_id);
|
|
124
|
+
WSP_GGML_API const char * wsp_gguf_get_tensor_name (const struct wsp_gguf_context * ctx, int64_t tensor_id);
|
|
125
|
+
WSP_GGML_API enum wsp_ggml_type wsp_gguf_get_tensor_type (const struct wsp_gguf_context * ctx, int64_t tensor_id);
|
|
126
|
+
WSP_GGML_API size_t wsp_gguf_get_tensor_size (const struct wsp_gguf_context * ctx, int64_t tensor_id);
|
|
127
|
+
|
|
128
|
+
// removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
|
|
129
|
+
WSP_GGML_API int64_t wsp_gguf_remove_key(struct wsp_gguf_context * ctx, const char * key);
|
|
130
|
+
|
|
131
|
+
// overrides an existing KV pair or adds a new one, the new KV pair is always at the back
|
|
132
|
+
WSP_GGML_API void wsp_gguf_set_val_u8 (struct wsp_gguf_context * ctx, const char * key, uint8_t val);
|
|
133
|
+
WSP_GGML_API void wsp_gguf_set_val_i8 (struct wsp_gguf_context * ctx, const char * key, int8_t val);
|
|
134
|
+
WSP_GGML_API void wsp_gguf_set_val_u16 (struct wsp_gguf_context * ctx, const char * key, uint16_t val);
|
|
135
|
+
WSP_GGML_API void wsp_gguf_set_val_i16 (struct wsp_gguf_context * ctx, const char * key, int16_t val);
|
|
136
|
+
WSP_GGML_API void wsp_gguf_set_val_u32 (struct wsp_gguf_context * ctx, const char * key, uint32_t val);
|
|
137
|
+
WSP_GGML_API void wsp_gguf_set_val_i32 (struct wsp_gguf_context * ctx, const char * key, int32_t val);
|
|
138
|
+
WSP_GGML_API void wsp_gguf_set_val_f32 (struct wsp_gguf_context * ctx, const char * key, float val);
|
|
139
|
+
WSP_GGML_API void wsp_gguf_set_val_u64 (struct wsp_gguf_context * ctx, const char * key, uint64_t val);
|
|
140
|
+
WSP_GGML_API void wsp_gguf_set_val_i64 (struct wsp_gguf_context * ctx, const char * key, int64_t val);
|
|
141
|
+
WSP_GGML_API void wsp_gguf_set_val_f64 (struct wsp_gguf_context * ctx, const char * key, double val);
|
|
142
|
+
WSP_GGML_API void wsp_gguf_set_val_bool(struct wsp_gguf_context * ctx, const char * key, bool val);
|
|
143
|
+
WSP_GGML_API void wsp_gguf_set_val_str (struct wsp_gguf_context * ctx, const char * key, const char * val);
|
|
144
|
+
|
|
145
|
+
// creates a new array with n elements of the given type and copies the corresponding number of bytes from data
|
|
146
|
+
WSP_GGML_API void wsp_gguf_set_arr_data(struct wsp_gguf_context * ctx, const char * key, enum wsp_gguf_type type, const void * data, size_t n);
|
|
147
|
+
|
|
148
|
+
// creates a new array with n strings and copies the corresponding strings from data
|
|
149
|
+
WSP_GGML_API void wsp_gguf_set_arr_str (struct wsp_gguf_context * ctx, const char * key, const char ** data, size_t n);
|
|
150
|
+
|
|
151
|
+
// set or add KV pairs from another context
|
|
152
|
+
WSP_GGML_API void wsp_gguf_set_kv(struct wsp_gguf_context * ctx, const struct wsp_gguf_context * src);
|
|
153
|
+
|
|
154
|
+
// add tensor to GGUF context, tensor name must be unique
|
|
155
|
+
WSP_GGML_API void wsp_gguf_add_tensor(struct wsp_gguf_context * ctx, const struct wsp_ggml_tensor * tensor);
|
|
156
|
+
|
|
157
|
+
// after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
|
|
158
|
+
// in such a way that the tensor data remains as one contiguous block (except for padding)
|
|
159
|
+
WSP_GGML_API void wsp_gguf_set_tensor_type(struct wsp_gguf_context * ctx, const char * name, enum wsp_ggml_type type);
|
|
160
|
+
|
|
161
|
+
// assumes that at least wsp_gguf_get_tensor_size bytes can be read from data
|
|
162
|
+
WSP_GGML_API void wsp_gguf_set_tensor_data(struct wsp_gguf_context * ctx, const char * name, const void * data);
|
|
163
|
+
|
|
164
|
+
// writing gguf files can be done in 3 ways:
|
|
165
|
+
//
|
|
166
|
+
// - write the entire wsp_gguf_context to a binary file in a single pass:
|
|
167
|
+
//
|
|
168
|
+
// wsp_gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
|
|
169
|
+
//
|
|
170
|
+
// - write only the meta data to a file, then re-open the file and append the tensor data:
|
|
171
|
+
//
|
|
172
|
+
// wsp_gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
|
|
173
|
+
// FILE * f = fopen(fname, "ab");
|
|
174
|
+
// fwrite(..., f); // write tensor data
|
|
175
|
+
// fclose(f);
|
|
176
|
+
//
|
|
177
|
+
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
|
|
178
|
+
//
|
|
179
|
+
// FILE * f = fopen(fname, "wb");
|
|
180
|
+
// const size_t size_meta = wsp_gguf_get_meta_size(ctx);
|
|
181
|
+
// fseek(f, size_meta, SEEK_SET);
|
|
182
|
+
// fwrite(..., f); // write tensor data
|
|
183
|
+
// void * data = malloc(size_meta);
|
|
184
|
+
// wsp_gguf_get_meta_data(ctx, data);
|
|
185
|
+
// rewind(f);
|
|
186
|
+
// fwrite(data, 1, size_meta, f);
|
|
187
|
+
// free(data);
|
|
188
|
+
// fclose(f);
|
|
189
|
+
//
|
|
190
|
+
|
|
191
|
+
// write the entire context to a binary file
|
|
192
|
+
WSP_GGML_API bool wsp_gguf_write_to_file(const struct wsp_gguf_context * ctx, const char * fname, bool only_meta);
|
|
193
|
+
|
|
194
|
+
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
|
195
|
+
WSP_GGML_API size_t wsp_gguf_get_meta_size(const struct wsp_gguf_context * ctx);
|
|
196
|
+
|
|
197
|
+
// writes the meta data to pointer "data"
|
|
198
|
+
WSP_GGML_API void wsp_gguf_get_meta_data(const struct wsp_gguf_context * ctx, void * data);
|
|
199
|
+
|
|
200
|
+
#ifdef __cplusplus
|
|
201
|
+
}
|
|
202
|
+
#endif
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
#include "rn-audioutils.h"
|
|
2
|
+
#include "rn-whisper-log.h"
|
|
3
|
+
|
|
4
|
+
namespace rnaudioutils {
|
|
5
|
+
|
|
6
|
+
std::vector<uint8_t> concat_short_buffers(const std::vector<short*>& buffers, const std::vector<int>& slice_n_samples) {
|
|
7
|
+
std::vector<uint8_t> output_data;
|
|
8
|
+
|
|
9
|
+
for (size_t i = 0; i < buffers.size(); i++) {
|
|
10
|
+
int size = slice_n_samples[i]; // Number of shorts
|
|
11
|
+
short* slice = buffers[i];
|
|
12
|
+
|
|
13
|
+
// Copy each short as two bytes
|
|
14
|
+
for (int j = 0; j < size; j++) {
|
|
15
|
+
output_data.push_back(static_cast<uint8_t>(slice[j] & 0xFF)); // Lower byte
|
|
16
|
+
output_data.push_back(static_cast<uint8_t>((slice[j] >> 8) & 0xFF)); // Higher byte
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
return output_data;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
std::vector<uint8_t> remove_trailing_zeros(const std::vector<uint8_t>& audio_data) {
|
|
24
|
+
auto last = std::find_if(audio_data.rbegin(), audio_data.rend(), [](uint8_t byte) { return byte != 0; });
|
|
25
|
+
// Round the kept length up to an even byte count so the final 16-bit sample is never split when only its high byte is zero.
return std::vector<uint8_t>(audio_data.begin(), audio_data.begin() + std::min<size_t>(audio_data.size(), (last.base() - audio_data.begin() + 1) / 2 * 2));
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
void save_wav_file(const std::vector<uint8_t>& raw, const std::string& file) {
|
|
29
|
+
std::vector<uint8_t> data = remove_trailing_zeros(raw);
|
|
30
|
+
|
|
31
|
+
std::ofstream output(file, std::ios::binary);
|
|
32
|
+
|
|
33
|
+
if (!output.is_open()) {
|
|
34
|
+
RNWHISPER_LOG_ERROR("Failed to open file for writing: %s\n", file.c_str());
|
|
35
|
+
return;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// WAVE header
|
|
39
|
+
output.write("RIFF", 4);
|
|
40
|
+
int32_t chunk_size = 36 + static_cast<int32_t>(data.size());
|
|
41
|
+
output.write(reinterpret_cast<char*>(&chunk_size), sizeof(chunk_size));
|
|
42
|
+
output.write("WAVE", 4);
|
|
43
|
+
output.write("fmt ", 4);
|
|
44
|
+
int32_t sub_chunk_size = 16;
|
|
45
|
+
output.write(reinterpret_cast<char*>(&sub_chunk_size), sizeof(sub_chunk_size));
|
|
46
|
+
short audio_format = 1;
|
|
47
|
+
output.write(reinterpret_cast<char*>(&audio_format), sizeof(audio_format));
|
|
48
|
+
short num_channels = 1;
|
|
49
|
+
output.write(reinterpret_cast<char*>(&num_channels), sizeof(num_channels));
|
|
50
|
+
int32_t sample_rate = WHISPER_SAMPLE_RATE;
|
|
51
|
+
output.write(reinterpret_cast<char*>(&sample_rate), sizeof(sample_rate));
|
|
52
|
+
int32_t byte_rate = WHISPER_SAMPLE_RATE * 2;
|
|
53
|
+
output.write(reinterpret_cast<char*>(&byte_rate), sizeof(byte_rate));
|
|
54
|
+
short block_align = 2;
|
|
55
|
+
output.write(reinterpret_cast<char*>(&block_align), sizeof(block_align));
|
|
56
|
+
short bits_per_sample = 16;
|
|
57
|
+
output.write(reinterpret_cast<char*>(&bits_per_sample), sizeof(bits_per_sample));
|
|
58
|
+
output.write("data", 4);
|
|
59
|
+
int32_t sub_chunk2_size = static_cast<int32_t>(data.size());
|
|
60
|
+
output.write(reinterpret_cast<char*>(&sub_chunk2_size), sizeof(sub_chunk2_size));
|
|
61
|
+
output.write(reinterpret_cast<const char*>(data.data()), data.size());
|
|
62
|
+
|
|
63
|
+
output.close();
|
|
64
|
+
|
|
65
|
+
RNWHISPER_LOG_INFO("Saved audio file: %s\n", file.c_str());
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
} // namespace rnaudioutils
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <fstream>
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include <cstdint>
|
|
5
|
+
#include <cstring>
|
|
6
|
+
#include <algorithm>
|
|
7
|
+
#include "whisper.h"
|
|
8
|
+
|
|
9
|
+
namespace rnaudioutils {
|
|
10
|
+
|
|
11
|
+
std::vector<uint8_t> concat_short_buffers(const std::vector<short*>& buffers, const std::vector<int>& slice_n_samples);
|
|
12
|
+
void save_wav_file(const std::vector<uint8_t>& raw, const std::string& file);
|
|
13
|
+
|
|
14
|
+
} // namespace rnaudioutils
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#if defined(__ANDROID__) && defined(RNWHISPER_ANDROID_ENABLE_LOGGING)
|
|
2
|
+
#include <android/log.h>
|
|
3
|
+
#define RNWHISPER_ANDROID_TAG "RNWHISPER_LOG_ANDROID"
|
|
4
|
+
#define RNWHISPER_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , RNWHISPER_ANDROID_TAG, __VA_ARGS__)
|
|
5
|
+
#define RNWHISPER_LOG_WARN(...) __android_log_print(ANDROID_LOG_WARN , RNWHISPER_ANDROID_TAG, __VA_ARGS__)
|
|
6
|
+
#define RNWHISPER_LOG_ERROR(...) __android_log_print(ANDROID_LOG_ERROR, RNWHISPER_ANDROID_TAG, __VA_ARGS__)
|
|
7
|
+
#else
|
|
8
|
+
#define RNWHISPER_LOG_INFO(...) fprintf(stderr, __VA_ARGS__)
|
|
9
|
+
#define RNWHISPER_LOG_WARN(...) fprintf(stderr, __VA_ARGS__)
|
|
10
|
+
#define RNWHISPER_LOG_ERROR(...) fprintf(stderr, __VA_ARGS__)
|
|
11
|
+
#endif // __ANDROID__
|
package/cpp/rn-whisper.cpp
CHANGED
|
@@ -2,40 +2,94 @@
|
|
|
2
2
|
#include <string>
|
|
3
3
|
#include <vector>
|
|
4
4
|
#include <unordered_map>
|
|
5
|
-
#include "whisper.h"
|
|
5
|
+
#include "rn-whisper.h"
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
#define DEFAULT_MAX_AUDIO_SEC 30 // fallback audio length in seconds, used when the caller passes sec <= 0
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
namespace rnwhisper {
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
11
|
+
const char * system_info(void) {
|
|
12
|
+
static std::string s;
|
|
13
|
+
s = "";
|
|
14
|
+
if (wsp_ggml_cpu_has_avx() == 1) s += "AVX ";
|
|
15
|
+
if (wsp_ggml_cpu_has_avx2() == 1) s += "AVX2 ";
|
|
16
|
+
if (wsp_ggml_cpu_has_avx512() == 1) s += "AVX512 ";
|
|
17
|
+
if (wsp_ggml_cpu_has_fma() == 1) s += "FMA ";
|
|
18
|
+
if (wsp_ggml_cpu_has_neon() == 1) s += "NEON ";
|
|
19
|
+
if (wsp_ggml_cpu_has_arm_fma() == 1) s += "ARM_FMA ";
|
|
20
|
+
if (wsp_ggml_cpu_has_f16c() == 1) s += "F16C ";
|
|
21
|
+
if (wsp_ggml_cpu_has_fp16_va() == 1) s += "FP16_VA ";
|
|
22
|
+
if (wsp_ggml_cpu_has_sse3() == 1) s += "SSE3 ";
|
|
23
|
+
if (wsp_ggml_cpu_has_ssse3() == 1) s += "SSSE3 ";
|
|
24
|
+
if (wsp_ggml_cpu_has_vsx() == 1) s += "VSX ";
|
|
25
|
+
#ifdef WHISPER_USE_COREML
|
|
26
|
+
s += "COREML ";
|
|
27
|
+
#endif
|
|
28
|
+
s.erase(s.find_last_not_of(" ") + 1);
|
|
29
|
+
return s.c_str();
|
|
14
30
|
}
|
|
15
31
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
abort_map.erase(job_id);
|
|
19
|
-
}
|
|
20
|
-
}
|
|
32
|
+
std::string bench(struct whisper_context * ctx, int n_threads) {
|
|
33
|
+
const int n_mels = whisper_model_n_mels(ctx);
|
|
21
34
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
35
|
+
if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
|
|
36
|
+
return "error: failed to set mel: " + std::to_string(ret);
|
|
37
|
+
}
|
|
38
|
+
// heat encoder
|
|
39
|
+
if (int ret = whisper_encode(ctx, 0, n_threads)) { // ret keeps the real status code; nonzero means failure
|
|
40
|
+
return "error: failed to encode: " + std::to_string(ret);
|
|
41
|
+
}
|
|
27
42
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
43
|
+
whisper_token tokens[512];
|
|
44
|
+
memset(tokens, 0, sizeof(tokens));
|
|
45
|
+
|
|
46
|
+
// prompt heat
|
|
47
|
+
if (int ret = whisper_decode(ctx, tokens, 256, 0, n_threads)) { // ret keeps the real status code; nonzero means failure
|
|
48
|
+
return "error: failed to decode: " + std::to_string(ret);
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// text-generation heat
|
|
52
|
+
if (int ret = whisper_decode(ctx, tokens, 1, 256, n_threads)) { // ret keeps the real status code; nonzero means failure
|
|
53
|
+
return "error: failed to decode: " + std::to_string(ret);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
whisper_reset_timings(ctx);
|
|
57
|
+
|
|
58
|
+
// actual run
|
|
59
|
+
if (int ret = whisper_encode(ctx, 0, n_threads)) { // ret keeps the real status code; nonzero means failure
|
|
60
|
+
return "error: failed to encode: " + std::to_string(ret);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// text-generation
|
|
64
|
+
for (int i = 0; i < 256; i++) {
|
|
65
|
+
if (int ret = whisper_decode(ctx, tokens, 1, i, n_threads)) { // ret keeps the real status code; nonzero means failure
|
|
66
|
+
return "error: failed to decode: " + std::to_string(ret);
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
// batched decoding
|
|
71
|
+
for (int i = 0; i < 64; i++) {
|
|
72
|
+
if (int ret = whisper_decode(ctx, tokens, 5, 0, n_threads)) { // ret keeps the real status code; nonzero means failure
|
|
73
|
+
return "error: failed to decode: " + std::to_string(ret);
|
|
74
|
+
}
|
|
75
|
+
}
|
|
34
76
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
77
|
+
// prompt processing
|
|
78
|
+
for (int i = 0; i < 16; i++) {
|
|
79
|
+
if (int ret = whisper_decode(ctx, tokens, 256, 0, n_threads)) { // ret keeps the real status code; nonzero means failure
|
|
80
|
+
return "error: failed to decode: " + std::to_string(ret);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
const struct whisper_timings * timings = whisper_get_timings(ctx);
|
|
85
|
+
|
|
86
|
+
return std::string("[") +
|
|
87
|
+
"\"" + system_info() + "\"," +
|
|
88
|
+
std::to_string(n_threads) + "," +
|
|
89
|
+
std::to_string(timings->encode_ms) + "," +
|
|
90
|
+
std::to_string(timings->decode_ms) + "," +
|
|
91
|
+
std::to_string(timings->batchd_ms) + "," +
|
|
92
|
+
std::to_string(timings->prompt_ms) + "]";
|
|
39
93
|
}
|
|
40
94
|
|
|
41
95
|
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
|
|
@@ -51,42 +105,157 @@ void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate
|
|
|
51
105
|
}
|
|
52
106
|
}
|
|
53
107
|
|
|
54
|
-
bool
|
|
55
|
-
|
|
56
|
-
|
|
108
|
+
bool vad_simple_impl(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
|
109
|
+
const int n_samples = pcmf32.size();
|
|
110
|
+
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
|
57
111
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
112
|
+
if (n_samples_last >= n_samples) {
|
|
113
|
+
// not enough samples - assume no speech
|
|
114
|
+
return false;
|
|
115
|
+
}
|
|
62
116
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
117
|
+
if (freq_thold > 0.0f) {
|
|
118
|
+
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
|
119
|
+
}
|
|
66
120
|
|
|
67
|
-
|
|
68
|
-
|
|
121
|
+
float energy_all = 0.0f;
|
|
122
|
+
float energy_last = 0.0f;
|
|
69
123
|
|
|
70
|
-
|
|
71
|
-
|
|
124
|
+
for (int i = 0; i < n_samples; i++) {
|
|
125
|
+
energy_all += fabsf(pcmf32[i]);
|
|
72
126
|
|
|
73
|
-
|
|
74
|
-
|
|
127
|
+
if (i >= n_samples - n_samples_last) {
|
|
128
|
+
energy_last += fabsf(pcmf32[i]);
|
|
129
|
+
}
|
|
75
130
|
}
|
|
76
|
-
}
|
|
77
131
|
|
|
78
|
-
|
|
79
|
-
|
|
132
|
+
energy_all /= n_samples;
|
|
133
|
+
energy_last /= n_samples_last;
|
|
80
134
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
135
|
+
if (verbose) {
|
|
136
|
+
RNWHISPER_LOG_INFO("%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
|
137
|
+
}
|
|
84
138
|
|
|
85
|
-
|
|
139
|
+
if (energy_last > vad_thold*energy_all) {
|
|
140
|
+
return false;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
return true;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
void job::set_realtime_params(
|
|
147
|
+
vad_params params,
|
|
148
|
+
int sec,
|
|
149
|
+
int slice_sec,
|
|
150
|
+
float min_sec,
|
|
151
|
+
const char* output_path
|
|
152
|
+
) {
|
|
153
|
+
vad = params;
|
|
154
|
+
if (vad.vad_ms < 2000) vad.vad_ms = 2000;
|
|
155
|
+
audio_sec = sec > 0 ? sec : DEFAULT_MAX_AUDIO_SEC;
|
|
156
|
+
audio_slice_sec = slice_sec > 0 && slice_sec < audio_sec ? slice_sec : audio_sec;
|
|
157
|
+
audio_min_sec = min_sec >= 0.5 && min_sec <= audio_slice_sec ? min_sec : 1.0f;
|
|
158
|
+
audio_output_path = output_path;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
bool job::vad_simple(int slice_index, int n_samples, int n) {
|
|
162
|
+
if (slice_index >= pcm_slices.size()) return !vad.use_vad;
|
|
163
|
+
if (!vad.use_vad) return true;
|
|
164
|
+
|
|
165
|
+
short* pcm = pcm_slices[slice_index];
|
|
166
|
+
int sample_size = (int) (WHISPER_SAMPLE_RATE * vad.vad_ms / 1000);
|
|
167
|
+
if (n_samples + n > sample_size) {
|
|
168
|
+
int start = n_samples + n - sample_size;
|
|
169
|
+
std::vector<float> pcmf32(sample_size);
|
|
170
|
+
for (int i = 0; i < sample_size; i++) {
|
|
171
|
+
pcmf32[i] = (float)pcm[i + start] / 32768.0f;
|
|
172
|
+
}
|
|
173
|
+
return vad_simple_impl(pcmf32, WHISPER_SAMPLE_RATE, vad.last_ms, vad.vad_thold, vad.freq_thold, vad.verbose);
|
|
174
|
+
}
|
|
86
175
|
return false;
|
|
87
|
-
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
void job::put_pcm_data(short* data, int slice_index, int n_samples, int n) {
|
|
179
|
+
if (pcm_slices.size() == slice_index) {
|
|
180
|
+
int n_slices = (int) (WHISPER_SAMPLE_RATE * audio_slice_sec);
|
|
181
|
+
pcm_slices.push_back(new short[n_slices]);
|
|
182
|
+
}
|
|
183
|
+
short* pcm = pcm_slices[slice_index];
|
|
184
|
+
for (int i = 0; i < n; i++) {
|
|
185
|
+
pcm[i + n_samples] = data[i];
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
float* job::pcm_slice_to_f32(int slice_index, int size) {
|
|
190
|
+
if (pcm_slices.size() > slice_index) {
|
|
191
|
+
float* pcmf32 = new float[size];
|
|
192
|
+
for (int i = 0; i < size; i++) {
|
|
193
|
+
pcmf32[i] = (float)pcm_slices[slice_index][i] / 32768.0f;
|
|
194
|
+
}
|
|
195
|
+
return pcmf32;
|
|
196
|
+
}
|
|
197
|
+
return nullptr;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
bool job::is_aborted() {
|
|
201
|
+
return aborted;
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
void job::abort() {
|
|
205
|
+
aborted = true;
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
job::~job() {
|
|
209
|
+
RNWHISPER_LOG_INFO("rnwhisper::job::%s: job_id: %d\n", __func__, job_id);
|
|
210
|
+
|
|
211
|
+
for (size_t i = 0; i < pcm_slices.size(); i++) {
|
|
212
|
+
delete[] pcm_slices[i];
|
|
213
|
+
}
|
|
214
|
+
pcm_slices.clear();
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
std::unordered_map<int, job*> job_map;
|
|
218
|
+
|
|
219
|
+
void job_abort_all() {
|
|
220
|
+
for (auto it = job_map.begin(); it != job_map.end(); ++it) {
|
|
221
|
+
it->second->abort();
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
job* job_new(int job_id, struct whisper_full_params params) {
|
|
226
|
+
job* ctx = new job();
|
|
227
|
+
ctx->job_id = job_id;
|
|
228
|
+
ctx->params = params;
|
|
229
|
+
|
|
230
|
+
job_map[job_id] = ctx;
|
|
231
|
+
|
|
232
|
+
// Abort handler
|
|
233
|
+
params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
|
234
|
+
job *j = (job*)user_data;
|
|
235
|
+
return !j->is_aborted();
|
|
236
|
+
};
|
|
237
|
+
params.encoder_begin_callback_user_data = job_map[job_id];
|
|
238
|
+
params.abort_callback = [](void * user_data) {
|
|
239
|
+
job *j = (job*)user_data;
|
|
240
|
+
return j->is_aborted();
|
|
241
|
+
};
|
|
242
|
+
params.abort_callback_user_data = job_map[job_id];
|
|
88
243
|
|
|
89
|
-
|
|
244
|
+
return job_map[job_id];
|
|
90
245
|
}
|
|
91
246
|
|
|
92
|
-
|
|
247
|
+
job* job_get(int job_id) {
|
|
248
|
+
if (job_map.find(job_id) != job_map.end()) {
|
|
249
|
+
return job_map[job_id];
|
|
250
|
+
}
|
|
251
|
+
return nullptr;
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
void job_remove(int job_id) {
|
|
255
|
+
if (job_map.find(job_id) != job_map.end()) {
|
|
256
|
+
delete job_map[job_id];
|
|
257
|
+
}
|
|
258
|
+
job_map.erase(job_id);
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
}
|
package/cpp/rn-whisper.h
CHANGED
|
@@ -1,17 +1,52 @@
|
|
|
1
|
+
#ifndef RNWHISPER_H
|
|
2
|
+
#define RNWHISPER_H
|
|
1
3
|
|
|
2
|
-
#ifdef __cplusplus
|
|
3
4
|
#include <string>
|
|
4
|
-
#include <
|
|
5
|
-
|
|
6
|
-
#
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
5
|
+
#include <vector>
|
|
6
|
+
#include "whisper.h"
|
|
7
|
+
#include "rn-whisper-log.h"
|
|
8
|
+
#include "rn-audioutils.h"
|
|
9
|
+
|
|
10
|
+
namespace rnwhisper {
|
|
11
|
+
|
|
12
|
+
std::string bench(whisper_context * ctx, int n_threads);
|
|
13
|
+
|
|
14
|
+
struct vad_params {
|
|
15
|
+
bool use_vad = false;
|
|
16
|
+
float vad_thold = 0.6f;
|
|
17
|
+
float freq_thold = 100.0f;
|
|
18
|
+
int vad_ms = 2000;
|
|
19
|
+
int last_ms = 1000;
|
|
20
|
+
bool verbose = false;
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
struct job {
|
|
24
|
+
int job_id;
|
|
25
|
+
bool aborted = false;
|
|
26
|
+
whisper_full_params params;
|
|
27
|
+
|
|
28
|
+
~job();
|
|
29
|
+
bool is_aborted();
|
|
30
|
+
void abort();
|
|
31
|
+
|
|
32
|
+
// Realtime transcription only:
|
|
33
|
+
vad_params vad;
|
|
34
|
+
int audio_sec = 0;
|
|
35
|
+
int audio_slice_sec = 0;
|
|
36
|
+
float audio_min_sec = 0;
|
|
37
|
+
const char* audio_output_path = nullptr;
|
|
38
|
+
std::vector<short *> pcm_slices;
|
|
39
|
+
void set_realtime_params(vad_params vad, int sec, int slice_sec, float min_sec, const char* output_path);
|
|
40
|
+
bool vad_simple(int slice_index, int n_samples, int n);
|
|
41
|
+
void put_pcm_data(short* pcm, int slice_index, int n_samples, int n);
|
|
42
|
+
float* pcm_slice_to_f32(int slice_index, int size);
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
void job_abort_all();
|
|
46
|
+
job* job_new(int job_id, struct whisper_full_params params);
|
|
47
|
+
void job_remove(int job_id);
|
|
48
|
+
job* job_get(int job_id);
|
|
49
|
+
|
|
50
|
+
} // namespace rnwhisper
|
|
51
|
+
|
|
52
|
+
#endif // RNWHISPER_H
|