whispercpp 1.2.0.2 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +3 -92
- data/ext/extconf.rb +9 -0
- data/ext/ggml.c +18380 -5241
- data/ext/ggml.h +2156 -502
- data/ext/ruby_whisper.cpp +13 -47
- data/ext/whisper.cpp +4182 -1787
- data/ext/whisper.h +334 -65
- metadata +3 -3
data/ext/ruby_whisper.cpp
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#include <ruby.h>
|
2
|
-
#include <ruby/thread.h>
|
3
2
|
#include "ruby_whisper.h"
|
4
3
|
#define DR_WAV_IMPLEMENTATION
|
5
4
|
#include "dr_wav.h"
|
@@ -88,39 +87,13 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
|
|
88
87
|
if (!rb_respond_to(whisper_model_file_path, rb_intern("to_s"))) {
|
89
88
|
rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
|
90
89
|
}
|
91
|
-
rw->context =
|
90
|
+
rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params());
|
92
91
|
if (rw->context == nullptr) {
|
93
92
|
rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
|
94
93
|
}
|
95
94
|
return self;
|
96
95
|
}
|
97
96
|
|
98
|
-
struct WhisperFullParallelParams {
|
99
|
-
ruby_whisper *rw;
|
100
|
-
ruby_whisper_params *rwp;
|
101
|
-
std::vector<float> pcmf32; // mono-channel F32 PCM
|
102
|
-
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
103
|
-
};
|
104
|
-
|
105
|
-
|
106
|
-
static void stop_whisper_unblock(void *args) {
|
107
|
-
struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
|
108
|
-
fprintf(stderr, "Set running to abort\n");
|
109
|
-
whisper_running_abort(object->rw->context);
|
110
|
-
}
|
111
|
-
|
112
|
-
static VALUE call_whisper_full_parallel(void *args) {
|
113
|
-
struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
|
114
|
-
|
115
|
-
whisper_running_restore(object->rw->context);
|
116
|
-
|
117
|
-
if (whisper_full_parallel(object->rw->context, object->rwp->params, object->pcmf32.data(), object->pcmf32.size(), 1) != 0) {
|
118
|
-
fprintf(stderr, "failed to process audio\n");
|
119
|
-
return INT2FIX(-1);
|
120
|
-
}
|
121
|
-
return INT2FIX(0);
|
122
|
-
}
|
123
|
-
|
124
97
|
/*
|
125
98
|
* transcribe a single file
|
126
99
|
* can emit to a block results
|
@@ -141,9 +114,8 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
|
141
114
|
|
142
115
|
std::string fname_inp = StringValueCStr(wave_file_path);
|
143
116
|
|
144
|
-
|
145
|
-
|
146
|
-
struct WhisperFullParallelParams object;
|
117
|
+
std::vector<float> pcmf32; // mono-channel F32 PCM
|
118
|
+
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
147
119
|
|
148
120
|
// WAV input - this is directly from main.cpp example
|
149
121
|
{
|
@@ -201,49 +173,43 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
|
201
173
|
drwav_uninit(&wav);
|
202
174
|
|
203
175
|
// convert to mono, float
|
204
|
-
|
176
|
+
pcmf32.resize(n);
|
205
177
|
if (wav.channels == 1) {
|
206
178
|
for (uint64_t i = 0; i < n; i++) {
|
207
|
-
|
179
|
+
pcmf32[i] = float(pcm16[i])/32768.0f;
|
208
180
|
}
|
209
181
|
} else {
|
210
182
|
for (uint64_t i = 0; i < n; i++) {
|
211
|
-
|
183
|
+
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
212
184
|
}
|
213
185
|
}
|
214
186
|
|
215
187
|
if (rwp->diarize) {
|
216
188
|
// convert to stereo, float
|
217
|
-
|
189
|
+
pcmf32s.resize(2);
|
218
190
|
|
219
|
-
|
220
|
-
|
191
|
+
pcmf32s[0].resize(n);
|
192
|
+
pcmf32s[1].resize(n);
|
221
193
|
for (uint64_t i = 0; i < n; i++) {
|
222
|
-
|
223
|
-
|
194
|
+
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
|
195
|
+
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
|
224
196
|
}
|
225
197
|
}
|
226
198
|
}
|
227
199
|
{
|
228
200
|
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
229
201
|
|
230
|
-
rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
|
202
|
+
rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
231
203
|
bool is_aborted = *(bool*)user_data;
|
232
204
|
return !is_aborted;
|
233
205
|
};
|
234
206
|
rwp->params.encoder_begin_callback_user_data = &is_aborted;
|
235
207
|
}
|
236
208
|
|
237
|
-
|
238
|
-
object.rwp = rwp;
|
239
|
-
|
240
|
-
int r = (int)(VALUE)rb_thread_call_without_gvl((void *(*)(void *))call_whisper_full_parallel, &object, stop_whisper_unblock, &object);
|
241
|
-
//if (whisper_full_parallel(rw->context, rwp->params, object.pcmf32.data(), pcmf32.size(), 1) != 0) {
|
242
|
-
if (r != 0) {
|
209
|
+
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
|
243
210
|
fprintf(stderr, "failed to process audio\n");
|
244
211
|
return self;
|
245
212
|
}
|
246
|
-
|
247
213
|
const int n_segments = whisper_full_n_segments(rw->context);
|
248
214
|
VALUE output = rb_str_new2("");
|
249
215
|
for (int i = 0; i < n_segments; ++i) {
|