whispercpp 1.2.0.2 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (9)
  1. checksums.yaml +4 -4
  2. data/Rakefile +3 -92
  3. data/ext/extconf.rb +9 -0
  4. data/ext/ggml.c +18380 -5241
  5. data/ext/ggml.h +2156 -502
  6. data/ext/ruby_whisper.cpp +13 -47
  7. data/ext/whisper.cpp +4182 -1787
  8. data/ext/whisper.h +334 -65
  9. metadata +3 -3
data/ext/ruby_whisper.cpp CHANGED
@@ -1,5 +1,4 @@
1
1
  #include <ruby.h>
2
- #include <ruby/thread.h>
3
2
  #include "ruby_whisper.h"
4
3
  #define DR_WAV_IMPLEMENTATION
5
4
  #include "dr_wav.h"
@@ -88,39 +87,13 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
88
87
  if (!rb_respond_to(whisper_model_file_path, rb_intern("to_s"))) {
89
88
  rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
90
89
  }
91
- rw->context = whisper_init_from_file(StringValueCStr(whisper_model_file_path));
90
+ rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params());
92
91
  if (rw->context == nullptr) {
93
92
  rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
94
93
  }
95
94
  return self;
96
95
  }
97
96
 
98
- struct WhisperFullParallelParams {
99
- ruby_whisper *rw;
100
- ruby_whisper_params *rwp;
101
- std::vector<float> pcmf32; // mono-channel F32 PCM
102
- std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
103
- };
104
-
105
-
106
- static void stop_whisper_unblock(void *args) {
107
- struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
108
- fprintf(stderr, "Set running to abort\n");
109
- whisper_running_abort(object->rw->context);
110
- }
111
-
112
- static VALUE call_whisper_full_parallel(void *args) {
113
- struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
114
-
115
- whisper_running_restore(object->rw->context);
116
-
117
- if (whisper_full_parallel(object->rw->context, object->rwp->params, object->pcmf32.data(), object->pcmf32.size(), 1) != 0) {
118
- fprintf(stderr, "failed to process audio\n");
119
- return INT2FIX(-1);
120
- }
121
- return INT2FIX(0);
122
- }
123
-
124
97
  /*
125
98
  * transcribe a single file
126
99
  * can emit to a block results
@@ -141,9 +114,8 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
141
114
 
142
115
  std::string fname_inp = StringValueCStr(wave_file_path);
143
116
 
144
- //std::vector<float> pcmf32; // mono-channel F32 PCM
145
- //std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
146
- struct WhisperFullParallelParams object;
117
+ std::vector<float> pcmf32; // mono-channel F32 PCM
118
+ std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
147
119
 
148
120
  // WAV input - this is directly from main.cpp example
149
121
  {
@@ -201,49 +173,43 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
201
173
  drwav_uninit(&wav);
202
174
 
203
175
  // convert to mono, float
204
- object.pcmf32.resize(n);
176
+ pcmf32.resize(n);
205
177
  if (wav.channels == 1) {
206
178
  for (uint64_t i = 0; i < n; i++) {
207
- object.pcmf32[i] = float(pcm16[i])/32768.0f;
179
+ pcmf32[i] = float(pcm16[i])/32768.0f;
208
180
  }
209
181
  } else {
210
182
  for (uint64_t i = 0; i < n; i++) {
211
- object.pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
183
+ pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
212
184
  }
213
185
  }
214
186
 
215
187
  if (rwp->diarize) {
216
188
  // convert to stereo, float
217
- object.pcmf32s.resize(2);
189
+ pcmf32s.resize(2);
218
190
 
219
- object.pcmf32s[0].resize(n);
220
- object.pcmf32s[1].resize(n);
191
+ pcmf32s[0].resize(n);
192
+ pcmf32s[1].resize(n);
221
193
  for (uint64_t i = 0; i < n; i++) {
222
- object.pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
223
- object.pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
194
+ pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
195
+ pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
224
196
  }
225
197
  }
226
198
  }
227
199
  {
228
200
  static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
229
201
 
230
- rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
202
+ rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
231
203
  bool is_aborted = *(bool*)user_data;
232
204
  return !is_aborted;
233
205
  };
234
206
  rwp->params.encoder_begin_callback_user_data = &is_aborted;
235
207
  }
236
208
 
237
- object.rw = rw;
238
- object.rwp = rwp;
239
-
240
- int r = (int)(VALUE)rb_thread_call_without_gvl((void *(*)(void *))call_whisper_full_parallel, &object, stop_whisper_unblock, &object);
241
- //if (whisper_full_parallel(rw->context, rwp->params, object.pcmf32.data(), pcmf32.size(), 1) != 0) {
242
- if (r != 0) {
209
+ if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
243
210
  fprintf(stderr, "failed to process audio\n");
244
211
  return self;
245
212
  }
246
-
247
213
  const int n_segments = whisper_full_n_segments(rw->context);
248
214
  VALUE output = rb_str_new2("");
249
215
  for (int i = 0; i < n_segments; ++i) {