whispercpp 1.2.0.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (9):
  1. checksums.yaml +4 -4
  2. data/Rakefile +3 -92
  3. data/ext/extconf.rb +9 -0
  4. data/ext/ggml.c +18380 -5241
  5. data/ext/ggml.h +2156 -502
  6. data/ext/ruby_whisper.cpp +13 -47
  7. data/ext/whisper.cpp +4182 -1787
  8. data/ext/whisper.h +334 -65
  9. metadata +3 -3
data/ext/ruby_whisper.cpp CHANGED
@@ -1,5 +1,4 @@
1
1
  #include <ruby.h>
2
- #include <ruby/thread.h>
3
2
  #include "ruby_whisper.h"
4
3
  #define DR_WAV_IMPLEMENTATION
5
4
  #include "dr_wav.h"
@@ -88,39 +87,13 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
88
87
  if (!rb_respond_to(whisper_model_file_path, rb_intern("to_s"))) {
89
88
  rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
90
89
  }
91
- rw->context = whisper_init_from_file(StringValueCStr(whisper_model_file_path));
90
+ rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params());
92
91
  if (rw->context == nullptr) {
93
92
  rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
94
93
  }
95
94
  return self;
96
95
  }
97
96
 
98
- struct WhisperFullParallelParams {
99
- ruby_whisper *rw;
100
- ruby_whisper_params *rwp;
101
- std::vector<float> pcmf32; // mono-channel F32 PCM
102
- std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
103
- };
104
-
105
-
106
- static void stop_whisper_unblock(void *args) {
107
- struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
108
- fprintf(stderr, "Set running to abort\n");
109
- whisper_running_abort(object->rw->context);
110
- }
111
-
112
- static VALUE call_whisper_full_parallel(void *args) {
113
- struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
114
-
115
- whisper_running_restore(object->rw->context);
116
-
117
- if (whisper_full_parallel(object->rw->context, object->rwp->params, object->pcmf32.data(), object->pcmf32.size(), 1) != 0) {
118
- fprintf(stderr, "failed to process audio\n");
119
- return INT2FIX(-1);
120
- }
121
- return INT2FIX(0);
122
- }
123
-
124
97
  /*
125
98
  * transcribe a single file
126
99
  * can emit to a block results
@@ -141,9 +114,8 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
141
114
 
142
115
  std::string fname_inp = StringValueCStr(wave_file_path);
143
116
 
144
- //std::vector<float> pcmf32; // mono-channel F32 PCM
145
- //std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
146
- struct WhisperFullParallelParams object;
117
+ std::vector<float> pcmf32; // mono-channel F32 PCM
118
+ std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
147
119
 
148
120
  // WAV input - this is directly from main.cpp example
149
121
  {
@@ -201,49 +173,43 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
201
173
  drwav_uninit(&wav);
202
174
 
203
175
  // convert to mono, float
204
- object.pcmf32.resize(n);
176
+ pcmf32.resize(n);
205
177
  if (wav.channels == 1) {
206
178
  for (uint64_t i = 0; i < n; i++) {
207
- object.pcmf32[i] = float(pcm16[i])/32768.0f;
179
+ pcmf32[i] = float(pcm16[i])/32768.0f;
208
180
  }
209
181
  } else {
210
182
  for (uint64_t i = 0; i < n; i++) {
211
- object.pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
183
+ pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
212
184
  }
213
185
  }
214
186
 
215
187
  if (rwp->diarize) {
216
188
  // convert to stereo, float
217
- object.pcmf32s.resize(2);
189
+ pcmf32s.resize(2);
218
190
 
219
- object.pcmf32s[0].resize(n);
220
- object.pcmf32s[1].resize(n);
191
+ pcmf32s[0].resize(n);
192
+ pcmf32s[1].resize(n);
221
193
  for (uint64_t i = 0; i < n; i++) {
222
- object.pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
223
- object.pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
194
+ pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
195
+ pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
224
196
  }
225
197
  }
226
198
  }
227
199
  {
228
200
  static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
229
201
 
230
- rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
202
+ rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
231
203
  bool is_aborted = *(bool*)user_data;
232
204
  return !is_aborted;
233
205
  };
234
206
  rwp->params.encoder_begin_callback_user_data = &is_aborted;
235
207
  }
236
208
 
237
- object.rw = rw;
238
- object.rwp = rwp;
239
-
240
- int r = (int)(VALUE)rb_thread_call_without_gvl((void *(*)(void *))call_whisper_full_parallel, &object, stop_whisper_unblock, &object);
241
- //if (whisper_full_parallel(rw->context, rwp->params, object.pcmf32.data(), pcmf32.size(), 1) != 0) {
242
- if (r != 0) {
209
+ if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
243
210
  fprintf(stderr, "failed to process audio\n");
244
211
  return self;
245
212
  }
246
-
247
213
  const int n_segments = whisper_full_n_segments(rw->context);
248
214
  VALUE output = rb_str_new2("");
249
215
  for (int i = 0; i < n_segments; ++i) {