whispercpp 1.2.0.1 → 1.2.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fc32803382297d2edfd5a2517cb3734f394fc95f7ba5b9111af8c2206ead53ca
4
- data.tar.gz: e6e8afcad840c465faed0cba6485ef86204e12895461a683b6d3ebc73703bb29
3
+ metadata.gz: 9de22be96d7d59590930b292b4cab94675502604f19051e4f61603896d81a887
4
+ data.tar.gz: 6261d4023c4e71a29c5884e401bd3557794ed4ed3ae5ae855abc0fb1c4d54d30
5
5
  SHA512:
6
- metadata.gz: e3a3f74e16dc7fe4e34a64ee63742a9dd12bcdbe9dd3a5ded1b860babe654eeb04aba8a36cd889c1d8231e910739159a799c0e6928f82831a1e7d9b20105cd50
7
- data.tar.gz: 9909de651ac6b8c6e1190b90a65388f84e9a8a5dd72a335853eca850286c6f819871e63b05b23f9c080123f6aabcc4a2bad77affc26e5fc4fe1f174fee555040
6
+ metadata.gz: b36fed0dc2d51177bc7478562deba9f6e6b5c0328624e8670a8f23a358b1bffccbb55c5ffc68f456c79b53098534d090c7721249ecc1e975bccd2128b468e0a6
7
+ data.tar.gz: '083105df0648a8c7de79554aa13c888a8cd8d325ac69ef78a52f382865e71ea82ef66b564b3022f89389902be1f5fcffa39fd2d4c6f5142f2ca6968446208bea'
data/Rakefile CHANGED
@@ -1,7 +1,21 @@
1
1
  require 'erb'
2
+ require 'open3'
3
+ require 'rake/clean'
4
+ require 'rake/testtask'
2
5
  require 'rubygems/package'
3
6
 
4
- BUILD_VERSION=1
7
+ CLEAN.include '**/*.o'
8
+ CLEAN.include "**/*.#{(defined?(RbConfig) ? RbConfig : Config)::MAKEFILE_CONFIG['DLEXT']}"
9
+ CLOBBER.include 'doc'
10
+ CLOBBER.include '**/*.log'
11
+ CLOBBER.include '**/Makefile'
12
+ CLOBBER.include '**/extconf.h'
13
+ CLOBBER.include '**/extconf.h'
14
+ CLOBBER.include '**/whisper.*'
15
+ CLOBBER.include '**/ggml.*'
16
+ CLOBBER.include '**/dr_wav.h'
17
+
18
+ BUILD_VERSION=2
5
19
  # Determine the current version of the software
6
20
  if File.read('../../CMakeLists.txt') =~ /project.*\s*VERSION\s*(\d.+)\)/
7
21
  CURRENT_VERSION = "#{$1}.#{BUILD_VERSION}"
@@ -9,6 +23,66 @@ else
9
23
  CURRENT_VERSION = "0.0.0.#{BUILD_VERSION}"
10
24
  end
11
25
 
26
+ def shell(args, opts = {})
27
+ puts "> #{args.join(' ')}"
28
+ cmd, live_stream, cwd = args, opts[:live_stdout], opts[:cwd]
29
+ Dir.chdir(cwd) {
30
+ wait_thr = nil
31
+
32
+ Open3.popen3(*cmd) do |stdin, stdout, stderr, thr|
33
+ stdin.close
34
+ wait_thr = thr # Ruby 1.8 will not yield thr, this will be nil
35
+
36
+ while line = stdout.gets do
37
+ live_stream.puts(line) if live_stream
38
+ end
39
+
40
+ while line = stderr.gets do
41
+ puts line
42
+ end
43
+ end
44
+
45
+ # prefer process handle directly from popen3, but if not available
46
+ # fall back to the global $? status.
47
+ p_status = wait_thr ? wait_thr.value : $?
48
+ exit_code = p_status.exitstatus
49
+ error = (exit_code != 0)
50
+ }
51
+ end
52
+
53
+ make_program = (/mswin/ =~ RUBY_PLATFORM) ? 'nmake' : 'make'
54
+ MAKECMD = ENV['MAKE_CMD'] || make_program
55
+ MAKEOPTS = ENV['MAKE_OPTS'] || ''
56
+ WHISPER_SO = "ext/whisper.#{(defined?(RbConfig) ? RbConfig : Config)::MAKEFILE_CONFIG['DLEXT']}"
57
+
58
+ file 'ext/Makefile' => 'ext/extconf.rb' do
59
+ shell(['ruby', 'extconf.rb', ENV['EXTCONF_OPTS'].to_s],
60
+ { live_stdout: STDOUT, cwd: "#{Dir.pwd}/ext" }
61
+ )
62
+ end
63
+
64
+ def make(target = '')
65
+ shell(["#{MAKECMD}", "#{MAKEOPTS}", "#{target}"].reject(&:empty?),
66
+ { live_stdout: STDOUT, cwd: "#{Dir.pwd}/ext" }
67
+ )
68
+ end
69
+
70
+ # Let make handle dependencies between c/o/so - we'll just run it.
71
+ file WHISPER_SO => (['ext/Makefile'] + Dir['ext/*.cpp'] + Dir['ext/*.c'] + Dir['ext/*.h']) do
72
+ make
73
+ end
74
+
75
+ desc "Compile the shared object"
76
+ task :compile => [WHISPER_SO]
77
+
78
+ desc "Default Task (Test project)"
79
+ task :default => :test
80
+
81
+ Rake::TestTask.new(:test) do |t|
82
+ t.test_files = FileList['tests/test_*.rb']
83
+ t.verbose = false
84
+ end
85
+
12
86
  desc 'Generate gem specification'
13
87
  task :gemspec do
14
88
  system("cp ../../LICENSE .")
data/ext/ruby_whisper.cpp CHANGED
@@ -1,4 +1,5 @@
1
1
  #include <ruby.h>
2
+ #include <ruby/thread.h>
2
3
  #include "ruby_whisper.h"
3
4
  #define DR_WAV_IMPLEMENTATION
4
5
  #include "dr_wav.h"
@@ -94,6 +95,32 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
94
95
  return self;
95
96
  }
96
97
 
98
+ struct WhisperFullParallelParams {
99
+ ruby_whisper *rw;
100
+ ruby_whisper_params *rwp;
101
+ std::vector<float> pcmf32; // mono-channel F32 PCM
102
+ std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
103
+ };
104
+
105
+
106
+ static void stop_whisper_unblock(void *args) {
107
+ struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
108
+ fprintf(stderr, "Set running to abort\n");
109
+ whisper_running_abort(object->rw->context);
110
+ }
111
+
112
+ static VALUE call_whisper_full_parallel(void *args) {
113
+ struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
114
+
115
+ whisper_running_restore(object->rw->context);
116
+
117
+ if (whisper_full_parallel(object->rw->context, object->rwp->params, object->pcmf32.data(), object->pcmf32.size(), 1) != 0) {
118
+ fprintf(stderr, "failed to process audio\n");
119
+ return INT2FIX(-1);
120
+ }
121
+ return INT2FIX(0);
122
+ }
123
+
97
124
  /*
98
125
  * transcribe a single file
99
126
  * can emit to a block results
@@ -114,8 +141,9 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
114
141
 
115
142
  std::string fname_inp = StringValueCStr(wave_file_path);
116
143
 
117
- std::vector<float> pcmf32; // mono-channel F32 PCM
118
- std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
144
+ //std::vector<float> pcmf32; // mono-channel F32 PCM
145
+ //std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
146
+ struct WhisperFullParallelParams object;
119
147
 
120
148
  // WAV input - this is directly from main.cpp example
121
149
  {
@@ -173,26 +201,26 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
173
201
  drwav_uninit(&wav);
174
202
 
175
203
  // convert to mono, float
176
- pcmf32.resize(n);
204
+ object.pcmf32.resize(n);
177
205
  if (wav.channels == 1) {
178
206
  for (uint64_t i = 0; i < n; i++) {
179
- pcmf32[i] = float(pcm16[i])/32768.0f;
207
+ object.pcmf32[i] = float(pcm16[i])/32768.0f;
180
208
  }
181
209
  } else {
182
210
  for (uint64_t i = 0; i < n; i++) {
183
- pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
211
+ object.pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
184
212
  }
185
213
  }
186
214
 
187
215
  if (rwp->diarize) {
188
216
  // convert to stereo, float
189
- pcmf32s.resize(2);
217
+ object.pcmf32s.resize(2);
190
218
 
191
- pcmf32s[0].resize(n);
192
- pcmf32s[1].resize(n);
219
+ object.pcmf32s[0].resize(n);
220
+ object.pcmf32s[1].resize(n);
193
221
  for (uint64_t i = 0; i < n; i++) {
194
- pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
195
- pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
222
+ object.pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
223
+ object.pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
196
224
  }
197
225
  }
198
226
  }
@@ -206,10 +234,16 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
206
234
  rwp->params.encoder_begin_callback_user_data = &is_aborted;
207
235
  }
208
236
 
209
- if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
237
+ object.rw = rw;
238
+ object.rwp = rwp;
239
+
240
+ int r = (int)(VALUE)rb_thread_call_without_gvl((void *(*)(void *))call_whisper_full_parallel, &object, stop_whisper_unblock, &object);
241
+ //if (whisper_full_parallel(rw->context, rwp->params, object.pcmf32.data(), pcmf32.size(), 1) != 0) {
242
+ if (r != 0) {
210
243
  fprintf(stderr, "failed to process audio\n");
211
244
  return self;
212
245
  }
246
+
213
247
  const int n_segments = whisper_full_n_segments(rw->context);
214
248
  VALUE output = rb_str_new2("");
215
249
  for (int i = 0; i < n_segments; ++i) {
data/ext/whisper.cpp CHANGED
@@ -592,16 +592,19 @@ struct whisper_context {
592
592
 
593
593
  mutable std::mt19937 rng; // used for sampling at t > 0.0
594
594
 
595
- int lang_id;
595
+ int lang_id = 0; // english by default
596
596
 
597
597
  // [EXPERIMENTAL] token-level timestamps data
598
- int64_t t_beg;
599
- int64_t t_last;
598
+ int64_t t_beg = 0;
599
+ int64_t t_last = 0;
600
600
  whisper_token tid_last;
601
601
  std::vector<float> energy; // PCM signal energy
602
602
 
603
603
  // [EXPERIMENTAL] speed-up techniques
604
- int32_t exp_n_audio_ctx; // 0 - use default
604
+ int32_t exp_n_audio_ctx = 0; // 0 - use default
605
+
606
+ // [EXPERIMENTAL] abort handling
607
+ bool running = true;
605
608
 
606
609
  void use_buf(struct ggml_context * ctx, int i) {
607
610
  #if defined(WHISPER_USE_SCRATCH)
@@ -805,7 +808,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
805
808
  MEM_REQ_SCRATCH3.at (model.type) +
806
809
  scale*MEM_REQ_MODEL.at (model.type) +
807
810
  scale*MEM_REQ_KV_CROSS.at(model.type) +
808
- scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
811
+ scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
809
812
 
810
813
  // this is the memory required by one decoder
811
814
  const size_t mem_required_decoder =
@@ -2936,7 +2939,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
2936
2939
  /*.language =*/ "en",
2937
2940
 
2938
2941
  /*.suppress_blank =*/ true,
2939
- /*.suppress_non_speech_tokens =*/true,
2942
+ /*.suppress_non_speech_tokens =*/ false,
2940
2943
 
2941
2944
  /*.temperature =*/ 0.0f,
2942
2945
  /*.max_initial_ts =*/ 1.0f,
@@ -2962,6 +2965,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
2962
2965
 
2963
2966
  /*.encoder_begin_callback =*/ nullptr,
2964
2967
  /*.encoder_begin_callback_user_data =*/ nullptr,
2968
+
2969
+ /*.logits_filter_callback =*/ nullptr,
2970
+ /*.logits_filter_callback_user_data =*/ nullptr,
2965
2971
  };
2966
2972
 
2967
2973
  switch (strategy) {
@@ -3078,8 +3084,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool
3078
3084
  return res;
3079
3085
  }
3080
3086
 
3081
- static const std::vector<std::string> non_speech_tokens
3082
- {
3087
+ static const std::vector<std::string> non_speech_tokens = {
3083
3088
  "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
3084
3089
  "_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
3085
3090
  "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
@@ -3090,7 +3095,7 @@ static const std::vector<std::string> non_speech_tokens
3090
3095
  // - applies logit filters
3091
3096
  // - computes logprobs and probs
3092
3097
  static void whisper_process_logits(
3093
- const struct whisper_context & ctx,
3098
+ struct whisper_context & ctx,
3094
3099
  const struct whisper_full_params params,
3095
3100
  struct whisper_decoder & decoder,
3096
3101
  float temperature) {
@@ -3146,29 +3151,27 @@ static void whisper_process_logits(
3146
3151
  logits[vocab.token_translate] = -INFINITY;
3147
3152
  logits[vocab.token_transcribe] = -INFINITY;
3148
3153
 
3154
+ if (params.logits_filter_callback) {
3155
+ params.logits_filter_callback(&ctx, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
3156
+ }
3149
3157
 
3150
3158
  // suppress non-speech tokens
3151
3159
  // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
3152
- if (params.suppress_non_speech_tokens)
3153
- {
3154
- for (const std::string &token : non_speech_tokens)
3155
- {
3156
- std::string suppress_tokens[] = {token, " " + token};
3157
- for (const std::string &suppress_token : suppress_tokens)
3158
- {
3159
- if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end())
3160
- {
3160
+ if (params.suppress_non_speech_tokens) {
3161
+ for (const std::string & token : non_speech_tokens) {
3162
+ const std::string suppress_tokens[] = {token, " " + token};
3163
+ for (const std::string & suppress_token : suppress_tokens) {
3164
+ if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end()) {
3161
3165
  logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
3162
3166
  }
3163
3167
  }
3164
3168
  }
3169
+
3165
3170
  // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
3166
- if (vocab.token_to_id.find(" -") != vocab.token_to_id.end())
3167
- {
3171
+ if (vocab.token_to_id.find(" -") != vocab.token_to_id.end()) {
3168
3172
  logits[vocab.token_to_id.at(" -")] = -INFINITY;
3169
3173
  }
3170
- if (vocab.token_to_id.find(" '") != vocab.token_to_id.end())
3171
- {
3174
+ if (vocab.token_to_id.find(" '") != vocab.token_to_id.end()) {
3172
3175
  logits[vocab.token_to_id.at(" '")] = -INFINITY;
3173
3176
  }
3174
3177
  }
@@ -3571,7 +3574,7 @@ int whisper_full(
3571
3574
  n_decoders = std::max(1, n_decoders);
3572
3575
 
3573
3576
  // TAGS: WHISPER_DECODER_INIT
3574
- for (int j = 1; j < n_decoders; j++) {
3577
+ for (int j = 1; j < n_decoders && ctx->running; j++) {
3575
3578
  auto & decoder = ctx->decoders[j];
3576
3579
 
3577
3580
  if (decoder.kv_self.ctx == nullptr) {
@@ -3654,7 +3657,7 @@ int whisper_full(
3654
3657
  std::vector<beam_candidate> beam_candidates;
3655
3658
 
3656
3659
  // main loop
3657
- while (true) {
3660
+ while (ctx->running) {
3658
3661
  const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
3659
3662
  while (progress_cur >= progress_prev + progress_step) {
3660
3663
  progress_prev += progress_step;
@@ -3854,7 +3857,7 @@ int whisper_full(
3854
3857
  return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
3855
3858
  });
3856
3859
 
3857
- int cur_c = 0;
3860
+ uint32_t cur_c = 0;
3858
3861
 
3859
3862
  for (int j = 0; j < n_decoders_cur; ++j) {
3860
3863
  auto & decoder = ctx->decoders[j];
@@ -4204,6 +4207,18 @@ int whisper_full(
4204
4207
  return 0;
4205
4208
  }
4206
4209
 
4210
+ void whisper_running_abort(struct whisper_context * ctx) {
4211
+ ctx->running = false;
4212
+ }
4213
+
4214
+ void whisper_running_restore(struct whisper_context * ctx) {
4215
+ ctx->running = true;
4216
+ }
4217
+
4218
+ bool whisper_running_state(struct whisper_context * ctx) {
4219
+ return ctx->running;
4220
+ }
4221
+
4207
4222
  int whisper_full_parallel(
4208
4223
  struct whisper_context * ctx,
4209
4224
  struct whisper_full_params params,
@@ -4339,7 +4354,7 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
4339
4354
  }
4340
4355
 
4341
4356
  int whisper_full_lang_id(struct whisper_context * ctx) {
4342
- return ctx->lang_id;
4357
+ return ctx->lang_id;
4343
4358
  }
4344
4359
 
4345
4360
  int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
data/ext/whisper.h CHANGED
@@ -225,6 +225,15 @@ extern "C" {
225
225
  // Print system information
226
226
  WHISPER_API const char * whisper_print_system_info(void);
227
227
 
228
+ // Abort a running whisper_full_parallel or whisper_full
229
+ WHISPER_API void whisper_running_abort(struct whisper_context * ctx);
230
+
231
+ // Resume whisper context from an aborted state allowing it run again
232
+ WHISPER_API void whisper_running_restore(struct whisper_context * ctx);
233
+
234
+ // Check the whisper context state: if true, it can run; if false, it cannot
235
+ WHISPER_API bool whisper_running_state(struct whisper_context * ctx);
236
+
228
237
  ////////////////////////////////////////////////////////////////////////////
229
238
 
230
239
  // Available sampling strategies
@@ -243,6 +252,16 @@ extern "C" {
243
252
  // If it returns false, the computation is aborted
244
253
  typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
245
254
 
255
+ // Logits filter callback
256
+ // Can be used to modify the logits before sampling
257
+ // If not NULL, called after applying temperature to logits
258
+ typedef void (*whisper_logits_filter_callback)(
259
+ struct whisper_context * ctx,
260
+ const whisper_token_data * tokens,
261
+ int n_tokens,
262
+ float * logits,
263
+ void * user_data);
264
+
246
265
  // Parameters for the whisper_full() function
247
266
  // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
248
267
  // whisper_full_default_params()
@@ -315,6 +334,10 @@ extern "C" {
315
334
  // called each time before the encoder starts
316
335
  whisper_encoder_begin_callback encoder_begin_callback;
317
336
  void * encoder_begin_callback_user_data;
337
+
338
+ // called by each decoder to filter obtained logits
339
+ whisper_logits_filter_callback logits_filter_callback;
340
+ void * logits_filter_callback_user_data;
318
341
  };
319
342
 
320
343
  WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whispercpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0.1
4
+ version: 1.2.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Georgi Gerganov
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2023-02-25 00:00:00.000000000 Z
12
+ date: 2023-02-27 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: High-performance inference of OpenAI's Whisper automatic speech recognition
15
15
  (ASR) model via Ruby