whispercpp 1.2.0.1 → 1.2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fc32803382297d2edfd5a2517cb3734f394fc95f7ba5b9111af8c2206ead53ca
4
- data.tar.gz: e6e8afcad840c465faed0cba6485ef86204e12895461a683b6d3ebc73703bb29
3
+ metadata.gz: 9de22be96d7d59590930b292b4cab94675502604f19051e4f61603896d81a887
4
+ data.tar.gz: 6261d4023c4e71a29c5884e401bd3557794ed4ed3ae5ae855abc0fb1c4d54d30
5
5
  SHA512:
6
- metadata.gz: e3a3f74e16dc7fe4e34a64ee63742a9dd12bcdbe9dd3a5ded1b860babe654eeb04aba8a36cd889c1d8231e910739159a799c0e6928f82831a1e7d9b20105cd50
7
- data.tar.gz: 9909de651ac6b8c6e1190b90a65388f84e9a8a5dd72a335853eca850286c6f819871e63b05b23f9c080123f6aabcc4a2bad77affc26e5fc4fe1f174fee555040
6
+ metadata.gz: b36fed0dc2d51177bc7478562deba9f6e6b5c0328624e8670a8f23a358b1bffccbb55c5ffc68f456c79b53098534d090c7721249ecc1e975bccd2128b468e0a6
7
+ data.tar.gz: '083105df0648a8c7de79554aa13c888a8cd8d325ac69ef78a52f382865e71ea82ef66b564b3022f89389902be1f5fcffa39fd2d4c6f5142f2ca6968446208bea'
data/Rakefile CHANGED
@@ -1,7 +1,21 @@
1
1
  require 'erb'
2
+ require 'open3'
3
+ require 'rake/clean'
4
+ require 'rake/testtask'
2
5
  require 'rubygems/package'
3
6
 
4
- BUILD_VERSION=1
7
+ CLEAN.include '**/*.o'
8
+ CLEAN.include "**/*.#{(defined?(RbConfig) ? RbConfig : Config)::MAKEFILE_CONFIG['DLEXT']}"
9
+ CLOBBER.include 'doc'
10
+ CLOBBER.include '**/*.log'
11
+ CLOBBER.include '**/Makefile'
12
+ CLOBBER.include '**/extconf.h'
13
+ CLOBBER.include '**/extconf.h'
14
+ CLOBBER.include '**/whisper.*'
15
+ CLOBBER.include '**/ggml.*'
16
+ CLOBBER.include '**/dr_wav.h'
17
+
18
+ BUILD_VERSION=2
5
19
  # Determine the current version of the software
6
20
  if File.read('../../CMakeLists.txt') =~ /project.*\s*VERSION\s*(\d.+)\)/
7
21
  CURRENT_VERSION = "#{$1}.#{BUILD_VERSION}"
@@ -9,6 +23,66 @@ else
9
23
  CURRENT_VERSION = "0.0.0.#{BUILD_VERSION}"
10
24
  end
11
25
 
26
+ def shell(args, opts = {})
27
+ puts "> #{args.join(' ')}"
28
+ cmd, live_stream, cwd = args, opts[:live_stdout], opts[:cwd]
29
+ Dir.chdir(cwd) {
30
+ wait_thr = nil
31
+
32
+ Open3.popen3(*cmd) do |stdin, stdout, stderr, thr|
33
+ stdin.close
34
+ wait_thr = thr # Ruby 1.8 will not yield thr, this will be nil
35
+
36
+ while line = stdout.gets do
37
+ live_stream.puts(line) if live_stream
38
+ end
39
+
40
+ while line = stderr.gets do
41
+ puts line
42
+ end
43
+ end
44
+
45
+ # prefer process handle directly from popen3, but if not available
46
+ # fall back to the global $?.
47
+ p_status = wait_thr ? wait_thr.value : $?
48
+ exit_code = p_status.exitstatus
49
+ error = (exit_code != 0)
50
+ }
51
+ end
52
+
53
+ make_program = (/mswin/ =~ RUBY_PLATFORM) ? 'nmake' : 'make'
54
+ MAKECMD = ENV['MAKE_CMD'] || make_program
55
+ MAKEOPTS = ENV['MAKE_OPTS'] || ''
56
+ WHISPER_SO = "ext/whisper.#{(defined?(RbConfig) ? RbConfig : Config)::MAKEFILE_CONFIG['DLEXT']}"
57
+
58
+ file 'ext/Makefile' => 'ext/extconf.rb' do
59
+ shell(['ruby', 'extconf.rb', ENV['EXTCONF_OPTS'].to_s],
60
+ { live_stdout: STDOUT, cwd: "#{Dir.pwd}/ext" }
61
+ )
62
+ end
63
+
64
+ def make(target = '')
65
+ shell(["#{MAKECMD}", "#{MAKEOPTS}", "#{target}"].reject(&:empty?),
66
+ { live_stdout: STDOUT, cwd: "#{Dir.pwd}/ext" }
67
+ )
68
+ end
69
+
70
+ # Let make handle dependencies between c/o/so - we'll just run it.
71
+ file WHISPER_SO => (['ext/Makefile'] + Dir['ext/*.cpp'] + Dir['ext/*.c'] + Dir['ext/*.h']) do
72
+ make
73
+ end
74
+
75
+ desc "Compile the shared object"
76
+ task :compile => [WHISPER_SO]
77
+
78
+ desc "Default Task (Test project)"
79
+ task :default => :test
80
+
81
+ Rake::TestTask.new(:test) do |t|
82
+ t.test_files = FileList['tests/test_*.rb']
83
+ t.verbose = false
84
+ end
85
+
12
86
  desc 'Generate gem specification'
13
87
  task :gemspec do
14
88
  system("cp ../../LICENSE .")
data/ext/ruby_whisper.cpp CHANGED
@@ -1,4 +1,5 @@
1
1
  #include <ruby.h>
2
+ #include <ruby/thread.h>
2
3
  #include "ruby_whisper.h"
3
4
  #define DR_WAV_IMPLEMENTATION
4
5
  #include "dr_wav.h"
@@ -94,6 +95,32 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
94
95
  return self;
95
96
  }
96
97
 
98
+ struct WhisperFullParallelParams {
99
+ ruby_whisper *rw;
100
+ ruby_whisper_params *rwp;
101
+ std::vector<float> pcmf32; // mono-channel F32 PCM
102
+ std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
103
+ };
104
+
105
+
106
+ static void stop_whisper_unblock(void *args) {
107
+ struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
108
+ fprintf(stderr, "Set running to abort\n");
109
+ whisper_running_abort(object->rw->context);
110
+ }
111
+
112
+ static VALUE call_whisper_full_parallel(void *args) {
113
+ struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
114
+
115
+ whisper_running_restore(object->rw->context);
116
+
117
+ if (whisper_full_parallel(object->rw->context, object->rwp->params, object->pcmf32.data(), object->pcmf32.size(), 1) != 0) {
118
+ fprintf(stderr, "failed to process audio\n");
119
+ return INT2FIX(-1);
120
+ }
121
+ return INT2FIX(0);
122
+ }
123
+
97
124
  /*
98
125
  * transcribe a single file
99
126
  * can emit to a block results
@@ -114,8 +141,9 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
114
141
 
115
142
  std::string fname_inp = StringValueCStr(wave_file_path);
116
143
 
117
- std::vector<float> pcmf32; // mono-channel F32 PCM
118
- std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
144
+ //std::vector<float> pcmf32; // mono-channel F32 PCM
145
+ //std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
146
+ struct WhisperFullParallelParams object;
119
147
 
120
148
  // WAV input - this is directly from main.cpp example
121
149
  {
@@ -173,26 +201,26 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
173
201
  drwav_uninit(&wav);
174
202
 
175
203
  // convert to mono, float
176
- pcmf32.resize(n);
204
+ object.pcmf32.resize(n);
177
205
  if (wav.channels == 1) {
178
206
  for (uint64_t i = 0; i < n; i++) {
179
- pcmf32[i] = float(pcm16[i])/32768.0f;
207
+ object.pcmf32[i] = float(pcm16[i])/32768.0f;
180
208
  }
181
209
  } else {
182
210
  for (uint64_t i = 0; i < n; i++) {
183
- pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
211
+ object.pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
184
212
  }
185
213
  }
186
214
 
187
215
  if (rwp->diarize) {
188
216
  // convert to stereo, float
189
- pcmf32s.resize(2);
217
+ object.pcmf32s.resize(2);
190
218
 
191
- pcmf32s[0].resize(n);
192
- pcmf32s[1].resize(n);
219
+ object.pcmf32s[0].resize(n);
220
+ object.pcmf32s[1].resize(n);
193
221
  for (uint64_t i = 0; i < n; i++) {
194
- pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
195
- pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
222
+ object.pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
223
+ object.pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
196
224
  }
197
225
  }
198
226
  }
@@ -206,10 +234,16 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
206
234
  rwp->params.encoder_begin_callback_user_data = &is_aborted;
207
235
  }
208
236
 
209
- if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
237
+ object.rw = rw;
238
+ object.rwp = rwp;
239
+
240
+ int r = (int)(VALUE)rb_thread_call_without_gvl((void *(*)(void *))call_whisper_full_parallel, &object, stop_whisper_unblock, &object);
241
+ //if (whisper_full_parallel(rw->context, rwp->params, object.pcmf32.data(), pcmf32.size(), 1) != 0) {
242
+ if (r != 0) {
210
243
  fprintf(stderr, "failed to process audio\n");
211
244
  return self;
212
245
  }
246
+
213
247
  const int n_segments = whisper_full_n_segments(rw->context);
214
248
  VALUE output = rb_str_new2("");
215
249
  for (int i = 0; i < n_segments; ++i) {
data/ext/whisper.cpp CHANGED
@@ -592,16 +592,19 @@ struct whisper_context {
592
592
 
593
593
  mutable std::mt19937 rng; // used for sampling at t > 0.0
594
594
 
595
- int lang_id;
595
+ int lang_id = 0; // english by default
596
596
 
597
597
  // [EXPERIMENTAL] token-level timestamps data
598
- int64_t t_beg;
599
- int64_t t_last;
598
+ int64_t t_beg = 0;
599
+ int64_t t_last = 0;
600
600
  whisper_token tid_last;
601
601
  std::vector<float> energy; // PCM signal energy
602
602
 
603
603
  // [EXPERIMENTAL] speed-up techniques
604
- int32_t exp_n_audio_ctx; // 0 - use default
604
+ int32_t exp_n_audio_ctx = 0; // 0 - use default
605
+
606
+ // [EXPERIMENTAL] abort handling
607
+ bool running = true;
605
608
 
606
609
  void use_buf(struct ggml_context * ctx, int i) {
607
610
  #if defined(WHISPER_USE_SCRATCH)
@@ -805,7 +808,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
805
808
  MEM_REQ_SCRATCH3.at (model.type) +
806
809
  scale*MEM_REQ_MODEL.at (model.type) +
807
810
  scale*MEM_REQ_KV_CROSS.at(model.type) +
808
- scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
811
+ scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
809
812
 
810
813
  // this is the memory required by one decoder
811
814
  const size_t mem_required_decoder =
@@ -2936,7 +2939,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
2936
2939
  /*.language =*/ "en",
2937
2940
 
2938
2941
  /*.suppress_blank =*/ true,
2939
- /*.suppress_non_speech_tokens =*/true,
2942
+ /*.suppress_non_speech_tokens =*/ false,
2940
2943
 
2941
2944
  /*.temperature =*/ 0.0f,
2942
2945
  /*.max_initial_ts =*/ 1.0f,
@@ -2962,6 +2965,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
2962
2965
 
2963
2966
  /*.encoder_begin_callback =*/ nullptr,
2964
2967
  /*.encoder_begin_callback_user_data =*/ nullptr,
2968
+
2969
+ /*.logits_filter_callback =*/ nullptr,
2970
+ /*.logits_filter_callback_user_data =*/ nullptr,
2965
2971
  };
2966
2972
 
2967
2973
  switch (strategy) {
@@ -3078,8 +3084,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool
3078
3084
  return res;
3079
3085
  }
3080
3086
 
3081
- static const std::vector<std::string> non_speech_tokens
3082
- {
3087
+ static const std::vector<std::string> non_speech_tokens = {
3083
3088
  "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
3084
3089
  "_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
3085
3090
  "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
@@ -3090,7 +3095,7 @@ static const std::vector<std::string> non_speech_tokens
3090
3095
  // - applies logit filters
3091
3096
  // - computes logprobs and probs
3092
3097
  static void whisper_process_logits(
3093
- const struct whisper_context & ctx,
3098
+ struct whisper_context & ctx,
3094
3099
  const struct whisper_full_params params,
3095
3100
  struct whisper_decoder & decoder,
3096
3101
  float temperature) {
@@ -3146,29 +3151,27 @@ static void whisper_process_logits(
3146
3151
  logits[vocab.token_translate] = -INFINITY;
3147
3152
  logits[vocab.token_transcribe] = -INFINITY;
3148
3153
 
3154
+ if (params.logits_filter_callback) {
3155
+ params.logits_filter_callback(&ctx, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
3156
+ }
3149
3157
 
3150
3158
  // suppress non-speech tokens
3151
3159
  // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
3152
- if (params.suppress_non_speech_tokens)
3153
- {
3154
- for (const std::string &token : non_speech_tokens)
3155
- {
3156
- std::string suppress_tokens[] = {token, " " + token};
3157
- for (const std::string &suppress_token : suppress_tokens)
3158
- {
3159
- if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end())
3160
- {
3160
+ if (params.suppress_non_speech_tokens) {
3161
+ for (const std::string & token : non_speech_tokens) {
3162
+ const std::string suppress_tokens[] = {token, " " + token};
3163
+ for (const std::string & suppress_token : suppress_tokens) {
3164
+ if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end()) {
3161
3165
  logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
3162
3166
  }
3163
3167
  }
3164
3168
  }
3169
+
3165
3170
  // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
3166
- if (vocab.token_to_id.find(" -") != vocab.token_to_id.end())
3167
- {
3171
+ if (vocab.token_to_id.find(" -") != vocab.token_to_id.end()) {
3168
3172
  logits[vocab.token_to_id.at(" -")] = -INFINITY;
3169
3173
  }
3170
- if (vocab.token_to_id.find(" '") != vocab.token_to_id.end())
3171
- {
3174
+ if (vocab.token_to_id.find(" '") != vocab.token_to_id.end()) {
3172
3175
  logits[vocab.token_to_id.at(" '")] = -INFINITY;
3173
3176
  }
3174
3177
  }
@@ -3571,7 +3574,7 @@ int whisper_full(
3571
3574
  n_decoders = std::max(1, n_decoders);
3572
3575
 
3573
3576
  // TAGS: WHISPER_DECODER_INIT
3574
- for (int j = 1; j < n_decoders; j++) {
3577
+ for (int j = 1; j < n_decoders && ctx->running; j++) {
3575
3578
  auto & decoder = ctx->decoders[j];
3576
3579
 
3577
3580
  if (decoder.kv_self.ctx == nullptr) {
@@ -3654,7 +3657,7 @@ int whisper_full(
3654
3657
  std::vector<beam_candidate> beam_candidates;
3655
3658
 
3656
3659
  // main loop
3657
- while (true) {
3660
+ while (ctx->running) {
3658
3661
  const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
3659
3662
  while (progress_cur >= progress_prev + progress_step) {
3660
3663
  progress_prev += progress_step;
@@ -3854,7 +3857,7 @@ int whisper_full(
3854
3857
  return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
3855
3858
  });
3856
3859
 
3857
- int cur_c = 0;
3860
+ uint32_t cur_c = 0;
3858
3861
 
3859
3862
  for (int j = 0; j < n_decoders_cur; ++j) {
3860
3863
  auto & decoder = ctx->decoders[j];
@@ -4204,6 +4207,18 @@ int whisper_full(
4204
4207
  return 0;
4205
4208
  }
4206
4209
 
4210
+ void whisper_running_abort(struct whisper_context * ctx) {
4211
+ ctx->running = false;
4212
+ }
4213
+
4214
+ void whisper_running_restore(struct whisper_context * ctx) {
4215
+ ctx->running = true;
4216
+ }
4217
+
4218
+ bool whisper_running_state(struct whisper_context * ctx) {
4219
+ return ctx->running;
4220
+ }
4221
+
4207
4222
  int whisper_full_parallel(
4208
4223
  struct whisper_context * ctx,
4209
4224
  struct whisper_full_params params,
@@ -4339,7 +4354,7 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
4339
4354
  }
4340
4355
 
4341
4356
  int whisper_full_lang_id(struct whisper_context * ctx) {
4342
- return ctx->lang_id;
4357
+ return ctx->lang_id;
4343
4358
  }
4344
4359
 
4345
4360
  int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
data/ext/whisper.h CHANGED
@@ -225,6 +225,15 @@ extern "C" {
225
225
  // Print system information
226
226
  WHISPER_API const char * whisper_print_system_info(void);
227
227
 
228
+ // Abort a running whisper_full_parallel or whisper_full
229
+ WHISPER_API void whisper_running_abort(struct whisper_context * ctx);
230
+
231
+ // Resume whisper context from an aborted state allowing it run again
232
+ WHISPER_API void whisper_running_restore(struct whisper_context * ctx);
233
+
234
+ // Check the whisper context state if true then it can run if false it can not
235
+ WHISPER_API bool whisper_running_state(struct whisper_context * ctx);
236
+
228
237
  ////////////////////////////////////////////////////////////////////////////
229
238
 
230
239
  // Available sampling strategies
@@ -243,6 +252,16 @@ extern "C" {
243
252
  // If it returns false, the computation is aborted
244
253
  typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
245
254
 
255
+ // Logits filter callback
256
+ // Can be used to modify the logits before sampling
257
+ // If not NULL, called after applying temperature to logits
258
+ typedef void (*whisper_logits_filter_callback)(
259
+ struct whisper_context * ctx,
260
+ const whisper_token_data * tokens,
261
+ int n_tokens,
262
+ float * logits,
263
+ void * user_data);
264
+
246
265
  // Parameters for the whisper_full() function
247
266
  // If you chnage the order or add new parameters, make sure to update the default values in whisper.cpp:
248
267
  // whisper_full_default_params()
@@ -315,6 +334,10 @@ extern "C" {
315
334
  // called each time before the encoder starts
316
335
  whisper_encoder_begin_callback encoder_begin_callback;
317
336
  void * encoder_begin_callback_user_data;
337
+
338
+ // called by each decoder to filter obtained logits
339
+ whisper_logits_filter_callback logits_filter_callback;
340
+ void * logits_filter_callback_user_data;
318
341
  };
319
342
 
320
343
  WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whispercpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0.1
4
+ version: 1.2.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Georgi Gerganov
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2023-02-25 00:00:00.000000000 Z
12
+ date: 2023-02-27 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: High-performance inference of OpenAI's Whisper automatic speech recognition
15
15
  (ASR) model via Ruby