RubyGems - whispercpp - Versions diffs - 1.2.0.1 → 1.2.0.2 - Mend

whispercpp 1.2.0.1 → 1.2.0.2

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fc32803382297d2edfd5a2517cb3734f394fc95f7ba5b9111af8c2206ead53ca
-  data.tar.gz: e6e8afcad840c465faed0cba6485ef86204e12895461a683b6d3ebc73703bb29
+  metadata.gz: 9de22be96d7d59590930b292b4cab94675502604f19051e4f61603896d81a887
+  data.tar.gz: 6261d4023c4e71a29c5884e401bd3557794ed4ed3ae5ae855abc0fb1c4d54d30
 SHA512:
-  metadata.gz: e3a3f74e16dc7fe4e34a64ee63742a9dd12bcdbe9dd3a5ded1b860babe654eeb04aba8a36cd889c1d8231e910739159a799c0e6928f82831a1e7d9b20105cd50
-  data.tar.gz: 9909de651ac6b8c6e1190b90a65388f84e9a8a5dd72a335853eca850286c6f819871e63b05b23f9c080123f6aabcc4a2bad77affc26e5fc4fe1f174fee555040
+  metadata.gz: b36fed0dc2d51177bc7478562deba9f6e6b5c0328624e8670a8f23a358b1bffccbb55c5ffc68f456c79b53098534d090c7721249ecc1e975bccd2128b468e0a6
+  data.tar.gz: '083105df0648a8c7de79554aa13c888a8cd8d325ac69ef78a52f382865e71ea82ef66b564b3022f89389902be1f5fcffa39fd2d4c6f5142f2ca6968446208bea'

data/Rakefile CHANGED Viewed

@@ -1,7 +1,21 @@
 require 'erb'
+require 'open3'
+require 'rake/clean'
+require 'rake/testtask'
 require 'rubygems/package'
-BUILD_VERSION=1
+CLEAN.include '**/*.o'
+CLEAN.include "**/*.#{(defined?(RbConfig) ? RbConfig : Config)::MAKEFILE_CONFIG['DLEXT']}"
+CLOBBER.include 'doc'
+CLOBBER.include '**/*.log'
+CLOBBER.include '**/Makefile'
+CLOBBER.include '**/extconf.h'
+CLOBBER.include '**/extconf.h'
+CLOBBER.include '**/whisper.*'
+CLOBBER.include '**/ggml.*'
+CLOBBER.include '**/dr_wav.h'
+BUILD_VERSION=2
 # Determine the current version of the software
 if File.read('../../CMakeLists.txt') =~ /project.*\s*VERSION\s*(\d.+)\)/
   CURRENT_VERSION = "#{$1}.#{BUILD_VERSION}"
@@ -9,6 +23,66 @@ else
   CURRENT_VERSION = "0.0.0.#{BUILD_VERSION}"
 end
+def shell(args, opts = {})
+  puts "> #{args.join(' ')}"
+  cmd, live_stream, cwd = args, opts[:live_stdout], opts[:cwd]
+  Dir.chdir(cwd) {
+    wait_thr = nil
+    Open3.popen3(*cmd) do |stdin, stdout, stderr, thr|
+      stdin.close
+      wait_thr = thr # Ruby 1.8 will not yield thr, this will be nil
+      while line = stdout.gets do
+        live_stream.puts(line) if live_stream
+      end
+      while line = stderr.gets do
+        puts line
+      end
+    end
+    # prefer the process handle yielded directly by popen3, but if it is not
+    # available, fall back to the global $?.
+    p_status = wait_thr ? wait_thr.value : $?
+    exit_code = p_status.exitstatus
+    error = (exit_code != 0)
+  }
+end
+make_program = (/mswin/ =~ RUBY_PLATFORM) ? 'nmake' : 'make'
+MAKECMD = ENV['MAKE_CMD'] || make_program
+MAKEOPTS = ENV['MAKE_OPTS'] || ''
+WHISPER_SO = "ext/whisper.#{(defined?(RbConfig) ? RbConfig : Config)::MAKEFILE_CONFIG['DLEXT']}"
+file 'ext/Makefile' => 'ext/extconf.rb' do
+  shell(['ruby', 'extconf.rb', ENV['EXTCONF_OPTS'].to_s],
+        { live_stdout: STDOUT, cwd: "#{Dir.pwd}/ext" }
+       )
+end
+def make(target = '')
+  shell(["#{MAKECMD}", "#{MAKEOPTS}", "#{target}"].reject(&:empty?),
+        { live_stdout: STDOUT, cwd: "#{Dir.pwd}/ext" }
+       )
+end
+# Let make handle dependencies between c/o/so - we'll just run it.
+file WHISPER_SO => (['ext/Makefile'] + Dir['ext/*.cpp'] + Dir['ext/*.c'] + Dir['ext/*.h']) do
+  make
+end
+desc "Compile the shared object"
+task :compile => [WHISPER_SO]
+desc "Default Task (Test project)"
+task :default => :test
+Rake::TestTask.new(:test) do |t|
+  t.test_files = FileList['tests/test_*.rb']
+  t.verbose = false
+end
 desc 'Generate gem specification'
 task :gemspec do
   system("cp ../../LICENSE .")

data/ext/ruby_whisper.cpp CHANGED Viewed

@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <ruby/thread.h>
 #include "ruby_whisper.h"
 #define DR_WAV_IMPLEMENTATION
 #include "dr_wav.h"
@@ -94,6 +95,32 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
   return self;
 }
+struct WhisperFullParallelParams {
+  ruby_whisper *rw;
+  ruby_whisper_params *rwp;
+  std::vector<float> pcmf32; // mono-channel F32 PCM
+  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+};
+static void stop_whisper_unblock(void *args) {
+  struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
+  fprintf(stderr, "Set running to abort\n");
+  whisper_running_abort(object->rw->context);
+}
+static VALUE call_whisper_full_parallel(void *args) {
+  struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
+  whisper_running_restore(object->rw->context);
+  if (whisper_full_parallel(object->rw->context, object->rwp->params, object->pcmf32.data(), object->pcmf32.size(), 1) != 0) {
+    fprintf(stderr, "failed to process audio\n");
+    return INT2FIX(-1);
+  }
+  return INT2FIX(0);
+}
 /*
  * transcribe a single file
  * can emit to a block results
@@ -114,8 +141,9 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
   std::string fname_inp = StringValueCStr(wave_file_path);
-  std::vector<float> pcmf32; // mono-channel F32 PCM
-  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+  //std::vector<float> pcmf32; // mono-channel F32 PCM
+  //std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+  struct WhisperFullParallelParams object;
   // WAV input - this is directly from main.cpp example
   {
@@ -173,26 +201,26 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     drwav_uninit(&wav);
     // convert to mono, float
-    pcmf32.resize(n);
+    object.pcmf32.resize(n);
     if (wav.channels == 1) {
       for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[i])/32768.0f;
+        object.pcmf32[i] = float(pcm16[i])/32768.0f;
       }
     } else {
       for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+        object.pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
       }
     }
     if (rwp->diarize) {
       // convert to stereo, float
-      pcmf32s.resize(2);
+      object.pcmf32s.resize(2);
-      pcmf32s[0].resize(n);
-      pcmf32s[1].resize(n);
+      object.pcmf32s[0].resize(n);
+      object.pcmf32s[1].resize(n);
       for (uint64_t i = 0; i < n; i++) {
-        pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-        pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+        object.pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+        object.pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
       }
     }
   }
@@ -206,10 +234,16 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     rwp->params.encoder_begin_callback_user_data = &is_aborted;
   }
-  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
+  object.rw = rw;
+  object.rwp = rwp;
+  int r = (int)(VALUE)rb_thread_call_without_gvl((void *(*)(void *))call_whisper_full_parallel, &object, stop_whisper_unblock, &object);
+  //if (whisper_full_parallel(rw->context, rwp->params, object.pcmf32.data(), pcmf32.size(), 1) != 0) {
+  if (r != 0) {
     fprintf(stderr, "failed to process audio\n");
     return self;
   }
   const int n_segments = whisper_full_n_segments(rw->context);
   VALUE output = rb_str_new2("");
   for (int i = 0; i < n_segments; ++i) {

data/ext/whisper.cpp CHANGED Viewed

@@ -592,16 +592,19 @@ struct whisper_context {
     mutable std::mt19937 rng; // used for sampling at t > 0.0
-    int lang_id;
+    int lang_id = 0; // english by default
     // [EXPERIMENTAL] token-level timestamps data
-    int64_t t_beg;
-    int64_t t_last;
+    int64_t t_beg = 0;
+    int64_t t_last = 0;
     whisper_token tid_last;
     std::vector<float> energy; // PCM signal energy
     // [EXPERIMENTAL] speed-up techniques
-    int32_t exp_n_audio_ctx; // 0 - use default
+    int32_t exp_n_audio_ctx = 0; // 0 - use default
+    // [EXPERIMENTAL] abort handling
+    bool running = true;
     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(WHISPER_USE_SCRATCH)
@@ -805,7 +808,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                      MEM_REQ_SCRATCH3.at (model.type) +
                 scale*MEM_REQ_MODEL.at   (model.type) +
                 scale*MEM_REQ_KV_CROSS.at(model.type) +
-                scale*std::max(MEM_REQ_ENCODE.at(model.type),       MEM_REQ_DECODE.at(model.type));
+                scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
             // this is the memory required by one decoder
             const size_t mem_required_decoder =
@@ -2936,7 +2939,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.language         =*/ "en",
         /*.suppress_blank   =*/ true,
-        /*.suppress_non_speech_tokens =*/true,
+        /*.suppress_non_speech_tokens =*/ false,
         /*.temperature      =*/  0.0f,
         /*.max_initial_ts   =*/  1.0f,
@@ -2962,6 +2965,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.encoder_begin_callback           =*/ nullptr,
         /*.encoder_begin_callback_user_data =*/ nullptr,
+        /*.logits_filter_callback           =*/ nullptr,
+        /*.logits_filter_callback_user_data =*/ nullptr,
     };
     switch (strategy) {
@@ -3078,8 +3084,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool
     return res;
 }
-static const std::vector<std::string> non_speech_tokens
-{
+static const std::vector<std::string> non_speech_tokens = {
     "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
     "_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
     "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
@@ -3090,7 +3095,7 @@ static const std::vector<std::string> non_speech_tokens
 // - applies logit filters
 // - computes logprobs and probs
 static void whisper_process_logits(
-        const struct whisper_context & ctx,
+              struct whisper_context & ctx,
     const struct whisper_full_params   params,
               struct whisper_decoder & decoder,
                                float   temperature) {
@@ -3146,29 +3151,27 @@ static void whisper_process_logits(
         logits[vocab.token_translate]  = -INFINITY;
         logits[vocab.token_transcribe] = -INFINITY;
+        if (params.logits_filter_callback) {
+            params.logits_filter_callback(&ctx, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
+        }
         // suppress non-speech tokens
         // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
-        if (params.suppress_non_speech_tokens)
-        {
-            for (const std::string &token : non_speech_tokens)
-            {
-                std::string suppress_tokens[] = {token, " " + token};
-                for (const std::string &suppress_token : suppress_tokens)
-                {
-                    if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end())
-                    {
+        if (params.suppress_non_speech_tokens) {
+            for (const std::string & token : non_speech_tokens) {
+                const std::string suppress_tokens[] = {token, " " + token};
+                for (const std::string & suppress_token : suppress_tokens) {
+                    if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end()) {
                         logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
                     }
                 }
             }
             // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
-            if (vocab.token_to_id.find(" -") != vocab.token_to_id.end())
-            {
+            if (vocab.token_to_id.find(" -") != vocab.token_to_id.end()) {
                 logits[vocab.token_to_id.at(" -")] = -INFINITY;
             }
-            if (vocab.token_to_id.find(" '") != vocab.token_to_id.end())
-            {
+            if (vocab.token_to_id.find(" '") != vocab.token_to_id.end()) {
                 logits[vocab.token_to_id.at(" '")] = -INFINITY;
             }
         }
@@ -3571,7 +3574,7 @@ int whisper_full(
     n_decoders = std::max(1, n_decoders);
     // TAGS: WHISPER_DECODER_INIT
-    for (int j = 1; j < n_decoders; j++) {
+    for (int j = 1; j < n_decoders && ctx->running; j++) {
         auto & decoder = ctx->decoders[j];
         if (decoder.kv_self.ctx == nullptr) {
@@ -3654,7 +3657,7 @@ int whisper_full(
     std::vector<beam_candidate> beam_candidates;
     // main loop
-    while (true) {
+    while (ctx->running) {
         const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
         while (progress_cur >= progress_prev + progress_step) {
             progress_prev += progress_step;
@@ -3854,7 +3857,7 @@ int whisper_full(
                         return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
                     });
-                    int cur_c = 0;
+                    uint32_t cur_c = 0;
                     for (int j = 0; j < n_decoders_cur; ++j) {
                         auto & decoder = ctx->decoders[j];
@@ -4204,6 +4207,18 @@ int whisper_full(
     return 0;
 }
+void whisper_running_abort(struct whisper_context * ctx) {
+    ctx->running = false;
+}
+void whisper_running_restore(struct whisper_context * ctx) {
+    ctx->running = true;
+}
+bool whisper_running_state(struct whisper_context * ctx) {
+    return ctx->running;
+}
 int whisper_full_parallel(
         struct whisper_context * ctx,
         struct whisper_full_params params,
@@ -4339,7 +4354,7 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
 }
 int whisper_full_lang_id(struct whisper_context * ctx) {
-    return ctx->lang_id;
+    return ctx->lang_id;
 }
 int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {

data/ext/whisper.h CHANGED Viewed

@@ -225,6 +225,15 @@ extern "C" {
     // Print system information
     WHISPER_API const char * whisper_print_system_info(void);
+    // Abort a running whisper_full_parallel or whisper_full
+    WHISPER_API void whisper_running_abort(struct whisper_context * ctx);
+    // Resume a whisper context from an aborted state, allowing it to run again
+    WHISPER_API void whisper_running_restore(struct whisper_context * ctx);
+    // Check the whisper context state: if true, it can run; if false, it cannot
+    WHISPER_API bool whisper_running_state(struct whisper_context * ctx);
     ////////////////////////////////////////////////////////////////////////////
     // Available sampling strategies
@@ -243,6 +252,16 @@ extern "C" {
     // If it returns false, the computation is aborted
     typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
+    // Logits filter callback
+    // Can be used to modify the logits before sampling
+    // If not NULL, called after applying temperature to logits
+    typedef void (*whisper_logits_filter_callback)(
+            struct whisper_context * ctx,
+          const whisper_token_data * tokens,
+                               int   n_tokens,
+                             float * logits,
+                              void * user_data);
     // Parameters for the whisper_full() function
     // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
     // whisper_full_default_params()
@@ -315,6 +334,10 @@ extern "C" {
         // called each time before the encoder starts
         whisper_encoder_begin_callback encoder_begin_callback;
         void * encoder_begin_callback_user_data;
+        // called by each decoder to filter obtained logits
+        whisper_logits_filter_callback logits_filter_callback;
+        void * logits_filter_callback_user_data;
     };
     WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: whispercpp
 version: !ruby/object:Gem::Version
-  version: 1.2.0.1
+  version: 1.2.0.2
 platform: ruby
 authors:
 - Georgi Gerganov
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-02-25 00:00:00.000000000 Z
+date: 2023-02-27 00:00:00.000000000 Z
 dependencies: []
 description: High-performance inference of OpenAI's Whisper automatic speech recognition
   (ASR) model via Ruby