RubyGems - whispercpp - Versions diffs - 1.2.0.1 → 1.2.0.2 - Mend

whispercpp 1.2.0.1 → 1.2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fc32803382297d2edfd5a2517cb3734f394fc95f7ba5b9111af8c2206ead53ca
-  data.tar.gz: e6e8afcad840c465faed0cba6485ef86204e12895461a683b6d3ebc73703bb29
+  metadata.gz: 9de22be96d7d59590930b292b4cab94675502604f19051e4f61603896d81a887
+  data.tar.gz: 6261d4023c4e71a29c5884e401bd3557794ed4ed3ae5ae855abc0fb1c4d54d30
 SHA512:
-  metadata.gz: e3a3f74e16dc7fe4e34a64ee63742a9dd12bcdbe9dd3a5ded1b860babe654eeb04aba8a36cd889c1d8231e910739159a799c0e6928f82831a1e7d9b20105cd50
-  data.tar.gz: 9909de651ac6b8c6e1190b90a65388f84e9a8a5dd72a335853eca850286c6f819871e63b05b23f9c080123f6aabcc4a2bad77affc26e5fc4fe1f174fee555040
+  metadata.gz: b36fed0dc2d51177bc7478562deba9f6e6b5c0328624e8670a8f23a358b1bffccbb55c5ffc68f456c79b53098534d090c7721249ecc1e975bccd2128b468e0a6
+  data.tar.gz: '083105df0648a8c7de79554aa13c888a8cd8d325ac69ef78a52f382865e71ea82ef66b564b3022f89389902be1f5fcffa39fd2d4c6f5142f2ca6968446208bea'

data/Rakefile CHANGED Viewed

@@ -1,7 +1,21 @@
 require 'erb'
+require 'open3'
+require 'rake/clean'
+require 'rake/testtask'
 require 'rubygems/package'
-BUILD_VERSION=1
+CLEAN.include '**/*.o' # rake/clean: CLEAN lists files removed by the clean task (object files here)
+CLEAN.include "**/*.#{(defined?(RbConfig) ? RbConfig : Config)::MAKEFILE_CONFIG['DLEXT']}" # built extension libraries, using Ruby's configured DLEXT suffix
+CLOBBER.include 'doc' # rake/clean: CLOBBER lists files additionally removed by the clobber task
+CLOBBER.include '**/*.log'
+CLOBBER.include '**/Makefile'
+CLOBBER.include '**/extconf.h'
+CLOBBER.include '**/extconf.h' # NOTE(review): exact duplicate of the previous pattern
+CLOBBER.include '**/whisper.*' # NOTE(review): this glob also matches whisper.cpp / whisper.h; presumably they are copied in by another task, confirm before relying on clobber
+CLOBBER.include '**/ggml.*' # NOTE(review): same concern as the whisper.* pattern
+CLOBBER.include '**/dr_wav.h'
+BUILD_VERSION=2
 # Determine the current version of the software
 if File.read('../../CMakeLists.txt') =~ /project.*\s*VERSION\s*(\d.+)\)/
   CURRENT_VERSION = "#{$1}.#{BUILD_VERSION}"
@@ -9,6 +23,66 @@ else
   CURRENT_VERSION = "0.0.0.#{BUILD_VERSION}"
 end
+def shell(args, opts = {}) # Echo and run the command array `args`; opts[:cwd] is the directory to run in, opts[:live_stdout] an optional IO that receives the child's stdout
+  puts "> #{args.join(' ')}"
+  cmd, live_stream, cwd = args, opts[:live_stdout], opts[:cwd]
+  Dir.chdir(cwd) { # block form: the directory change applies only while the block runs
+    wait_thr = nil
+    Open3.popen3(*cmd) do |stdin, stdout, stderr, thr|
+      stdin.close # the child receives no input
+      wait_thr = thr # Ruby 1.8 will not yield thr, this will be nil
+      while line = stdout.gets do # stdout is read to EOF before stderr is touched; NOTE(review): a child that fills the stderr pipe first could stall here - confirm
+        live_stream.puts(line) if live_stream # stdout lines are discarded when no live_stdout was given
+      end
+      while line = stderr.gets do
+        puts line # the child's stderr is echoed on this process's stdout
+      end
+    end
+    # prefer process handle directly from popen3, but if not available
+    # fallback to global.
+    p_status = wait_thr ? wait_thr.value : $?
+    exit_code = p_status.exitstatus
+    error = (exit_code != 0) # true when the child exited non-zero; NOTE(review): nothing here raises or aborts on failure, and the callers visible in this file ignore the value
+  }
+end
+make_program = (/mswin/ =~ RUBY_PLATFORM) ? 'nmake' : 'make' # nmake when RUBY_PLATFORM contains "mswin", make otherwise
+MAKECMD = ENV['MAKE_CMD'] || make_program # the MAKE_CMD environment variable overrides the detected program
+MAKEOPTS = ENV['MAKE_OPTS'] || '' # optional extra argument for make; empty by default
+WHISPER_SO = "ext/whisper.#{(defined?(RbConfig) ? RbConfig : Config)::MAKEFILE_CONFIG['DLEXT']}" # path of the extension library under ext/, with Ruby's configured DLEXT suffix
+file 'ext/Makefile' => 'ext/extconf.rb' do # file task: ext/Makefile depends on ext/extconf.rb
+  shell(['ruby', 'extconf.rb', ENV['EXTCONF_OPTS'].to_s], # NOTE(review): when EXTCONF_OPTS is unset, .to_s yields "" and an empty argument is still passed (make() below filters empties, this call does not) - confirm this is harmless
+        { live_stdout: STDOUT, cwd: "#{Dir.pwd}/ext" } # run inside ext/ and stream the child's stdout to STDOUT
+       )
+end
+def make(target = '') # Run MAKECMD inside ext/, optionally with MAKEOPTS and a target name
+  shell(["#{MAKECMD}", "#{MAKEOPTS}", "#{target}"].reject(&:empty?), # empty pieces are dropped; NOTE(review): MAKEOPTS travels as ONE argument, so several space-separated options in MAKE_OPTS would reach make as a single argv entry - confirm intended
+        { live_stdout: STDOUT, cwd: "#{Dir.pwd}/ext" } # run inside ext/ and stream the child's stdout to STDOUT
+       )
+end
+# Let make handle dependencies between c/o/so - we'll just run it.
+file WHISPER_SO => (['ext/Makefile'] + Dir['ext/*.cpp'] + Dir['ext/*.c'] + Dir['ext/*.h']) do # prerequisites: the generated Makefile plus every C/C++ source and header directly under ext/
+  make # default (empty) target
+end
+desc "Compile the shared object"
+task :compile => [WHISPER_SO]
+desc "Default Task (Test project)"
+task :default => :test # NOTE(review): no dependency from :test on :compile is declared in the lines shown - confirm the tests build the extension some other way
+Rake::TestTask.new(:test) do |t|
+  t.test_files = FileList['tests/test_*.rb'] # every tests/test_*.rb file is a test file
+  t.verbose = false
+end
 desc 'Generate gem specification'
 task :gemspec do
   system("cp ../../LICENSE .")

data/ext/ruby_whisper.cpp CHANGED Viewed

@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <ruby/thread.h>
 #include "ruby_whisper.h"
 #define DR_WAV_IMPLEMENTATION
 #include "dr_wav.h"
@@ -94,6 +95,32 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
   return self;
 }
+struct WhisperFullParallelParams { // Bundles the run's inputs so they can be handed to the worker / unblock callbacks through a single void* argument
+  ruby_whisper *rw; // its ->context is the whisper context the run operates on
+  ruby_whisper_params *rwp; // its ->params is passed to whisper_full_parallel
+  std::vector<float> pcmf32; // mono-channel F32 PCM
+  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM (filled only when diarize is set; call_whisper_full_parallel does not read it)
+};
+static void stop_whisper_unblock(void *args) { // Unblock function given to rb_thread_call_without_gvl; args is the WhisperFullParallelParams of the active run
+  struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
+  fprintf(stderr, "Set running to abort\n");
+  whisper_running_abort(object->rw->context); // clears the context's running flag, which whisper_full's loops test
+}
+static VALUE call_whisper_full_parallel(void *args) { // Worker run via rb_thread_call_without_gvl; returns INT2FIX(0) on success and INT2FIX(-1) on failure
+  struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
+  whisper_running_restore(object->rw->context); // reset the running flag so an earlier abort does not stop this run immediately
+  if (whisper_full_parallel(object->rw->context, object->rwp->params, object->pcmf32.data(), object->pcmf32.size(), 1) != 0) { // mono samples only; the final argument (1) is the processor count
+    fprintf(stderr, "failed to process audio\n");
+    return INT2FIX(-1);
+  }
+  return INT2FIX(0); // NOTE(review): the caller converts this with (int)(VALUE) rather than FIX2INT; a Fixnum-tagged 0 is not the integer 0, so its `r != 0` check looks true even on success - confirm
+}
 /*
  * transcribe a single file
  * can emit to a block results
@@ -114,8 +141,9 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
   std::string fname_inp = StringValueCStr(wave_file_path);
-  std::vector<float> pcmf32; // mono-channel F32 PCM
-  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+  //std::vector<float> pcmf32; // mono-channel F32 PCM
+  //std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+  struct WhisperFullParallelParams object;
   // WAV input - this is directly from main.cpp example
   {
@@ -173,26 +201,26 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     drwav_uninit(&wav);
     // convert to mono, float
-    pcmf32.resize(n);
+    object.pcmf32.resize(n);
     if (wav.channels == 1) {
       for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[i])/32768.0f;
+        object.pcmf32[i] = float(pcm16[i])/32768.0f;
       }
     } else {
       for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+        object.pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
       }
     }
     if (rwp->diarize) {
       // convert to stereo, float
-      pcmf32s.resize(2);
+      object.pcmf32s.resize(2);
-      pcmf32s[0].resize(n);
-      pcmf32s[1].resize(n);
+      object.pcmf32s[0].resize(n);
+      object.pcmf32s[1].resize(n);
       for (uint64_t i = 0; i < n; i++) {
-        pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-        pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+        object.pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+        object.pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
       }
     }
   }
@@ -206,10 +234,16 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     rwp->params.encoder_begin_callback_user_data = &is_aborted;
   }
-  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
+  object.rw = rw;
+  object.rwp = rwp;
+  int r = (int)(VALUE)rb_thread_call_without_gvl((void *(*)(void *))call_whisper_full_parallel, &object, stop_whisper_unblock, &object);
+  //if (whisper_full_parallel(rw->context, rwp->params, object.pcmf32.data(), pcmf32.size(), 1) != 0) {
+  if (r != 0) {
     fprintf(stderr, "failed to process audio\n");
     return self;
   }
   const int n_segments = whisper_full_n_segments(rw->context);
   VALUE output = rb_str_new2("");
   for (int i = 0; i < n_segments; ++i) {

data/ext/whisper.cpp CHANGED Viewed

@@ -592,16 +592,19 @@ struct whisper_context {
     mutable std::mt19937 rng; // used for sampling at t > 0.0
-    int lang_id;
+    int lang_id = 0; // english by default
     // [EXPERIMENTAL] token-level timestamps data
-    int64_t t_beg;
-    int64_t t_last;
+    int64_t t_beg = 0;
+    int64_t t_last = 0;
     whisper_token tid_last;
     std::vector<float> energy; // PCM signal energy
     // [EXPERIMENTAL] speed-up techniques
-    int32_t exp_n_audio_ctx; // 0 - use default
+    int32_t exp_n_audio_ctx = 0; // 0 - use default
+    // [EXPERIMENTAL] abort handling
+    bool running = true;
     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(WHISPER_USE_SCRATCH)
@@ -805,7 +808,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                      MEM_REQ_SCRATCH3.at (model.type) +
                 scale*MEM_REQ_MODEL.at   (model.type) +
                 scale*MEM_REQ_KV_CROSS.at(model.type) +
-                scale*std::max(MEM_REQ_ENCODE.at(model.type),       MEM_REQ_DECODE.at(model.type));
+                scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
             // this is the memory required by one decoder
             const size_t mem_required_decoder =
@@ -2936,7 +2939,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.language         =*/ "en",
         /*.suppress_blank   =*/ true,
-        /*.suppress_non_speech_tokens =*/true,
+        /*.suppress_non_speech_tokens =*/ false,
         /*.temperature      =*/  0.0f,
         /*.max_initial_ts   =*/  1.0f,
@@ -2962,6 +2965,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.encoder_begin_callback           =*/ nullptr,
         /*.encoder_begin_callback_user_data =*/ nullptr,
+        /*.logits_filter_callback           =*/ nullptr,
+        /*.logits_filter_callback_user_data =*/ nullptr,
     };
     switch (strategy) {
@@ -3078,8 +3084,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool
     return res;
 }
-static const std::vector<std::string> non_speech_tokens
-{
+static const std::vector<std::string> non_speech_tokens = {
     "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
     "_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
     "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
@@ -3090,7 +3095,7 @@ static const std::vector<std::string> non_speech_tokens
 // - applies logit filters
 // - computes logprobs and probs
 static void whisper_process_logits(
-        const struct whisper_context & ctx,
+              struct whisper_context & ctx,
     const struct whisper_full_params   params,
               struct whisper_decoder & decoder,
                                float   temperature) {
@@ -3146,29 +3151,27 @@ static void whisper_process_logits(
         logits[vocab.token_translate]  = -INFINITY;
         logits[vocab.token_transcribe] = -INFINITY;
+        if (params.logits_filter_callback) {
+            params.logits_filter_callback(&ctx, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
+        }
         // suppress non-speech tokens
         // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
-        if (params.suppress_non_speech_tokens)
-        {
-            for (const std::string &token : non_speech_tokens)
-            {
-                std::string suppress_tokens[] = {token, " " + token};
-                for (const std::string &suppress_token : suppress_tokens)
-                {
-                    if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end())
-                    {
+        if (params.suppress_non_speech_tokens) {
+            for (const std::string & token : non_speech_tokens) {
+                const std::string suppress_tokens[] = {token, " " + token};
+                for (const std::string & suppress_token : suppress_tokens) {
+                    if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end()) {
                         logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
                     }
                 }
             }
             // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
-            if (vocab.token_to_id.find(" -") != vocab.token_to_id.end())
-            {
+            if (vocab.token_to_id.find(" -") != vocab.token_to_id.end()) {
                 logits[vocab.token_to_id.at(" -")] = -INFINITY;
             }
-            if (vocab.token_to_id.find(" '") != vocab.token_to_id.end())
-            {
+            if (vocab.token_to_id.find(" '") != vocab.token_to_id.end()) {
                 logits[vocab.token_to_id.at(" '")] = -INFINITY;
             }
         }
@@ -3571,7 +3574,7 @@ int whisper_full(
     n_decoders = std::max(1, n_decoders);
     // TAGS: WHISPER_DECODER_INIT
-    for (int j = 1; j < n_decoders; j++) {
+    for (int j = 1; j < n_decoders && ctx->running; j++) {
         auto & decoder = ctx->decoders[j];
         if (decoder.kv_self.ctx == nullptr) {
@@ -3654,7 +3657,7 @@ int whisper_full(
     std::vector<beam_candidate> beam_candidates;
     // main loop
-    while (true) {
+    while (ctx->running) {
         const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
         while (progress_cur >= progress_prev + progress_step) {
             progress_prev += progress_step;
@@ -3854,7 +3857,7 @@ int whisper_full(
                         return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
                     });
-                    int cur_c = 0;
+                    uint32_t cur_c = 0;
                     for (int j = 0; j < n_decoders_cur; ++j) {
                         auto & decoder = ctx->decoders[j];
@@ -4204,6 +4207,18 @@ int whisper_full(
     return 0;
 }
+void whisper_running_abort(struct whisper_context * ctx) { // Request cancellation: whisper_full tests ctx->running in its decoder-init loop and its main loop
+    ctx->running = false; // NOTE(review): running is a plain bool and this is presumably called from a different thread than the one running whisper_full - consider std::atomic<bool>
+}
+void whisper_running_restore(struct whisper_context * ctx) { // Re-arm the context after an abort so the next whisper_full call can run
+    ctx->running = true;
+}
+bool whisper_running_state(struct whisper_context * ctx) { // Report the flag: true means the context may run, false means an abort was requested
+    return ctx->running;
+}
 int whisper_full_parallel(
         struct whisper_context * ctx,
         struct whisper_full_params params,
@@ -4339,7 +4354,7 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
 }
 int whisper_full_lang_id(struct whisper_context * ctx) {
-    return ctx->lang_id;
+    return ctx->lang_id;
 }
 int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {

data/ext/whisper.h CHANGED Viewed

@@ -225,6 +225,15 @@ extern "C" {
     // Print system information
     WHISPER_API const char * whisper_print_system_info(void);
+    // Abort a running whisper_full_parallel or whisper_full
+    WHISPER_API void whisper_running_abort(struct whisper_context * ctx);
+    // Resume whisper context from an aborted state allowing it run again
+    WHISPER_API void whisper_running_restore(struct whisper_context * ctx);
+    // Check the whisper context state if true then it can run if false it can not
+    WHISPER_API bool whisper_running_state(struct whisper_context * ctx);
     ////////////////////////////////////////////////////////////////////////////
     // Available sampling strategies
@@ -243,6 +252,16 @@ extern "C" {
     // If it returns false, the computation is aborted
     typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
+    // Logits filter callback
+    // Can be used to modify the logits before sampling
+    // If not NULL, called after applying temperature to logits
+    typedef void (*whisper_logits_filter_callback)(
+            struct whisper_context * ctx,
+          const whisper_token_data * tokens,
+                               int   n_tokens,
+                             float * logits,
+                              void * user_data);
     // Parameters for the whisper_full() function
     // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
     // whisper_full_default_params()
@@ -315,6 +334,10 @@ extern "C" {
         // called each time before the encoder starts
         whisper_encoder_begin_callback encoder_begin_callback;
         void * encoder_begin_callback_user_data;
+        // called by each decoder to filter obtained logits
+        whisper_logits_filter_callback logits_filter_callback;
+        void * logits_filter_callback_user_data;
     };
     WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: whispercpp
 version: !ruby/object:Gem::Version
-  version: 1.2.0.1
+  version: 1.2.0.2
 platform: ruby
 authors:
 - Georgi Gerganov
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-02-25 00:00:00.000000000 Z
+date: 2023-02-27 00:00:00.000000000 Z
 dependencies: []
 description: High-performance inference of OpenAI's Whisper automatic speech recognition
   (ASR) model via Ruby