whisper.cpp 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9233f65df607fba80d30ff4ae4a86dcbfaa1a750c88de487a2f98ace0a15de75
4
- data.tar.gz: 157dcee2a04c9b514c3b36a92500532521aa964eed35ef9ffefd6f2d6b5fd40b
3
+ metadata.gz: 561b7b824c1f631537d681fb1e5a1eafe20546a8468705ecd2ba3989e0062c69
4
+ data.tar.gz: e047a8ab099358095100a55cd30aca398bbe8b8c4c55cfb0713cdf33e87d2e1a
5
5
  SHA512:
6
- metadata.gz: 5a1b01fb0ae991764d318f054369f1740d9f7d5324aa8a0deedf7e5bad9f1c8ed84573061c3a68dfda1b73c60bc5761dd2f4f21931c22414098ebe3b1cc9b17d
7
- data.tar.gz: bae0d8a234b23f88fca2a363438a43989ad01d6e732b95987f740b999b6fa9c74f300de1033e338a141d85e29d5dff2827763b408f599c34bbed50fe58f5d441
6
+ metadata.gz: 80c60958d31df322ac570c97ca6d758278f8711d9bfff371be34e8159ffcadc2facf149aa4c7b55de49a2d2d8812b570d0ddd68e543172301f930fe494a4243b
7
+ data.tar.gz: 02fd686ff1857adf234c53fa0719035e797a82bed4b165e54004c0b7c903728eb7d8c44273b3f46e6ff21a32df44e29986092b72ccfd853324749e639fa303ed
data/lib/whisper/model.rb CHANGED
@@ -3,81 +3,125 @@ require_relative 'audio_processor'
3
3
 
4
4
  module Whisper
5
5
  class Model
6
+ TranscriptionResult = Struct.new(:language, :output)
6
7
 
7
- TranscriptionResult = Struct.new :language, :output
8
-
9
- def initialize model_path
10
- params = Whisper.whisper_context_default_params
11
- # Modify params as needed
12
- params[:use_gpu] = true
13
- params[:gpu_device] = 0
14
-
15
- @ctx = Whisper.whisper_init_from_file_with_params model_path, params
16
- raise 'Failed to initialize Whisper model' if @ctx.null?
8
+ def initialize(model_path)
9
+ @model_path = model_path
10
+ @ctx = nil
17
11
  end
18
12
 
19
- def transcribe_from_file(audio_file_path, format: 'plaintext')
13
+ def transcribe_from_file(audio_file_path, format: 'plaintext', **params)
20
14
  # Load audio file and convert to float array
21
- audio_data = Whisper::AudioProcessor.convert_to_float_array audio_file_path
15
+ audio_data = Whisper::AudioProcessor.convert_to_float_array(audio_file_path)
16
+ transcribe_from_audio_data(audio_data, format: format, **params)
17
+ end
18
+
19
+ def transcribe_from_audio_data(audio_data, format: 'plaintext', **params)
20
+ init_whisper_context(params)
22
21
 
23
22
  # Prepare full params
24
- params = Whisper.whisper_full_default_params Whisper::WHISPER_SAMPLING_GREEDY
25
- params[:n_threads] = 4
26
- params[:translate] = false
27
- params[:language] = FFI::Pointer::NULL # Auto-detect language
23
+ full_params = default_full_params(params)
28
24
 
29
25
  # Prepare audio data pointer
30
26
  n_samples = audio_data.size
31
27
  samples_ptr = FFI::MemoryPointer.new(:float, n_samples)
32
- samples_ptr.write_array_of_float audio_data
28
+ samples_ptr.write_array_of_float(audio_data)
33
29
 
34
- # Call the whisper_full function
35
- result = Whisper.whisper_full @ctx, params, samples_ptr, n_samples
30
+ # Call the whisper_full_parallel function
31
+ n_processors = params.fetch :n_processors, ENV['WHISPER_N_PROCS']&.to_i || 1
32
+ result = Whisper.whisper_full_parallel(@ctx, full_params, samples_ptr, n_samples, n_processors)
36
33
  raise 'Transcription failed' if result != 0
37
34
 
38
35
  # Retrieve detected language
39
36
  lang_id = Whisper.whisper_full_lang_id(@ctx)
40
37
  language = Whisper.whisper_lang_str(lang_id)
41
38
 
42
- n_segments = Whisper.whisper_full_n_segments @ctx
39
+ # Retrieve the transcription output
40
+ n_segments = Whisper.whisper_full_n_segments(@ctx)
41
+ output = format_transcription(format, n_segments: n_segments)
42
+
43
+ TranscriptionResult.new(language, output)
44
+ end
45
+
46
+ def close
47
+ Whisper.whisper_free(@ctx) unless @ctx.nil?
48
+ end
49
+
50
+ private
51
+
52
+ def init_whisper_context(params)
53
+ return unless @ctx.nil?
54
+
55
+ ctx_params = Whisper.whisper_context_default_params
56
+
57
+ # Set user-provided context params
58
+ user_ctx_params = params.fetch(:context_params, {})
59
+ user_ctx_params.each do |key, value|
60
+ if ctx_params.members.include?(field)
61
+ ctx_params[key] = value
62
+ else
63
+ warn "Unknown context_param field: #{field}"
64
+ end
65
+ end
66
+ ctx_params[:gpu_device] = ENV['WHISPER_GPU']&.to_i || 0
67
+
68
+ # Initialize context
69
+ @ctx = Whisper.whisper_init_from_file_with_params(@model_path, ctx_params)
70
+ raise 'Failed to initialize Whisper model' if @ctx.null?
71
+ end
72
+
73
+ def default_full_params(params)
74
+ # Get default full params
75
+ strategy = params.fetch(:sampling_strategy, Whisper::WHISPER_SAMPLING_GREEDY)
76
+ full_params = Whisper.whisper_full_default_params(strategy)
77
+
78
+ # Set translate to false to prevent translation to English
79
+ full_params[:translate] = false
80
+
81
+ # Set user-provided full params
82
+ user_full_params = params.fetch(:full_params, {})
83
+ user_full_params.each do |key, value|
84
+ if full_params.members.include?(field)
85
+ full_params[key] = value
86
+ else
87
+ warn "Unknown full_param field: #{field}"
88
+ end
89
+ end
90
+
91
+ full_params
92
+ end
93
+
94
+ def format_transcription(format, n_segments:)
43
95
  output = ''
44
96
  case format.downcase
45
97
  when 'plaintext'
46
98
  n_segments.times do |i|
47
- segment_text = Whisper.whisper_full_get_segment_text @ctx, i
99
+ segment_text = Whisper.whisper_full_get_segment_text(@ctx, i)
48
100
  output += segment_text
49
101
  end
50
102
  when 'srt'
51
103
  n_segments.times do |i|
52
- start_time = Whisper.whisper_full_get_segment_t0(@ctx, i) / 100.0
53
- end_time = Whisper.whisper_full_get_segment_t1(@ctx, i) / 100.0
54
- segment_text = Whisper.whisper_full_get_segment_text @ctx, i
104
+ start_time = Whisper.whisper_full_get_segment_t0(@ctx, i) / 100.0
105
+ end_time = Whisper.whisper_full_get_segment_t1(@ctx, i) / 100.0
106
+ segment_text = Whisper.whisper_full_get_segment_text(@ctx, i)
55
107
 
56
108
  output += "#{i + 1}\n"
57
- output += "#{format_time_srt start_time} --> #{format_time_srt end_time}\n"
109
+ output += "#{format_time_srt(start_time)} --> #{format_time_srt(end_time)}\n"
58
110
  output += "#{segment_text.strip}\n\n"
59
111
  end
60
112
  else
61
113
  raise "Unsupported format: #{format}"
62
114
  end
63
-
64
- TranscriptionResult.new language, output
65
- end
66
-
67
- def close
68
- Whisper.whisper_free @ctx
115
+ output
69
116
  end
70
117
 
71
- private
72
-
73
- def format_time_srt seconds
74
- hours = (seconds / 3600).to_i
118
+ def format_time_srt(seconds)
119
+ hours = (seconds / 3600).to_i
75
120
  minutes = ((seconds % 3600) / 60).to_i
76
- secs = (seconds % 60).to_i
77
- millis = ((seconds - seconds.to_i) * 1000).to_i
78
- format '%02d:%02d:%02d,%03d', hours, minutes, secs, millis
121
+ secs = (seconds % 60).to_i
122
+ millis = ((seconds - seconds.to_i) * 1000).to_i
123
+ format('%02d:%02d:%02d,%03d', hours, minutes, secs, millis)
79
124
  end
80
-
81
125
  end
82
126
  end
83
127
 
@@ -0,0 +1,5 @@
1
+ module Whisper
2
+
3
+ VERSION = '0.3.1'
4
+
5
+ end
data/lib/whisper.rb CHANGED
@@ -29,6 +29,20 @@ module Whisper
29
29
  :WHISPER_AHEADS_LARGE_V3
30
30
  ]
31
31
 
32
+ # Enums for sampling strategy
33
+ enum :whisper_sampling_strategy, [
34
+ :WHISPER_SAMPLING_GREEDY,
35
+ :WHISPER_SAMPLING_BEAM_SEARCH
36
+ ]
37
+
38
+ # Callbacks
39
+ callback :whisper_new_segment_callback, [:pointer, :pointer, :int, :pointer], :void
40
+ callback :whisper_progress_callback, [:pointer, :pointer, :int, :pointer], :void
41
+ callback :whisper_encoder_begin_callback, [:pointer, :pointer, :pointer], :bool
42
+ callback :whisper_logits_filter_callback, [:pointer, :pointer, :pointer, :int, :pointer, :pointer], :void
43
+ callback :ggml_abort_callback, [:pointer], :bool
44
+ callback :ggml_log_callback, [:int, :string, :pointer], :void
45
+
32
46
  # Structs Definitions
33
47
 
34
48
  # whisper_ahead struct
@@ -77,6 +91,28 @@ module Whisper
77
91
  )
78
92
  end
79
93
 
94
+ # whisper_model_loader struct
95
+ #class WhisperModelLoader < FFI::Struct
96
+ # callback :read_callback, [:pointer, :pointer, :size_t], :size_t
97
+ # callback :eof_callback, [:pointer], :bool
98
+ # callback :close_callback, [:pointer], :void
99
+
100
+ # layout(
101
+ # :context, :pointer,
102
+ # :read, :read_callback,
103
+ # :eof, :eof_callback,
104
+ # :close, :close_callback
105
+ # )
106
+ #end
107
+
108
+ # whisper_grammar_element struct
109
+ class WhisperGrammarElement < FFI::Struct
110
+ layout(
111
+ :type, :int,
112
+ :value, :uint32
113
+ )
114
+ end
115
+
80
116
  # greedy sampling parameters
81
117
  class WhisperGreedyParams < FFI::Struct
82
118
  layout(
@@ -134,15 +170,15 @@ module Whisper
134
170
  :no_speech_thold, :float,
135
171
  :greedy, WhisperGreedyParams,
136
172
  :beam_search, WhisperBeamSearchParams,
137
- :new_segment_callback, :pointer,
173
+ :new_segment_callback, :whisper_new_segment_callback,
138
174
  :new_segment_callback_user_data, :pointer,
139
- :progress_callback, :pointer,
175
+ :progress_callback, :whisper_progress_callback,
140
176
  :progress_callback_user_data, :pointer,
141
- :encoder_begin_callback, :pointer,
177
+ :encoder_begin_callback, :whisper_encoder_begin_callback,
142
178
  :encoder_begin_callback_user_data, :pointer,
143
- :abort_callback, :pointer,
179
+ :abort_callback, :ggml_abort_callback,
144
180
  :abort_callback_user_data, :pointer,
145
- :logits_filter_callback, :pointer,
181
+ :logits_filter_callback, :whisper_logits_filter_callback,
146
182
  :logits_filter_callback_user_data, :pointer,
147
183
  :grammar_rules, :pointer,
148
184
  :n_grammar_rules, :size_t,
@@ -151,41 +187,162 @@ module Whisper
151
187
  )
152
188
  end
153
189
 
190
+ # Get default context params
191
+ attach_function :whisper_context_default_params, [], WhisperContextParams.by_value
192
+ # Get default full params
193
+ attach_function :whisper_full_default_params, [:int], WhisperFullParams.by_value
194
+
154
195
  # Function Bindings
155
196
 
156
197
  # Initialize context with params
157
198
  attach_function :whisper_init_from_file_with_params, [:string, WhisperContextParams.by_value], :pointer
199
+ attach_function :whisper_init_from_buffer_with_params, [:pointer, :size_t, WhisperContextParams.by_value], :pointer
200
+ #attach_function :whisper_init_with_params, [WhisperModelLoader.by_ref, WhisperContextParams.by_value], :pointer
158
201
 
159
- # Get default context params
160
- attach_function :whisper_context_default_params, [], WhisperContextParams.by_value
202
+ # Initialize context without state
203
+ attach_function :whisper_init_from_file_with_params_no_state, [:string, WhisperContextParams.by_value], :pointer
204
+ attach_function :whisper_init_from_buffer_with_params_no_state, [:pointer, :size_t, WhisperContextParams.by_value], :pointer
205
+ #attach_function :whisper_init_with_params_no_state, [WhisperModelLoader.by_ref, WhisperContextParams.by_value], :pointer
161
206
 
162
- # Get default full params
163
- attach_function :whisper_full_default_params, [:int], WhisperFullParams.by_value
207
+ # Initialize state
208
+ attach_function :whisper_init_state, [:pointer], :pointer
209
+
210
+ # OpenVINO functions
211
+ #attach_function :whisper_ctx_init_openvino_encoder_with_state, [:pointer, :pointer, :string, :string, :string], :int
212
+ #attach_function :whisper_ctx_init_openvino_encoder, [:pointer, :string, :string, :string], :int
164
213
 
165
214
  # Free functions
166
215
  attach_function :whisper_free, [:pointer], :void
216
+ attach_function :whisper_free_state, [:pointer], :void
217
+ attach_function :whisper_free_params, [:pointer], :void
218
+ attach_function :whisper_free_context_params, [:pointer], :void
219
+
220
+ # PCM to Mel spectrogram
221
+ attach_function :whisper_pcm_to_mel, [:pointer, :pointer, :int, :int], :int
222
+ attach_function :whisper_pcm_to_mel_with_state, [:pointer, :pointer, :pointer, :int, :int], :int
223
+
224
+ # Set custom Mel spectrogram
225
+ attach_function :whisper_set_mel, [:pointer, :pointer, :int, :int], :int
226
+ attach_function :whisper_set_mel_with_state, [:pointer, :pointer, :pointer, :int, :int], :int
227
+
228
+ # Encode
229
+ attach_function :whisper_encode, [:pointer, :int, :int], :int
230
+ attach_function :whisper_encode_with_state, [:pointer, :pointer, :int, :int], :int
231
+
232
+ # Decode
233
+ attach_function :whisper_decode, [:pointer, :pointer, :int, :int, :int], :int
234
+ attach_function :whisper_decode_with_state, [:pointer, :pointer, :pointer, :int, :int, :int], :int
235
+
236
+ # Tokenize
237
+ attach_function :whisper_tokenize, [:pointer, :string, :pointer, :int], :int
238
+ attach_function :whisper_token_count, [:pointer, :string], :int
239
+
240
+ # Language functions
241
+ attach_function :whisper_lang_max_id, [], :int
242
+ attach_function :whisper_lang_id, [:string], :int
243
+ attach_function :whisper_lang_str, [:int], :string
244
+ attach_function :whisper_lang_str_full, [:int], :string
245
+
246
+ # Auto-detect language
247
+ attach_function :whisper_lang_auto_detect, [:pointer, :int, :int, :pointer], :int
248
+ attach_function :whisper_lang_auto_detect_with_state, [:pointer, :pointer, :int, :int, :pointer], :int
249
+
250
+ # Model info
251
+ attach_function :whisper_n_len, [:pointer], :int
252
+ attach_function :whisper_n_len_from_state, [:pointer], :int
253
+ attach_function :whisper_n_vocab, [:pointer], :int
254
+ attach_function :whisper_n_text_ctx, [:pointer], :int
255
+ attach_function :whisper_n_audio_ctx, [:pointer], :int
256
+ attach_function :whisper_is_multilingual, [:pointer], :int
257
+
258
+ attach_function :whisper_model_n_vocab, [:pointer], :int
259
+ attach_function :whisper_model_n_audio_ctx, [:pointer], :int
260
+ attach_function :whisper_model_n_audio_state, [:pointer], :int
261
+ attach_function :whisper_model_n_audio_head, [:pointer], :int
262
+ attach_function :whisper_model_n_audio_layer, [:pointer], :int
263
+ attach_function :whisper_model_n_text_ctx, [:pointer], :int
264
+ attach_function :whisper_model_n_text_state, [:pointer], :int
265
+ attach_function :whisper_model_n_text_head, [:pointer], :int
266
+ attach_function :whisper_model_n_text_layer, [:pointer], :int
267
+ attach_function :whisper_model_n_mels, [:pointer], :int
268
+ attach_function :whisper_model_ftype, [:pointer], :int
269
+ attach_function :whisper_model_type, [:pointer], :int
270
+
271
+ # Get logits
272
+ attach_function :whisper_get_logits, [:pointer], :pointer
273
+ attach_function :whisper_get_logits_from_state, [:pointer], :pointer
274
+
275
+ # Token functions
276
+ attach_function :whisper_token_to_str, [:pointer, :int32], :string
277
+ attach_function :whisper_model_type_readable, [:pointer], :string
278
+
279
+ # Special tokens
280
+ attach_function :whisper_token_eot, [:pointer], :int32
281
+ attach_function :whisper_token_sot, [:pointer], :int32
282
+ attach_function :whisper_token_solm, [:pointer], :int32
283
+ attach_function :whisper_token_prev, [:pointer], :int32
284
+ attach_function :whisper_token_nosp, [:pointer], :int32
285
+ attach_function :whisper_token_not, [:pointer], :int32
286
+ attach_function :whisper_token_beg, [:pointer], :int32
287
+ attach_function :whisper_token_lang, [:pointer, :int], :int32
288
+
289
+ # Task tokens
290
+ attach_function :whisper_token_translate, [:pointer], :int32
291
+ attach_function :whisper_token_transcribe, [:pointer], :int32
292
+
293
+ # Timings
294
+ attach_function :whisper_print_timings, [:pointer], :void
295
+ attach_function :whisper_reset_timings, [:pointer], :void
296
+ attach_function :whisper_print_system_info, [], :string
167
297
 
168
298
  # Full transcription function
169
299
  attach_function :whisper_full, [:pointer, WhisperFullParams.by_value, :pointer, :int], :int
300
+ attach_function :whisper_full_with_state, [:pointer, :pointer, WhisperFullParams.by_value, :pointer, :int], :int
301
+
302
+ # Parallel processing
303
+ attach_function :whisper_full_parallel, [:pointer, WhisperFullParams.by_value, :pointer, :int, :int], :int
170
304
 
171
305
  # Number of segments
172
306
  attach_function :whisper_full_n_segments, [:pointer], :int
307
+ attach_function :whisper_full_n_segments_from_state, [:pointer], :int
173
308
 
174
- # Get segment text
175
- attach_function :whisper_full_get_segment_text, [:pointer, :int], :string
176
-
177
- # Get segment start and end times
309
+ # Get segment info
178
310
  attach_function :whisper_full_get_segment_t0, [:pointer, :int], :int64
311
+ attach_function :whisper_full_get_segment_t0_from_state, [:pointer, :int], :int64
312
+
179
313
  attach_function :whisper_full_get_segment_t1, [:pointer, :int], :int64
314
+ attach_function :whisper_full_get_segment_t1_from_state, [:pointer, :int], :int64
180
315
 
181
- # Get detected language ID
182
- attach_function :whisper_full_lang_id, [:pointer], :int
316
+ attach_function :whisper_full_get_segment_speaker_turn_next, [:pointer, :int], :bool
317
+ attach_function :whisper_full_get_segment_speaker_turn_next_from_state, [:pointer, :int], :bool
183
318
 
184
- # Convert language ID to string
185
- attach_function :whisper_lang_str, [:int], :string
319
+ attach_function :whisper_full_get_segment_text, [:pointer, :int], :string
320
+ attach_function :whisper_full_get_segment_text_from_state, [:pointer, :int], :string
186
321
 
187
- # void log_callback(int level, const char *msg, void *user_data);
188
- callback :ggml_log_callback, [:int, :string, :pointer], :void
322
+ attach_function :whisper_full_n_tokens, [:pointer, :int], :int
323
+ attach_function :whisper_full_n_tokens_from_state, [:pointer, :int], :int
324
+
325
+ attach_function :whisper_full_get_token_text, [:pointer, :int, :int], :string
326
+ attach_function :whisper_full_get_token_text_from_state, [:pointer, :pointer, :int, :int], :string
327
+
328
+ attach_function :whisper_full_get_token_id, [:pointer, :int, :int], :int32
329
+ attach_function :whisper_full_get_token_id_from_state, [:pointer, :int, :int], :int32
330
+
331
+ attach_function :whisper_full_get_token_data, [:pointer, :int, :int], WhisperTokenData.by_value
332
+ attach_function :whisper_full_get_token_data_from_state, [:pointer, :int, :int], WhisperTokenData.by_value
333
+
334
+ attach_function :whisper_full_get_token_p, [:pointer, :int, :int], :float
335
+ attach_function :whisper_full_get_token_p_from_state, [:pointer, :int, :int], :float
336
+
337
+ # Language ID
338
+ attach_function :whisper_full_lang_id, [:pointer], :int
339
+ attach_function :whisper_full_lang_id_from_state, [:pointer], :int
340
+
341
+ # Benchmarks
342
+ attach_function :whisper_bench_memcpy, [:int], :int
343
+ attach_function :whisper_bench_memcpy_str, [:int], :string
344
+ attach_function :whisper_bench_ggml_mul_mat, [:int], :int
345
+ attach_function :whisper_bench_ggml_mul_mat_str, [:int], :string
189
346
 
190
347
  # Set the log callback
191
348
  attach_function :whisper_log_set, [:ggml_log_callback, :pointer], :void
@@ -196,6 +353,5 @@ module Whisper
196
353
  end
197
354
  # Set the no-op log callback to suppress logging
198
355
  Whisper.whisper_log_set NOOP_LOG_CALLBACK, FFI::Pointer::NULL unless ENV['WHISPER_DEBUG']
199
-
200
356
  end
201
357
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whisper.cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Braulio Oliveira
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-09-30 00:00:00.000000000 Z
11
+ date: 2024-10-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -73,6 +73,7 @@ files:
73
73
  - lib/whisper.rb
74
74
  - lib/whisper/audio_processor.rb
75
75
  - lib/whisper/model.rb
76
+ - lib/whisper/version.rb
76
77
  - script/console
77
78
  - whisper.cpp.gemspec
78
79
  homepage: