whisper.cpp 0.2.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9233f65df607fba80d30ff4ae4a86dcbfaa1a750c88de487a2f98ace0a15de75
4
- data.tar.gz: 157dcee2a04c9b514c3b36a92500532521aa964eed35ef9ffefd6f2d6b5fd40b
3
+ metadata.gz: 561b7b824c1f631537d681fb1e5a1eafe20546a8468705ecd2ba3989e0062c69
4
+ data.tar.gz: e047a8ab099358095100a55cd30aca398bbe8b8c4c55cfb0713cdf33e87d2e1a
5
5
  SHA512:
6
- metadata.gz: 5a1b01fb0ae991764d318f054369f1740d9f7d5324aa8a0deedf7e5bad9f1c8ed84573061c3a68dfda1b73c60bc5761dd2f4f21931c22414098ebe3b1cc9b17d
7
- data.tar.gz: bae0d8a234b23f88fca2a363438a43989ad01d6e732b95987f740b999b6fa9c74f300de1033e338a141d85e29d5dff2827763b408f599c34bbed50fe58f5d441
6
+ metadata.gz: 80c60958d31df322ac570c97ca6d758278f8711d9bfff371be34e8159ffcadc2facf149aa4c7b55de49a2d2d8812b570d0ddd68e543172301f930fe494a4243b
7
+ data.tar.gz: 02fd686ff1857adf234c53fa0719035e797a82bed4b165e54004c0b7c903728eb7d8c44273b3f46e6ff21a32df44e29986092b72ccfd853324749e639fa303ed
data/lib/whisper/model.rb CHANGED
@@ -3,81 +3,125 @@ require_relative 'audio_processor'
3
3
 
4
4
  module Whisper
5
5
  class Model
6
+ TranscriptionResult = Struct.new(:language, :output)
6
7
 
7
- TranscriptionResult = Struct.new :language, :output
8
-
9
- def initialize model_path
10
- params = Whisper.whisper_context_default_params
11
- # Modify params as needed
12
- params[:use_gpu] = true
13
- params[:gpu_device] = 0
14
-
15
- @ctx = Whisper.whisper_init_from_file_with_params model_path, params
16
- raise 'Failed to initialize Whisper model' if @ctx.null?
8
+ def initialize(model_path)
9
+ @model_path = model_path
10
+ @ctx = nil
17
11
  end
18
12
 
19
- def transcribe_from_file(audio_file_path, format: 'plaintext')
13
+ def transcribe_from_file(audio_file_path, format: 'plaintext', **params)
20
14
  # Load audio file and convert to float array
21
- audio_data = Whisper::AudioProcessor.convert_to_float_array audio_file_path
15
+ audio_data = Whisper::AudioProcessor.convert_to_float_array(audio_file_path)
16
+ transcribe_from_audio_data(audio_data, format: format, **params)
17
+ end
18
+
19
+ def transcribe_from_audio_data(audio_data, format: 'plaintext', **params)
20
+ init_whisper_context(params)
22
21
 
23
22
  # Prepare full params
24
- params = Whisper.whisper_full_default_params Whisper::WHISPER_SAMPLING_GREEDY
25
- params[:n_threads] = 4
26
- params[:translate] = false
27
- params[:language] = FFI::Pointer::NULL # Auto-detect language
23
+ full_params = default_full_params(params)
28
24
 
29
25
  # Prepare audio data pointer
30
26
  n_samples = audio_data.size
31
27
  samples_ptr = FFI::MemoryPointer.new(:float, n_samples)
32
- samples_ptr.write_array_of_float audio_data
28
+ samples_ptr.write_array_of_float(audio_data)
33
29
 
34
- # Call the whisper_full function
35
- result = Whisper.whisper_full @ctx, params, samples_ptr, n_samples
30
+ # Call the whisper_full_parallel function
31
+ n_processors = params.fetch :n_processors, ENV['WHISPER_N_PROCS']&.to_i || 1
32
+ result = Whisper.whisper_full_parallel(@ctx, full_params, samples_ptr, n_samples, n_processors)
36
33
  raise 'Transcription failed' if result != 0
37
34
 
38
35
  # Retrieve detected language
39
36
  lang_id = Whisper.whisper_full_lang_id(@ctx)
40
37
  language = Whisper.whisper_lang_str(lang_id)
41
38
 
42
- n_segments = Whisper.whisper_full_n_segments @ctx
39
+ # Retrieve the transcription output
40
+ n_segments = Whisper.whisper_full_n_segments(@ctx)
41
+ output = format_transcription(format, n_segments: n_segments)
42
+
43
+ TranscriptionResult.new(language, output)
44
+ end
45
+
46
+ def close
47
+ Whisper.whisper_free(@ctx) unless @ctx.nil?
48
+ end
49
+
50
+ private
51
+
52
+ def init_whisper_context(params)
53
+ return unless @ctx.nil?
54
+
55
+ ctx_params = Whisper.whisper_context_default_params
56
+
57
+ # Set user-provided context params
58
+ user_ctx_params = params.fetch(:context_params, {})
59
+ user_ctx_params.each do |key, value|
60
+ if ctx_params.members.include?(field)
61
+ ctx_params[key] = value
62
+ else
63
+ warn "Unknown context_param field: #{field}"
64
+ end
65
+ end
66
+ ctx_params[:gpu_device] = ENV['WHISPER_GPU']&.to_i || 0
67
+
68
+ # Initialize context
69
+ @ctx = Whisper.whisper_init_from_file_with_params(@model_path, ctx_params)
70
+ raise 'Failed to initialize Whisper model' if @ctx.null?
71
+ end
72
+
73
+ def default_full_params(params)
74
+ # Get default full params
75
+ strategy = params.fetch(:sampling_strategy, Whisper::WHISPER_SAMPLING_GREEDY)
76
+ full_params = Whisper.whisper_full_default_params(strategy)
77
+
78
+ # Set translate to false to prevent translation to English
79
+ full_params[:translate] = false
80
+
81
+ # Set user-provided full params
82
+ user_full_params = params.fetch(:full_params, {})
83
+ user_full_params.each do |key, value|
84
+ if full_params.members.include?(field)
85
+ full_params[key] = value
86
+ else
87
+ warn "Unknown full_param field: #{field}"
88
+ end
89
+ end
90
+
91
+ full_params
92
+ end
93
+
94
+ def format_transcription(format, n_segments:)
43
95
  output = ''
44
96
  case format.downcase
45
97
  when 'plaintext'
46
98
  n_segments.times do |i|
47
- segment_text = Whisper.whisper_full_get_segment_text @ctx, i
99
+ segment_text = Whisper.whisper_full_get_segment_text(@ctx, i)
48
100
  output += segment_text
49
101
  end
50
102
  when 'srt'
51
103
  n_segments.times do |i|
52
- start_time = Whisper.whisper_full_get_segment_t0(@ctx, i) / 100.0
53
- end_time = Whisper.whisper_full_get_segment_t1(@ctx, i) / 100.0
54
- segment_text = Whisper.whisper_full_get_segment_text @ctx, i
104
+ start_time = Whisper.whisper_full_get_segment_t0(@ctx, i) / 100.0
105
+ end_time = Whisper.whisper_full_get_segment_t1(@ctx, i) / 100.0
106
+ segment_text = Whisper.whisper_full_get_segment_text(@ctx, i)
55
107
 
56
108
  output += "#{i + 1}\n"
57
- output += "#{format_time_srt start_time} --> #{format_time_srt end_time}\n"
109
+ output += "#{format_time_srt(start_time)} --> #{format_time_srt(end_time)}\n"
58
110
  output += "#{segment_text.strip}\n\n"
59
111
  end
60
112
  else
61
113
  raise "Unsupported format: #{format}"
62
114
  end
63
-
64
- TranscriptionResult.new language, output
65
- end
66
-
67
- def close
68
- Whisper.whisper_free @ctx
115
+ output
69
116
  end
70
117
 
71
- private
72
-
73
- def format_time_srt seconds
74
- hours = (seconds / 3600).to_i
118
+ def format_time_srt(seconds)
119
+ hours = (seconds / 3600).to_i
75
120
  minutes = ((seconds % 3600) / 60).to_i
76
- secs = (seconds % 60).to_i
77
- millis = ((seconds - seconds.to_i) * 1000).to_i
78
- format '%02d:%02d:%02d,%03d', hours, minutes, secs, millis
121
+ secs = (seconds % 60).to_i
122
+ millis = ((seconds - seconds.to_i) * 1000).to_i
123
+ format('%02d:%02d:%02d,%03d', hours, minutes, secs, millis)
79
124
  end
80
-
81
125
  end
82
126
  end
83
127
 
@@ -0,0 +1,5 @@
1
module Whisper
  # Version string of the gem; frozen so the constant cannot be mutated in place.
  VERSION = '0.3.1'.freeze
end
data/lib/whisper.rb CHANGED
@@ -29,6 +29,20 @@ module Whisper
29
29
  :WHISPER_AHEADS_LARGE_V3
30
30
  ]
31
31
 
32
+ # Enums for sampling strategy
33
+ enum :whisper_sampling_strategy, [
34
+ :WHISPER_SAMPLING_GREEDY,
35
+ :WHISPER_SAMPLING_BEAM_SEARCH
36
+ ]
37
+
38
+ # Callbacks
39
+ callback :whisper_new_segment_callback, [:pointer, :pointer, :int, :pointer], :void
40
+ callback :whisper_progress_callback, [:pointer, :pointer, :int, :pointer], :void
41
+ callback :whisper_encoder_begin_callback, [:pointer, :pointer, :pointer], :bool
42
+ callback :whisper_logits_filter_callback, [:pointer, :pointer, :pointer, :int, :pointer, :pointer], :void
43
+ callback :ggml_abort_callback, [:pointer], :bool
44
+ callback :ggml_log_callback, [:int, :string, :pointer], :void
45
+
32
46
  # Structs Definitions
33
47
 
34
48
  # whisper_ahead struct
@@ -77,6 +91,28 @@ module Whisper
77
91
  )
78
92
  end
79
93
 
94
+ # whisper_model_loader struct
95
+ #class WhisperModelLoader < FFI::Struct
96
+ # callback :read_callback, [:pointer, :pointer, :size_t], :size_t
97
+ # callback :eof_callback, [:pointer], :bool
98
+ # callback :close_callback, [:pointer], :void
99
+
100
+ # layout(
101
+ # :context, :pointer,
102
+ # :read, :read_callback,
103
+ # :eof, :eof_callback,
104
+ # :close, :close_callback
105
+ # )
106
+ #end
107
+
108
+ # whisper_grammar_element struct
109
# FFI mapping of the whisper_grammar_element struct:
# an :int +type+ field followed by a :uint32 +value+ field.
class WhisperGrammarElement < FFI::Struct
  layout :type,  :int,
         :value, :uint32
end
115
+
80
116
  # greedy sampling parameters
81
117
  class WhisperGreedyParams < FFI::Struct
82
118
  layout(
@@ -134,15 +170,15 @@ module Whisper
134
170
  :no_speech_thold, :float,
135
171
  :greedy, WhisperGreedyParams,
136
172
  :beam_search, WhisperBeamSearchParams,
137
- :new_segment_callback, :pointer,
173
+ :new_segment_callback, :whisper_new_segment_callback,
138
174
  :new_segment_callback_user_data, :pointer,
139
- :progress_callback, :pointer,
175
+ :progress_callback, :whisper_progress_callback,
140
176
  :progress_callback_user_data, :pointer,
141
- :encoder_begin_callback, :pointer,
177
+ :encoder_begin_callback, :whisper_encoder_begin_callback,
142
178
  :encoder_begin_callback_user_data, :pointer,
143
- :abort_callback, :pointer,
179
+ :abort_callback, :ggml_abort_callback,
144
180
  :abort_callback_user_data, :pointer,
145
- :logits_filter_callback, :pointer,
181
+ :logits_filter_callback, :whisper_logits_filter_callback,
146
182
  :logits_filter_callback_user_data, :pointer,
147
183
  :grammar_rules, :pointer,
148
184
  :n_grammar_rules, :size_t,
@@ -151,41 +187,162 @@ module Whisper
151
187
  )
152
188
  end
153
189
 
190
+ # Get default context params
191
+ attach_function :whisper_context_default_params, [], WhisperContextParams.by_value
192
+ # Get default full params
193
+ attach_function :whisper_full_default_params, [:int], WhisperFullParams.by_value
194
+
154
195
  # Function Bindings
155
196
 
156
197
  # Initialize context with params
157
198
  attach_function :whisper_init_from_file_with_params, [:string, WhisperContextParams.by_value], :pointer
199
+ attach_function :whisper_init_from_buffer_with_params, [:pointer, :size_t, WhisperContextParams.by_value], :pointer
200
+ #attach_function :whisper_init_with_params, [WhisperModelLoader.by_ref, WhisperContextParams.by_value], :pointer
158
201
 
159
- # Get default context params
160
- attach_function :whisper_context_default_params, [], WhisperContextParams.by_value
202
+ # Initialize context without state
203
+ attach_function :whisper_init_from_file_with_params_no_state, [:string, WhisperContextParams.by_value], :pointer
204
+ attach_function :whisper_init_from_buffer_with_params_no_state, [:pointer, :size_t, WhisperContextParams.by_value], :pointer
205
+ #attach_function :whisper_init_with_params_no_state, [WhisperModelLoader.by_ref, WhisperContextParams.by_value], :pointer
161
206
 
162
- # Get default full params
163
- attach_function :whisper_full_default_params, [:int], WhisperFullParams.by_value
207
+ # Initialize state
208
+ attach_function :whisper_init_state, [:pointer], :pointer
209
+
210
+ # OpenVINO functions
211
+ #attach_function :whisper_ctx_init_openvino_encoder_with_state, [:pointer, :pointer, :string, :string, :string], :int
212
+ #attach_function :whisper_ctx_init_openvino_encoder, [:pointer, :string, :string, :string], :int
164
213
 
165
214
  # Free functions
166
215
  attach_function :whisper_free, [:pointer], :void
216
+ attach_function :whisper_free_state, [:pointer], :void
217
+ attach_function :whisper_free_params, [:pointer], :void
218
+ attach_function :whisper_free_context_params, [:pointer], :void
219
+
220
+ # PCM to Mel spectrogram
221
+ attach_function :whisper_pcm_to_mel, [:pointer, :pointer, :int, :int], :int
222
+ attach_function :whisper_pcm_to_mel_with_state, [:pointer, :pointer, :pointer, :int, :int], :int
223
+
224
+ # Set custom Mel spectrogram
225
+ attach_function :whisper_set_mel, [:pointer, :pointer, :int, :int], :int
226
+ attach_function :whisper_set_mel_with_state, [:pointer, :pointer, :pointer, :int, :int], :int
227
+
228
+ # Encode
229
+ attach_function :whisper_encode, [:pointer, :int, :int], :int
230
+ attach_function :whisper_encode_with_state, [:pointer, :pointer, :int, :int], :int
231
+
232
+ # Decode
233
+ attach_function :whisper_decode, [:pointer, :pointer, :int, :int, :int], :int
234
+ attach_function :whisper_decode_with_state, [:pointer, :pointer, :pointer, :int, :int, :int], :int
235
+
236
+ # Tokenize
237
+ attach_function :whisper_tokenize, [:pointer, :string, :pointer, :int], :int
238
+ attach_function :whisper_token_count, [:pointer, :string], :int
239
+
240
+ # Language functions
241
+ attach_function :whisper_lang_max_id, [], :int
242
+ attach_function :whisper_lang_id, [:string], :int
243
+ attach_function :whisper_lang_str, [:int], :string
244
+ attach_function :whisper_lang_str_full, [:int], :string
245
+
246
+ # Auto-detect language
247
+ attach_function :whisper_lang_auto_detect, [:pointer, :int, :int, :pointer], :int
248
+ attach_function :whisper_lang_auto_detect_with_state, [:pointer, :pointer, :int, :int, :pointer], :int
249
+
250
+ # Model info
251
+ attach_function :whisper_n_len, [:pointer], :int
252
+ attach_function :whisper_n_len_from_state, [:pointer], :int
253
+ attach_function :whisper_n_vocab, [:pointer], :int
254
+ attach_function :whisper_n_text_ctx, [:pointer], :int
255
+ attach_function :whisper_n_audio_ctx, [:pointer], :int
256
+ attach_function :whisper_is_multilingual, [:pointer], :int
257
+
258
+ attach_function :whisper_model_n_vocab, [:pointer], :int
259
+ attach_function :whisper_model_n_audio_ctx, [:pointer], :int
260
+ attach_function :whisper_model_n_audio_state, [:pointer], :int
261
+ attach_function :whisper_model_n_audio_head, [:pointer], :int
262
+ attach_function :whisper_model_n_audio_layer, [:pointer], :int
263
+ attach_function :whisper_model_n_text_ctx, [:pointer], :int
264
+ attach_function :whisper_model_n_text_state, [:pointer], :int
265
+ attach_function :whisper_model_n_text_head, [:pointer], :int
266
+ attach_function :whisper_model_n_text_layer, [:pointer], :int
267
+ attach_function :whisper_model_n_mels, [:pointer], :int
268
+ attach_function :whisper_model_ftype, [:pointer], :int
269
+ attach_function :whisper_model_type, [:pointer], :int
270
+
271
+ # Get logits
272
+ attach_function :whisper_get_logits, [:pointer], :pointer
273
+ attach_function :whisper_get_logits_from_state, [:pointer], :pointer
274
+
275
+ # Token functions
276
+ attach_function :whisper_token_to_str, [:pointer, :int32], :string
277
+ attach_function :whisper_model_type_readable, [:pointer], :string
278
+
279
+ # Special tokens
280
+ attach_function :whisper_token_eot, [:pointer], :int32
281
+ attach_function :whisper_token_sot, [:pointer], :int32
282
+ attach_function :whisper_token_solm, [:pointer], :int32
283
+ attach_function :whisper_token_prev, [:pointer], :int32
284
+ attach_function :whisper_token_nosp, [:pointer], :int32
285
+ attach_function :whisper_token_not, [:pointer], :int32
286
+ attach_function :whisper_token_beg, [:pointer], :int32
287
+ attach_function :whisper_token_lang, [:pointer, :int], :int32
288
+
289
+ # Task tokens
290
+ attach_function :whisper_token_translate, [:pointer], :int32
291
+ attach_function :whisper_token_transcribe, [:pointer], :int32
292
+
293
+ # Timings
294
+ attach_function :whisper_print_timings, [:pointer], :void
295
+ attach_function :whisper_reset_timings, [:pointer], :void
296
+ attach_function :whisper_print_system_info, [], :string
167
297
 
168
298
  # Full transcription function
169
299
  attach_function :whisper_full, [:pointer, WhisperFullParams.by_value, :pointer, :int], :int
300
+ attach_function :whisper_full_with_state, [:pointer, :pointer, WhisperFullParams.by_value, :pointer, :int], :int
301
+
302
+ # Parallel processing
303
+ attach_function :whisper_full_parallel, [:pointer, WhisperFullParams.by_value, :pointer, :int, :int], :int
170
304
 
171
305
  # Number of segments
172
306
  attach_function :whisper_full_n_segments, [:pointer], :int
307
+ attach_function :whisper_full_n_segments_from_state, [:pointer], :int
173
308
 
174
- # Get segment text
175
- attach_function :whisper_full_get_segment_text, [:pointer, :int], :string
176
-
177
- # Get segment start and end times
309
+ # Get segment info
178
310
  attach_function :whisper_full_get_segment_t0, [:pointer, :int], :int64
311
+ attach_function :whisper_full_get_segment_t0_from_state, [:pointer, :int], :int64
312
+
179
313
  attach_function :whisper_full_get_segment_t1, [:pointer, :int], :int64
314
+ attach_function :whisper_full_get_segment_t1_from_state, [:pointer, :int], :int64
180
315
 
181
- # Get detected language ID
182
- attach_function :whisper_full_lang_id, [:pointer], :int
316
+ attach_function :whisper_full_get_segment_speaker_turn_next, [:pointer, :int], :bool
317
+ attach_function :whisper_full_get_segment_speaker_turn_next_from_state, [:pointer, :int], :bool
183
318
 
184
- # Convert language ID to string
185
- attach_function :whisper_lang_str, [:int], :string
319
+ attach_function :whisper_full_get_segment_text, [:pointer, :int], :string
320
+ attach_function :whisper_full_get_segment_text_from_state, [:pointer, :int], :string
186
321
 
187
- # void log_callback(int level, const char *msg, void *user_data);
188
- callback :ggml_log_callback, [:int, :string, :pointer], :void
322
+ attach_function :whisper_full_n_tokens, [:pointer, :int], :int
323
+ attach_function :whisper_full_n_tokens_from_state, [:pointer, :int], :int
324
+
325
+ attach_function :whisper_full_get_token_text, [:pointer, :int, :int], :string
326
+ attach_function :whisper_full_get_token_text_from_state, [:pointer, :pointer, :int, :int], :string
327
+
328
+ attach_function :whisper_full_get_token_id, [:pointer, :int, :int], :int32
329
+ attach_function :whisper_full_get_token_id_from_state, [:pointer, :int, :int], :int32
330
+
331
+ attach_function :whisper_full_get_token_data, [:pointer, :int, :int], WhisperTokenData.by_value
332
+ attach_function :whisper_full_get_token_data_from_state, [:pointer, :int, :int], WhisperTokenData.by_value
333
+
334
+ attach_function :whisper_full_get_token_p, [:pointer, :int, :int], :float
335
+ attach_function :whisper_full_get_token_p_from_state, [:pointer, :int, :int], :float
336
+
337
+ # Language ID
338
+ attach_function :whisper_full_lang_id, [:pointer], :int
339
+ attach_function :whisper_full_lang_id_from_state, [:pointer], :int
340
+
341
+ # Benchmarks
342
+ attach_function :whisper_bench_memcpy, [:int], :int
343
+ attach_function :whisper_bench_memcpy_str, [:int], :string
344
+ attach_function :whisper_bench_ggml_mul_mat, [:int], :int
345
+ attach_function :whisper_bench_ggml_mul_mat_str, [:int], :string
189
346
 
190
347
  # Set the log callback
191
348
  attach_function :whisper_log_set, [:ggml_log_callback, :pointer], :void
@@ -196,6 +353,5 @@ module Whisper
196
353
  end
197
354
  # Set the no-op log callback to suppress logging
198
355
  Whisper.whisper_log_set NOOP_LOG_CALLBACK, FFI::Pointer::NULL unless ENV['WHISPER_DEBUG']
199
-
200
356
  end
201
357
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whisper.cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Braulio Oliveira
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-09-30 00:00:00.000000000 Z
11
+ date: 2024-10-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -73,6 +73,7 @@ files:
73
73
  - lib/whisper.rb
74
74
  - lib/whisper/audio_processor.rb
75
75
  - lib/whisper/model.rb
76
+ - lib/whisper/version.rb
76
77
  - script/console
77
78
  - whisper.cpp.gemspec
78
79
  homepage: