GameSentenceMiner 2.10.17__py3-none-any.whl → 2.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
GameSentenceMiner/vad.py CHANGED
@@ -14,8 +14,8 @@ class VADSystem:
     def __init__(self):
         self.silero = None
         self.whisper = None
-        self.vosk = None
-        self.groq = None
+        # self.vosk = None
+        # self.groq = None
 
     def init(self):
         if get_config().vad.is_whisper():
@@ -24,12 +24,12 @@ class VADSystem:
         if get_config().vad.is_silero():
             if not self.silero:
                 self.silero = SileroVADProcessor()
-        if get_config().vad.is_vosk():
-            if not self.vosk:
-                self.vosk = VoskVADProcessor()
-        if get_config().vad.is_groq():
-            if not self.groq:
-                self.groq = GroqVADProcessor()
+        # if get_config().vad.is_vosk():
+        #     if not self.vosk:
+        #         self.vosk = VoskVADProcessor()
+        # if get_config().vad.is_groq():
+        #     if not self.groq:
+        #         self.groq = GroqVADProcessor()
 
     def trim_audio_with_vad(self, input_audio, output_audio, game_line):
         if get_config().vad.do_vad_postprocessing:
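These first two hunks drop the eager Vosk/Groq setup from `VADSystem` and leave only lazy construction of the remaining backends. A minimal, runnable sketch of that lazy-init pattern (the stub classes and boolean flags below stand in for the real processors and the `get_config().vad.is_*()` checks):

    class SileroVADProcessor:
        pass

    class WhisperVADProcessor:
        pass

    class VADSystem:
        def __init__(self):
            self.silero = None
            self.whisper = None

        def init(self, use_silero=True, use_whisper=False):
            # Each processor is built at most once, on first request.
            if use_silero and not self.silero:
                self.silero = SileroVADProcessor()
            if use_whisper and not self.whisper:
                self.whisper = WhisperVADProcessor()

    system = VADSystem()
    system.init()
    assert system.silero is not None and system.whisper is None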
@@ -53,18 +53,18 @@ class VADSystem:
         match model:
             case configuration.OFF:
                 return VADResult(False, 0, 0, "OFF")
-            case configuration.GROQ:
-                if not self.groq:
-                    self.groq = GroqVADProcessor()
-                return self.groq.process_audio(input_audio, output_audio, game_line)
+            # case configuration.GROQ:
+            #     if not self.groq:
+            #         self.groq = GroqVADProcessor()
+            #     return self.groq.process_audio(input_audio, output_audio, game_line)
             case configuration.SILERO:
                 if not self.silero:
                     self.silero = SileroVADProcessor()
                 return self.silero.process_audio(input_audio, output_audio, game_line)
-            case configuration.VOSK:
-                if not self.vosk:
-                    self.vosk = VoskVADProcessor()
-                return self.vosk.process_audio(input_audio, output_audio, game_line)
+            # case configuration.VOSK:
+            #     if not self.vosk:
+            #         self.vosk = VoskVADProcessor()
+            #     return self.vosk.process_audio(input_audio, output_audio, game_line)
             case configuration.WHISPER:
                 if not self.whisper:
                     self.whisper = WhisperVADProcessor()
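With the GROQ and VOSK arms commented out, a `model` value of `configuration.GROQ` or `configuration.VOSK` matches no case, so the surrounding method now implicitly returns None for those settings. A small sketch of the fall-through (string constants stand in for the `configuration` enum):

    OFF, SILERO, WHISPER, VOSK, GROQ = "OFF", "SILERO", "WHISPER", "VOSK", "GROQ"

    def dispatch(model):
        match model:
            case "OFF":
                return "VADResult(False, ...)"
            case "SILERO" | "WHISPER":
                return f"VADResult from {model}"
        # No VOSK/GROQ arm and no `case _:` default, so those values
        # fall through and the function returns None.

    print(dispatch(GROQ))  # -> None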
@@ -121,6 +121,8 @@ class VADProcessor(ABC):
             logger.info("No voice activity detected in the audio.")
             return VADResult(False, 0, 0, self.vad_system_name)
 
+        print(voice_activity)
+
         start_time = voice_activity[0]['start'] if voice_activity else 0
         end_time = voice_activity[-1]['end'] if voice_activity else 0
 
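The added `print(voice_activity)` writes the raw segment list to stdout unconditionally. Since the module already routes diagnostics through `logger`, a debug-level call would expose the same data behind the log level; a minimal sketch (sample dict shape taken from this module):

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    voice_activity = [{'text': 'こん', 'start': 0.5, 'end': 1.2}]
    logger.debug("voice_activity: %s", voice_activity)  # emitted only at DEBUG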
@@ -133,7 +135,7 @@ class VADProcessor(ABC):
         if get_config().vad.cut_and_splice_segments:
             self.extract_audio_and_combine_segments(input_audio, voice_activity, output_audio, padding=get_config().vad.splice_padding)
         else:
-            ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio, trim_beginning=get_config().vad.trim_beginning, fade_in_duration=0, fade_out_duration=0)
+            ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio, trim_beginning=get_config().vad.trim_beginning, fade_in_duration=0.05, fade_out_duration=00)
         return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, self.vad_system_name, voice_activity, output_audio)
 
 class SileroVADProcessor(VADProcessor):
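In the new `ffmpeg.trim_audio` call, `fade_out_duration=00` reads like a typo for `0` but is still valid Python: a literal consisting only of zeros may carry leading zeros (`01` would be a SyntaxError), so it evaluates to the integer 0 and only the 50 ms fade-in changes behavior:

    assert 00 == 0           # legal: leading zeros are allowed only for zero itself
    fade_in_duration = 0.05  # new 50 ms fade-in
    fade_out_duration = 00   # evaluates to plain int 0, i.e. no fade-out
    print(fade_in_duration, fade_out_duration)  # 0.05 0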
@@ -145,10 +147,10 @@ class SileroVADProcessor(VADProcessor):
 
     def _detect_voice_activity(self, input_audio):
         from silero_vad import read_audio, get_speech_timestamps
-        temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
-        ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
-        wav = read_audio(temp_wav)
-        speech_timestamps = get_speech_timestamps(wav, self.vad_model, return_seconds=True)
+        # temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
+        # ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
+        wav = read_audio(input_audio)
+        speech_timestamps = get_speech_timestamps(wav, self.vad_model, return_seconds=True, threshold=0.2)
         logger.debug(speech_timestamps)
         return speech_timestamps
 
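This hunk feeds the input file straight to `read_audio` instead of converting it to WAV first, and lowers the detection threshold from the library default of 0.5 to 0.2, which keeps quieter speech at the cost of more false positives on noise. A standalone sketch of the same calls, assuming the `silero-vad` pip package and a placeholder input path (format support depends on the installed audio backend):

    from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

    model = load_silero_vad()
    wav = read_audio("example.opus")  # decodes and resamples to 16 kHz mono
    timestamps = get_speech_timestamps(wav, model, return_seconds=True, threshold=0.2)
    print(timestamps)  # e.g. [{'start': 0.3, 'end': 2.1}, ...]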
@@ -161,7 +163,7 @@ class WhisperVADProcessor(VADProcessor):
     def load_whisper_model(self):
         import stable_whisper as whisper
         if not self.vad_model:
-            with warnings.catch_warnings(action="ignore"):
+            with warnings.catch_warnings():
                 self.vad_model = whisper.load_model(get_config().vad.whisper_model)
             logger.info(f"Whisper model '{get_config().vad.whisper_model}' loaded.")
         return self.vad_model
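`warnings.catch_warnings(action="ignore")` requires Python 3.11+, so dropping the argument restores compatibility with older interpreters. Note, though, that a bare `catch_warnings()` only saves and restores the filter state; it suppresses nothing by itself. The portable equivalent of the old behavior needs an explicit `simplefilter` inside the block:

    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # suppress within the block
        warnings.warn("hidden")          # not shown
    warnings.warn("visible")             # filters restored; shown again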
@@ -169,21 +171,29 @@ class WhisperVADProcessor(VADProcessor):
     def _detect_voice_activity(self, input_audio):
         from stable_whisper import WhisperResult
         # Convert the audio to 16kHz mono WAV
-        temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
-        ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
+        # temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
+        # ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
 
         logger.info('transcribing audio...')
 
         # Transcribe the audio using Whisper
-        with warnings.catch_warnings(action="ignore"):
-            result: WhisperResult = self.vad_model.transcribe(temp_wav, vad=True, language=get_config().vad.language,
+        with warnings.catch_warnings():
+            result: WhisperResult = self.vad_model.transcribe(input_audio, vad=True, language=get_config().vad.language,
                                                               temperature=0.0)
         voice_activity = []
 
         logger.debug(result.to_dict())
 
         # Process the segments to extract tokens, timestamps, and confidence
-        for segment in result.segments:
+        for i, segment in enumerate(result.segments):
+            if len(segment.text) == 1 and (i > 1 and segment.start - result.segments[i - 1].end > 1.0) or (i < len(result.segments) - 1 and result.segments[i + 1].start - segment.end > 1.0):
+                if segment.text in ['えー', 'ん']:
+                    logger.debug(f"Skipping filler segment: {segment.text} at {segment.start}-{segment.end}")
+                    continue
+                else:
+                    logger.info(
+                        "Unknown single character segment, not skipping, but logging, please report if this is a mistake: " + segment.text)
+
             logger.debug(segment.to_dict())
             voice_activity.append({
                 'text': segment.text,
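One caution about the new filler-skipping guard: `and` binds tighter than `or` in Python, so the unparenthesized condition parses as `(len(segment.text) == 1 and gap_before) or gap_after`; the single-character check does not constrain the `gap_after` branch, which is presumably also how the two-character filler 'えー' can reach the `in ['えー', 'ん']` test at all. A quick demonstration of the grouping:

    x, a, b = False, True, True
    assert (x and a or b) == ((x and a) or b)  # how Python groups it
    assert (x and a or b) != (x and (a or b))  # not this way
    # "single char AND (gap before OR gap after)" would need parentheses:
    # if len(segment.text) == 1 and (gap_before or gap_after): ...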
@@ -216,154 +226,179 @@ class WhisperVADProcessor(VADProcessor):
         return voice_activity
 
 # Add a new class for Vosk-based VAD
-class VoskVADProcessor(VADProcessor):
-    def __init__(self):
-        super().__init__()
-        self.vad_model = self._load_vosk_model()
-        self.vad_system_name = VOSK
-
-    def _load_vosk_model(self):
-        if not self.vad_model:
-            import vosk
-            vosk_model_path = self._download_and_cache_vosk_model()
-            self.vad_model = vosk.Model(vosk_model_path)
-            logger.info(f"Vosk model loaded from {vosk_model_path}")
-        return self.vad_model
-
-    def _download_and_cache_vosk_model(self, model_dir="vosk_model_cache"):
-        # Ensure the cache directory exists
-        import requests
-        import zipfile
-        import tarfile
-        if not os.path.exists(os.path.join(get_app_directory(), model_dir)):
-            os.makedirs(os.path.join(get_app_directory(), model_dir))
-
-        # Extract the model name from the URL
-        model_filename = get_config().vad.vosk_url.split("/")[-1]
-        model_path = os.path.join(get_app_directory(), model_dir, model_filename)
-
-        # If the model is already downloaded, skip the download
-        if not os.path.exists(model_path):
-            logger.info(
-                f"Downloading the Vosk model from {get_config().vad.vosk_url}... This will take a while if using large model, ~1G")
-            response = requests.get(get_config().vad.vosk_url, stream=True)
-            with open(model_path, "wb") as file:
-                for chunk in response.iter_content(chunk_size=8192):
-                    if chunk:
-                        file.write(chunk)
-            logger.info("Download complete.")
-
-        # Extract the model if it's a zip or tar file
-        model_extract_path = os.path.join(get_app_directory(), model_dir, "vosk_model")
-        if not os.path.exists(model_extract_path):
-            logger.info("Extracting the Vosk model...")
-            if model_filename.endswith(".zip"):
-                with zipfile.ZipFile(model_path, "r") as zip_ref:
-                    zip_ref.extractall(model_extract_path)
-            elif model_filename.endswith(".tar.gz"):
-                with tarfile.open(model_path, "r:gz") as tar_ref:
-                    tar_ref.extractall(model_extract_path)
-            else:
-                logger.info("Unknown archive format. Model extraction skipped.")
-            logger.info(f"Model extracted to {model_extract_path}.")
-        else:
-            logger.info(f"Model already extracted at {model_extract_path}.")
-
-        # Return the path to the actual model folder inside the extraction directory
-        extracted_folders = os.listdir(model_extract_path)
-        if extracted_folders:
-            actual_model_folder = os.path.join(model_extract_path,
-                                               extracted_folders[0])  # Assuming the first folder is the model
-            return actual_model_folder
-        else:
-            return model_extract_path  # In case there's no subfolder, return the extraction path directly
-
-    def _detect_voice_activity(self, input_audio):
-        import soundfile as sf
-        import vosk
-        import numpy as np
-        # Convert the audio to 16kHz mono WAV
-        temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
-        ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
-
-        # Initialize recognizer
-        with sf.SoundFile(temp_wav) as audio_file:
-            recognizer = vosk.KaldiRecognizer(self.vad_model, audio_file.samplerate)
-            voice_activity = []
-
-            recognizer.SetWords(True)
-
-            # Process audio in chunks
-            while True:
-                data = audio_file.buffer_read(4000, dtype='int16')
-                if len(data) == 0:
-                    break
-
-                # Convert buffer to bytes using NumPy
-                data_bytes = np.frombuffer(data, dtype='int16').tobytes()
-
-                if recognizer.AcceptWaveform(data_bytes):
-                    pass
-
-            final_result = json.loads(recognizer.FinalResult())
-            if 'result' in final_result:
-                for word in final_result['result']:
-                    if word['conf'] >= 0.90:
-                        voice_activity.append({
-                            'text': word['word'],
-                            'start': word['start'],
-                            'end': word['end']
-                        })
-
-        # Return the detected voice activity
-        return voice_activity
-
-class GroqVADProcessor(VADProcessor):
-    def __init__(self):
-        super().__init__()
-        from groq import Groq
-        self.client = Groq(api_key=get_config().ai.groq_api_key)
-        self.vad_model = self.load_groq_model()
-        self.vad_system_name = GROQ
-
-    def load_groq_model(self):
-        if not self.vad_model:
-            from groq import Groq
-            self.vad_model = Groq()
-            logger.info("Groq model loaded.")
-        return self.vad_model
-
-    def _detect_voice_activity(self, input_audio):
-        try:
-            with open(input_audio, "rb") as file:
-                transcription = self.client.audio.transcriptions.create(
-                    file=(os.path.basename(input_audio), file.read()),
-                    model="whisper-large-v3-turbo",
-                    response_format="verbose_json",
-                    language=get_config().vad.language,
-                    temperature=0.0,
-                    timestamp_granularities=["segment"],
-                    prompt=f"Start detecting speech from the first spoken word. If there is music or background noise, ignore it completely. Be very careful to not hallucinate on silence. If the transcription is anything but language:{get_config().vad.language}, ignore it completely. If the end of the audio seems like the start of a new sentence, ignore it completely.",
-                )
-
-            logger.debug(transcription)
-            speech_segments = []
-            if hasattr(transcription, 'segments'):
-                speech_segments = transcription.segments
-            elif hasattr(transcription, 'words'):
-                speech_segments = transcription.words
-            return speech_segments
-        except Exception as e:
-            logger.error(f"Error detecting voice with Groq: {e}")
-            return [], 0.0
+# class VoskVADProcessor(VADProcessor):
+#     def __init__(self):
+#         super().__init__()
+#         self.vad_model = self._load_vosk_model()
+#         self.vad_system_name = VOSK
+#
+#     def _load_vosk_model(self):
+#         if not self.vad_model:
+#             import vosk
+#             vosk_model_path = self._download_and_cache_vosk_model()
+#             self.vad_model = vosk.Model(vosk_model_path)
+#             logger.info(f"Vosk model loaded from {vosk_model_path}")
+#         return self.vad_model
+#
+#     def _download_and_cache_vosk_model(self, model_dir="vosk_model_cache"):
+#         # Ensure the cache directory exists
+#         import requests
+#         import zipfile
+#         import tarfile
+#         if not os.path.exists(os.path.join(get_app_directory(), model_dir)):
+#             os.makedirs(os.path.join(get_app_directory(), model_dir))
+#
+#         # Extract the model name from the URL
+#         model_filename = get_config().vad.vosk_url.split("/")[-1]
+#         model_path = os.path.join(get_app_directory(), model_dir, model_filename)
+#
+#         # If the model is already downloaded, skip the download
+#         if not os.path.exists(model_path):
+#             logger.info(
+#                 f"Downloading the Vosk model from {get_config().vad.vosk_url}... This will take a while if using large model, ~1G")
+#             response = requests.get(get_config().vad.vosk_url, stream=True)
+#             with open(model_path, "wb") as file:
+#                 for chunk in response.iter_content(chunk_size=8192):
+#                     if chunk:
+#                         file.write(chunk)
+#             logger.info("Download complete.")
+#
+#         # Extract the model if it's a zip or tar file
+#         model_extract_path = os.path.join(get_app_directory(), model_dir, "vosk_model")
+#         if not os.path.exists(model_extract_path):
+#             logger.info("Extracting the Vosk model...")
+#             if model_filename.endswith(".zip"):
+#                 with zipfile.ZipFile(model_path, "r") as zip_ref:
+#                     zip_ref.extractall(model_extract_path)
+#             elif model_filename.endswith(".tar.gz"):
+#                 with tarfile.open(model_path, "r:gz") as tar_ref:
+#                     tar_ref.extractall(model_extract_path)
+#             else:
+#                 logger.info("Unknown archive format. Model extraction skipped.")
+#             logger.info(f"Model extracted to {model_extract_path}.")
+#         else:
+#             logger.info(f"Model already extracted at {model_extract_path}.")
+#
+#         # Return the path to the actual model folder inside the extraction directory
+#         extracted_folders = os.listdir(model_extract_path)
+#         if extracted_folders:
+#             actual_model_folder = os.path.join(model_extract_path,
+#                                                extracted_folders[0])  # Assuming the first folder is the model
+#             return actual_model_folder
+#         else:
+#             return model_extract_path  # In case there's no subfolder, return the extraction path directly
+#
+#     def _detect_voice_activity(self, input_audio):
+#         import soundfile as sf
+#         import vosk
+#         import numpy as np
+#         # Convert the audio to 16kHz mono WAV
+#         temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
+#         ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
+#
+#         # Initialize recognizer
+#         with sf.SoundFile(temp_wav) as audio_file:
+#             recognizer = vosk.KaldiRecognizer(self.vad_model, audio_file.samplerate)
+#             voice_activity = []
+#
+#             recognizer.SetWords(True)
+#
+#             # Process audio in chunks
+#             while True:
+#                 data = audio_file.buffer_read(4000, dtype='int16')
+#                 if len(data) == 0:
+#                     break
+#
+#                 # Convert buffer to bytes using NumPy
+#                 data_bytes = np.frombuffer(data, dtype='int16').tobytes()
+#
+#                 if recognizer.AcceptWaveform(data_bytes):
+#                     pass
+#
+#             final_result = json.loads(recognizer.FinalResult())
+#             if 'result' in final_result:
+#                 for word in final_result['result']:
+#                     if word['conf'] >= 0.90:
+#                         voice_activity.append({
+#                             'text': word['word'],
+#                             'start': word['start'],
+#                             'end': word['end']
+#                         })
+#
+#         # Return the detected voice activity
+#         return voice_activity
+
+# class GroqVADProcessor(VADProcessor):
+#     def __init__(self):
+#         super().__init__()
+#         self.client = self.load_groq_model()
+#         self.vad_system_name = GROQ
+#
+#     def load_groq_model(self):
+#         if not hasattr(self, 'client') or not self.client:
+#             from groq import Groq
+#             client = Groq(api_key=get_config().ai.groq_api_key)
+#             logger.info("Groq model loaded.")
+#             return client
+#         return self.client
+#
+#     def _detect_voice_activity(self, input_audio):
+#         try:
+#             with open(input_audio, "rb") as file:
+#                 transcription = self.client.audio.transcriptions.create(
+#                     file=(os.path.basename(input_audio), file.read()),
+#                     model="whisper-large-v3-turbo",
+#                     response_format="verbose_json",
+#                     language=get_config().vad.language,
+#                     temperature=0.0,
+#                     timestamp_granularities=["segment"],
+#                     prompt=f"Start detecting speech from the first spoken word. If there is music or background noise, ignore it completely. Be very careful to not hallucinate on silence. If the transcription is anything but language:{get_config().vad.language}, ignore it completely. If the end of the audio seems like the start of a new sentence, ignore it completely.",
+#                 )
+#
+#             logger.debug(transcription)
+#             speech_segments = []
+#             if hasattr(transcription, 'segments'):
+#                 speech_segments = transcription.segments
+#             elif hasattr(transcription, 'words'):
+#                 speech_segments = transcription.words
+#             return speech_segments
+#         except Exception as e:
+#             logger.error(f"Error detecting voice with Groq: {e}")
+#             return [], 0.0
 
 
 vad_processor = VADSystem()
 
-# test_vad = WhisperVADProcessor()
-#
-# if os.path.exists(r"C:\Users\Beangate\GSM\Electron App\test\after_splice.opus"):
-#     os.remove(r"C:\Users\Beangate\GSM\Electron App\test\after_splice.opus")
-# get_config().vad.cut_and_splice_segments = True
-# get_config().vad.splice_padding = 0.3
-# test_vad.process_audio(r"C:\Users\Beangate\GSM\Electron App\test\temp_audio.opus", r"C:\Users\Beangate\GSM\Electron App\test\after_splice.opus", None)
+# Test cases for all VADProcessors
+def test_vad_processors():
+    logger.setLevel(logging.DEBUG)
+    test_audio = r"C:\Users\Beangate\GSM\Electron App\test\tmptnl4a93q_untrimmed.opus"
+    output_dir = r"C:\Users\Beangate\GSM\Electron App\test\output"
+    os.makedirs(output_dir, exist_ok=True)
+    processors = [
+        (WhisperVADProcessor(), "after_splice_whisper.opus"),
+        (SileroVADProcessor(), "after_splice_silero.opus"),
+        # (VoskVADProcessor(), "after_splice_vosk.opus"),
+        # (GroqVADProcessor(), "after_splice_groq.opus"),
+    ]
+    # get_config().vad.cut_and_splice_segments = True
+    # get_config().vad.splice_padding = 0.3
+    # for processor, out_name in processors:
+    #     logger.info("Testing Splice Audio with " + processor.vad_system_name)
+    #     out_path = os.path.join(output_dir, out_name)
+    #     if os.path.exists(out_path):
+    #         os.remove(out_path)
+    #     processor.process_audio(test_audio, out_path, None)
+
+    get_config().vad.cut_and_splice_segments = False
+    get_config().vad.trim_beginning = True
+    for processor, out_name in processors:
+        logger.info("Testing Trim Audio with " + processor.vad_system_name)
+        out_path = os.path.join(output_dir, out_name.replace("after_splice_", "after_trim_"))
+        if os.path.exists(out_path):
+            os.remove(out_path)
+        processor.process_audio(test_audio, out_path, None)
+
+
+if __name__ == "__main__":
+    test_vad_processors()
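The new harness references `logging.DEBUG` (so it presumes `import logging` at the top of vad.py, not visible in this diff) and hard-codes machine-specific Windows paths. A hedged, path-agnostic sketch of the same smoke test, with a stub standing in for the real processors:

    import logging
    import os
    import tempfile

    logging.basicConfig(level=logging.DEBUG)

    class StubProcessor:
        vad_system_name = "stub"
        def process_audio(self, input_audio, output_audio, game_line):
            logging.debug("processing %s -> %s", input_audio, output_audio)

    def smoke_test(processors, test_audio, output_dir):
        os.makedirs(output_dir, exist_ok=True)
        for processor, out_name in processors:
            out_path = os.path.join(output_dir, out_name)
            if os.path.exists(out_path):
                os.remove(out_path)  # start from a clean slate
            processor.process_audio(test_audio, out_path, None)

    smoke_test([(StubProcessor(), "after_trim_stub.opus")],
               "example.opus", os.path.join(tempfile.gettempdir(), "gsm_test"))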