GameSentenceMiner 2.10.16-py3-none-any.whl → 2.11.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GameSentenceMiner/ai/ai_prompting.py +339 -42
- GameSentenceMiner/anki.py +8 -4
- GameSentenceMiner/config_gui.py +25 -17
- GameSentenceMiner/gsm.py +19 -17
- GameSentenceMiner/ocr/owocr_helper.py +9 -6
- GameSentenceMiner/owocr/owocr/ocr.py +86 -41
- GameSentenceMiner/owocr/owocr/run.py +11 -4
- GameSentenceMiner/util/configuration.py +11 -9
- GameSentenceMiner/util/text_log.py +4 -0
- GameSentenceMiner/vad.py +209 -174
- GameSentenceMiner/web/templates/index.html +21 -20
- GameSentenceMiner/web/texthooking_page.py +15 -1
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/METADATA +2 -2
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/RECORD +18 -18
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/WHEEL +0 -0
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/entry_points.txt +0 -0
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/licenses/LICENSE +0 -0
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/top_level.txt +0 -0
GameSentenceMiner/vad.py
CHANGED
@@ -14,8 +14,8 @@ class VADSystem:
     def __init__(self):
         self.silero = None
         self.whisper = None
-        self.vosk = None
-        self.groq = None
+        # self.vosk = None
+        # self.groq = None
 
     def init(self):
         if get_config().vad.is_whisper():
@@ -24,12 +24,12 @@
         if get_config().vad.is_silero():
             if not self.silero:
                 self.silero = SileroVADProcessor()
-        if get_config().vad.is_vosk():
-            if not self.vosk:
-                self.vosk = VoskVADProcessor()
-        if get_config().vad.is_groq():
-            if not self.groq:
-                self.groq = GroqVADProcessor()
+        # if get_config().vad.is_vosk():
+        #     if not self.vosk:
+        #         self.vosk = VoskVADProcessor()
+        # if get_config().vad.is_groq():
+        #     if not self.groq:
+        #         self.groq = GroqVADProcessor()
 
     def trim_audio_with_vad(self, input_audio, output_audio, game_line):
         if get_config().vad.do_vad_postprocessing:
@@ -53,18 +53,18 @@
         match model:
             case configuration.OFF:
                 return VADResult(False, 0, 0, "OFF")
-            case configuration.GROQ:
-                if not self.groq:
-                    self.groq = GroqVADProcessor()
-                return self.groq.process_audio(input_audio, output_audio, game_line)
+            # case configuration.GROQ:
+            #     if not self.groq:
+            #         self.groq = GroqVADProcessor()
+            #     return self.groq.process_audio(input_audio, output_audio, game_line)
             case configuration.SILERO:
                 if not self.silero:
                     self.silero = SileroVADProcessor()
                 return self.silero.process_audio(input_audio, output_audio, game_line)
-            case configuration.VOSK:
-                if not self.vosk:
-                    self.vosk = VoskVADProcessor()
-                return self.vosk.process_audio(input_audio, output_audio, game_line)
+            # case configuration.VOSK:
+            #     if not self.vosk:
+            #         self.vosk = VoskVADProcessor()
+            #     return self.vosk.process_audio(input_audio, output_audio, game_line)
             case configuration.WHISPER:
                 if not self.whisper:
                     self.whisper = WhisperVADProcessor()
@@ -121,6 +121,8 @@ class VADProcessor(ABC):
             logger.info("No voice activity detected in the audio.")
             return VADResult(False, 0, 0, self.vad_system_name)
 
+        print(voice_activity)
+
         start_time = voice_activity[0]['start'] if voice_activity else 0
         end_time = voice_activity[-1]['end'] if voice_activity else 0
 
@@ -133,7 +135,7 @@
         if get_config().vad.cut_and_splice_segments:
             self.extract_audio_and_combine_segments(input_audio, voice_activity, output_audio, padding=get_config().vad.splice_padding)
         else:
-            ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio, trim_beginning=get_config().vad.trim_beginning, fade_in_duration=0, fade_out_duration=0)
+            ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio, trim_beginning=get_config().vad.trim_beginning, fade_in_duration=0.05, fade_out_duration=00)
         return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, self.vad_system_name, voice_activity, output_audio)
 
 class SileroVADProcessor(VADProcessor):
@@ -145,10 +147,10 @@ class SileroVADProcessor(VADProcessor):
 
     def _detect_voice_activity(self, input_audio):
         from silero_vad import read_audio, get_speech_timestamps
-        temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
-        ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
-        wav = read_audio(temp_wav)
-        speech_timestamps = get_speech_timestamps(wav, self.vad_model, return_seconds=True)
+        # temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
+        # ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
+        wav = read_audio(input_audio)
+        speech_timestamps = get_speech_timestamps(wav, self.vad_model, return_seconds=True, threshold=0.2)
         logger.debug(speech_timestamps)
         return speech_timestamps
 
@@ -161,7 +163,7 @@ class WhisperVADProcessor(VADProcessor):
     def load_whisper_model(self):
         import stable_whisper as whisper
         if not self.vad_model:
-            with warnings.catch_warnings(action="ignore"):
+            with warnings.catch_warnings():
                 self.vad_model = whisper.load_model(get_config().vad.whisper_model)
                 logger.info(f"Whisper model '{get_config().vad.whisper_model}' loaded.")
         return self.vad_model
@@ -169,21 +171,29 @@
     def _detect_voice_activity(self, input_audio):
        from stable_whisper import WhisperResult
         # Convert the audio to 16kHz mono WAV
-        temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
-        ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
+        # temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
+        # ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
 
         logger.info('transcribing audio...')
 
         # Transcribe the audio using Whisper
-        with warnings.catch_warnings(action="ignore"):
-            result: WhisperResult = self.vad_model.transcribe(temp_wav, vad=True, language=get_config().vad.language,
+        with warnings.catch_warnings():
+            result: WhisperResult = self.vad_model.transcribe(input_audio, vad=True, language=get_config().vad.language,
                                                               temperature=0.0)
         voice_activity = []
 
         logger.debug(result.to_dict())
 
         # Process the segments to extract tokens, timestamps, and confidence
-        for segment in result.segments:
+        for i, segment in enumerate(result.segments):
+            if len(segment.text) == 1 and (i > 1 and segment.start - result.segments[i - 1].end > 1.0) or (i < len(result.segments) - 1 and result.segments[i + 1].start - segment.end > 1.0):
+                if segment.text in ['えー', 'ん']:
+                    logger.debug(f"Skipping filler segment: {segment.text} at {segment.start}-{segment.end}")
+                    continue
+                else:
+                    logger.info(
+                        "Unknown single character segment, not skipping, but logging, please report if this is a mistake: " + segment.text)
+
             logger.debug(segment.to_dict())
             voice_activity.append({
                 'text': segment.text,
@@ -216,154 +226,179 @@ class WhisperVADProcessor(VADProcessor):
         return voice_activity
 
 # Add a new class for Vosk-based VAD
-class VoskVADProcessor(VADProcessor):
-    ...
-
-class GroqVADProcessor(VADProcessor):
-    ...
-            return [], 0.0
+# class VoskVADProcessor(VADProcessor):
+#     def __init__(self):
+#         super().__init__()
+#         self.vad_model = self._load_vosk_model()
+#         self.vad_system_name = VOSK
+#
+#     def _load_vosk_model(self):
+#         if not self.vad_model:
+#             import vosk
+#             vosk_model_path = self._download_and_cache_vosk_model()
+#             self.vad_model = vosk.Model(vosk_model_path)
+#             logger.info(f"Vosk model loaded from {vosk_model_path}")
+#         return self.vad_model
+#
+#     def _download_and_cache_vosk_model(self, model_dir="vosk_model_cache"):
+#         # Ensure the cache directory exists
+#         import requests
+#         import zipfile
+#         import tarfile
+#         if not os.path.exists(os.path.join(get_app_directory(), model_dir)):
+#             os.makedirs(os.path.join(get_app_directory(), model_dir))
+#
+#         # Extract the model name from the URL
+#         model_filename = get_config().vad.vosk_url.split("/")[-1]
+#         model_path = os.path.join(get_app_directory(), model_dir, model_filename)
+#
+#         # If the model is already downloaded, skip the download
+#         if not os.path.exists(model_path):
+#             logger.info(
+#                 f"Downloading the Vosk model from {get_config().vad.vosk_url}... This will take a while if using large model, ~1G")
+#             response = requests.get(get_config().vad.vosk_url, stream=True)
+#             with open(model_path, "wb") as file:
+#                 for chunk in response.iter_content(chunk_size=8192):
+#                     if chunk:
+#                         file.write(chunk)
+#             logger.info("Download complete.")
+#
+#         # Extract the model if it's a zip or tar file
+#         model_extract_path = os.path.join(get_app_directory(), model_dir, "vosk_model")
+#         if not os.path.exists(model_extract_path):
+#             logger.info("Extracting the Vosk model...")
+#             if model_filename.endswith(".zip"):
+#                 with zipfile.ZipFile(model_path, "r") as zip_ref:
+#                     zip_ref.extractall(model_extract_path)
+#             elif model_filename.endswith(".tar.gz"):
+#                 with tarfile.open(model_path, "r:gz") as tar_ref:
+#                     tar_ref.extractall(model_extract_path)
+#             else:
+#                 logger.info("Unknown archive format. Model extraction skipped.")
+#             logger.info(f"Model extracted to {model_extract_path}.")
+#         else:
+#             logger.info(f"Model already extracted at {model_extract_path}.")
+#
+#         # Return the path to the actual model folder inside the extraction directory
+#         extracted_folders = os.listdir(model_extract_path)
+#         if extracted_folders:
+#             actual_model_folder = os.path.join(model_extract_path,
+#                                                extracted_folders[0])  # Assuming the first folder is the model
+#             return actual_model_folder
+#         else:
+#             return model_extract_path  # In case there's no subfolder, return the extraction path directly
+#
+#     def _detect_voice_activity(self, input_audio):
+#         import soundfile as sf
+#         import vosk
+#         import numpy as np
+#         # Convert the audio to 16kHz mono WAV
+#         temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
+#         ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
+#
+#         # Initialize recognizer
+#         with sf.SoundFile(temp_wav) as audio_file:
+#             recognizer = vosk.KaldiRecognizer(self.vad_model, audio_file.samplerate)
+#             voice_activity = []
+#
+#             recognizer.SetWords(True)
+#
+#             # Process audio in chunks
+#             while True:
+#                 data = audio_file.buffer_read(4000, dtype='int16')
+#                 if len(data) == 0:
+#                     break
+#
+#                 # Convert buffer to bytes using NumPy
+#                 data_bytes = np.frombuffer(data, dtype='int16').tobytes()
+#
+#                 if recognizer.AcceptWaveform(data_bytes):
+#                     pass
+#
+#             final_result = json.loads(recognizer.FinalResult())
+#             if 'result' in final_result:
+#                 for word in final_result['result']:
+#                     if word['conf'] >= 0.90:
+#                         voice_activity.append({
+#                             'text': word['word'],
+#                             'start': word['start'],
+#                             'end': word['end']
+#                         })
+#
+#         # Return the detected voice activity
+#         return voice_activity
+
+# class GroqVADProcessor(VADProcessor):
+#     def __init__(self):
+#         super().__init__()
+#         self.client = self.load_groq_model()
+#         self.vad_system_name = GROQ
+#
+#     def load_groq_model(self):
+#         if not hasattr(self, 'client') or not self.client:
+#             from groq import Groq
+#             client = Groq(api_key=get_config().ai.groq_api_key)
+#             logger.info("Groq model loaded.")
+#             return client
+#         return self.client
+#
+#     def _detect_voice_activity(self, input_audio):
+#         try:
+#             with open(input_audio, "rb") as file:
+#                 transcription = self.client.audio.transcriptions.create(
+#                     file=(os.path.basename(input_audio), file.read()),
+#                     model="whisper-large-v3-turbo",
+#                     response_format="verbose_json",
+#                     language=get_config().vad.language,
+#                     temperature=0.0,
+#                     timestamp_granularities=["segment"],
+#                     prompt=f"Start detecting speech from the first spoken word. If there is music or background noise, ignore it completely. Be very careful to not hallucinate on silence. If the transcription is anything but language:{get_config().vad.language}, ignore it completely. If the end of the audio seems like the start of a new sentence, ignore it completely.",
+#                 )
+#
+#             logger.debug(transcription)
+#             speech_segments = []
+#             if hasattr(transcription, 'segments'):
+#                 speech_segments = transcription.segments
+#             elif hasattr(transcription, 'words'):
+#                 speech_segments = transcription.words
+#             return speech_segments
+#         except Exception as e:
+#             logger.error(f"Error detecting voice with Groq: {e}")
+#             return [], 0.0
 
 
 vad_processor = VADSystem()
 
-#
-...
+# Test cases for all VADProcessors
+def test_vad_processors():
+    logger.setLevel(logging.DEBUG)
+    test_audio = r"C:\Users\Beangate\GSM\Electron App\test\tmptnl4a93q_untrimmed.opus"
+    output_dir = r"C:\Users\Beangate\GSM\Electron App\test\output"
+    os.makedirs(output_dir, exist_ok=True)
+    processors = [
+        (WhisperVADProcessor(), "after_splice_whisper.opus"),
+        (SileroVADProcessor(), "after_splice_silero.opus"),
+        # (VoskVADProcessor(), "after_splice_vosk.opus"),
+        # (GroqVADProcessor(), "after_splice_groq.opus"),
+    ]
+    # get_config().vad.cut_and_splice_segments = True
+    # get_config().vad.splice_padding = 0.3
+    # for processor, out_name in processors:
+    #     logger.info("Testing Splice Audio with " + processor.vad_system_name)
+    #     out_path = os.path.join(output_dir, out_name)
+    #     if os.path.exists(out_path):
+    #         os.remove(out_path)
+    #     processor.process_audio(test_audio, out_path, None)
+
+    get_config().vad.cut_and_splice_segments = False
+    get_config().vad.trim_beginning = True
+    for processor, out_name in processors:
+        logger.info("Testing Trim Audio with " + processor.vad_system_name)
+        out_path = os.path.join(output_dir, out_name.replace("after_splice_", "after_trim_"))
+        if os.path.exists(out_path):
+            os.remove(out_path)
+        processor.process_audio(test_audio, out_path, None)
+
+
+if __name__ == "__main__":
+    test_vad_processors()