GameSentenceMiner 2.17.7__py3-none-any.whl → 2.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GameSentenceMiner/ai/ai_prompting.py +6 -6
- GameSentenceMiner/anki.py +236 -152
- GameSentenceMiner/gametext.py +7 -4
- GameSentenceMiner/gsm.py +49 -10
- GameSentenceMiner/locales/en_us.json +7 -3
- GameSentenceMiner/locales/ja_jp.json +8 -4
- GameSentenceMiner/locales/zh_cn.json +8 -4
- GameSentenceMiner/obs.py +238 -59
- GameSentenceMiner/ocr/owocr_helper.py +1 -1
- GameSentenceMiner/tools/ss_selector.py +7 -8
- GameSentenceMiner/ui/__init__.py +0 -0
- GameSentenceMiner/ui/anki_confirmation.py +187 -0
- GameSentenceMiner/{config_gui.py → ui/config_gui.py} +100 -35
- GameSentenceMiner/ui/screenshot_selector.py +215 -0
- GameSentenceMiner/util/configuration.py +124 -22
- GameSentenceMiner/util/db.py +22 -13
- GameSentenceMiner/util/downloader/download_tools.py +2 -2
- GameSentenceMiner/util/ffmpeg.py +24 -30
- GameSentenceMiner/util/get_overlay_coords.py +34 -34
- GameSentenceMiner/util/gsm_utils.py +31 -1
- GameSentenceMiner/util/text_log.py +11 -9
- GameSentenceMiner/vad.py +31 -12
- GameSentenceMiner/web/database_api.py +742 -123
- GameSentenceMiner/web/static/css/dashboard-shared.css +241 -0
- GameSentenceMiner/web/static/css/kanji-grid.css +94 -2
- GameSentenceMiner/web/static/css/overview.css +850 -0
- GameSentenceMiner/web/static/css/popups-shared.css +126 -0
- GameSentenceMiner/web/static/css/shared.css +97 -0
- GameSentenceMiner/web/static/css/stats.css +192 -597
- GameSentenceMiner/web/static/js/anki_stats.js +6 -4
- GameSentenceMiner/web/static/js/database.js +209 -5
- GameSentenceMiner/web/static/js/goals.js +610 -0
- GameSentenceMiner/web/static/js/kanji-grid.js +267 -4
- GameSentenceMiner/web/static/js/overview.js +1176 -0
- GameSentenceMiner/web/static/js/shared.js +25 -0
- GameSentenceMiner/web/static/js/stats.js +154 -1459
- GameSentenceMiner/web/stats.py +2 -2
- GameSentenceMiner/web/templates/anki_stats.html +5 -0
- GameSentenceMiner/web/templates/components/navigation.html +3 -1
- GameSentenceMiner/web/templates/database.html +73 -1
- GameSentenceMiner/web/templates/goals.html +376 -0
- GameSentenceMiner/web/templates/index.html +13 -11
- GameSentenceMiner/web/templates/overview.html +416 -0
- GameSentenceMiner/web/templates/stats.html +46 -251
- GameSentenceMiner/web/texthooking_page.py +18 -0
- {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.0.dist-info}/METADATA +5 -1
- {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.0.dist-info}/RECORD +51 -41
- {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.0.dist-info}/WHEEL +0 -0
- {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.0.dist-info}/entry_points.txt +0 -0
- {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.0.dist-info}/licenses/LICENSE +0 -0
- {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.0.dist-info}/top_level.txt +0 -0
GameSentenceMiner/util/ffmpeg.py
CHANGED
|
@@ -5,23 +5,20 @@ import sys
|
|
|
5
5
|
import tempfile
|
|
6
6
|
import time
|
|
7
7
|
from pathlib import Path
|
|
8
|
+
import subprocess
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import shutil
|
|
11
|
+
|
|
8
12
|
|
|
9
13
|
from GameSentenceMiner import obs
|
|
10
|
-
from GameSentenceMiner.
|
|
14
|
+
from GameSentenceMiner.ui.config_gui import ConfigApp
|
|
15
|
+
from GameSentenceMiner.util.configuration import ffmpeg_base_command_list, get_ffprobe_path, logger, get_config, \
|
|
11
16
|
get_temporary_directory, gsm_state, is_linux
|
|
12
17
|
from GameSentenceMiner.util.gsm_utils import make_unique_file_name, get_file_modification_time
|
|
13
18
|
from GameSentenceMiner.util import configuration
|
|
14
19
|
from GameSentenceMiner.util.text_log import initial_time
|
|
15
20
|
|
|
16
21
|
|
|
17
|
-
def get_ffmpeg_path():
|
|
18
|
-
return os.path.join(get_app_directory(), "ffmpeg", "ffmpeg.exe") if is_windows() else "ffmpeg"
|
|
19
|
-
|
|
20
|
-
def get_ffprobe_path():
|
|
21
|
-
return os.path.join(get_app_directory(), "ffmpeg", "ffprobe.exe") if is_windows() else "ffprobe"
|
|
22
|
-
|
|
23
|
-
ffmpeg_base_command_list = [get_ffmpeg_path(), "-hide_banner", "-loglevel", "error", '-nostdin']
|
|
24
|
-
|
|
25
22
|
supported_formats = {
|
|
26
23
|
'opus': 'libopus',
|
|
27
24
|
'mp3': 'libmp3lame',
|
|
@@ -30,11 +27,6 @@ supported_formats = {
|
|
|
30
27
|
'm4a': 'aac',
|
|
31
28
|
}
|
|
32
29
|
|
|
33
|
-
import subprocess
|
|
34
|
-
from pathlib import Path
|
|
35
|
-
import shutil
|
|
36
|
-
|
|
37
|
-
|
|
38
30
|
def video_to_anim(
|
|
39
31
|
input_path: str | Path,
|
|
40
32
|
output_path: str | Path = None,
|
|
@@ -184,22 +176,24 @@ def call_frame_extractor(video_path, timestamp):
|
|
|
184
176
|
str: The path of the selected image, or None on error.
|
|
185
177
|
"""
|
|
186
178
|
try:
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
#
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
logger.
|
|
200
|
-
#
|
|
201
|
-
logger.info(
|
|
202
|
-
|
|
179
|
+
config_app: ConfigApp = gsm_state.config_app
|
|
180
|
+
return config_app.show_screenshot_selector(video_path, timestamp, get_config().screenshot.screenshot_timing_setting)
|
|
181
|
+
# logger.info(' '.join([sys.executable, "-m", "GameSentenceMiner.tools.ss_selector", video_path, str(timestamp)]))
|
|
182
|
+
|
|
183
|
+
# # Run the script using subprocess.run()
|
|
184
|
+
# result = subprocess.run(
|
|
185
|
+
# [sys.executable, "-m", "GameSentenceMiner.tools.ss_selector", video_path, str(timestamp), get_config().screenshot.screenshot_timing_setting], # Use sys.executable
|
|
186
|
+
# capture_output=True,
|
|
187
|
+
# text=True, # Get output as text
|
|
188
|
+
# check=False # Raise an exception for non-zero exit codes
|
|
189
|
+
# )
|
|
190
|
+
# if result.returncode != 0:
|
|
191
|
+
# logger.error(f"Script failed with return code: {result.returncode}")
|
|
192
|
+
# return None
|
|
193
|
+
# logger.info(result)
|
|
194
|
+
# # Print the standard output
|
|
195
|
+
# logger.info(f"Frame extractor script output: {result.stdout.strip()}")
|
|
196
|
+
# return result.stdout.strip() # Return the output
|
|
203
197
|
|
|
204
198
|
except subprocess.CalledProcessError as e:
|
|
205
199
|
logger.error(f"Error calling script: {e}")
|
|
@@ -190,37 +190,38 @@ class OverlayProcessor:
|
|
|
190
190
|
"""
|
|
191
191
|
with mss.mss() as sct:
|
|
192
192
|
monitors = sct.monitors[1:]
|
|
193
|
-
if
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
193
|
+
return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
|
|
194
|
+
# if is_windows() and monitor_index == 0:
|
|
195
|
+
# from ctypes import wintypes
|
|
196
|
+
# import ctypes
|
|
197
|
+
# # Get work area for primary monitor (ignores taskbar)
|
|
198
|
+
# SPI_GETWORKAREA = 0x0030
|
|
199
|
+
# rect = wintypes.RECT()
|
|
200
|
+
# res = ctypes.windll.user32.SystemParametersInfoW(
|
|
201
|
+
# SPI_GETWORKAREA, 0, ctypes.byref(rect), 0
|
|
202
|
+
# )
|
|
203
|
+
# if not res:
|
|
204
|
+
# raise ctypes.WinError()
|
|
204
205
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
elif is_windows() and monitor_index > 0:
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
else:
|
|
222
|
-
|
|
223
|
-
|
|
206
|
+
# return {
|
|
207
|
+
# "left": rect.left,
|
|
208
|
+
# "top": rect.top,
|
|
209
|
+
# "width": rect.right - rect.left,
|
|
210
|
+
# "height": rect.bottom - rect.top,
|
|
211
|
+
# }
|
|
212
|
+
# elif is_windows() and monitor_index > 0:
|
|
213
|
+
# # Secondary monitors: just return with a guess of how tall the taskbar is
|
|
214
|
+
# taskbar_height_guess = 48 # A common taskbar height, may vary
|
|
215
|
+
# mon = monitors[monitor_index]
|
|
216
|
+
# return {
|
|
217
|
+
# "left": mon["left"],
|
|
218
|
+
# "top": mon["top"],
|
|
219
|
+
# "width": mon["width"],
|
|
220
|
+
# "height": mon["height"] - taskbar_height_guess
|
|
221
|
+
# }
|
|
222
|
+
# else:
|
|
223
|
+
# # For non-Windows systems or unspecified monitors, return the monitor area as-is
|
|
224
|
+
# return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
|
|
224
225
|
|
|
225
226
|
|
|
226
227
|
def _get_full_screenshot(self) -> Tuple[Image.Image | None, int, int]:
|
|
@@ -309,11 +310,9 @@ class OverlayProcessor:
|
|
|
309
310
|
|
|
310
311
|
score = fuzz.ratio(text_str, self.last_oneocr_result)
|
|
311
312
|
if score >= 80:
|
|
312
|
-
logger.info("OneOCR results are similar to the last results (score: %d). Skipping overlay update.", score)
|
|
313
313
|
return
|
|
314
314
|
self.last_oneocr_result = text_str
|
|
315
315
|
|
|
316
|
-
logger.info("Sending OneOCR results to overlay.")
|
|
317
316
|
await send_word_coordinates_to_overlay(self._convert_oneocr_results_to_percentages(oneocr_results, monitor_width, monitor_height))
|
|
318
317
|
|
|
319
318
|
# If User Home is beangate
|
|
@@ -322,7 +321,7 @@ class OverlayProcessor:
|
|
|
322
321
|
f.write(json.dumps(oneocr_results, ensure_ascii=False, indent=2))
|
|
323
322
|
|
|
324
323
|
if get_config().overlay.engine == OverlayEngine.ONEOCR.value and self.oneocr:
|
|
325
|
-
logger.info("
|
|
324
|
+
logger.info("Sent %d text boxes to overlay.", len(oneocr_results))
|
|
326
325
|
return
|
|
327
326
|
|
|
328
327
|
# 3. Create a composite image with only the detected text regions
|
|
@@ -371,8 +370,9 @@ class OverlayProcessor:
|
|
|
371
370
|
crop_height=composite_image.height,
|
|
372
371
|
use_percentages=True
|
|
373
372
|
)
|
|
374
|
-
logger.info("Sending Google Lens results to overlay.")
|
|
375
373
|
await send_word_coordinates_to_overlay(extracted_data)
|
|
374
|
+
|
|
375
|
+
logger.info("Sent %d text boxes to overlay.", len(extracted_data))
|
|
376
376
|
|
|
377
377
|
def _extract_text_with_pixel_boxes(
|
|
378
378
|
self,
|
|
@@ -13,7 +13,7 @@ from pathlib import Path
|
|
|
13
13
|
import requests
|
|
14
14
|
from rapidfuzz import process
|
|
15
15
|
|
|
16
|
-
from GameSentenceMiner.util.configuration import logger, get_config, get_app_directory
|
|
16
|
+
from GameSentenceMiner.util.configuration import gsm_state, logger, get_config, get_app_directory, get_temporary_directory
|
|
17
17
|
|
|
18
18
|
SCRIPTS_DIR = r"E:\Japanese Stuff\agent-v0.1.4-win32-x64\data\scripts"
|
|
19
19
|
|
|
@@ -22,6 +22,13 @@ def run_new_thread(func):
|
|
|
22
22
|
thread.start()
|
|
23
23
|
return thread
|
|
24
24
|
|
|
25
|
+
def make_unique_temp_file(path):
|
|
26
|
+
path = Path(path)
|
|
27
|
+
current_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]
|
|
28
|
+
temp_dir = get_temporary_directory()
|
|
29
|
+
os.makedirs(temp_dir, exist_ok=True)
|
|
30
|
+
return str(Path(temp_dir) / f"{path.stem}_{current_time}{path.suffix}")
|
|
31
|
+
|
|
25
32
|
def make_unique_file_name(path):
|
|
26
33
|
path = Path(path)
|
|
27
34
|
current_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]
|
|
@@ -258,6 +265,29 @@ TEXT_REPLACEMENTS_FILE = os.path.join(get_app_directory(), 'config', 'text_repla
|
|
|
258
265
|
OCR_REPLACEMENTS_FILE = os.path.join(get_app_directory(), 'config', 'ocr_replacements.json')
|
|
259
266
|
os.makedirs(os.path.dirname(TEXT_REPLACEMENTS_FILE), exist_ok=True)
|
|
260
267
|
|
|
268
|
+
|
|
269
|
+
def add_srt_line(line_time, new_line):
|
|
270
|
+
global srt_index
|
|
271
|
+
if get_config().features.generate_longplay and gsm_state.recording_started_time and new_line.prev:
|
|
272
|
+
logger.info(f"Adding SRT line {new_line.prev.text}... for longplay")
|
|
273
|
+
with open(gsm_state.current_srt, 'a', encoding='utf-8') as srt_file:
|
|
274
|
+
# Calculate start and end times for the previous line
|
|
275
|
+
prev_start_time = new_line.prev.time - gsm_state.recording_started_time
|
|
276
|
+
prev_end_time = (line_time if line_time else datetime.now()) - gsm_state.recording_started_time
|
|
277
|
+
# Format times as SRT timestamps (HH:MM:SS,mmm)
|
|
278
|
+
def format_srt_time(td, offset=0):
|
|
279
|
+
total_seconds = int(td.total_seconds()) + offset
|
|
280
|
+
hours = total_seconds // 3600
|
|
281
|
+
minutes = (total_seconds % 3600) // 60
|
|
282
|
+
seconds = total_seconds % 60
|
|
283
|
+
milliseconds = int(td.microseconds / 1000)
|
|
284
|
+
return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
|
|
285
|
+
|
|
286
|
+
srt_file.write(f"{gsm_state.srt_index}\n")
|
|
287
|
+
srt_file.write(f"{format_srt_time(prev_start_time)} --> {format_srt_time(prev_end_time, offset=-1)}\n")
|
|
288
|
+
srt_file.write(f"{new_line.prev.text}\n\n")
|
|
289
|
+
gsm_state.srt_index += 1
|
|
290
|
+
|
|
261
291
|
# if not os.path.exists(OCR_REPLACEMENTS_FILE):
|
|
262
292
|
# url = "https://raw.githubusercontent.com/bpwhelan/GameSentenceMiner/refs/heads/main/electron-src/assets/ocr_replacements.json"
|
|
263
293
|
# try:
|
|
@@ -89,11 +89,11 @@ class GameText:
|
|
|
89
89
|
scene=gsm_state.current_game or ""
|
|
90
90
|
)
|
|
91
91
|
self.values_dict[line_id] = new_line
|
|
92
|
-
logger.debug(f"Adding line: {new_line}")
|
|
93
92
|
self.game_line_index += 1
|
|
94
93
|
if self.values:
|
|
95
94
|
self.values[-1].next = new_line
|
|
96
95
|
self.values.append(new_line)
|
|
96
|
+
return new_line
|
|
97
97
|
# self.remove_old_events(datetime.now() - timedelta(minutes=10))
|
|
98
98
|
|
|
99
99
|
def has_line(self, line_text) -> bool:
|
|
@@ -119,16 +119,17 @@ def strip_whitespace_and_punctuation(text: str) -> str:
|
|
|
119
119
|
return re.sub(r'[\s 、。「」【】《》., ]', '', text).strip()
|
|
120
120
|
|
|
121
121
|
|
|
122
|
+
# TODO See if partial_ratio is better than ratio
|
|
122
123
|
def lines_match(texthooker_sentence, anki_sentence, similarity_threshold=80) -> bool:
|
|
123
124
|
# Replace newlines, spaces, other whitespace characters, AND japanese punctuation
|
|
124
125
|
texthooker_sentence = strip_whitespace_and_punctuation(texthooker_sentence)
|
|
125
126
|
anki_sentence = strip_whitespace_and_punctuation(anki_sentence)
|
|
126
127
|
similarity = rapidfuzz.fuzz.ratio(texthooker_sentence, anki_sentence)
|
|
127
|
-
logger.debug(f"Comparing sentences: '{texthooker_sentence}' and '{anki_sentence}' - Similarity: {similarity}")
|
|
128
|
-
if texthooker_sentence in anki_sentence:
|
|
129
|
-
|
|
130
|
-
elif anki_sentence in texthooker_sentence:
|
|
131
|
-
|
|
128
|
+
# logger.debug(f"Comparing sentences: '{texthooker_sentence}' and '{anki_sentence}' - Similarity: {similarity}")
|
|
129
|
+
# if texthooker_sentence in anki_sentence:
|
|
130
|
+
# logger.debug(f"One contains the other: {texthooker_sentence} in {anki_sentence} - Similarity: {similarity}")
|
|
131
|
+
# elif anki_sentence in texthooker_sentence:
|
|
132
|
+
# logger.debug(f"One contains the other: {anki_sentence} in {texthooker_sentence} - Similarity: {similarity}")
|
|
132
133
|
return (anki_sentence in texthooker_sentence) or (texthooker_sentence in anki_sentence) or (similarity >= similarity_threshold)
|
|
133
134
|
|
|
134
135
|
|
|
@@ -145,7 +146,8 @@ def get_text_event(last_note) -> GameLine:
|
|
|
145
146
|
if not sentence:
|
|
146
147
|
return lines[-1]
|
|
147
148
|
|
|
148
|
-
|
|
149
|
+
# Check the last 50 lines for a match
|
|
150
|
+
for line in reversed(lines[-50:]):
|
|
149
151
|
if lines_match(line.text, remove_html_and_cloze_tags(sentence)):
|
|
150
152
|
return line
|
|
151
153
|
|
|
@@ -181,7 +183,7 @@ def get_mined_line(last_note: AnkiCard, lines=None):
|
|
|
181
183
|
raise Exception("No voicelines in GSM. GSM can only do work on text that has been sent to it since it started. If you are not getting any text into GSM, please check your setup/config.")
|
|
182
184
|
|
|
183
185
|
sentence = last_note.get_field(get_config().anki.sentence_field)
|
|
184
|
-
for line in reversed(lines):
|
|
186
|
+
for line in reversed(lines[-50:]):
|
|
185
187
|
if lines_match(line.get_stripped_text(), remove_html_and_cloze_tags(sentence)):
|
|
186
188
|
return line
|
|
187
189
|
return lines[-1]
|
|
@@ -199,7 +201,7 @@ def get_text_log() -> GameText:
|
|
|
199
201
|
return game_log
|
|
200
202
|
|
|
201
203
|
def add_line(current_line_after_regex, line_time):
|
|
202
|
-
game_log.add_line(current_line_after_regex, line_time)
|
|
204
|
+
return game_log.add_line(current_line_after_regex, line_time)
|
|
203
205
|
|
|
204
206
|
def get_line_by_id(line_id: str) -> Optional[GameLine]:
|
|
205
207
|
"""
|
GameSentenceMiner/vad.py
CHANGED
|
@@ -5,6 +5,7 @@ import shutil
|
|
|
5
5
|
import tempfile
|
|
6
6
|
import time
|
|
7
7
|
import warnings
|
|
8
|
+
import re
|
|
8
9
|
from abc import abstractmethod, ABC
|
|
9
10
|
|
|
10
11
|
from GameSentenceMiner.util import configuration, ffmpeg
|
|
@@ -35,26 +36,26 @@ class VADSystem:
|
|
|
35
36
|
# if not self.groq:
|
|
36
37
|
# self.groq = GroqVADProcessor()
|
|
37
38
|
|
|
38
|
-
def trim_audio_with_vad(self, input_audio, output_audio, game_line):
|
|
39
|
+
def trim_audio_with_vad(self, input_audio, output_audio, game_line, full_text):
|
|
39
40
|
if get_config().vad.do_vad_postprocessing:
|
|
40
|
-
result = self._do_vad_processing(get_config().vad.selected_vad_model, input_audio, output_audio, game_line)
|
|
41
|
+
result = self._do_vad_processing(get_config().vad.selected_vad_model, input_audio, output_audio, game_line, full_text)
|
|
41
42
|
if not result.success and get_config().vad.backup_vad_model != configuration.OFF:
|
|
42
43
|
logger.info("No voice activity detected, using backup VAD model.")
|
|
43
|
-
result = self._do_vad_processing(get_config().vad.backup_vad_model, input_audio, output_audio, game_line)
|
|
44
|
+
result = self._do_vad_processing(get_config().vad.backup_vad_model, input_audio, output_audio, game_line, full_text)
|
|
44
45
|
return result
|
|
45
46
|
|
|
46
|
-
def _do_vad_processing(self, model, input_audio, output_audio, game_line):
|
|
47
|
+
def _do_vad_processing(self, model, input_audio, output_audio, game_line, text_mined):
|
|
47
48
|
match model:
|
|
48
49
|
case configuration.OFF:
|
|
49
50
|
return VADResult(False, 0, 0, "OFF")
|
|
50
51
|
case configuration.SILERO:
|
|
51
52
|
if not self.silero:
|
|
52
53
|
self.silero = SileroVADProcessor()
|
|
53
|
-
return self.silero.process_audio(input_audio, output_audio, game_line)
|
|
54
|
+
return self.silero.process_audio(input_audio, output_audio, game_line, text_mined)
|
|
54
55
|
case configuration.WHISPER:
|
|
55
56
|
if not self.whisper:
|
|
56
57
|
self.whisper = WhisperVADProcessor()
|
|
57
|
-
return self.whisper.process_audio(input_audio, output_audio, game_line)
|
|
58
|
+
return self.whisper.process_audio(input_audio, output_audio, game_line, text_mined)
|
|
58
59
|
|
|
59
60
|
# Base class for VAD systems
|
|
60
61
|
class VADProcessor(ABC):
|
|
@@ -63,7 +64,7 @@ class VADProcessor(ABC):
|
|
|
63
64
|
self.vad_system_name = None
|
|
64
65
|
|
|
65
66
|
@abstractmethod
|
|
66
|
-
def _detect_voice_activity(self, input_audio):
|
|
67
|
+
def _detect_voice_activity(self, input_audio, text_mined):
|
|
67
68
|
pass
|
|
68
69
|
|
|
69
70
|
@staticmethod
|
|
@@ -100,8 +101,8 @@ class VADProcessor(ABC):
|
|
|
100
101
|
shutil.move(files[0], output_audio)
|
|
101
102
|
|
|
102
103
|
|
|
103
|
-
def process_audio(self, input_audio, output_audio, game_line):
|
|
104
|
-
voice_activity = self._detect_voice_activity(input_audio)
|
|
104
|
+
def process_audio(self, input_audio, output_audio, game_line, text_mined):
|
|
105
|
+
voice_activity = self._detect_voice_activity(input_audio, text_mined)
|
|
105
106
|
|
|
106
107
|
if not voice_activity:
|
|
107
108
|
logger.info("No voice activity detected in the audio.")
|
|
@@ -140,7 +141,7 @@ class SileroVADProcessor(VADProcessor):
|
|
|
140
141
|
self.vad_model = load_silero_vad()
|
|
141
142
|
self.vad_system_name = SILERO
|
|
142
143
|
|
|
143
|
-
def _detect_voice_activity(self, input_audio):
|
|
144
|
+
def _detect_voice_activity(self, input_audio, text_mined):
|
|
144
145
|
from silero_vad import read_audio, get_speech_timestamps
|
|
145
146
|
temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
|
|
146
147
|
ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
|
|
@@ -166,7 +167,7 @@ class WhisperVADProcessor(VADProcessor):
|
|
|
166
167
|
logger.info(f"Whisper model '{get_config().vad.whisper_model}' loaded.")
|
|
167
168
|
return self.vad_model
|
|
168
169
|
|
|
169
|
-
def _detect_voice_activity(self, input_audio):
|
|
170
|
+
def _detect_voice_activity(self, input_audio, text_mined):
|
|
170
171
|
from stable_whisper import WhisperResult
|
|
171
172
|
# Convert the audio to 16kHz mono WAV, evidence https://discord.com/channels/1286409772383342664/1286518821913362445/1407017127529152533
|
|
172
173
|
temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
|
|
@@ -178,10 +179,22 @@ class WhisperVADProcessor(VADProcessor):
|
|
|
178
179
|
with warnings.catch_warnings():
|
|
179
180
|
warnings.simplefilter("ignore")
|
|
180
181
|
result: WhisperResult = self.vad_model.transcribe(temp_wav, vad=True, language=get_config().vad.language, vad_filter=get_config().vad.use_vad_filter_for_whisper,
|
|
181
|
-
temperature=0.0)
|
|
182
|
+
temperature=0.0, chunk_length=60)
|
|
182
183
|
voice_activity = []
|
|
183
184
|
|
|
184
185
|
logger.debug(json.dumps(result.to_dict()))
|
|
186
|
+
|
|
187
|
+
text = result.text.strip()
|
|
188
|
+
|
|
189
|
+
# If both mined text and Whisper transcription are available, compare their similarity
|
|
190
|
+
if text_mined and text:
|
|
191
|
+
from rapidfuzz import fuzz
|
|
192
|
+
similarity = fuzz.partial_ratio(text_mined, text)
|
|
193
|
+
logger.info(f"Whisper transcription: '{text}' | Mined text: '{text_mined}' | Partial similarity: {similarity:.1f}")
|
|
194
|
+
# If similarity is very low, treat as no voice activity detected
|
|
195
|
+
if similarity < 20:
|
|
196
|
+
logger.info(f"Partial similarity {similarity:.1f} is below threshold, skipping voice activity.")
|
|
197
|
+
return []
|
|
185
198
|
|
|
186
199
|
# Process the segments to extract tokens, timestamps, and confidence
|
|
187
200
|
previous_segment = None
|
|
@@ -193,6 +206,12 @@ class WhisperVADProcessor(VADProcessor):
|
|
|
193
206
|
else:
|
|
194
207
|
logger.info(
|
|
195
208
|
"Unknown single character segment, not skipping, but logging, please report if this is a mistake: " + segment.text)
|
|
209
|
+
|
|
210
|
+
# Skip segments with excessive repeating sequences of at least 3 characters
|
|
211
|
+
match = re.search(r'(.{3,})\1{4,}', segment.text)
|
|
212
|
+
if match:
|
|
213
|
+
logger.debug(f"Skipping segment with excessive repeating sequence (>=5): '{segment.text}' at {segment.start}-{segment.end}. Likely Hallucination.")
|
|
214
|
+
continue
|
|
196
215
|
|
|
197
216
|
if segment.no_speech_prob and segment.no_speech_prob > 0.9:
|
|
198
217
|
logger.debug(f"Skipping segment with high no_speech_prob: {segment.no_speech_prob} for segment {segment.text} at {segment.start}-{segment.end}")
|