GameSentenceMiner 2.17.7__py3-none-any.whl → 2.18.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of GameSentenceMiner might be problematic.

Files changed (76)
  1. GameSentenceMiner/ai/ai_prompting.py +6 -6
  2. GameSentenceMiner/anki.py +236 -152
  3. GameSentenceMiner/gametext.py +7 -4
  4. GameSentenceMiner/gsm.py +49 -10
  5. GameSentenceMiner/locales/en_us.json +7 -3
  6. GameSentenceMiner/locales/ja_jp.json +8 -4
  7. GameSentenceMiner/locales/zh_cn.json +8 -4
  8. GameSentenceMiner/obs.py +238 -59
  9. GameSentenceMiner/ocr/owocr_helper.py +1 -1
  10. GameSentenceMiner/tools/ss_selector.py +7 -8
  11. GameSentenceMiner/ui/__init__.py +0 -0
  12. GameSentenceMiner/ui/anki_confirmation.py +187 -0
  13. GameSentenceMiner/{config_gui.py → ui/config_gui.py} +100 -35
  14. GameSentenceMiner/ui/screenshot_selector.py +215 -0
  15. GameSentenceMiner/util/configuration.py +124 -22
  16. GameSentenceMiner/util/db.py +22 -13
  17. GameSentenceMiner/util/downloader/download_tools.py +2 -2
  18. GameSentenceMiner/util/ffmpeg.py +24 -30
  19. GameSentenceMiner/util/get_overlay_coords.py +34 -34
  20. GameSentenceMiner/util/gsm_utils.py +31 -1
  21. GameSentenceMiner/util/text_log.py +11 -9
  22. GameSentenceMiner/vad.py +31 -12
  23. GameSentenceMiner/web/database_api.py +742 -123
  24. GameSentenceMiner/web/static/css/dashboard-shared.css +241 -0
  25. GameSentenceMiner/web/static/css/kanji-grid.css +94 -2
  26. GameSentenceMiner/web/static/css/overview.css +850 -0
  27. GameSentenceMiner/web/static/css/popups-shared.css +126 -0
  28. GameSentenceMiner/web/static/css/shared.css +97 -0
  29. GameSentenceMiner/web/static/css/stats.css +192 -597
  30. GameSentenceMiner/web/static/js/anki_stats.js +6 -4
  31. GameSentenceMiner/web/static/js/database.js +209 -5
  32. GameSentenceMiner/web/static/js/goals.js +610 -0
  33. GameSentenceMiner/web/static/js/kanji-grid.js +267 -4
  34. GameSentenceMiner/web/static/js/overview.js +1176 -0
  35. GameSentenceMiner/web/static/js/shared.js +25 -0
  36. GameSentenceMiner/web/static/js/stats.js +154 -1459
  37. GameSentenceMiner/web/stats.py +2 -2
  38. GameSentenceMiner/web/templates/anki_stats.html +5 -0
  39. GameSentenceMiner/web/templates/components/kanji_grid/basic_kanji_book_bkb_v1_v2.json +17 -0
  40. GameSentenceMiner/web/templates/components/kanji_grid/duolingo_kanji.json +29 -0
  41. GameSentenceMiner/web/templates/components/kanji_grid/grade.json +17 -0
  42. GameSentenceMiner/web/templates/components/kanji_grid/hk_primary_learning.json +17 -0
  43. GameSentenceMiner/web/templates/components/kanji_grid/hkscs2016.json +13 -0
  44. GameSentenceMiner/web/templates/components/kanji_grid/hsk_levels.json +33 -0
  45. GameSentenceMiner/web/templates/components/kanji_grid/humanum_frequency_list.json +41 -0
  46. GameSentenceMiner/web/templates/components/kanji_grid/jis_levels.json +25 -0
  47. GameSentenceMiner/web/templates/components/kanji_grid/jlpt_level.json +29 -0
  48. GameSentenceMiner/web/templates/components/kanji_grid/jpdb_kanji_frequency_list.json +37 -0
  49. GameSentenceMiner/web/templates/components/kanji_grid/jpdbv2_kanji_frequency_list.json +161 -0
  50. GameSentenceMiner/web/templates/components/kanji_grid/jun_das_modern_chinese_character_frequency_list.json +13 -0
  51. GameSentenceMiner/web/templates/components/kanji_grid/kanji_in_context_revised_edition.json +37 -0
  52. GameSentenceMiner/web/templates/components/kanji_grid/kanji_kentei_level.json +61 -0
  53. GameSentenceMiner/web/templates/components/kanji_grid/mainland_china_elementary_textbook_characters.json +33 -0
  54. GameSentenceMiner/web/templates/components/kanji_grid/moe_way_quiz.json +47 -0
  55. GameSentenceMiner/web/templates/components/kanji_grid/official_kanji.json +25 -0
  56. GameSentenceMiner/web/templates/components/kanji_grid/remembering_the_kanji.json +25 -0
  57. GameSentenceMiner/web/templates/components/kanji_grid/standard_form_of_national_characters.json +25 -0
  58. GameSentenceMiner/web/templates/components/kanji_grid/table_of_general_standard_chinese_characters.json +21 -0
  59. GameSentenceMiner/web/templates/components/kanji_grid/the_kodansha_kanji_learners_course_klc.json +45 -0
  60. GameSentenceMiner/web/templates/components/kanji_grid/thousand_character_classic.json +13 -0
  61. GameSentenceMiner/web/templates/components/kanji_grid/wanikani_levels.json +249 -0
  62. GameSentenceMiner/web/templates/components/kanji_grid/words_hk_frequency_list.json +33 -0
  63. GameSentenceMiner/web/templates/components/navigation.html +3 -1
  64. GameSentenceMiner/web/templates/database.html +73 -1
  65. GameSentenceMiner/web/templates/goals.html +376 -0
  66. GameSentenceMiner/web/templates/index.html +13 -11
  67. GameSentenceMiner/web/templates/overview.html +416 -0
  68. GameSentenceMiner/web/templates/stats.html +46 -251
  69. GameSentenceMiner/web/texthooking_page.py +18 -0
  70. {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.1.dist-info}/METADATA +5 -1
  71. gamesentenceminer-2.18.1.dist-info/RECORD +132 -0
  72. gamesentenceminer-2.17.7.dist-info/RECORD +0 -98
  73. {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.1.dist-info}/WHEEL +0 -0
  74. {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.1.dist-info}/entry_points.txt +0 -0
  75. {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.1.dist-info}/licenses/LICENSE +0 -0
  76. {gamesentenceminer-2.17.7.dist-info → gamesentenceminer-2.18.1.dist-info}/top_level.txt +0 -0
GameSentenceMiner/util/ffmpeg.py CHANGED
@@ -5,23 +5,20 @@ import sys
 import tempfile
 import time
 from pathlib import Path
+import subprocess
+from pathlib import Path
+import shutil
+
 
 from GameSentenceMiner import obs
-from GameSentenceMiner.util.configuration import get_app_directory, is_windows, logger, get_config, \
+from GameSentenceMiner.ui.config_gui import ConfigApp
+from GameSentenceMiner.util.configuration import ffmpeg_base_command_list, get_ffprobe_path, logger, get_config, \
     get_temporary_directory, gsm_state, is_linux
 from GameSentenceMiner.util.gsm_utils import make_unique_file_name, get_file_modification_time
 from GameSentenceMiner.util import configuration
 from GameSentenceMiner.util.text_log import initial_time
 
 
-def get_ffmpeg_path():
-    return os.path.join(get_app_directory(), "ffmpeg", "ffmpeg.exe") if is_windows() else "ffmpeg"
-
-def get_ffprobe_path():
-    return os.path.join(get_app_directory(), "ffmpeg", "ffprobe.exe") if is_windows() else "ffprobe"
-
-ffmpeg_base_command_list = [get_ffmpeg_path(), "-hide_banner", "-loglevel", "error", '-nostdin']
-
 supported_formats = {
     'opus': 'libopus',
     'mp3': 'libmp3lame',
@@ -30,11 +27,6 @@ supported_formats = {
     'm4a': 'aac',
 }
 
-import subprocess
-from pathlib import Path
-import shutil
-
-
 def video_to_anim(
     input_path: str | Path,
     output_path: str | Path = None,
@@ -184,22 +176,24 @@ def call_frame_extractor(video_path, timestamp):
         str: The path of the selected image, or None on error.
     """
     try:
-        logger.info(' '.join([sys.executable, "-m", "GameSentenceMiner.tools.ss_selector", video_path, str(timestamp)]))
-
-        # Run the script using subprocess.run()
-        result = subprocess.run(
-            [sys.executable, "-m", "GameSentenceMiner.tools.ss_selector", video_path, str(timestamp), get_config().screenshot.screenshot_timing_setting],  # Use sys.executable
-            capture_output=True,
-            text=True,  # Get output as text
-            check=False  # Raise an exception for non-zero exit codes
-        )
-        if result.returncode != 0:
-            logger.error(f"Script failed with return code: {result.returncode}")
-            return None
-        logger.info(result)
-        # Print the standard output
-        logger.info(f"Frame extractor script output: {result.stdout.strip()}")
-        return result.stdout.strip()  # Return the output
+        config_app: ConfigApp = gsm_state.config_app
+        return config_app.show_screenshot_selector(video_path, timestamp, get_config().screenshot.screenshot_timing_setting)
+        # logger.info(' '.join([sys.executable, "-m", "GameSentenceMiner.tools.ss_selector", video_path, str(timestamp)]))
+
+        # # Run the script using subprocess.run()
+        # result = subprocess.run(
+        #     [sys.executable, "-m", "GameSentenceMiner.tools.ss_selector", video_path, str(timestamp), get_config().screenshot.screenshot_timing_setting],  # Use sys.executable
+        #     capture_output=True,
+        #     text=True,  # Get output as text
+        #     check=False  # Raise an exception for non-zero exit codes
+        # )
+        # if result.returncode != 0:
+        #     logger.error(f"Script failed with return code: {result.returncode}")
+        #     return None
+        # logger.info(result)
+        # # Print the standard output
+        # logger.info(f"Frame extractor script output: {result.stdout.strip()}")
+        # return result.stdout.strip()  # Return the output
 
     except subprocess.CalledProcessError as e:
         logger.error(f"Error calling script: {e}")
GameSentenceMiner/util/get_overlay_coords.py CHANGED
@@ -190,37 +190,38 @@ class OverlayProcessor:
         """
         with mss.mss() as sct:
             monitors = sct.monitors[1:]
-            if is_windows() and monitor_index == 0:
-                from ctypes import wintypes
-                import ctypes
-                # Get work area for primary monitor (ignores taskbar)
-                SPI_GETWORKAREA = 0x0030
-                rect = wintypes.RECT()
-                res = ctypes.windll.user32.SystemParametersInfoW(
-                    SPI_GETWORKAREA, 0, ctypes.byref(rect), 0
-                )
-                if not res:
-                    raise ctypes.WinError()
+            return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
+            # if is_windows() and monitor_index == 0:
+            #     from ctypes import wintypes
+            #     import ctypes
+            #     # Get work area for primary monitor (ignores taskbar)
+            #     SPI_GETWORKAREA = 0x0030
+            #     rect = wintypes.RECT()
+            #     res = ctypes.windll.user32.SystemParametersInfoW(
+            #         SPI_GETWORKAREA, 0, ctypes.byref(rect), 0
+            #     )
+            #     if not res:
+            #         raise ctypes.WinError()
 
-                return {
-                    "left": rect.left,
-                    "top": rect.top,
-                    "width": rect.right - rect.left,
-                    "height": rect.bottom - rect.top,
-                }
-            elif is_windows() and monitor_index > 0:
-                # Secondary monitors: just return with a guess of how tall the taskbar is
-                taskbar_height_guess = 48  # A common taskbar height, may vary
-                mon = monitors[monitor_index]
-                return {
-                    "left": mon["left"],
-                    "top": mon["top"],
-                    "width": mon["width"],
-                    "height": mon["height"] - taskbar_height_guess
-                }
-            else:
-                # For non-Windows systems or unspecified monitors, return the monitor area as-is
-                return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
+            # return {
+            #     "left": rect.left,
+            #     "top": rect.top,
+            #     "width": rect.right - rect.left,
+            #     "height": rect.bottom - rect.top,
+            # }
+            # elif is_windows() and monitor_index > 0:
+            #     # Secondary monitors: just return with a guess of how tall the taskbar is
+            #     taskbar_height_guess = 48  # A common taskbar height, may vary
+            #     mon = monitors[monitor_index]
+            #     return {
+            #         "left": mon["left"],
+            #         "top": mon["top"],
+            #         "width": mon["width"],
+            #         "height": mon["height"] - taskbar_height_guess
+            #     }
+            # else:
+            #     # For non-Windows systems or unspecified monitors, return the monitor area as-is
+            #     return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
 
 
     def _get_full_screenshot(self) -> Tuple[Image.Image | None, int, int]:
@@ -309,11 +310,9 @@ class OverlayProcessor:
 
         score = fuzz.ratio(text_str, self.last_oneocr_result)
         if score >= 80:
-            logger.info("OneOCR results are similar to the last results (score: %d). Skipping overlay update.", score)
             return
         self.last_oneocr_result = text_str
 
-        logger.info("Sending OneOCR results to overlay.")
         await send_word_coordinates_to_overlay(self._convert_oneocr_results_to_percentages(oneocr_results, monitor_width, monitor_height))
 
         # If User Home is beangate
@@ -322,7 +321,7 @@
             f.write(json.dumps(oneocr_results, ensure_ascii=False, indent=2))
 
         if get_config().overlay.engine == OverlayEngine.ONEOCR.value and self.oneocr:
-            logger.info("Using OneOCR results for overlay as configured.")
+            logger.info("Sent %d text boxes to overlay.", len(oneocr_results))
             return
 
         # 3. Create a composite image with only the detected text regions
@@ -371,8 +370,9 @@
             crop_height=composite_image.height,
             use_percentages=True
         )
-        logger.info("Sending Google Lens results to overlay.")
         await send_word_coordinates_to_overlay(extracted_data)
+
+        logger.info("Sent %d text boxes to overlay.", len(extracted_data))
 
     def _extract_text_with_pixel_boxes(
         self,
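The overlay hunks above hand OCR boxes to `_convert_oneocr_results_to_percentages` (and pass `use_percentages=True` on the Google Lens path) before sending them to the overlay, but the conversion itself is not part of this diff. As a rough illustration only, a pixel-to-percentage mapping for one box could look like the hypothetical helper below; the box layout and field names are assumptions, not the package's actual structures.

    def box_to_percentages(box: dict, monitor_width: int, monitor_height: int) -> dict:
        """Hypothetical sketch: scale absolute pixel coordinates into 0-100
        percentages of the captured monitor so the overlay stays resolution-independent."""
        return {
            "x1": box["x1"] / monitor_width * 100,
            "y1": box["y1"] / monitor_height * 100,
            "x2": box["x2"] / monitor_width * 100,
            "y2": box["y2"] / monitor_height * 100,
        }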
GameSentenceMiner/util/gsm_utils.py CHANGED
@@ -13,7 +13,7 @@ from pathlib import Path
 import requests
 from rapidfuzz import process
 
-from GameSentenceMiner.util.configuration import logger, get_config, get_app_directory
+from GameSentenceMiner.util.configuration import gsm_state, logger, get_config, get_app_directory, get_temporary_directory
 
 SCRIPTS_DIR = r"E:\Japanese Stuff\agent-v0.1.4-win32-x64\data\scripts"
 
@@ -22,6 +22,13 @@ def run_new_thread(func):
     thread.start()
     return thread
 
+def make_unique_temp_file(path):
+    path = Path(path)
+    current_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]
+    temp_dir = get_temporary_directory()
+    os.makedirs(temp_dir, exist_ok=True)
+    return str(Path(temp_dir) / f"{path.stem}_{current_time}{path.suffix}")
+
 def make_unique_file_name(path):
     path = Path(path)
     current_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]
@@ -258,6 +265,29 @@ TEXT_REPLACEMENTS_FILE = os.path.join(get_app_directory(), 'config', 'text_repla
 OCR_REPLACEMENTS_FILE = os.path.join(get_app_directory(), 'config', 'ocr_replacements.json')
 os.makedirs(os.path.dirname(TEXT_REPLACEMENTS_FILE), exist_ok=True)
 
+
+def add_srt_line(line_time, new_line):
+    global srt_index
+    if get_config().features.generate_longplay and gsm_state.recording_started_time and new_line.prev:
+        logger.info(f"Adding SRT line {new_line.prev.text}... for longplay")
+        with open(gsm_state.current_srt, 'a', encoding='utf-8') as srt_file:
+            # Calculate start and end times for the previous line
+            prev_start_time = new_line.prev.time - gsm_state.recording_started_time
+            prev_end_time = (line_time if line_time else datetime.now()) - gsm_state.recording_started_time
+            # Format times as SRT timestamps (HH:MM:SS,mmm)
+            def format_srt_time(td, offset=0):
+                total_seconds = int(td.total_seconds()) + offset
+                hours = total_seconds // 3600
+                minutes = (total_seconds % 3600) // 60
+                seconds = total_seconds % 60
+                milliseconds = int(td.microseconds / 1000)
+                return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
+
+            srt_file.write(f"{gsm_state.srt_index}\n")
+            srt_file.write(f"{format_srt_time(prev_start_time)} --> {format_srt_time(prev_end_time, offset=-1)}\n")
+            srt_file.write(f"{new_line.prev.text}\n\n")
+            gsm_state.srt_index += 1
+
 # if not os.path.exists(OCR_REPLACEMENTS_FILE):
 #     url = "https://raw.githubusercontent.com/bpwhelan/GameSentenceMiner/refs/heads/main/electron-src/assets/ocr_replacements.json"
 #     try:
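For reference, the `format_srt_time` helper added above converts a `timedelta` into an SRT timestamp, and each `add_srt_line` call appends one numbered cue (index, time range, text, blank line). A minimal standalone check of the timestamp formatting, with an arbitrarily chosen duration:

    from datetime import timedelta

    def format_srt_time(td, offset=0):
        # Same logic as the helper introduced in the hunk above.
        total_seconds = int(td.total_seconds()) + offset
        hours = total_seconds // 3600
        minutes = (total_seconds % 3600) // 60
        seconds = total_seconds % 60
        milliseconds = int(td.microseconds / 1000)
        return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

    print(format_srt_time(timedelta(minutes=1, seconds=2, milliseconds=345)))  # -> 00:01:02,345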
GameSentenceMiner/util/text_log.py CHANGED
@@ -89,11 +89,11 @@ class GameText:
             scene=gsm_state.current_game or ""
         )
         self.values_dict[line_id] = new_line
-        logger.debug(f"Adding line: {new_line}")
         self.game_line_index += 1
         if self.values:
             self.values[-1].next = new_line
         self.values.append(new_line)
+        return new_line
         # self.remove_old_events(datetime.now() - timedelta(minutes=10))
 
     def has_line(self, line_text) -> bool:
@@ -119,16 +119,17 @@ def strip_whitespace_and_punctuation(text: str) -> str:
     return re.sub(r'[\s 、。「」【】《》., ]', '', text).strip()
 
 
+# TODO See if partial_ratio is better than ratio
 def lines_match(texthooker_sentence, anki_sentence, similarity_threshold=80) -> bool:
     # Replace newlines, spaces, other whitespace characters, AND japanese punctuation
     texthooker_sentence = strip_whitespace_and_punctuation(texthooker_sentence)
     anki_sentence = strip_whitespace_and_punctuation(anki_sentence)
     similarity = rapidfuzz.fuzz.ratio(texthooker_sentence, anki_sentence)
-    logger.debug(f"Comparing sentences: '{texthooker_sentence}' and '{anki_sentence}' - Similarity: {similarity}")
-    if texthooker_sentence in anki_sentence:
-        logger.debug(f"One contains the other: {texthooker_sentence} in {anki_sentence} - Similarity: {similarity}")
-    elif anki_sentence in texthooker_sentence:
-        logger.debug(f"One contains the other: {anki_sentence} in {texthooker_sentence} - Similarity: {similarity}")
+    # logger.debug(f"Comparing sentences: '{texthooker_sentence}' and '{anki_sentence}' - Similarity: {similarity}")
+    # if texthooker_sentence in anki_sentence:
+    #     logger.debug(f"One contains the other: {texthooker_sentence} in {anki_sentence} - Similarity: {similarity}")
+    # elif anki_sentence in texthooker_sentence:
+    #     logger.debug(f"One contains the other: {anki_sentence} in {texthooker_sentence} - Similarity: {similarity}")
     return (anki_sentence in texthooker_sentence) or (texthooker_sentence in anki_sentence) or (similarity >= similarity_threshold)
 
 
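The TODO added above asks whether `rapidfuzz.fuzz.partial_ratio` would serve `lines_match` better than `fuzz.ratio`. The practical difference is that `partial_ratio` scores the best-matching substring, so a short hooked line embedded in a longer Anki sentence can still score near 100, while `ratio` is penalized by the length difference. A minimal comparison with made-up strings:

    from rapidfuzz import fuzz

    hooked = "こんにちは"
    anki = "こんにちは、世界。今日はいい天気ですね。"

    print(fuzz.ratio(hooked, anki))          # low score: penalized by the length difference
    print(fuzz.partial_ratio(hooked, anki))  # ~100: compares against the best-matching substring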
@@ -145,7 +146,8 @@ def get_text_event(last_note) -> GameLine:
     if not sentence:
         return lines[-1]
 
-    for line in reversed(lines):
+    # Check the last 50 lines for a match
+    for line in reversed(lines[-50:]):
         if lines_match(line.text, remove_html_and_cloze_tags(sentence)):
             return line
 
@@ -181,7 +183,7 @@ def get_mined_line(last_note: AnkiCard, lines=None):
         raise Exception("No voicelines in GSM. GSM can only do work on text that has been sent to it since it started. If you are not getting any text into GSM, please check your setup/config.")
 
     sentence = last_note.get_field(get_config().anki.sentence_field)
-    for line in reversed(lines):
+    for line in reversed(lines[-50:]):
         if lines_match(line.get_stripped_text(), remove_html_and_cloze_tags(sentence)):
             return line
     return lines[-1]
@@ -199,7 +201,7 @@ def get_text_log() -> GameText:
     return game_log
 
 def add_line(current_line_after_regex, line_time):
-    game_log.add_line(current_line_after_regex, line_time)
+    return game_log.add_line(current_line_after_regex, line_time)
 
 def get_line_by_id(line_id: str) -> Optional[GameLine]:
     """
GameSentenceMiner/vad.py CHANGED
@@ -5,6 +5,7 @@ import shutil
 import tempfile
 import time
 import warnings
+import re
 
 from abc import abstractmethod, ABC
 from GameSentenceMiner.util import configuration, ffmpeg
@@ -35,26 +36,26 @@ class VADSystem:
         # if not self.groq:
         #     self.groq = GroqVADProcessor()
 
-    def trim_audio_with_vad(self, input_audio, output_audio, game_line):
+    def trim_audio_with_vad(self, input_audio, output_audio, game_line, full_text):
         if get_config().vad.do_vad_postprocessing:
-            result = self._do_vad_processing(get_config().vad.selected_vad_model, input_audio, output_audio, game_line)
+            result = self._do_vad_processing(get_config().vad.selected_vad_model, input_audio, output_audio, game_line, full_text)
             if not result.success and get_config().vad.backup_vad_model != configuration.OFF:
                 logger.info("No voice activity detected, using backup VAD model.")
-                result = self._do_vad_processing(get_config().vad.backup_vad_model, input_audio, output_audio, game_line)
+                result = self._do_vad_processing(get_config().vad.backup_vad_model, input_audio, output_audio, game_line, full_text)
             return result
 
-    def _do_vad_processing(self, model, input_audio, output_audio, game_line):
+    def _do_vad_processing(self, model, input_audio, output_audio, game_line, text_mined):
         match model:
             case configuration.OFF:
                 return VADResult(False, 0, 0, "OFF")
             case configuration.SILERO:
                 if not self.silero:
                     self.silero = SileroVADProcessor()
-                return self.silero.process_audio(input_audio, output_audio, game_line)
+                return self.silero.process_audio(input_audio, output_audio, game_line, text_mined)
             case configuration.WHISPER:
                 if not self.whisper:
                     self.whisper = WhisperVADProcessor()
-                return self.whisper.process_audio(input_audio, output_audio, game_line)
+                return self.whisper.process_audio(input_audio, output_audio, game_line, text_mined)
 
 # Base class for VAD systems
 class VADProcessor(ABC):
@@ -63,7 +64,7 @@ class VADProcessor(ABC):
         self.vad_system_name = None
 
     @abstractmethod
-    def _detect_voice_activity(self, input_audio):
+    def _detect_voice_activity(self, input_audio, text_mined):
        pass
 
     @staticmethod
@@ -100,8 +101,8 @@ class VADProcessor(ABC):
         shutil.move(files[0], output_audio)
 
 
-    def process_audio(self, input_audio, output_audio, game_line):
-        voice_activity = self._detect_voice_activity(input_audio)
+    def process_audio(self, input_audio, output_audio, game_line, text_mined):
+        voice_activity = self._detect_voice_activity(input_audio, text_mined)
 
         if not voice_activity:
             logger.info("No voice activity detected in the audio.")
@@ -140,7 +141,7 @@ class SileroVADProcessor(VADProcessor):
         self.vad_model = load_silero_vad()
         self.vad_system_name = SILERO
 
-    def _detect_voice_activity(self, input_audio):
+    def _detect_voice_activity(self, input_audio, text_mined):
         from silero_vad import read_audio, get_speech_timestamps
         temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
         ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
@@ -166,7 +167,7 @@ class WhisperVADProcessor(VADProcessor):
         logger.info(f"Whisper model '{get_config().vad.whisper_model}' loaded.")
         return self.vad_model
 
-    def _detect_voice_activity(self, input_audio):
+    def _detect_voice_activity(self, input_audio, text_mined):
         from stable_whisper import WhisperResult
         # Convert the audio to 16kHz mono WAV, evidence https://discord.com/channels/1286409772383342664/1286518821913362445/1407017127529152533
         temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
@@ -178,10 +179,22 @@ class WhisperVADProcessor(VADProcessor):
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             result: WhisperResult = self.vad_model.transcribe(temp_wav, vad=True, language=get_config().vad.language, vad_filter=get_config().vad.use_vad_filter_for_whisper,
-                                                              temperature=0.0)
+                                                              temperature=0.0, chunk_length=60)
         voice_activity = []
 
         logger.debug(json.dumps(result.to_dict()))
+
+        text = result.text.strip()
+
+        # If both mined text and Whisper transcription are available, compare their similarity
+        if text_mined and text:
+            from rapidfuzz import fuzz
+            similarity = fuzz.partial_ratio(text_mined, text)
+            logger.info(f"Whisper transcription: '{text}' | Mined text: '{text_mined}' | Partial similarity: {similarity:.1f}")
+            # If similarity is very low, treat as no voice activity detected
+            if similarity < 20:
+                logger.info(f"Partial similarity {similarity:.1f} is below threshold, skipping voice activity.")
+                return []
 
         # Process the segments to extract tokens, timestamps, and confidence
         previous_segment = None
@@ -193,6 +206,12 @@
                 else:
                     logger.info(
                         "Unknown single character segment, not skipping, but logging, please report if this is a mistake: " + segment.text)
+
+                # Skip segments with excessive repeating sequences of at least 3 characters
+                match = re.search(r'(.{3,})\1{4,}', segment.text)
+                if match:
+                    logger.debug(f"Skipping segment with excessive repeating sequence (>=5): '{segment.text}' at {segment.start}-{segment.end}. Likely Hallucination.")
+                    continue
 
                 if segment.no_speech_prob and segment.no_speech_prob > 0.9:
                     logger.debug(f"Skipping segment with high no_speech_prob: {segment.no_speech_prob} for segment {segment.text} at {segment.start}-{segment.end}")