GameSentenceMiner 2.9.0__py3-none-any.whl → 2.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GameSentenceMiner/anki.py +1 -1
- GameSentenceMiner/communication/websocket.py +7 -0
- GameSentenceMiner/config_gui.py +54 -19
- GameSentenceMiner/configuration.py +67 -3
- GameSentenceMiner/ffmpeg.py +2 -2
- GameSentenceMiner/gametext.py +71 -62
- GameSentenceMiner/gsm.py +103 -51
- GameSentenceMiner/obs.py +2 -2
- GameSentenceMiner/ocr/owocr_helper.py +17 -25
- GameSentenceMiner/owocr/owocr/ocr.py +4 -3
- GameSentenceMiner/text_log.py +1 -1
- GameSentenceMiner/vad/groq_trim.py +82 -0
- GameSentenceMiner/vad/result.py +15 -2
- GameSentenceMiner/vad/silero_trim.py +14 -10
- GameSentenceMiner/vad/vosk_helper.py +2 -2
- GameSentenceMiner/vad/whisper_helper.py +8 -7
- GameSentenceMiner/web/texthooking_page.py +41 -26
- {gamesentenceminer-2.9.0.dist-info → gamesentenceminer-2.9.2.dist-info}/METADATA +5 -2
- {gamesentenceminer-2.9.0.dist-info → gamesentenceminer-2.9.2.dist-info}/RECORD +23 -22
- {gamesentenceminer-2.9.0.dist-info → gamesentenceminer-2.9.2.dist-info}/WHEEL +1 -1
- {gamesentenceminer-2.9.0.dist-info → gamesentenceminer-2.9.2.dist-info}/entry_points.txt +0 -0
- {gamesentenceminer-2.9.0.dist-info → gamesentenceminer-2.9.2.dist-info}/licenses/LICENSE +0 -0
- {gamesentenceminer-2.9.0.dist-info → gamesentenceminer-2.9.2.dist-info}/top_level.txt +0 -0
GameSentenceMiner/gsm.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import asyncio
|
2
|
+
import shutil
|
2
3
|
import sys
|
3
4
|
|
4
5
|
from GameSentenceMiner.vad.result import VADResult
|
@@ -59,39 +60,28 @@ root = None
|
|
59
60
|
|
60
61
|
|
61
62
|
class VideoToAudioHandler(FileSystemEventHandler):
|
63
|
+
def __init__(self):
|
64
|
+
super().__init__()
|
65
|
+
|
66
|
+
|
62
67
|
def on_created(self, event):
|
63
68
|
if event.is_directory or ("Replay" not in event.src_path and "GSM" not in event.src_path):
|
64
69
|
return
|
65
70
|
if event.src_path.endswith(".mkv") or event.src_path.endswith(".mp4"): # Adjust based on your OBS output format
|
66
71
|
logger.info(f"MKV {event.src_path} FOUND, RUNNING LOGIC")
|
67
72
|
wait_for_stable_file(event.src_path)
|
68
|
-
self.
|
73
|
+
self.process_replay(event.src_path)
|
69
74
|
|
70
|
-
|
71
|
-
def convert_to_audio(video_path):
|
75
|
+
def process_replay(self, video_path):
|
72
76
|
vad_trimmed_audio = ''
|
73
|
-
|
74
|
-
|
75
|
-
line: GameLine = texthooking_page.event_manager.line_for_audio
|
76
|
-
texthooking_page.event_manager.line_for_audio = None
|
77
|
-
if get_config().advanced.audio_player_path:
|
78
|
-
audio = VideoToAudioHandler.get_audio(line, line.next.time if line.next else None, video_path, temporary=True)
|
79
|
-
play_audio_in_external(audio)
|
80
|
-
os.remove(video_path)
|
81
|
-
elif get_config().advanced.video_player_path:
|
82
|
-
play_video_in_external(line, video_path)
|
83
|
-
return
|
84
|
-
if texthooking_page.event_manager.line_for_screenshot:
|
85
|
-
line: GameLine = texthooking_page.event_manager.line_for_screenshot
|
86
|
-
texthooking_page.event_manager.line_for_screenshot = None
|
87
|
-
screenshot = ffmpeg.get_screenshot_for_line(video_path, line, True)
|
88
|
-
os.startfile(screenshot)
|
89
|
-
os.remove(video_path)
|
90
|
-
return
|
91
|
-
except Exception as e:
|
92
|
-
logger.error(f"Error Playing Audio/Video: {e}")
|
93
|
-
logger.debug(f"Error Playing Audio/Video: {e}", exc_info=True)
|
77
|
+
print(video_path)
|
78
|
+
if "previous.mkv" in video_path:
|
94
79
|
os.remove(video_path)
|
80
|
+
video_path = gsm_state.previous_replay
|
81
|
+
else:
|
82
|
+
gsm_state.previous_replay = video_path
|
83
|
+
if gsm_state.line_for_audio or gsm_state.line_for_screenshot:
|
84
|
+
self.handle_texthooker_button(video_path)
|
95
85
|
return
|
96
86
|
try:
|
97
87
|
if anki.card_queue and len(anki.card_queue) > 0:
|
@@ -148,7 +138,7 @@ class VideoToAudioHandler(FileSystemEventHandler):
|
|
148
138
|
mined_line=mined_line)
|
149
139
|
else:
|
150
140
|
final_audio_output = ""
|
151
|
-
vad_result = VADResult(False, 0, 0)
|
141
|
+
vad_result = VADResult(False, 0, 0, '')
|
152
142
|
vad_trimmed_audio = ""
|
153
143
|
if not get_config().audio.enabled:
|
154
144
|
logger.info("Audio is disabled in config, skipping audio processing!")
|
@@ -184,6 +174,43 @@ class VideoToAudioHandler(FileSystemEventHandler):
|
|
184
174
|
if vad_trimmed_audio and get_config().paths.remove_audio and os.path.exists(vad_trimmed_audio):
|
185
175
|
os.remove(vad_trimmed_audio) # Optionally remove the screenshot after conversion
|
186
176
|
|
177
|
+
def handle_texthooker_button(self, video_path):
|
178
|
+
try:
|
179
|
+
if gsm_state.line_for_audio:
|
180
|
+
line: GameLine = gsm_state.line_for_audio
|
181
|
+
gsm_state.line_for_audio = None
|
182
|
+
if line == gsm_state.previous_line_for_audio:
|
183
|
+
logger.info("Line is the same as the last one, skipping processing.")
|
184
|
+
if get_config().advanced.audio_player_path:
|
185
|
+
play_audio_in_external(gsm_state.previous_audio)
|
186
|
+
elif get_config().advanced.video_player_path:
|
187
|
+
play_video_in_external(line, gsm_state.previous_audio)
|
188
|
+
return
|
189
|
+
gsm_state.previous_line_for_audio = line
|
190
|
+
if get_config().advanced.audio_player_path:
|
191
|
+
audio = VideoToAudioHandler.get_audio(line, line.next.time if line.next else None, video_path,
|
192
|
+
temporary=True)
|
193
|
+
play_audio_in_external(audio)
|
194
|
+
gsm_state.previous_audio = audio
|
195
|
+
elif get_config().advanced.video_player_path:
|
196
|
+
new_video_path = play_video_in_external(line, video_path)
|
197
|
+
gsm_state.previous_audio = new_video_path
|
198
|
+
gsm_state.previous_replay = new_video_path
|
199
|
+
return
|
200
|
+
if gsm_state.line_for_screenshot:
|
201
|
+
line: GameLine = gsm_state.line_for_screenshot
|
202
|
+
gsm_state.line_for_screenshot = None
|
203
|
+
gsm_state.previous_line_for_screenshot = line
|
204
|
+
screenshot = ffmpeg.get_screenshot_for_line(video_path, line, True)
|
205
|
+
os.startfile(screenshot)
|
206
|
+
return
|
207
|
+
except Exception as e:
|
208
|
+
logger.error(f"Error Playing Audio/Video: {e}")
|
209
|
+
logger.debug(f"Error Playing Audio/Video: {e}", exc_info=True)
|
210
|
+
return
|
211
|
+
finally:
|
212
|
+
if video_path and get_config().paths.remove_video and os.path.exists(video_path):
|
213
|
+
os.remove(video_path)
|
187
214
|
|
188
215
|
@staticmethod
|
189
216
|
def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False, timing_only=False, mined_line=None):
|
@@ -195,9 +222,8 @@ class VideoToAudioHandler(FileSystemEventHandler):
|
|
195
222
|
f"{os.path.abspath(configuration.get_temporary_directory())}/{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}")
|
196
223
|
final_audio_output = make_unique_file_name(os.path.join(get_config().paths.audio_destination,
|
197
224
|
f"{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}"))
|
198
|
-
result = VADResult(False, 0, 0)
|
225
|
+
result = VADResult(False, 0, 0, "")
|
199
226
|
if get_config().vad.do_vad_postprocessing:
|
200
|
-
logger.info("Trimming audio with Voice Detection...")
|
201
227
|
result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio, vad_trimmed_audio, game_line=mined_line)
|
202
228
|
if not result.success:
|
203
229
|
result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio,
|
@@ -209,6 +235,8 @@ class VideoToAudioHandler(FileSystemEventHandler):
|
|
209
235
|
else:
|
210
236
|
logger.info("No voice activity detected.")
|
211
237
|
return None, result, None
|
238
|
+
else:
|
239
|
+
logger.info(result.trim_successful_string())
|
212
240
|
if timing_only:
|
213
241
|
return result
|
214
242
|
if get_config().audio.ffmpeg_reencode_options and os.path.exists(vad_trimmed_audio):
|
@@ -223,6 +251,9 @@ def do_vad_processing(model, trimmed_audio, vad_trimmed_audio, game_line=None, s
|
|
223
251
|
match model:
|
224
252
|
case configuration.OFF:
|
225
253
|
pass
|
254
|
+
case configuration.GROQ:
|
255
|
+
from GameSentenceMiner.vad import groq_trim
|
256
|
+
return groq_trim.process_audio_with_groq(trimmed_audio, vad_trimmed_audio, game_line)
|
226
257
|
case configuration.SILERO:
|
227
258
|
from GameSentenceMiner.vad import silero_trim
|
228
259
|
return silero_trim.process_audio_with_silero(trimmed_audio, vad_trimmed_audio, game_line)
|
@@ -239,7 +270,7 @@ def play_audio_in_external(filepath):
|
|
239
270
|
|
240
271
|
filepath = os.path.normpath(filepath)
|
241
272
|
|
242
|
-
command = [exe, filepath]
|
273
|
+
command = [exe, "--no-video", filepath]
|
243
274
|
|
244
275
|
try:
|
245
276
|
subprocess.Popen(command)
|
@@ -248,10 +279,13 @@ def play_audio_in_external(filepath):
|
|
248
279
|
print(f"An error occurred: {e}")
|
249
280
|
|
250
281
|
def play_video_in_external(line, filepath):
|
251
|
-
def
|
282
|
+
def move_video_when_closed(p, fp):
|
252
283
|
p.wait()
|
253
284
|
os.remove(fp)
|
254
285
|
|
286
|
+
shutil.move(filepath, get_temporary_directory())
|
287
|
+
new_filepath = os.path.join(get_temporary_directory(), os.path.basename(filepath))
|
288
|
+
|
255
289
|
command = [get_config().advanced.video_player_path]
|
256
290
|
|
257
291
|
start, _, _ = get_video_timings(filepath, line)
|
@@ -265,14 +299,17 @@ def play_video_in_external(line, filepath):
|
|
265
299
|
|
266
300
|
logger.info(" ".join(command))
|
267
301
|
|
302
|
+
|
303
|
+
|
268
304
|
try:
|
269
305
|
proc = subprocess.Popen(command)
|
270
306
|
print(f"Opened {filepath} in {get_config().advanced.video_player_path}.")
|
271
|
-
threading.Thread(target=
|
307
|
+
threading.Thread(target=move_video_when_closed, args=(proc, filepath)).start()
|
272
308
|
except FileNotFoundError:
|
273
309
|
print("VLC not found. Make sure it's installed and in your PATH.")
|
274
310
|
except Exception as e:
|
275
311
|
print(f"An error occurred: {e}")
|
312
|
+
return new_filepath
|
276
313
|
|
277
314
|
def convert_to_vlc_seconds(time_str):
|
278
315
|
"""Converts HH:MM:SS.milliseconds to VLC-compatible seconds."""
|
@@ -326,21 +363,25 @@ def get_screenshot():
|
|
326
363
|
logger.error(f"Failed to get Screenshot: {e}")
|
327
364
|
|
328
365
|
|
329
|
-
def create_image():
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
366
|
+
# def create_image():
|
367
|
+
# """Create a simple pickaxe icon."""
|
368
|
+
# width, height = 64, 64
|
369
|
+
# image = Image.new("RGBA", (width, height), (0, 0, 0, 0)) # Transparent background
|
370
|
+
# draw = ImageDraw.Draw(image)
|
371
|
+
#
|
372
|
+
# # Handle (rectangle)
|
373
|
+
# handle_color = (139, 69, 19) # Brown color
|
374
|
+
# draw.rectangle([(30, 15), (34, 50)], fill=handle_color)
|
375
|
+
#
|
376
|
+
# # Blade (triangle-like shape)
|
377
|
+
# blade_color = (192, 192, 192) # Silver color
|
378
|
+
# draw.polygon([(15, 15), (49, 15), (32, 5)], fill=blade_color)
|
379
|
+
#
|
380
|
+
# return image
|
338
381
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
return image
|
382
|
+
def create_image():
|
383
|
+
image_path = os.path.join(os.path.dirname(__file__), "assets", "pickaxe.png")
|
384
|
+
return Image.open(image_path)
|
344
385
|
|
345
386
|
|
346
387
|
def open_settings():
|
@@ -351,7 +392,7 @@ def open_settings():
|
|
351
392
|
def play_most_recent_audio():
|
352
393
|
if get_config().advanced.audio_player_path or get_config().advanced.video_player_path and len(
|
353
394
|
get_all_lines()) > 0:
|
354
|
-
|
395
|
+
gsm_state.line_for_audio = get_all_lines()[-1]
|
355
396
|
obs.save_replay_buffer()
|
356
397
|
else:
|
357
398
|
logger.error("Feature Disabled. No audio or video player path set in config!")
|
@@ -405,7 +446,7 @@ def update_icon(profile=None):
|
|
405
446
|
)
|
406
447
|
|
407
448
|
menu = Menu(
|
408
|
-
MenuItem("Open Settings", open_settings),
|
449
|
+
MenuItem("Open Settings", open_settings, default=True),
|
409
450
|
MenuItem("Open Multi-Mine GUI", open_multimine),
|
410
451
|
MenuItem("Open Log", open_log),
|
411
452
|
MenuItem("Toggle Replay Buffer", play_pause),
|
@@ -442,7 +483,7 @@ def run_tray():
|
|
442
483
|
)
|
443
484
|
|
444
485
|
menu = Menu(
|
445
|
-
MenuItem("Open Settings", open_settings),
|
486
|
+
MenuItem("Open Settings", open_settings, default=True),
|
446
487
|
MenuItem("Open Texthooker", texthooking_page.open_texthooker),
|
447
488
|
MenuItem("Open Log", open_log),
|
448
489
|
MenuItem("Toggle Replay Buffer", play_pause),
|
@@ -451,7 +492,7 @@ def run_tray():
|
|
451
492
|
MenuItem("Exit", exit_program)
|
452
493
|
)
|
453
494
|
|
454
|
-
icon = Icon("TrayApp", create_image(), "
|
495
|
+
icon = Icon("TrayApp", create_image(), "GameSentenceMiner", menu)
|
455
496
|
icon.run()
|
456
497
|
|
457
498
|
|
@@ -575,6 +616,18 @@ def handle_websocket_message(message: Message):
|
|
575
616
|
close_obs()
|
576
617
|
case FunctionName.START_OBS:
|
577
618
|
obs.start_obs()
|
619
|
+
case FunctionName.OPEN_SETTINGS:
|
620
|
+
open_settings()
|
621
|
+
case FunctionName.OPEN_TEXTHOOKER:
|
622
|
+
texthooking_page.open_texthooker()
|
623
|
+
case FunctionName.OPEN_LOG:
|
624
|
+
open_log()
|
625
|
+
case FunctionName.TOGGLE_REPLAY_BUFFER:
|
626
|
+
play_pause(None, None)
|
627
|
+
case FunctionName.RESTART_OBS:
|
628
|
+
restart_obs()
|
629
|
+
case FunctionName.EXIT:
|
630
|
+
exit_program(None, None)
|
578
631
|
case _:
|
579
632
|
logger.debug(f"unknown message from electron websocket: {message.to_json()}")
|
580
633
|
|
@@ -627,7 +680,6 @@ async def register_scene_switcher_callback():
|
|
627
680
|
settings_window.reload_settings()
|
628
681
|
update_icon()
|
629
682
|
|
630
|
-
logger.info("Registering scene switcher callback")
|
631
683
|
await obs.register_scene_change_callback(scene_switcher_callback)
|
632
684
|
|
633
685
|
async def main(reloading=False):
|
@@ -655,8 +707,8 @@ async def main(reloading=False):
|
|
655
707
|
|
656
708
|
|
657
709
|
try:
|
658
|
-
|
659
|
-
|
710
|
+
if get_config().general.open_config_on_startup:
|
711
|
+
root.after(50, settings_window.show)
|
660
712
|
settings_window.add_save_hook(update_icon)
|
661
713
|
settings_window.on_exit = exit_program
|
662
714
|
root.mainloop()
|
GameSentenceMiner/obs.py
CHANGED
@@ -99,12 +99,12 @@ async def check_obs_folder_is_correct():
|
|
99
99
|
obs_record_directory = get_record_directory()
|
100
100
|
if obs_record_directory and os.path.normpath(obs_record_directory) != os.path.normpath(
|
101
101
|
get_config().paths.folder_to_watch):
|
102
|
-
logger.info("OBS Path
|
102
|
+
logger.info("OBS Path wrong, Setting OBS Recording folder in GSM Config...")
|
103
103
|
get_config().paths.folder_to_watch = os.path.normpath(obs_record_directory)
|
104
104
|
get_master_config().sync_shared_fields()
|
105
105
|
save_full_config(get_master_config())
|
106
106
|
else:
|
107
|
-
logger.
|
107
|
+
logger.debug("OBS Recording path looks correct")
|
108
108
|
|
109
109
|
|
110
110
|
def get_obs_websocket_config_values():
|
@@ -194,8 +194,8 @@ class WebsocketServerThread(threading.Thread):
|
|
194
194
|
self._stop_event = stop_event = asyncio.Event()
|
195
195
|
self._event.set()
|
196
196
|
self.server = start_server = websockets.serve(self.server_handler,
|
197
|
-
|
198
|
-
get_config().
|
197
|
+
"0.0.0.0",
|
198
|
+
get_config().advanced.ocr_websocket_port,
|
199
199
|
max_size=1000000000)
|
200
200
|
async with start_server:
|
201
201
|
await stop_event.wait()
|
@@ -313,20 +313,15 @@ def text_callback(text, orig_text, time, img=None, came_from_ss=False, filtering
|
|
313
313
|
done = False
|
314
314
|
|
315
315
|
|
316
|
-
def run_oneocr(ocr_config: OCRConfig,
|
316
|
+
def run_oneocr(ocr_config: OCRConfig, rectangles):
|
317
317
|
global done
|
318
318
|
print("Running OneOCR")
|
319
319
|
screen_area = None
|
320
320
|
screen_areas = []
|
321
|
+
exclusions = []
|
321
322
|
if not ssonly:
|
322
|
-
for rect_config in
|
323
|
-
|
324
|
-
coords = rect_config.coordinates
|
325
|
-
monitor_config = rect_config.monitor
|
326
|
-
screen_area = ",".join(str(c) for c in coords) if area else None
|
327
|
-
if screen_area:
|
328
|
-
screen_areas.append(screen_area)
|
329
|
-
exclusions = list(rect.coordinates for rect in list(filter(lambda x: x.is_excluded, ocr_config.rectangles)))
|
323
|
+
screen_areas = [",".join(str(c) for c in rect_config.coordinates) for rect_config in rectangles if not rect_config.is_excluded]
|
324
|
+
exclusions = list(rect.coordinates for rect in list(filter(lambda x: x.is_excluded, rectangles)))
|
330
325
|
|
331
326
|
run.init_config(False)
|
332
327
|
run.run(read_from="screencapture" if not ssonly else "clipboard",
|
@@ -334,13 +329,13 @@ def run_oneocr(ocr_config: OCRConfig, area=False):
|
|
334
329
|
write_to="callback",
|
335
330
|
screen_capture_area=screen_area,
|
336
331
|
# screen_capture_monitor=monitor_config['index'],
|
337
|
-
screen_capture_window=ocr_config.window,
|
332
|
+
screen_capture_window=ocr_config.window if ocr_config and ocr_config.window else None,
|
338
333
|
screen_capture_only_active_windows=get_requires_open_window(),
|
339
334
|
screen_capture_delay_secs=get_ocr_scan_rate(), engine=ocr1,
|
340
335
|
text_callback=text_callback,
|
341
336
|
screen_capture_exclusions=exclusions,
|
342
337
|
language=language,
|
343
|
-
monitor_index=
|
338
|
+
monitor_index=None,
|
344
339
|
ocr1=ocr1,
|
345
340
|
ocr2=ocr2,
|
346
341
|
gsm_ocr_config=ocr_config,
|
@@ -380,7 +375,7 @@ if __name__ == "__main__":
|
|
380
375
|
import sys
|
381
376
|
|
382
377
|
args = sys.argv[1:]
|
383
|
-
if len(args)
|
378
|
+
if len(args) >= 4:
|
384
379
|
language = args[0]
|
385
380
|
ocr1 = args[1]
|
386
381
|
ocr2 = args[2]
|
@@ -418,22 +413,19 @@ if __name__ == "__main__":
|
|
418
413
|
else:
|
419
414
|
logger.error(f"Window '{ocr_config.window}' not found within 30 seconds.")
|
420
415
|
sys.exit(1)
|
421
|
-
|
422
|
-
if ocr_config:
|
423
|
-
rectangles =
|
416
|
+
logger.info(f"Starting OCR with configuration: Window: {ocr_config.window}, Rectangles: {ocr_config.rectangles}, Engine 1: {ocr1}, Engine 2: {ocr2}, Two-pass OCR: {twopassocr}")
|
417
|
+
if ocr_config or ssonly:
|
418
|
+
rectangles = ocr_config.rectangles if ocr_config and ocr_config.rectangles else []
|
424
419
|
oneocr_threads = []
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
420
|
+
ocr_thread = threading.Thread(target=run_oneocr, args=(ocr_config,rectangles ), daemon=True)
|
421
|
+
ocr_thread.start()
|
422
|
+
if not ssonly:
|
423
|
+
websocket_server_thread = WebsocketServerThread(read=True)
|
424
|
+
websocket_server_thread.start()
|
430
425
|
try:
|
431
426
|
while not done:
|
432
427
|
time.sleep(1)
|
433
428
|
except KeyboardInterrupt as e:
|
434
429
|
pass
|
435
|
-
for thread in oneocr_threads:
|
436
|
-
thread.join()
|
437
|
-
# asyncio.run(websocket_client())
|
438
430
|
else:
|
439
431
|
print("Failed to load OCR configuration. Please check the logs.")
|
@@ -1043,7 +1043,7 @@ class GeminiOCR:
|
|
1043
1043
|
}
|
1044
1044
|
},
|
1045
1045
|
{
|
1046
|
-
'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
|
1046
|
+
'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
|
1047
1047
|
}
|
1048
1048
|
]
|
1049
1049
|
}
|
@@ -1096,13 +1096,14 @@ class GroqOCR:
|
|
1096
1096
|
return (False, 'Error processing image for Groq.')
|
1097
1097
|
|
1098
1098
|
prompt = (
|
1099
|
-
"Analyze
|
1099
|
+
"Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary."
|
1100
|
+
# "Analyze this i#mage and extract text from it"
|
1100
1101
|
# "(speech bubbles or panels containing character dialogue). From the extracted dialogue text, "
|
1101
1102
|
# "filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, "
|
1102
1103
|
# "including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. "
|
1103
1104
|
# "If no text is found within dialogue boxes after applying filters, return an empty string. "
|
1104
1105
|
# "OR, if there are no text bubbles or dialogue boxes found, return everything."
|
1105
|
-
"Do not include any other output, formatting markers, or commentary, only the text from the image."
|
1106
|
+
# "Do not include any other output, formatting markers, or commentary, only the text from the image."
|
1106
1107
|
)
|
1107
1108
|
|
1108
1109
|
response = self.client.chat.completions.create(
|
GameSentenceMiner/text_log.py
CHANGED
@@ -125,7 +125,7 @@ def get_text_event(last_note) -> GameLine:
|
|
125
125
|
if lines_match(line.text, remove_html_and_cloze_tags(sentence)):
|
126
126
|
return line
|
127
127
|
|
128
|
-
logger.
|
128
|
+
logger.info("Could not find matching sentence from GSM's history. Using the latest line.")
|
129
129
|
return lines[-1]
|
130
130
|
|
131
131
|
|
@@ -0,0 +1,82 @@
|
|
1
|
+
import os
|
2
|
+
import tempfile
|
3
|
+
import time
|
4
|
+
|
5
|
+
from groq import Groq
|
6
|
+
|
7
|
+
# Assuming these are available from GameSentenceMiner
|
8
|
+
from GameSentenceMiner import configuration, ffmpeg
|
9
|
+
from GameSentenceMiner.configuration import get_config, logger, GROQ # Import specific functions/objects
|
10
|
+
from GameSentenceMiner.vad.result import VADResult
|
11
|
+
from GameSentenceMiner.vad.vad_utils import get_audio_length
|
12
|
+
|
13
|
+
# Initialize Groq Client
|
14
|
+
client = Groq(api_key=get_config().ai.groq_api_key)
|
15
|
+
|
16
|
+
def detect_voice_with_groq(input_audio_path):
|
17
|
+
"""
|
18
|
+
Detects voice activity and extracts speech timestamps using the Groq Whisper API.
|
19
|
+
"""
|
20
|
+
try:
|
21
|
+
with open(input_audio_path, "rb") as file:
|
22
|
+
transcription = client.audio.transcriptions.create(
|
23
|
+
file=(os.path.basename(input_audio_path), file.read()),
|
24
|
+
model="whisper-large-v3-turbo",
|
25
|
+
response_format="verbose_json",
|
26
|
+
language=get_config().vad.language,
|
27
|
+
temperature=0.0,
|
28
|
+
timestamp_granularities=["segment"],
|
29
|
+
prompt=f"Start detecting speech from the first spoken word. If there is music or background noise, ignore it completely. Be very careful to not hallucinate on silence. If the transcription is anything but language:{get_config().vad.language}, ignore it completely. If the end of the audio seems like the start of a new sentence, ignore it completely.",
|
30
|
+
)
|
31
|
+
|
32
|
+
logger.debug(transcription)
|
33
|
+
|
34
|
+
# print(transcription)
|
35
|
+
|
36
|
+
speech_segments = transcription.segments if hasattr(transcription, 'segments') else []
|
37
|
+
# print(f"Groq speech segments: {speech_segments}")
|
38
|
+
|
39
|
+
audio_length = get_audio_length(input_audio_path)
|
40
|
+
# print(f"FFPROBE Length of input audio: {audio_length}")
|
41
|
+
|
42
|
+
return speech_segments, audio_length
|
43
|
+
except Exception as e:
|
44
|
+
logger.error(f"Error detecting voice with Groq: {e}")
|
45
|
+
return [], 0.0
|
46
|
+
|
47
|
+
def process_audio_with_groq(input_audio, output_audio, game_line):
|
48
|
+
"""
|
49
|
+
Processes an audio file by detecting voice activity using Groq Whisper API,
|
50
|
+
trimming the audio based on detected speech timestamps, and saving the trimmed audio.
|
51
|
+
"""
|
52
|
+
start = time.time()
|
53
|
+
voice_activity, audio_length = detect_voice_with_groq(input_audio)
|
54
|
+
logger.info(f"Processing time for Groq: {time.time() - start:.2f} seconds")
|
55
|
+
|
56
|
+
if not voice_activity:
|
57
|
+
logger.info(f"No voice activity detected in {input_audio}")
|
58
|
+
return VADResult(False, 0, 0, GROQ)
|
59
|
+
|
60
|
+
start_time = voice_activity[0]['start']
|
61
|
+
end_time = voice_activity[-1]['end']
|
62
|
+
|
63
|
+
# Logic to potentially use the second-to-last timestamp if a next game line is expected
|
64
|
+
# and there's a significant pause before the very last segment.
|
65
|
+
if (game_line and hasattr(game_line, 'next') and game_line.next and
|
66
|
+
len(voice_activity) > 1 and
|
67
|
+
(voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
|
68
|
+
end_time = voice_activity[-2]['end']
|
69
|
+
logger.info("Using the second last timestamp for trimming due to game_line.next and significant pause.")
|
70
|
+
|
71
|
+
# Apply offsets from configuration, ensuring times are within valid bounds
|
72
|
+
final_start_time = max(0, start_time + get_config().vad.beginning_offset)
|
73
|
+
final_end_time = min(audio_length, end_time + get_config().audio.end_offset)
|
74
|
+
|
75
|
+
logger.debug(f"Trimming {input_audio} from {final_start_time:.2f}s to {final_end_time:.2f}s into {output_audio}")
|
76
|
+
|
77
|
+
ffmpeg.trim_audio(input_audio, final_start_time, final_end_time, output_audio)
|
78
|
+
|
79
|
+
return VADResult(True, final_start_time, final_end_time, GROQ)
|
80
|
+
|
81
|
+
# Example usage (uncomment and modify with your actual file paths for testing)
|
82
|
+
# process_audio_with_groq("tmp6x81cy27.opus", "tmp6x81cy27_trimmed_groq.opus", None)
|
GameSentenceMiner/vad/result.py
CHANGED
@@ -1,8 +1,21 @@
|
|
1
|
+
from GameSentenceMiner.configuration import get_config
|
2
|
+
|
3
|
+
|
1
4
|
class VADResult:
|
2
|
-
def __init__(self, success: bool, start: float, end: float):
|
5
|
+
def __init__(self, success: bool, start: float, end: float, model: str):
|
3
6
|
self.success = success
|
4
7
|
self.start = start
|
5
8
|
self.end = end
|
9
|
+
self.model = model
|
6
10
|
|
7
11
|
def __repr__(self):
|
8
|
-
return f"VADResult(success={self.success}, start={self.start}, end={self.end})"
|
12
|
+
return f"VADResult(success={self.success}, start={self.start}, end={self.end}, model={self.model})"
|
13
|
+
|
14
|
+
def trim_successful_string(self):
|
15
|
+
if self.success:
|
16
|
+
if get_config().vad.trim_beginning:
|
17
|
+
return f"Trimmed audio from {self.start:.2f} to {self.end:.2f} seconds using {self.model}."
|
18
|
+
else:
|
19
|
+
return f"Trimmed end of audio to {self.end:.2f} seconds using {self.model}."
|
20
|
+
else:
|
21
|
+
return f"Failed to trim audio using {self.model}."
|
@@ -5,6 +5,7 @@ from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
|
|
5
5
|
from GameSentenceMiner import configuration, ffmpeg
|
6
6
|
from GameSentenceMiner.configuration import *
|
7
7
|
from GameSentenceMiner.vad.result import VADResult
|
8
|
+
from GameSentenceMiner.vad.vad_utils import get_audio_length
|
8
9
|
|
9
10
|
# Silero VAD setup
|
10
11
|
vad_model = load_silero_vad()
|
@@ -17,32 +18,35 @@ def detect_voice_with_silero(input_audio):
|
|
17
18
|
ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
|
18
19
|
|
19
20
|
# Load the audio and detect speech timestamps
|
20
|
-
wav = read_audio(temp_wav
|
21
|
+
wav = read_audio(temp_wav)
|
21
22
|
speech_timestamps = get_speech_timestamps(wav, vad_model, return_seconds=True)
|
22
23
|
|
23
24
|
logger.debug(speech_timestamps)
|
24
25
|
|
25
26
|
# Return the speech timestamps (start and end in seconds)
|
26
|
-
return speech_timestamps
|
27
|
+
return speech_timestamps, len(wav) / 16000
|
27
28
|
|
28
29
|
|
29
30
|
# Example usage of Silero with trimming
|
30
31
|
def process_audio_with_silero(input_audio, output_audio, game_line):
|
31
|
-
voice_activity = detect_voice_with_silero(input_audio)
|
32
|
+
voice_activity, audio_length = detect_voice_with_silero(input_audio)
|
32
33
|
|
33
34
|
if not voice_activity:
|
34
|
-
return VADResult(False, 0, 0)
|
35
|
+
return VADResult(False, 0, 0, SILERO)
|
35
36
|
|
36
37
|
# Trim based on the first and last speech detected
|
37
38
|
start_time = voice_activity[0]['start'] if voice_activity else 0
|
38
|
-
if
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
logger.info("Using the second last timestamp for trimming")
|
39
|
+
if game_line and game_line.next and len(voice_activity) > 1 and 0 > audio_length - voice_activity[-1]['start'] + get_config().audio.beginning_offset:
|
40
|
+
# and (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
|
41
|
+
end_time = voice_activity[-2]['end']
|
42
|
+
logger.info("Using the second last timestamp for trimming")
|
43
43
|
else:
|
44
44
|
end_time = voice_activity[-1]['end'] if voice_activity else 0
|
45
45
|
|
46
46
|
# Trim the audio using FFmpeg
|
47
47
|
ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio)
|
48
|
-
return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset)
|
48
|
+
return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, SILERO)
|
49
|
+
|
50
|
+
|
51
|
+
# process_audio_with_silero("tmp6x81cy27.opus", "tmp6x81cy27_trimmed.opus", None)
|
52
|
+
# print(detect_voice_with_silero("tmp6x81cy27.opus"))
|
@@ -128,7 +128,7 @@ def process_audio_with_vosk(input_audio, output_audio, game_line):
|
|
128
128
|
|
129
129
|
if not voice_activity:
|
130
130
|
logger.info("No voice activity detected in the audio.")
|
131
|
-
return VADResult(False, 0, 0)
|
131
|
+
return VADResult(False, 0, 0, VOSK)
|
132
132
|
|
133
133
|
# Trim based on the first and last speech detected
|
134
134
|
start_time = voice_activity[0]['start'] if voice_activity else 0
|
@@ -148,7 +148,7 @@ def process_audio_with_vosk(input_audio, output_audio, game_line):
|
|
148
148
|
|
149
149
|
# Trim the audio using FFmpeg
|
150
150
|
ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio)
|
151
|
-
return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset)
|
151
|
+
return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, VOSK)
|
152
152
|
|
153
153
|
|
154
154
|
def get_vosk_model():
|