GameSentenceMiner 2.8.54__py3-none-any.whl → 2.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
GameSentenceMiner/gsm.py CHANGED
@@ -1,4 +1,5 @@
 import asyncio
+import shutil
 import sys
 
 from GameSentenceMiner.vad.result import VADResult
@@ -59,39 +60,28 @@ root = None
 
 
 class VideoToAudioHandler(FileSystemEventHandler):
+    def __init__(self):
+        super().__init__()
+
+
     def on_created(self, event):
         if event.is_directory or ("Replay" not in event.src_path and "GSM" not in event.src_path):
             return
         if event.src_path.endswith(".mkv") or event.src_path.endswith(".mp4"): # Adjust based on your OBS output format
             logger.info(f"MKV {event.src_path} FOUND, RUNNING LOGIC")
             wait_for_stable_file(event.src_path)
-            self.convert_to_audio(event.src_path)
+            self.process_replay(event.src_path)
 
-    @staticmethod
-    def convert_to_audio(video_path):
+    def process_replay(self, video_path):
         vad_trimmed_audio = ''
-        try:
-            if texthooking_page.event_manager.line_for_audio:
-                line: GameLine = texthooking_page.event_manager.line_for_audio
-                texthooking_page.event_manager.line_for_audio = None
-                if get_config().advanced.audio_player_path:
-                    audio = VideoToAudioHandler.get_audio(line, line.next.time if line.next else None, video_path, temporary=True)
-                    play_audio_in_external(audio)
-                    os.remove(video_path)
-                elif get_config().advanced.video_player_path:
-                    play_video_in_external(line, video_path)
-                return
-            if texthooking_page.event_manager.line_for_screenshot:
-                line: GameLine = texthooking_page.event_manager.line_for_screenshot
-                texthooking_page.event_manager.line_for_screenshot = None
-                screenshot = ffmpeg.get_screenshot_for_line(video_path, line, True)
-                os.startfile(screenshot)
-                os.remove(video_path)
-                return
-        except Exception as e:
-            logger.error(f"Error Playing Audio/Video: {e}")
-            logger.debug(f"Error Playing Audio/Video: {e}", exc_info=True)
+        print(video_path)
+        if "previous.mkv" in video_path:
             os.remove(video_path)
+            video_path = gsm_state.previous_replay
+        else:
+            gsm_state.previous_replay = video_path
+        if gsm_state.line_for_audio or gsm_state.line_for_screenshot:
+            self.handle_texthooker_button(video_path)
             return
         try:
             if anki.card_queue and len(anki.card_queue) > 0:
@@ -144,10 +134,11 @@ class VideoToAudioHandler(FileSystemEventHandler):
                     start_line,
                     line_cutoff,
                     video_path,
-                    anki_card_creation_time)
+                    anki_card_creation_time,
+                    mined_line=mined_line)
         else:
             final_audio_output = ""
-            vad_result = VADResult(False, 0, 0)
+            vad_result = VADResult(False, 0, 0, '')
             vad_trimmed_audio = ""
             if not get_config().audio.enabled:
                 logger.info("Audio is disabled in config, skipping audio processing!")
@@ -183,9 +174,46 @@ class VideoToAudioHandler(FileSystemEventHandler):
         if vad_trimmed_audio and get_config().paths.remove_audio and os.path.exists(vad_trimmed_audio):
             os.remove(vad_trimmed_audio) # Optionally remove the screenshot after conversion
 
+    def handle_texthooker_button(self, video_path):
+        try:
+            if gsm_state.line_for_audio:
+                line: GameLine = gsm_state.line_for_audio
+                gsm_state.line_for_audio = None
+                if line == gsm_state.previous_line_for_audio:
+                    logger.info("Line is the same as the last one, skipping processing.")
+                    if get_config().advanced.audio_player_path:
+                        play_audio_in_external(gsm_state.previous_audio)
+                    elif get_config().advanced.video_player_path:
+                        play_video_in_external(line, gsm_state.previous_audio)
+                    return
+                gsm_state.previous_line_for_audio = line
+                if get_config().advanced.audio_player_path:
+                    audio = VideoToAudioHandler.get_audio(line, line.next.time if line.next else None, video_path,
+                                                          temporary=True)
+                    play_audio_in_external(audio)
+                    gsm_state.previous_audio = audio
+                elif get_config().advanced.video_player_path:
+                    new_video_path = play_video_in_external(line, video_path)
+                    gsm_state.previous_audio = new_video_path
+                    gsm_state.previous_replay = new_video_path
+                return
+            if gsm_state.line_for_screenshot:
+                line: GameLine = gsm_state.line_for_screenshot
+                gsm_state.line_for_screenshot = None
+                gsm_state.previous_line_for_screenshot = line
+                screenshot = ffmpeg.get_screenshot_for_line(video_path, line, True)
+                os.startfile(screenshot)
+                return
+        except Exception as e:
+            logger.error(f"Error Playing Audio/Video: {e}")
+            logger.debug(f"Error Playing Audio/Video: {e}", exc_info=True)
+            return
+        finally:
+            if video_path and get_config().paths.remove_video and os.path.exists(video_path):
+                os.remove(video_path)
 
     @staticmethod
-    def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False, timing_only=False):
+    def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False, timing_only=False, mined_line=None):
         logger.info("Getting audio from video...")
         trimmed_audio = get_audio_and_trim(video_path, game_line, next_line_time, anki_card_creation_time)
         if temporary:
@@ -194,13 +222,12 @@ class VideoToAudioHandler(FileSystemEventHandler):
             f"{os.path.abspath(configuration.get_temporary_directory())}/{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}")
         final_audio_output = make_unique_file_name(os.path.join(get_config().paths.audio_destination,
                                                                 f"{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}"))
-        result = VADResult(False, 0, 0)
+        result = VADResult(False, 0, 0, "")
         if get_config().vad.do_vad_postprocessing:
-            logger.info("Trimming audio with Voice Detection...")
-            result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio, vad_trimmed_audio, game_line=game_line)
+            result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio, vad_trimmed_audio, game_line=mined_line)
             if not result.success:
                 result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio,
-                                           vad_trimmed_audio, game_line=game_line)
+                                           vad_trimmed_audio, game_line=mined_line)
             if not result.success:
                 if get_config().vad.add_audio_on_no_results:
                     logger.info("No voice activity detected, using full audio.")
@@ -208,6 +235,8 @@ class VideoToAudioHandler(FileSystemEventHandler):
                 else:
                     logger.info("No voice activity detected.")
                     return None, result, None
+            else:
+                logger.info(result.trim_successful_string())
         if timing_only:
             return result
         if get_config().audio.ffmpeg_reencode_options and os.path.exists(vad_trimmed_audio):
@@ -222,6 +251,9 @@ def do_vad_processing(model, trimmed_audio, vad_trimmed_audio, game_line=None, s
     match model:
         case configuration.OFF:
             pass
+        case configuration.GROQ:
+            from GameSentenceMiner.vad import groq_trim
+            return groq_trim.process_audio_with_groq(trimmed_audio, vad_trimmed_audio, game_line)
         case configuration.SILERO:
             from GameSentenceMiner.vad import silero_trim
             return silero_trim.process_audio_with_silero(trimmed_audio, vad_trimmed_audio, game_line)
@@ -238,7 +270,7 @@ def play_audio_in_external(filepath):
 
     filepath = os.path.normpath(filepath)
 
-    command = [exe, filepath]
+    command = [exe, "--no-video", filepath]
 
     try:
         subprocess.Popen(command)
@@ -247,10 +279,13 @@ def play_audio_in_external(filepath):
         print(f"An error occurred: {e}")
 
 def play_video_in_external(line, filepath):
-    def remove_video_when_closed(p, fp):
+    def move_video_when_closed(p, fp):
         p.wait()
         os.remove(fp)
 
+    shutil.move(filepath, get_temporary_directory())
+    new_filepath = os.path.join(get_temporary_directory(), os.path.basename(filepath))
+
     command = [get_config().advanced.video_player_path]
 
     start, _, _ = get_video_timings(filepath, line)
@@ -264,14 +299,17 @@ def play_video_in_external(line, filepath):
 
     logger.info(" ".join(command))
 
+
+
     try:
         proc = subprocess.Popen(command)
         print(f"Opened {filepath} in {get_config().advanced.video_player_path}.")
-        threading.Thread(target=remove_video_when_closed, args=(proc, filepath)).start()
+        threading.Thread(target=move_video_when_closed, args=(proc, filepath)).start()
     except FileNotFoundError:
         print("VLC not found. Make sure it's installed and in your PATH.")
     except Exception as e:
         print(f"An error occurred: {e}")
+    return new_filepath
 
 def convert_to_vlc_seconds(time_str):
     """Converts HH:MM:SS.milliseconds to VLC-compatible seconds."""
@@ -325,21 +363,25 @@ def get_screenshot():
         logger.error(f"Failed to get Screenshot: {e}")
 
 
-def create_image():
-    """Create a simple pickaxe icon."""
-    width, height = 64, 64
-    image = Image.new("RGBA", (width, height), (0, 0, 0, 0))  # Transparent background
-    draw = ImageDraw.Draw(image)
-
-    # Handle (rectangle)
-    handle_color = (139, 69, 19)  # Brown color
-    draw.rectangle([(30, 15), (34, 50)], fill=handle_color)
+# def create_image():
+#     """Create a simple pickaxe icon."""
+#     width, height = 64, 64
+#     image = Image.new("RGBA", (width, height), (0, 0, 0, 0))  # Transparent background
+#     draw = ImageDraw.Draw(image)
+#
+#     # Handle (rectangle)
+#     handle_color = (139, 69, 19)  # Brown color
+#     draw.rectangle([(30, 15), (34, 50)], fill=handle_color)
+#
+#     # Blade (triangle-like shape)
+#     blade_color = (192, 192, 192)  # Silver color
+#     draw.polygon([(15, 15), (49, 15), (32, 5)], fill=blade_color)
+#
+#     return image
 
-    # Blade (triangle-like shape)
-    blade_color = (192, 192, 192)  # Silver color
-    draw.polygon([(15, 15), (49, 15), (32, 5)], fill=blade_color)
-
-    return image
+def create_image():
+    image_path = os.path.join(os.path.dirname(__file__), "assets", "pickaxe.png")
+    return Image.open(image_path)
 
 
 def open_settings():
@@ -350,7 +392,7 @@ def open_settings():
 def play_most_recent_audio():
     if get_config().advanced.audio_player_path or get_config().advanced.video_player_path and len(
             get_all_lines()) > 0:
-        texthooking_page.event_manager.line_for_audio = get_all_lines()[-1]
+        gsm_state.line_for_audio = get_all_lines()[-1]
         obs.save_replay_buffer()
     else:
         logger.error("Feature Disabled. No audio or video player path set in config!")
@@ -404,7 +446,7 @@ def update_icon(profile=None):
     )
 
     menu = Menu(
-        MenuItem("Open Settings", open_settings),
+        MenuItem("Open Settings", open_settings, default=True),
         MenuItem("Open Multi-Mine GUI", open_multimine),
         MenuItem("Open Log", open_log),
         MenuItem("Toggle Replay Buffer", play_pause),
@@ -441,7 +483,7 @@ def run_tray():
     )
 
     menu = Menu(
-        MenuItem("Open Settings", open_settings),
+        MenuItem("Open Settings", open_settings, default=True),
         MenuItem("Open Texthooker", texthooking_page.open_texthooker),
         MenuItem("Open Log", open_log),
         MenuItem("Toggle Replay Buffer", play_pause),
@@ -450,7 +492,7 @@ def run_tray():
         MenuItem("Exit", exit_program)
     )
 
-    icon = Icon("TrayApp", create_image(), "Game Sentence Miner", menu)
+    icon = Icon("TrayApp", create_image(), "GameSentenceMiner", menu)
     icon.run()
 
 
@@ -574,6 +616,18 @@ def handle_websocket_message(message: Message):
             close_obs()
         case FunctionName.START_OBS:
             obs.start_obs()
+        case FunctionName.OPEN_SETTINGS:
+            open_settings()
+        case FunctionName.OPEN_TEXTHOOKER:
+            texthooking_page.open_texthooker()
+        case FunctionName.OPEN_LOG:
+            open_log()
+        case FunctionName.TOGGLE_REPLAY_BUFFER:
+            play_pause(None, None)
+        case FunctionName.RESTART_OBS:
+            restart_obs()
+        case FunctionName.EXIT:
+            exit_program(None, None)
         case _:
             logger.debug(f"unknown message from electron websocket: {message.to_json()}")
 
@@ -626,7 +680,6 @@ async def register_scene_switcher_callback():
         settings_window.reload_settings()
         update_icon()
 
-    logger.info("Registering scene switcher callback")
     await obs.register_scene_change_callback(scene_switcher_callback)
 
 async def main(reloading=False):
@@ -654,8 +707,8 @@ async def main(reloading=False):
 
 
     try:
-        # if get_config().general.open_config_on_startup:
-        #     root.after(0, settings_window.show)
+        if get_config().general.open_config_on_startup:
+            root.after(50, settings_window.show)
         settings_window.add_save_hook(update_icon)
         settings_window.on_exit = exit_program
         root.mainloop()
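
Taken together, the gsm.py changes move the texthooker "play audio / screenshot" flags off texthooking_page.event_manager and onto the shared gsm_state object, and split replay handling out of the old convert_to_audio. A rough sketch of the resulting flow, inferred from the hunks above (the step ordering is an interpretation, not code shipped in the package):

    # 1. A texthooker/tray action flags the line and asks OBS for a replay:
    gsm_state.line_for_audio = get_all_lines()[-1]
    obs.save_replay_buffer()

    # 2. OBS writes the replay file; the watchdog handler fires:
    #    VideoToAudioHandler.on_created() -> process_replay(video_path)

    # 3. process_replay() sees the flag and routes to handle_texthooker_button(),
    #    which extracts and plays the clip, then caches it in
    #    gsm_state.previous_audio so pressing the button again on the same
    #    line replays the cached file instead of re-trimming.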
GameSentenceMiner/obs.py CHANGED
@@ -99,12 +99,12 @@ async def check_obs_folder_is_correct():
     obs_record_directory = get_record_directory()
     if obs_record_directory and os.path.normpath(obs_record_directory) != os.path.normpath(
             get_config().paths.folder_to_watch):
-        logger.info("OBS Path Setting wrong, OBS Recording folder in GSM Config")
+        logger.info("OBS Path wrong, Setting OBS Recording folder in GSM Config...")
         get_config().paths.folder_to_watch = os.path.normpath(obs_record_directory)
         get_master_config().sync_shared_fields()
         save_full_config(get_master_config())
     else:
-        logger.info("OBS Recording path looks correct")
+        logger.debug("OBS Recording path looks correct")
 
 
 def get_obs_websocket_config_values():
@@ -194,8 +194,8 @@ class WebsocketServerThread(threading.Thread):
         self._stop_event = stop_event = asyncio.Event()
         self._event.set()
         self.server = start_server = websockets.serve(self.server_handler,
-                                                      get_config().general.websocket_uri.split(":")[0],
-                                                      get_config().general.websocket_uri.split(":")[1],
+                                                      "0.0.0.0",
+                                                      get_config().advanced.ocr_websocket_port,
                                                       max_size=1000000000)
         async with start_server:
             await stop_event.wait()
@@ -313,20 +313,15 @@ def text_callback(text, orig_text, time, img=None, came_from_ss=False, filtering
     done = False
 
 
-def run_oneocr(ocr_config: OCRConfig, area=False):
+def run_oneocr(ocr_config: OCRConfig, rectangles):
     global done
     print("Running OneOCR")
     screen_area = None
     screen_areas = []
+    exclusions = []
     if not ssonly:
-        for rect_config in ocr_config.rectangles:
-            if not rect_config.is_excluded:
-                coords = rect_config.coordinates
-                monitor_config = rect_config.monitor
-                screen_area = ",".join(str(c) for c in coords) if area else None
-                if screen_area:
-                    screen_areas.append(screen_area)
-        exclusions = list(rect.coordinates for rect in list(filter(lambda x: x.is_excluded, ocr_config.rectangles)))
+        screen_areas = [",".join(str(c) for c in rect_config.coordinates) for rect_config in rectangles if not rect_config.is_excluded]
+        exclusions = list(rect.coordinates for rect in list(filter(lambda x: x.is_excluded, rectangles)))
 
     run.init_config(False)
     run.run(read_from="screencapture" if not ssonly else "clipboard",
@@ -334,13 +329,13 @@ def run_oneocr(ocr_config: OCRConfig, area=False):
             write_to="callback",
             screen_capture_area=screen_area,
             # screen_capture_monitor=monitor_config['index'],
-            screen_capture_window=ocr_config.window,
+            screen_capture_window=ocr_config.window if ocr_config and ocr_config.window else None,
             screen_capture_only_active_windows=get_requires_open_window(),
             screen_capture_delay_secs=get_ocr_scan_rate(), engine=ocr1,
             text_callback=text_callback,
             screen_capture_exclusions=exclusions,
             language=language,
-            monitor_index=ocr_config.window,
+            monitor_index=None,
             ocr1=ocr1,
             ocr2=ocr2,
             gsm_ocr_config=ocr_config,
@@ -380,7 +375,7 @@ if __name__ == "__main__":
     import sys
 
     args = sys.argv[1:]
-    if len(args) == 4:
+    if len(args) >= 4:
         language = args[0]
         ocr1 = args[1]
         ocr2 = args[2]
@@ -418,22 +413,19 @@ if __name__ == "__main__":
         else:
             logger.error(f"Window '{ocr_config.window}' not found within 30 seconds.")
             sys.exit(1)
-    logger.info(f"Starting OCR with configuration: Window: {ocr_config.window}, Rectangles: {ocr_config.rectangles}, Engine 1: {ocr1}, Engine 2: {ocr2}, Two-pass OCR: {twopassocr}")
-    if ocr_config:
-        rectangles = list(filter(lambda rect: not rect.is_excluded, ocr_config.rectangles))
+    logger.info(f"Starting OCR with configuration: Window: {ocr_config.window}, Rectangles: {ocr_config.rectangles}, Engine 1: {ocr1}, Engine 2: {ocr2}, Two-pass OCR: {twopassocr}")
+    if ocr_config or ssonly:
+        rectangles = ocr_config.rectangles if ocr_config and ocr_config.rectangles else []
         oneocr_threads = []
-        single_ocr_thread = threading.Thread(target=run_oneocr, args=(ocr_config,ocr_config.rectangles ), daemon=True)
-        oneocr_threads.append(single_ocr_thread)
-        single_ocr_thread.start()
-        websocket_server_thread = WebsocketServerThread(read=True)
-        websocket_server_thread.start()
+        ocr_thread = threading.Thread(target=run_oneocr, args=(ocr_config,rectangles ), daemon=True)
+        ocr_thread.start()
+        if not ssonly:
+            websocket_server_thread = WebsocketServerThread(read=True)
+            websocket_server_thread.start()
         try:
             while not done:
                 time.sleep(1)
         except KeyboardInterrupt as e:
             pass
-        for thread in oneocr_threads:
-            thread.join()
-        # asyncio.run(websocket_client())
     else:
         print("Failed to load OCR configuration. Please check the logs.")
@@ -1043,7 +1043,7 @@ class GeminiOCR:
                     }
                 },
                 {
-                    'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
+                    'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
                 }
             ]
         }
@@ -1096,13 +1096,14 @@ class GroqOCR:
             return (False, 'Error processing image for Groq.')
 
         prompt = (
-            "Analyze this image and extract text from it"
+            "Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary."
+            # "Analyze this i#mage and extract text from it"
             # "(speech bubbles or panels containing character dialogue). From the extracted dialogue text, "
             # "filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, "
            # "including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. "
             # "If no text is found within dialogue boxes after applying filters, return an empty string. "
             # "OR, if there are no text bubbles or dialogue boxes found, return everything."
-            "Do not include any other output, formatting markers, or commentary, only the text from the image."
+            # "Do not include any other output, formatting markers, or commentary, only the text from the image."
         )
 
         response = self.client.chat.completions.create(
@@ -125,7 +125,7 @@ def get_text_event(last_note) -> GameLine:
         if lines_match(line.text, remove_html_and_cloze_tags(sentence)):
             return line
 
-    logger.debug("Couldn't find a match in history, using last event")
+    logger.info("Could not find matching sentence from GSM's history. Using the latest line.")
     return lines[-1]
 
 
GameSentenceMiner/vad/groq_trim.py ADDED
@@ -0,0 +1,82 @@
+import os
+import tempfile
+import time
+
+from groq import Groq
+
+# Assuming these are available from GameSentenceMiner
+from GameSentenceMiner import configuration, ffmpeg
+from GameSentenceMiner.configuration import get_config, logger, GROQ  # Import specific functions/objects
+from GameSentenceMiner.vad.result import VADResult
+from GameSentenceMiner.vad.vad_utils import get_audio_length
+
+# Initialize Groq Client
+client = Groq(api_key=get_config().ai.groq_api_key)
+
+def detect_voice_with_groq(input_audio_path):
+    """
+    Detects voice activity and extracts speech timestamps using the Groq Whisper API.
+    """
+    try:
+        with open(input_audio_path, "rb") as file:
+            transcription = client.audio.transcriptions.create(
+                file=(os.path.basename(input_audio_path), file.read()),
+                model="whisper-large-v3-turbo",
+                response_format="verbose_json",
+                language=get_config().vad.language,
+                temperature=0.0,
+                timestamp_granularities=["segment"],
+                prompt=f"Start detecting speech from the first spoken word. If there is music or background noise, ignore it completely. Be very careful to not hallucinate on silence. If the transcription is anything but language:{get_config().vad.language}, ignore it completely. If the end of the audio seems like the start of a new sentence, ignore it completely.",
+            )
+
+        logger.debug(transcription)
+
+        # print(transcription)
+
+        speech_segments = transcription.segments if hasattr(transcription, 'segments') else []
+        # print(f"Groq speech segments: {speech_segments}")
+
+        audio_length = get_audio_length(input_audio_path)
+        # print(f"FFPROBE Length of input audio: {audio_length}")
+
+        return speech_segments, audio_length
+    except Exception as e:
+        logger.error(f"Error detecting voice with Groq: {e}")
+        return [], 0.0
+
+def process_audio_with_groq(input_audio, output_audio, game_line):
+    """
+    Processes an audio file by detecting voice activity using Groq Whisper API,
+    trimming the audio based on detected speech timestamps, and saving the trimmed audio.
+    """
+    start = time.time()
+    voice_activity, audio_length = detect_voice_with_groq(input_audio)
+    logger.info(f"Processing time for Groq: {time.time() - start:.2f} seconds")
+
+    if not voice_activity:
+        logger.info(f"No voice activity detected in {input_audio}")
+        return VADResult(False, 0, 0, GROQ)
+
+    start_time = voice_activity[0]['start']
+    end_time = voice_activity[-1]['end']
+
+    # Logic to potentially use the second-to-last timestamp if a next game line is expected
+    # and there's a significant pause before the very last segment.
+    if (game_line and hasattr(game_line, 'next') and game_line.next and
+            len(voice_activity) > 1 and
+            (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
+        end_time = voice_activity[-2]['end']
+        logger.info("Using the second last timestamp for trimming due to game_line.next and significant pause.")
+
+    # Apply offsets from configuration, ensuring times are within valid bounds
+    final_start_time = max(0, start_time + get_config().vad.beginning_offset)
+    final_end_time = min(audio_length, end_time + get_config().audio.end_offset)
+
+    logger.debug(f"Trimming {input_audio} from {final_start_time:.2f}s to {final_end_time:.2f}s into {output_audio}")
+
+    ffmpeg.trim_audio(input_audio, final_start_time, final_end_time, output_audio)
+
+    return VADResult(True, final_start_time, final_end_time, GROQ)
+
+# Example usage (uncomment and modify with your actual file paths for testing)
+# process_audio_with_groq("tmp6x81cy27.opus", "tmp6x81cy27_trimmed_groq.opus", None)
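
For context on the new Groq-based VAD path: process_audio_with_groq() treats each entry of transcription.segments as a mapping with 'start' and 'end' times in seconds. A minimal sketch of the shape the trimming logic assumes (values below are illustrative, not from the package):

    # Hypothetical verbose_json segments as consumed by the trimming logic:
    voice_activity = [
        {"start": 0.42, "end": 2.10},
        {"start": 2.65, "end": 4.98},
    ]
    start_time = voice_activity[0]["start"]   # trim start: first detected speech
    end_time = voice_activity[-1]["end"]      # trim end: last detected speech,
    # unless a gap of more than 3 seconds precedes the final segment and another
    # game line follows, in which case the second-to-last 'end' is used instead.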
GameSentenceMiner/vad/result.py CHANGED
@@ -1,8 +1,21 @@
+from GameSentenceMiner.configuration import get_config
+
+
 class VADResult:
-    def __init__(self, success: bool, start: float, end: float):
+    def __init__(self, success: bool, start: float, end: float, model: str):
         self.success = success
         self.start = start
         self.end = end
+        self.model = model
 
     def __repr__(self):
-        return f"VADResult(success={self.success}, start={self.start}, end={self.end})"
+        return f"VADResult(success={self.success}, start={self.start}, end={self.end}, model={self.model})"
+
+    def trim_successful_string(self):
+        if self.success:
+            if get_config().vad.trim_beginning:
+                return f"Trimmed audio from {self.start:.2f} to {self.end:.2f} seconds using {self.model}."
+            else:
+                return f"Trimmed end of audio to {self.end:.2f} seconds using {self.model}."
+        else:
+            return f"Failed to trim audio using {self.model}."
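
A minimal sketch of how callers consume the expanded VADResult (the model string and times are illustrative; trim_successful_string() reads the vad.trim_beginning setting, so it needs a loaded config):

    result = VADResult(True, 1.25, 4.80, "Silero")
    logger.info(result.trim_successful_string())
    # success + trim_beginning -> "Trimmed audio from 1.25 to 4.80 seconds using Silero."
    # success, end-only trim   -> "Trimmed end of audio to 4.80 seconds using Silero."
    # failure                  -> "Failed to trim audio using Silero."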
GameSentenceMiner/vad/silero_trim.py CHANGED
@@ -5,6 +5,7 @@ from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
 from GameSentenceMiner import configuration, ffmpeg
 from GameSentenceMiner.configuration import *
 from GameSentenceMiner.vad.result import VADResult
+from GameSentenceMiner.vad.vad_utils import get_audio_length
 
 # Silero VAD setup
 vad_model = load_silero_vad()
@@ -17,32 +18,35 @@ def detect_voice_with_silero(input_audio):
     ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
 
     # Load the audio and detect speech timestamps
-    wav = read_audio(temp_wav, sampling_rate=16000)
+    wav = read_audio(temp_wav)
     speech_timestamps = get_speech_timestamps(wav, vad_model, return_seconds=True)
 
     logger.debug(speech_timestamps)
 
     # Return the speech timestamps (start and end in seconds)
-    return speech_timestamps
+    return speech_timestamps, len(wav) / 16000
 
 
 # Example usage of Silero with trimming
 def process_audio_with_silero(input_audio, output_audio, game_line):
-    voice_activity = detect_voice_with_silero(input_audio)
+    voice_activity, audio_length = detect_voice_with_silero(input_audio)
 
     if not voice_activity:
-        return VADResult(False, 0, 0)
+        return VADResult(False, 0, 0, SILERO)
 
     # Trim based on the first and last speech detected
     start_time = voice_activity[0]['start'] if voice_activity else 0
-    if (game_line.next and len(voice_activity) > 1
-            and voice_activity[-1]['end'] - get_config().audio.beginning_offset > len(input_audio) / 16000
-            and (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
-        end_time = voice_activity[-2]['end']
-        logger.info("Using the second last timestamp for trimming")
+    if game_line and game_line.next and len(voice_activity) > 1 and 0 > get_config().audio.beginning_offset > audio_length - voice_activity[-1]['start']:
+        # and (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
+        end_time = voice_activity[-2]['end']
+        logger.info("Using the second last timestamp for trimming")
     else:
         end_time = voice_activity[-1]['end'] if voice_activity else 0
 
     # Trim the audio using FFmpeg
     ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio)
-    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset)
+    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, SILERO)
+
+
+# process_audio_with_silero("tmp6x81cy27.opus", "tmp6x81cy27_trimmed.opus", None)
+# print(detect_voice_with_silero("tmp6x81cy27.opus"))
GameSentenceMiner/vad/vad_utils.py ADDED
@@ -0,0 +1,13 @@
+import subprocess
+
+from GameSentenceMiner.ffmpeg import get_ffprobe_path
+
+
+def get_audio_length(path):
+    result = subprocess.run(
+        [get_ffprobe_path(), "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", path],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True
+    )
+    return float(result.stdout.strip())
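
get_audio_length() shells out to ffprobe and parses the printed duration; the Groq and Silero paths above use the returned value when deciding trim points. A small usage sketch (the file path is hypothetical):

    from GameSentenceMiner.vad.vad_utils import get_audio_length

    # Duration in seconds as reported by ffprobe, e.g. 12.48:
    length = get_audio_length("replay.opus")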
GameSentenceMiner/vad/vosk_helper.py CHANGED
@@ -128,7 +128,7 @@ def process_audio_with_vosk(input_audio, output_audio, game_line):
 
     if not voice_activity:
         logger.info("No voice activity detected in the audio.")
-        return VADResult(False, 0, 0)
+        return VADResult(False, 0, 0, VOSK)
 
     # Trim based on the first and last speech detected
     start_time = voice_activity[0]['start'] if voice_activity else 0
@@ -148,7 +148,7 @@ def process_audio_with_vosk(input_audio, output_audio, game_line):
 
     # Trim the audio using FFmpeg
     ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio)
-    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset)
+    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, VOSK)
 
 
 def get_vosk_model():