GameSentenceMiner 2.10.16-py3-none-any.whl → 2.11.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GameSentenceMiner/ai/ai_prompting.py +339 -42
- GameSentenceMiner/anki.py +8 -4
- GameSentenceMiner/config_gui.py +25 -17
- GameSentenceMiner/gsm.py +19 -17
- GameSentenceMiner/ocr/owocr_helper.py +9 -6
- GameSentenceMiner/owocr/owocr/ocr.py +86 -41
- GameSentenceMiner/owocr/owocr/run.py +11 -4
- GameSentenceMiner/util/configuration.py +11 -9
- GameSentenceMiner/util/text_log.py +4 -0
- GameSentenceMiner/vad.py +209 -174
- GameSentenceMiner/web/templates/index.html +21 -20
- GameSentenceMiner/web/texthooking_page.py +15 -1
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/METADATA +2 -2
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/RECORD +18 -18
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/WHEEL +0 -0
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/entry_points.txt +0 -0
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/licenses/LICENSE +0 -0
- {gamesentenceminer-2.10.16.dist-info → gamesentenceminer-2.11.0.dist-info}/top_level.txt +0 -0
GameSentenceMiner/vad.py
CHANGED
@@ -14,8 +14,8 @@ class VADSystem:
     def __init__(self):
         self.silero = None
         self.whisper = None
-        self.vosk = None
-        self.groq = None
+        # self.vosk = None
+        # self.groq = None
 
     def init(self):
         if get_config().vad.is_whisper():
@@ -24,12 +24,12 @@
         if get_config().vad.is_silero():
             if not self.silero:
                 self.silero = SileroVADProcessor()
-        if get_config().vad.is_vosk():
-            if not self.vosk:
-                self.vosk = VoskVADProcessor()
-        if get_config().vad.is_groq():
-            if not self.groq:
-                self.groq = GroqVADProcessor()
+        # if get_config().vad.is_vosk():
+        #     if not self.vosk:
+        #         self.vosk = VoskVADProcessor()
+        # if get_config().vad.is_groq():
+        #     if not self.groq:
+        #         self.groq = GroqVADProcessor()
 
     def trim_audio_with_vad(self, input_audio, output_audio, game_line):
         if get_config().vad.do_vad_postprocessing:
@@ -53,18 +53,18 @@
         match model:
             case configuration.OFF:
                 return VADResult(False, 0, 0, "OFF")
-            case configuration.GROQ:
-                if not self.groq:
-                    self.groq = GroqVADProcessor()
-                return self.groq.process_audio(input_audio, output_audio, game_line)
+            # case configuration.GROQ:
+            #     if not self.groq:
+            #         self.groq = GroqVADProcessor()
+            #     return self.groq.process_audio(input_audio, output_audio, game_line)
             case configuration.SILERO:
                 if not self.silero:
                     self.silero = SileroVADProcessor()
                 return self.silero.process_audio(input_audio, output_audio, game_line)
-            case configuration.VOSK:
-                if not self.vosk:
-                    self.vosk = VoskVADProcessor()
-                return self.vosk.process_audio(input_audio, output_audio, game_line)
+            # case configuration.VOSK:
+            #     if not self.vosk:
+            #         self.vosk = VoskVADProcessor()
+            #     return self.vosk.process_audio(input_audio, output_audio, game_line)
             case configuration.WHISPER:
                 if not self.whisper:
                     self.whisper = WhisperVADProcessor()
@@ -121,6 +121,8 @@ class VADProcessor(ABC):
             logger.info("No voice activity detected in the audio.")
             return VADResult(False, 0, 0, self.vad_system_name)
 
+        print(voice_activity)
+
         start_time = voice_activity[0]['start'] if voice_activity else 0
         end_time = voice_activity[-1]['end'] if voice_activity else 0
 
@@ -133,7 +135,7 @@
         if get_config().vad.cut_and_splice_segments:
             self.extract_audio_and_combine_segments(input_audio, voice_activity, output_audio, padding=get_config().vad.splice_padding)
         else:
-            ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio, trim_beginning=get_config().vad.trim_beginning, fade_in_duration=0, fade_out_duration=0)
+            ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio, trim_beginning=get_config().vad.trim_beginning, fade_in_duration=0.05, fade_out_duration=00)
         return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, self.vad_system_name, voice_activity, output_audio)
 
 class SileroVADProcessor(VADProcessor):
@@ -145,10 +147,10 @@ class SileroVADProcessor(VADProcessor):
 
     def _detect_voice_activity(self, input_audio):
         from silero_vad import read_audio, get_speech_timestamps
-        temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
-        ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
-        wav = read_audio(temp_wav)
-        speech_timestamps = get_speech_timestamps(wav, self.vad_model, return_seconds=True)
+        # temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
+        # ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
+        wav = read_audio(input_audio)
+        speech_timestamps = get_speech_timestamps(wav, self.vad_model, return_seconds=True, threshold=0.2)
         logger.debug(speech_timestamps)
         return speech_timestamps
 
@@ -161,7 +163,7 @@ class WhisperVADProcessor(VADProcessor):
     def load_whisper_model(self):
         import stable_whisper as whisper
         if not self.vad_model:
-            with warnings.catch_warnings(action="ignore"):
+            with warnings.catch_warnings():
                 self.vad_model = whisper.load_model(get_config().vad.whisper_model)
                 logger.info(f"Whisper model '{get_config().vad.whisper_model}' loaded.")
         return self.vad_model
@@ -169,21 +171,29 @@
     def _detect_voice_activity(self, input_audio):
        from stable_whisper import WhisperResult
         # Convert the audio to 16kHz mono WAV
-        temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
-        ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
+        # temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
+        # ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
 
         logger.info('transcribing audio...')
 
         # Transcribe the audio using Whisper
-        with warnings.catch_warnings(action="ignore"):
-            result: WhisperResult = self.vad_model.transcribe(temp_wav, vad=True, language=get_config().vad.language,
+        with warnings.catch_warnings():
+            result: WhisperResult = self.vad_model.transcribe(input_audio, vad=True, language=get_config().vad.language,
                                                               temperature=0.0)
         voice_activity = []
 
         logger.debug(result.to_dict())
 
         # Process the segments to extract tokens, timestamps, and confidence
-        for segment in result.segments:
+        for i, segment in enumerate(result.segments):
+            if len(segment.text) == 1 and (i > 1 and segment.start - result.segments[i - 1].end > 1.0) or (i < len(result.segments) - 1 and result.segments[i + 1].start - segment.end > 1.0):
+                if segment.text in ['えー', 'ん']:
+                    logger.debug(f"Skipping filler segment: {segment.text} at {segment.start}-{segment.end}")
+                    continue
+                else:
+                    logger.info(
+                        "Unknown single character segment, not skipping, but logging, please report if this is a mistake: " + segment.text)
+
             logger.debug(segment.to_dict())
             voice_activity.append({
                 'text': segment.text,
@@ -216,154 +226,179 @@ class WhisperVADProcessor(VADProcessor):
         return voice_activity
 
 # Add a new class for Vosk-based VAD
-class VoskVADProcessor(VADProcessor):
-    ...
-
-class GroqVADProcessor(VADProcessor):
-    ...
-            return [], 0.0
+# class VoskVADProcessor(VADProcessor):
+#     def __init__(self):
+#         super().__init__()
+#         self.vad_model = self._load_vosk_model()
+#         self.vad_system_name = VOSK
+#
+#     def _load_vosk_model(self):
+#         if not self.vad_model:
+#             import vosk
+#             vosk_model_path = self._download_and_cache_vosk_model()
+#             self.vad_model = vosk.Model(vosk_model_path)
+#             logger.info(f"Vosk model loaded from {vosk_model_path}")
+#         return self.vad_model
+#
+#     def _download_and_cache_vosk_model(self, model_dir="vosk_model_cache"):
+#         # Ensure the cache directory exists
+#         import requests
+#         import zipfile
+#         import tarfile
+#         if not os.path.exists(os.path.join(get_app_directory(), model_dir)):
+#             os.makedirs(os.path.join(get_app_directory(), model_dir))
+#
+#         # Extract the model name from the URL
+#         model_filename = get_config().vad.vosk_url.split("/")[-1]
+#         model_path = os.path.join(get_app_directory(), model_dir, model_filename)
+#
+#         # If the model is already downloaded, skip the download
+#         if not os.path.exists(model_path):
+#             logger.info(
+#                 f"Downloading the Vosk model from {get_config().vad.vosk_url}... This will take a while if using large model, ~1G")
+#             response = requests.get(get_config().vad.vosk_url, stream=True)
+#             with open(model_path, "wb") as file:
+#                 for chunk in response.iter_content(chunk_size=8192):
+#                     if chunk:
+#                         file.write(chunk)
+#             logger.info("Download complete.")
+#
+#         # Extract the model if it's a zip or tar file
+#         model_extract_path = os.path.join(get_app_directory(), model_dir, "vosk_model")
+#         if not os.path.exists(model_extract_path):
+#             logger.info("Extracting the Vosk model...")
+#             if model_filename.endswith(".zip"):
+#                 with zipfile.ZipFile(model_path, "r") as zip_ref:
+#                     zip_ref.extractall(model_extract_path)
+#             elif model_filename.endswith(".tar.gz"):
+#                 with tarfile.open(model_path, "r:gz") as tar_ref:
+#                     tar_ref.extractall(model_extract_path)
+#             else:
+#                 logger.info("Unknown archive format. Model extraction skipped.")
+#             logger.info(f"Model extracted to {model_extract_path}.")
+#         else:
+#             logger.info(f"Model already extracted at {model_extract_path}.")
+#
+#         # Return the path to the actual model folder inside the extraction directory
+#         extracted_folders = os.listdir(model_extract_path)
+#         if extracted_folders:
+#             actual_model_folder = os.path.join(model_extract_path,
+#                                                extracted_folders[0])  # Assuming the first folder is the model
+#             return actual_model_folder
+#         else:
+#             return model_extract_path  # In case there's no subfolder, return the extraction path directly
+#
+#     def _detect_voice_activity(self, input_audio):
+#         import soundfile as sf
+#         import vosk
+#         import numpy as np
+#         # Convert the audio to 16kHz mono WAV
+#         temp_wav = tempfile.NamedTemporaryFile(dir=configuration.get_temporary_directory(), suffix='.wav').name
+#         ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
+#
+#         # Initialize recognizer
+#         with sf.SoundFile(temp_wav) as audio_file:
+#             recognizer = vosk.KaldiRecognizer(self.vad_model, audio_file.samplerate)
+#             voice_activity = []
+#
+#             recognizer.SetWords(True)
+#
+#             # Process audio in chunks
+#             while True:
+#                 data = audio_file.buffer_read(4000, dtype='int16')
+#                 if len(data) == 0:
+#                     break
+#
+#                 # Convert buffer to bytes using NumPy
+#                 data_bytes = np.frombuffer(data, dtype='int16').tobytes()
+#
+#                 if recognizer.AcceptWaveform(data_bytes):
+#                     pass
+#
+#             final_result = json.loads(recognizer.FinalResult())
+#             if 'result' in final_result:
+#                 for word in final_result['result']:
+#                     if word['conf'] >= 0.90:
+#                         voice_activity.append({
+#                             'text': word['word'],
+#                             'start': word['start'],
+#                             'end': word['end']
+#                         })
+#
+#         # Return the detected voice activity
+#         return voice_activity
+
+# class GroqVADProcessor(VADProcessor):
+#     def __init__(self):
+#         super().__init__()
+#         self.client = self.load_groq_model()
+#         self.vad_system_name = GROQ
+#
+#     def load_groq_model(self):
+#         if not hasattr(self, 'client') or not self.client:
+#             from groq import Groq
+#             client = Groq(api_key=get_config().ai.groq_api_key)
+#             logger.info("Groq model loaded.")
+#             return client
+#         return self.client
+#
+#     def _detect_voice_activity(self, input_audio):
+#         try:
+#             with open(input_audio, "rb") as file:
+#                 transcription = self.client.audio.transcriptions.create(
+#                     file=(os.path.basename(input_audio), file.read()),
+#                     model="whisper-large-v3-turbo",
+#                     response_format="verbose_json",
+#                     language=get_config().vad.language,
+#                     temperature=0.0,
+#                     timestamp_granularities=["segment"],
+#                     prompt=f"Start detecting speech from the first spoken word. If there is music or background noise, ignore it completely. Be very careful to not hallucinate on silence. If the transcription is anything but language:{get_config().vad.language}, ignore it completely. If the end of the audio seems like the start of a new sentence, ignore it completely.",
+#                 )
+#
+#             logger.debug(transcription)
+#             speech_segments = []
+#             if hasattr(transcription, 'segments'):
+#                 speech_segments = transcription.segments
+#             elif hasattr(transcription, 'words'):
+#                 speech_segments = transcription.words
+#             return speech_segments
+#         except Exception as e:
+#             logger.error(f"Error detecting voice with Groq: {e}")
+#             return [], 0.0
 
 
 vad_processor = VADSystem()
 
-#
-...
+# Test cases for all VADProcessors
+def test_vad_processors():
+    logger.setLevel(logging.DEBUG)
+    test_audio = r"C:\Users\Beangate\GSM\Electron App\test\tmptnl4a93q_untrimmed.opus"
+    output_dir = r"C:\Users\Beangate\GSM\Electron App\test\output"
+    os.makedirs(output_dir, exist_ok=True)
+    processors = [
+        (WhisperVADProcessor(), "after_splice_whisper.opus"),
+        (SileroVADProcessor(), "after_splice_silero.opus"),
+        # (VoskVADProcessor(), "after_splice_vosk.opus"),
+        # (GroqVADProcessor(), "after_splice_groq.opus"),
+    ]
+    # get_config().vad.cut_and_splice_segments = True
+    # get_config().vad.splice_padding = 0.3
+    # for processor, out_name in processors:
+    #     logger.info("Testing Splice Audio with " + processor.vad_system_name)
+    #     out_path = os.path.join(output_dir, out_name)
+    #     if os.path.exists(out_path):
+    #         os.remove(out_path)
+    #     processor.process_audio(test_audio, out_path, None)
+
+    get_config().vad.cut_and_splice_segments = False
+    get_config().vad.trim_beginning = True
+    for processor, out_name in processors:
+        logger.info("Testing Trim Audio with " + processor.vad_system_name)
+        out_path = os.path.join(output_dir, out_name.replace("after_splice_", "after_trim_"))
+        if os.path.exists(out_path):
+            os.remove(out_path)
+        processor.process_audio(test_audio, out_path, None)
+
+
+if __name__ == "__main__":
+    test_vad_processors()