GameSentenceMiner 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GameSentenceMiner/__init__.py +0 -0
- GameSentenceMiner/anki.py +265 -0
- GameSentenceMiner/config_gui.py +803 -0
- GameSentenceMiner/configuration.py +359 -0
- GameSentenceMiner/ffmpeg.py +297 -0
- GameSentenceMiner/gametext.py +128 -0
- GameSentenceMiner/gsm.py +385 -0
- GameSentenceMiner/model.py +84 -0
- GameSentenceMiner/notification.py +69 -0
- GameSentenceMiner/obs.py +128 -0
- GameSentenceMiner/util.py +136 -0
- GameSentenceMiner/vad/__init__.py +0 -0
- GameSentenceMiner/vad/silero_trim.py +43 -0
- GameSentenceMiner/vad/vosk_helper.py +152 -0
- GameSentenceMiner/vad/whisper_helper.py +98 -0
- GameSentenceMiner-2.0.0.dist-info/METADATA +346 -0
- GameSentenceMiner-2.0.0.dist-info/RECORD +20 -0
- GameSentenceMiner-2.0.0.dist-info/WHEEL +5 -0
- GameSentenceMiner-2.0.0.dist-info/entry_points.txt +2 -0
- GameSentenceMiner-2.0.0.dist-info/top_level.txt +1 -0
GameSentenceMiner/obs.py
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
import time
|
2
|
+
|
3
|
+
import obswebsocket
|
4
|
+
from obswebsocket import obsws, requests
|
5
|
+
from obswebsocket.exceptions import ConnectionFailure
|
6
|
+
|
7
|
+
from . import util
|
8
|
+
from . import configuration
|
9
|
+
from .configuration import *
|
10
|
+
from .model import *
|
11
|
+
|
12
|
+
client: obsws = None
|
13
|
+
|
14
|
+
# REFERENCE: https://github.com/obsproject/obs-websocket/blob/master/docs/generated/protocol.md
|
15
|
+
|
16
|
+
|
17
|
+
def on_connect(obs):
    """obs-websocket callback: fired each time the connection is (re)established.

    Optionally starts the replay buffer when the config asks for it.
    """
    logger.info("Connected to OBS WebSocket.")
    # Give OBS a moment to finish initializing before issuing requests.
    time.sleep(2)
    if get_config().obs.start_buffer:
        start_replay_buffer()
|
22
|
+
|
23
|
+
|
24
|
+
def on_disconnect(obs):
    """obs-websocket callback: fired when the connection drops.

    Reconnection itself is handled by the client's ``authreconnect`` setting.
    """
    logger.error("OBS Connection Lost!")
|
26
|
+
|
27
|
+
|
28
|
+
def connect_to_obs(start_replay=False):
    """Create and connect the module-level obs-websocket client.

    Exits the process (exit code 1) when the connection cannot be
    established. When *start_replay* is True and the config allows it, the
    replay buffer is started, and the current-game cache is refreshed.
    """
    global client
    if get_config().obs.enabled:
        # authreconnect=1: the client automatically retries after drops,
        # firing on_connect/on_disconnect each time.
        client = obsws(host=get_config().obs.host, port=get_config().obs.port,
                       password=get_config().obs.password, authreconnect=1, on_connect=on_connect,
                       on_disconnect=on_disconnect)
        try:
            client.connect()
        except ConnectionFailure:
            logger.error("OBS Websocket Connection Has not been Set up, please set it up in Settings")
            exit(1)

        # Brief settle time before issuing the first requests.
        time.sleep(1)
        if start_replay and get_config().obs.start_buffer:
            start_replay_buffer()
        update_current_game()
|
44
|
+
|
45
|
+
|
46
|
+
# Disconnect from OBS WebSocket
def disconnect_from_obs():
    """Tear down the module-level obs-websocket client, if one exists."""
    global client
    if not client:
        return
    client.disconnect()
    client = None
    logger.info("Disconnected from OBS WebSocket.")
|
53
|
+
|
54
|
+
|
55
|
+
# Start replay buffer
def start_replay_buffer():
    """Start OBS's replay buffer; failures are logged, never raised."""
    try:
        # GetVersion first — presumably a cheap connectivity probe before
        # the real request; confirm against original author intent.
        client.call(requests.GetVersion())
        client.call(requests.StartReplayBuffer())
    except Exception as e:
        # Use the module logger (was a bare print()) for consistency with
        # the rest of this module's error reporting.
        logger.error(f"Error starting replay buffer: {e}")
|
62
|
+
|
63
|
+
|
64
|
+
# Stop replay buffer
def stop_replay_buffer():
    """Stop OBS's replay buffer; failures are logged, never raised."""
    try:
        client.call(requests.StopReplayBuffer())
        # logger instead of print() for consistency with the module.
        logger.info("Replay buffer stopped.")
    except Exception as e:
        logger.error(f"Error stopping replay buffer: {e}")
|
71
|
+
|
72
|
+
|
73
|
+
# Save the current replay buffer
def save_replay_buffer():
    """Ask OBS to flush the replay buffer to disk; failures are logged, never raised."""
    try:
        client.call(requests.SaveReplayBuffer())
    except Exception as e:
        # logger instead of print() for consistency with the module.
        logger.error(f"Error saving replay buffer: {e}")
|
79
|
+
|
80
|
+
|
81
|
+
def get_current_scene():
    """Return the name of OBS's current program scene, or '' on any failure."""
    try:
        response = client.call(requests.GetCurrentProgramScene())
        scene_info = SceneInfo.from_dict(response.datain)
        return scene_info.sceneName
    except Exception as e:
        # logger instead of print() for consistency with the module.
        logger.error(f"Couldn't get scene: {e}")
        return ''
|
89
|
+
|
90
|
+
|
91
|
+
def get_source_from_scene(scene_name):
    """Return the first scene item of *scene_name*, or '' on any failure.

    NOTE: callers must tolerate both a scene-item object and '' — the
    empty-string fallback is kept for backward compatibility.
    """
    try:
        response = client.call(requests.GetSceneItemList(sceneName=scene_name))
        scene_list = SceneItemsResponse.from_dict(response.datain)
        # Demoted from an unconditional print() of the whole item list.
        logger.debug(scene_list)
        return scene_list.sceneItems[0]
    except Exception as e:
        logger.error(f"Error getting source from scene: {e}")
        return ''
|
100
|
+
|
101
|
+
|
102
|
+
def get_screenshot():
    """Capture the current game's OBS source to a uniquely-named PNG.

    Returns the screenshot path, or None when no source is found or the
    capture fails (errors are logged, never raised).
    """
    try:
        screenshot = util.make_unique_file_name(os.path.abspath(configuration.temp_directory) + '/screenshot.png')
        update_current_game()
        current_source = get_source_from_scene(get_current_game())
        # get_source_from_scene returns '' on failure; the attribute access
        # then raises and is handled by the except below.
        current_source_name = current_source.sourceName
        if not current_source_name:
            # logger instead of print() for consistency with the module.
            logger.error("No active scene found.")
            return None
        client.call(
            requests.SaveSourceScreenshot(sourceName=current_source_name, imageFormat='png', imageFilePath=screenshot))
        return screenshot
    except Exception as e:
        logger.error(f"Error getting screenshot: {e}")
        return None
|
116
|
+
|
117
|
+
|
118
|
+
def update_current_game():
    """Refresh the module-level current-game cache from OBS's active scene."""
    configuration.current_game = get_current_scene()
|
120
|
+
|
121
|
+
|
122
|
+
def get_current_game(sanitize=False):
    """Return the cached current game (the OBS scene name), refreshing the
    cache from OBS when it is unset.

    Args:
        sanitize: when True, strip filesystem-unsafe characters from the name.
    """
    if not configuration.current_game:
        update_current_game()

    name = configuration.current_game
    return util.sanitize_filename(name) if sanitize else name
|
@@ -0,0 +1,136 @@
|
|
1
|
+
import os
|
2
|
+
import random
|
3
|
+
import re
|
4
|
+
import string
|
5
|
+
import subprocess
|
6
|
+
import threading
|
7
|
+
from datetime import datetime
|
8
|
+
from sys import platform
|
9
|
+
|
10
|
+
from rapidfuzz import process
|
11
|
+
|
12
|
+
SCRIPTS_DIR = r"E:\Japanese Stuff\agent-v0.1.4-win32-x64\data\scripts"
|
13
|
+
|
14
|
+
# Global variables to control script execution
|
15
|
+
use_previous_audio = False
|
16
|
+
keep_running = True
|
17
|
+
lock = threading.Lock()
|
18
|
+
|
19
|
+
|
20
|
+
def run_new_thread(func):
    """Run *func* on a fresh daemon thread and return the started Thread."""
    worker = threading.Thread(target=func, daemon=True)
    worker.start()
    return worker
|
24
|
+
|
25
|
+
|
26
|
+
def make_unique_file_name(path):
    """Return *path* with a millisecond-precision timestamp inserted before
    the extension, e.g. ``clips/a.mp4`` -> ``clips/a_2024-01-02-03-04-05-678.mp4``.

    Fix: the original computed the stem but returned a hard-coded prefix,
    discarding the directory and base name entirely.
    """
    filename, extension = path.rsplit('.', 1)

    # Millisecond precision keeps rapid successive captures distinct.
    current_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]

    return f"{filename}_{current_time}.{extension}"
|
34
|
+
|
35
|
+
# Characters never allowed in output filenames: spaces, Windows-reserved
# punctuation, and ASCII control characters.
_FILENAME_BAD_CHARS = re.compile(r'[ <>:"/\\|?*\x00-\x1F]')


def sanitize_filename(filename):
    """Strip spaces, reserved characters and control chars from *filename*."""
    return _FILENAME_BAD_CHARS.sub('', filename)
|
37
|
+
|
38
|
+
|
39
|
+
def get_random_digit_string():
    """Return a string of 9 random decimal digits (not cryptographically secure)."""
    return ''.join(random.choices(string.digits, k=9))
|
41
|
+
|
42
|
+
|
43
|
+
def timedelta_to_ffmpeg_friendly_format(td_obj):
    """Format a timedelta as ffmpeg's ``HH:MM:SS.mmm`` timestamp string."""
    remaining = td_obj.total_seconds()
    hours, remaining = divmod(remaining, 3600)
    minutes, seconds = divmod(remaining, 60)
    return f"{int(hours):02}:{int(minutes):02}:{seconds:06.3f}"
|
48
|
+
|
49
|
+
|
50
|
+
def get_file_modification_time(file_path):
    """Return the file's last-modification time as a local naive datetime."""
    return datetime.fromtimestamp(os.path.getmtime(file_path))
|
54
|
+
|
55
|
+
|
56
|
+
def get_process_id_by_title(game_title):
    """Return the PID (as a string) of the first process whose main window
    title contains *game_title*. Windows-only: shells out to PowerShell.

    NOTE(review): *game_title* is interpolated into the PowerShell pipeline
    unescaped — a title containing quotes/wildcards would break or inject
    into the command; confirm titles are trusted.
    """
    powershell_command = f"Get-Process | Where-Object {{$_.MainWindowTitle -like '*{game_title}*'}} | Select-Object -First 1 -ExpandProperty Id"
    process_id = subprocess.check_output(["powershell", "-Command", powershell_command], text=True).strip()
    print(f"Process ID for {game_title}: {process_id}")
    return process_id
|
61
|
+
|
62
|
+
|
63
|
+
def get_script_files(directory):
    """Recursively collect the paths of all ``.js`` hook scripts under *directory*."""
    return [
        os.path.join(parent, name)
        for parent, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith(".js")
    ]
|
70
|
+
|
71
|
+
|
72
|
+
def filter_steam_scripts(scripts):
    """Keep only scripts whose base filename marks them as PC Steam hooks."""
    steam_only = []
    for script_path in scripts:
        if "PC_Steam" in os.path.basename(script_path):
            steam_only.append(script_path)
    return steam_only
|
74
|
+
|
75
|
+
|
76
|
+
def extract_game_name(script_path):
    """Derive a human-readable game name from a hook script's filename."""
    base = os.path.basename(script_path)
    stem = base.replace("PC_Steam_", "").replace(".js", "")
    # Underscores and dots in script filenames stand in for spaces.
    return stem.replace("_", " ").replace(".", " ")
|
81
|
+
|
82
|
+
|
83
|
+
def find_most_similar_script(game_title, steam_scripts):
    """Fuzzy-match *game_title* against the game names derived from *steam_scripts*.

    Returns ``(script_path, matched_name, score)``, or ``(None, None, None)``
    when there is no candidate to match.
    """
    candidate_names = [extract_game_name(path) for path in steam_scripts]

    # rapidfuzz returns (choice, score, index) for the closest candidate.
    match = process.extractOne(game_title, candidate_names)
    if not match:
        return None, None, None

    matched_game_name, confidence_score, index = match
    return steam_scripts[index], matched_game_name, confidence_score
|
94
|
+
|
95
|
+
|
96
|
+
def find_script_for_game(game_title):
    """Locate the agent hook script that best matches *game_title*.

    Returns the script path, or None when nothing similar is found.
    """
    steam_scripts = filter_steam_scripts(get_script_files(SCRIPTS_DIR))

    best_script, _matched_name, _confidence = find_most_similar_script(game_title, steam_scripts)

    if not best_script:
        print("No similar script found.")
        return None

    print(f"Found Script: {best_script}")
    return best_script
|
108
|
+
|
109
|
+
|
110
|
+
def run_agent_and_hook(pname, agent_script):
    """Launch the `agent` texthooker attached to process *pname* using
    *agent_script*, and block until it exits.

    Clears the module-level ``keep_running`` flag when the agent closes.
    Fix: the original assigned ``keep_running = False`` without a ``global``
    statement, creating a dead local — the module flag was never cleared.
    """
    global keep_running

    command = f'agent --script=\"{agent_script}\" --pname={pname}'
    print("Running and Hooking Agent!")
    try:
        dos_process = subprocess.Popen(command, shell=True)
        dos_process.wait()  # Block until the agent window is closed.
        print("Agent script finished or closed.")
    except Exception as e:
        print(f"Error occurred while running agent script: {e}")

    keep_running = False
|
121
|
+
|
122
|
+
|
123
|
+
def is_linux():
    """True when running on Linux (``sys.platform == 'linux'`` on modern CPython)."""
    return platform == 'linux'
|
125
|
+
|
126
|
+
# def run_command(command, shell=False, input=None, capture_output=False, timeout=None, check=False, **kwargs):
|
127
|
+
# # Use shell=True if the OS is Linux, otherwise shell=False
|
128
|
+
# if is_linux():
|
129
|
+
# return subprocess.run(command, shell=True, input=input, capture_output=capture_output, timeout=timeout,
|
130
|
+
# check=check, **kwargs)
|
131
|
+
# else:
|
132
|
+
# return subprocess.run(command, shell=shell, input=input, capture_output=capture_output, timeout=timeout,
|
133
|
+
# check=check, **kwargs)
|
134
|
+
# Matches any angle-bracketed run, i.e. anything that looks like a tag.
_TAG_PATTERN = re.compile(r'<.*?>')


def remove_html_tags(text):
    """Strip HTML/XML-style tags from *text*, keeping the inner text."""
    return _TAG_PATTERN.sub('', text)
|
File without changes
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import tempfile
|
2
|
+
|
3
|
+
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
|
4
|
+
|
5
|
+
from .. import configuration, ffmpeg
|
6
|
+
from ..configuration import *
|
7
|
+
|
8
|
+
# Silero VAD setup
|
9
|
+
vad_model = load_silero_vad()
|
10
|
+
|
11
|
+
|
12
|
+
# Use Silero to detect voice activity with timestamps in the audio
def detect_voice_with_silero(input_audio):
    """Run Silero VAD over *input_audio* and return speech timestamps.

    Returns a list of ``{'start': float, 'end': float}`` dicts in seconds
    (``return_seconds=True``); empty when no speech is detected.
    """
    # Convert the audio to 16kHz mono WAV for the VAD model.
    # NOTE(review): only NamedTemporaryFile(...).name is kept; the handle is
    # closed (and the file deleted) at GC time, which races with ffmpeg
    # writing the path and locks the file on Windows — consider
    # delete=False plus explicit cleanup.
    temp_wav = tempfile.NamedTemporaryFile(dir=configuration.temp_directory, suffix='.wav').name
    ffmpeg.convert_audio_to_wav(input_audio, temp_wav)

    # Load the audio and detect speech timestamps
    wav = read_audio(temp_wav, sampling_rate=16000)
    speech_timestamps = get_speech_timestamps(wav, vad_model, return_seconds=True)

    logger.debug(speech_timestamps)

    # Return the speech timestamps (start and end in seconds)
    return speech_timestamps
|
26
|
+
|
27
|
+
|
28
|
+
# Trim an audio file to its Silero-detected speech span.
def process_audio_with_silero(input_audio, output_audio):
    """Trim *input_audio* to the detected speech span and write *output_audio*.

    Returns True when speech was found and the trimmed file was written,
    False when no voice activity was detected.
    """
    voice_activity = detect_voice_with_silero(input_audio)

    if not voice_activity:
        logger.info("No voice activity detected in the audio.")
        return False

    # Keep everything from the first detected speech to the last, padded by
    # the configured end offset.
    start_time = voice_activity[0]['start']
    end_time = voice_activity[-1]['end']

    ffmpeg.trim_audio(input_audio, start_time, end_time + get_config().audio.end_offset, output_audio)
    logger.info(f"Trimmed audio saved to: {output_audio}")
    return True
|
@@ -0,0 +1,152 @@
|
|
1
|
+
import tarfile
|
2
|
+
import tempfile
|
3
|
+
import zipfile
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import requests
|
7
|
+
import soundfile as sf
|
8
|
+
import vosk
|
9
|
+
|
10
|
+
from .. import ffmpeg, configuration
|
11
|
+
from ..configuration import *
|
12
|
+
|
13
|
+
ffmpeg_base_command_list = ["ffmpeg", "-hide_banner", "-loglevel", "error"]
|
14
|
+
vosk.SetLogLevel(-1)
|
15
|
+
vosk_model_path = ''
|
16
|
+
vosk_model = None
|
17
|
+
|
18
|
+
|
19
|
+
# Function to download and cache the Vosk model
def download_and_cache_vosk_model(model_dir="vosk_model_cache"):
    """Download (once), extract (once) and return the configured Vosk model.

    The model URL comes from ``get_config().vad.vosk_url``. Returns a
    directory path suitable for ``vosk.Model(...)``.
    """
    # Ensure the cache directory exists
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Extract the model name from the URL
    model_filename = get_config().vad.vosk_url.split("/")[-1]
    model_path = os.path.join(model_dir, model_filename)

    # If the model is already downloaded, skip the download
    if not os.path.exists(model_path):
        logger.info(
            f"Downloading the Vosk model from {get_config().vad.vosk_url}... This will take a while if using large model, ~1G")
        # Stream to disk in 8 KiB chunks so the archive never sits in memory.
        response = requests.get(get_config().vad.vosk_url, stream=True)
        with open(model_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)
        logger.info("Download complete.")

    # Extract the model if it's a zip or tar file
    model_extract_path = os.path.join(model_dir, "vosk_model")
    if not os.path.exists(model_extract_path):
        logger.info("Extracting the Vosk model...")
        if model_filename.endswith(".zip"):
            with zipfile.ZipFile(model_path, "r") as zip_ref:
                zip_ref.extractall(model_extract_path)
        elif model_filename.endswith(".tar.gz"):
            with tarfile.open(model_path, "r:gz") as tar_ref:
                tar_ref.extractall(model_extract_path)
        else:
            logger.info("Unknown archive format. Model extraction skipped.")
        logger.info(f"Model extracted to {model_extract_path}.")
    else:
        logger.info(f"Model already extracted at {model_extract_path}.")

    # Return the path to the actual model folder inside the extraction directory
    extracted_folders = os.listdir(model_extract_path)
    if extracted_folders:
        actual_model_folder = os.path.join(model_extract_path,
                                           extracted_folders[0])  # Assuming the first folder is the model
        return actual_model_folder
    else:
        return model_extract_path  # In case there's no subfolder, return the extraction path directly
|
64
|
+
|
65
|
+
|
66
|
+
# Use Vosk to detect voice activity with timestamps in the audio
def detect_voice_with_vosk(input_audio):
    """Transcribe *input_audio* with Vosk and return per-word timestamps.

    Returns ``(voice_activity, total_duration)`` where *voice_activity* is a
    list of ``{'text', 'start', 'end'}`` dicts (seconds), or ``(None, 0)``
    when the recognition looks like a single word or filler only.
    """
    global vosk_model_path, vosk_model
    # Convert the audio to 16kHz mono WAV
    # NOTE(review): only NamedTemporaryFile(...).name is kept; the handle is
    # closed/deleted at GC time, which can race with ffmpeg (and locks the
    # file on Windows) — consider delete=False plus explicit cleanup.
    temp_wav = tempfile.NamedTemporaryFile(dir=configuration.temp_directory, suffix='.wav').name
    ffmpeg.convert_audio_to_wav(input_audio, temp_wav)

    # Lazily download/load the model on first use.
    if not vosk_model_path or not vosk_model:
        vosk_model_path = download_and_cache_vosk_model()
        vosk_model = vosk.Model(vosk_model_path)

    # Open the audio file
    with sf.SoundFile(temp_wav) as audio_file:
        recognizer = vosk.KaldiRecognizer(vosk_model, audio_file.samplerate)
        voice_activity = []
        total_duration = len(audio_file) / audio_file.samplerate  # Get total duration in seconds

        recognizer.SetWords(True)  # request per-word timing and confidence
        # recognizer.SetPartialWords(True)

        # Process audio in chunks
        while True:
            data = audio_file.buffer_read(4000, dtype='int16')
            if len(data) == 0:
                break

            # Convert buffer to bytes using NumPy
            data_bytes = np.frombuffer(data, dtype='int16').tobytes()

            if recognizer.AcceptWaveform(data_bytes):
                pass  # intermediate results are ignored; only FinalResult is used

        final_result = json.loads(recognizer.FinalResult())
        if 'result' in final_result:
            should_use = False
            unique_words = set()
            # Only words with >= 0.90 confidence count toward the decision.
            for word in final_result['result']:
                if word['conf'] >= .90:
                    logger.debug(word)
                    should_use = True
                    unique_words.add(word['word'])
            # Reject audio that is one repeated word or only filler
            # syllables ('えー', 'ん').
            if len(unique_words) == 1 or all(item in ['えー', 'ん'] for item in unique_words):
                should_use = False

            if not should_use:
                return None, 0

            for word in final_result['result']:
                voice_activity.append({
                    'text': word['word'],
                    'start': word['start'],
                    'end': word['end']
                })

        # Return the detected voice activity and the total duration
        return voice_activity, total_duration
|
122
|
+
|
123
|
+
|
124
|
+
# Trim an audio file to its Vosk-detected speech span.
def process_audio_with_vosk(input_audio, output_audio):
    """Trim *input_audio* to the speech span detected by Vosk and write the
    result to *output_audio*.

    Returns True on success, False when no voice activity was detected.
    """
    segments, total_duration = detect_voice_with_vosk(input_audio)

    if not segments:
        logger.info("No voice activity detected in the audio.")
        return False

    # First detected word bounds the start, last detected word the end.
    start_time = segments[0]['start']
    end_time = segments[-1]['end'] if segments else total_duration

    # NOTE(review): trim_beginning only gates this log line — the start is
    # trimmed regardless of the setting; confirm whether that is intended.
    if get_config().vad.trim_beginning:
        logger.info(f"Trimmed Beginning of Audio to {start_time}")

    logger.info(f"Trimmed End of Audio to {end_time} seconds:")

    ffmpeg.trim_audio(input_audio, start_time, end_time + get_config().audio.end_offset, output_audio)
    logger.info(f"Trimmed audio saved to: {output_audio}")
    return True
|
146
|
+
|
147
|
+
|
148
|
+
def get_vosk_model():
    """Eagerly download/locate the Vosk model and load it into the module
    globals used by ``detect_voice_with_vosk`` (startup warm-up)."""
    global vosk_model_path, vosk_model
    vosk_model_path = download_and_cache_vosk_model()
    vosk_model = vosk.Model(vosk_model_path)
    logger.info(f"Using Vosk model from {vosk_model_path}")
|
@@ -0,0 +1,98 @@
|
|
1
|
+
import tempfile
|
2
|
+
import warnings
|
3
|
+
|
4
|
+
import stable_whisper as whisper
|
5
|
+
from stable_whisper import WhisperResult
|
6
|
+
|
7
|
+
from .. import ffmpeg, configuration
|
8
|
+
from ..configuration import *
|
9
|
+
|
10
|
+
ffmpeg_base_command_list = ["ffmpeg", "-hide_banner", "-loglevel", "error"]
|
11
|
+
whisper_model = None
|
12
|
+
|
13
|
+
|
14
|
+
# Function to download and load the Whisper model
def load_whisper_model():
    """Lazily load the configured stable-whisper model into the module global.

    No-op when the model is already loaded.
    """
    global whisper_model
    if whisper_model is None:
        logger.info(f"Loading Whisper model '{get_config().vad.whisper_model}'... This may take a while.")
        # Suppress the warnings Whisper emits while loading.
        # NOTE(review): catch_warnings(action=...) requires Python 3.11+.
        with warnings.catch_warnings(action="ignore"):
            whisper_model = whisper.load_model(get_config().vad.whisper_model)
        logger.info("Whisper model loaded.")
|
22
|
+
|
23
|
+
|
24
|
+
# Use Whisper to detect voice activity with timestamps in the audio
def detect_voice_with_whisper(input_audio):
    """Transcribe *input_audio* with stable-whisper (Japanese) and return
    per-word voice activity.

    Returns a list of ``{'text', 'start', 'end', 'confidence'}`` dicts in
    seconds, or None when the detected words look like filler only.
    """
    # Convert the audio to 16kHz mono WAV
    # NOTE(review): only NamedTemporaryFile(...).name is kept; the handle is
    # closed/deleted at GC time, which can race with ffmpeg on Windows.
    temp_wav = tempfile.NamedTemporaryFile(dir=configuration.temp_directory, suffix='.wav').name
    ffmpeg.convert_audio_to_wav(input_audio, temp_wav)

    # Make sure Whisper is loaded
    load_whisper_model()

    logger.info('transcribing audio...')

    # Transcribe the audio using Whisper
    with warnings.catch_warnings(action="ignore"):
        result: WhisperResult = whisper_model.transcribe(temp_wav, vad=True, language='ja')

    voice_activity = []

    logger.debug(result.to_dict())

    # Process the segments to extract tokens, timestamps, and confidence
    for segment in result.segments:
        logger.debug(segment.to_dict())
        for word in segment.words:
            logger.debug(word.to_dict())
            confidence = word.probability
            # Very low-probability words are treated as noise and dropped.
            if confidence > .1:
                logger.debug(word)
                voice_activity.append({
                    'text': word.word,
                    'start': word.start,
                    'end': word.end,
                    'confidence': word.probability
                })

    # Analyze the detected words to decide whether to use the audio.
    # NOTE(review): this accepts a single non-filler word, whereas the Vosk
    # helper rejects any single-word result — confirm which heuristic is
    # intended. An empty result is always rejected (all() on an empty set
    # is True).
    should_use = False
    unique_words = set(word['text'] for word in voice_activity)
    if len(unique_words) > 1 or not all(item in ['えー', 'ん'] for item in unique_words):
        should_use = True

    if not should_use:
        return None

    # Return the detected voice activity
    return voice_activity
|
69
|
+
|
70
|
+
|
71
|
+
# Trim an audio file to its Whisper-detected speech span.
def process_audio_with_whisper(input_audio, output_audio):
    """Trim *input_audio* to the speech span detected by Whisper and write
    the result to *output_audio*.

    Returns True on success, False when no voice activity was detected.
    """
    segments = detect_voice_with_whisper(input_audio)

    if not segments:
        logger.info("No voice activity detected in the audio.")
        return False

    # First and last detected words bound the kept span.
    trim_start = segments[0]['start']
    trim_end = segments[-1]['end']

    # NOTE(review): trim_beginning only gates this log line — the start is
    # trimmed regardless of the setting; confirm whether that is intended.
    if get_config().vad.trim_beginning:
        logger.info(f"Trimmed Beginning of Audio to {trim_start}")

    logger.info(f"Trimmed End of Audio to {trim_end} seconds:")

    ffmpeg.trim_audio(input_audio, trim_start, trim_end + get_config().audio.end_offset, output_audio)
    logger.info(f"Trimmed audio saved to: {output_audio}")
    return True
|
93
|
+
|
94
|
+
|
95
|
+
# Load Whisper model initially
def initialize_whisper_model():
    """Startup warm-up: load the Whisper model and announce which one is used."""
    load_whisper_model()
    logger.info(f"Using Whisper model '{get_config().vad.whisper_model}' for Japanese voice detection")
|