abstractvoice 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__init__.py +33 -0
- abstractvoice/__main__.py +119 -0
- abstractvoice/examples/__init__.py +1 -0
- abstractvoice/examples/cli_repl.py +861 -0
- abstractvoice/examples/voice_cli.py +85 -0
- abstractvoice/examples/web_api.py +214 -0
- abstractvoice/recognition.py +252 -0
- abstractvoice/stt/__init__.py +5 -0
- abstractvoice/stt/transcriber.py +138 -0
- abstractvoice/tts/__init__.py +5 -0
- abstractvoice/tts/tts_engine.py +931 -0
- abstractvoice/vad/__init__.py +5 -0
- abstractvoice/vad/voice_detector.py +75 -0
- abstractvoice/voice_manager.py +294 -0
- abstractvoice-0.1.0.dist-info/METADATA +1132 -0
- abstractvoice-0.1.0.dist-info/RECORD +20 -0
- abstractvoice-0.1.0.dist-info/WHEEL +5 -0
- abstractvoice-0.1.0.dist-info/entry_points.txt +3 -0
- abstractvoice-0.1.0.dist-info/licenses/LICENSE +21 -0
- abstractvoice-0.1.0.dist-info/top_level.txt +1 -0
abstractvoice/examples/voice_cli.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""
+AbstractVoice voice mode CLI launcher.
+
+This module provides a direct entry point to start AbstractVoice in voice mode.
+"""
+
+import argparse
+import time
+from abstractvoice.examples.cli_repl import VoiceREPL
+
+def parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(description="AbstractVoice Voice Mode")
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    parser.add_argument("--api", default="http://localhost:11434/api/chat",
+                        help="LLM API URL")
+    parser.add_argument("--model", default="granite3.3:2b",
+                        help="LLM model name")
+    parser.add_argument("--whisper", default="tiny",
+                        help="Whisper model to use (tiny, base, small, medium, large)")
+    parser.add_argument("--no-listening", action="store_true",
+                        help="Disable speech-to-text (listening), TTS still works")
+    parser.add_argument("--system",
+                        help="Custom system prompt")
+    parser.add_argument("--temperature", type=float, default=0.4,
+                        help="Set temperature (0.0-2.0) for the LLM")
+    parser.add_argument("--max-tokens", type=int, default=4096,
+                        help="Set maximum tokens for the LLM response")
+    return parser.parse_args()
+
+def main():
+    """Entry point for direct voice mode."""
+    try:
+        # Parse command line arguments
+        args = parse_args()
+
+        print("Starting AbstractVoice voice interface...")
+
+        # Initialize REPL
+        repl = VoiceREPL(
+            api_url=args.api,
+            model=args.model,
+            debug_mode=args.debug
+        )
+
+        # Set custom system prompt if provided
+        if args.system:
+            repl.system_prompt = args.system
+            repl.messages = [{"role": "system", "content": args.system}]
+            if args.debug:
+                print(f"System prompt set to: {args.system}")
+
+        # Set temperature and max_tokens
+        repl.temperature = args.temperature
+        repl.max_tokens = args.max_tokens
+        if args.debug:
+            print(f"Temperature: {args.temperature}")
+            print(f"Max tokens: {args.max_tokens}")
+
+        # Change Whisper model if specified
+        if args.whisper and args.whisper != "tiny":
+            if repl.voice_manager.set_whisper(args.whisper):
+                if args.debug:
+                    print(f"Using Whisper model: {args.whisper}")
+
+        # Start in voice mode automatically unless --no-listening is specified
+        if not args.no_listening:
+            print("Activating voice mode. Say 'stop' to exit voice mode.")
+            # Use the existing voice mode method
+            repl.do_voice("on")
+
+        # Start the REPL
+        repl.cmdloop()
+
+    except KeyboardInterrupt:
+        print("\nExiting AbstractVoice...")
+    except Exception as e:
+        print(f"Application error: {e}")
+        if args.debug:
+            import traceback
+            traceback.print_exc()
+
+if __name__ == "__main__":
+    main()
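The launcher above is a thin wrapper around VoiceREPL, so it can also be driven programmatically. A minimal sketch, assuming the package and its model dependencies are installed and an LLM endpoint is serving the default URL; the flag values here are illustrative, not defaults:

    # Hypothetical programmatic invocation of the CLI launcher above.
    # parse_args() reads sys.argv, so we patch it before calling main().
    import sys
    from abstractvoice.examples.voice_cli import main

    sys.argv = [
        "voice_cli",            # program name placeholder
        "--whisper", "base",    # larger model than the "tiny" default
        "--temperature", "0.7",
        "--no-listening",       # TTS only; drop this flag to enable STT
    ]
    main()  # blocks in the REPL's cmdloop() until the user exits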
abstractvoice/examples/web_api.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+"""
+Web API example using AbstractVoice with Flask.
+
+This example shows how to create a simple web API that exposes
+AbstractVoice functionality to web applications.
+"""
+
+import argparse
+import json
+import os
+import tempfile
+import uuid
+from flask import Flask, request, jsonify, send_file, render_template_string
+
+# Import VoiceManager only when needed
+# from abstractvoice import VoiceManager
+
+
+# Initialize Flask app
+app = Flask(__name__)
+
+# Global voice manager
+voice_manager = None
+
+# Store active sessions (in a real app, use a database)
+active_sessions = {}
+
+
+# Simple HTML template for the home page
+HOME_PAGE_TEMPLATE = """
+<!DOCTYPE html>
+<html>
+<head>
+    <title>AbstractVoice Web API</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 20px;
+            line-height: 1.6;
+        }
+        h1, h2 {
+            color: #333;
+        }
+        pre {
+            background-color: #f5f5f5;
+            padding: 10px;
+            border-radius: 4px;
+            overflow-x: auto;
+        }
+        code {
+            background-color: #f5f5f5;
+            padding: 2px 4px;
+            border-radius: 4px;
+        }
+        .endpoint {
+            margin-bottom: 20px;
+            border-bottom: 1px solid #eee;
+            padding-bottom: 20px;
+        }
+    </style>
+</head>
+<body>
+    <h1>AbstractVoice Web API</h1>
+    <p>Welcome to the AbstractVoice Web API. Below are the available endpoints:</p>
+
+    <div class="endpoint">
+        <h2>GET /api/status</h2>
+        <p>Get the status of the voice services.</p>
+        <pre>curl http://{{ host }}:{{ port }}/api/status</pre>
+    </div>
+
+    <div class="endpoint">
+        <h2>POST /api/tts</h2>
+        <p>Convert text to speech and return audio file.</p>
+        <p><strong>Request Body:</strong></p>
+        <pre>{
+    "text": "Text to speak",
+    "speed": 1.0  // Optional
+}</pre>
+        <p><strong>Example:</strong></p>
+        <pre>curl -X POST http://{{ host }}:{{ port }}/api/tts \
+    -H "Content-Type: application/json" \
+    -d '{"text":"Hello, this is a test", "speed":1.0}' \
+    --output speech.wav</pre>
+    </div>
+
+    <div class="endpoint">
+        <h2>POST /api/stt/transcribe</h2>
+        <p>Transcribe audio from file.</p>
+        <p><strong>Example:</strong></p>
+        <pre>curl -X POST http://{{ host }}:{{ port }}/api/stt/transcribe \
+    -F "audio_file=@/path/to/audio.wav"</pre>
+    </div>
+
+    <div class="endpoint">
+        <h2>POST /api/stt/start</h2>
+        <p>Start a listening session.</p>
+        <pre>curl -X POST http://{{ host }}:{{ port }}/api/stt/start</pre>
+    </div>
+
+    <div class="endpoint">
+        <h2>POST /api/stt/stop</h2>
+        <p>Stop a listening session.</p>
+        <p><strong>Request Body:</strong></p>
+        <pre>{
+    "session_id": "UUID of the session"
+}</pre>
+    </div>
+</body>
+</html>
+"""
+
+
+@app.route('/')
+def home():
+    """Serve the home page with API documentation."""
+    host = request.host.split(':')[0]
+    port = request.host.split(':')[1] if ':' in request.host else "5000"
+    return render_template_string(HOME_PAGE_TEMPLATE, host=host, port=port)
+
+
+@app.route('/api/test', methods=['GET'])
+def test_api():
+    """Simple test endpoint to verify the API is working."""
+    return jsonify({
+        "status": "ok",
+        "message": "AbstractVoice Web API is running",
+        "is_voice_manager_initialized": voice_manager is not None
+    })
+
+
+@app.route('/api/simpletest', methods=['GET'])
+def simple_test():
+    """A very simple test that doesn't require any initialization."""
+    return jsonify({
+        "status": "ok",
+        "message": "Basic Flask API is working!",
+        "timestamp": str(uuid.uuid4())
+    })
+
+
+# Simplified function that doesn't actually load the VoiceManager
+def lazy_initialize_voice_manager(debug_mode=False):
+    """Initialize the voice manager only when needed."""
+    print("This is a placeholder for VoiceManager initialization")
+    print("For a full implementation, uncomment the VoiceManager import")
+    return None
+
+
+def parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(description="AbstractVoice Web API Example")
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    parser.add_argument("--host", default="127.0.0.1", help="Host to listen on")
+    parser.add_argument("--port", type=int, default=5000, help="Port to listen on")
+    parser.add_argument("--simulate", action="store_true",
+                        help="Simulate only, don't load models")
+    return parser.parse_args()
+
+
+def main():
+    """Entry point for the application."""
+    global voice_manager
+
+    try:
+        # Parse command line arguments
+        args = parse_args()
+
+        # Configure logging
+        import logging
+        log_level = logging.DEBUG if args.debug else logging.INFO
+        logging.basicConfig(
+            level=log_level,
+            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        )
+
+        # Configure Flask for development
+        if args.debug:
+            app.debug = True
+            app.logger.setLevel(logging.DEBUG)
+        else:
+            app.logger.setLevel(logging.INFO)
+
+        # Print startup message
+        print(f"Starting AbstractVoice Web API on {args.host}:{args.port}")
+
+        if not args.simulate:
+            print("Initializing VoiceManager (this may take a moment)...")
+            # Initialize voice manager - for real implementation, uncomment this
+            # from abstractvoice import VoiceManager
+            # voice_manager = VoiceManager(debug_mode=args.debug)
+        else:
+            print("Running in simulation mode (no models loaded)")
+
+        # Run Flask app
+        print(f"Server is ready at http://{args.host}:{args.port}")
+        print("Try these test endpoints:")
+        print(f"  http://{args.host}:{args.port}/")
+        print(f"  http://{args.host}:{args.port}/api/simpletest")
+        print("Press CTRL+C to quit")
+
+        app.run(host=args.host, port=args.port)
+
+    except KeyboardInterrupt:
+        print("\nExiting...")
+    except Exception as e:
+        print(f"Application error: {e}")
+
+
+if __name__ == "__main__":
+    main()
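With the server running (e.g. python web_api.py --simulate), the two test routes defined above can be probed from Python as well as curl. A minimal sketch, assuming the third-party requests package is installed; note that the /api/tts and /api/stt endpoints documented in the HTML template are not implemented in this example file, so only the test routes are hit:

    # Probe the example server's test endpoints; adjust the base URL
    # to match the --host/--port flags used at startup.
    import requests

    BASE = "http://127.0.0.1:5000"
    print(requests.get(f"{BASE}/api/simpletest").json())  # basic liveness check
    print(requests.get(f"{BASE}/api/test").json())        # reports voice_manager state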
abstractvoice/recognition.py
@@ -0,0 +1,252 @@
+"""Voice recognition module that combines VAD and STT."""
+
+import threading
+import time
+import pyaudio
+from .vad import VoiceDetector
+from .stt import Transcriber
+
+
+class VoiceRecognizer:
+    """Voice recognition with VAD and STT."""
+
+    def __init__(self, transcription_callback, stop_callback=None,
+                 vad_aggressiveness=1, min_speech_duration=600,
+                 silence_timeout=1500, sample_rate=16000,
+                 chunk_duration=30, whisper_model="tiny",
+                 min_transcription_length=5, debug_mode=False):
+        """Initialize voice recognizer.
+
+        Args:
+            transcription_callback: Function to call with transcription text
+            stop_callback: Function to call when "stop" is detected
+            vad_aggressiveness: VAD aggressiveness (0-3)
+            min_speech_duration: Min speech duration in ms to start recording
+            silence_timeout: Silence timeout in ms to end recording
+            sample_rate: Audio sample rate in Hz
+            chunk_duration: Audio chunk duration in ms
+            whisper_model: Whisper model name
+            min_transcription_length: Min valid transcription length
+            debug_mode: Enable debug output
+        """
+        self.debug_mode = debug_mode
+        self.transcription_callback = transcription_callback
+        self.stop_callback = stop_callback
+
+        # Configuration
+        self.sample_rate = sample_rate
+        self.chunk_duration = chunk_duration  # in ms
+        self.chunk_size = int(sample_rate * chunk_duration / 1000)
+        self.min_speech_chunks = int(min_speech_duration / chunk_duration)
+        self.silence_timeout_chunks = int(silence_timeout / chunk_duration)
+
+        # Initialize components
+        self.voice_detector = VoiceDetector(
+            aggressiveness=vad_aggressiveness,
+            sample_rate=sample_rate,
+            debug_mode=debug_mode
+        )
+
+        self.transcriber = Transcriber(
+            model_name=whisper_model,
+            min_transcription_length=min_transcription_length,
+            debug_mode=debug_mode
+        )
+
+        # State
+        self.is_running = False
+        self.thread = None
+        self.pyaudio = None
+        self.stream = None
+        self.tts_interrupt_callback = None
+        self.tts_interrupt_enabled = True  # Can be disabled during TTS playback
+        self.listening_paused = False  # Can be paused to completely stop processing audio
+
+    def start(self, tts_interrupt_callback=None):
+        """Start voice recognition in a separate thread.
+
+        Args:
+            tts_interrupt_callback: Function to call when speech is detected during listening
+
+        Returns:
+            True if started, False if already running
+        """
+        if self.is_running:
+            return False
+
+        self.tts_interrupt_callback = tts_interrupt_callback
+        self.is_running = True
+        self.thread = threading.Thread(target=self._recognition_loop)
+        self.thread.start()
+
+        if self.debug_mode:
+            print(" > Voice recognition started")
+        return True
+
+    def stop(self):
+        """Stop voice recognition.
+
+        Returns:
+            True if stopped, False if not running
+        """
+        if not self.is_running:
+            return False
+
+        self.is_running = False
+        if self.thread:
+            self.thread.join()
+
+        if self.stream:
+            self.stream.stop_stream()
+            self.stream.close()
+
+        if self.pyaudio:
+            self.pyaudio.terminate()
+
+        if self.debug_mode:
+            print(" > Voice recognition stopped")
+        return True
+
+    def _recognition_loop(self):
+        """Main recognition loop."""
+        import pyaudio
+
+        self.pyaudio = pyaudio.PyAudio()
+        self.stream = self.pyaudio.open(
+            format=pyaudio.paInt16,
+            channels=1,
+            rate=self.sample_rate,
+            input=True,
+            frames_per_buffer=self.chunk_size
+        )
+
+        speech_buffer = []
+        speech_count = 0
+        silence_count = 0
+        recording = False
+
+        while self.is_running:
+            try:
+                # If listening is paused, sleep briefly and skip processing
+                if self.listening_paused:
+                    time.sleep(0.1)
+                    continue
+
+                # Read audio data
+                audio_data = self.stream.read(self.chunk_size, exception_on_overflow=False)
+
+                # Check for speech
+                is_speech = self.voice_detector.is_speech(audio_data)
+
+                if is_speech:
+                    speech_buffer.append(audio_data)
+                    speech_count += 1
+                    silence_count = 0
+
+                    # Trigger TTS interrupt callback if enough speech detected
+                    # Only interrupt if TTS interruption is enabled (not during TTS playback)
+                    if (self.tts_interrupt_callback and
+                            self.tts_interrupt_enabled and
+                            speech_count >= self.min_speech_chunks and
+                            not recording):
+                        self.tts_interrupt_callback()
+                        if self.debug_mode:
+                            print(" > TTS interrupted by user speech")
+
+                    # Start recording after minimum speech detected
+                    if speech_count >= self.min_speech_chunks:
+                        recording = True
+
+                else:
+                    # Handle silence during recording
+                    if recording:
+                        speech_buffer.append(audio_data)
+                        silence_count += 1
+
+                        # End of speech detected
+                        if silence_count >= self.silence_timeout_chunks:
+                            if self.debug_mode:
+                                print(f" > Speech detected ({len(speech_buffer)} chunks), transcribing...")
+
+                            audio_bytes = b''.join(speech_buffer)
+                            text = self.transcriber.transcribe(audio_bytes)
+
+                            if text:
+                                # Check for stop command
+                                if text.lower() == "stop":
+                                    if self.stop_callback:
+                                        self.stop_callback()
+                                    else:
+                                        # If no stop callback, invoke transcription callback anyway
+                                        self.transcription_callback(text)
+                                else:
+                                    # Normal transcription
+                                    self.transcription_callback(text)
+
+                            # Reset state
+                            speech_buffer = []
+                            speech_count = 0
+                            silence_count = 0
+                            recording = False
+                    else:
+                        # No speech detected and not recording
+                        speech_count = max(0, speech_count - 1)
+                        if speech_count == 0:
+                            speech_buffer = []
+
+            except Exception as e:
+                if self.debug_mode:
+                    print(f"Voice recognition error: {e}")
+                continue
+
+    def change_whisper_model(self, model_name):
+        """Change the Whisper model.
+
+        Args:
+            model_name: New model name
+
+        Returns:
+            True if changed, False otherwise
+        """
+        return self.transcriber.change_model(model_name)
+
+    def change_vad_aggressiveness(self, aggressiveness):
+        """Change VAD aggressiveness.
+
+        Args:
+            aggressiveness: New aggressiveness level (0-3)
+
+        Returns:
+            True if changed, False otherwise
+        """
+        return self.voice_detector.set_aggressiveness(aggressiveness)
+
+    def pause_tts_interrupt(self):
+        """Temporarily disable TTS interruption (e.g., during TTS playback).
+
+        This prevents the system from interrupting its own speech.
+        """
+        self.tts_interrupt_enabled = False
+        if self.debug_mode:
+            print(" > TTS interrupt paused")
+
+    def resume_tts_interrupt(self):
+        """Re-enable TTS interruption after it was paused."""
+        self.tts_interrupt_enabled = True
+        if self.debug_mode:
+            print(" > TTS interrupt resumed")
+
+    def pause_listening(self):
+        """Temporarily pause audio processing entirely (e.g., during TTS in 'wait' mode).
+
+        This completely stops processing audio input while keeping the thread alive.
+        """
+        self.listening_paused = True
+        if self.debug_mode:
+            print(" > Listening paused")
+
+    def resume_listening(self):
+        """Resume audio processing after it was paused."""
+        self.listening_paused = False
+        if self.debug_mode:
+            print(" > Listening resumed")
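For reference, a minimal sketch of driving VoiceRecognizer directly, assuming a working microphone and the pyaudio, VAD, and whisper dependencies are installed; the callbacks are placeholders:

    # Listen for ~30 seconds, printing each transcription as it arrives.
    import time
    from abstractvoice.recognition import VoiceRecognizer

    def on_text(text):
        print(f"Heard: {text}")

    def on_stop():
        # Called when the user says "stop". Don't call recognizer.stop()
        # here: stop() joins the recognition thread, which is the thread
        # this callback runs on; the main thread stops it below instead.
        print("Stop command detected")

    recognizer = VoiceRecognizer(transcription_callback=on_text,
                                 stop_callback=on_stop,
                                 whisper_model="tiny",
                                 debug_mode=True)
    recognizer.start()
    time.sleep(30)
    recognizer.stop()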
abstractvoice/stt/transcriber.py
@@ -0,0 +1,138 @@
+"""Speech-to-text transcription using OpenAI's Whisper."""
+
+import whisper
+import numpy as np
+import os
+import sys
+import logging
+
+
+class Transcriber:
+    """Transcribes audio using OpenAI's Whisper model."""
+
+    def __init__(self, model_name="tiny", min_transcription_length=5, debug_mode=False):
+        """Initialize the Whisper transcriber.
+
+        Args:
+            model_name: Whisper model to use (tiny, base, etc.)
+            min_transcription_length: Minimum length of text to consider valid
+            debug_mode: Enable debug output
+        """
+        self.model_name = model_name
+        self.min_transcription_length = min_transcription_length
+        self.debug_mode = debug_mode
+
+        # Suppress Whisper output unless in debug mode
+        if not debug_mode:
+            logging.getLogger('whisper').setLevel(logging.ERROR)
+
+        try:
+            if self.debug_mode:
+                print(f" > Loading Whisper model: {model_name}")
+
+            # Redirect stdout when loading Whisper model in non-debug mode
+            orig_stdout = None
+            null_out = None
+            if not debug_mode:
+                orig_stdout = sys.stdout
+                null_out = open(os.devnull, 'w')
+                sys.stdout = null_out
+
+            # Load the Whisper model
+            self.model = whisper.load_model(model_name)
+        finally:
+            # Restore stdout if we redirected it
+            if not debug_mode and orig_stdout:
+                sys.stdout = orig_stdout
+            if 'null_out' in locals() and null_out:
+                null_out.close()
+
+    def transcribe(self, audio_data):
+        """Transcribe audio data to text.
+
+        Args:
+            audio_data: Audio data as bytes or numpy array
+
+        Returns:
+            Transcribed text or None if transcription failed or is too short
+        """
+        try:
+            # Convert bytes to numpy array if needed
+            if isinstance(audio_data, bytes):
+                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+            else:
+                audio_np = audio_data
+
+            # Redirect stdout for non-debug mode
+            orig_stdout = None
+            null_out = None
+            if not self.debug_mode:
+                orig_stdout = sys.stdout
+                null_out = open(os.devnull, 'w')
+                sys.stdout = null_out
+
+            try:
+                # Perform transcription
+                result = self.model.transcribe(audio_np, fp16=False)
+            finally:
+                # Restore stdout if we redirected it
+                if not self.debug_mode and orig_stdout:
+                    sys.stdout = orig_stdout
+                if null_out:
+                    null_out.close()
+
+            # Extract and clean text
+            text = result["text"].strip()
+
+            # Skip short transcriptions (except "stop" command)
+            if len(text) < self.min_transcription_length and text.lower() != "stop":
+                return None
+
+            if self.debug_mode:
+                print(f" > Transcribed: '{text}'")
+
+            return text
+
+        except Exception as e:
+            if self.debug_mode:
+                print(f"Transcription error: {e}")
+            return None
+
+    def change_model(self, model_name):
+        """Change the Whisper model.
+
+        Args:
+            model_name: New model name (tiny, base, etc.)
+
+        Returns:
+            True if model was changed, False otherwise
+        """
+        if model_name in ["tiny", "base", "small", "medium", "large"]:
+            if self.debug_mode:
+                print(f" > Changing Whisper model to {model_name}")
+
+            # Redirect stdout for non-debug mode
+            orig_stdout = None
+            null_out = None
+            if not self.debug_mode:
+                orig_stdout = sys.stdout
+                null_out = open(os.devnull, 'w')
+                sys.stdout = null_out
+
+            try:
+                self.model = whisper.load_model(model_name)
+                self.model_name = model_name
+            finally:
+                # Restore stdout if we redirected it
+                if not self.debug_mode and orig_stdout:
+                    sys.stdout = orig_stdout
+                if null_out:
+                    null_out.close()
+
+            if self.debug_mode:
+                print(f" > Whisper model changed to {model_name}")
+            return True
+        else:
+            if self.debug_mode:
+                print(f" > Invalid model name: {model_name}")
+            return False
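A minimal sketch of using Transcriber on a prerecorded file, assuming openai-whisper (and its ffmpeg dependency) is installed and the input is 16 kHz mono 16-bit PCM, matching what the recognizer above feeds it; the file path is hypothetical:

    # Read raw PCM frames from a WAV file and transcribe them.
    import wave
    from abstractvoice.stt import Transcriber

    transcriber = Transcriber(model_name="tiny", debug_mode=True)

    with wave.open("sample_16khz_mono.wav", "rb") as wav:
        frames = wav.readframes(wav.getnframes())  # raw 16-bit PCM bytes

    text = transcriber.transcribe(frames)  # None if too short or on failure
    print(text)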