fren-voice 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/MiraTTS.bat +27 -0
- package/MiraTTS.ps1 +29 -0
- package/PIPELINE_ARCHITECTURE.md +73 -0
- package/PIPELINE_SETUP.md +169 -0
- package/Pipeline Diagram.png +0 -0
- package/README.md +208 -0
- package/bin/cli.js +101 -0
- package/package.json +29 -0
- package/reference_file.wav +0 -0
- package/requirements.txt +36 -0
- package/voice_conversation.py +330 -0
package/MiraTTS.bat
ADDED
@@ -0,0 +1,27 @@
@echo off
REM MiraTTS Launcher
REM This batch file activates the virtual environment and runs the TTS program

cd /d "%~dp0"

echo ========================================
echo MiraTTS - Text to Speech Generator
echo ========================================
echo.

REM Check if virtual environment exists
if not exist ".venv\Scripts\activate.bat" (
    echo ERROR: Virtual environment not found!
    echo Please ensure .venv folder exists in the same directory.
    pause
    exit /b 1
)

REM Activate virtual environment and run the program
call .venv\Scripts\activate.bat
python tts.py

REM Deactivate when done
deactivate

pause
package/MiraTTS.ps1
ADDED
@@ -0,0 +1,29 @@
# MiraTTS PowerShell Launcher
# This script activates the virtual environment and runs the TTS program

$scriptPath = Split-Path -Parent $MyInvocation.MyCommand.Path
Set-Location $scriptPath

Write-Host "========================================"
Write-Host "MiraTTS - Text to Speech Generator"
Write-Host "========================================"
Write-Host ""

# Check if virtual environment exists
if (-not (Test-Path ".venv\Scripts\Activate.ps1")) {
    Write-Host "ERROR: Virtual environment not found!" -ForegroundColor Red
    Write-Host "Please ensure .venv folder exists in the same directory."
    Read-Host "Press Enter to exit"
    exit 1
}

# Activate virtual environment
& ".venv\Scripts\Activate.ps1"

# Run the TTS program
& python tts.py

# Deactivate when done
deactivate

Read-Host "Press Enter to exit"
package/PIPELINE_ARCHITECTURE.md
ADDED
@@ -0,0 +1,73 @@
# Project Pipeline Architecture

This document illustrates the technical data flow and component integration of the Voice Conversational AI System.

## Technical Flow Diagram

```mermaid
graph TD
    %% Entry Point
    Start((System Start)) --> ShadowMode[Shadow Listening Mode]

    %% Wake-Word Logic
    ShadowMode -->|Detects 'Hey' / 'Start'| AudioCue[Audio Cue: Ascending Beep]
    AudioCue --> MainLoop[Active Conversation Loop]

    %% Input Pipeline
    MainLoop --> MicInput[Microphone Input]
    MicInput --> VAD{VAD Process}
    VAD -->|Speaking| Buffer[Accumulate Audio Buffer]
    VAD -->|Silence > 0.8s| StopRec[Terminate Recording]

    %% Inference Pipeline
    StopRec --> STT[Faster-Whisper STT Engine]
    STT -->|Text| LLM[LM Studio API Inference]
    LLM -->|Response Script| PreProcess[Text Normalization]

    %% Output Pipeline
    PreProcess --> TTS[MiraTTS Neural Synthesis]
    TTS -->|PCM Audio| SyncExec{Parallel Execution}

    SyncExec -->|Thread 1| Playback[PyAudio Memory Playback]
    SyncExec -->|Thread 2| Visuals[Fast Typeface Animation]

    %% State Management
    Playback & Visuals --> StateCheck{Command Check}
    StateCheck -->|'Pause'| PauseCue[Audio Cue: Descending Beep]
    PauseCue --> ShadowMode

    StateCheck -->|'Deactivate'| Shutdown((System Shutdown))
    StateCheck -->|Next Turn| MainLoop
```

## Component Roles

### 1. Input Layer (VAD & STT)

- **VAD (Voice Activity Detection)**: Monitors RMS energy levels in real time. It cuts the recording loop immediately after 800ms of sustained quiet, significantly reducing response latency compared to fixed-length recording (see the sketch below).
- **Faster-Whisper**: Processes the audio buffer into text. It uses the CTranslate2 backend for high-speed inference on NVIDIA GPUs.

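As a rough illustration of that cutoff logic (a simplified sketch of what `voice_conversation.py` does, not the exact implementation), the silence counter works roughly like this:

```python
import numpy as np

SAMPLE_RATE = 16000        # Hz, matches the recording rate used by the script
CHUNK = 1024               # samples per buffer
SILENCE_THRESHOLD = 500    # RMS level treated as "quiet"
SILENCE_DURATION = 0.8     # seconds of quiet that end the recording

def should_stop(chunks):
    """Return True once speech has started and ~0.8 s of quiet has followed."""
    max_silent = int(SILENCE_DURATION * SAMPLE_RATE / CHUNK)
    silent_chunks, started_talking = 0, False
    for raw in chunks:  # raw: bytes from a paInt16 microphone stream
        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
        rms = np.sqrt(np.mean(samples ** 2))
        if rms > SILENCE_THRESHOLD:
            started_talking, silent_chunks = True, 0
        elif started_talking:
            silent_chunks += 1
            if silent_chunks > max_silent:
                return True
    return False
```
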
### 2. Cognitive Layer (LLM)

- **LM Studio Gateway**: Acts as the brain. The system sends conversational context to a local OpenAI-compatible endpoint (see the sketch below).
- **System Prompting**: Enforces concise, spoken-word-friendly responses (1-3 sentences) to maintain conversational speed.

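The gateway call itself is a plain OpenAI-style chat-completions request over HTTP. A minimal sketch (the model name and prompt here are placeholders; LM Studio answers with whichever model is currently loaded):

```python
import requests

LLM_API_URL = "http://127.0.0.1:1234/v1/chat/completions"

def ask(user_message: str) -> str:
    """Send one turn to the local LM Studio server and return the reply text."""
    payload = {
        "model": "local-model",   # placeholder; LM Studio serves the loaded model
        "messages": [
            {"role": "system", "content": "You are a helpful voice assistant. "
                                          "Keep replies to 1-3 spoken sentences."},
            {"role": "user", "content": user_message},
        ],
        "temperature": 0.5,
    }
    resp = requests.post(LLM_API_URL, json=payload, timeout=60)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"].strip()

if __name__ == "__main__":
    print(ask("Hello, how are you?"))
```
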
### 3. Synthesis Layer (TTS)

- **MiraTTS**: A neural reference-based synthesis engine. It generates high-fidelity audio tokens from text based on the `reference_file.wav` provided at initialization.
- **Neural Codecs**: Utilizes `ncodec` and `FlashSR` for internal audio super-resolution and decoding.

### 4. Output Layer (Synchronized IO)

- **PyAudio Memory Buffer**: Audio is played directly from RAM. No disk I/O is involved during playback, preventing micro-stutters.
- **Threaded UI**: The terminal typewriter effect runs in a parallel thread, synchronized with a 1.0 s lead-in to allow the sound driver to initialize (see the sketch below).

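A stripped-down sketch of that hand-off (the 48 kHz playback rate and 1.0 s lead-in mirror the values used in `voice_conversation.py`; everything else is illustrative):

```python
import threading
import time

import numpy as np
import pyaudio

def play_and_type(audio, text, rate=48000):
    """Play float PCM audio straight from RAM while a second thread types the reply."""
    pa = pyaudio.PyAudio()

    def play():
        pcm = (np.asarray(audio) * 32767).astype(np.int16)   # float [-1, 1] -> int16
        stream = pa.open(format=pyaudio.paInt16, channels=1, rate=rate, output=True)
        stream.write(pcm.tobytes())
        stream.stop_stream()
        stream.close()

    def type_text():
        time.sleep(1.0)                    # lead-in so the sound driver can start
        for ch in text:
            print(ch, end="", flush=True)
            time.sleep(0.04)
        print()

    threads = [threading.Thread(target=play), threading.Thread(target=type_text)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    pa.terminate()
```
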
## Lifecycle States

| State            | Description                                   | Audio Cue       |
| :--------------- | :-------------------------------------------- | :-------------- |
| **Shadow Mode**  | Silent background monitoring for wake-words.  | None            |
| **Activation**   | Transition into active conversation.          | Ascending Beep  |
| **Processing**   | Actively transcribing or generating.          | None            |
| **Deactivation** | Transition back into Shadow Mode.             | Descending Beep |
| **Termination**  | Full process shutdown.                        | None            |
package/PIPELINE_SETUP.md
ADDED
@@ -0,0 +1,169 @@
# Complete Conversational AI Pipeline Setup

## Architecture Overview

```
Audio Input → Whisper (STT) → LM Studio (LLM) → MiraTTS (TTS) → Audio Output
```

## Components

### 1. Whisper WebSocket Server (Speech-to-Text)

- **File**: `ws_whisper_server.py`
- **Endpoint**: `ws://127.0.0.1:8000/ws/transcribe`
- **Status**: ✅ RUNNING
- **Model**: faster-whisper (small, CUDA)
- **Environment**: `venv`

### 2. LM Studio (Language Model)

- **Endpoint**: `http://127.0.0.1:1234/v1/chat/completions`
- **Status**: ✅ RUNNING
- **Available Models**:
  - orpheus-3b-0.1-ft
  - llama-3.1-8b-lexi-uncensored-v2

### 3. MiraTTS (Text-to-Speech)

- **File**: `tts_test.py`
- **Environment**: `.venv` (with dot)
- **GPU Enforcement**: ✅ STRICT (ONNX CUDA only)
- **Reference Audio**: `reference_file.wav`

### 4. Pipeline Integration

- **File**: `whisper_to_llm_pipeline.py`
- **Environment**: `venv`
- **Features**:
  - Interactive text chat with LLM
  - Audio file transcription via Whisper
  - Complete STT → LLM flow

## Running the Complete Pipeline

### Step 1: Start Whisper Server (Already Running)

```powershell
.\venv\Scripts\Activate.ps1
uvicorn ws_whisper_server:app --host 0.0.0.0 --port 8000
```

### Step 2: Ensure LM Studio is Running (Already Running)

- LM Studio should be running on port 1234
- Load your preferred model

### Step 3: Run the Pipeline

```powershell
.\venv\Scripts\Activate.ps1
python whisper_to_llm_pipeline.py
```

**Interactive Mode Commands:**

- Type any message → sends directly to LLM
- `audio <filepath>` → transcribes audio then sends to LLM
- `clear` → clears conversation history
- `exit` → quit

### Step 4: Add TTS Output (Use tts_test.py separately)

For now, use `tts_test.py` in the `.venv` environment to generate speech from LLM responses.

## Environment Details

### `venv` (Whisper + Pipeline)

Packages installed:

- faster-whisper
- fastapi
- uvicorn
- websockets
- soundfile
- numpy
- requests
- pyaudio

### `.venv` (MiraTTS)

Packages installed:

- mira (MiraTTS)
- onnxruntime-gpu (CUDA)
- soundfile
- numpy
- requests

## GPU Requirements

### tts_test.py - STRICT GPU ENFORCEMENT ✅

```python
# Force ONNX Runtime to use CUDA only - fail if CUDA is not available
os.environ['ORT_CUDA_UNAVAILABLE'] = '0'

# Verify ONNX Runtime has CUDA before proceeding
has_gpu = any(p in providers for p in ['CUDAExecutionProvider', 'TensorrtExecutionProvider'])
if not has_gpu:
    print("ERROR: No GPU provider available for ONNX Runtime!")
    sys.exit(1)
```

### ws_whisper_server.py - GPU Configured

```python
model = WhisperModel(
    "small",
    device="cuda",  # GPU only
    compute_type="float16"
)
```

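For reference, the `providers` list used in the `tts_test.py` excerpt comes from ONNX Runtime; a self-contained version of that check might look like this (sketch only):

```python
import sys

import onnxruntime as ort

# Ask ONNX Runtime which execution providers this install actually exposes.
providers = ort.get_available_providers()
print("ONNX Runtime providers:", providers)

has_gpu = any(p in providers for p in ["CUDAExecutionProvider", "TensorrtExecutionProvider"])
if not has_gpu:
    print("ERROR: No GPU provider available for ONNX Runtime!")
    sys.exit(1)
```
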
## Testing the Pipeline

### Test 1: Text to LLM

```
> Hello, how are you?
🤖 Sending to LM Studio...
✅ Assistant: [LLM Response]
```

### Test 2: Audio to LLM

```
> audio reference_file.wav
🎤 Transcribing: reference_file.wav
[Whisper]: [Transcribed text]
✅ Transcription complete: "[text]"
🤖 Sending to LM Studio...
✅ Assistant: [LLM Response]
```

### Test 3: Full Pipeline (Manual)

1. Record/prepare audio file
2. Run: `audio myaudio.wav` in pipeline
3. Get transcription → LLM response
4. Copy LLM response
5. Run `tts_test.py` in `.venv` environment
6. Paste response to generate speech

## Current Status

✅ Whisper Server: RUNNING on port 8000
✅ LM Studio: RUNNING on port 1234
✅ Pipeline Script: READY
✅ TTS (tts_test.py): READY with GPU enforcement
⏳ Full integration: Requires merging environments or API bridge

## Next Steps for Full Integration

To create a single script with all three components:

1. Install MiraTTS in the `venv` environment, OR
2. Install Whisper packages in the `.venv` environment, OR
3. Create a REST API wrapper around tts_test.py for the pipeline to call (see the sketch below)
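Option 3 could look roughly like the sketch below. The endpoint name, port, and WAV response shape are illustrative assumptions; the MiraTTS calls mirror the ones in `voice_conversation.py`:

```python
# Hypothetical REST bridge so the `venv` pipeline can request speech from the
# `.venv` MiraTTS environment. Run inside `.venv`:
#   uvicorn tts_server:app --port 8001
import io

import soundfile as sf
from fastapi import FastAPI, Response
from pydantic import BaseModel

from mira.model import MiraTTS

app = FastAPI()
tts = MiraTTS("YatharthS/MiraTTS")                 # same model id as voice_conversation.py
context = tts.encode_audio("reference_file.wav")   # reference voice for cloning

class SpeakRequest(BaseModel):
    text: str

@app.post("/tts")
def speak(req: SpeakRequest) -> Response:
    audio = tts.generate(req.text, context)        # float PCM from MiraTTS
    buf = io.BytesIO()
    sf.write(buf, audio, 48000, format="WAV")      # 48 kHz matches the script's playback rate
    return Response(content=buf.getvalue(), media_type="audio/wav")
```

The pipeline side would then POST the LLM reply to `http://127.0.0.1:8001/tts` and play the returned WAV bytes.
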
package/Pipeline Diagram.png
ADDED
Binary file
package/README.md
ADDED
@@ -0,0 +1,208 @@
# Conversational AI Fren

## Overview

This project implements a high-performance, low-latency voice conversational pipeline. It integrates Faster-Whisper for Speech-to-Text (STT), LM Studio for Large Language Model (LLM) inference, and MiraTTS for neural Text-to-Speech (TTS).

## Core Components

- **Speech-to-Text**: `faster-whisper` (medium model) utilizing CUDA 12.1 for near-instant transcription.
- **Language Model**: Inference via OpenAI-compatible API (standard port 1234).
- **Text-to-Speech**: `MiraTTS` neural cloning model for natural vocal responses.
- **Logic Engine**: Monolithic Python implementation with asynchronous VAD and threading.

## Features

- **Voice-to-Start**: The system initializes in a silent "Shadow Mode" and activates via wake words ("Start" or "Hey").
- **Voice Activity Detection (VAD)**: Intelligent recording cutoff (0.8s silence threshold) to eliminate fixed-duration delays.
- **Shadow Listening (Pause Mode)**: Background monitoring with zero TUI output. Transition cues are provided via audio beeps.
- **Audio Transition Cues**: When activating and deactivating, the system emits a 1-second beep to signal the transition.

## System Prerequisites

1. **Python 3.10+**: Recommended for compatibility with `onnxruntime-gpu` and `torch`.
2. **FFmpeg**: Must be installed and available in the system PATH (used for loudness normalization and silence removal).
3. **CUDA 12.1 & cuDNN**: Required for GPU acceleration.
4. **LM Studio**: Local server must be active on `http://127.0.0.1:1234`.

## Quick Start (the easy way)

You can now initialize and run the entire pipeline with a single command using `npx`:

```bash
npx fren-voice
```

This will automatically:

1. Clone the repository (if not already present).
2. Create a virtual environment.
3. Install all CUDA-optimized dependencies.
4. Launch the AI conversation.

---

## Installation (Traditional)

### 1. Prerequisites

- **Python 3.10+**: Download and install from [python.org](https://www.python.org/).
- **Git**: Required to pull specific model repositories.
- **FFmpeg**:
  1. Download and install FFmpeg following the official guides at [ffmpeg.org](https://ffmpeg.org/download.html) or the [official GitHub repository](https://github.com/FFmpeg/FFmpeg).
  2. Ensure the `ffmpeg` executable is added to your system **PATH**.
- **LM Studio**: Install it and load a GGUF model (e.g., Llama 3 or Mistral). Start the Local Server on port 1234.

### 2. Setup Virtual Environment

It is highly recommended to use a virtual environment to manage dependencies:

**Windows:**

```powershell
python -m venv .venv
.\.venv\Scripts\activate
```

**Linux:**

```bash
python3 -m venv .venv
source .venv/bin/activate
```

### 3. Install Dependencies

Install all core engines and CUDA-optimized libraries:

**Windows:**

```powershell
pip install -r requirements.txt
```

**Linux:**

```bash
pip3 install -r requirements.txt
```

### 4. Hardware Verification

Ensure your NVIDIA drivers are up to date. The system will automatically attempt to locate the necessary CUDA/cuDNN DLLs within your site-packages if they are missing from the system PATH.

## Configuration

- **Reference Voice**: The system requires `reference_file.wav` in the root directory for TTS voice cloning.
- **Microphone**: Uses the default input device; this can be changed by setting `input_device_index` in the `record_audio_vad` method if necessary (the sketch below shows how to list available device indices).
- **Environment Variables**: The script automatically sets the following for stability:
  - `TM_MAX_CONTEXT_TOKEN_NUM="12000"` (LMDeploy buffer)
  - `ORT_CUDA_DISABLE_CUDNN_FRONTEND="1"` (ONNX CUDA stability)

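A quick way to list candidate values for `input_device_index` (an illustrative helper, not part of the scripts above):

```python
import pyaudio

# Print every input-capable device with its index so one can be chosen
# for input_device_index in record_audio_vad.
pa = pyaudio.PyAudio()
for i in range(pa.get_device_count()):
    info = pa.get_device_info_by_index(i)
    if info.get("maxInputChannels", 0) > 0:
        print(f"{i}: {info['name']} ({int(info['maxInputChannels'])} input channels)")
pa.terminate()
```
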
## Usage

Execute the main script:

**Windows:**

```powershell
python voice_conversation.py
```

**Linux:**

```bash
python3 voice_conversation.py
```

### Voice Commands (can be changed in the code as required)

- **Activate**: "Start", "Hey", or "Continue".
- **Pause**: "Pause", "Exit", "Quit", or "Goodbye".
- **Terminate**: "Deactivate", "Terminate", or "Shut down".

## Project Structure

- `voice_conversation.py`: Core system logic and TUI.
- `requirements.txt`: Unified dependency manifest for CUDA-enabled environments.
- `README.md`: Technical documentation and setup guide.
- `reference_file.wav`: Target voice profile for TTS.
- `MiraTTS.bat` / `.ps1`: Quick-launch scripts.

---

# Project Pipeline Architecture

This section illustrates the technical data flow and component integration of the Voice Conversational AI System.

## Technical Flow Diagram



```mermaid
graph TD
    %% Entry Point
    Start((System Start)) --> ShadowMode[Shadow Listening Mode]

    %% Wake-Word Logic
    ShadowMode -->|Detects 'Hey' / 'Start'| AudioCue[Audio Cue: Ascending Beep]
    AudioCue --> MainLoop[Active Conversation Loop]

    %% Input Pipeline
    MainLoop --> MicInput[Microphone Input]
    MicInput --> VAD{VAD Process}
    VAD -->|Speaking| Buffer[Accumulate Audio Buffer]
    VAD -->|Silence > 0.8s| StopRec[Terminate Recording]

    %% Inference Pipeline
    StopRec --> STT[Faster-Whisper STT Engine]
    STT -->|Text| LLM[LM Studio API Inference]
    LLM -->|Response Script| PreProcess[Text Normalization]

    %% Output Pipeline
    PreProcess --> TTS[MiraTTS Neural Synthesis]
    TTS -->|PCM Audio| SyncExec{Parallel Execution}

    SyncExec -->|Thread 1| Playback[PyAudio Memory Playback]
    SyncExec -->|Thread 2| Visuals[Fast Typeface Animation]

    %% State Management
    Playback & Visuals --> StateCheck{Command Check}
    StateCheck -->|'Pause'| PauseCue[Audio Cue: Descending Beep]
    PauseCue --> ShadowMode

    StateCheck -->|'Deactivate'| Shutdown((System Shutdown))
    StateCheck -->|Next Turn| MainLoop
```

## Component Roles

### 1. Input Layer (VAD & STT)

- **VAD (Voice Activity Detection)**: Monitors RMS energy levels in real time. It cuts the recording loop immediately after 800ms of sustained quiet, significantly reducing response latency compared to fixed-length recording.
- **Faster-Whisper**: Processes the audio buffer into text. It uses the CTranslate2 backend for high-speed inference on NVIDIA GPUs.

### 2. Cognitive Layer (LLM)

- **LM Studio Gateway**: Acts as the brain. The system sends conversational context to a local OpenAI-compatible endpoint.
- **System Prompting**: Enforces concise, spoken-word-friendly responses (1-3 sentences) to maintain conversational speed.

### 3. Synthesis Layer (TTS)

- **MiraTTS**: A neural reference-based synthesis engine. It generates high-fidelity audio tokens from text based on the `reference_file.wav` provided at initialization.
- **Neural Codecs**: Utilizes `ncodec` and `FlashSR` for internal audio super-resolution and decoding.

### 4. Output Layer (Synchronized IO)

- **PyAudio Memory Buffer**: Audio is played directly from RAM. No disk I/O is involved during playback, preventing micro-stutters.
- **Threaded UI**: The terminal typewriter effect runs in a parallel thread, synchronized with a 1.0 s lead-in to allow the sound driver to initialize.

## Lifecycle States

| State            | Description                                   | Audio Cue       |
| :--------------- | :-------------------------------------------- | :-------------- |
| **Shadow Mode**  | Silent background monitoring for wake-words.  | None            |
| **Activation**   | Transition into active conversation.          | Ascending Beep  |
| **Processing**   | Actively transcribing or generating.          | None            |
| **Deactivation** | Transition back into Shadow Mode.             | Descending Beep |
| **Termination**  | Full process shutdown.                        | None            |
package/bin/cli.js
ADDED
@@ -0,0 +1,101 @@
#!/usr/bin/env node

const { execSync } = require("child_process");
const fs = require("fs-extra");
const path = require("path");
const chalk = require("chalk");
const ora = require("ora");
const prompts = require("prompts");

async function main() {
  console.log(chalk.cyan.bold("\n 🎙️ FREN: AI Voice Companion\n"));

  const currentDir = process.cwd();
  const isProjectDir = await fs.pathExists(
    path.join(currentDir, "voice_conversation.py")
  );

  if (!isProjectDir) {
    const response = await prompts({
      type: "confirm",
      name: "setup",
      message:
        "Fren is not initialized in this directory. Would you like to clone the repository here?",
      initial: true,
    });

    if (response.setup) {
      const spinner = ora("Cloning repository...").start();
      try {
        execSync("git clone https://github.com/lakshya-p/Fren.git .", {
          stdio: "ignore",
        });
        spinner.succeed("Repository cloned successfully!");
      } catch (err) {
        spinner.fail("Failed to clone repository. Make sure git is installed.");
        process.exit(1);
      }
    } else {
      console.log(
        chalk.yellow(
          "Aborted. Please navigate to a Fren directory or initialize it."
        )
      );
      process.exit(0);
    }
  }

  // Check for Virtual Environment
  const venvPath = path.join(currentDir, ".venv");
  const pythonExec =
    process.platform === "win32"
      ? path.join(venvPath, "Scripts", "python.exe")
      : path.join(venvPath, "bin", "python");

  if (!(await fs.pathExists(venvPath))) {
    const setupVenv = await prompts({
      type: "confirm",
      name: "proceed",
      message:
        "No virtual environment found. Would you like to create one and install dependencies?",
      initial: true,
    });

    if (setupVenv.proceed) {
      const spinner = ora("Creating virtual environment...").start();
      try {
        execSync("python -m venv .venv");
        spinner.text =
          "Installing dependencies (this may take a few minutes)...";
        execSync(`"${pythonExec}" -m pip install -r requirements.txt`, {
          stdio: "inherit",
        });
        spinner.succeed("Setup complete!");
      } catch (err) {
        spinner.fail("Setup failed. Ensure Python 3.10+ is installed.");
        console.error(err);
        process.exit(1);
      }
    } else {
      console.log(chalk.yellow("Cannot proceed without dependencies."));
      process.exit(1);
    }
  }

  // Final confirmation to run
  console.log(chalk.green("\n🚀 Starting Fren AI...\n"));
  try {
    execSync(`"${pythonExec}" voice_conversation.py`, { stdio: "inherit" });
  } catch (err) {
    console.error(
      chalk.red(
        "\nFren closed with an error. Check if LM Studio is running on port 1234."
      )
    );
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
package/package.json
ADDED
@@ -0,0 +1,29 @@
{
  "name": "fren-voice",
  "version": "1.0.0",
  "description": "High-performance voice conversational AI system.",
  "main": "index.js",
  "bin": {
    "fren": "bin/cli.js"
  },
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [
    "voice",
    "ai",
    "stt",
    "tts",
    "llm",
    "conversational"
  ],
  "author": "lakshya-p",
  "license": "MIT",
  "dependencies": {
    "chalk": "^4.1.2",
    "execa": "^5.1.1",
    "fs-extra": "^10.0.0",
    "ora": "^5.4.1",
    "prompts": "^2.4.2"
  }
}
package/reference_file.wav
ADDED
Binary file
package/requirements.txt
ADDED
@@ -0,0 +1,36 @@
--extra-index-url https://download.pytorch.org/whl/cu121

# Core Modules (Git Repositories)
git+https://github.com/ysharma3501/MiraTTS.git
git+https://github.com/ysharma3501/FastBiCodec.git
git+https://github.com/ysharma3501/FlashSR.git

# Inference Engines
faster-whisper==1.2.1
onnxruntime-gpu==1.23.2
lmdeploy==0.11.0

# PyTorch (CUDA 12.1 optimized)
torch==2.5.1+cu121
torchaudio==2.5.1+cu121
torchvision==0.20.1+cu121

# Audio Processing & IO
pyaudio>=0.2.14
soundfile>=0.13.1
librosa>=0.11.0
numpy==1.26.4

# API & Networking
requests>=2.32.5
aiohttp>=3.13.2
fastapi>=0.127.0
uvicorn>=0.40.0
python-multipart

# Support & Performance
transformers>=4.56.1
pydantic>=2.12.5
tqdm>=4.67.1
typing-extensions>=4.13.0
ffmpeg-python
package/voice_conversation.py
ADDED
@@ -0,0 +1,330 @@
"""
STABLE VOICE CONVERSATIONAL AI (Wake-Word + Audio Cues)
Optimizations:
- VAD (Voice Activity Detection): Stop recording early when silence is detected.
- Silent Wake-Word Detection: Back-to-life on "Hey", "Fren", or "Continue".
- Audio Cues: Beep on pause/resume.
"""
import os
import sys

# Log suppression
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ['ORT_CUDA_DISABLE_CUDNN_FRONTEND'] = '1'

import asyncio
import requests
import numpy as np
import soundfile as sf
import subprocess
import pyaudio
import tempfile
import threading
import re
import json
import logging
import warnings
import time
from datetime import datetime
from faster_whisper import WhisperModel

# Global Silence Configuration
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)
logging.getLogger("lmdeploy").setLevel(logging.ERROR)
logging.getLogger().setLevel(logging.ERROR)

# Fix for CUDA DLLs on Windows
if sys.platform == "win32":
    for venv_name in [".venv", "venv"]:
        nvidia_dir = os.path.join(os.path.dirname(__file__), venv_name, "Lib", "site-packages", "nvidia")
        if os.path.exists(nvidia_dir):
            for sub in ["cudnn", "cublas", "cuda_runtime", "curand", "cusolver", "cusparse"]:
                bin_path = os.path.join(nvidia_dir, sub, "bin")
                if os.path.exists(bin_path):
                    os.environ["PATH"] = bin_path + os.pathsep + os.environ["PATH"]

# Set ONNX Severity
try:
    import onnxruntime as ort
    ort.set_default_logger_severity(3)
except: pass

os.environ['TM_MAX_CONTEXT_TOKEN_NUM'] = '12000'
from mira.model import MiraTTS

# Configuration
LLM_API_URL = "http://127.0.0.1:1234/v1/chat/completions"
LLM_MODELS_URL = "http://127.0.0.1:1234/v1/models"
SAMPLE_RATE = 16000
MAX_RECORD_SECONDS = 6
SILENCE_THRESHOLD = 500  # RMS threshold for silence
SILENCE_DURATION = 0.8  # Stop after 0.8s of silence

class VoiceConversationalAI:
    def __init__(self, reference_audio_path):
        print("\n" + "="*70)
        print("🎙️ VOICE CONVERSATIONAL AI SYSTEM")
        print("="*70)

        # Initialize Whisper
        print("\n[1/4] Loading Whisper model...")
        try:
            self.whisper = WhisperModel("medium", device="cuda", compute_type="float16")
            print(" ✓ Whisper loaded on CUDA (medium)")
        except:
            self.whisper = WhisperModel("small", device="cpu", compute_type="int8")
            print(" ✓ Whisper loaded on CPU (small)")

        # Initialize TTS
        print("\n[2/4] Loading MiraTTS model...")
        self.mira_tts = MiraTTS('YatharthS/MiraTTS')
        self.mira_tts.set_params(temperature=0.6, top_p=0.92, max_new_tokens=1536)
        print(" ✓ MiraTTS loaded")

        print("\n[3/4] Encoding reference audio...")
        self.context_tokens = self.mira_tts.encode_audio(reference_audio_path)
        print(" ✓ Reference audio encoded")

        self.audio = pyaudio.PyAudio()
        self.turn_count = 0

        print("\n" + "="*70)
        print("[OK] System Ready!")
        print("="*70)

    def play_cue(self, cue_type='resume'):
        """Simple sine wave beep cue"""
        def run_cue():
            try:
                duration = 0.2
                fs = 44000
                if cue_type == 'resume':
                    # Ascending: 400Hz to 800Hz
                    f1, f2 = 400, 800
                else:
                    # Descending: 800Hz to 400Hz
                    f1, f2 = 800, 400

                t = np.linspace(0, duration, int(fs * duration))
                # Frequency sweep
                f = np.linspace(f1, f2, len(t))
                samples = 0.9 * np.sin(2 * np.pi * f * t)  # Increased volume to 0.9

                # Fade out to avoid click
                fade = np.linspace(1, 0, len(samples))
                samples = (samples * fade).astype(np.float32)

                # Standard playback
                stream = self.audio.open(format=pyaudio.paFloat32, channels=1, rate=fs, output=True)
                stream.write(samples.tobytes())
                stream.stop_stream()
                stream.close()
            except: pass

        threading.Thread(target=run_cue).start()

    async def record_audio_vad(self, silent=False):
        """Record with VAD: stop early when silence is detected"""
        if not silent: print(f"\nMIC: Listening...")

        stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=SAMPLE_RATE,
            input=True,
            input_device_index=3,
            frames_per_buffer=1024
        )

        frames = []
        silent_chunks = 0
        max_silent_chunks = int(SILENCE_DURATION * SAMPLE_RATE / 1024)
        started_talking = False

        for i in range(0, int(SAMPLE_RATE / 1024 * (3 if silent else MAX_RECORD_SECONDS))):
            data = stream.read(1024, exception_on_overflow=False)
            frames.append(data)

            audio_data = np.frombuffer(data, dtype=np.int16)
            rms = np.sqrt(np.mean(audio_data.astype(np.float32)**2))

            if rms > SILENCE_THRESHOLD:
                started_talking = True
                silent_chunks = 0
            elif started_talking:
                silent_chunks += 1
                if silent_chunks > max_silent_chunks:
                    break

        stream.stop_stream()
        stream.close()
        return b''.join(frames)

    def transcribe_sync(self, audio_bytes, silent=False):
        """Standard transcription or silent background check"""
        if not silent: print("📝 Transcribing...")

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_name = f.name
        with sf.SoundFile(temp_name, 'w', 16000, 1) as sf_file:
            sf_file.write(np.frombuffer(audio_bytes, dtype=np.int16))

        clean_name = temp_name.replace(".wav", "_clean.wav")
        try:
            subprocess.run(["ffmpeg", "-y", "-i", temp_name, "-ac", "1", "-ar", "16000", "-af", "loudnorm,silenceremove=stop_periods=-1:stop_duration=0.5:stop_threshold=-40dB", clean_name], capture_output=True, timeout=5)
            trans_file = clean_name if os.path.exists(clean_name) else temp_name
        except: trans_file = temp_name

        try:
            segments, _ = self.whisper.transcribe(trans_file, language="en", temperature=0, beam_size=1, vad_filter=True)
            text = "".join(s.text for s in segments).strip()

            if not silent:
                if text: print(f" ✓ You said: \"{text}\"")
                else: print(f" WARN: No speech detected")

            return text
        except: return ""
        finally:
            for f in [temp_name, clean_name]:
                if os.path.exists(f):
                    try: os.remove(f)
                    except: pass

    async def background_wait_for_wake_word(self, initial=False):
        """Silently listen for 'Hey', 'Fren', or 'Continue'"""
        if initial:
            print("\n🎙️ System initialized. Say 'Hey Fren' or 'Start' to begin...")
        else:
            self.play_cue('pause')  # Audio cue for pausing
            print("\n⏸️ System Paused. Listening for 'Hey', 'Fren', or 'Continue'...")

        keywords = ['hey', 'fren', 'continue', 'start']

        while True:
            # Silent record (3 seconds chunks)
            audio_data = await self.record_audio_vad(silent=True)

            # Silent transcribe
            text = await asyncio.to_thread(self.transcribe_sync, audio_data, silent=True)

            if text and any(kw in text.lower() for kw in keywords):
                self.play_cue('resume')  # Audio cue for resuming
                print("\n🎙️ System Re-activated!" if not initial else "\n🎙️ System Started!")
                return True

            await asyncio.sleep(0.1)

    def query_llm_sync(self, user_message):
        try:
            try: model_id = requests.get(LLM_MODELS_URL, timeout=5).json()["data"][0]["id"]
            except: model_id = "local-model"

            response = requests.post(
                LLM_API_URL,
                json={
                    "model": model_id,
                    "messages": [
                        {"role": "system", "content": "You are a helpful voice assistant. Respond naturally and conversationally. Keep it concise (1-3 sentences). Only output plain spoken English."},
                        {"role": "user", "content": user_message}
                    ],
                    "temperature": 0.5
                },
                timeout=60
            )
            if response.status_code == 200:
                msg = response.json()['choices'][0]['message']['content']
                msg = re.sub(r"<[^>]+>", "", msg)
                msg = re.sub(r"\*[^*]+\*", "", msg)
                return msg.strip()
            return "I'm having trouble connecting."
        except Exception as e:
            return f"Error: {e}"

    def generate_and_play_speech(self, text):
        if not text: return
        print(f"\nSPEAKER: Generating speech...")

        try:
            import io, contextlib
            with contextlib.redirect_stdout(io.StringIO()):
                audio = self.mira_tts.generate(text, self.context_tokens)

            def play():
                audio_int16 = (np.array(audio) * 32767).astype(np.int16)
                stream = self.audio.open(format=pyaudio.paInt16, channels=1, rate=48000, output=True)
                stream.write(audio_int16.tobytes())
                stream.stop_stream()
                stream.close()

            def type_text():
                time.sleep(1.0)  # Delay start to sync with audio buffering
                print(f"\n💬 Fren: ", end="", flush=True)
                for char in text:
                    print(char, end="", flush=True)
                    time.sleep(0.04)  # Faster typeface animation
                print("\n")

            play_thread = threading.Thread(target=play)
            type_thread = threading.Thread(target=type_text)

            play_thread.start()
            type_thread.start()

            play_thread.join()
            type_thread.join()

        except Exception as e:
            print(f" [ERR] TTS Error: {e}")

    async def run(self):
        print("\n" + "="*70)
        print("🎙️ VOICE CONVERSATION MODE")
        print("="*70)

        # Start with silent VAD instead of Enter
        await self.background_wait_for_wake_word(initial=True)

        try:
            while True:
                print("\n" + "-"*40)
                # Step 1: Record with VAD
                audio_data = await self.record_audio_vad()

                # Step 2: Transcribe
                transcription = await asyncio.to_thread(self.transcribe_sync, audio_data)

                if transcription:
                    # Check for termination
                    if any(word in transcription.lower() for word in ['deactivate', 'terminate', 'shut down']):
                        print("\n👋 System Deactivating. Goodbye!")
                        break

                    # Check for pause/exit
                    if any(word in transcription.lower() for word in ['exit', 'quit', 'goodbye', 'bye', 'pause']):
                        # Enter Silent Wake Mode
                        await self.background_wait_for_wake_word()
                        continue

                    # Step 3: Query LLM
                    response = await asyncio.to_thread(self.query_llm_sync, transcription)

                    # Step 4: Playback
                    await asyncio.to_thread(self.generate_and_play_speech, response)

                print("\n⏸️ Ready for next turn...")
                await asyncio.sleep(0.5)

        except KeyboardInterrupt: print("\n👋 Interrupt received. Exiting.")
        finally: self.audio.terminate()

async def main():
    ref_file = "reference_file.wav"
    if not os.path.exists(ref_file): return
    ai = VoiceConversationalAI(ref_file)
    await ai.run()

if __name__ == "__main__":
    asyncio.run(main())