fren-voice 1.0.0

package/MiraTTS.bat ADDED
@@ -0,0 +1,27 @@
1
+ @echo off
2
+ REM MiraTTS Launcher
3
+ REM This batch file activates the virtual environment and runs the TTS program
4
+
5
+ cd /d "%~dp0"
6
+
7
+ echo ========================================
8
+ echo MiraTTS - Text to Speech Generator
9
+ echo ========================================
10
+ echo.
11
+
12
+ REM Check if virtual environment exists
13
+ if not exist ".venv\Scripts\activate.bat" (
14
+ echo ERROR: Virtual environment not found!
15
+ echo Please ensure .venv folder exists in the same directory.
16
+ pause
17
+ exit /b 1
18
+ )
19
+
20
+ REM Activate virtual environment and run the program
21
+ call .venv\Scripts\activate.bat
22
+ python tts.py
23
+
24
+ REM Deactivate when done
25
+ deactivate
26
+
27
+ pause
package/MiraTTS.ps1 ADDED
@@ -0,0 +1,29 @@
1
+ # MiraTTS PowerShell Launcher
2
+ # This script activates the virtual environment and runs the TTS program
3
+
4
+ $scriptPath = Split-Path -Parent $MyInvocation.MyCommand.Path
5
+ Set-Location $scriptPath
6
+
7
+ Write-Host "========================================"
8
+ Write-Host "MiraTTS - Text to Speech Generator"
9
+ Write-Host "========================================"
10
+ Write-Host ""
11
+
12
+ # Check if virtual environment exists
13
+ if (-not (Test-Path ".venv\Scripts\Activate.ps1")) {
14
+ Write-Host "ERROR: Virtual environment not found!" -ForegroundColor Red
15
+ Write-Host "Please ensure .venv folder exists in the same directory."
16
+ Read-Host "Press Enter to exit"
17
+ exit 1
18
+ }
19
+
20
+ # Activate virtual environment
21
+ & ".venv\Scripts\Activate.ps1"
22
+
23
+ # Run the TTS program
24
+ & python tts.py
25
+
26
+ # Deactivate when done
27
+ deactivate
28
+
29
+ Read-Host "Press Enter to exit"
@@ -0,0 +1,73 @@
1
+ # Project Pipeline Architecture
2
+
3
+ This document illustrates the technical data flow and component integration of the Voice Conversational AI System.
4
+
5
+ ## Technical Flow Diagram
6
+
7
+ ```mermaid
8
+ graph TD
9
+ %% Entry Point
10
+ Start((System Start)) --> ShadowMode[Shadow Listening Mode]
11
+
12
+ %% Wake-Word Logic
13
+ ShadowMode -->|Detects 'Hey' / 'Start'| AudioCue[Audio Cue: Ascending Beep]
14
+ AudioCue --> MainLoop[Active Conversation Loop]
15
+
16
+ %% Input Pipeline
17
+ MainLoop --> MicInput[Microphone Input]
18
+ MicInput --> VAD{VAD Process}
19
+ VAD -->|Speaking| Buffer[Accumulate Audio Buffer]
20
+ VAD -->|Silence > 0.8s| StopRec[Terminate Recording]
21
+
22
+ %% Inference Pipeline
23
+ StopRec --> STT[Faster-Whisper STT Engine]
24
+ STT -->|Text| LLM[LM Studio API Inference]
25
+ LLM -->|Response Script| PreProcess[Text Normalization]
26
+
27
+ %% Output Pipeline
28
+ PreProcess --> TTS[MiraTTS Neural Synthesis]
29
+ TTS -->|PCM Audio| SyncExec{Parallel Execution}
30
+
31
+ SyncExec -->|Thread 1| Playback[PyAudio Memory Playback]
32
+ SyncExec -->|Thread 2| Visuals[Fast Typewriter Animation]
33
+
34
+ %% State Management
35
+ Playback & Visuals --> StateCheck{Command Check}
36
+ StateCheck -->|'Pause'| PauseCue[Audio Cue: Descending Beep]
37
+ PauseCue --> ShadowMode
38
+
39
+ StateCheck -->|'Deactivate'| Shutdown((System Shutdown))
40
+ StateCheck -->|Next Turn| MainLoop
41
+ ```
42
+
43
+ ## Component Roles
44
+
45
+ ### 1. Input Layer (VAD & STT)
46
+
47
+ - **VAD (Voice Activity Detection)**: Monitors RMS energy levels in real-time. It cuts the recording loop immediately after 800ms of sustained quiet, significantly reducing response latency compared to fixed-length recording (a cutoff sketch follows this list).
48
+ - **Faster-Whisper**: Processes the audio buffer into text. It uses the CTranslate2 backend for high-speed inference on NVIDIA GPUs.
49
+
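+ A minimal sketch of this cutoff logic (the chunk size, sample rate, and thresholds loosely mirror values used elsewhere in this package, but treat them as illustrative):
+ 
+ ```python
+ import numpy as np
+ 
+ CHUNK = 1024                 # frames read per iteration
+ SAMPLE_RATE = 16000          # Hz
+ SILENCE_THRESHOLD = 500      # RMS level treated as "quiet"
+ SILENCE_DURATION = 0.8       # seconds of sustained quiet before cutoff
+ 
+ def rms(chunk_bytes: bytes) -> float:
+     """Root-mean-square energy of one int16 audio chunk."""
+     samples = np.frombuffer(chunk_bytes, dtype=np.int16).astype(np.float32)
+     return float(np.sqrt(np.mean(samples ** 2)))
+ 
+ def should_stop(quiet_chunks: int) -> bool:
+     """True once roughly SILENCE_DURATION of consecutive quiet chunks has accumulated."""
+     return quiet_chunks > int(SILENCE_DURATION * SAMPLE_RATE / CHUNK)
+ ```
+ 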
50
+ ### 2. Cognitive Layer (LLM)
51
+
52
+ - **LM Studio Gateway**: Acts as the brain. The system sends conversational context to a local OpenAI-compatible endpoint (a request sketch follows this list).
53
+ - **System Prompting**: Enforces concise, spoken-word friendly responses (1-3 sentences) to maintain conversational speed.
54
+
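+ For illustration, a minimal request against the local OpenAI-compatible endpoint (the URL and the short-reply system prompt mirror this project's conventions; the model name is just a placeholder, since LM Studio serves whichever model is loaded):
+ 
+ ```python
+ import requests
+ 
+ resp = requests.post(
+     "http://127.0.0.1:1234/v1/chat/completions",
+     json={
+         "model": "local-model",  # placeholder
+         "messages": [
+             {"role": "system", "content": "You are a helpful voice assistant. Keep replies to 1-3 spoken sentences."},
+             {"role": "user", "content": "Give me a one-line status update."},
+         ],
+         "temperature": 0.5,
+     },
+     timeout=60,
+ )
+ print(resp.json()["choices"][0]["message"]["content"])
+ ```
+ 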
55
+ ### 3. Synthesis Layer (TTS)
56
+
57
+ - **MiraTTS**: A neural reference-based synthesis engine. It generates high-fidelity audio tokens from text based on the `reference_file.wav` provided at initialization.
58
+ - **Neural Codecs**: Utilizes `ncodec` and `FlashSR` for internal audio super-resolution and decoding.
59
+
60
+ ### 4. Output Layer (Synchronized IO)
61
+
62
+ - **PyAudio Memory Buffer**: Audio is played directly from RAM. No disk I/O is involved during playback to prevent micro-stutters.
63
+ - **Threaded UI**: The terminal typewriter effect runs in a parallel thread, synchronized with a 1.0s lead-in to allow the sound driver to initialize.
64
+
65
+ ## Lifecycle States
66
+
67
+ | State | Description | Audio Cue |
68
+ | :--------------- | :------------------------------------------- | :-------------- |
69
+ | **Shadow Mode** | Silent background monitoring for wake-words. | None |
70
+ | **Activation** | Transition into active conversation. | Ascending Beep |
71
+ | **Processing** | Actively transcribing or generating. | None |
72
+ | **Deactivation** | Transition back into Shadow Mode. | Descending Beep |
73
+ | **Termination** | Full process shutdown. | None |
@@ -0,0 +1,169 @@
1
+ # Complete Conversational AI Pipeline Setup
2
+
3
+ ## Architecture Overview
4
+
5
+ ```
6
+ Audio Input → Whisper (STT) → LM Studio (LLM) → MiraTTS (TTS) → Audio Output
7
+ ```
8
+
9
+ ## Components
10
+
11
+ ### 1. Whisper WebSocket Server (Speech-to-Text)
12
+
13
+ - **File**: `ws_whisper_server.py`
14
+ - **Endpoint**: `ws://127.0.0.1:8000/ws/transcribe` (a client sketch follows this section)
15
+ - **Status**: ✅ RUNNING
16
+ - **Model**: faster-whisper (small, CUDA)
17
+ - **Environment**: `venv`
18
+
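+ For reference, a hypothetical client for this endpoint (the framing, raw WAV bytes in and a plain-text transcript back, is an assumption about `ws_whisper_server.py`, not its documented protocol):
+ 
+ ```python
+ import asyncio
+ import websockets
+ 
+ async def transcribe(path: str) -> str:
+     async with websockets.connect("ws://127.0.0.1:8000/ws/transcribe") as ws:
+         with open(path, "rb") as f:
+             await ws.send(f.read())   # send the audio payload (assumed raw WAV bytes)
+         return await ws.recv()        # receive the transcript (assumed plain text)
+ 
+ print(asyncio.run(transcribe("reference_file.wav")))
+ ```
+ 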
19
+ ### 2. LM Studio (Language Model)
20
+
21
+ - **Endpoint**: `http://127.0.0.1:1234/v1/chat/completions`
22
+ - **Status**: ✅ RUNNING
23
+ - **Available Models**:
24
+ - orpheus-3b-0.1-ft
25
+ - llama-3.1-8b-lexi-uncensored-v2
26
+
27
+ ### 3. MiraTTS (Text-to-Speech)
28
+
29
+ - **File**: `tts_test.py`
30
+ - **Environment**: `.venv` (with dot)
31
+ - **GPU Enforcement**: ✅ STRICT (ONNX CUDA only)
32
+ - **Reference Audio**: `reference_file.wav`
33
+
34
+ ### 4. Pipeline Integration
35
+
36
+ - **File**: `whisper_to_llm_pipeline.py`
37
+ - **Environment**: `venv`
38
+ - **Features**:
39
+ - Interactive text chat with LLM
40
+ - Audio file transcription via Whisper
41
+ - Complete STT → LLM flow
42
+
43
+ ## Running the Complete Pipeline
44
+
45
+ ### Step 1: Start Whisper Server (Already Running)
46
+
47
+ ```powershell
48
+ .\venv\Scripts\Activate.ps1
49
+ uvicorn ws_whisper_server:app --host 0.0.0.0 --port 8000
50
+ ```
51
+
52
+ ### Step 2: Ensure LM Studio is Running (Already Running)
53
+
54
+ - LM Studio should be running on port 1234
55
+ - Load your preferred model
56
+
57
+ ### Step 3: Run the Pipeline
58
+
59
+ ```powershell
60
+ .\venv\Scripts\Activate.ps1
61
+ python whisper_to_llm_pipeline.py
62
+ ```
63
+
64
+ **Interactive Mode Commands** (a dispatch sketch follows the list):
65
+
66
+ - Type any message → sends directly to LLM
67
+ - `audio <filepath>` → transcribes audio then sends to LLM
68
+ - `clear` → clears conversation history
69
+ - `exit` → quit
70
+
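+ As a rough illustration of how these commands can be dispatched (this is not the `whisper_to_llm_pipeline.py` source; `transcribe_file` and `ask_llm` are hypothetical helpers):
+ 
+ ```python
+ def repl(transcribe_file, ask_llm):
+     """Tiny command loop mirroring the commands listed above."""
+     history = []
+     while True:
+         line = input("> ").strip()
+         if line == "exit":
+             break
+         if line == "clear":
+             history.clear()
+             continue
+         if line.startswith("audio "):
+             line = transcribe_file(line[len("audio "):])  # STT first, then the LLM
+         history.append({"role": "user", "content": line})
+         reply = ask_llm(history)
+         history.append({"role": "assistant", "content": reply})
+         print(f"✓ Assistant: {reply}")
+ ```
+ 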
71
+ ### Step 4: Add TTS Output (Use tts_test.py separately)
72
+
73
+ For now, use `tts_test.py` in the `.venv` environment to generate speech from LLM responses.
74
+
75
+ ## Environment Details
76
+
77
+ ### `venv` (Whisper + Pipeline)
78
+
79
+ Packages installed:
80
+
81
+ - faster-whisper
82
+ - fastapi
83
+ - uvicorn
84
+ - websockets
85
+ - soundfile
86
+ - numpy
87
+ - requests
88
+ - pyaudio
89
+
90
+ ### `.venv` (MiraTTS)
91
+
92
+ Packages installed:
93
+
94
+ - mira (MiraTTS)
95
+ - onnxruntime-gpu (CUDA)
96
+ - soundfile
97
+ - numpy
98
+ - requests
99
+
100
+ ## GPU Requirements
101
+
102
+ ### tts_test.py - STRICT GPU ENFORCEMENT ✅
103
+
104
+ ```python
105
+ # Force ONNX Runtime to use CUDA only - fail if CUDA is not available
106
+ os.environ['ORT_CUDA_UNAVAILABLE'] = '0'
107
+
108
+ # Verify ONNX Runtime has CUDA before proceeding
109
+ has_gpu = any(p in providers for p in ['CUDAExecutionProvider', 'TensorrtExecutionProvider'])
110
+ if not has_gpu:
111
+ print("ERROR: No GPU provider available for ONNX Runtime!")
112
+ sys.exit(1)
113
+ ```
114
+
115
+ ### ws_whisper_server.py - GPU Configured
116
+
117
+ ```python
118
+ model = WhisperModel(
119
+ "small",
120
+ device="cuda", # GPU only
121
+ compute_type="float16"
122
+ )
123
+ ```
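+ 
+ Continuing from the `model` configured above, transcription is a single call that returns a generator of segments (usage sketch; the file name is illustrative):
+ 
+ ```python
+ # Transcribe a file with the faster-whisper model defined above
+ segments, info = model.transcribe("reference_file.wav", language="en", vad_filter=True)
+ text = "".join(segment.text for segment in segments).strip()
+ print(f"[{info.language} p={info.language_probability:.2f}] {text}")
+ ```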
124
+
125
+ ## Testing the Pipeline
126
+
127
+ ### Test 1: Text to LLM
128
+
129
+ ```
130
+ > Hello, how are you?
131
+ 🤖 Sending to LM Studio...
132
+ ✓ Assistant: [LLM Response]
133
+ ```
134
+
135
+ ### Test 2: Audio to LLM
136
+
137
+ ```
138
+ > audio reference_file.wav
139
+ 🎤 Transcribing: reference_file.wav
140
+ [Whisper]: [Transcribed text]
141
+ ✓ Transcription complete: "[text]"
142
+ 🤖 Sending to LM Studio...
143
+ ✓ Assistant: [LLM Response]
144
+ ```
145
+
146
+ ### Test 3: Full Pipeline (Manual)
147
+
148
+ 1. Record/prepare audio file
149
+ 2. Run: `audio myaudio.wav` in pipeline
150
+ 3. Get transcription → LLM response
151
+ 4. Copy LLM response
152
+ 5. Run `tts_test.py` in `.venv` environment
153
+ 6. Paste response to generate speech
154
+
155
+ ## Current Status
156
+
157
+ ✅ Whisper Server: RUNNING on port 8000
158
+ ✅ LM Studio: RUNNING on port 1234
159
+ ✅ Pipeline Script: READY
160
+ ✅ TTS (tts_test.py): READY with GPU enforcement
161
+ ⏳ Full integration: Requires merging environments or an API bridge
162
+
163
+ ## Next Steps for Full Integration
164
+
165
+ To create a single script with all three components:
166
+
167
+ 1. Install MiraTTS in the `venv` environment, OR
168
+ 2. Install Whisper packages in the `.venv` environment, OR
169
+ 3. Create a REST API wrapper around tts_test.py for the pipeline to call (sketched below)
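+ 
+ For option 3, a minimal sketch of such a wrapper (run it from the `.venv` environment; the route, port, and payload shape are assumptions, while the MiraTTS calls mirror their usage in `voice_conversation.py`):
+ 
+ ```python
+ # tts_server.py - hypothetical REST bridge around MiraTTS
+ import io
+ 
+ import numpy as np
+ import soundfile as sf
+ from fastapi import FastAPI
+ from fastapi.responses import Response
+ from pydantic import BaseModel
+ from mira.model import MiraTTS
+ 
+ app = FastAPI()
+ tts = MiraTTS("YatharthS/MiraTTS")
+ context = tts.encode_audio("reference_file.wav")   # reference voice, as in voice_conversation.py
+ 
+ class Speak(BaseModel):
+     text: str
+ 
+ @app.post("/speak")
+ def speak(req: Speak) -> Response:
+     audio = tts.generate(req.text, context)        # float waveform (48 kHz assumed)
+     buf = io.BytesIO()
+     sf.write(buf, np.asarray(audio), 48000, format="WAV")
+     return Response(content=buf.getvalue(), media_type="audio/wav")
+ 
+ # Run with: uvicorn tts_server:app --port 8001
+ # The venv-side pipeline can then POST {"text": "..."} to http://127.0.0.1:8001/speak
+ ```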
Binary file
package/README.md ADDED
@@ -0,0 +1,208 @@
1
+ # Conversational AI Fren
2
+
3
+ ## Overview
4
+
5
+ This project implements a high-performance, low-latency voice conversational pipeline. It integrates Faster-Whisper for Speech-to-Text (STT), LM Studio for Large Language Model (LLM) inference, and MiraTTS for neural Text-to-Speech (TTS).
6
+
7
+ ## Core Components
8
+
9
+ - **Speech-to-Text**: `faster-whisper` (medium model) utilizing CUDA 12.1 for near-instant transcription.
10
+ - **Language Model**: Inference via an OpenAI-compatible API (LM Studio's default port, 1234).
11
+ - **Text-to-Speech**: `MiraTTS` neural cloning model for natural vocal responses.
12
+ - **Logic Engine**: Monolithic Python implementation with asynchronous VAD and threading.
13
+
14
+ ## Features
15
+
16
+ - **Voice-to-Start**: The system initializes in a silent "Shadow Mode" and activates via wake words ("Start" or "Hey").
17
+ - **Voice Activity Detection (VAD)**: Intelligent recording cutoff (0.8s silence threshold) to eliminate fixed-duration delays.
18
+ - **Shadow Listening (Pause Mode)**: Background monitoring with zero TUI output. Transition cues are provided via audio beeps.
19
+ - **Audio Transition Cues**: When activating and deactivating, the system emits a short frequency-sweep beep to signal the transition (sketched below).
20
+
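+ A sketch of such a cue, generated in memory as a short sine sweep (parameters loosely mirror the `play_cue` helper in `voice_conversation.py`):
+ 
+ ```python
+ import numpy as np
+ import pyaudio
+ 
+ def play_cue(ascending: bool = True, duration: float = 0.2, fs: int = 44100) -> None:
+     """Play a short frequency sweep: ascending on activation, descending on pause."""
+     f1, f2 = (400, 800) if ascending else (800, 400)
+     t = np.linspace(0, duration, int(fs * duration), endpoint=False)
+     freq = np.linspace(f1, f2, t.size)
+     tone = 0.9 * np.sin(2 * np.pi * freq * t)
+     tone = (tone * np.linspace(1, 0, t.size)).astype(np.float32)  # fade out to avoid a click
+ 
+     pa = pyaudio.PyAudio()
+     stream = pa.open(format=pyaudio.paFloat32, channels=1, rate=fs, output=True)
+     stream.write(tone.tobytes())
+     stream.stop_stream()
+     stream.close()
+     pa.terminate()
+ 
+ play_cue(ascending=True)   # activation cue
+ ```
+ 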
21
+ ## System Prerequisites
22
+
23
+ 1. **Python 3.10+**: Recommended for compatibility with `onnxruntime-gpu` and `torch`.
24
+ 2. **FFmpeg**: Must be installed and available in the system PATH (used for loudness normalization and silence removal).
25
+ 3. **CUDA 12.1 & cuDNN**: Required for GPU acceleration.
26
+ 4. **LM Studio**: Local server must be active on `http://127.0.0.1:1234`.
27
+
28
+ ## Quick Start (the easy way)
29
+
30
+ You can now initialize and run the entire pipeline with a single command using `npx`:
31
+
32
+ ```bash
33
+ npx fren-voice
34
+ ```
35
+
36
+ This will automatically:
37
+
38
+ 1. Clone the repository (if not already present).
39
+ 2. Create a virtual environment.
40
+ 3. Install all CUDA-optimized dependencies.
41
+ 4. Launch the AI conversation.
42
+
43
+ ---
44
+
45
+ ## Installation (Traditional)
46
+
47
+ ### 1. Prerequisites
48
+
49
+ - **Python 3.10+**: Download and install from [python.org](https://www.python.org/).
50
+ - **Git**: Required to pull specific model repositories.
51
+ - **FFmpeg**:
52
+ 1. Download and install FFmpeg following the official guides at [ffmpeg.org](https://ffmpeg.org/download.html) or the [official GitHub repository](https://github.com/FFmpeg/FFmpeg).
53
+ 2. Ensure the `ffmpeg` executable is added to your system **PATH**.
54
+ - **LM Studio**: Install it and load a GGUF model (e.g., Llama 3 or Mistral). Start the Local Server on port 1234.
55
+
56
+ ### 2. Setup Virtual Environment
57
+
58
+ It is highly recommended to use a virtual environment to manage dependencies:
59
+
60
+ **Windows:**
61
+
62
+ ```powershell
63
+ python -m venv .venv
64
+ .\.venv\Scripts\activate
65
+ ```
66
+
67
+ **Linux:**
68
+
69
+ ```bash
70
+ python3 -m venv .venv
71
+ source .venv/bin/activate
72
+ ```
73
+
74
+ ### 3. Install Dependencies
75
+
76
+ Install all core engines and CUDA-optimized libraries:
77
+
78
+ **Windows:**
79
+
80
+ ```powershell
81
+ pip install -r requirements.txt
82
+ ```
83
+
84
+ **Linux:**
85
+
86
+ ```bash
87
+ pip3 install -r requirements.txt
88
+ ```
89
+
90
+ ### 4. Hardware Verification
91
+
92
+ Ensure your NVIDIA drivers are up to date. The system will automatically attempt to locate the necessary CUDA/cuDNN DLLs within your site-packages if they are missing from the system path.
93
+
94
+ ## Configuration
95
+
96
+ - **Reference Voice**: The system requires `reference_file.wav` in the root directory for TTS voice cloning.
97
+ - **Microphone**: The input device is selected via `input_device_index` in the `record_audio_vad` method; adjust it there if it does not match your microphone.
98
+ - **Environment Variables**: The script automatically sets the following for stability:
99
+ - `TM_MAX_CONTEXT_TOKEN_NUM="12000"` (LMDeploy buffer)
100
+ - `ORT_CUDA_DISABLE_CUDNN_FRONTEND="1"` (ONNX CUDA stability)
101
+
102
+ ## Usage
103
+
104
+ Execute the main script:
105
+
106
+ **Windows:**
107
+
108
+ ```powershell
109
+ python voice_conversation.py
110
+ ```
111
+
112
+ **Linux:**
113
+
114
+ ```bash
115
+ python3 voice_conversation.py
116
+ ```
117
+
118
+ ### Voice Commands (can be changed in the code as required)
119
+
120
+ - **Activate**: "Start", "Hey", or "Continue".
121
+ - **Pause**: "Pause", "Exit", "Quit", or "Goodbye".
122
+ - **Terminate**: "Deactivate", "Terminate", or "Shut down".
123
+
124
+ ## Project Structure
125
+
126
+ - `voice_conversation.py`: Core system logic and TUI.
127
+ - `requirements.txt`: Unified dependency manifest for CUDA-enabled environments.
128
+ - `README.md`: Technical documentation and setup guide.
129
+ - `reference_file.wav`: Target voice profile for TTS.
130
+ - `MiraTTS.bat` / `.ps1`: Quick-launch scripts.
131
+
132
+ ---
133
+
134
+ # Project Pipeline Architecture
135
+
136
+ This section illustrates the technical data flow and component integration of the Voice Conversational AI System.
137
+
138
+ ## Technical Flow Diagram
139
+
140
+ ![Pipeline Diagram](Pipeline%20Diagram.png)
141
+
142
+ ```mermaid
143
+ graph TD
144
+ %% Entry Point
145
+ Start((System Start)) --> ShadowMode[Shadow Listening Mode]
146
+
147
+ %% Wake-Word Logic
148
+ ShadowMode -->|Detects 'Hey' / 'Start'| AudioCue[Audio Cue: Ascending Beep]
149
+ AudioCue --> MainLoop[Active Conversation Loop]
150
+
151
+ %% Input Pipeline
152
+ MainLoop --> MicInput[Microphone Input]
153
+ MicInput --> VAD{VAD Process}
154
+ VAD -->|Speaking| Buffer[Accumulate Audio Buffer]
155
+ VAD -->|Silence > 0.8s| StopRec[Terminate Recording]
156
+
157
+ %% Inference Pipeline
158
+ StopRec --> STT[Faster-Whisper STT Engine]
159
+ STT -->|Text| LLM[LM Studio API Inference]
160
+ LLM -->|Response Script| PreProcess[Text Normalization]
161
+
162
+ %% Output Pipeline
163
+ PreProcess --> TTS[MiraTTS Neural Synthesis]
164
+ TTS -->|PCM Audio| SyncExec{Parallel Execution}
165
+
166
+ SyncExec -->|Thread 1| Playback[PyAudio Memory Playback]
167
+ SyncExec -->|Thread 2| Visuals[Fast Typewriter Animation]
168
+
169
+ %% State Management
170
+ Playback & Visuals --> StateCheck{Command Check}
171
+ StateCheck -->|'Pause'| PauseCue[Audio Cue: Descending Beep]
172
+ PauseCue --> ShadowMode
173
+
174
+ StateCheck -->|'Deactivate'| Shutdown((System Shutdown))
175
+ StateCheck -->|Next Turn| MainLoop
176
+ ```
177
+
178
+ ## Component Roles
179
+
180
+ ### 1. Input Layer (VAD & STT)
181
+
182
+ - **VAD (Voice Activity Detection)**: Monitors RMS energy levels in real-time. It cuts the recording loop immediately after 800ms of sustained quiet, significantly reducing response latency compared to fixed-length recording.
183
+ - **Faster-Whisper**: Processes the audio buffer into text. It uses the CTranslate2 backend for high-speed inference on NVIDIA GPUs.
184
+
185
+ ### 2. Cognitive Layer (LLM)
186
+
187
+ - **LM Studio Gateway**: Acts as the brain. The system sends conversational context to a local OpenAI-compatible endpoint.
188
+ - **System Prompting**: Enforces concise, spoken-word friendly responses (1-3 sentences) to maintain conversational speed.
189
+
190
+ ### 3. Synthesis Layer (TTS)
191
+
192
+ - **MiraTTS**: A neural reference-based synthesis engine. It generates high-fidelity audio tokens from text based on the `reference_file.wav` provided at initialization.
193
+ - **Neural Codecs**: Utilizes `ncodec` and `FlashSR` for internal audio super-resolution and decoding.
194
+
195
+ ### 4. Output Layer (Synchronized IO)
196
+
197
+ - **PyAudio Memory Buffer**: Audio is played directly from RAM. No disk I/O is involved during playback to prevent micro-stutters (a playback sketch follows this list).
198
+ - **Threaded UI**: The terminal typewriter effect runs in a parallel thread, synchronized with a 1.0s lead-in to allow the sound driver to initialize.
199
+
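+ A minimal sketch of this pattern, assuming a float waveform is already in memory at 48 kHz (the stand-in sine wave below only exists so the snippet runs on its own):
+ 
+ ```python
+ import threading
+ import time
+ 
+ import numpy as np
+ import pyaudio
+ 
+ def play_from_memory(audio: np.ndarray, pa: pyaudio.PyAudio, rate: int = 48000) -> None:
+     """Play a float waveform straight from RAM as 16-bit PCM (no disk I/O)."""
+     pcm = (np.asarray(audio) * 32767).astype(np.int16)
+     stream = pa.open(format=pyaudio.paInt16, channels=1, rate=rate, output=True)
+     stream.write(pcm.tobytes())
+     stream.stop_stream()
+     stream.close()
+ 
+ def type_out(text: str, lead_in: float = 1.0, delay: float = 0.04) -> None:
+     """Typewriter effect, delayed so the audio device has time to start."""
+     time.sleep(lead_in)
+     for ch in text:
+         print(ch, end="", flush=True)
+         time.sleep(delay)
+     print()
+ 
+ pa = pyaudio.PyAudio()
+ wave = 0.2 * np.sin(2 * np.pi * 440 * np.linspace(0, 1, 48000))   # stand-in waveform
+ threads = [
+     threading.Thread(target=play_from_memory, args=(wave, pa)),
+     threading.Thread(target=type_out, args=("Hello from Fren.",)),
+ ]
+ for t in threads:
+     t.start()
+ for t in threads:
+     t.join()
+ pa.terminate()
+ ```
+ 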
200
+ ## Lifecycle States
201
+
202
+ | State | Description | Audio Cue |
203
+ | :--------------- | :------------------------------------------- | :-------------- |
204
+ | **Shadow Mode** | Silent background monitoring for wake-words. | None |
205
+ | **Activation** | Transition into active conversation. | Ascending Beep |
206
+ | **Processing** | Actively transcribing or generating. | None |
207
+ | **Deactivation** | Transition back into Shadow Mode. | Descending Beep |
208
+ | **Termination** | Full process shutdown. | None |
package/bin/cli.js ADDED
@@ -0,0 +1,101 @@
1
+ #!/usr/bin/env node
2
+
3
+ const { execSync } = require("child_process");
4
+ const fs = require("fs-extra");
5
+ const path = require("path");
6
+ const chalk = require("chalk");
7
+ const ora = require("ora");
8
+ const prompts = require("prompts");
9
+
10
+ async function main() {
11
+ console.log(chalk.cyan.bold("\n 🎙️ FREN: AI Voice Companion\n"));
12
+
13
+ const currentDir = process.cwd();
14
+ const isProjectDir = await fs.pathExists(
15
+ path.join(currentDir, "voice_conversation.py")
16
+ );
17
+
18
+ if (!isProjectDir) {
19
+ const response = await prompts({
20
+ type: "confirm",
21
+ name: "setup",
22
+ message:
23
+ "Fren is not initialized in this directory. Would you like to clone the repository here?",
24
+ initial: true,
25
+ });
26
+
27
+ if (response.setup) {
28
+ const spinner = ora("Cloning repository...").start();
29
+ try {
30
+ execSync("git clone https://github.com/lakshya-p/Fren.git .", {
31
+ stdio: "ignore",
32
+ });
33
+ spinner.succeed("Repository cloned successfully!");
34
+ } catch (err) {
35
+ spinner.fail("Failed to clone repository. Make sure git is installed.");
36
+ process.exit(1);
37
+ }
38
+ } else {
39
+ console.log(
40
+ chalk.yellow(
41
+ "Aborted. Please navigate to a Fren directory or initialize it."
42
+ )
43
+ );
44
+ process.exit(0);
45
+ }
46
+ }
47
+
48
+ // Check for Virtual Environment
49
+ const venvPath = path.join(currentDir, ".venv");
50
+ const pythonExec =
51
+ process.platform === "win32"
52
+ ? path.join(venvPath, "Scripts", "python.exe")
53
+ : path.join(venvPath, "bin", "python");
54
+
55
+ if (!(await fs.pathExists(venvPath))) {
56
+ const setupVenv = await prompts({
57
+ type: "confirm",
58
+ name: "proceed",
59
+ message:
60
+ "No virtual environment found. Would you like to create one and install dependencies?",
61
+ initial: true,
62
+ });
63
+
64
+ if (setupVenv.proceed) {
65
+ const spinner = ora("Creating virtual environment...").start();
66
+ try {
67
+ execSync("python -m venv .venv");
68
+ spinner.text =
69
+ "Installing dependencies (this may take a few minutes)...";
70
+ execSync(`"${pythonExec}" -m pip install -r requirements.txt`, {
71
+ stdio: "inherit",
72
+ });
73
+ spinner.succeed("Setup complete!");
74
+ } catch (err) {
75
+ spinner.fail("Setup failed. Ensure Python 3.10+ is installed.");
76
+ console.error(err);
77
+ process.exit(1);
78
+ }
79
+ } else {
80
+ console.log(chalk.yellow("Cannot proceed without dependencies."));
81
+ process.exit(1);
82
+ }
83
+ }
84
+
85
+ // Launch the main Python program
86
+ console.log(chalk.green("\n🚀 Starting Fren AI...\n"));
87
+ try {
88
+ execSync(`"${pythonExec}" voice_conversation.py`, { stdio: "inherit" });
89
+ } catch (err) {
90
+ console.error(
91
+ chalk.red(
92
+ "\nFren closed with an error. Check if LM Studio is running on port 1234."
93
+ )
94
+ );
95
+ }
96
+ }
97
+
98
+ main().catch((err) => {
99
+ console.error(err);
100
+ process.exit(1);
101
+ });
package/package.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "name": "fren-voice",
3
+ "version": "1.0.0",
4
+ "description": "High-performance voice conversational AI system.",
5
+ "main": "index.js",
6
+ "bin": {
7
+ "fren": "bin/cli.js"
8
+ },
9
+ "scripts": {
10
+ "test": "echo \"Error: no test specified\" && exit 1"
11
+ },
12
+ "keywords": [
13
+ "voice",
14
+ "ai",
15
+ "stt",
16
+ "tts",
17
+ "llm",
18
+ "conversational"
19
+ ],
20
+ "author": "lakshya-p",
21
+ "license": "MIT",
22
+ "dependencies": {
23
+ "chalk": "^4.1.2",
24
+ "execa": "^5.1.1",
25
+ "fs-extra": "^10.0.0",
26
+ "ora": "^5.4.1",
27
+ "prompts": "^2.4.2"
28
+ }
29
+ }
Binary file
@@ -0,0 +1,36 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu121
2
+
3
+ # CoreModules (Git Repositories)
4
+ git+https://github.com/ysharma3501/MiraTTS.git
5
+ git+https://github.com/ysharma3501/FastBiCodec.git
6
+ git+https://github.com/ysharma3501/FlashSR.git
7
+
8
+ # Inference Engines
9
+ faster-whisper==1.2.1
10
+ onnxruntime-gpu==1.23.2
11
+ lmdeploy==0.11.0
12
+
13
+ # PyTorch (CUDA 12.1 optimized)
14
+ torch==2.5.1+cu121
15
+ torchaudio==2.5.1+cu121
16
+ torchvision==0.20.1+cu121
17
+
18
+ # Audio Processing & IO
19
+ pyaudio>=0.2.14
20
+ soundfile>=0.13.1
21
+ librosa>=0.11.0
22
+ numpy==1.26.4
23
+
24
+ # API & Networking
25
+ requests>=2.32.5
26
+ aiohttp>=3.13.2
27
+ fastapi>=0.127.0
28
+ uvicorn>=0.40.0
29
+ python-multipart
30
+
31
+ # Support & Performance
32
+ transformers>=4.56.1
33
+ pydantic>=2.12.5
34
+ tqdm>=4.67.1
35
+ typing-extensions>=4.13.0
36
+ ffmpeg-python
@@ -0,0 +1,330 @@
1
+ """
2
+ STABLE VOICE CONVERSATIONAL AI (Wake-Word + Audio Cues)
3
+ Optimizations:
4
+ - VAD (Voice Activity Detection): Stop recording early when silence is detected.
5
+ - Silent Wake-Word Detection: Reactivates on "Hey", "Fren", or "Continue".
6
+ - Audio Cues: Beep on pause/resume.
7
+ """
8
+ import os
9
+ import sys
10
+
11
+ # Log suppression
12
+ os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
13
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
14
+ os.environ['ORT_CUDA_DISABLE_CUDNN_FRONTEND'] = '1'
15
+
16
+ import asyncio
17
+ import requests
18
+ import numpy as np
19
+ import soundfile as sf
20
+ import subprocess
21
+ import pyaudio
22
+ import tempfile
23
+ import threading
24
+ import re
25
+ import json
26
+ import logging
27
+ import warnings
28
+ import time
29
+ from datetime import datetime
30
+ from faster_whisper import WhisperModel
31
+
32
+ # Global Silence Configuration
33
+ warnings.filterwarnings("ignore")
34
+ logging.basicConfig(level=logging.ERROR)
35
+ logging.getLogger("lmdeploy").setLevel(logging.ERROR)
36
+ logging.getLogger().setLevel(logging.ERROR)
37
+
38
+ # Fix for CUDA DLLs on Windows
39
+ if sys.platform == "win32":
40
+ for venv_name in [".venv", "venv"]:
41
+ nvidia_dir = os.path.join(os.path.dirname(__file__), venv_name, "Lib", "site-packages", "nvidia")
42
+ if os.path.exists(nvidia_dir):
43
+ for sub in ["cudnn", "cublas", "cuda_runtime", "curand", "cusolver", "cusparse"]:
44
+ bin_path = os.path.join(nvidia_dir, sub, "bin")
45
+ if os.path.exists(bin_path):
46
+ os.environ["PATH"] = bin_path + os.pathsep + os.environ["PATH"]
47
+
48
+ # Set ONNX Severity
49
+ try:
50
+ import onnxruntime as ort
51
+ ort.set_default_logger_severity(3)
52
+ except: pass
53
+
54
+ os.environ['TM_MAX_CONTEXT_TOKEN_NUM'] = '12000'
55
+ from mira.model import MiraTTS
56
+
57
+ # Configuration
58
+ LLM_API_URL = "http://127.0.0.1:1234/v1/chat/completions"
59
+ LLM_MODELS_URL = "http://127.0.0.1:1234/v1/models"
60
+ SAMPLE_RATE = 16000
61
+ MAX_RECORD_SECONDS = 6
62
+ SILENCE_THRESHOLD = 500 # RMS threshold for silence
63
+ SILENCE_DURATION = 0.8 # Stop after 0.8s of silence
64
+
65
+ class VoiceConversationalAI:
66
+ def __init__(self, reference_audio_path):
67
+ print("\n" + "="*70)
68
+ print("🎙️ VOICE CONVERSATIONAL AI SYSTEM")
69
+ print("="*70)
70
+
71
+ # Initialize Whisper
72
+ print("\n[1/4] Loading Whisper model...")
73
+ try:
74
+ self.whisper = WhisperModel("medium", device="cuda", compute_type="float16")
75
+ print(" ✓ Whisper loaded on CUDA (medium)")
76
+ except:
77
+ self.whisper = WhisperModel("small", device="cpu", compute_type="int8")
78
+ print(" ✓ Whisper loaded on CPU (small)")
79
+
80
+ # Initialize TTS
81
+ print("\n[2/4] Loading MiraTTS model...")
82
+ self.mira_tts = MiraTTS('YatharthS/MiraTTS')
83
+ self.mira_tts.set_params(temperature=0.6, top_p=0.92, max_new_tokens=1536)
84
+ print(" ✓ MiraTTS loaded")
85
+
86
+ print("\n[3/4] Encoding reference audio...")
87
+ self.context_tokens = self.mira_tts.encode_audio(reference_audio_path)
88
+ print(" ✓ Reference audio encoded")
89
+
90
+ self.audio = pyaudio.PyAudio()
91
+ self.turn_count = 0
92
+
93
+ print("\n" + "="*70)
94
+ print("[OK] System Ready!")
95
+ print("="*70)
96
+
97
+ def play_cue(self, cue_type='resume'):
98
+ """Simple sine wave beep cue"""
99
+ def run_cue():
100
+ try:
101
+ duration = 0.2
102
+ fs = 44000
103
+ if cue_type == 'resume':
104
+ # Ascending: 400Hz to 800Hz
105
+ f1, f2 = 400, 800
106
+ else:
107
+ # Descending: 800Hz to 400Hz
108
+ f1, f2 = 800, 400
109
+
110
+ t = np.linspace(0, duration, int(fs * duration))
111
+ # Frequency sweep
112
+ f = np.linspace(f1, f2, len(t))
113
+ samples = 0.9 * np.sin(2 * np.pi * f * t) # Increased volume to 0.9
114
+
115
+ # Fade out to avoid click
116
+ fade = np.linspace(1, 0, len(samples))
117
+ samples = (samples * fade).astype(np.float32)
118
+
119
+ # Standard playback
120
+ stream = self.audio.open(format=pyaudio.paFloat32, channels=1, rate=fs, output=True)
121
+ stream.write(samples.tobytes())
122
+ stream.stop_stream()
123
+ stream.close()
124
+ except: pass
125
+
126
+ threading.Thread(target=run_cue).start()
127
+
128
+ async def record_audio_vad(self, silent=False):
129
+ """Record with VAD: stop early when silence is detected"""
130
+ if not silent: print(f"\nMIC: Listening...")
131
+
132
+ stream = self.audio.open(
133
+ format=pyaudio.paInt16,
134
+ channels=1,
135
+ rate=SAMPLE_RATE,
136
+ input=True,
137
+ input_device_index=3,  # NOTE: hardcoded input device index; change to match your microphone
138
+ frames_per_buffer=1024
139
+ )
140
+
141
+ frames = []
142
+ silent_chunks = 0
143
+ max_silent_chunks = int(SILENCE_DURATION * SAMPLE_RATE / 1024)
144
+ started_talking = False
145
+
146
+ for i in range(0, int(SAMPLE_RATE / 1024 * (3 if silent else MAX_RECORD_SECONDS))):
147
+ data = stream.read(1024, exception_on_overflow=False)
148
+ frames.append(data)
149
+
150
+ audio_data = np.frombuffer(data, dtype=np.int16)
151
+ rms = np.sqrt(np.mean(audio_data.astype(np.float32)**2))
152
+
153
+ if rms > SILENCE_THRESHOLD:
154
+ started_talking = True
155
+ silent_chunks = 0
156
+ elif started_talking:
157
+ silent_chunks += 1
158
+ if silent_chunks > max_silent_chunks:
159
+ break
160
+
161
+ stream.stop_stream()
162
+ stream.close()
163
+ return b''.join(frames)
164
+
165
+ def transcribe_sync(self, audio_bytes, silent=False):
166
+ """Standard transcription or silent background check"""
167
+ if not silent: print("🔄 Transcribing...")
168
+
169
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
170
+ temp_name = f.name
171
+ with sf.SoundFile(temp_name, 'w', 16000, 1) as sf_file:
172
+ sf_file.write(np.frombuffer(audio_bytes, dtype=np.int16))
173
+
174
+ clean_name = temp_name.replace(".wav", "_clean.wav")
175
+ try:
176
+ subprocess.run(["ffmpeg", "-y", "-i", temp_name, "-ac", "1", "-ar", "16000", "-af", "loudnorm,silenceremove=stop_periods=-1:stop_duration=0.5:stop_threshold=-40dB", clean_name], capture_output=True, timeout=5)
177
+ trans_file = clean_name if os.path.exists(clean_name) else temp_name
178
+ except: trans_file = temp_name
179
+
180
+ try:
181
+ segments, _ = self.whisper.transcribe(trans_file, language="en", temperature=0, beam_size=1, vad_filter=True)
182
+ text = "".join(s.text for s in segments).strip()
183
+
184
+ if not silent:
185
+ if text: print(f" ✓ You said: \"{text}\"")
186
+ else: print(f" WARN: No speech detected")
187
+
188
+ return text
189
+ except: return ""
190
+ finally:
191
+ for f in [temp_name, clean_name]:
192
+ if os.path.exists(f):
193
+ try: os.remove(f)
194
+ except: pass
195
+
196
+ async def background_wait_for_wake_word(self, initial=False):
197
+ """Silently listen for 'Hey', 'Fren', or 'Continue'"""
198
+ if initial:
199
+ print("\n🎙️ System initialized. Say 'Hey Fren' or 'Start' to begin...")
200
+ else:
201
+ self.play_cue('pause') # Audio cue for pausing
202
+ print("\n⏸️ System Paused. Listening for 'Hey', 'Fren', or 'Continue'...")
203
+
204
+ keywords = ['hey', 'fren', 'continue', 'start']
205
+
206
+ while True:
207
+ # Silent record (3 seconds chunks)
208
+ audio_data = await self.record_audio_vad(silent=True)
209
+
210
+ # Silent transcribe
211
+ text = await asyncio.to_thread(self.transcribe_sync, audio_data, silent=True)
212
+
213
+ if text and any(kw in text.lower() for kw in keywords):
214
+ self.play_cue('resume') # Audio cue for resuming
215
+ print("\n🎙️ System Re-activated!" if not initial else "\n🎙️ System Started!")
216
+ return True
217
+
218
+ await asyncio.sleep(0.1)
219
+
220
+ def query_llm_sync(self, user_message):
221
+ try:
222
+ try: model_id = requests.get(LLM_MODELS_URL, timeout=5).json()["data"][0]["id"]
223
+ except: model_id = "local-model"
224
+
225
+ response = requests.post(
226
+ LLM_API_URL,
227
+ json={
228
+ "model": model_id,
229
+ "messages": [
230
+ {"role": "system", "content": "You are a helpful voice assistant. Respond naturally and conversationally. Keep it concise (1-3 sentences). Only output plain spoken English."},
231
+ {"role": "user", "content": user_message}
232
+ ],
233
+ "temperature": 0.5
234
+ },
235
+ timeout=60
236
+ )
237
+ if response.status_code == 200:
238
+ msg = response.json()['choices'][0]['message']['content']
239
+ msg = re.sub(r"<[^>]+>", "", msg)
240
+ msg = re.sub(r"\*[^*]+\*", "", msg)
241
+ return msg.strip()
242
+ return "I'm having trouble connecting."
243
+ except Exception as e:
244
+ return f"Error: {e}"
245
+
246
+ def generate_and_play_speech(self, text):
247
+ if not text: return
248
+ print(f"\nSPEAKER: Generating speech...")
249
+
250
+ try:
251
+ import io, contextlib
252
+ with contextlib.redirect_stdout(io.StringIO()):
253
+ audio = self.mira_tts.generate(text, self.context_tokens)
254
+
255
+ def play():
256
+ audio_int16 = (np.array(audio) * 32767).astype(np.int16)
257
+ stream = self.audio.open(format=pyaudio.paInt16, channels=1, rate=48000, output=True)
258
+ stream.write(audio_int16.tobytes())
259
+ stream.stop_stream()
260
+ stream.close()
261
+
262
+ def type_text():
263
+ time.sleep(1.0) # Delay start to sync with audio buffering
264
+ print(f"\n💬 Fren: ", end="", flush=True)
265
+ for char in text:
266
+ print(char, end="", flush=True)
267
+ time.sleep(0.04) # Typewriter animation speed (seconds per character)
268
+ print("\n")
269
+
270
+ play_thread = threading.Thread(target=play)
271
+ type_thread = threading.Thread(target=type_text)
272
+
273
+ play_thread.start()
274
+ type_thread.start()
275
+
276
+ play_thread.join()
277
+ type_thread.join()
278
+
279
+ except Exception as e:
280
+ print(f" [ERR] TTS Error: {e}")
281
+
282
+ async def run(self):
283
+ print("\n" + "="*70)
284
+ print("🎙️ VOICE CONVERSATION MODE")
285
+ print("="*70)
286
+
287
+ # Start with silent VAD instead of Enter
288
+ await self.background_wait_for_wake_word(initial=True)
289
+
290
+ try:
291
+ while True:
292
+ print("\n" + "-"*40)
293
+ # Step 1: Record with VAD
294
+ audio_data = await self.record_audio_vad()
295
+
296
+ # Step 2: Transcribe
297
+ transcription = await asyncio.to_thread(self.transcribe_sync, audio_data)
298
+
299
+ if transcription:
300
+ # Check for termination
301
+ if any(word in transcription.lower() for word in ['deactivate', 'terminate', 'shut down']):
302
+ print("\n🛑 System Deactivating. Goodbye!")
303
+ break
304
+
305
+ # Check for pause/exit
306
+ if any(word in transcription.lower() for word in ['exit', 'quit', 'goodbye', 'bye', 'pause']):
307
+ # Enter Silent Wake Mode
308
+ await self.background_wait_for_wake_word()
309
+ continue
310
+
311
+ # Step 3: Query LLM
312
+ response = await asyncio.to_thread(self.query_llm_sync, transcription)
313
+
314
+ # Step 4: Playback
315
+ await asyncio.to_thread(self.generate_and_play_speech, response)
316
+
317
+ print("\n⏸️ Ready for next turn...")
318
+ await asyncio.sleep(0.5)
319
+
320
+ except KeyboardInterrupt: print("\n👋 Interrupt received. Exiting.")
321
+ finally: self.audio.terminate()
322
+
323
+ async def main():
324
+ ref_file = "reference_file.wav"
325
+ if not os.path.exists(ref_file): print(f"ERROR: Reference audio '{ref_file}' not found."); return
326
+ ai = VoiceConversationalAI(ref_file)
327
+ await ai.run()
328
+
329
+ if __name__ == "__main__":
330
+ asyncio.run(main())