voice-chatbot 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- voice_chatbot-0.1.0/LICENSE +21 -0
- voice_chatbot-0.1.0/MANIFEST.in +3 -0
- voice_chatbot-0.1.0/PKG-INFO +274 -0
- voice_chatbot-0.1.0/README.md +226 -0
- voice_chatbot-0.1.0/launch/voice_chatbot.launch.py +71 -0
- voice_chatbot-0.1.0/package.xml +21 -0
- voice_chatbot-0.1.0/pyproject.toml +74 -0
- voice_chatbot-0.1.0/requirements.txt +8 -0
- voice_chatbot-0.1.0/resource/voice_chatbot_ros +1 -0
- voice_chatbot-0.1.0/setup.cfg +10 -0
- voice_chatbot-0.1.0/setup.py +71 -0
- voice_chatbot-0.1.0/tests/test_audio_io.py +112 -0
- voice_chatbot-0.1.0/tests/test_chatbot.py +155 -0
- voice_chatbot-0.1.0/tests/test_config.py +47 -0
- voice_chatbot-0.1.0/tests/test_llm.py +120 -0
- voice_chatbot-0.1.0/tests/test_platform_setup.py +81 -0
- voice_chatbot-0.1.0/tests/test_setup_models.py +144 -0
- voice_chatbot-0.1.0/tests/test_stt.py +68 -0
- voice_chatbot-0.1.0/tests/test_tts_engine.py +111 -0
- voice_chatbot-0.1.0/tests/test_vad.py +129 -0
- voice_chatbot-0.1.0/voice_chatbot/__init__.py +17 -0
- voice_chatbot-0.1.0/voice_chatbot/app.py +481 -0
- voice_chatbot-0.1.0/voice_chatbot/audio_io.py +109 -0
- voice_chatbot-0.1.0/voice_chatbot/chatbot.py +95 -0
- voice_chatbot-0.1.0/voice_chatbot/config.py +128 -0
- voice_chatbot-0.1.0/voice_chatbot/llm.py +117 -0
- voice_chatbot-0.1.0/voice_chatbot/platform_setup.py +88 -0
- voice_chatbot-0.1.0/voice_chatbot/ros_app.py +391 -0
- voice_chatbot-0.1.0/voice_chatbot/setup_models.py +162 -0
- voice_chatbot-0.1.0/voice_chatbot/stt.py +66 -0
- voice_chatbot-0.1.0/voice_chatbot/tts_engine.py +54 -0
- voice_chatbot-0.1.0/voice_chatbot/ui_common.py +403 -0
- voice_chatbot-0.1.0/voice_chatbot/vad.py +151 -0
- voice_chatbot-0.1.0/voice_chatbot.egg-info/PKG-INFO +274 -0
- voice_chatbot-0.1.0/voice_chatbot.egg-info/SOURCES.txt +39 -0
- voice_chatbot-0.1.0/voice_chatbot.egg-info/dependency_links.txt +1 -0
- voice_chatbot-0.1.0/voice_chatbot.egg-info/entry_points.txt +4 -0
- voice_chatbot-0.1.0/voice_chatbot.egg-info/requires.txt +31 -0
- voice_chatbot-0.1.0/voice_chatbot.egg-info/top_level.txt +1 -0
- voice_chatbot-0.1.0/voice_chatbot.egg-info/zip-safe +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Aapo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: voice-chatbot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local speech-to-speech voice assistant with PySide6 GUI, CLI, and ROS 2 integration.
|
|
5
|
+
Author-email: Aapo <noreply@example.com>
|
|
6
|
+
Maintainer: Aapo
|
|
7
|
+
Maintainer-email: noreply@example.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/Aapo2001/python-chatbot
|
|
10
|
+
Project-URL: Documentation, https://docs-site-kappa-coral.vercel.app
|
|
11
|
+
Project-URL: Repository, https://github.com/Aapo2001/python-chatbot
|
|
12
|
+
Project-URL: Issues, https://github.com/Aapo2001/python-chatbot/issues
|
|
13
|
+
Keywords: voice-assistant,chatbot,speech-to-text,text-to-speech,llm
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.11
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: numpy
|
|
26
|
+
Requires-Dist: sounddevice
|
|
27
|
+
Requires-Dist: huggingface-hub
|
|
28
|
+
Provides-Extra: test
|
|
29
|
+
Requires-Dist: numpy; extra == "test"
|
|
30
|
+
Requires-Dist: pytest<10,>=8; extra == "test"
|
|
31
|
+
Requires-Dist: pytest-cov<8,>=5; extra == "test"
|
|
32
|
+
Provides-Extra: stt
|
|
33
|
+
Requires-Dist: faster-whisper; extra == "stt"
|
|
34
|
+
Provides-Extra: llm
|
|
35
|
+
Requires-Dist: llama-cpp-python; extra == "llm"
|
|
36
|
+
Provides-Extra: tts
|
|
37
|
+
Requires-Dist: coqui-tts[codec]; extra == "tts"
|
|
38
|
+
Provides-Extra: vad
|
|
39
|
+
Requires-Dist: silero-vad; extra == "vad"
|
|
40
|
+
Provides-Extra: gui
|
|
41
|
+
Requires-Dist: PySide6<7,>=6.7; extra == "gui"
|
|
42
|
+
Provides-Extra: all
|
|
43
|
+
Requires-Dist: voice-chatbot[gui,llm,stt,tts,vad]; extra == "all"
|
|
44
|
+
Provides-Extra: dev
|
|
45
|
+
Requires-Dist: voice-chatbot[all]; extra == "dev"
|
|
46
|
+
Requires-Dist: pytest<10,>=9; extra == "dev"
|
|
47
|
+
Requires-Dist: pytest-cov<8,>=5; extra == "dev"
|
|
48
|
+
|
|
49
|
+
# Voice Chatbot
|
|
50
|
+
|
|
51
|
+
Local voice chatbot for Windows with a Finnish-first default configuration. The application captures microphone audio, detects speech with Silero VAD, transcribes it with Whisper, generates a reply with a local GGUF LLM, and speaks the reply with Coqui TTS.
|
|
52
|
+
|
|
53
|
+
The repository's application code now lives under the `voice_chatbot/`
|
|
54
|
+
package. The repo root keeps thin compatibility wrappers (`app.py`,
|
|
55
|
+
`chatbot.py`, `ros_app.py`, `setup_models.py`) so older commands still work.
|
|
56
|
+
|
|
57
|
+
Primary entry points:
|
|
58
|
+
|
|
59
|
+
- `python -m voice_chatbot.app`: PySide6 desktop UI for configuring and running the chatbot.
|
|
60
|
+
- `python -m voice_chatbot.chatbot`: terminal-only runner with the same audio pipeline.
|
|
61
|
+
- `python -m voice_chatbot.ros_app`: ROS-connected PySide6 GUI.
|
|
62
|
+
- `voice_chatbot_ros/node.py`: ROS 2 Humble node that exposes the pipeline through ROS topics and a service.
|
|
63
|
+
|
|
64
|
+
## What The Code Does
|
|
65
|
+
|
|
66
|
+
Runtime flow:
|
|
67
|
+
|
|
68
|
+
1. `AudioIO` captures 16 kHz mono microphone audio in fixed-size chunks.
|
|
69
|
+
2. `VoiceActivityDetector` buffers audio until Silero VAD reports speech start and end.
|
|
70
|
+
3. `SpeechToText` transcribes the captured utterance with `pywhispercpp`.
|
|
71
|
+
4. `ChatLLM` sends the user text plus recent conversation history to `llama-cpp-python`.
|
|
72
|
+
5. `TextToSpeech` synthesizes the assistant reply with Coqui TTS.
|
|
73
|
+
6. `AudioIO` plays the generated speech back through the default output device.
|
|
74
|
+
|
|
75
|
+
The GUI wraps this pipeline in a background `QThread` and exposes model and runtime settings in a sidebar.
|
|
76
|
+
|
|
77
|
+
## Repository Layout
|
|
78
|
+
|
|
79
|
+
| Path | Purpose |
|
|
80
|
+
| --- | --- |
|
|
81
|
+
| `voice_chatbot/` | Main Python application package for GUI, CLI, config, audio, VAD, STT, LLM, and TTS code |
|
|
82
|
+
| `app.py`, `chatbot.py`, `ros_app.py`, `setup_models.py` | Thin compatibility entry points that call into `voice_chatbot/` |
|
|
83
|
+
| `install.bat` | Windows setup script for Python packages and CUDA builds |
|
|
84
|
+
| `pixi.toml` | Pixi workspace manifest for the base toolchain and common tasks |
|
|
85
|
+
| `pixi.lock` | Resolved Pixi lockfile for the base environment |
|
|
86
|
+
| `tools/install_python_windows.bat` | Pixi bootstrap script for CUDA-enabled Python packages |
|
|
87
|
+
| `tools/install_python_linux.sh` | Pixi bootstrap script for Linux Python packages |
|
|
88
|
+
| `config.json` | Persisted GUI configuration |
|
|
89
|
+
| `voice_chatbot_ros/` | ROS 2 Humble package and node implementation |
|
|
90
|
+
| `launch/` | ROS 2 launch file |
|
|
91
|
+
| `package.xml`, `setup.py`, `setup.cfg` | ROS 2 `ament_python` package metadata |
|
|
92
|
+
|
|
93
|
+
More implementation detail is in [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
|
|
94
|
+
ROS-specific usage is in [docs/ROS2.md](docs/ROS2.md).
|
|
95
|
+
|
|
96
|
+
## Environment Assumptions
|
|
97
|
+
|
|
98
|
+
The codebase is currently optimized for a Windows workstation with local GPU inference:
|
|
99
|
+
|
|
100
|
+
- `app.py` and `chatbot.py` add CUDA DLL paths from `CUDA_PATH` or `D:\cuda`.
|
|
101
|
+
- `install.bat` installs CUDA-enabled PyTorch, `llama-cpp-python`, and `pywhispercpp`.
|
|
102
|
+
- The default LLM is a GGUF file under `models/`.
|
|
103
|
+
- The default voice assistant language is Finnish (`fi`).
|
|
104
|
+
|
|
105
|
+
The code can fall back to CPU for some components, but the intended deployment is local GPU acceleration.
|
|
106
|
+
|
|
107
|
+
## Installation
|
|
108
|
+
|
|
109
|
+
The recommended setup path is now Pixi. The repository ships with a `pixi.toml` workspace manifest and uses Pixi to install both the base Python toolchain and the ROS 2 Humble packages.
|
|
110
|
+
|
|
111
|
+
### 1. System prerequisites
|
|
112
|
+
|
|
113
|
+
- Windows
|
|
114
|
+
- `pixi` or permission to let `install.bat` install it
|
|
115
|
+
- CUDA Toolkit if you want GPU acceleration for PyTorch and `llama-cpp-python`
|
|
116
|
+
- CMake and a working build toolchain for Python packages with native extensions
|
|
117
|
+
- Microphone and speakers/headphones configured as default audio devices
|
|
118
|
+
|
|
119
|
+
### 2. Create the Pixi environment and install packages
|
|
120
|
+
|
|
121
|
+
Run:
|
|
122
|
+
|
|
123
|
+
```powershell
|
|
124
|
+
install.bat
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
What the script does:
|
|
128
|
+
|
|
129
|
+
- installs `pixi` if it is missing
|
|
130
|
+
- creates or updates the local Pixi environment from `pixi.toml`
|
|
131
|
+
- runs the Pixi bootstrap task that installs the Python packages needed by the project
|
|
132
|
+
|
|
133
|
+
The Pixi workspace provides:
|
|
134
|
+
|
|
135
|
+
- Python 3.11
|
|
136
|
+
- `pip`
|
|
137
|
+
- `cmake`
|
|
138
|
+
- `git`
|
|
139
|
+
- `ninja`
|
|
140
|
+
- `colcon-common-extensions`
|
|
141
|
+
- `setuptools>=69.5,<80`
|
|
142
|
+
- `ros-humble-desktop`
|
|
143
|
+
|
|
144
|
+
The bootstrap task installs:
|
|
145
|
+
|
|
146
|
+
- CUDA-enabled `torch`, `torchvision`, `torchaudio`
|
|
147
|
+
- `llama-cpp-python` compiled with `GGML_CUDA=on`
|
|
148
|
+
- `pywhispercpp` compiled with CUDA flags
|
|
149
|
+
- the remaining packages from `requirements.txt`
|
|
150
|
+
|
|
151
|
+
It also enforces a `setuptools` version that stays compatible with both `sip`
|
|
152
|
+
and ROS 2 Humble's `colcon` editable Python build flow.
|
|
153
|
+
|
|
154
|
+
`requirements.txt` is still not the full environment by itself. `torch`, `llama-cpp-python`, and `pywhispercpp` are installed separately because they need a custom wheel index or CUDA-specific build flags.
|
|
155
|
+
|
|
156
|
+
### 3. Direct Pixi workflow
|
|
157
|
+
|
|
158
|
+
If you do not want to use `install.bat`, the equivalent commands are:
|
|
159
|
+
|
|
160
|
+
```powershell
|
|
161
|
+
pixi install
|
|
162
|
+
pixi run install-python-deps
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
`pixi run build` also re-checks the `setuptools` compatibility window before
|
|
166
|
+
invoking `colcon`.
|
|
167
|
+
|
|
168
|
+
### 4. Download models
|
|
169
|
+
|
|
170
|
+
Run:
|
|
171
|
+
|
|
172
|
+
```powershell
|
|
173
|
+
pixi run setup-models
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
This script:
|
|
177
|
+
|
|
178
|
+
- checks CUDA visibility in PyTorch
|
|
179
|
+
- initializes the Silero VAD model
|
|
180
|
+
- downloads or validates the configured Whisper model
|
|
181
|
+
- downloads the configured GGUF LLM from Hugging Face if missing
|
|
182
|
+
- initializes the configured Coqui TTS model
|
|
183
|
+
|
|
184
|
+
## Running The Application
|
|
185
|
+
|
|
186
|
+
Desktop UI:
|
|
187
|
+
|
|
188
|
+
```powershell
|
|
189
|
+
pixi run app
|
|
190
|
+
# equivalent: python -m voice_chatbot.app
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Terminal mode:
|
|
194
|
+
|
|
195
|
+
```powershell
|
|
196
|
+
pixi run chatbot
|
|
197
|
+
# equivalent: python -m voice_chatbot.chatbot
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
ROS 2 Humble node:
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
pixi run ros-run /absolute/path/to/config.json
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
This follows Pixi's ROS 2 workflow with `robostack-humble` packages installed into the Pixi environment.
|
|
207
|
+
|
|
208
|
+
## Testing
|
|
209
|
+
|
|
210
|
+
Run the automated unit test suite with:
|
|
211
|
+
|
|
212
|
+
```powershell
|
|
213
|
+
pixi run test
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
The repository also includes a GitHub Actions workflow that runs the same
|
|
217
|
+
pytest suite automatically on every push and pull request on both Windows
|
|
218
|
+
and Linux.
|
|
219
|
+
|
|
220
|
+
## Configuration
|
|
221
|
+
|
|
222
|
+
Configuration is defined in `config.py` and can be persisted to `config.json`.
|
|
223
|
+
|
|
224
|
+
Important settings:
|
|
225
|
+
|
|
226
|
+
- Audio: `sample_rate`, `channels`, `chunk_samples`
|
|
227
|
+
- VAD: `vad_threshold`, `min_silence_duration_ms`, `speech_pad_ms`, `min_speech_duration_ms`, `vad_pre_buffer_ms`
|
|
228
|
+
- STT: `language`, `whisper_model`, `whisper_n_threads`
|
|
229
|
+
- LLM: `llm_model_path`, `llm_n_gpu_layers`, `llm_n_ctx`, `llm_max_tokens`, `llm_temperature`, `llm_system_prompt`, `max_conversation_turns`
|
|
230
|
+
- TTS: `tts_model`, `tts_gpu`
|
|
231
|
+
- Download metadata: `llm_repo_id`, `llm_filename`
|
|
232
|
+
|
|
233
|
+
### Configuration behavior to know
|
|
234
|
+
|
|
235
|
+
- The GUI loads from `config.json` through `Config.load()` and writes the current sidebar values back to disk when you start the worker.
|
|
236
|
+
- The CLI and model setup script also load `config.json` by default, so they now use the same persisted settings as the GUI unless you edit the package code.
|
|
237
|
+
|
|
238
|
+
## GUI Behavior
|
|
239
|
+
|
|
240
|
+
The desktop app provides:
|
|
241
|
+
|
|
242
|
+
- a settings sidebar for language, Whisper, LLM, TTS, and VAD parameters
|
|
243
|
+
- a chat panel for user and assistant messages
|
|
244
|
+
- a system log panel that receives redirected `stdout` and `stderr`
|
|
245
|
+
- start, stop, restart, and clear-chat controls
|
|
246
|
+
|
|
247
|
+
Behavior details from the current code:
|
|
248
|
+
|
|
249
|
+
- Settings remain editable while the worker is running.
|
|
250
|
+
- Changing settings during runtime requires `Käynnistä uudelleen` to rebuild the worker with the new values.
|
|
251
|
+
- `Tyhjennä keskustelu` clears the visible chat panel only. It does not clear the LLM conversation history; history resets only when a new `ChatLLM` instance is created, such as after restart.
|
|
252
|
+
|
|
253
|
+
## Architecture Notes
|
|
254
|
+
|
|
255
|
+
- The VAD implementation keeps a rolling pre-buffer so the first syllables are not clipped before speech onset is confirmed.
|
|
256
|
+
- After TTS playback, the app clears queued microphone chunks and resets VAD state to reduce the chance of transcribing its own synthesized output.
|
|
257
|
+
- The LLM wrapper stores alternating user and assistant messages and trims the oldest turns once `max_conversation_turns` is exceeded.
|
|
258
|
+
- The GUI runs model loading and the audio loop in `ChatbotWorker`, a `QThread`, so the main UI thread stays responsive.
|
|
259
|
+
|
|
260
|
+
## Operational Limitations
|
|
261
|
+
|
|
262
|
+
- The automated test suite covers individual modules only; there is no end-to-end test of the full audio pipeline.
|
|
263
|
+
- Audio device selection is not exposed; capture and playback use the default system devices through `sounddevice`.
|
|
264
|
+
- The code and UI text are partly Finnish and partly English.
|
|
265
|
+
- Model initialization happens synchronously inside the worker or CLI startup path, so startup cost depends on model size.
|
|
266
|
+
- ROS 2 support assumes the Robostack Humble packages and the ML/audio dependencies can coexist in the same Pixi environment.
|
|
267
|
+
|
|
268
|
+
## Suggested First Run
|
|
269
|
+
|
|
270
|
+
1. Run `install.bat`.
|
|
271
|
+
2. Run `pixi run setup-models`.
|
|
272
|
+
3. Start `pixi run app`.
|
|
273
|
+
4. Confirm the GGUF model path in the left sidebar.
|
|
274
|
+
5. Click `Käynnistä` and watch the system log for CUDA and model load status.
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# Voice Chatbot
|
|
2
|
+
|
|
3
|
+
Local voice chatbot for Windows with a Finnish-first default configuration. The application captures microphone audio, detects speech with Silero VAD, transcribes it with Whisper, generates a reply with a local GGUF LLM, and speaks the reply with Coqui TTS.
|
|
4
|
+
|
|
5
|
+
The repository's application code now lives under the `voice_chatbot/`
|
|
6
|
+
package. The repo root keeps thin compatibility wrappers (`app.py`,
|
|
7
|
+
`chatbot.py`, `ros_app.py`, `setup_models.py`) so older commands still work.
|
|
8
|
+
|
|
9
|
+
Primary entry points:
|
|
10
|
+
|
|
11
|
+
- `python -m voice_chatbot.app`: PySide6 desktop UI for configuring and running the chatbot.
|
|
12
|
+
- `python -m voice_chatbot.chatbot`: terminal-only runner with the same audio pipeline.
|
|
13
|
+
- `python -m voice_chatbot.ros_app`: ROS-connected PySide6 GUI.
|
|
14
|
+
- `voice_chatbot_ros/node.py`: ROS 2 Humble node that exposes the pipeline through ROS topics and a service.
|
|
15
|
+
|
|
16
|
+
## What The Code Does
|
|
17
|
+
|
|
18
|
+
Runtime flow:
|
|
19
|
+
|
|
20
|
+
1. `AudioIO` captures 16 kHz mono microphone audio in fixed-size chunks.
|
|
21
|
+
2. `VoiceActivityDetector` buffers audio until Silero VAD reports speech start and end.
|
|
22
|
+
3. `SpeechToText` transcribes the captured utterance with `pywhispercpp`.
|
|
23
|
+
4. `ChatLLM` sends the user text plus recent conversation history to `llama-cpp-python`.
|
|
24
|
+
5. `TextToSpeech` synthesizes the assistant reply with Coqui TTS.
|
|
25
|
+
6. `AudioIO` plays the generated speech back through the default output device.
|
|
26
|
+
|
|
27
|
+
The GUI wraps this pipeline in a background `QThread` and exposes model and runtime settings in a sidebar.
|
|
28
|
+
|
|
29
|
+
## Repository Layout
|
|
30
|
+
|
|
31
|
+
| Path | Purpose |
|
|
32
|
+
| --- | --- |
|
|
33
|
+
| `voice_chatbot/` | Main Python application package for GUI, CLI, config, audio, VAD, STT, LLM, and TTS code |
|
|
34
|
+
| `app.py`, `chatbot.py`, `ros_app.py`, `setup_models.py` | Thin compatibility entry points that call into `voice_chatbot/` |
|
|
35
|
+
| `install.bat` | Windows setup script for Python packages and CUDA builds |
|
|
36
|
+
| `pixi.toml` | Pixi workspace manifest for the base toolchain and common tasks |
|
|
37
|
+
| `pixi.lock` | Resolved Pixi lockfile for the base environment |
|
|
38
|
+
| `tools/install_python_windows.bat` | Pixi bootstrap script for CUDA-enabled Python packages |
|
|
39
|
+
| `tools/install_python_linux.sh` | Pixi bootstrap script for Linux Python packages |
|
|
40
|
+
| `config.json` | Persisted GUI configuration |
|
|
41
|
+
| `voice_chatbot_ros/` | ROS 2 Humble package and node implementation |
|
|
42
|
+
| `launch/` | ROS 2 launch file |
|
|
43
|
+
| `package.xml`, `setup.py`, `setup.cfg` | ROS 2 `ament_python` package metadata |
|
|
44
|
+
|
|
45
|
+
More implementation detail is in [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
|
|
46
|
+
ROS-specific usage is in [docs/ROS2.md](docs/ROS2.md).
|
|
47
|
+
|
|
48
|
+
## Environment Assumptions
|
|
49
|
+
|
|
50
|
+
The codebase is currently optimized for a Windows workstation with local GPU inference:
|
|
51
|
+
|
|
52
|
+
- `app.py` and `chatbot.py` add CUDA DLL paths from `CUDA_PATH` or `D:\cuda`.
|
|
53
|
+
- `install.bat` installs CUDA-enabled PyTorch, `llama-cpp-python`, and `pywhispercpp`.
|
|
54
|
+
- The default LLM is a GGUF file under `models/`.
|
|
55
|
+
- The default voice assistant language is Finnish (`fi`).
|
|
56
|
+
|
|
57
|
+
The code can fall back to CPU for some components, but the intended deployment is local GPU acceleration.
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
The recommended setup path is now Pixi. The repository ships with a `pixi.toml` workspace manifest and uses Pixi to install both the base Python toolchain and the ROS 2 Humble packages.
|
|
62
|
+
|
|
63
|
+
### 1. System prerequisites
|
|
64
|
+
|
|
65
|
+
- Windows
|
|
66
|
+
- `pixi` or permission to let `install.bat` install it
|
|
67
|
+
- CUDA Toolkit if you want GPU acceleration for PyTorch and `llama-cpp-python`
|
|
68
|
+
- CMake and a working build toolchain for Python packages with native extensions
|
|
69
|
+
- Microphone and speakers/headphones configured as default audio devices
|
|
70
|
+
|
|
71
|
+
### 2. Create the Pixi environment and install packages
|
|
72
|
+
|
|
73
|
+
Run:
|
|
74
|
+
|
|
75
|
+
```powershell
|
|
76
|
+
install.bat
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
What the script does:
|
|
80
|
+
|
|
81
|
+
- installs `pixi` if it is missing
|
|
82
|
+
- creates or updates the local Pixi environment from `pixi.toml`
|
|
83
|
+
- runs the Pixi bootstrap task that installs the Python packages needed by the project
|
|
84
|
+
|
|
85
|
+
The Pixi workspace provides:
|
|
86
|
+
|
|
87
|
+
- Python 3.11
|
|
88
|
+
- `pip`
|
|
89
|
+
- `cmake`
|
|
90
|
+
- `git`
|
|
91
|
+
- `ninja`
|
|
92
|
+
- `colcon-common-extensions`
|
|
93
|
+
- `setuptools>=69.5,<80`
|
|
94
|
+
- `ros-humble-desktop`
|
|
95
|
+
|
|
96
|
+
The bootstrap task installs:
|
|
97
|
+
|
|
98
|
+
- CUDA-enabled `torch`, `torchvision`, `torchaudio`
|
|
99
|
+
- `llama-cpp-python` compiled with `GGML_CUDA=on`
|
|
100
|
+
- `pywhispercpp` compiled with CUDA flags
|
|
101
|
+
- the remaining packages from `requirements.txt`
|
|
102
|
+
|
|
103
|
+
It also enforces a `setuptools` version that stays compatible with both `sip`
|
|
104
|
+
and ROS 2 Humble's `colcon` editable Python build flow.
|
|
105
|
+
|
|
106
|
+
`requirements.txt` is still not the full environment by itself. `torch`, `llama-cpp-python`, and `pywhispercpp` are installed separately because they need a custom wheel index or CUDA-specific build flags.
|
|
107
|
+
|
|
108
|
+
### 3. Direct Pixi workflow
|
|
109
|
+
|
|
110
|
+
If you do not want to use `install.bat`, the equivalent commands are:
|
|
111
|
+
|
|
112
|
+
```powershell
|
|
113
|
+
pixi install
|
|
114
|
+
pixi run install-python-deps
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
`pixi run build` also re-checks the `setuptools` compatibility window before
|
|
118
|
+
invoking `colcon`.
|
|
119
|
+
|
|
120
|
+
### 4. Download models
|
|
121
|
+
|
|
122
|
+
Run:
|
|
123
|
+
|
|
124
|
+
```powershell
|
|
125
|
+
pixi run setup-models
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
This script:
|
|
129
|
+
|
|
130
|
+
- checks CUDA visibility in PyTorch
|
|
131
|
+
- initializes the Silero VAD model
|
|
132
|
+
- downloads or validates the configured Whisper model
|
|
133
|
+
- downloads the configured GGUF LLM from Hugging Face if missing
|
|
134
|
+
- initializes the configured Coqui TTS model
|
|
135
|
+
|
|
136
|
+
## Running The Application
|
|
137
|
+
|
|
138
|
+
Desktop UI:
|
|
139
|
+
|
|
140
|
+
```powershell
|
|
141
|
+
pixi run app
|
|
142
|
+
# equivalent: python -m voice_chatbot.app
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Terminal mode:
|
|
146
|
+
|
|
147
|
+
```powershell
|
|
148
|
+
pixi run chatbot
|
|
149
|
+
# equivalent: python -m voice_chatbot.chatbot
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
ROS 2 Humble node:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
pixi run ros-run /absolute/path/to/config.json
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
This follows Pixi's ROS 2 workflow with `robostack-humble` packages installed into the Pixi environment.
|
|
159
|
+
|
|
160
|
+
## Testing
|
|
161
|
+
|
|
162
|
+
Run the automated unit test suite with:
|
|
163
|
+
|
|
164
|
+
```powershell
|
|
165
|
+
pixi run test
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
The repository also includes a GitHub Actions workflow that runs the same
|
|
169
|
+
pytest suite automatically on every push and pull request on both Windows
|
|
170
|
+
and Linux.
|
|
171
|
+
|
|
172
|
+
## Configuration
|
|
173
|
+
|
|
174
|
+
Configuration is defined in `config.py` and can be persisted to `config.json`.
|
|
175
|
+
|
|
176
|
+
Important settings:
|
|
177
|
+
|
|
178
|
+
- Audio: `sample_rate`, `channels`, `chunk_samples`
|
|
179
|
+
- VAD: `vad_threshold`, `min_silence_duration_ms`, `speech_pad_ms`, `min_speech_duration_ms`, `vad_pre_buffer_ms`
|
|
180
|
+
- STT: `language`, `whisper_model`, `whisper_n_threads`
|
|
181
|
+
- LLM: `llm_model_path`, `llm_n_gpu_layers`, `llm_n_ctx`, `llm_max_tokens`, `llm_temperature`, `llm_system_prompt`, `max_conversation_turns`
|
|
182
|
+
- TTS: `tts_model`, `tts_gpu`
|
|
183
|
+
- Download metadata: `llm_repo_id`, `llm_filename`
|
|
184
|
+
|
|
185
|
+
### Configuration behavior to know
|
|
186
|
+
|
|
187
|
+
- The GUI loads from `config.json` through `Config.load()` and writes the current sidebar values back to disk when you start the worker.
|
|
188
|
+
- The CLI and model setup script also load `config.json` by default, so they now use the same persisted settings as the GUI unless you edit the package code.
|
|
189
|
+
|
|
190
|
+
## GUI Behavior
|
|
191
|
+
|
|
192
|
+
The desktop app provides:
|
|
193
|
+
|
|
194
|
+
- a settings sidebar for language, Whisper, LLM, TTS, and VAD parameters
|
|
195
|
+
- a chat panel for user and assistant messages
|
|
196
|
+
- a system log panel that receives redirected `stdout` and `stderr`
|
|
197
|
+
- start, stop, restart, and clear-chat controls
|
|
198
|
+
|
|
199
|
+
Behavior details from the current code:
|
|
200
|
+
|
|
201
|
+
- Settings remain editable while the worker is running.
|
|
202
|
+
- Changing settings during runtime requires `Käynnistä uudelleen` to rebuild the worker with the new values.
|
|
203
|
+
- `Tyhjennä keskustelu` clears the visible chat panel only. It does not clear the LLM conversation history; history resets only when a new `ChatLLM` instance is created, such as after restart.
|
|
204
|
+
|
|
205
|
+
## Architecture Notes
|
|
206
|
+
|
|
207
|
+
- The VAD implementation keeps a rolling pre-buffer so the first syllables are not clipped before speech onset is confirmed.
|
|
208
|
+
- After TTS playback, the app clears queued microphone chunks and resets VAD state to reduce the chance of transcribing its own synthesized output.
|
|
209
|
+
- The LLM wrapper stores alternating user and assistant messages and trims the oldest turns once `max_conversation_turns` is exceeded.
|
|
210
|
+
- The GUI runs model loading and the audio loop in `ChatbotWorker`, a `QThread`, so the main UI thread stays responsive.
|
|
211
|
+
|
|
212
|
+
## Operational Limitations
|
|
213
|
+
|
|
214
|
+
- The automated test suite covers individual modules only; there is no end-to-end test of the full audio pipeline.
|
|
215
|
+
- Audio device selection is not exposed; capture and playback use the default system devices through `sounddevice`.
|
|
216
|
+
- The code and UI text are partly Finnish and partly English.
|
|
217
|
+
- Model initialization happens synchronously inside the worker or CLI startup path, so startup cost depends on model size.
|
|
218
|
+
- ROS 2 support assumes the Robostack Humble packages and the ML/audio dependencies can coexist in the same Pixi environment.
|
|
219
|
+
|
|
220
|
+
## Suggested First Run
|
|
221
|
+
|
|
222
|
+
1. Run `install.bat`.
|
|
223
|
+
2. Run `pixi run setup-models`.
|
|
224
|
+
3. Start `pixi run app`.
|
|
225
|
+
4. Confirm the GGUF model path in the left sidebar.
|
|
226
|
+
5. Click `Käynnistä` and watch the system log for CUDA and model load status.
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ROS 2 launch file for the split voice chatbot nodes.
|
|
3
|
+
|
|
4
|
+
Launches three nodes in the ``/voice_chatbot`` namespace:
|
|
5
|
+
|
|
6
|
+
- ``voice_stt`` – microphone capture, VAD, Whisper STT
|
|
7
|
+
- ``voice_llm`` – LLM chat inference (LLaMA/GGUF)
|
|
8
|
+
- ``voice_tts`` – Coqui TTS synthesis + audio playback
|
|
9
|
+
|
|
10
|
+
All three share the same ``config_path`` and ``load_config_file``
|
|
11
|
+
parameters so they read the same ``config.json``.
|
|
12
|
+
|
|
13
|
+
Usage::
|
|
14
|
+
|
|
15
|
+
pixi run ros-launch
|
|
16
|
+
# or: ros2 launch voice_chatbot_ros voice_chatbot.launch.py
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from launch.actions import DeclareLaunchArgument
|
|
20
|
+
from launch.substitutions import LaunchConfiguration
|
|
21
|
+
from launch_ros.actions import Node
|
|
22
|
+
from launch_ros.parameter_descriptions import ParameterValue
|
|
23
|
+
|
|
24
|
+
from launch import LaunchDescription
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def generate_launch_description() -> LaunchDescription:
    """Assemble the launch description for the three voice-chatbot nodes.

    Declares the ``config_path`` and ``load_config_file`` launch arguments,
    then starts the STT, LLM, and TTS nodes inside the ``voice_chatbot``
    namespace. All three nodes receive the same parameter values so they
    read the same ``config.json``.
    """
    config_path = LaunchConfiguration("config_path")
    load_config_file = LaunchConfiguration("load_config_file")

    # One shared parameter dict: every node resolves the same config file
    # and the same load flag.
    shared_params = [
        {
            "config_path": ParameterValue(config_path, value_type=str),
            "load_config_file": ParameterValue(load_config_file, value_type=bool),
        }
    ]

    def _make_node(executable: str, name: str) -> Node:
        """Build one pipeline node with the settings common to all three."""
        return Node(
            package="voice_chatbot_ros",
            executable=executable,
            name=name,
            namespace="voice_chatbot",
            output="screen",
            parameters=shared_params,
        )

    actions = [
        DeclareLaunchArgument("config_path", default_value="config.json"),
        DeclareLaunchArgument("load_config_file", default_value="true"),
        # STT node: microphone capture, VAD, speech-to-text.
        _make_node("voice_stt_node", "voice_stt"),
        # LLM node: chat inference.
        _make_node("voice_llm_node", "voice_llm"),
        # TTS node: text-to-speech synthesis and audio playback.
        _make_node("voice_tts_node", "voice_tts"),
    ]
    return LaunchDescription(actions)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
<?xml version="1.0"?>
|
|
2
|
+
<package format="3">
|
|
3
|
+
<name>voice_chatbot_ros</name>
|
|
4
|
+
<version>0.1.0</version>
|
|
5
|
+
<description>ROS 2 Humble integration for the local voice chatbot pipeline.</description>
|
|
6
|
+
|
|
7
|
+
<maintainer email="noreply@example.com">Aapo</maintainer>
|
|
8
|
+
<license>MIT</license>
|
|
9
|
+
|
|
10
|
+
<buildtool_depend>ament_python</buildtool_depend>
|
|
11
|
+
|
|
12
|
+
<exec_depend>rclpy</exec_depend>
|
|
13
|
+
<exec_depend>std_msgs</exec_depend>
|
|
14
|
+
<exec_depend>std_srvs</exec_depend>
|
|
15
|
+
<exec_depend>launch</exec_depend>
|
|
16
|
+
<exec_depend>launch_ros</exec_depend>
|
|
17
|
+
|
|
18
|
+
<export>
|
|
19
|
+
<build_type>ament_python</build_type>
|
|
20
|
+
</export>
|
|
21
|
+
</package>
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69.5"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "voice-chatbot"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Local speech-to-speech voice assistant with PySide6 GUI, CLI, and ROS 2 integration."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Aapo", email = "noreply@example.com" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["voice-assistant", "chatbot", "speech-to-text", "text-to-speech", "llm"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"numpy",
|
|
28
|
+
"sounddevice",
|
|
29
|
+
"huggingface-hub",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
stt = [
|
|
34
|
+
"faster-whisper",
|
|
35
|
+
]
|
|
36
|
+
llm = [
|
|
37
|
+
"llama-cpp-python",
|
|
38
|
+
]
|
|
39
|
+
tts = [
|
|
40
|
+
"coqui-tts[codec]",
|
|
41
|
+
]
|
|
42
|
+
vad = [
|
|
43
|
+
"silero-vad",
|
|
44
|
+
]
|
|
45
|
+
gui = [
|
|
46
|
+
"PySide6>=6.7,<7",
|
|
47
|
+
]
|
|
48
|
+
all = [
|
|
49
|
+
"voice-chatbot[stt,llm,tts,vad,gui]",
|
|
50
|
+
]
|
|
51
|
+
dev = [
|
|
52
|
+
"voice-chatbot[all]",
|
|
53
|
+
"pytest>=9,<10",
|
|
54
|
+
"pytest-cov>=5,<8",
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
[project.scripts]
|
|
58
|
+
voice-chatbot = "voice_chatbot.chatbot:main"
|
|
59
|
+
voice-chatbot-app = "voice_chatbot.app:main"
|
|
60
|
+
voice-chatbot-setup-models = "voice_chatbot.setup_models:main"
|
|
61
|
+
|
|
62
|
+
[project.urls]
|
|
63
|
+
Homepage = "https://github.com/Aapo2001/python-chatbot"
|
|
64
|
+
Documentation = "https://docs-site-kappa-coral.vercel.app"
|
|
65
|
+
Repository = "https://github.com/Aapo2001/python-chatbot"
|
|
66
|
+
Issues = "https://github.com/Aapo2001/python-chatbot/issues"
|
|
67
|
+
|
|
68
|
+
[tool.setuptools.packages.find]
|
|
69
|
+
include = ["voice_chatbot", "voice_chatbot.*"]
|
|
70
|
+
exclude = ["voice_chatbot_ros", "voice_chatbot_ros.*", "tests", "tests.*"]
|
|
71
|
+
|
|
72
|
+
[tool.pytest.ini_options]
|
|
73
|
+
testpaths = ["tests"]
|
|
74
|
+
addopts = "-ra"
|