openvoiceui 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. package/.env.example +104 -0
  2. package/Dockerfile +30 -0
  3. package/LICENSE +21 -0
  4. package/README.md +638 -0
  5. package/SETUP.md +360 -0
  6. package/app.py +232 -0
  7. package/auto-approve-devices.js +111 -0
  8. package/cli/index.js +372 -0
  9. package/config/__init__.py +4 -0
  10. package/config/default.yaml +43 -0
  11. package/config/flags.yaml +67 -0
  12. package/config/loader.py +203 -0
  13. package/config/providers.yaml +71 -0
  14. package/config/speech_normalization.yaml +182 -0
  15. package/config/theme.json +4 -0
  16. package/data/greetings.json +25 -0
  17. package/default-pages/ai-image-creator.html +915 -0
  18. package/default-pages/bulk-image-uploader.html +492 -0
  19. package/default-pages/desktop.html +2865 -0
  20. package/default-pages/file-explorer.html +854 -0
  21. package/default-pages/interactive-map.html +655 -0
  22. package/default-pages/style-guide.html +1005 -0
  23. package/default-pages/website-setup.html +1623 -0
  24. package/deploy/openclaw/Dockerfile +46 -0
  25. package/deploy/openvoiceui.service +30 -0
  26. package/deploy/setup-nginx.sh +50 -0
  27. package/deploy/setup-sudo.sh +306 -0
  28. package/deploy/skill-runner/Dockerfile +19 -0
  29. package/deploy/skill-runner/requirements.txt +14 -0
  30. package/deploy/skill-runner/server.py +269 -0
  31. package/deploy/supertonic/Dockerfile +22 -0
  32. package/deploy/supertonic/server.py +79 -0
  33. package/docker-compose.pinokio.yml +11 -0
  34. package/docker-compose.yml +59 -0
  35. package/greetings.json +25 -0
  36. package/index.html +65 -0
  37. package/inject-device-identity.js +142 -0
  38. package/package.json +82 -0
  39. package/profiles/default.json +114 -0
  40. package/profiles/manager.py +354 -0
  41. package/profiles/schema.json +337 -0
  42. package/prompts/voice-system-prompt.md +149 -0
  43. package/providers/__init__.py +39 -0
  44. package/providers/base.py +63 -0
  45. package/providers/llm/__init__.py +12 -0
  46. package/providers/llm/base.py +71 -0
  47. package/providers/llm/clawdbot_provider.py +112 -0
  48. package/providers/llm/zai_provider.py +115 -0
  49. package/providers/registry.py +320 -0
  50. package/providers/stt/__init__.py +12 -0
  51. package/providers/stt/base.py +58 -0
  52. package/providers/stt/webspeech_provider.py +49 -0
  53. package/providers/stt/whisper_provider.py +100 -0
  54. package/providers/tts/__init__.py +20 -0
  55. package/providers/tts/base.py +91 -0
  56. package/providers/tts/groq_provider.py +74 -0
  57. package/providers/tts/supertonic_provider.py +72 -0
  58. package/requirements.txt +38 -0
  59. package/routes/__init__.py +10 -0
  60. package/routes/admin.py +515 -0
  61. package/routes/canvas.py +1315 -0
  62. package/routes/chat.py +51 -0
  63. package/routes/conversation.py +2158 -0
  64. package/routes/elevenlabs_hybrid.py +306 -0
  65. package/routes/greetings.py +98 -0
  66. package/routes/icons.py +279 -0
  67. package/routes/image_gen.py +364 -0
  68. package/routes/instructions.py +190 -0
  69. package/routes/music.py +838 -0
  70. package/routes/onboarding.py +43 -0
  71. package/routes/pi.py +62 -0
  72. package/routes/profiles.py +215 -0
  73. package/routes/report_issue.py +68 -0
  74. package/routes/static_files.py +533 -0
  75. package/routes/suno.py +664 -0
  76. package/routes/theme.py +81 -0
  77. package/routes/transcripts.py +199 -0
  78. package/routes/vision.py +348 -0
  79. package/routes/workspace.py +288 -0
  80. package/server.py +1510 -0
  81. package/services/__init__.py +1 -0
  82. package/services/auth.py +143 -0
  83. package/services/canvas_versioning.py +239 -0
  84. package/services/db_pool.py +107 -0
  85. package/services/gateway.py +16 -0
  86. package/services/gateway_manager.py +333 -0
  87. package/services/gateways/__init__.py +12 -0
  88. package/services/gateways/base.py +110 -0
  89. package/services/gateways/compat.py +264 -0
  90. package/services/gateways/openclaw.py +1134 -0
  91. package/services/health.py +100 -0
  92. package/services/memory_client.py +455 -0
  93. package/services/paths.py +26 -0
  94. package/services/speech_normalizer.py +285 -0
  95. package/services/tts.py +270 -0
  96. package/setup-config.js +262 -0
  97. package/sounds/air_horn.mp3 +0 -0
  98. package/sounds/bruh.mp3 +0 -0
  99. package/sounds/crowd_cheer.mp3 +0 -0
  100. package/sounds/gunshot.mp3 +0 -0
  101. package/sounds/impact.mp3 +0 -0
  102. package/sounds/lets_go.mp3 +0 -0
  103. package/sounds/record_stop.mp3 +0 -0
  104. package/sounds/rewind.mp3 +0 -0
  105. package/sounds/sad_trombone.mp3 +0 -0
  106. package/sounds/scratch_long.mp3 +0 -0
  107. package/sounds/yeah.mp3 +0 -0
  108. package/src/adapters/ClawdBotAdapter.js +264 -0
  109. package/src/adapters/_template.js +133 -0
  110. package/src/adapters/elevenlabs-classic.js +841 -0
  111. package/src/adapters/elevenlabs-hybrid.js +812 -0
  112. package/src/adapters/hume-evi.js +676 -0
  113. package/src/admin.html +1339 -0
  114. package/src/app.js +8802 -0
  115. package/src/core/Config.js +173 -0
  116. package/src/core/EmotionEngine.js +307 -0
  117. package/src/core/EventBridge.js +180 -0
  118. package/src/core/EventBus.js +117 -0
  119. package/src/core/VoiceSession.js +607 -0
  120. package/src/face/BaseFace.js +259 -0
  121. package/src/face/EyeFace.js +208 -0
  122. package/src/face/HaloSmokeFace.js +509 -0
  123. package/src/face/manifest.json +27 -0
  124. package/src/face/previews/eyes.svg +16 -0
  125. package/src/face/previews/orb.svg +29 -0
  126. package/src/features/MusicPlayer.js +620 -0
  127. package/src/features/Soundboard.js +128 -0
  128. package/src/providers/DeepgramSTT.js +472 -0
  129. package/src/providers/DeepgramStreamingSTT.js +766 -0
  130. package/src/providers/GroqSTT.js +559 -0
  131. package/src/providers/TTSPlayer.js +323 -0
  132. package/src/providers/WebSpeechSTT.js +479 -0
  133. package/src/providers/tts/BaseTTSProvider.js +81 -0
  134. package/src/providers/tts/HumeProvider.js +77 -0
  135. package/src/providers/tts/SupertonicProvider.js +174 -0
  136. package/src/providers/tts/index.js +140 -0
  137. package/src/shell/adapter-registry.js +154 -0
  138. package/src/shell/caller-bridge.js +35 -0
  139. package/src/shell/camera-bridge.js +28 -0
  140. package/src/shell/canvas-bridge.js +32 -0
  141. package/src/shell/commercial-bridge.js +44 -0
  142. package/src/shell/face-bridge.js +44 -0
  143. package/src/shell/music-bridge.js +60 -0
  144. package/src/shell/orchestrator.js +233 -0
  145. package/src/shell/profile-discovery.js +303 -0
  146. package/src/shell/sounds-bridge.js +28 -0
  147. package/src/shell/transcript-bridge.js +61 -0
  148. package/src/shell/waveform-bridge.js +33 -0
  149. package/src/styles/base.css +2862 -0
  150. package/src/styles/face.css +417 -0
  151. package/src/styles/pi-overrides.css +89 -0
  152. package/src/styles/theme-dark.css +67 -0
  153. package/src/test-tts.html +175 -0
  154. package/src/ui/AppShell.js +544 -0
  155. package/src/ui/ProfileSwitcher.js +228 -0
  156. package/src/ui/SessionControl.js +240 -0
  157. package/src/ui/face/FacePicker.js +195 -0
  158. package/src/ui/face/FaceRenderer.js +309 -0
  159. package/src/ui/settings/PlaylistEditor.js +366 -0
  160. package/src/ui/settings/SettingsPanel.css +684 -0
  161. package/src/ui/settings/SettingsPanel.js +419 -0
  162. package/src/ui/settings/TTSVoicePreview.js +210 -0
  163. package/src/ui/themes/ThemeManager.js +213 -0
  164. package/src/ui/visualizers/BaseVisualizer.js +29 -0
  165. package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
  166. package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
  167. package/static/emulators/jsdos/js-dos.css +1 -0
  168. package/static/emulators/jsdos/js-dos.js +22 -0
  169. package/static/favicon.svg +55 -0
  170. package/static/icons/apple-touch-icon.png +0 -0
  171. package/static/icons/favicon-32.png +0 -0
  172. package/static/icons/icon-192.png +0 -0
  173. package/static/icons/icon-512.png +0 -0
  174. package/static/install.html +449 -0
  175. package/static/manifest.json +26 -0
  176. package/static/sw.js +21 -0
  177. package/tts_providers/__init__.py +136 -0
  178. package/tts_providers/base_provider.py +319 -0
  179. package/tts_providers/groq_provider.py +155 -0
  180. package/tts_providers/hume_provider.py +226 -0
  181. package/tts_providers/providers_config.json +119 -0
  182. package/tts_providers/qwen3_provider.py +371 -0
  183. package/tts_providers/resemble_provider.py +315 -0
  184. package/tts_providers/supertonic_provider.py +557 -0
  185. package/tts_providers/supertonic_tts.py +399 -0
package/README.md ADDED
@@ -0,0 +1,638 @@
1
+ <p align="center">
2
+ <img src="docs/banner.png" alt="OpenVoiceUI Banner" />
3
+ </p>
4
+
5
+ # OpenVoiceUI
6
+
7
+ A plug-and-play browser-based voice agent platform. Connect any LLM, any TTS provider, and any AI framework — with a built-in music player, AI music generation, and a live web canvas display system.
8
+
9
+ > **Hosting notice:** OpenVoiceUI is designed to run on a dedicated VPS (see [Hetzner setup](#hosting-multiple-users-hetzner-vps) below). Running it on a local machine is possible but not recommended — microphone access, SSL, and persistent uptime all work significantly better on a hosted server. For the best experience, deploy to a VPS before using it seriously.
10
+
11
+ ---
12
+
13
+ ## What It Is
14
+
15
+ OpenVoiceUI is a modular voice UI shell. You bring the intelligence (LLM + TTS), it handles everything else:
16
+
17
+ - **Voice I/O** — browser-based STT with push-to-talk, wake words, or continuous mode
18
+ - **Animated Faces** — multiple face modes (eye-face avatar, halo smoke orb) with mood states, thinking animations, and audio-reactive waveform mouth
19
+ - **Web Canvas** — fullscreen iframe display system for AI-generated HTML pages, dashboards, and reports with interactive links, page versioning, and external URL display
20
+ - **Desktop OS Interface** — full desktop-like canvas experience with right-click context menus, wallpaper upload, trash, shortcuts, and folder creation (auto-seeded as default pages)
21
+ - **Music Player** — background music with crossfade, AI ducking, and AI trigger commands
22
+ - **Music Generation** — AI-generated track support via Suno or fal.ai integrations
23
+ - **AI Image Generation** — HuggingFace-powered image generation with FLUX.1 and SD3.5 models, quality presets, and aspect ratio control
24
+ - **Voice Cloning** — clone and generate speech with custom voice embeddings via fal.ai Qwen3-TTS
25
+ - **Soundboard** — configurable sound effects with text-trigger detection
26
+ - **Agent Profiles** — switch personas/providers without restart via JSON config
27
+ - **Agent Activity Chip** — live action ticker showing what the agent is doing in real-time
28
+ - **Live Instruction Editor** — hot-reload system prompt from the admin panel
29
+ - **Admin Dashboard** — session control, playlist editor, face picker, theme editor
30
+ - **Issue Reporter** — in-app bug/feedback reporting modal with session context (development tool)
31
+ - **Server-Side Settings** — voice, face, and TTS preferences persist across devices via server (no localStorage)
32
+ - **Document Upload Extraction** — PDF and document text extraction from uploads
33
+ - **Empty Response Auto-Recovery** — auto-retry on empty LLM responses with Z.AI direct fallback and session auto-recovery
34
+
35
+ ---
36
+
37
+ ## Open Framework Philosophy
38
+
39
+ OpenVoiceUI is built as an **open voice UI shell** — it doesn't lock you into any specific LLM, TTS engine, STT provider, or AI framework. Every layer is a pluggable slot. Drop in a gateway plugin, a TTS provider, or a custom adapter and it just works. The built-in providers are defaults, not requirements.
40
+
41
+ ### LLM / Gateway Providers
42
+ Connect to any LLM via a gateway plugin — OpenClaw is built-in, others are drop-in:
43
+
44
+ | Provider | Status |
45
+ |----------|--------|
46
+ | Any OpenClaw-compatible gateway | Built-in |
47
+ | Z.AI (GLM models) | Built-in |
48
+ | OpenAI-compatible APIs | Via adapter |
49
+ | Ollama (local) | Via adapter |
50
+ | Hume EVI | Built-in adapter |
51
+ | LangChain, AutoGen, or a custom agent framework | Via gateway plugin |
52
+ | **Any LLM or framework you build a plugin for** | Drop a folder in `plugins/` |
53
+
54
+ ### TTS Providers
55
+ | Provider | Type | Cost |
56
+ |----------|------|------|
57
+ | **Supertonic** | Local ONNX | Free |
58
+ | **Groq Orpheus** | Cloud, fast | ~$0.05/min |
59
+ | **Qwen3-TTS** | Cloud, expressive | ~$0.003/min |
60
+ | **Hume EVI** | Cloud, emotion-aware | ~$0.032/min |
61
+ | **Any TTS engine you implement** | Local or cloud | Your choice |
62
+
63
+ ### STT Providers
64
+ | Provider | Type | Cost | Notes |
65
+ |----------|------|------|-------|
66
+ | **Web Speech API** | Browser-native | Free | No API key needed, Chrome/Edge only |
67
+ | **Deepgram Nova-2** | Cloud streaming | Pay-per-use | Reliable paid alternative, real-time WebSocket streaming |
68
+ | **Groq Whisper** | Cloud batch | Free tier available | Fast batch transcription via Groq API |
69
+ | **Whisper** | Local | Free | Self-hosted Whisper model |
70
+ | **Hume EVI** | Cloud, full-duplex | ~$0.032/min | Emotion-aware, bidirectional |
71
+ | **Any STT provider** | Via custom adapter | Your choice | Implement the STT adapter interface |
72
+
73
+ ---
74
+
75
+ ## Features
76
+
77
+ ### Voice Modes
78
+ - **Continuous** — always listening, silence timeout triggers send
79
+ - **Push-to-Talk** — hold button or configurable hotkey (keyboard/mouse)
80
+ - **Listen** — passive monitoring mode
81
+ - **Sleep** — goodbye detection pauses the agent, wake word reactivates
82
+ - **Agent-to-Agent** — A2A communication panel
83
+
84
+ ### Canvas System
85
+ - AI can open and display any HTML page in a fullscreen overlay
86
+ - Manifest-based page discovery with search, categories, and starred pages
87
+ - Triggered via `[CANVAS:page-id]` tags in AI responses
88
+ - Real-time SSE updates from server
89
+ - **Interactive links** — canvas pages communicate with the app via postMessage bridge (navigate, speak, open URLs)
90
+ - **Page versioning** — automatic `.versions/` backup on every change with restore API
91
+ - **External URL display** — load any URL in the canvas iframe via `[CANVAS_URL:https://...]`
92
+ - **Default pages** — desktop OS and file explorer pages auto-seeded on startup
93
+ - **Admin lock/URL columns** — admin panel shows lock state and copyable URLs for each page
94
+ - **Padded mode** — configurable edge padding on canvas pages
95
+ - **Error auto-injection** — canvas pages get an error bridge for debugging in the ActionConsole
96
+ - **Content Security Policy** — restrictive CSP on canvas pages to prevent XSS
97
+
98
+ ### STT Improvements
99
+ - **Hallucination filter** — rejects ghost transcripts from silence
100
+ - **Noise rejection** — sustained speech detection prevents spurious triggers
101
+ - **VAD tuning** — configurable voice activity detection thresholds
102
+
103
+ ### Music Player
104
+ - Background playlist with crossfade (1.5s smooth transitions)
105
+ - Auto-ducking during TTS (volume drops, restores after)
106
+ - AI voice commands: play, stop, skip, volume up/down
107
+ - Generated tracks (AI-composed) + custom playlists
108
+ - Track history (back button, 20-track buffer)
109
+
110
+ ### Profile System
111
+ Define agents in JSON — each profile configures:
112
+ - LLM provider, model, parameters
113
+ - TTS voice, speed, parallel sentence mode
114
+ - STT silence timeout, PTT mode, wake words
115
+ - UI theme, face mood, enabled features
116
+ - Session key strategy
117
+
118
+ ### Security
119
+ - **Content Security Policy** — restrictive CSP headers on canvas pages to prevent XSS
120
+ - **SSRF protection** — all external fetch endpoints validate and block internal network requests
121
+ - **Path traversal protection** — file access endpoints sanitize paths
122
+ - **WebSocket authentication** — gateway WebSocket connections require valid auth tokens
123
+
124
+ ---
125
+
126
+ ## Project Structure
127
+
128
+ ```
129
+ ├── server.py Entry point
130
+ ├── app.py Flask app factory
131
+ ├── docker-compose.yml Multi-service Docker setup
132
+ ├── docker-compose.pinokio.yml Pinokio one-click installer compose
133
+ ├── pinokio.js Pinokio app manifest
134
+ ├── install.js Pinokio install script
135
+ ├── start.js Pinokio start script
136
+ ├── stop.js Pinokio stop script
137
+ ├── update.js Pinokio update script
138
+ ├── .devcontainer/
139
+ │ ├── devcontainer.json VS Code dev container config
140
+ │ └── docker-compose.devcontainer.yml
141
+ ├── routes/
142
+ │ ├── conversation.py Voice + parallel TTS streaming (with abort + heartbeats)
143
+ │ ├── canvas.py Canvas display system + CDN stripping
144
+ │ ├── instructions.py Live system prompt editor
145
+ │ ├── music.py Music control
146
+ │ ├── suno.py Suno AI music generation + webhooks
147
+ │ ├── profiles.py Agent profile management
148
+ │ ├── admin.py Admin + server stats
149
+ │ ├── transcripts.py Conversation transcript auto-save
150
+ │ ├── vision.py Screenshot / image analysis (Gemini)
151
+ │ ├── greetings.py Greeting management
152
+ │ ├── theme.py Theme management
153
+ │ ├── elevenlabs_hybrid.py ElevenLabs TTS adapter
154
+ │ ├── pi.py Pi coding agent
155
+ │ ├── static_files.py Static asset serving
156
+ │ ├── image_gen.py HuggingFace image generation (FLUX.1, SD3.5)
157
+ │ ├── workspace.py Agent workspace file management
158
+ │ ├── report_issue.py In-app issue reporter
159
+ │ ├── icons.py Icon generation
160
+ │ └── onboarding.py Onboarding flow
161
+ ├── services/
162
+ │ ├── auth.py Clerk JWT authentication middleware
163
+ │ ├── canvas_versioning.py Automatic page version history + restore
164
+ │ ├── db_pool.py SQLite WAL connection pool
165
+ │ ├── health.py Liveness + readiness health probes
166
+ │ ├── paths.py Canonical path constants (all dirs)
167
+ │ ├── speech_normalizer.py Speech text normalization
168
+ │ ├── gateway_manager.py Gateway registry + plugin loader + router
169
+ │ ├── gateways/
170
+ │ │ ├── base.py GatewayBase abstract class
171
+ │ │ └── openclaw.py OpenClaw gateway implementation
172
+ │ └── tts.py TTS service wrapper (retry + provider fallback)
173
+ ├── tts_providers/ TTS provider implementations
174
+ │ ├── groq_provider.py Groq Orpheus
175
+ │ ├── supertonic_provider.py Supertonic (local ONNX)
176
+ │ ├── qwen3_provider.py Qwen3-TTS via fal.ai
177
+ │ └── hume_provider.py Hume EVI
178
+ ├── providers/ LLM/STT provider implementations
179
+ ├── plugins/ Gateway plugins (gitignored, drop-in)
180
+ │ ├── README.md Plugin authoring guide
181
+ │ └── example-gateway/ Reference implementation
182
+ ├── profiles/ Agent profile JSON files
183
+ │ └── default.json Base agent (edit to personalize)
184
+ ├── prompts/
185
+ │ └── voice-system-prompt.md Hot-reload system prompt
186
+ ├── config/
187
+ │ ├── default.yaml Server configuration
188
+ │ └── speech_normalization.yaml
189
+ ├── deploy/
190
+ │ ├── openclaw/Dockerfile OpenClaw container build
191
+ │ ├── supertonic/ Supertonic TTS container (Dockerfile + server.py)
192
+ │ ├── skill-runner/ Shared skill execution service (Dockerfile + server.py)
193
+ │ ├── setup-sudo.sh VPS setup (nginx, SSL, systemd)
194
+ │ └── openvoiceui.service Systemd unit file
195
+ ├── default-pages/ Auto-seeded default canvas pages
196
+ │ ├── desktop.html Desktop OS interface
197
+ │ └── file-explorer.html File explorer page
198
+ ├── src/
199
+ │ ├── app.js Frontend core
200
+ │ ├── adapters/ Adapter implementations
201
+ │ │ ├── ClawdBotAdapter.js
202
+ │ │ ├── hume-evi.js
203
+ │ │ ├── elevenlabs-classic.js
204
+ │ │ ├── elevenlabs-hybrid.js
205
+ │ │ └── _template.js Build your own adapter
206
+ │ ├── core/ EventBus, VoiceSession, EmotionEngine, Config
207
+ │ ├── face/ Animated face implementations
208
+ │ │ ├── EyeFace.js Classic eye-face avatar
209
+ │ │ ├── HaloSmokeFace.js Halo smoke orb with thinking mode
210
+ │ │ ├── BaseFace.js Base class for face types
211
+ │ │ └── manifest.json Face registry + previews
212
+ │ ├── features/ MusicPlayer, Soundboard
213
+ │ ├── shell/ Orchestrator, bridges, profile discovery
214
+ │ ├── ui/
215
+ │ │ ├── AppShell.js Main app layout
216
+ │ │ ├── face/ FacePicker, FaceRenderer
217
+ │ │ ├── settings/ SettingsPanel, PlaylistEditor, TTSVoicePreview
218
+ │ │ ├── themes/ ThemeManager
219
+ │ │ └── visualizers/ PartyFXVisualizer, BaseVisualizer
220
+ │ └── providers/
221
+ │ ├── WebSpeechSTT.js Browser speech recognition + wake word detection
222
+ │ ├── DeepgramSTT.js Deepgram Nova-2 streaming STT
223
+ │ ├── GroqSTT.js Groq Whisper batch STT
224
+ │ ├── TTSPlayer.js TTS audio playback
225
+ │ └── tts/ TTS provider JS modules
226
+ ├── sounds/ Soundboard audio files
227
+ └── runtime/ Runtime data (gitignored, docker-mounted)
228
+ ├── uploads/ User-uploaded files
229
+ ├── canvas-pages/ Canvas HTML pages
230
+ │ └── .versions/ Automatic page version backups
231
+ ├── known_faces/ Face recognition photos
232
+ ├── music/ Music playlist folder
233
+ ├── generated_music/ AI-generated tracks
234
+ ├── transcripts/ Conversation transcripts (auto-saved)
235
+ └── canvas-manifest.json Canvas page registry
236
+ ```
237
+
238
+ ---
239
+
240
+ ## Prerequisites
241
+
242
+ - **OpenClaw gateway `2026.3.13`** — [openclaw.ai](https://openclaw.ai) · [version requirements](docs/openclaw-requirements.md)
243
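+ - **Groq API key** for Groq Orpheus TTS — [console.groq.com](https://console.groq.com) (free tier available). Not needed for the Docker setup, which bundles the free, local Supertonic TTS engine.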
244
+ - Optional: Suno API key (music generation), Clerk (auth for multi-user deployments)
245
+
246
+ > OpenVoiceUI is tested with **openclaw@2026.3.13**. The Docker setup installs this version automatically. If you're using an existing OpenClaw install, see [OpenClaw Requirements](docs/openclaw-requirements.md) — other versions may have breaking changes that prevent voice conversations from working.
247
+
248
+ ---
249
+
250
+ ## Installation
251
+
252
+ ### Option 1: Pinokio One-Click Install
253
+
254
+ The easiest way to get started. [Pinokio](https://pinokio.computer) is a free app manager that handles installation, startup, and updates automatically.
255
+
256
+ 1. Install [Pinokio](https://pinokio.computer) if you don't have it
257
+ 2. Search for "OpenVoiceUI" in the Pinokio app store, or add this repo URL directly
258
+ 3. Click **Install** — Pinokio will clone the repo, build Docker images, and run onboarding
259
+ 4. Click **Start** to launch all services
260
+ 5. Open the URL shown in Pinokio to access the UI
261
+
262
+ Pinokio handles Docker Compose orchestration, environment configuration, and service lifecycle. Use the **Stop** button to shut down, and **Update** to pull the latest changes.
263
+
264
+ ### Option 2: Deployment (Recommended: VPS)
265
+
266
+ The recommended way to run OpenVoiceUI is on a dedicated VPS — microphone access, SSL, and always-on uptime all work significantly better hosted than on a local machine.
267
+
268
+ A setup script handles nginx, Let's Encrypt SSL, and systemd automatically:
269
+
270
+ ```bash
271
+ git clone https://github.com/MCERQUA/OpenVoiceUI
272
+ cd OpenVoiceUI
273
+ cp .env.example .env
274
+ # Edit .env — set CLAWDBOT_AUTH_TOKEN and GROQ_API_KEY at minimum
275
+ # Edit deploy/setup-sudo.sh — set DOMAIN, PORT, EMAIL, INSTALL_DIR at the top
276
+ sudo bash deploy/setup-sudo.sh
277
+ ```
278
+
279
+ The script is idempotent, so it is safe to re-run. It skips SSL provisioning if a certificate already exists.
280
+
281
+ ```bash
282
+ sudo systemctl status openvoiceui
283
+ sudo journalctl -u openvoiceui -f
284
+ ```
285
+
286
+ ### Option 3: Local Install (Docker)
287
+
288
+ Docker is the easiest path for local development — it runs OpenClaw, Supertonic TTS, and OpenVoiceUI together. Note that browsers only grant microphone access in a secure context (HTTPS or `localhost`): Chrome/Edge will allow it at `http://localhost`, but other devices on your network won't be able to use the microphone unless you serve the app over HTTPS with a certificate.
289
+
290
+ ```bash
291
+ git clone https://github.com/MCERQUA/OpenVoiceUI
292
+ cd OpenVoiceUI
293
+ cp .env.example .env
294
+ ```
295
+
296
+ #### Step 1: Onboard OpenClaw (one-time)
297
+
298
+ Run the interactive onboarding wizard to configure your LLM provider and generate an auth token:
299
+
300
+ ```bash
301
+ docker compose build openclaw
302
+ docker compose run --rm openclaw openclaw onboard
303
+ ```
304
+
305
+ This will prompt you to choose an LLM provider (Anthropic, OpenAI, etc.), enter your API key, and generate a gateway auth token.
306
+
307
+ #### Step 2: Configure `.env`
308
+
309
+ Set the auth token from onboarding:
310
+
311
+ ```bash
312
+ PORT=5001
313
+ CLAWDBOT_AUTH_TOKEN=<token-from-onboarding>
314
+ ```
315
+
316
+ > `CLAWDBOT_GATEWAY_URL` does not need to be set — Docker Compose automatically routes to the OpenClaw container via loopback networking. TTS works out of the box with Supertonic (local, free). Optionally add `GROQ_API_KEY` for Groq Orpheus TTS.
317
+
318
+ #### Step 3: Start
319
+
320
+ ```bash
321
+ docker compose up --build
322
+ ```
323
+
324
+ Open `http://localhost:5001` in your browser.
325
+
326
+ #### How it works
327
+
328
+ The `docker-compose.yml` runs three services:
329
+
330
+ | Service | Description |
331
+ |---------|-------------|
332
+ | `openclaw` | OpenClaw gateway (Node.js) — handles LLM routing, tool use, and agent sessions on port 18791 |
333
+ | `supertonic` | Local TTS engine (ONNX) — provides free text-to-speech without external API keys |
334
+ | `openvoiceui` | OpenVoiceUI server (Python/Flask) — serves the frontend and connects to OpenClaw and Supertonic |
335
+
336
+ OpenClaw config is persisted in a Docker volume (`openclaw-data`), so onboarding only needs to run once.
337
+
338
+ ### Option 4: VS Code Dev Container
339
+
340
+ For contributors and developers, OpenVoiceUI includes a VS Code dev container configuration that sets up the full development environment automatically.
341
+
342
+ 1. Install the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) in VS Code
343
+ 2. Open the repo folder in VS Code
344
+ 3. When prompted, click **Reopen in Container** (or run the "Dev Containers: Reopen in Container" command)
345
+ 4. VS Code will build and start all services using `.devcontainer/docker-compose.devcontainer.yml`
346
+ 5. The development server starts automatically with hot-reload
347
+
348
+ The dev container includes all dependencies pre-installed and is configured for the full Docker Compose stack.
349
+
350
+ ---
351
+
352
+ ### TTS setup
353
+
354
+ Supertonic (local, free) is included and works out of the box — select "supertonic" as the TTS provider in the Settings panel.
355
+
356
+ To use **Groq Orpheus TTS** instead, you must first accept the model terms at [console.groq.com/playground?model=canopylabs%2Forpheus-v1-english](https://console.groq.com/playground?model=canopylabs%2Forpheus-v1-english), then set `GROQ_API_KEY` in `.env`.
357
+
358
+ ---
359
+
360
+ ## Authentication
361
+
362
+ Auth is **opt-in**. By default, OpenVoiceUI runs with no authentication — all endpoints are accessible. This is the right setting for self-hosted single-user deployments.
363
+
364
+ To **enable Clerk JWT auth** (for multi-user or public-facing deployments):
365
+ 1. Create a Clerk app at [clerk.com](https://clerk.com)
366
+ 2. Add `CLERK_PUBLISHABLE_KEY=pk_live_...` to `.env`
367
+ 3. Set `CANVAS_REQUIRE_AUTH=true` in `.env`
368
+ 4. Set `ALLOWED_USER_IDS=user_yourclerkid` — find your user ID in server logs after first login
369
+
370
+ ---
371
+
372
+ ## OpenClaw Integration
373
+
374
+ OpenVoiceUI connects to an [OpenClaw](https://openclaw.ai) gateway via persistent WebSocket. OpenClaw handles LLM routing, tool use, and agent sessions.
375
+
376
+ **OpenClaw >= 2026.2.24**: Requires Ed25519 device identity signing. OpenVoiceUI handles this automatically — a `.device-identity.json` file is generated on first run (never committed to git). The gateway auto-approves local loopback clients on first connect.
377
+
378
+ **Without a configured gateway**: The frontend will load, but `/api/conversation` calls will fail. OpenClaw is the default gateway; you can also drop in any gateway plugin as a replacement.
379
+
380
+ **Version compatibility**: OpenVoiceUI is tested against openclaw@2026.3.13 and performs a compatibility check on startup. A protocol compatibility layer handles differences between versions automatically. See [OpenClaw Requirements](docs/openclaw-requirements.md) for details on supported versions and known breaking changes.
381
+
382
+ ---
383
+
384
+ ## Configuration
385
+
386
+ ### Environment Variables
387
+
388
+ | Variable | Required | Description |
389
+ |----------|----------|-------------|
390
+ | `PORT` | Yes | Server port (default: 5001) |
391
+ | `DOMAIN` | Yes | Your domain (used for callbacks) |
392
+ | `SECRET_KEY` | Recommended | Flask session secret — if unset, a new random secret is generated on every restart |
393
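+ | `CLAWDBOT_GATEWAY_URL` | Yes (non-Docker) | OpenClaw WebSocket URL (default: `ws://127.0.0.1:18791`); does not need to be set under Docker Compose, which routes to the OpenClaw container automatically |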
394
+ | `CLAWDBOT_AUTH_TOKEN` | Yes | OpenClaw gateway auth token |
395
+ | `GATEWAY_SESSION_KEY` | No | Session key prefix (default: `voice-main-1`) |
396
+ | `GROQ_API_KEY` | No | Groq Orpheus TTS and Groq Whisper STT ([console.groq.com](https://console.groq.com)) |
397
+ | `FAL_KEY` | No | Qwen3-TTS and voice cloning via fal.ai ([fal.ai](https://fal.ai/dashboard)) |
398
+ | `SUPERTONIC_API_URL` | No | Override Supertonic TTS URL (Docker sets this automatically) |
399
+ | `HUME_API_KEY` | No | Hume EVI — emotion-aware voice ([platform.hume.ai](https://platform.hume.ai)) |
400
+ | `HUME_SECRET_KEY` | No | Hume EVI secret key |
401
+ | `CLERK_PUBLISHABLE_KEY` | No | Clerk auth — enables login ([clerk.com](https://clerk.com)) |
402
+ | `CANVAS_REQUIRE_AUTH` | No | Set `true` to require auth for canvas endpoints |
403
+ | `ALLOWED_USER_IDS` | No | Comma-separated Clerk user IDs for access control |
404
+ | `GEMINI_API_KEY` | No | Vision/image analysis ([aistudio.google.com](https://aistudio.google.com)) |
405
+ | `SUNO_API_KEY` | No | Suno AI music generation |
406
+ | `SUNO_CALLBACK_URL` | No | Suno webhook callback URL — auto-derived from `DOMAIN` if unset |
407
+ | `SUNO_WEBHOOK_SECRET` | No | Optional HMAC verification for Suno webhooks |
408
+ | `BRAVE_API_KEY` | No | Brave Search for agent web_search tool ([brave.com/search/api](https://brave.com/search/api)) |
409
+ | `CANVAS_PAGES_DIR` | No | Override canvas pages path (VPS installs) |
410
+ | `CODING_CLI` | No | Coding agent in openclaw: `codex`, `claude`, `opencode`, `pi`, or `none` |
411
+ | `RATELIMIT_DEFAULT` | No | Custom rate limit (e.g. `"200 per day;50 per hour"`) |
412
+ | `HUGGINGFACE_API_KEY` | No | HuggingFace image generation — FLUX.1, SD3.5 models ([huggingface.co](https://huggingface.co/settings/tokens)) |
413
+ | `DEEPGRAM_API_KEY` | No | Deepgram Nova-2 streaming STT ([deepgram.com](https://console.deepgram.com)) |
414
+ | `AGENT_API_KEY` | No | Internal agent-to-Flask API authentication token |
415
+
416
+ See `.env.example` for full documentation and comments.
417
+
418
+ ### Personalizing Your Agent
419
+
420
+ Edit `profiles/default.json` to configure your agent:
421
+
422
+ ```json
423
+ {
424
+ "name": "My Assistant",
425
+ "system_prompt": "You are a helpful voice assistant...",
426
+ "llm": { "provider": "gateway", "model": "glm-4.7" },
427
+ "voice": { "tts_provider": "groq", "voice_id": "tara" },
428
+ "features": { "canvas": true, "music": true, "tools": true }
429
+ }
430
+ ```
431
+
432
+ Edit `prompts/voice-system-prompt.md` to change the system prompt — changes are hot-reloaded with no restart.
433
+
434
+ ---
435
+
436
+ ## API Reference
437
+
438
+ ```bash
439
+ # Health
440
+ GET /health/live
441
+ GET /health/ready
442
+
443
+ # Voice (streaming NDJSON with heartbeats)
444
+ POST /api/conversation?stream=1
445
+ {"message": "Hello", "tts_provider": "groq", "voice": "tara"}
446
+ POST /api/conversation/abort # Cancel in-progress response
447
+
448
+ # Profiles
449
+ GET /api/profiles
450
+ POST /api/profiles/activate {"profile_id": "default"}
451
+
452
+ # Canvas
453
+ GET /api/canvas/manifest
454
+ GET /api/canvas/versions/<page_id> # List page version history
455
+ POST /api/canvas/versions/<page_id>/restore {"timestamp": "..."}
456
+
457
+ # Transcripts
458
+ GET /api/transcripts # List saved transcripts
459
+ GET /api/transcripts/<session_id> # Get transcript by session
460
+
461
+ # Upload
462
+ POST /api/upload # File upload (multipart)
463
+
464
+ # Session
465
+ POST /api/session/reset {"type": "hard"}
466
+
467
+ # TTS
468
+ GET /api/tts/providers
469
+ POST /api/tts/generate {"text": "Hello", "provider": "groq", "voice": "tara"}
470
+
471
+ # Voice Cloning (fal.ai Qwen3-TTS)
472
+ POST /api/tts/clone # Clone voice from audio sample
473
+ POST /api/tts/generate # Generate speech with cloned voice
474
+ {"text": "Hello", "provider": "qwen3", "voice_id": "clone-xxx"}
475
+
476
+ # Vision
477
+ POST /api/vision/analyze # Image/screenshot analysis
478
+
479
+ # Image Generation (HuggingFace)
480
+ POST /api/image-gen/generate # Generate image (FLUX.1, SD3.5)
481
+ {"prompt": "...", "model": "flux", "quality": "high", "aspect_ratio": "16:9"}
482
+
483
+ # AI Image Enhancement
484
+ POST /api/image-gen/enhance # Server-side image editing with aspect ratio
485
+
486
+ # Workspace
487
+ GET /api/workspace/files # List workspace files
488
+ GET /api/workspace/files/<path> # Read workspace file
489
+ POST /api/workspace/files/<path> # Write workspace file
490
+
491
+ # Settings (server-side persistence)
492
+ GET /api/settings # Get all persisted settings
493
+ POST /api/settings # Save settings to server
494
+
495
+ # Suno Music Generation
496
+ POST /api/suno/generate # Generate AI music
497
+ POST /api/suno/callback # Webhook callback endpoint
498
+
499
+ # Issue Reporter
500
+ POST /api/report-issue # Submit bug report with session context
501
+
502
+ # Icons
503
+ POST /api/icons/generate # Generate icons
504
+
505
+ # Onboarding
506
+ GET /api/onboarding/status # Onboarding flow status
507
+ POST /api/onboarding/complete # Mark onboarding step complete
508
+ ```
509
+
510
+ ---
511
+
512
+ ## Building an Adapter
513
+
514
+ To connect a new LLM or voice framework, use `src/adapters/_template.js` as a starting point. Built-in adapters include ClawdBot (OpenClaw), Hume EVI, ElevenLabs Classic, and ElevenLabs Hybrid. Adapters implement a simple interface:
515
+
516
+ ```js
517
+ export class MyAdapter {
518
+ async init(bridge, config) { ... }
519
+ async start() { ... }
520
+ async stop() { ... }
521
+ async destroy() { ... }
522
+ }
523
+ ```
524
+
525
+ Register it in `src/shell/adapter-registry.js` and reference it in your profile JSON.
526
+
527
+ ---
528
+
529
+ ## Gateway Plugins
530
+
531
+ The backend uses a plugin system for LLM gateways. Drop a plugin folder into `plugins/` and restart the backend — the new gateway is picked up automatically.
532
+
533
+ ```
534
+ plugins/
535
+ my-gateway/
536
+ plugin.json <- manifest (id, provides, requires_env)
537
+ gateway.py <- class Gateway(GatewayBase)
538
+ ```
539
+
540
+ **plugin.json:**
541
+ ```json
542
+ {
543
+ "id": "my-gateway",
544
+ "provides": "gateway",
545
+ "gateway_class": "Gateway",
546
+ "requires_env": ["MY_API_KEY"]
547
+ }
548
+ ```
549
+
550
+ **gateway.py** defines a `Gateway` class that subclasses `services.gateways.base.GatewayBase` and implements `stream_to_queue()`.
551
+
552
+ To route a profile to your gateway, add `gateway_id` to its `adapter_config`:
553
+ ```json
554
+ "adapter_config": { "gateway_id": "my-gateway", "sessionKey": "my-1" }
555
+ ```
556
+
557
+ Gateways can also call each other for inter-agent delegation:
558
+ ```python
559
+ from services.gateway_manager import gateway_manager
560
+ result = gateway_manager.ask("openclaw", "Summarise this: " + text, session_key)
561
+ ```
562
+
563
+ Full guide: [`plugins/README.md`](plugins/README.md)
564
+
565
+ ---
566
+
567
+ ## Skill Runner Service
568
+
569
+ The `deploy/skill-runner/` directory contains a shared skill execution service. This is a lightweight Python server that can execute agent skills in an isolated environment, providing a common runtime for skill definitions that need server-side execution (file I/O, API calls, data processing).
570
+
571
+ Build and run alongside the main stack:
572
+
573
+ ```bash
574
+ docker compose build skill-runner
575
+ docker compose up -d skill-runner
576
+ ```
577
+
578
+ ---
579
+
580
+ ## Hosting Multiple Users (Hetzner VPS)
581
+
582
+ OpenVoiceUI is designed so that you can run a single VPS and serve multiple clients from it, each with their own voice agent instance.
583
+
584
+ **Recommended workflow:**
585
+
586
+ 1. **Set up your base account** — install OpenVoiceUI on a Hetzner VPS under a base Linux user. Configure all API keys in `.env`. Verify everything works.
587
+
588
+ 2. **For each new client**, create a new Linux user on the same VPS:
589
+ ```bash
590
+ adduser clientname
591
+ cp -r /home/base/OpenVoiceUI /home/clientname/OpenVoiceUI
592
+ chown -R clientname:clientname /home/clientname/OpenVoiceUI
593
+ ```
594
+
595
+ 3. **Edit their `.env`** with their API keys and a unique port:
596
+ ```bash
597
+ PORT=15004 # different port per user
598
+ CLAWDBOT_AUTH_TOKEN=their-openclaw-token
599
+ GROQ_API_KEY=their-groq-key
600
+ ```
601
+
602
+ 4. **Run `deploy/setup-sudo.sh`** for their domain — it creates the systemd service, nginx vhost, and SSL certificate automatically.
603
+
604
+ 5. **Each client** gets their own domain, their own agent session, and their own canvas/music library.
605
+
606
+ **Quick server requirements:**
607
+ - Ubuntu 22.04+
608
+ - Nginx + Certbot (Let's Encrypt)
609
+ - Python 3.10+, `venv` per user
610
+
611
+ ---
612
+
613
+ ## Development Notes
614
+
615
+ > **Issue Reporter (temporary):** The in-app issue reporting button in the toolbar is a temporary development tool included during the active development phase to help capture bugs with session context. It will be removed or made optional before a stable release.
616
+
617
+ ---
618
+
619
+ ## Tech Stack
620
+
621
+ | Layer | Technology |
622
+ |-------|------------|
623
+ | Backend | Python / Flask (blueprint architecture) |
624
+ | Frontend | Vanilla JS ES modules (no framework) |
625
+ | STT | Web Speech API / Deepgram Nova-2 / Groq Whisper / Whisper / Hume |
626
+ | TTS | Supertonic / Groq Orpheus / Qwen3 / Hume EVI |
627
+ | LLM | Any via gateway adapter |
628
+ | Image Gen | HuggingFace (FLUX.1, SD3.5) |
629
+ | Canvas | Fullscreen iframe + SSE manifest system |
630
+ | Music Gen | Suno API / fal.ai |
631
+ | Auth | Clerk (optional) |
632
+ | Installer | Pinokio / Docker Compose / VPS deploy script |
633
+
634
+ ---
635
+
636
+ ## License
637
+
638
+ MIT