aillom-vox-client 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +272 -0
- package/dist/AillomVox.d.ts +36 -0
- package/dist/AillomVox.js +152 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +18 -0
- package/dist/types.d.ts +36 -0
- package/dist/types.js +2 -0
- package/docs/ASTERISK.md +411 -0
- package/docs/PROTOCOL.md +156 -0
- package/docs/PROVIDERS.md +40 -0
- package/docs/TOOLS.md +314 -0
- package/docs/TROUBLESHOOTING.md +86 -0
- package/docs/VOICES.md +219 -0
- package/docs/providers/AILLOMVOX.md +185 -0
- package/docs/providers/AWS.md +32 -0
- package/docs/providers/GEMINI.md +33 -0
- package/docs/providers/GROK.md +25 -0
- package/docs/providers/OPENAI.md +39 -0
- package/docs/providers/QWEN.md +27 -0
- package/docs/providers/ULTRAVOX.md +29 -0
- package/examples/01-basic/app.js +196 -0
- package/examples/01-basic/index.html +27 -0
- package/examples/02-advanced-dashboard/app.js +465 -0
- package/examples/02-advanced-dashboard/index.html +200 -0
- package/examples/02-advanced-dashboard/style.css +501 -0
- package/examples/03-smart-home/index.html +377 -0
- package/examples/04-customer-support/index.html +474 -0
- package/examples/sdk-usage.ts +44 -0
- package/integrations/n8n-nodes-aillomvox/README.md +56 -0
- package/integrations/n8n-nodes-aillomvox/credentials/AillomVoxApi.credentials.ts +29 -0
- package/integrations/n8n-nodes-aillomvox/dist/credentials/AillomVoxApi.credentials.js +30 -0
- package/integrations/n8n-nodes-aillomvox/dist/nodes/AillomVox/AillomVox.node.js +219 -0
- package/integrations/n8n-nodes-aillomvox/dist/nodes/AillomVox/aillomvox.svg +6 -0
- package/integrations/n8n-nodes-aillomvox/gulpfile.js +10 -0
- package/integrations/n8n-nodes-aillomvox/nodes/AillomVox/AillomVox.node.ts +229 -0
- package/integrations/n8n-nodes-aillomvox/nodes/AillomVox/aillomvox.svg +6 -0
- package/integrations/n8n-nodes-aillomvox/package-lock.json +11741 -0
- package/integrations/n8n-nodes-aillomvox/package.json +56 -0
- package/integrations/n8n-nodes-aillomvox/tsconfig.json +32 -0
- package/package.json +55 -0
- package/src/AillomVox.ts +169 -0
- package/src/index.ts +2 -0
- package/src/types.ts +50 -0
- package/tsconfig.json +23 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# AillomVox Provider
|
|
2
|
+
|
|
3
|
+
The default, high-performance provider. Uses our proprietary **Hybrid Engine** (Groq LLM + Inworld TTS) to deliver the fastest response times and highest stability at the lowest cost.
|
|
4
|
+
|
|
5
|
+
## Models
|
|
6
|
+
|
|
7
|
+
| Component | Model |
|
|
8
|
+
| :--- | :--- |
|
|
9
|
+
| **STT** | `stt-rt-v4` (Soniox) |
|
|
10
|
+
| **LLM** | `openai/gpt-oss-120b` (via Groq) |
|
|
11
|
+
| **TTS** | `inworld-tts-1.5-mini` (Inworld) |
|
|
12
|
+
|
|
13
|
+
## Configuration
|
|
14
|
+
|
|
15
|
+
```json
|
|
16
|
+
{
|
|
17
|
+
"provider": "aillomvox",
|
|
18
|
+
"voice": "Edward",
|
|
19
|
+
"system_prompt": "You are a helpful assistant.",
|
|
20
|
+
"language": "en-US",
|
|
21
|
+
"sample_rate": 16000
|
|
22
|
+
}
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Available Voices
|
|
26
|
+
|
|
27
|
+
AillomVox uses **Inworld TTS 1.5** with **65 voices** across 15 languages. All voices support multilingual synthesis.
|
|
28
|
+
|
|
29
|
+
### English (25 voices)
|
|
30
|
+
|
|
31
|
+
#### Male
|
|
32
|
+
| Voice | Style |
|
|
33
|
+
| :--- | :--- |
|
|
34
|
+
| **Edward** | Fast-talking, emphatic (default EN) |
|
|
35
|
+
| **Dennis** | Smooth, calm, friendly |
|
|
36
|
+
| **Alex** | Energetic, expressive |
|
|
37
|
+
| **Craig** | Older British, refined, articulate |
|
|
38
|
+
| **Mark** | Energetic, rapid delivery |
|
|
39
|
+
| **Ronald** | Confident British, deep, gravelly |
|
|
40
|
+
| **Shaun** | Friendly, dynamic |
|
|
41
|
+
| **Theodore** | Gravelly, time-worn |
|
|
42
|
+
| **Timothy** | Lively, upbeat American |
|
|
43
|
+
| **Carter** | Mature radio announcer |
|
|
44
|
+
| **Blake** | Rich, intimate |
|
|
45
|
+
| **Clive** | British, calm, cordial |
|
|
46
|
+
| **Dominus** | Robotic, deep, menacing |
|
|
47
|
+
| **Hades** | Commanding, gruff narrator |
|
|
48
|
+
|
|
49
|
+
#### Female
|
|
50
|
+
| Voice | Style |
|
|
51
|
+
| :--- | :--- |
|
|
52
|
+
| **Ashley** | Warm, natural |
|
|
53
|
+
| **Deborah** | Gentle, elegant |
|
|
54
|
+
| **Elizabeth** | Professional, perfect for narrations |
|
|
55
|
+
| **Julia** | Quirky, high-pitched, playful |
|
|
56
|
+
| **Olivia** | Young British, upbeat, friendly |
|
|
57
|
+
| **Priya** | Even-toned, Indian accent |
|
|
58
|
+
| **Sarah** | Fast-talking, curious |
|
|
59
|
+
| **Wendy** | Posh British |
|
|
60
|
+
| **Luna** | Calm, relaxing, mindfulness |
|
|
61
|
+
| **Hana** | Bright, expressive, young |
|
|
62
|
+
| **Pixie** | High-pitched, childlike |
|
|
63
|
+
|
|
64
|
+
### Portuguese (2 voices)
|
|
65
|
+
| Voice | Gender | Style |
|
|
66
|
+
| :--- | :--- | :--- |
|
|
67
|
+
| **Heitor** | Male | Composed, neutral (default PT) |
|
|
68
|
+
| **Maitê** | Female | Middle-aged, professional |
|
|
69
|
+
|
|
70
|
+
### Spanish (4 voices)
|
|
71
|
+
| Voice | Gender | Style |
|
|
72
|
+
| :--- | :--- | :--- |
|
|
73
|
+
| **Diego** | Male | Soothing, gentle (default ES) |
|
|
74
|
+
| **Miguel** | Male | Calm, storytelling |
|
|
75
|
+
| **Rafael** | Male | Deep, composed, narrations |
|
|
76
|
+
| **Lupita** | Female | Vibrant, energetic |
|
|
77
|
+
|
|
78
|
+
### French (4 voices)
|
|
79
|
+
| Voice | Gender | Style |
|
|
80
|
+
| :--- | :--- | :--- |
|
|
81
|
+
| **Alain** | Male | Deep, smooth, composed |
|
|
82
|
+
| **Mathieu** | Male | Nasal quality |
|
|
83
|
+
| **Étienne** | Male | Calm, young adult |
|
|
84
|
+
| **Hélène** | Female | Smooth, musical, graceful |
|
|
85
|
+
|
|
86
|
+
### German (2 voices)
|
|
87
|
+
| Voice | Gender | Style |
|
|
88
|
+
| :--- | :--- | :--- |
|
|
89
|
+
| **Josef** | Male | Articulate, announcer-like |
|
|
90
|
+
| **Johanna** | Female | Calm, low, smoky |
|
|
91
|
+
|
|
92
|
+
### Italian (2 voices)
|
|
93
|
+
| Voice | Gender | Style |
|
|
94
|
+
| :--- | :--- | :--- |
|
|
95
|
+
| **Gianni** | Male | Deep, smooth, rapid |
|
|
96
|
+
| **Orietta** | Female | Calm, soothing cadence |
|
|
97
|
+
|
|
98
|
+
### Chinese (4 voices)
|
|
99
|
+
| Voice | Gender | Style |
|
|
100
|
+
| :--- | :--- | :--- |
|
|
101
|
+
| **Yichen** | Male | Calm, flat, young adult |
|
|
102
|
+
| **Xiaoyin** | Female | Youthful, gentle, sweet |
|
|
103
|
+
| **Xinyi** | Female | Neutral, narrations |
|
|
104
|
+
| **Jing** | Female | Energetic, fast-paced |
|
|
105
|
+
|
|
106
|
+
### Dutch (4 voices)
|
|
107
|
+
| Voice | Gender | Style |
|
|
108
|
+
| :--- | :--- | :--- |
|
|
109
|
+
| **Erik** | Male | Older, weathered edge |
|
|
110
|
+
| **Lennart** | Male | Confident, calm, relaxed |
|
|
111
|
+
| **Katrien** | Female | Expressive |
|
|
112
|
+
| **Lore** | Female | Clear, calm, professional |
|
|
113
|
+
|
|
114
|
+
### Japanese (2 voices)
|
|
115
|
+
| Voice | Gender | Style |
|
|
116
|
+
| :--- | :--- | :--- |
|
|
117
|
+
| **Satoshi** | Male | Dramatic, expressive |
|
|
118
|
+
| **Asuka** | Female | Friendly, young adult |
|
|
119
|
+
|
|
120
|
+
### Korean (4 voices)
|
|
121
|
+
| Voice | Gender | Style |
|
|
122
|
+
| :--- | :--- | :--- |
|
|
123
|
+
| **Hyunwoo** | Male | Young adult |
|
|
124
|
+
| **Seojun** | Male | Clear, deep, mature |
|
|
125
|
+
| **Minji** | Female | Energetic, friendly |
|
|
126
|
+
| **Yoona** | Female | Gentle, soothing |
|
|
127
|
+
|
|
128
|
+
### Polish (2 voices)
|
|
129
|
+
| Voice | Gender | Style |
|
|
130
|
+
| :--- | :--- | :--- |
|
|
131
|
+
| **Szymon** | Male | Warm, friendly |
|
|
132
|
+
| **Wojciech** | Male | Middle-aged |
|
|
133
|
+
|
|
134
|
+
### Russian (4 voices)
|
|
135
|
+
| Voice | Gender | Style |
|
|
136
|
+
| :--- | :--- | :--- |
|
|
137
|
+
| **Dmitry** | Male | Deep, commanding |
|
|
138
|
+
| **Nikolai** | Male | Deep, theatrical |
|
|
139
|
+
| **Svetlana** | Female | Soft, high-pitched |
|
|
140
|
+
| **Elena** | Female | Clear, mid-range, smooth |
|
|
141
|
+
|
|
142
|
+
### Hindi (2 voices)
|
|
143
|
+
| Voice | Gender | Style |
|
|
144
|
+
| :--- | :--- | :--- |
|
|
145
|
+
| **Manoj** | Male | Clear, professional |
|
|
146
|
+
| **Riya** | Female | Professional, polished |
|
|
147
|
+
|
|
148
|
+
### Hebrew (2 voices)
|
|
149
|
+
| Voice | Gender | Style |
|
|
150
|
+
| :--- | :--- | :--- |
|
|
151
|
+
| **Oren** | Male | Steady, podcasts |
|
|
152
|
+
| **Yael** | Female | Mid-range, narrations |
|
|
153
|
+
|
|
154
|
+
### Arabic (2 voices)
|
|
155
|
+
| Voice | Gender | Style |
|
|
156
|
+
| :--- | :--- | :--- |
|
|
157
|
+
| **Omar** | Male | Bright, confident |
|
|
158
|
+
| **Nour** | Female | Polished, friendly |
|
|
159
|
+
|
|
160
|
+
## Default Voice by Language
|
|
161
|
+
| Language | Default Voice |
|
|
162
|
+
| :--- | :--- |
|
|
163
|
+
| English (`en`) | Edward |
|
|
164
|
+
| Portuguese (`pt`) | Heitor |
|
|
165
|
+
| Spanish (`es`) | Diego |
|
|
166
|
+
| All others | Edward |
|
|
167
|
+
|
|
168
|
+
## Features
|
|
169
|
+
|
|
170
|
+
- **Smart Fillers**: Automatically plays filler phrases ("Just a moment...", "Let me check...") during LLM processing.
|
|
171
|
+
- **Dynamic Voice Switching**: Change voice mid-conversation with the `update_voice` tool.
|
|
172
|
+
- **Silence Breakers**: Re-engages the user automatically if they go silent.
|
|
173
|
+
- **Jitter Buffer**: Native handling of network instability.
|
|
174
|
+
- **Native 8kHz**: Perfect for telephony (SIP/Asterisk) with zero resampling overhead.
|
|
175
|
+
- **Adaptive Response Profiles**: Automatically adjusts buffer timing based on response length.
|
|
176
|
+
- **Speed Control**: Server-side speed adjustment (0.5x–1.5x, default 1.2x).
|
|
177
|
+
|
|
178
|
+
## Languages
|
|
179
|
+
|
|
180
|
+
Supports 15 languages: `en`, `pt`, `es`, `fr`, `de`, `it`, `ja`, `zh`, `ko`, `hi`, `ar`, `ru`, `pl`, `nl`, `he`
|
|
181
|
+
|
|
182
|
+
## Best For
|
|
183
|
+
- **General Purpose**: Customer support, sales, virtual assistants
|
|
184
|
+
- **Telephony**: Extremely robust 8kHz support for SIP/Asterisk
|
|
185
|
+
- **High Volume**: Lowest cost per minute ($0.03/min)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# AWS Bedrock (Nova Sonic)
|
|
2
|
+
|
|
3
|
+
Enterprise-grade Speech-to-Speech using Amazon's latest **Nova Sonic** model (`amazon.nova-2-sonic-v1:0`).
|
|
4
|
+
|
|
5
|
+
## Configuration
|
|
6
|
+
|
|
7
|
+
```json
|
|
8
|
+
{
|
|
9
|
+
"provider": "aws",
|
|
10
|
+
"voice": "matthew",
|
|
11
|
+
"system_prompt": "You are a helpful assistant.",
|
|
12
|
+
"sample_rate": 16000
|
|
13
|
+
}
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Voices
|
|
17
|
+
|
|
18
|
+
| Voice | Gender | Style |
|
|
19
|
+
| :--- | :--- | :--- |
|
|
20
|
+
| **matthew** | Male | Neutral, professional |
|
|
21
|
+
| **ruth** | Female | Professional, clear |
|
|
22
|
+
| **tiffany** | Female | Warm, friendly |
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
- **Low Latency**: Faster than previous Polly+Bedrock chains.
|
|
26
|
+
- **Reliability**: Highest uptime guarantee.
|
|
27
|
+
- **Security**: Data privacy compliance (HIPAA, GDPR options available via AWS config).
|
|
28
|
+
- **Tool Use**: Full support for function calling.
|
|
29
|
+
|
|
30
|
+
## Best For
|
|
31
|
+
- **Enterprise**: Banking, healthcare, corporate environments.
|
|
32
|
+
- **Stability**: When 99.99% uptime is required.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Google Gemini (Multimodal)
|
|
2
|
+
|
|
3
|
+
Leverages `gemini-2.5-flash-native-audio-preview-12-2025` for massive context and multimodal capabilities.
|
|
4
|
+
|
|
5
|
+
## Configuration
|
|
6
|
+
|
|
7
|
+
```json
|
|
8
|
+
{
|
|
9
|
+
"provider": "gemini",
|
|
10
|
+
"voice": "Puck",
|
|
11
|
+
"system_prompt": "You are a helpful assistant.",
|
|
12
|
+
"sample_rate": 24000
|
|
13
|
+
}
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Voices
|
|
17
|
+
|
|
18
|
+
| Voice | Style |
|
|
19
|
+
| :--- | :--- |
|
|
20
|
+
| **Puck** | Soft, higher pitch |
|
|
21
|
+
| **Kore** | Soft, higher pitch |
|
|
22
|
+
| **Charon** | Deep, confident |
|
|
23
|
+
| **Fenrir** | Deep, confident |
|
|
24
|
+
| **Aoede** | Confident, higher pitch |
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
- **Large Context**: Can process huge system prompts or conversation history.
|
|
28
|
+
- **Multimodal**: Can technically process images if sent (though SDK focuses on Audio).
|
|
29
|
+
- **Tool Use**: Robust function calling.
|
|
30
|
+
|
|
31
|
+
## Best For
|
|
32
|
+
- **Long Context**: Analyzing documents or long previous conversations.
|
|
33
|
+
- **Complex Instructions**: Following very detailed, multi-step system prompts.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# xAI Grok
|
|
2
|
+
|
|
3
|
+
A witty, casual, and capable voice model from xAI (`grok-beta`).
|
|
4
|
+
|
|
5
|
+
## Configuration
|
|
6
|
+
|
|
7
|
+
```json
|
|
8
|
+
{
|
|
9
|
+
"provider": "grok",
|
|
10
|
+
"system_prompt": "You are a helpful assistant.",
|
|
11
|
+
"sample_rate": 16000
|
|
12
|
+
}
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Voices
|
|
16
|
+
|
|
17
|
+
Model-dependent. Voice selection depends on the underlying model version.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
- **Casual Tone**: Designed to be less robotic and more conversational ("witty").
|
|
21
|
+
- **Native 16kHz**: Matches standard VoIP/WebRTC wideband perfectly.
|
|
22
|
+
|
|
23
|
+
## Best For
|
|
24
|
+
- **Entertainment/Casual**: Chatbots, companions.
|
|
25
|
+
- **News/Twitter**: Real-time information (via RAG/WebSearch tools).
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# OpenAI Realtime Provider
|
|
2
|
+
|
|
3
|
+
Direct integration with the `gpt-realtime-mini` model via WebSocket.
|
|
4
|
+
|
|
5
|
+
## Configuration
|
|
6
|
+
|
|
7
|
+
```json
|
|
8
|
+
{
|
|
9
|
+
"provider": "openai",
|
|
10
|
+
"voice": "alloy",
|
|
11
|
+
"system_prompt": "You are a helpful assistant.",
|
|
12
|
+
"sample_rate": 24000,
|
|
13
|
+
"max_duration": 300
|
|
14
|
+
}
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Voices
|
|
18
|
+
|
|
19
|
+
| Voice | Style |
|
|
20
|
+
| :--- | :--- |
|
|
21
|
+
| **alloy** | Neutral, balanced |
|
|
22
|
+
| **ash** | Warm, conversational |
|
|
23
|
+
| **coral** | Clear, professional |
|
|
24
|
+
| **echo** | Smooth, calm |
|
|
25
|
+
| **sage** | Wise, measured |
|
|
26
|
+
| **shimmer** | Bright, energetic |
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
- **Function Calling**: Full support for tool calling.
|
|
30
|
+
- **Native VAD**: Uses OpenAI's server-side voice activity detection.
|
|
31
|
+
- **24kHz High Fidelity**: Best used with `sample_rate: 24000`.
|
|
32
|
+
|
|
33
|
+
## Audio Notes
|
|
34
|
+
- Native rate is **24kHz**.
|
|
35
|
+
- If you request 8kHz (telephony), the SDK automatically resamples it, but **24kHz** gives the best results for web calls.
|
|
36
|
+
|
|
37
|
+
## Best For
|
|
38
|
+
- **Complex Reasoning**: Logic-heavy tasks, math, coding assistance.
|
|
39
|
+
- **English/Multilingual**: Excellent accent capability.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Qwen (Alibaba Cloud)
|
|
2
|
+
|
|
3
|
+
Open-source based, highly efficient model (`qwen3-omni-flash-realtime`).
|
|
4
|
+
|
|
5
|
+
## Configuration
|
|
6
|
+
|
|
7
|
+
```json
|
|
8
|
+
{
|
|
9
|
+
"provider": "qwen",
|
|
10
|
+
"system_prompt": "You are a helpful assistant.",
|
|
11
|
+
"language": "en-US",
|
|
12
|
+
"sample_rate": 16000
|
|
13
|
+
}
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Voices
|
|
17
|
+
|
|
18
|
+
Model-dependent. Voice selection depends on the underlying model version.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
- **Cost Effective**: Generally lower cost than OpenAI.
|
|
22
|
+
- **Fast**: "Flash" model is optimized for speed.
|
|
23
|
+
- **No Tool Support**: Function calling / Client Tools are **not supported** in WebSocket Realtime mode. Use AWS, OpenAI, or Gemini for scenarios requiring tools.
|
|
24
|
+
|
|
25
|
+
## Best For
|
|
26
|
+
- **Cost-Sensitive**: High volume conversational AI.
|
|
27
|
+
- **Asian Markets**: Excellent support for Mandarin/English/Asian languages.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# UltraVox
|
|
2
|
+
|
|
3
|
+
Specialized Speech-to-Speech model (`ultravox-v0.7`).
|
|
4
|
+
|
|
5
|
+
## Configuration
|
|
6
|
+
|
|
7
|
+
```json
|
|
8
|
+
{
|
|
9
|
+
"provider": "ultravox",
|
|
10
|
+
"voice": "Mark",
|
|
11
|
+
"system_prompt": "You are a helpful assistant.",
|
|
12
|
+
"language": "pt-BR",
|
|
13
|
+
"sample_rate": 16000
|
|
14
|
+
}
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Voices
|
|
18
|
+
|
|
19
|
+
| Voice | Gender |
|
|
20
|
+
| :--- | :--- |
|
|
21
|
+
| **Mark** | Male |
|
|
22
|
+
| **Jessica** | Female |
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
- **Nuance**: High capability in understanding tone and emotion.
|
|
26
|
+
- **Dynamic Voices**: Voice list is fetched dynamically from their API.
|
|
27
|
+
|
|
28
|
+
## Best For
|
|
29
|
+
- **Emotional Intelligence**: Empathetic support or counseling bots.
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
// Basic AillomVox Client
|
|
2
|
+
// Look up the page controls once; the handlers in this file share these references.
const [connectBtn, disconnectBtn, statusDiv, apiKeyInput] = [
  'connectBtn',
  'disconnectBtn',
  'status',
  'apiKey',
].map((elementId) => document.getElementById(elementId));

// Per-session resources; left undefined until a connection is started.
let socket;
let audioContext;
let processor;
let mediaStream;

// Playback bookkeeping: sources that are scheduled but have not finished yet,
// and the AudioContext time at which the next received chunk should begin.
let scheduledSources = [];
let nextPlayTime = 0;
|
|
15
|
+
|
|
16
|
+
/**
 * Connect button handler.
 *
 * Creates the AudioContext (must happen inside a user gesture), opens the
 * WebSocket, sends the `config` handshake, starts microphone capture and then
 * routes server frames: JSON text frames are control messages, binary frames
 * are 16-bit PCM audio to play.
 */
connectBtn.onclick = async () => {
  const apiKey = apiKeyInput.value.trim();
  if (!apiKey) return alert('Please enter an API Key');

  // Lock the button immediately so a double click cannot open a second
  // socket / AudioContext while the first connection is still being set up.
  connectBtn.disabled = true;

  // For local dev with aillom-vox, use localhost:8080
  // For production, use wss://vox.aillom.com/ws
  const wsUrl = window.location.hostname === 'localhost'
    ? 'ws://localhost:8080/ws'
    : 'wss://vox.aillom.com/ws';

  let ws;
  try {
    // 1. Initialize Audio Context (Must be user-initiated)
    audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });

    // 2. Connect to WebSocket
    ws = new WebSocket(wsUrl);
  } catch (err) {
    console.error('Could not start session:', err);
    disconnect(); // releases whatever was created and re-enables the button
    statusDiv.textContent = `🔴 Could not start: ${err.message}`;
    return;
  }
  ws.binaryType = 'arraybuffer';
  socket = ws;

  // Text shown when the socket closes; replaced with a more specific reason on failure.
  let closeStatus = '🔴 Disconnected';

  ws.onopen = async () => {
    statusDiv.textContent = 'Connected. Handshaking...';

    // 3. Send Configuration Handshake
    const handshake = {
      type: 'config',
      apikey: apiKey,
      provider: 'aillomvox',
      voice: 'Edward',
      language: 'en-US',
      sample_rate: 16000,
      system_prompt: 'You are a helpful assistant. Be concise and friendly.',
      tools: []
    };
    ws.send(JSON.stringify(handshake));

    // 4. Start Microphone and Audio Processing
    try {
      await startMicrophone();
    } catch (err) {
      // Permission denied / no device: surface it instead of leaving the UI on "Handshaking...".
      console.error('Microphone unavailable:', err);
      closeStatus = `🔴 Microphone error: ${err.message}`;
      disconnect();
      statusDiv.textContent = closeStatus;
      return;
    }

    // The socket may have closed while the permission prompt was open; in that
    // case the microphone was started after cleanup ran, so release it again.
    if (ws.readyState !== WebSocket.OPEN) {
      disconnect();
      return;
    }

    statusDiv.textContent = '🟢 Online - Speak now!';
    toggleButtons(true);
  };

  ws.onmessage = (event) => {
    if (typeof event.data !== 'string') {
      // Audio Data (PCM 16-bit) received from server -> Play it
      playAudioChunk(event.data);
      return;
    }

    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch (err) {
      console.warn('Ignoring malformed server message:', err);
      return;
    }
    if (!msg || typeof msg !== 'object') return;
    console.log('Server Message:', msg);

    switch (msg.type) {
      case 'hangup':
        disconnect();
        break;

      case 'playback_clear_buffer':
        // Barge-in: drop all audio that is buffered but not yet played
        clearPlaybackBuffer();
        break;

      case 'transcript':
        if (msg.final) {
          console.log(`[${msg.role}] ${msg.text}`);
        }
        break;

      case 'error':
        console.error('Server error:', msg.message);
        break;

      case 'state':
        // Mirror the server-reported conversation state in the status line
        statusDiv.textContent = msg.state === 'listening' ? '🟢 Listening...'
          : msg.state === 'thinking' ? '🟡 Thinking...'
            : msg.state === 'speaking' ? '🔊 Speaking...'
              : `🟢 ${msg.state}`;
        break;
    }
  };

  ws.onerror = () => {
    // A close event follows an error; it performs the cleanup and shows this text.
    console.error('WebSocket error');
    closeStatus = '🔴 Connection error';
  };

  ws.onclose = () => {
    statusDiv.textContent = closeStatus;
    disconnect();
  };
};

disconnectBtn.onclick = disconnect;
|
|
103
|
+
|
|
104
|
+
/**
 * Tear down the current session: cancel playback, close the socket, release
 * the audio graph and the microphone, and reset the buttons.
 *
 * Safe to call more than once. It runs from the Disconnect button (or a server
 * 'hangup') and then again from socket.onclose, so every resource is released
 * at most once.
 */
function disconnect() {
  clearPlaybackBuffer();

  // close() on a socket that is already closing/closed is a no-op.
  if (socket) socket.close();

  if (processor) {
    // Stop forwarding microphone buffers and detach the node from the graph.
    processor.onaudioprocess = null;
    processor.disconnect();
    processor = null;
  }

  if (audioContext) {
    const context = audioContext;
    // Drop the reference first: a second close() on the same context rejects.
    audioContext = null;
    if (context.state !== 'closed') {
      context.close().catch((err) => console.warn('AudioContext close failed:', err));
    }
  }

  if (mediaStream) {
    mediaStream.getTracks().forEach((track) => track.stop());
    mediaStream = null;
  }

  toggleButtons(false);
}
|
|
111
|
+
|
|
112
|
+
/**
 * Switch the controls between their connected and disconnected states.
 *
 * @param {boolean} connected - whether a session is currently active.
 */
function toggleButtons(connected) {
  // The Connect button and the API-key field are locked together.
  for (const control of [connectBtn, apiKeyInput]) {
    control.disabled = connected;
  }
  // The Disconnect button is always the inverse.
  disconnectBtn.disabled = !connected;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Request microphone access and stream the captured audio to the server.
 *
 * Every processed buffer is converted from Float32 samples to 16-bit PCM and
 * sent over the socket; buffers that arrive while the socket is not open are
 * dropped.
 *
 * @returns {Promise<void>} resolves once the capture graph is wired up;
 *   rejects if getUserMedia rejects.
 */
async function startMicrophone() {
  const BUFFER_SIZE = 4096;

  mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const micNode = audioContext.createMediaStreamSource(mediaStream);

  // Mono in, mono out
  processor = audioContext.createScriptProcessor(BUFFER_SIZE, 1, 1);

  const forwardToServer = (audioEvent) => {
    if (socket.readyState === WebSocket.OPEN) {
      const samples = audioEvent.inputBuffer.getChannelData(0);
      // Convert Float32 to Int16 for Server
      socket.send(floatTo16BitPCM(samples));
    }
  };
  processor.onaudioprocess = forwardToServer;

  micNode.connect(processor);
  // NOTE(review): presumably routed to the destination so the processor keeps
  // receiving audioprocess callbacks; its output buffer is never written here.
  processor.connect(audioContext.destination);
}
|
|
137
|
+
|
|
138
|
+
/**
 * Cancel all audio that has been scheduled but has not finished playing.
 *
 * Used for barge-in (the server sends `playback_clear_buffer` when the user
 * starts speaking over the assistant) and during disconnect. Stops every
 * tracked AudioBufferSourceNode, empties the tracking list and resets the
 * playback cursor so the next chunk starts immediately.
 */
function clearPlaybackBuffer() {
  for (const source of scheduledSources) {
    // The list is reset below, so the per-source cleanup handler is not needed anymore.
    source.onended = null;
    try {
      source.stop();
    } catch (e) {
      // Best effort: a source that cannot be stopped has nothing left to cancel.
    }
  }
  scheduledSources = [];
  nextPlayTime = 0;
  console.log('[AillomVox] Playback buffer cleared');
}
|
|
151
|
+
|
|
152
|
+
/**
 * Decode one chunk of 16-bit little-endian mono PCM and queue it for playback.
 *
 * Chunks are scheduled back to back: each one starts when the previous one
 * ends (or right now if playback has caught up), so they never overlap. Every
 * scheduled source is tracked so clearPlaybackBuffer() can cancel it.
 *
 * @param {ArrayBuffer} arrayBuffer - raw PCM bytes received from the server.
 * @param {number} [sampleRate=16000] - sample rate of the PCM data in Hz.
 */
function playAudioChunk(arrayBuffer, sampleRate = 16000) {
  if (!audioContext || audioContext.state === 'closed') return;

  // A trailing odd byte cannot form a sample and is ignored.
  const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
  // createBuffer() rejects a zero-length buffer, so empty frames are dropped.
  if (sampleCount === 0) return;

  const buffer = audioContext.createBuffer(1, sampleCount, sampleRate);
  const channel = buffer.getChannelData(0);
  const dataView = new DataView(arrayBuffer);

  for (let i = 0; i < sampleCount; i++) {
    const int16 = dataView.getInt16(i * 2, true); // Little Endian
    // Asymmetric scaling maps -32768 to -1 and 32767 to +1.
    channel[i] = int16 < 0 ? int16 / 0x8000 : int16 / 0x7FFF;
  }

  const source = audioContext.createBufferSource();
  source.buffer = buffer;
  source.connect(audioContext.destination);

  // Schedule sequentially: each chunk plays after the previous one ends
  const startTime = Math.max(audioContext.currentTime, nextPlayTime);
  source.start(startTime);
  nextPlayTime = startTime + buffer.duration;

  // Track for cancellation on barge-in; forget the source once it has played.
  scheduledSources.push(source);
  source.onended = () => {
    scheduledSources = scheduledSources.filter((s) => s !== source);
  };
}
|
|
188
|
+
|
|
189
|
+
/**
 * Convert Float32 audio samples to signed 16-bit little-endian PCM.
 *
 * Samples are clamped to [-1, 1] before scaling. The bytes are written with an
 * explicit little-endian byte order, matching how playAudioChunk() reads PCM,
 * rather than relying on the platform's native byte order.
 *
 * @param {Float32Array|number[]} input - samples, nominally in [-1, 1].
 * @returns {ArrayBuffer} two bytes per input sample.
 */
function floatTo16BitPCM(input) {
  const output = new ArrayBuffer(input.length * 2);
  const view = new DataView(output);
  for (let i = 0; i < input.length; i++) {
    const s = Math.max(-1, Math.min(1, input[i]));
    // Negative and positive halves scale differently so both -1 and +1 fit in int16.
    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
  }
  return output;
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>AillomVox - Basic Client</title>
|
|
7
|
+
<style>
|
|
8
|
+
body { font-family: sans-serif; display: flex; flex-direction: column; align-items: center; justify-content: center; height: 100vh; background: #f0f0f0; }
|
|
9
|
+
.container { background: white; padding: 2rem; border-radius: 8px; box-shadow: 0 4px 6px rgba(0,0,0,0.1); text-align: center; }
|
|
10
|
+
input { padding: 8px; margin: 10px 0; width: 100%; box-sizing: border-box; }
|
|
11
|
+
button { padding: 10px 20px; cursor: pointer; background: #007bff; color: white; border: none; border-radius: 4px; font-size: 16px; }
|
|
12
|
+
button:disabled { background: #ccc; }
|
|
13
|
+
#status { margin-top: 15px; font-weight: bold; color: #666; }
|
|
14
|
+
</style>
|
|
15
|
+
</head>
|
|
16
|
+
<body>
|
|
17
|
+
<div class="container">
|
|
18
|
+
<h1>🎙️ AillomVox Basic</h1>
|
|
19
|
+
<input type="password" id="apiKey" placeholder="Enter API Key">
|
|
20
|
+
<button id="connectBtn">Connect</button>
|
|
21
|
+
<button id="disconnectBtn" disabled>Disconnect</button>
|
|
22
|
+
<div id="status">Disconnected</div>
|
|
23
|
+
</div>
|
|
24
|
+
|
|
25
|
+
<script src="app.js"></script>
|
|
26
|
+
</body>
|
|
27
|
+
</html>
|